diff --git a/.bcachefs_revision b/.bcachefs_revision
index abb9e489..34a8011b 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-da7fefde294e3c56359ee498a62a77182a4733cd
+62de7539dc2586b4bd7058b138de89f334d0c6bd
diff --git a/cmd_debug.c b/cmd_debug.c
index 1c5af4d1..a9d59774 100644
--- a/cmd_debug.c
+++ b/cmd_debug.c
@@ -160,8 +160,7 @@ static void list_keys(struct bch_fs *c, enum btree_id btree_id,
 		if (bkey_cmp(k.k->p, end) > 0)
 			break;
 
-		bch2_bkey_val_to_text(&PBUF(buf), c,
-				bkey_type(0, btree_id), k);
+		bch2_bkey_val_to_text(&PBUF(buf), c, k);
 		puts(buf);
 	}
 	bch2_btree_iter_unlock(&iter);
@@ -202,8 +201,7 @@ static void list_nodes_keys(struct bch_fs *c, enum btree_id btree_id,
 		fputs(buf, stdout);
 
 		for_each_btree_node_key_unpack(b, k, &node_iter, &unpacked) {
-			bch2_bkey_val_to_text(&PBUF(buf), c,
-					bkey_type(0, btree_id), k);
+			bch2_bkey_val_to_text(&PBUF(buf), c, k);
 			putchar('\t');
 			puts(buf);
 		}
diff --git a/cmd_migrate.c b/cmd_migrate.c
index 497a4182..7863dec7 100644
--- a/cmd_migrate.c
+++ b/cmd_migrate.c
@@ -333,7 +333,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
 		e->k.p.inode	= dst->bi_inum;
 		e->k.p.offset	= logical + sectors;
 		e->k.size	= sectors;
-		extent_ptr_append(e, (struct bch_extent_ptr) {
+		bch2_bkey_append_ptr(&e->k_i, (struct bch_extent_ptr) {
 					.offset = physical,
 					.dev = 0,
 					.gen = bucket(ca, b)->mark.gen,
@@ -347,8 +347,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
 			die("error reserving space in new filesystem: %s",
 			    strerror(-ret));
 
-		bch2_mark_bkey_replicas(c, BCH_DATA_USER,
-					extent_i_to_s_c(e).s_c);
+		bch2_mark_bkey_replicas(c, extent_i_to_s_c(e).s_c);
 
 		ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &e->k_i,
 					&res, NULL, 0);
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index 73be8873..6781a5b5 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -164,7 +164,7 @@ TRACE_EVENT(btree_write,
 	TP_ARGS(b, bytes, sectors),
 
 	TP_STRUCT__entry(
-		__field(enum bkey_type,	type)
+		__field(enum btree_node_type,	type)
 		__field(unsigned,	bytes			)
 		__field(unsigned,	sectors			)
 	),
diff --git a/libbcachefs.c b/libbcachefs.c
index 98f058d7..b24e7f37 100644
--- a/libbcachefs.c
+++ b/libbcachefs.c
@@ -185,7 +185,8 @@ struct bch_sb *bch2_format(struct format_opts opts,
 	if (bch2_sb_realloc(&sb, 0))
 		die("insufficient memory");
 
-	sb.sb->version		= cpu_to_le64(BCH_SB_VERSION_MAX);
+	sb.sb->version		= le16_to_cpu(bcachefs_metadata_version_current);
+	sb.sb->version_min	= le16_to_cpu(bcachefs_metadata_version_current);
 	sb.sb->magic		= BCACHE_MAGIC;
 	sb.sb->block_size	= cpu_to_le16(opts.block_size);
 	sb.sb->user_uuid	= opts.uuid;
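
The format path now fills in the new 16-bit version/version_min pair instead of the old 64-bit version field (the bch_sb layout change appears in bcachefs_format.h below). As a rough sketch of the kind of compatibility check this pair enables, assuming only the constants introduced by this patch; the helper itself is hypothetical, not code from the patch:

static const char *check_sb_version(struct bch_sb *sb)
{
	/* both fields are little-endian u16s in the new layout */
	u16 version	= le16_to_cpu(sb->version);
	u16 version_min	= le16_to_cpu(sb->version_min);

	if (version_min < bcachefs_metadata_version_min)
		return "contains metadata older than we support";
	if (version > bcachefs_metadata_version_current)
		return "written by a newer version of bcachefs";

	return NULL;	/* mountable */
}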
diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c
index 741e44ee..348060b2 100644
--- a/libbcachefs/acl.c
+++ b/libbcachefs/acl.c
@@ -23,9 +23,9 @@ static inline int acl_to_xattr_type(int type)
 {
 	switch (type) {
 	case ACL_TYPE_ACCESS:
-		return BCH_XATTR_INDEX_POSIX_ACL_ACCESS;
+		return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS;
 	case ACL_TYPE_DEFAULT:
-		return BCH_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT;
 	default:
 		BUG();
 	}
@@ -351,7 +351,7 @@ int bch2_acl_chmod(struct btree_trans *trans,
 
 	iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc,
 			&inode->ei_str_hash, inode->v.i_ino,
-			&X_SEARCH(BCH_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0),
+			&X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0),
 			BTREE_ITER_INTENT);
 	if (IS_ERR(iter))
 		return PTR_ERR(iter) != -ENOENT ? PTR_ERR(iter) : 0;
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 89929163..2e2fb99e 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -75,22 +75,15 @@ static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
 
 const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
+	struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
+
 	if (k.k->p.inode >= c->sb.nr_devices ||
 	    !c->devs[k.k->p.inode])
 		return "invalid device";
 
-	switch (k.k->type) {
-	case BCH_ALLOC: {
-		struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
-
-		/* allow for unknown fields */
-		if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v))
-			return "incorrect value size";
-		break;
-	}
-	default:
-		return "invalid type";
-	}
+	/* allow for unknown fields */
+	if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v))
+		return "incorrect value size";
 
 	return NULL;
 }
@@ -98,14 +91,9 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
 void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
 			struct bkey_s_c k)
 {
-	switch (k.k->type) {
-	case BCH_ALLOC: {
-		struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
+	struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
 
-		pr_buf(out, "gen %u", a.v->gen);
-		break;
-	}
-	}
+	pr_buf(out, "gen %u", a.v->gen);
 }
 
 static inline unsigned get_alloc_field(const u8 **p, unsigned bytes)
@@ -157,7 +145,7 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
 	struct bucket *g;
 	const u8 *d;
 
-	if (k.k->type != BCH_ALLOC)
+	if (k.k->type != KEY_TYPE_alloc)
 		return;
 
 	a = bkey_s_c_to_alloc(k);
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
index 6911fa69..b382c8b6 100644
--- a/libbcachefs/alloc_background.h
+++ b/libbcachefs/alloc_background.h
@@ -10,7 +10,7 @@
 const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
 void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
-#define bch2_bkey_alloc_ops (struct bkey_ops) {		\
+#define bch2_bkey_ops_alloc (struct bkey_ops) {		\
 	.key_invalid	= bch2_alloc_invalid,		\
 	.val_to_text	= bch2_alloc_to_text,		\
 }
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index 91ab3369..5024e560 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -922,7 +922,8 @@ err:
  * as allocated out of @ob
  */
 void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
-				    struct bkey_i_extent *e, unsigned sectors)
+				    struct bkey_i *k, unsigned sectors)
+
 {
 	struct open_bucket *ob;
 	unsigned i;
@@ -934,13 +935,11 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
 		struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
 		struct bch_extent_ptr tmp = ob->ptr;
 
-		EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptr.dev));
-
-		tmp.cached = bkey_extent_is_cached(&e->k) ||
-			(!ca->mi.durability && wp->type == BCH_DATA_USER);
+		tmp.cached = !ca->mi.durability &&
+			wp->type == BCH_DATA_USER;
 
 		tmp.offset += ca->mi.bucket_size - ob->sectors_free;
-		extent_ptr_append(e, tmp);
+		bch2_bkey_append_ptr(k, tmp);
 
 		BUG_ON(sectors > ob->sectors_free);
 		ob->sectors_free -= sectors;
diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h
index a332e9d7..b0e44f75 100644
--- a/libbcachefs/alloc_foreground.h
+++ b/libbcachefs/alloc_foreground.h
@@ -100,7 +100,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
 					     struct closure *);
 
 void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
-				    struct bkey_i_extent *, unsigned);
+				    struct bkey_i *, unsigned);
 void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
 
 void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *,
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 05891a01..d69da3e5 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -222,6 +222,8 @@
 	printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
 #define bch_err(c, fmt, ...) \
 	printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_err_ratelimited(c, fmt, ...) \
+	printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
 
 #define bch_verbose(c, fmt, ...)					\
 do {									\
@@ -331,6 +333,7 @@ enum bch_time_stats {
 struct btree;
 
 enum gc_phase {
+	GC_PHASE_NOT_RUNNING,
 	GC_PHASE_START,
 	GC_PHASE_SB,
 
@@ -535,6 +538,7 @@ struct bch_fs {
 		uuid_le		uuid;
 		uuid_le		user_uuid;
 
+		u16		version;
 		u16		encoded_extent_max;
 
 		u8		nr_devices;
@@ -684,16 +688,17 @@ struct bch_fs {
 	/* REBALANCE */
 	struct bch_fs_rebalance	rebalance;
 
-	/* ERASURE CODING */
-	struct list_head	ec_new_stripe_list;
-	struct mutex		ec_new_stripe_lock;
-
-	GENRADIX(struct ec_stripe) ec_stripes;
-	struct mutex		ec_stripes_lock;
+	/* STRIPES: */
+	GENRADIX(struct stripe) stripes[2];
+	struct mutex		ec_stripe_create_lock;
 
 	ec_stripes_heap		ec_stripes_heap;
 	spinlock_t		ec_stripes_heap_lock;
 
+	/* ERASURE CODING */
+	struct list_head	ec_new_stripe_list;
+	struct mutex		ec_new_stripe_lock;
+
 	struct bio_set		ec_bioset;
 
 	struct work_struct	ec_stripe_delete_work;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index c462ab27..6d8397bc 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -302,15 +302,6 @@ static inline void bkey_init(struct bkey *k)
 #define __BKEY_PADDED(key, pad)					\
 	struct { struct bkey_i key; __u64 key ## _pad[pad]; }
 
-#define BKEY_VAL_TYPE(name, nr)						\
-struct bkey_i_##name {							\
-	union {								\
-		struct bkey		k;				\
-		struct bkey_i		k_i;				\
-	};								\
-	struct bch_##name		v;				\
-}
-
 /*
  * - DELETED keys are used internally to mark keys that should be ignored but
  *   override keys in composition order.  Their version number is ignored.
@@ -325,19 +316,37 @@ struct bkey_i_##name {							\
  *   by new writes or cluster-wide GC. Node repair can also overwrite them with
  *   the same or a more recent version number, but not with an older version
  *   number.
+ *
+ * - WHITEOUT: for hash table btrees
 */
-#define KEY_TYPE_DELETED		0
-#define KEY_TYPE_DISCARD		1
-#define KEY_TYPE_ERROR			2
-#define KEY_TYPE_COOKIE			3
-#define KEY_TYPE_PERSISTENT_DISCARD	4
-#define KEY_TYPE_GENERIC_NR		128
+#define BCH_BKEY_TYPES()				\
+	x(deleted,		0)			\
+	x(discard,		1)			\
+	x(error,		2)			\
+	x(cookie,		3)			\
+	x(whiteout,		4)			\
+	x(btree_ptr,		5)			\
+	x(extent,		6)			\
+	x(reservation,		7)			\
+	x(inode,		8)			\
+	x(inode_generation,	9)			\
+	x(dirent,		10)			\
+	x(xattr,		11)			\
+	x(alloc,		12)			\
+	x(quota,		13)			\
+	x(stripe,		14)
+
+enum bch_bkey_type {
+#define x(name, nr) KEY_TYPE_##name	= nr,
+	BCH_BKEY_TYPES()
+#undef x
+	KEY_TYPE_MAX,
+};
 
 struct bch_cookie {
 	struct bch_val		v;
 	__le64			cookie;
 };
-BKEY_VAL_TYPE(cookie,		KEY_TYPE_COOKIE);
 
 /* Extents */
 
@@ -615,21 +624,12 @@ union bch_extent_entry {
 #undef x
 };
 
-enum {
-	BCH_EXTENT		= 128,
+struct bch_btree_ptr {
+	struct bch_val		v;
 
-	/*
-	 * This is kind of a hack, we're overloading the type for a boolean that
-	 * really should be part of the value - BCH_EXTENT and BCH_EXTENT_CACHED
-	 * have the same value type:
-	 */
-	BCH_EXTENT_CACHED	= 129,
-
-	/*
-	 * Persistent reservation:
-	 */
-	BCH_RESERVATION		= 130,
-};
+	struct bch_extent_ptr	start[0];
+	__u64			_data[0];
+} __attribute__((packed, aligned(8)));
 
 struct bch_extent {
 	struct bch_val		v;
@@ -637,7 +637,6 @@ struct bch_extent {
 	union bch_extent_entry	start[0];
 	__u64			_data[0];
 } __attribute__((packed, aligned(8)));
-BKEY_VAL_TYPE(extent,		BCH_EXTENT);
 
 struct bch_reservation {
 	struct bch_val		v;
@@ -646,7 +645,6 @@ struct bch_reservation {
 	__u8			nr_replicas;
 	__u8			pad[3];
 } __attribute__((packed, aligned(8)));
-BKEY_VAL_TYPE(reservation,	BCH_RESERVATION);
 
 /* Maximum size (in u64s) a single pointer could be: */
 #define BKEY_EXTENT_PTR_U64s_MAX\
@@ -674,12 +672,6 @@ BKEY_VAL_TYPE(reservation,	BCH_RESERVATION);
 
 #define BCACHEFS_ROOT_INO	4096
 
-enum bch_inode_types {
-	BCH_INODE_FS		= 128,
-	BCH_INODE_BLOCKDEV	= 129,
-	BCH_INODE_GENERATION	= 130,
-};
-
 struct bch_inode {
 	struct bch_val		v;
 
@@ -688,7 +680,6 @@ struct bch_inode {
 	__le16			bi_mode;
 	__u8			fields[0];
 } __attribute__((packed, aligned(8)));
-BKEY_VAL_TYPE(inode,		BCH_INODE_FS);
 
 struct bch_inode_generation {
 	struct bch_val		v;
@@ -696,7 +687,6 @@ struct bch_inode_generation {
 	__le32			bi_generation;
 	__le32			pad;
 } __attribute__((packed, aligned(8)));
-BKEY_VAL_TYPE(inode_generation,	BCH_INODE_GENERATION);
 
 #define BCH_INODE_FIELDS()					\
 	BCH_INODE_FIELD(bi_atime,			64)	\
@@ -761,24 +751,6 @@ enum {
 LE32_BITMASK(INODE_STR_HASH,	struct bch_inode, bi_flags, 20, 24);
 LE32_BITMASK(INODE_NR_FIELDS,	struct bch_inode, bi_flags, 24, 32);
 
-struct bch_inode_blockdev {
-	struct bch_val		v;
-
-	__le64			i_size;
-	__le64			i_flags;
-
-	/* Seconds: */
-	__le64			i_ctime;
-	__le64			i_mtime;
-
-	uuid_le			i_uuid;
-	__u8			i_label[32];
-} __attribute__((packed, aligned(8)));
-BKEY_VAL_TYPE(inode_blockdev,	BCH_INODE_BLOCKDEV);
-
-/* Thin provisioned volume, or cache for another block device? */
-LE64_BITMASK(CACHED_DEV,	struct bch_inode_blockdev, i_flags, 0,  1)
-
 /* Dirents */
 
 /*
@@ -792,11 +764,6 @@ LE64_BITMASK(CACHED_DEV,	struct bch_inode_blockdev, i_flags, 0,  1)
  * collision:
  */
 
-enum {
-	BCH_DIRENT		= 128,
-	BCH_DIRENT_WHITEOUT	= 129,
-};
-
 struct bch_dirent {
 	struct bch_val		v;
 
@@ -811,7 +778,6 @@ struct bch_dirent {
 
 	__u8			d_name[];
 } __attribute__((packed, aligned(8)));
-BKEY_VAL_TYPE(dirent,		BCH_DIRENT);
 
 #define BCH_NAME_MAX	(U8_MAX * sizeof(u64) -				\
 			 sizeof(struct bkey) -				\
@@ -820,16 +786,11 @@ BKEY_VAL_TYPE(dirent,		BCH_DIRENT);
 
 /* Xattrs */
 
-enum {
-	BCH_XATTR		= 128,
-	BCH_XATTR_WHITEOUT	= 129,
-};
-
-#define BCH_XATTR_INDEX_USER			0
-#define BCH_XATTR_INDEX_POSIX_ACL_ACCESS	1
-#define BCH_XATTR_INDEX_POSIX_ACL_DEFAULT	2
-#define BCH_XATTR_INDEX_TRUSTED			3
-#define BCH_XATTR_INDEX_SECURITY	        4
+#define KEY_TYPE_XATTR_INDEX_USER			0
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS	1
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT	2
+#define KEY_TYPE_XATTR_INDEX_TRUSTED			3
+#define KEY_TYPE_XATTR_INDEX_SECURITY	        4
 
 struct bch_xattr {
 	struct bch_val		v;
@@ -838,14 +799,9 @@ struct bch_xattr {
 	__le16			x_val_len;
 	__u8			x_name[];
 } __attribute__((packed, aligned(8)));
-BKEY_VAL_TYPE(xattr,		BCH_XATTR);
 
 /* Bucket/allocation information: */
 
-enum {
-	BCH_ALLOC		= 128,
-};
-
 enum {
 	BCH_ALLOC_FIELD_READ_TIME	= 0,
 	BCH_ALLOC_FIELD_WRITE_TIME	= 1,
@@ -857,14 +813,9 @@ struct bch_alloc {
 	__u8			gen;
 	__u8			data[];
 } __attribute__((packed, aligned(8)));
-BKEY_VAL_TYPE(alloc,	BCH_ALLOC);
 
 /* Quotas: */
 
-enum {
-	BCH_QUOTA		= 128,
-};
-
 enum quota_types {
 	QTYP_USR		= 0,
 	QTYP_GRP		= 1,
@@ -887,14 +838,9 @@ struct bch_quota {
 	struct bch_val		v;
 	struct bch_quota_counter c[Q_COUNTERS];
 } __attribute__((packed, aligned(8)));
-BKEY_VAL_TYPE(quota,	BCH_QUOTA);
 
 /* Erasure coding */
 
-enum {
-	BCH_STRIPE		= 128,
-};
-
 struct bch_stripe {
 	struct bch_val		v;
 	__le16			sectors;
@@ -908,7 +854,6 @@ struct bch_stripe {
 
 	struct bch_extent_ptr	ptrs[0];
 } __attribute__((packed, aligned(8)));
-BKEY_VAL_TYPE(stripe,	BCH_STRIPE);
 
 /* Optional/variable size superblock sections: */
 
@@ -1144,15 +1089,21 @@ struct bch_sb_field_clean {
 /* Superblock: */
 
 /*
- * Version 8:	BCH_SB_ENCODED_EXTENT_MAX_BITS
- *		BCH_MEMBER_DATA_ALLOWED
- * Version 9:	incompatible extent nonce change
+ * New versioning scheme:
+ * One common version number for all on disk data structures - superblock, btree
+ * nodes, journal entries
  */
+#define BCH_JSET_VERSION_OLD			2
+#define BCH_BSET_VERSION_OLD			3
 
-#define BCH_SB_VERSION_MIN		7
-#define BCH_SB_VERSION_EXTENT_MAX	8
-#define BCH_SB_VERSION_EXTENT_NONCE_V1	9
-#define BCH_SB_VERSION_MAX		9
+enum bcachefs_metadata_version {
+	bcachefs_metadata_version_min			= 9,
+	bcachefs_metadata_version_new_versioning	= 10,
+	bcachefs_metadata_version_bkey_renumber		= 10,
+	bcachefs_metadata_version_max			= 11,
+};
+
+#define bcachefs_metadata_version_current	(bcachefs_metadata_version_max - 1)
 
 #define BCH_SB_SECTOR			8
 #define BCH_SB_MEMBERS_MAX		64 /* XXX kill */
@@ -1171,6 +1122,9 @@ struct bch_sb_layout {
 /*
  * @offset	- sector where this sb was written
  * @version	- on disk format version
+ * @version_min	- Oldest metadata version this filesystem contains; so we can
+ *		  safely drop compatibility code and refuse to mount filesystems
+ *		  we'd need it for
  * @magic	- identifies as a bcachefs superblock (BCACHE_MAGIC)
  * @seq		- incremented each time superblock is written
  * @uuid	- used for generating various magic numbers and identifying
@@ -1183,7 +1137,9 @@ struct bch_sb_layout {
  */
 struct bch_sb {
 	struct bch_csum		csum;
-	__le64			version;
+	__le16			version;
+	__le16			version_min;
+	__le16			pad[2];
 	uuid_le			magic;
 	uuid_le			uuid;
 	uuid_le			user_uuid;
@@ -1359,11 +1315,6 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
 
 /* Journal */
 
-#define BCACHE_JSET_VERSION_UUIDv1	1
-#define BCACHE_JSET_VERSION_UUID	1	/* Always latest UUID format */
-#define BCACHE_JSET_VERSION_JKEYS	2
-#define BCACHE_JSET_VERSION		2
-
 #define JSET_KEYS_U64s	(sizeof(struct jset_entry) / sizeof(__u64))
 
 #define BCH_JSET_ENTRY_TYPES()			\
@@ -1443,35 +1394,26 @@ LE32_BITMASK(JSET_BIG_ENDIAN,	struct jset, flags, 4, 5);
 
 /* Btree: */
 
-#define DEFINE_BCH_BTREE_IDS()					\
-	DEF_BTREE_ID(EXTENTS,	0, "extents")			\
-	DEF_BTREE_ID(INODES,	1, "inodes")			\
-	DEF_BTREE_ID(DIRENTS,	2, "dirents")			\
-	DEF_BTREE_ID(XATTRS,	3, "xattrs")			\
-	DEF_BTREE_ID(ALLOC,	4, "alloc")			\
-	DEF_BTREE_ID(QUOTAS,	5, "quotas")			\
-	DEF_BTREE_ID(EC,	6, "erasure_coding")
-
-#define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val,
+#define BCH_BTREE_IDS()				\
+	x(EXTENTS,	0, "extents")			\
+	x(INODES,	1, "inodes")			\
+	x(DIRENTS,	2, "dirents")			\
+	x(XATTRS,	3, "xattrs")			\
+	x(ALLOC,	4, "alloc")			\
+	x(QUOTAS,	5, "quotas")			\
+	x(EC,		6, "erasure_coding")
 
 enum btree_id {
-	DEFINE_BCH_BTREE_IDS()
+#define x(kwd, val, name) BTREE_ID_##kwd = val,
+	BCH_BTREE_IDS()
+#undef x
 	BTREE_ID_NR
 };
 
-#undef DEF_BTREE_ID
-
 #define BTREE_MAX_DEPTH		4U
 
 /* Btree nodes */
 
-/* Version 1: Seed pointer into btree node checksum
- */
-#define BCACHE_BSET_CSUM		1
-#define BCACHE_BSET_KEY_v1		2
-#define BCACHE_BSET_JOURNAL_SEQ		3
-#define BCACHE_BSET_VERSION		3
-
 /*
  * Btree nodes
  *
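
The single BCH_BKEY_TYPES() list above replaces the scattered per-btree enums: each x(name, nr) entry is expanded once into the KEY_TYPE_* enum here and again into the name and ops tables in bkey_methods.c, so the lists can never drift apart. A standalone sketch of that x-macro pattern, using made-up example types rather than the real list:

#define EXAMPLE_TYPES()		\
	x(deleted,	0)	\
	x(cookie,	3)	\
	x(extent,	6)

enum example_type {
#define x(name, nr)	EXAMPLE_TYPE_##name = nr,
	EXAMPLE_TYPES()
#undef x
	EXAMPLE_TYPE_MAX,
};

/* same list expanded a second time, so the names can't go stale: */
static const char * const example_type_names[] = {
#define x(name, nr)	[EXAMPLE_TYPE_##name] = #name,
	EXAMPLE_TYPES()
#undef x
};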
diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c
index 135ecb8d..25725e42 100644
--- a/libbcachefs/bkey.c
+++ b/libbcachefs/bkey.c
@@ -484,7 +484,7 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
 	pack_state_finish(&state, out);
 	out->u64s	= f->key_u64s;
 	out->format	= KEY_FORMAT_LOCAL_BTREE;
-	out->type	= KEY_TYPE_DELETED;
+	out->type	= KEY_TYPE_deleted;
 
 #ifdef CONFIG_BCACHEFS_DEBUG
 	if (exact) {
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index 28bf646c..15397099 100644
--- a/libbcachefs/bkey.h
+++ b/libbcachefs/bkey.h
@@ -52,10 +52,12 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
 	k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64));
 }
 
-#define bkey_deleted(_k)	((_k)->type == KEY_TYPE_DELETED)
+#define bkey_val_end(_k)	vstruct_idx((_k).v, bkey_val_u64s((_k).k))
+
+#define bkey_deleted(_k)	((_k)->type == KEY_TYPE_deleted)
 
 #define bkey_whiteout(_k)				\
-	((_k)->type == KEY_TYPE_DELETED || (_k)->type == KEY_TYPE_DISCARD)
+	((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard)
 
 #define bkey_packed_typecheck(_k)					\
 ({									\
@@ -430,7 +432,15 @@ static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
  * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
  * functions.
  */
-#define __BKEY_VAL_ACCESSORS(name, nr, _assert)				\
+#define BKEY_VAL_ACCESSORS(name)					\
+struct bkey_i_##name {							\
+	union {								\
+		struct bkey		k;				\
+		struct bkey_i		k_i;				\
+	};								\
+	struct bch_##name		v;				\
+};									\
+									\
 struct bkey_s_c_##name {						\
 	union {								\
 	struct {							\
@@ -455,20 +465,20 @@ struct bkey_s_##name {							\
 									\
 static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k)	\
 {									\
-	_assert(k->k.type, nr);						\
+	EBUG_ON(k->k.type != KEY_TYPE_##name);				\
 	return container_of(&k->k, struct bkey_i_##name, k);		\
 }									\
 									\
 static inline const struct bkey_i_##name *				\
 bkey_i_to_##name##_c(const struct bkey_i *k)				\
 {									\
-	_assert(k->k.type, nr);						\
+	EBUG_ON(k->k.type != KEY_TYPE_##name);				\
 	return container_of(&k->k, struct bkey_i_##name, k);		\
 }									\
 									\
 static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k)	\
 {									\
-	_assert(k.k->type, nr);						\
+	EBUG_ON(k.k->type != KEY_TYPE_##name);				\
 	return (struct bkey_s_##name) {					\
 		.k = k.k,						\
 		.v = container_of(k.v, struct bch_##name, v),		\
@@ -477,7 +487,7 @@ static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k)	\
 									\
 static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
 {									\
-	_assert(k.k->type, nr);						\
+	EBUG_ON(k.k->type != KEY_TYPE_##name);				\
 	return (struct bkey_s_c_##name) {				\
 		.k = k.k,						\
 		.v = container_of(k.v, struct bch_##name, v),		\
@@ -503,7 +513,7 @@ name##_i_to_s_c(const struct bkey_i_##name *k)				\
 									\
 static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k)	\
 {									\
-	_assert(k->k.type, nr);						\
+	EBUG_ON(k->k.type != KEY_TYPE_##name);				\
 	return (struct bkey_s_##name) {					\
 		.k = &k->k,						\
 		.v = container_of(&k->v, struct bch_##name, v),		\
@@ -513,27 +523,13 @@ static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k)	\
 static inline struct bkey_s_c_##name					\
 bkey_i_to_s_c_##name(const struct bkey_i *k)				\
 {									\
-	_assert(k->k.type, nr);						\
+	EBUG_ON(k->k.type != KEY_TYPE_##name);				\
 	return (struct bkey_s_c_##name) {				\
 		.k = &k->k,						\
 		.v = container_of(&k->v, struct bch_##name, v),		\
 	};								\
 }									\
 									\
-static inline struct bch_##name *					\
-bkey_p_##name##_val(const struct bkey_format *f,			\
-		    struct bkey_packed *k)				\
-{									\
-	return container_of(bkeyp_val(f, k), struct bch_##name, v);	\
-}									\
-									\
-static inline const struct bch_##name *					\
-bkey_p_c_##name##_val(const struct bkey_format *f,			\
-		      const struct bkey_packed *k)			\
-{									\
-	return container_of(bkeyp_val(f, k), struct bch_##name, v);	\
-}									\
-									\
 static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
 {									\
 	struct bkey_i_##name *k =					\
@@ -541,45 +537,23 @@ static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
 									\
 	bkey_init(&k->k);						\
 	memset(&k->v, 0, sizeof(k->v));					\
-	k->k.type = nr;							\
+	k->k.type = KEY_TYPE_##name;					\
 	set_bkey_val_bytes(&k->k, sizeof(k->v));			\
 									\
 	return k;							\
 }
 
-#define __BKEY_VAL_ASSERT(_type, _nr)	EBUG_ON(_type != _nr)
-
-#define BKEY_VAL_ACCESSORS(name, _nr)					\
-	static inline void __bch_##name##_assert(u8 type, u8 nr)	\
-	{								\
-		EBUG_ON(type != _nr);					\
-	}								\
-									\
-	__BKEY_VAL_ACCESSORS(name, _nr, __bch_##name##_assert)
-
-BKEY_VAL_ACCESSORS(cookie,		KEY_TYPE_COOKIE);
-
-static inline void __bch2_extent_assert(u8 type, u8 nr)
-{
-	EBUG_ON(type != BCH_EXTENT && type != BCH_EXTENT_CACHED);
-}
-
-__BKEY_VAL_ACCESSORS(extent,		BCH_EXTENT, __bch2_extent_assert);
-BKEY_VAL_ACCESSORS(reservation,		BCH_RESERVATION);
-
-BKEY_VAL_ACCESSORS(inode,		BCH_INODE_FS);
-BKEY_VAL_ACCESSORS(inode_blockdev,	BCH_INODE_BLOCKDEV);
-BKEY_VAL_ACCESSORS(inode_generation,	BCH_INODE_GENERATION);
-
-BKEY_VAL_ACCESSORS(dirent,		BCH_DIRENT);
-
-BKEY_VAL_ACCESSORS(xattr,		BCH_XATTR);
-
-BKEY_VAL_ACCESSORS(alloc,		BCH_ALLOC);
-
-BKEY_VAL_ACCESSORS(quota,		BCH_QUOTA);
-
-BKEY_VAL_ACCESSORS(stripe,		BCH_STRIPE);
+BKEY_VAL_ACCESSORS(cookie);
+BKEY_VAL_ACCESSORS(btree_ptr);
+BKEY_VAL_ACCESSORS(extent);
+BKEY_VAL_ACCESSORS(reservation);
+BKEY_VAL_ACCESSORS(inode);
+BKEY_VAL_ACCESSORS(inode_generation);
+BKEY_VAL_ACCESSORS(dirent);
+BKEY_VAL_ACCESSORS(xattr);
+BKEY_VAL_ACCESSORS(alloc);
+BKEY_VAL_ACCESSORS(quota);
+BKEY_VAL_ACCESSORS(stripe);
 
 /* byte order helpers */
 
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index 97d72d2b..6b04bef7 100644
--- a/libbcachefs/bkey_methods.c
+++ b/libbcachefs/bkey_methods.c
@@ -11,66 +11,84 @@
 #include "quota.h"
 #include "xattr.h"
 
-const struct bkey_ops bch2_bkey_ops[] = {
-	[BKEY_TYPE_EXTENTS]	= bch2_bkey_extent_ops,
-	[BKEY_TYPE_INODES]	= bch2_bkey_inode_ops,
-	[BKEY_TYPE_DIRENTS]	= bch2_bkey_dirent_ops,
-	[BKEY_TYPE_XATTRS]	= bch2_bkey_xattr_ops,
-	[BKEY_TYPE_ALLOC]	= bch2_bkey_alloc_ops,
-	[BKEY_TYPE_QUOTAS]	= bch2_bkey_quota_ops,
-	[BKEY_TYPE_EC]		= bch2_bkey_ec_ops,
-	[BKEY_TYPE_BTREE]	= bch2_bkey_btree_ops,
+const char * const bch_bkey_types[] = {
+#define x(name, nr) #name,
+	BCH_BKEY_TYPES()
+#undef x
+	NULL
 };
 
-const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
-				  struct bkey_s_c k)
+static const char *deleted_key_invalid(const struct bch_fs *c,
+					struct bkey_s_c k)
 {
-	const struct bkey_ops *ops = &bch2_bkey_ops[type];
-
-	switch (k.k->type) {
-	case KEY_TYPE_DELETED:
-	case KEY_TYPE_DISCARD:
-		return NULL;
-
-	case KEY_TYPE_ERROR:
-		return bkey_val_bytes(k.k) != 0
-			? "value size should be zero"
-			: NULL;
-
-	case KEY_TYPE_COOKIE:
-		return bkey_val_bytes(k.k) != sizeof(struct bch_cookie)
-			? "incorrect value size"
-			: NULL;
-
-	default:
-		if (k.k->type < KEY_TYPE_GENERIC_NR)
-			return "invalid type";
-
-		return ops->key_invalid(c, k);
-	}
+	return NULL;
 }
 
-const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
-			      struct bkey_s_c k)
-{
-	const struct bkey_ops *ops = &bch2_bkey_ops[type];
+const struct bkey_ops bch2_bkey_ops_deleted = {
+	.key_invalid = deleted_key_invalid,
+};
 
+const struct bkey_ops bch2_bkey_ops_discard = {
+	.key_invalid = deleted_key_invalid,
+};
+
+static const char *empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+	if (bkey_val_bytes(k.k))
+		return "value size should be zero";
+
+	return NULL;
+}
+
+const struct bkey_ops bch2_bkey_ops_error = {
+	.key_invalid = empty_val_key_invalid,
+};
+
+static const char *key_type_cookie_invalid(const struct bch_fs *c,
+					   struct bkey_s_c k)
+{
+	if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie))
+		return "incorrect value size";
+
+	return NULL;
+}
+
+const struct bkey_ops bch2_bkey_ops_cookie = {
+	.key_invalid = key_type_cookie_invalid,
+};
+
+const struct bkey_ops bch2_bkey_ops_whiteout = {
+	.key_invalid = empty_val_key_invalid,
+};
+
+static const struct bkey_ops bch2_bkey_ops[] = {
+#define x(name, nr) [KEY_TYPE_##name]	= bch2_bkey_ops_##name,
+	BCH_BKEY_TYPES()
+#undef x
+};
+
+const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k)
+{
+	if (k.k->type >= KEY_TYPE_MAX)
+		return "invalid type";
+
+	return bch2_bkey_ops[k.k->type].key_invalid(c, k);
+}
+
+const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
+				enum btree_node_type type)
+{
 	if (k.k->u64s < BKEY_U64s)
 		return "u64s too small";
 
-	if (!ops->is_extents) {
-		if (k.k->size)
-			return "nonzero size field";
-	} else {
+	if (btree_node_type_is_extents(type)) {
 		if ((k.k->size == 0) != bkey_deleted(k.k))
 			return "bad size field";
+	} else {
+		if (k.k->size)
+			return "nonzero size field";
 	}
 
-	if (ops->is_extents &&
-	    !k.k->size &&
-	    !bkey_deleted(k.k))
-		return "zero size field";
-
 	if (k.k->p.snapshot)
 		return "nonzero snapshot";
 
@@ -81,11 +99,11 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
 	return NULL;
 }
 
-const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
-			      struct bkey_s_c k)
+const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
+			      enum btree_node_type type)
 {
-	return __bch2_bkey_invalid(c, type, k) ?:
-		bch2_bkey_val_invalid(c, type, k);
+	return __bch2_bkey_invalid(c, k, type) ?:
+		bch2_bkey_val_invalid(c, k);
 }
 
 const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
@@ -101,24 +119,22 @@ const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
 
 void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
 {
-	enum bkey_type type = btree_node_type(b);
-	const struct bkey_ops *ops = &bch2_bkey_ops[type];
+	const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type];
 	const char *invalid;
 
 	BUG_ON(!k.k->u64s);
 
-	invalid = bch2_bkey_invalid(c, type, k) ?:
+	invalid = bch2_bkey_invalid(c, k, btree_node_type(b)) ?:
 		bch2_bkey_in_btree_node(b, k);
 	if (invalid) {
 		char buf[160];
 
-		bch2_bkey_val_to_text(&PBUF(buf), c, type, k);
+		bch2_bkey_val_to_text(&PBUF(buf), c, k);
 		bch2_fs_bug(c, "invalid bkey %s: %s", buf, invalid);
 		return;
 	}
 
-	if (k.k->type >= KEY_TYPE_GENERIC_NR &&
-	    ops->key_debugcheck)
+	if (ops->key_debugcheck)
 		ops->key_debugcheck(c, b, k);
 }
 
@@ -143,46 +159,90 @@ void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
 }
 
 void bch2_val_to_text(struct printbuf *out, struct bch_fs *c,
-		      enum bkey_type type, struct bkey_s_c k)
+		      struct bkey_s_c k)
 {
-	const struct bkey_ops *ops = &bch2_bkey_ops[type];
+	const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type];
 
-	switch (k.k->type) {
-	case KEY_TYPE_DELETED:
-		pr_buf(out, " deleted");
-		break;
-	case KEY_TYPE_DISCARD:
-		pr_buf(out, " discard");
-		break;
-	case KEY_TYPE_ERROR:
-		pr_buf(out, " error");
-		break;
-	case KEY_TYPE_COOKIE:
-		pr_buf(out, " cookie");
-		break;
-	default:
-		if (k.k->type >= KEY_TYPE_GENERIC_NR && ops->val_to_text)
-			ops->val_to_text(out, c, k);
-		break;
-	}
+	if (likely(ops->val_to_text))
+		ops->val_to_text(out, c, k);
+	else
+		pr_buf(out, " %s", bch_bkey_types[k.k->type]);
 }
 
 void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
-			   enum bkey_type type, struct bkey_s_c k)
+			   struct bkey_s_c k)
 {
 	bch2_bkey_to_text(out, k.k);
 	pr_buf(out, ": ");
-	bch2_val_to_text(out, c, type, k);
+	bch2_val_to_text(out, c, k);
 }
 
-void bch2_bkey_swab(enum bkey_type type,
-		   const struct bkey_format *f,
-		   struct bkey_packed *k)
+void bch2_bkey_swab(const struct bkey_format *f,
+		    struct bkey_packed *k)
 {
-	const struct bkey_ops *ops = &bch2_bkey_ops[type];
+	const struct bkey_ops *ops = &bch2_bkey_ops[k->type];
 
 	bch2_bkey_swab_key(f, k);
 
 	if (ops->swab)
 		ops->swab(f, k);
 }
+
+bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k)
+{
+	const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type];
+
+	return ops->key_normalize
+		? ops->key_normalize(c, k)
+		: false;
+}
+
+enum merge_result bch2_bkey_merge(struct bch_fs *c,
+				  struct bkey_i *l, struct bkey_i *r)
+{
+	const struct bkey_ops *ops = &bch2_bkey_ops[l->k.type];
+
+	if (!key_merging_disabled(c) &&
+	    ops->key_merge &&
+	    l->k.type == r->k.type &&
+	    !bversion_cmp(l->k.version, r->k.version) &&
+	    !bkey_cmp(l->k.p, bkey_start_pos(&r->k)))
+		return ops->key_merge(c, l, r);
+
+	return BCH_MERGE_NOMERGE;
+}
+
+static const struct old_bkey_type {
+	u8		btree_node_type;
+	u8		old;
+	u8		new;
+} bkey_renumber_table[] = {
+	{BKEY_TYPE_BTREE,	128, KEY_TYPE_btree_ptr		},
+	{BKEY_TYPE_EXTENTS,	128, KEY_TYPE_extent		},
+	{BKEY_TYPE_EXTENTS,	129, KEY_TYPE_extent		},
+	{BKEY_TYPE_EXTENTS,	130, KEY_TYPE_reservation	},
+	{BKEY_TYPE_INODES,	128, KEY_TYPE_inode		},
+	{BKEY_TYPE_INODES,	130, KEY_TYPE_inode_generation	},
+	{BKEY_TYPE_DIRENTS,	128, KEY_TYPE_dirent		},
+	{BKEY_TYPE_DIRENTS,	129, KEY_TYPE_whiteout		},
+	{BKEY_TYPE_XATTRS,	128, KEY_TYPE_xattr		},
+	{BKEY_TYPE_XATTRS,	129, KEY_TYPE_whiteout		},
+	{BKEY_TYPE_ALLOC,	128, KEY_TYPE_alloc		},
+	{BKEY_TYPE_QUOTAS,	128, KEY_TYPE_quota		},
+};
+
+void bch2_bkey_renumber(enum btree_node_type btree_node_type,
+			struct bkey_packed *k,
+			int write)
+{
+	const struct old_bkey_type *i;
+
+	for (i = bkey_renumber_table;
+	     i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table);
+	     i++)
+		if (btree_node_type == i->btree_node_type &&
+		    k->type == (write ? i->new : i->old)) {
+			k->type = write ? i->old : i->new;
+			break;
+		}
+}
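
bkey_renumber_table maps the old per-btree numeric key types (128, 129, ...) to the new global KEY_TYPE_* values on read, and back again on write, so filesystems written before bcachefs_metadata_version_bkey_renumber stay readable. A rough sketch of applying it to every key in a bset; the loop and helper name are hypothetical, only bch2_bkey_renumber() itself comes from this patch:

static void renumber_bset_keys(struct btree *b, struct bset *i, int write)
{
	struct bkey_packed *k;

	/*
	 * write == 0: translate old on-disk types to the new enum;
	 * write == 1: translate back before writing the old format
	 */
	for (k = i->start;
	     k != vstruct_last(i);
	     k = bkey_next(k))
		bch2_bkey_renumber(btree_node_type(b), k, write);
}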
diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h
index 11ce12f5..cf7a9e9c 100644
--- a/libbcachefs/bkey_methods.h
+++ b/libbcachefs/bkey_methods.h
@@ -3,24 +3,12 @@
 
 #include "bkey.h"
 
-#define DEF_BTREE_ID(kwd, val, name) BKEY_TYPE_##kwd = val,
-
-enum bkey_type {
-	DEFINE_BCH_BTREE_IDS()
-	BKEY_TYPE_BTREE,
-};
-
-#undef DEF_BTREE_ID
-
-/* Type of a key in btree @id at level @level: */
-static inline enum bkey_type bkey_type(unsigned level, enum btree_id id)
-{
-	return level ? BKEY_TYPE_BTREE : (enum bkey_type) id;
-}
-
 struct bch_fs;
 struct btree;
 struct bkey;
+enum btree_node_type;
+
+extern const char * const bch_bkey_types[];
 
 enum merge_result {
 	BCH_MERGE_NOMERGE,
@@ -33,12 +21,6 @@ enum merge_result {
 	BCH_MERGE_MERGE,
 };
 
-typedef bool (*key_filter_fn)(struct bch_fs *, struct btree *,
-			      struct bkey_s);
-typedef enum merge_result (*key_merge_fn)(struct bch_fs *,
-					  struct btree *,
-					  struct bkey_i *, struct bkey_i *);
-
 struct bkey_ops {
 	/* Returns reason for being invalid if invalid, else NULL: */
 	const char *	(*key_invalid)(const struct bch_fs *,
@@ -48,29 +30,34 @@ struct bkey_ops {
 	void		(*val_to_text)(struct printbuf *, struct bch_fs *,
 				       struct bkey_s_c);
 	void		(*swab)(const struct bkey_format *, struct bkey_packed *);
-	key_filter_fn	key_normalize;
-	key_merge_fn	key_merge;
-	bool		is_extents;
+	bool		(*key_normalize)(struct bch_fs *, struct bkey_s);
+	enum merge_result (*key_merge)(struct bch_fs *,
+				       struct bkey_i *, struct bkey_i *);
 };
 
-const char *bch2_bkey_val_invalid(struct bch_fs *, enum bkey_type,
-				  struct bkey_s_c);
-const char *__bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
-const char *bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
+const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c);
+const char *__bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c,
+				enum btree_node_type);
+const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c,
+			      enum btree_node_type);
 const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);
 
 void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
 
 void bch2_bpos_to_text(struct printbuf *, struct bpos);
 void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
-void bch2_val_to_text(struct printbuf *, struct bch_fs *, enum bkey_type,
+void bch2_val_to_text(struct printbuf *, struct bch_fs *,
 		      struct bkey_s_c);
 void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *,
-			   enum bkey_type, struct bkey_s_c);
+			   struct bkey_s_c);
 
-void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,
-		    struct bkey_packed *);
+void bch2_bkey_swab(const struct bkey_format *, struct bkey_packed *);
 
-extern const struct bkey_ops bch2_bkey_ops[];
+bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s);
+
+enum merge_result bch2_bkey_merge(struct bch_fs *,
+				  struct bkey_i *, struct bkey_i *);
+
+void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
 
 #endif /* _BCACHEFS_BKEY_METHODS_H */
diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c
new file mode 100644
index 00000000..c47c862f
--- /dev/null
+++ b/libbcachefs/bkey_sort.c
@@ -0,0 +1,652 @@
+#include "bcachefs.h"
+#include "bkey_sort.h"
+#include "bset.h"
+#include "extents.h"
+
+/* too many iterators, need to clean this up */
+
+/* btree_node_iter_large: */
+
+#define btree_node_iter_cmp_heap(h, _l, _r) btree_node_iter_cmp(b, _l, _r)
+
+static inline bool
+bch2_btree_node_iter_large_end(struct btree_node_iter_large *iter)
+{
+	return !iter->used;
+}
+
+static inline struct bkey_packed *
+bch2_btree_node_iter_large_peek_all(struct btree_node_iter_large *iter,
+				    struct btree *b)
+{
+	return bch2_btree_node_iter_large_end(iter)
+		? NULL
+		: __btree_node_offset_to_key(b, iter->data->k);
+}
+
+static void
+bch2_btree_node_iter_large_advance(struct btree_node_iter_large *iter,
+				   struct btree *b)
+{
+	iter->data->k += __btree_node_offset_to_key(b, iter->data->k)->u64s;
+
+	EBUG_ON(!iter->used);
+	EBUG_ON(iter->data->k > iter->data->end);
+
+	if (iter->data->k == iter->data->end)
+		heap_del(iter, 0, btree_node_iter_cmp_heap, NULL);
+	else
+		heap_sift_down(iter, 0, btree_node_iter_cmp_heap, NULL);
+}
+
+static inline struct bkey_packed *
+bch2_btree_node_iter_large_next_all(struct btree_node_iter_large *iter,
+				    struct btree *b)
+{
+	struct bkey_packed *ret = bch2_btree_node_iter_large_peek_all(iter, b);
+
+	if (ret)
+		bch2_btree_node_iter_large_advance(iter, b);
+
+	return ret;
+}
+
+void bch2_btree_node_iter_large_push(struct btree_node_iter_large *iter,
+				     struct btree *b,
+				     const struct bkey_packed *k,
+				     const struct bkey_packed *end)
+{
+	if (k != end) {
+		struct btree_node_iter_set n =
+			((struct btree_node_iter_set) {
+				 __btree_node_key_to_offset(b, k),
+				 __btree_node_key_to_offset(b, end)
+			 });
+
+		__heap_add(iter, n, btree_node_iter_cmp_heap, NULL);
+	}
+}
+
+static void sort_key_next(struct btree_node_iter_large *iter,
+			  struct btree *b,
+			  struct btree_node_iter_set *i)
+{
+	i->k += __btree_node_offset_to_key(b, i->k)->u64s;
+
+	if (i->k == i->end)
+		*i = iter->data[--iter->used];
+}
+
+/* regular sort_iters */
+
+typedef int (*sort_cmp_fn)(struct btree *,
+			   struct bkey_packed *,
+			   struct bkey_packed *);
+
+static inline void __sort_iter_sift(struct sort_iter *iter,
+				    unsigned from,
+				    sort_cmp_fn cmp)
+{
+	unsigned i;
+
+	for (i = from;
+	     i + 1 < iter->used &&
+	     cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0;
+	     i++)
+		swap(iter->data[i], iter->data[i + 1]);
+}
+
+static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp)
+{
+
+	__sort_iter_sift(iter, 0, cmp);
+}
+
+static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
+{
+	unsigned i = iter->used;
+
+	while (i--)
+		__sort_iter_sift(iter, i, cmp);
+}
+
+static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
+{
+	return iter->used ? iter->data->k : NULL;
+}
+
+static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
+{
+	iter->data->k = bkey_next(iter->data->k);
+
+	BUG_ON(iter->data->k > iter->data->end);
+
+	if (iter->data->k == iter->data->end)
+		array_remove_item(iter->data, iter->used, 0);
+	else
+		sort_iter_sift(iter, cmp);
+}
+
+static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
+						 sort_cmp_fn cmp)
+{
+	struct bkey_packed *ret = sort_iter_peek(iter);
+
+	if (ret)
+		sort_iter_advance(iter, cmp);
+
+	return ret;
+}
+
+/*
+ * Returns true if l > r - unless l == r, in which case returns true if l is
+ * older than r.
+ *
+ * Necessary for btree_sort_fixup() - if there are multiple keys that compare
+ * equal in different sets, we have to process them newest to oldest.
+ */
+#define key_sort_cmp(h, l, r)						\
+({									\
+	bkey_cmp_packed(b,						\
+			__btree_node_offset_to_key(b, (l).k),		\
+			__btree_node_offset_to_key(b, (r).k))		\
+									\
+	?: (l).k - (r).k;						\
+})
+
+static inline bool should_drop_next_key(struct btree_node_iter_large *iter,
+					struct btree *b)
+{
+	struct btree_node_iter_set *l = iter->data, *r = iter->data + 1;
+	struct bkey_packed *k = __btree_node_offset_to_key(b, l->k);
+
+	if (bkey_whiteout(k))
+		return true;
+
+	if (iter->used < 2)
+		return false;
+
+	if (iter->used > 2 &&
+	    key_sort_cmp(iter, r[0], r[1]) >= 0)
+		r++;
+
+	/*
+	 * key_sort_cmp() ensures that when keys compare equal the older key
+	 * comes first; so if l->k compares equal to r->k then l->k is older and
+	 * should be dropped.
+	 */
+	return !bkey_cmp_packed(b,
+				__btree_node_offset_to_key(b, l->k),
+				__btree_node_offset_to_key(b, r->k));
+}
+
+struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst,
+					struct btree *b,
+					struct btree_node_iter_large *iter)
+{
+	struct bkey_packed *out = dst->start;
+	struct btree_nr_keys nr;
+
+	memset(&nr, 0, sizeof(nr));
+
+	heap_resort(iter, key_sort_cmp, NULL);
+
+	while (!bch2_btree_node_iter_large_end(iter)) {
+		if (!should_drop_next_key(iter, b)) {
+			struct bkey_packed *k =
+				__btree_node_offset_to_key(b, iter->data->k);
+
+			bkey_copy(out, k);
+			btree_keys_account_key_add(&nr, 0, out);
+			out = bkey_next(out);
+		}
+
+		sort_key_next(iter, b, iter->data);
+		heap_sift_down(iter, 0, key_sort_cmp, NULL);
+	}
+
+	dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+	return nr;
+}
+
+/*
+ * If keys compare equal, compare by pointer order:
+ *
+ * Necessary for sort_fix_overlapping() - if there are multiple keys that
+ * compare equal in different sets, we have to process them newest to oldest.
+ */
+#define extent_sort_cmp(h, l, r)					\
+({									\
+	struct bkey _ul = bkey_unpack_key(b,				\
+				__btree_node_offset_to_key(b, (l).k));	\
+	struct bkey _ur = bkey_unpack_key(b,				\
+				__btree_node_offset_to_key(b, (r).k));	\
+									\
+	bkey_cmp(bkey_start_pos(&_ul),					\
+		 bkey_start_pos(&_ur)) ?: (r).k - (l).k;		\
+})
+
+static inline void extent_sort_sift(struct btree_node_iter_large *iter,
+				    struct btree *b, size_t i)
+{
+	heap_sift_down(iter, i, extent_sort_cmp, NULL);
+}
+
+static inline void extent_sort_next(struct btree_node_iter_large *iter,
+				    struct btree *b,
+				    struct btree_node_iter_set *i)
+{
+	sort_key_next(iter, b, i);
+	heap_sift_down(iter, i - iter->data, extent_sort_cmp, NULL);
+}
+
+static void extent_sort_append(struct bch_fs *c,
+			       struct btree *b,
+			       struct btree_nr_keys *nr,
+			       struct bkey_packed *start,
+			       struct bkey_packed **prev,
+			       struct bkey_packed *k)
+{
+	struct bkey_format *f = &b->format;
+	BKEY_PADDED(k) tmp;
+
+	if (bkey_whiteout(k))
+		return;
+
+	bch2_bkey_unpack(b, &tmp.k, k);
+
+	if (*prev &&
+	    bch2_bkey_merge(c, (void *) *prev, &tmp.k))
+		return;
+
+	if (*prev) {
+		bch2_bkey_pack(*prev, (void *) *prev, f);
+
+		btree_keys_account_key_add(nr, 0, *prev);
+		*prev = bkey_next(*prev);
+	} else {
+		*prev = start;
+	}
+
+	bkey_copy(*prev, &tmp.k);
+}
+
+struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
+					struct bset *dst,
+					struct btree *b,
+					struct btree_node_iter_large *iter)
+{
+	struct bkey_format *f = &b->format;
+	struct btree_node_iter_set *_l = iter->data, *_r;
+	struct bkey_packed *prev = NULL, *out, *lk, *rk;
+	struct bkey l_unpacked, r_unpacked;
+	struct bkey_s l, r;
+	struct btree_nr_keys nr;
+
+	memset(&nr, 0, sizeof(nr));
+
+	heap_resort(iter, extent_sort_cmp, NULL);
+
+	while (!bch2_btree_node_iter_large_end(iter)) {
+		lk = __btree_node_offset_to_key(b, _l->k);
+
+		if (iter->used == 1) {
+			extent_sort_append(c, b, &nr, dst->start, &prev, lk);
+			extent_sort_next(iter, b, _l);
+			continue;
+		}
+
+		_r = iter->data + 1;
+		if (iter->used > 2 &&
+		    extent_sort_cmp(iter, _r[0], _r[1]) >= 0)
+			_r++;
+
+		rk = __btree_node_offset_to_key(b, _r->k);
+
+		l = __bkey_disassemble(b, lk, &l_unpacked);
+		r = __bkey_disassemble(b, rk, &r_unpacked);
+
+		/* If current key and next key don't overlap, just append */
+		if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
+			extent_sort_append(c, b, &nr, dst->start, &prev, lk);
+			extent_sort_next(iter, b, _l);
+			continue;
+		}
+
+		/* Skip 0 size keys */
+		if (!r.k->size) {
+			extent_sort_next(iter, b, _r);
+			continue;
+		}
+
+		/*
+		 * overlap: keep the newer key and trim the older key so they
+		 * don't overlap. comparing pointers tells us which one is
+		 * newer, since the bsets are appended one after the other.
+		 */
+
+		/* can't happen because of comparison func */
+		BUG_ON(_l->k < _r->k &&
+		       !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k)));
+
+		if (_l->k > _r->k) {
+			/* l wins, trim r */
+			if (bkey_cmp(l.k->p, r.k->p) >= 0) {
+				sort_key_next(iter, b, _r);
+			} else {
+				__bch2_cut_front(l.k->p, r);
+				extent_save(b, rk, r.k);
+			}
+
+			extent_sort_sift(iter, b, _r - iter->data);
+		} else if (bkey_cmp(l.k->p, r.k->p) > 0) {
+			BKEY_PADDED(k) tmp;
+
+			/*
+			 * r wins, but it overlaps in the middle of l - split l:
+			 */
+			bkey_reassemble(&tmp.k, l.s_c);
+			bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k);
+
+			__bch2_cut_front(r.k->p, l);
+			extent_save(b, lk, l.k);
+
+			extent_sort_sift(iter, b, 0);
+
+			extent_sort_append(c, b, &nr, dst->start, &prev,
+					   bkey_to_packed(&tmp.k));
+		} else {
+			bch2_cut_back(bkey_start_pos(r.k), l.k);
+			extent_save(b, lk, l.k);
+		}
+	}
+
+	if (prev) {
+		bch2_bkey_pack(prev, (void *) prev, f);
+		btree_keys_account_key_add(&nr, 0, prev);
+		out = bkey_next(prev);
+	} else {
+		out = dst->start;
+	}
+
+	dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+	return nr;
+}
+
+/* Sort + repack in a new format: */
+struct btree_nr_keys
+bch2_sort_repack(struct bset *dst, struct btree *src,
+		 struct btree_node_iter *src_iter,
+		 struct bkey_format *out_f,
+		 bool filter_whiteouts)
+{
+	struct bkey_format *in_f = &src->format;
+	struct bkey_packed *in, *out = vstruct_last(dst);
+	struct btree_nr_keys nr;
+
+	memset(&nr, 0, sizeof(nr));
+
+	while ((in = bch2_btree_node_iter_next_all(src_iter, src))) {
+		if (filter_whiteouts && bkey_whiteout(in))
+			continue;
+
+		if (bch2_bkey_transform(out_f, out, bkey_packed(in)
+				       ? in_f : &bch2_bkey_format_current, in))
+			out->format = KEY_FORMAT_LOCAL_BTREE;
+		else
+			bch2_bkey_unpack(src, (void *) out, in);
+
+		btree_keys_account_key_add(&nr, 0, out);
+		out = bkey_next(out);
+	}
+
+	dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+	return nr;
+}
+
+/* Sort, repack, and merge: */
+struct btree_nr_keys
+bch2_sort_repack_merge(struct bch_fs *c,
+		       struct bset *dst, struct btree *src,
+		       struct btree_node_iter *iter,
+		       struct bkey_format *out_f,
+		       bool filter_whiteouts)
+{
+	struct bkey_packed *k, *prev = NULL, *out;
+	struct btree_nr_keys nr;
+	BKEY_PADDED(k) tmp;
+
+	memset(&nr, 0, sizeof(nr));
+
+	while ((k = bch2_btree_node_iter_next_all(iter, src))) {
+		if (filter_whiteouts && bkey_whiteout(k))
+			continue;
+
+		/*
+		 * The filter might modify pointers, so we have to unpack the
+		 * key and values to &tmp.k:
+		 */
+		bch2_bkey_unpack(src, &tmp.k, k);
+
+		if (filter_whiteouts &&
+		    bch2_bkey_normalize(c, bkey_i_to_s(&tmp.k)))
+			continue;
+
+		/* prev is always unpacked, for key merging: */
+
+		if (prev &&
+		    bch2_bkey_merge(c, (void *) prev, &tmp.k) ==
+		    BCH_MERGE_MERGE)
+			continue;
+
+		/*
+		 * the current key becomes the new prev: advance prev, then
+		 * copy the current key - but first pack prev (in place):
+		 */
+		if (prev) {
+			bch2_bkey_pack(prev, (void *) prev, out_f);
+
+			btree_keys_account_key_add(&nr, 0, prev);
+			prev = bkey_next(prev);
+		} else {
+			prev = vstruct_last(dst);
+		}
+
+		bkey_copy(prev, &tmp.k);
+	}
+
+	if (prev) {
+		bch2_bkey_pack(prev, (void *) prev, out_f);
+		btree_keys_account_key_add(&nr, 0, prev);
+		out = bkey_next(prev);
+	} else {
+		out = vstruct_last(dst);
+	}
+
+	dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+	return nr;
+}
+
+static inline int sort_keys_cmp(struct btree *b,
+				struct bkey_packed *l,
+				struct bkey_packed *r)
+{
+	return bkey_cmp_packed(b, l, r) ?:
+		(int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?:
+		(int) l->needs_whiteout - (int) r->needs_whiteout;
+}
+
+unsigned bch2_sort_keys(struct bkey_packed *dst,
+			struct sort_iter *iter,
+			bool filter_whiteouts)
+{
+	const struct bkey_format *f = &iter->b->format;
+	struct bkey_packed *in, *next, *out = dst;
+
+	sort_iter_sort(iter, sort_keys_cmp);
+
+	while ((in = sort_iter_next(iter, sort_keys_cmp))) {
+		if (bkey_whiteout(in) &&
+		    (filter_whiteouts || !in->needs_whiteout))
+			continue;
+
+		if (bkey_whiteout(in) &&
+		    (next = sort_iter_peek(iter)) &&
+		    !bkey_cmp_packed(iter->b, in, next)) {
+			BUG_ON(in->needs_whiteout &&
+			       next->needs_whiteout);
+			/*
+			 * XXX racy, called with read lock from write path
+			 *
+			 * leads to spurious BUG_ON() in bkey_unpack_key() in
+			 * debug mode
+			 */
+			next->needs_whiteout |= in->needs_whiteout;
+			continue;
+		}
+
+		if (bkey_whiteout(in)) {
+			memcpy_u64s(out, in, bkeyp_key_u64s(f, in));
+			set_bkeyp_val_u64s(f, out, 0);
+		} else {
+			bkey_copy(out, in);
+		}
+		out = bkey_next(out);
+	}
+
+	return (u64 *) out - (u64 *) dst;
+}
+
+static inline int sort_extents_cmp(struct btree *b,
+				   struct bkey_packed *l,
+				   struct bkey_packed *r)
+{
+	return bkey_cmp_packed(b, l, r) ?:
+		(int) bkey_deleted(l) - (int) bkey_deleted(r);
+}
+
+unsigned bch2_sort_extents(struct bkey_packed *dst,
+			   struct sort_iter *iter,
+			   bool filter_whiteouts)
+{
+	struct bkey_packed *in, *out = dst;
+
+	sort_iter_sort(iter, sort_extents_cmp);
+
+	while ((in = sort_iter_next(iter, sort_extents_cmp))) {
+		if (bkey_deleted(in))
+			continue;
+
+		if (bkey_whiteout(in) &&
+		    (filter_whiteouts || !in->needs_whiteout))
+			continue;
+
+		bkey_copy(out, in);
+		out = bkey_next(out);
+	}
+
+	return (u64 *) out - (u64 *) dst;
+}
+
+static inline int sort_key_whiteouts_cmp(struct btree *b,
+					 struct bkey_packed *l,
+					 struct bkey_packed *r)
+{
+	return bkey_cmp_packed(b, l, r);
+}
+
+unsigned bch2_sort_key_whiteouts(struct bkey_packed *dst,
+				 struct sort_iter *iter)
+{
+	struct bkey_packed *in, *out = dst;
+
+	sort_iter_sort(iter, sort_key_whiteouts_cmp);
+
+	while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) {
+		bkey_copy(out, in);
+		out = bkey_next(out);
+	}
+
+	return (u64 *) out - (u64 *) dst;
+}
+
+static inline int sort_extent_whiteouts_cmp(struct btree *b,
+					    struct bkey_packed *l,
+					    struct bkey_packed *r)
+{
+	struct bkey ul = bkey_unpack_key(b, l);
+	struct bkey ur = bkey_unpack_key(b, r);
+
+	return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur));
+}
+
+unsigned bch2_sort_extent_whiteouts(struct bkey_packed *dst,
+				    struct sort_iter *iter)
+{
+	const struct bkey_format *f = &iter->b->format;
+	struct bkey_packed *in, *out = dst;
+	struct bkey_i l, r;
+	bool prev = false, l_packed = false;
+	u64 max_packed_size	= bkey_field_max(f, BKEY_FIELD_SIZE);
+	u64 max_packed_offset	= bkey_field_max(f, BKEY_FIELD_OFFSET);
+	u64 new_size;
+
+	max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX);
+
+	sort_iter_sort(iter, sort_extent_whiteouts_cmp);
+
+	while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) {
+		if (bkey_deleted(in))
+			continue;
+
+		EBUG_ON(bkeyp_val_u64s(f, in));
+		EBUG_ON(in->type != KEY_TYPE_discard);
+
+		r.k = bkey_unpack_key(iter->b, in);
+
+		if (prev &&
+		    bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) {
+			if (bkey_cmp(l.k.p, r.k.p) >= 0)
+				continue;
+
+			new_size = l_packed
+				? min(max_packed_size, max_packed_offset -
+				      bkey_start_offset(&l.k))
+				: KEY_SIZE_MAX;
+
+			new_size = min(new_size, r.k.p.offset -
+				       bkey_start_offset(&l.k));
+
+			BUG_ON(new_size < l.k.size);
+
+			bch2_key_resize(&l.k, new_size);
+
+			if (bkey_cmp(l.k.p, r.k.p) >= 0)
+				continue;
+
+			bch2_cut_front(l.k.p, &r);
+		}
+
+		if (prev) {
+			if (!bch2_bkey_pack(out, &l, f)) {
+				BUG_ON(l_packed);
+				bkey_copy(out, &l);
+			}
+			out = bkey_next(out);
+		}
+
+		l = r;
+		prev = true;
+		l_packed = bkey_packed(in);
+	}
+
+	if (prev) {
+		if (!bch2_bkey_pack(out, &l, f)) {
+			BUG_ON(l_packed);
+			bkey_copy(out, &l);
+		}
+		out = bkey_next(out);
+	}
+
+	return (u64 *) out - (u64 *) dst;
+}
diff --git a/libbcachefs/bkey_sort.h b/libbcachefs/bkey_sort.h
new file mode 100644
index 00000000..d189d814
--- /dev/null
+++ b/libbcachefs/bkey_sort.h
@@ -0,0 +1,68 @@
+#ifndef _BCACHEFS_BKEY_SORT_H
+#define _BCACHEFS_BKEY_SORT_H
+
+struct btree_node_iter_large {
+	u16		used;
+
+	struct btree_node_iter_set data[MAX_BSETS];
+};
+
+void bch2_btree_node_iter_large_push(struct btree_node_iter_large *,
+				     struct btree *,
+				     const struct bkey_packed *,
+				     const struct bkey_packed *);
+
+struct sort_iter {
+	struct btree	*b;
+	unsigned		used;
+
+	struct sort_iter_set {
+		struct bkey_packed *k, *end;
+	} data[MAX_BSETS + 1];
+};
+
+static inline void sort_iter_init(struct sort_iter *iter, struct btree *b)
+{
+	memset(iter, 0, sizeof(*iter));
+	iter->b = b;
+}
+
+static inline void sort_iter_add(struct sort_iter *iter,
+				 struct bkey_packed *k,
+				 struct bkey_packed *end)
+{
+	BUG_ON(iter->used >= ARRAY_SIZE(iter->data));
+
+	if (k != end)
+		iter->data[iter->used++] = (struct sort_iter_set) { k, end };
+}
+
+struct btree_nr_keys
+bch2_key_sort_fix_overlapping(struct bset *, struct btree *,
+			      struct btree_node_iter_large *);
+struct btree_nr_keys
+bch2_extent_sort_fix_overlapping(struct bch_fs *, struct bset *,
+				 struct btree *,
+				 struct btree_node_iter_large *);
+
+struct btree_nr_keys
+bch2_sort_repack(struct bset *, struct btree *,
+		 struct btree_node_iter *,
+		 struct bkey_format *, bool);
+struct btree_nr_keys
+bch2_sort_repack_merge(struct bch_fs *,
+		       struct bset *, struct btree *,
+		       struct btree_node_iter *,
+		       struct bkey_format *, bool);
+
+unsigned bch2_sort_keys(struct bkey_packed *,
+			struct sort_iter *, bool);
+unsigned bch2_sort_extents(struct bkey_packed *,
+			   struct sort_iter *, bool);
+
+unsigned bch2_sort_key_whiteouts(struct bkey_packed *,
+				 struct sort_iter *);
+unsigned bch2_sort_extent_whiteouts(struct bkey_packed *,
+				    struct sort_iter *);
+
+#endif /* _BCACHEFS_BKEY_SORT_H */
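
The sort_iter/bch2_sort_keys pair factored out into this new file is what btree node sorts are built on: gather each bset's live range into the iterator, then stream out a single merged, whiteout-filtered run of keys. A minimal illustrative caller for a non-extent node (extent nodes would use bch2_sort_extents()); the helper is hypothetical, not an actual call site from this patch:

static unsigned sort_node_keys(struct btree *b, struct bkey_packed *dst)
{
	struct sort_iter iter;
	struct bset_tree *t;

	sort_iter_init(&iter, b);

	/* one range per bset in the node */
	for_each_bset(b, t) {
		struct bset *i = bset(b, t);

		sort_iter_add(&iter, i->start, vstruct_last(i));
	}

	/* merge the runs, dropping whiteouts as we go */
	return bch2_sort_keys(dst, &iter, true);
}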
diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h
index a27c8a29..8bc2fdfd 100644
--- a/libbcachefs/bset.h
+++ b/libbcachefs/bset.h
@@ -381,7 +381,7 @@ bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
 static inline struct bkey_packed *
 bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
 {
-	return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_DISCARD + 1);
+	return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_discard + 1);
 }
 
 enum bch_extent_overlap {
@@ -513,7 +513,7 @@ bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
 static inline struct bkey_packed *
 bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b)
 {
-	return bch2_btree_node_iter_peek_filter(iter, b, KEY_TYPE_DISCARD + 1);
+	return bch2_btree_node_iter_peek_filter(iter, b, KEY_TYPE_discard + 1);
 }
 
 static inline struct bkey_packed *
@@ -539,7 +539,7 @@ bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct btree *b)
 static inline struct bkey_packed *
 bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b)
 {
-	return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_DISCARD + 1);
+	return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_discard + 1);
 }
 
 struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 28ac862b..d99441a1 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -5,20 +5,17 @@
 #include "btree_iter.h"
 #include "btree_locking.h"
 #include "debug.h"
-#include "extents.h"
 
 #include <linux/prefetch.h>
 #include <trace/events/bcachefs.h>
 
-#define DEF_BTREE_ID(kwd, val, name) name,
-
 const char * const bch2_btree_ids[] = {
-	DEFINE_BCH_BTREE_IDS()
+#define x(kwd, val, name) name,
+	BCH_BTREE_IDS()
+#undef x
 	NULL
 };
 
-#undef DEF_BTREE_ID
-
 void bch2_recalc_btree_reserve(struct bch_fs *c)
 {
 	unsigned i, reserve = 16;
@@ -99,7 +96,7 @@ static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
 	if (!b)
 		return NULL;
 
-	bkey_extent_init(&b->key);
+	bkey_btree_ptr_init(&b->key);
 	six_lock_init(&b->lock);
 	INIT_LIST_HEAD(&b->list);
 	INIT_LIST_HEAD(&b->write_blocked);
@@ -115,7 +112,7 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
 	rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
 
 	/* Cause future lookups for this node to fail: */
-	bkey_i_to_extent(&b->key)->v._data[0] = 0;
+	PTR_HASH(&b->key) = 0;
 }
 
 int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
@@ -602,7 +599,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
 		/* raced with another fill: */
 
 		/* mark as unhashed... */
-		bkey_i_to_extent(&b->key)->v._data[0] = 0;
+		PTR_HASH(&b->key) = 0;
 
 		mutex_lock(&bc->lock);
 		list_add(&b->list, &bc->freeable);
@@ -904,8 +901,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
 	       b->data->min_key.offset,
 	       b->data->max_key.inode,
 	       b->data->max_key.offset);
-	bch2_val_to_text(out, c, BKEY_TYPE_BTREE,
-			 bkey_i_to_s_c(&b->key));
+	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
 	pr_buf(out, "\n"
 	       "    format: u64s %u fields %u %u %u %u %u\n"
 	       "    unpack fn len: %u\n"
diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h
index 399f8b9a..08e6f2a6 100644
--- a/libbcachefs/btree_cache.h
+++ b/libbcachefs/btree_cache.h
@@ -3,7 +3,6 @@
 
 #include "bcachefs.h"
 #include "btree_types.h"
-#include "extents.h"
 
 struct btree_iter;
 
@@ -36,12 +35,13 @@ void bch2_fs_btree_cache_exit(struct bch_fs *);
 int bch2_fs_btree_cache_init(struct bch_fs *);
 void bch2_fs_btree_cache_init_early(struct btree_cache *);
 
-#define PTR_HASH(_k)	(bkey_i_to_extent_c(_k)->v._data[0])
+#define PTR_HASH(_k)	*((u64 *) &bkey_i_to_btree_ptr_c(_k)->v)
 
 /* is btree node in hash table? */
 static inline bool btree_node_hashed(struct btree *b)
 {
-	return bkey_extent_is_data(&b->key.k) && PTR_HASH(&b->key);
+	return b->key.k.type == KEY_TYPE_btree_ptr &&
+		PTR_HASH(&b->key);
 }
 
 #define for_each_cached_btree(_b, _c, _tbl, _iter, _pos)		\
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 9fe438d0..c30d1f7b 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -109,152 +109,11 @@ static void btree_node_range_checks(struct bch_fs *c, struct btree *b,
 
 /* marking of btree keys/nodes: */
 
-static bool bkey_type_needs_gc(enum bkey_type type)
-{
-	switch (type) {
-	case BKEY_TYPE_BTREE:
-	case BKEY_TYPE_EXTENTS:
-	case BKEY_TYPE_EC:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static void ptr_gen_recalc_oldest(struct bch_fs *c,
-				  const struct bch_extent_ptr *ptr,
-				  u8 *max_stale)
-{
-	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-	size_t b = PTR_BUCKET_NR(ca, ptr);
-
-	if (gen_after(ca->oldest_gens[b], ptr->gen))
-		ca->oldest_gens[b] = ptr->gen;
-
-	*max_stale = max(*max_stale, ptr_stale(ca, ptr));
-}
-
-static u8 ptr_gens_recalc_oldest(struct bch_fs *c,
-				 enum bkey_type type,
-				 struct bkey_s_c k)
+static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
+			    u8 *max_stale, bool initial)
 {
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 	const struct bch_extent_ptr *ptr;
-	u8 max_stale = 0;
-
-	switch (type) {
-	case BKEY_TYPE_BTREE:
-	case BKEY_TYPE_EXTENTS:
-		switch (k.k->type) {
-		case BCH_EXTENT:
-		case BCH_EXTENT_CACHED: {
-			struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-
-			extent_for_each_ptr(e, ptr)
-				ptr_gen_recalc_oldest(c, ptr, &max_stale);
-			break;
-		}
-		}
-		break;
-	case BKEY_TYPE_EC:
-		switch (k.k->type) {
-		case BCH_STRIPE: {
-			struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-
-			for (ptr = s.v->ptrs;
-			     ptr < s.v->ptrs + s.v->nr_blocks;
-			     ptr++)
-				ptr_gen_recalc_oldest(c, ptr, &max_stale);
-		}
-		}
-	default:
-		break;
-	}
-
-	return max_stale;
-}
-
-static int ptr_gen_check(struct bch_fs *c,
-			 enum bkey_type type,
-			 const struct bch_extent_ptr *ptr)
-{
-	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-	size_t b = PTR_BUCKET_NR(ca, ptr);
-	struct bucket *g = PTR_BUCKET(ca, ptr);
-	int ret = 0;
-
-	if (mustfix_fsck_err_on(!g->mark.gen_valid, c,
-				"found ptr with missing gen in alloc btree,\n"
-				"type %u gen %u",
-				type, ptr->gen)) {
-		g->_mark.gen = ptr->gen;
-		g->_mark.gen_valid = 1;
-		set_bit(b, ca->buckets_dirty);
-	}
-
-	if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
-				"%u ptr gen in the future: %u > %u",
-				type, ptr->gen, g->mark.gen)) {
-		g->_mark.gen = ptr->gen;
-		g->_mark.gen_valid = 1;
-		set_bit(b, ca->buckets_dirty);
-		set_bit(BCH_FS_FIXED_GENS, &c->flags);
-	}
-fsck_err:
-	return ret;
-}
-
-static int ptr_gens_check(struct bch_fs *c, enum bkey_type type,
-			  struct bkey_s_c k)
-{
-	const struct bch_extent_ptr *ptr;
-	int ret = 0;
-
-	switch (type) {
-	case BKEY_TYPE_BTREE:
-	case BKEY_TYPE_EXTENTS:
-		switch (k.k->type) {
-		case BCH_EXTENT:
-		case BCH_EXTENT_CACHED: {
-			struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-
-			extent_for_each_ptr(e, ptr) {
-				ret = ptr_gen_check(c, type, ptr);
-				if (ret)
-					return ret;
-
-			}
-			break;
-		}
-		}
-		break;
-	case BKEY_TYPE_EC:
-		switch (k.k->type) {
-		case BCH_STRIPE: {
-			struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-
-			for (ptr = s.v->ptrs;
-			     ptr < s.v->ptrs + s.v->nr_blocks;
-			     ptr++) {
-				ret = ptr_gen_check(c, type, ptr);
-				if (ret)
-					return ret;
-			}
-		}
-		}
-		break;
-	default:
-		break;
-	}
-
-	return ret;
-}
-
-/*
- * For runtime mark and sweep:
- */
-static int bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
-			    struct bkey_s_c k, bool initial)
-{
 	struct gc_pos pos = { 0 };
 	unsigned flags =
 		BCH_BUCKET_MARK_GC|
@@ -269,52 +128,77 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
 			atomic64_set(&c->key_version, k.k->version.lo);
 
 		if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
-		    fsck_err_on(!bch2_bkey_replicas_marked(c, type, k,
-							   false), c,
+		    fsck_err_on(!bch2_bkey_replicas_marked(c, k, false), c,
 				"superblock not marked as containing replicas (type %u)",
-				type)) {
-			ret = bch2_mark_bkey_replicas(c, type, k);
+				k.k->type)) {
+			ret = bch2_mark_bkey_replicas(c, k);
 			if (ret)
 				return ret;
 		}
 
-		ret = ptr_gens_check(c, type, k);
-		if (ret)
-			return ret;
+		bkey_for_each_ptr(ptrs, ptr) {
+			struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+			size_t b = PTR_BUCKET_NR(ca, ptr);
+			struct bucket *g = PTR_BUCKET(ca, ptr);
+
+			if (mustfix_fsck_err_on(!g->mark.gen_valid, c,
+					"found ptr with missing gen in alloc btree,\n"
+					"type %u gen %u",
+					k.k->type, ptr->gen)) {
+				g->_mark.gen = ptr->gen;
+				g->_mark.gen_valid = 1;
+				set_bit(b, ca->buckets_dirty);
+			}
+
+			if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
+					"%u ptr gen in the future: %u > %u",
+					k.k->type, ptr->gen, g->mark.gen)) {
+				g->_mark.gen = ptr->gen;
+				g->_mark.gen_valid = 1;
+				set_bit(b, ca->buckets_dirty);
+				set_bit(BCH_FS_FIXED_GENS, &c->flags);
+			}
+		}
 	}
 
-	bch2_mark_key(c, type, k, true, k.k->size, pos, NULL, 0, flags);
+	bkey_for_each_ptr(ptrs, ptr) {
+		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+		size_t b = PTR_BUCKET_NR(ca, ptr);
 
-	ret = ptr_gens_recalc_oldest(c, type, k);
+		if (gen_after(ca->oldest_gens[b], ptr->gen))
+			ca->oldest_gens[b] = ptr->gen;
+
+		*max_stale = max(*max_stale, ptr_stale(ca, ptr));
+	}
+
+	bch2_mark_key(c, k, true, k.k->size, pos, NULL, 0, flags);
 fsck_err:
 	return ret;
 }
 
 static int btree_gc_mark_node(struct bch_fs *c, struct btree *b,
-			      bool initial)
+			      u8 *max_stale, bool initial)
 {
-	enum bkey_type type = btree_node_type(b);
 	struct btree_node_iter iter;
 	struct bkey unpacked;
 	struct bkey_s_c k;
-	u8 stale = 0;
-	int ret;
+	int ret = 0;
 
-	if (!bkey_type_needs_gc(type))
+	*max_stale = 0;
+
+	if (!btree_node_type_needs_gc(btree_node_type(b)))
 		return 0;
 
 	for_each_btree_node_key_unpack(b, k, &iter,
 				       &unpacked) {
 		bch2_bkey_debugcheck(c, b, k);
 
-		ret = bch2_gc_mark_key(c, type, k, initial);
-		if (ret < 0)
-			return ret;
-
-		stale = max_t(u8, stale, ret);
+		ret = bch2_gc_mark_key(c, k, max_stale, initial);
+		if (ret)
+			break;
 	}
 
-	return stale;
+	return ret;
 }
 
 static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
@@ -323,15 +207,12 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
 	struct btree_iter iter;
 	struct btree *b;
 	struct range_checks r;
-	unsigned depth = bkey_type_needs_gc(btree_id) ? 0 : 1;
-	unsigned max_stale;
+	unsigned depth = btree_node_type_needs_gc(btree_id) ? 0 : 1;
+	u8 max_stale;
 	int ret = 0;
 
 	gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
 
-	if (!c->btree_roots[btree_id].b)
-		return 0;
-
 	/*
 	 * if expensive_debug_checks is on, run range_checks on all leaf nodes:
 	 *
@@ -349,7 +230,9 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
 
 		bch2_verify_btree_nr_keys(b);
 
-		max_stale = btree_gc_mark_node(c, b, initial);
+		ret = btree_gc_mark_node(c, b, &max_stale, initial);
+		if (ret)
+			break;
 
 		gc_pos_set(c, gc_pos_btree_node(b));
 
@@ -370,7 +253,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
 
 		bch2_btree_iter_cond_resched(&iter);
 	}
-	ret = bch2_btree_iter_unlock(&iter);
+	ret = bch2_btree_iter_unlock(&iter) ?: ret;
 	if (ret)
 		return ret;
 
@@ -378,8 +261,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
 
 	b = c->btree_roots[btree_id].b;
 	if (!btree_node_fake(b))
-		bch2_gc_mark_key(c, BKEY_TYPE_BTREE,
-				 bkey_i_to_s_c(&b->key), initial);
+		bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
+				 &max_stale, initial);
 	gc_pos_set(c, gc_pos_btree_root(b->btree_id));
 
 	mutex_unlock(&c->btree_root_lock);
@@ -396,6 +279,7 @@ static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal,
 			  bool initial)
 {
 	enum btree_id ids[BTREE_ID_NR];
+	u8 max_stale;
 	unsigned i;
 
 	for (i = 0; i < BTREE_ID_NR; i++)
@@ -404,13 +288,13 @@ static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal,
 
 	for (i = 0; i < BTREE_ID_NR; i++) {
 		enum btree_id id = ids[i];
-		enum bkey_type type = bkey_type(0, id);
+		enum btree_node_type type = __btree_node_type(0, id);
 
 		int ret = bch2_gc_btree(c, id, initial);
 		if (ret)
 			return ret;
 
-		if (journal && bkey_type_needs_gc(type)) {
+		if (journal && btree_node_type_needs_gc(type)) {
 			struct bkey_i *k, *n;
 			struct jset_entry *j;
 			struct journal_replay *r;
@@ -418,10 +302,11 @@ static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal,
 
 			list_for_each_entry(r, journal, list)
 				for_each_jset_key(k, n, j, &r->j) {
-					if (type == bkey_type(j->level, j->btree_id)) {
-						ret = bch2_gc_mark_key(c, type,
-							bkey_i_to_s_c(k), initial);
-						if (ret < 0)
+					if (type == __btree_node_type(j->level, j->btree_id)) {
+						ret = bch2_gc_mark_key(c,
+							bkey_i_to_s_c(k),
+							&max_stale, initial);
+						if (ret)
 							return ret;
 					}
 				}
@@ -519,8 +404,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
 
 	for_each_pending_btree_node_free(c, as, d)
 		if (d->index_update_done)
-			bch2_mark_key(c, BKEY_TYPE_BTREE,
-				      bkey_i_to_s_c(&d->key),
+			bch2_mark_key(c, bkey_i_to_s_c(&d->key),
 				      true, 0,
 				      pos, NULL, 0,
 				      BCH_BUCKET_MARK_GC);
@@ -579,6 +463,8 @@ static void bch2_gc_free(struct bch_fs *c)
 	struct bch_dev *ca;
 	unsigned i;
 
+	genradix_free(&c->stripes[1]);
+
 	for_each_member_device(ca, c, i) {
 		kvpfree(rcu_dereference_protected(ca->buckets[1], 1),
 			sizeof(struct bucket_array) +
@@ -599,6 +485,25 @@ static void bch2_gc_done_nocheck(struct bch_fs *c)
 	unsigned i;
 	int cpu;
 
+	{
+		struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
+		struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
+		struct stripe *dst, *src;
+
+		c->ec_stripes_heap.used = 0;
+
+		while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) &&
+		       (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) {
+			*dst = *src;
+
+			if (dst->alive)
+				bch2_stripes_heap_insert(c, dst, dst_iter.pos);
+
+			genradix_iter_advance(&dst_iter, &c->stripes[0]);
+			genradix_iter_advance(&src_iter, &c->stripes[1]);
+		}
+	}
+
 	for_each_member_device(ca, c, i) {
 		struct bucket_array *src = __bucket_array(ca, 1);
 
@@ -646,13 +551,21 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
 
 #define copy_field(_f, _msg, ...)					\
 	if (dst._f != src._f) {						\
-		pr_info(_msg ": got %llu, should be %llu, fixing"	\
+		bch_err(c, _msg ": got %llu, should be %llu, fixing"\
 			, ##__VA_ARGS__, dst._f, src._f);		\
 		dst._f = src._f;					\
 	}
+#define copy_stripe_field(_f, _msg, ...)				\
+	if (dst->_f != src->_f) {					\
+		bch_err_ratelimited(c, "stripe %zu has wrong "_msg	\
+			": got %u, should be %u, fixing",		\
+			dst_iter.pos, ##__VA_ARGS__,			\
+			dst->_f, src->_f);				\
+		dst->_f = src->_f;					\
+	}
 #define copy_bucket_field(_f)						\
 	if (dst->b[b].mark._f != src->b[b].mark._f) {			\
-		pr_info("dev %u bucket %zu has wrong " #_f		\
+		bch_err_ratelimited(c, "dev %u bucket %zu has wrong " #_f\
 			": got %u, should be %u, fixing",		\
 			i, b, dst->b[b].mark._f, src->b[b].mark._f);	\
 		dst->b[b]._mark._f = src->b[b].mark._f;			\
@@ -669,6 +582,36 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
 		goto out;
 	}
 
+	{
+		struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
+		struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
+		struct stripe *dst, *src;
+		unsigned i;
+
+		c->ec_stripes_heap.used = 0;
+
+		while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) &&
+		       (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) {
+			copy_stripe_field(alive,	"alive");
+			copy_stripe_field(sectors,	"sectors");
+			copy_stripe_field(algorithm,	"algorithm");
+			copy_stripe_field(nr_blocks,	"nr_blocks");
+			copy_stripe_field(nr_redundant,	"nr_redundant");
+			copy_stripe_field(blocks_nonempty.counter,
+					  "blocks_nonempty");
+
+			for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++)
+				copy_stripe_field(block_sectors[i].counter,
+						  "block_sectors[%u]", i);
+
+			if (dst->alive)
+				bch2_stripes_heap_insert(c, dst, dst_iter.pos);
+
+			genradix_iter_advance(&dst_iter, &c->stripes[0]);
+			genradix_iter_advance(&src_iter, &c->stripes[1]);
+		}
+	}
+
 	for_each_member_device(ca, c, i) {
 		struct bucket_array *dst = __bucket_array(ca, 0);
 		struct bucket_array *src = __bucket_array(ca, 1);
@@ -753,10 +696,11 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
 out:
 	percpu_up_write(&c->usage_lock);
 
-#undef copy_field
 #undef copy_fs_field
 #undef copy_dev_field
 #undef copy_bucket_field
+#undef copy_stripe_field
+#undef copy_field
 }
 
 static int bch2_gc_start(struct bch_fs *c)
@@ -764,6 +708,12 @@ static int bch2_gc_start(struct bch_fs *c)
 	struct bch_dev *ca;
 	unsigned i;
 
+	/*
+	 * indicate to stripe code that we need to allocate for the gc stripes
+	 * radix tree, too
+	 */
+	gc_pos_set(c, gc_phase(GC_PHASE_START));
+
 	BUG_ON(c->usage[1]);
 
 	c->usage[1] = alloc_percpu(struct bch_fs_usage);
@@ -805,7 +755,7 @@ static int bch2_gc_start(struct bch_fs *c)
 
 	percpu_up_write(&c->usage_lock);
 
-	return 0;
+	return bch2_ec_mem_alloc(c, true);
 }
 
 /**
@@ -870,7 +820,7 @@ out:
 		bch2_gc_done(c, initial);
 
 	/* Indicates that gc is no longer in progress: */
-	__gc_pos_set(c, gc_phase(GC_PHASE_START));
+	__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
 
 	bch2_gc_free(c);
 	up_write(&c->gc_lock);
@@ -1110,7 +1060,6 @@ next:
 	/* Free the old nodes and update our sliding window */
 	for (i = 0; i < nr_old_nodes; i++) {
 		bch2_btree_node_free_inmem(c, old_nodes[i], iter);
-		six_unlock_intent(&old_nodes[i]->lock);
 
 		/*
 		 * the index update might have triggered a split, in which case
diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h
index d7809c2e..8af5f841 100644
--- a/libbcachefs/btree_gc.h
+++ b/libbcachefs/btree_gc.h
@@ -3,8 +3,6 @@
 
 #include "btree_types.h"
 
-enum bkey_type;
-
 void bch2_coalesce(struct bch_fs *);
 int bch2_gc(struct bch_fs *, struct list_head *, bool);
 void bch2_gc_thread_stop(struct bch_fs *);
@@ -57,9 +55,9 @@ static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
 static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id)
 {
 	switch (id) {
-#define DEF_BTREE_ID(n, v, s) case BTREE_ID_##n: return GC_PHASE_BTREE_##n;
-	DEFINE_BCH_BTREE_IDS()
-#undef DEF_BTREE_ID
+#define x(n, v, s) case BTREE_ID_##n: return GC_PHASE_BTREE_##n;
+	BCH_BTREE_IDS()
+#undef x
 	default:
 		BUG();
 	}
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index a4da9791..231ace4f 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1,6 +1,7 @@
 
 #include "bcachefs.h"
 #include "bkey_methods.h"
+#include "bkey_sort.h"
 #include "btree_cache.h"
 #include "btree_io.h"
 #include "btree_iter.h"
@@ -19,40 +20,6 @@
 
 #include <trace/events/bcachefs.h>
 
-/* btree_node_iter_large: */
-
-#define btree_node_iter_cmp_heap(h, _l, _r) btree_node_iter_cmp(b, _l, _r)
-
-void bch2_btree_node_iter_large_push(struct btree_node_iter_large *iter,
-				     struct btree *b,
-				     const struct bkey_packed *k,
-				     const struct bkey_packed *end)
-{
-	if (k != end) {
-		struct btree_node_iter_set n =
-			((struct btree_node_iter_set) {
-				 __btree_node_key_to_offset(b, k),
-				 __btree_node_key_to_offset(b, end)
-			 });
-
-		__heap_add(iter, n, btree_node_iter_cmp_heap, NULL);
-	}
-}
-
-void bch2_btree_node_iter_large_advance(struct btree_node_iter_large *iter,
-					struct btree *b)
-{
-	iter->data->k += __btree_node_offset_to_key(b, iter->data->k)->u64s;
-
-	EBUG_ON(!iter->used);
-	EBUG_ON(iter->data->k > iter->data->end);
-
-	if (iter->data->k == iter->data->end)
-		heap_del(iter, 0, btree_node_iter_cmp_heap, NULL);
-	else
-		heap_sift_down(iter, 0, btree_node_iter_cmp_heap, NULL);
-}
-
 static void verify_no_dups(struct btree *b,
 			   struct bkey_packed *start,
 			   struct bkey_packed *end)
@@ -113,193 +80,6 @@ static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
 	return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO);
 }
 
-typedef int (*sort_cmp_fn)(struct btree *,
-			   struct bkey_packed *,
-			   struct bkey_packed *);
-
-struct sort_iter {
-	struct btree	*b;
-	unsigned		used;
-
-	struct sort_iter_set {
-		struct bkey_packed *k, *end;
-	} data[MAX_BSETS + 1];
-};
-
-static void sort_iter_init(struct sort_iter *iter, struct btree *b)
-{
-	memset(iter, 0, sizeof(*iter));
-	iter->b = b;
-}
-
-static inline void __sort_iter_sift(struct sort_iter *iter,
-				    unsigned from,
-				    sort_cmp_fn cmp)
-{
-	unsigned i;
-
-	for (i = from;
-	     i + 1 < iter->used &&
-	     cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0;
-	     i++)
-		swap(iter->data[i], iter->data[i + 1]);
-}
-
-static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp)
-{
-
-	__sort_iter_sift(iter, 0, cmp);
-}
-
-static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
-{
-	unsigned i = iter->used;
-
-	while (i--)
-		__sort_iter_sift(iter, i, cmp);
-}
-
-static void sort_iter_add(struct sort_iter *iter,
-			  struct bkey_packed *k,
-			  struct bkey_packed *end)
-{
-	BUG_ON(iter->used >= ARRAY_SIZE(iter->data));
-
-	if (k != end)
-		iter->data[iter->used++] = (struct sort_iter_set) { k, end };
-}
-
-static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
-{
-	return iter->used ? iter->data->k : NULL;
-}
-
-static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
-{
-	iter->data->k = bkey_next(iter->data->k);
-
-	BUG_ON(iter->data->k > iter->data->end);
-
-	if (iter->data->k == iter->data->end)
-		array_remove_item(iter->data, iter->used, 0);
-	else
-		sort_iter_sift(iter, cmp);
-}
-
-static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
-						 sort_cmp_fn cmp)
-{
-	struct bkey_packed *ret = sort_iter_peek(iter);
-
-	if (ret)
-		sort_iter_advance(iter, cmp);
-
-	return ret;
-}
-
-static inline int sort_key_whiteouts_cmp(struct btree *b,
-					 struct bkey_packed *l,
-					 struct bkey_packed *r)
-{
-	return bkey_cmp_packed(b, l, r);
-}
-
-static unsigned sort_key_whiteouts(struct bkey_packed *dst,
-				   struct sort_iter *iter)
-{
-	struct bkey_packed *in, *out = dst;
-
-	sort_iter_sort(iter, sort_key_whiteouts_cmp);
-
-	while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) {
-		bkey_copy(out, in);
-		out = bkey_next(out);
-	}
-
-	return (u64 *) out - (u64 *) dst;
-}
-
-static inline int sort_extent_whiteouts_cmp(struct btree *b,
-					    struct bkey_packed *l,
-					    struct bkey_packed *r)
-{
-	struct bkey ul = bkey_unpack_key(b, l);
-	struct bkey ur = bkey_unpack_key(b, r);
-
-	return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur));
-}
-
-static unsigned sort_extent_whiteouts(struct bkey_packed *dst,
-				      struct sort_iter *iter)
-{
-	const struct bkey_format *f = &iter->b->format;
-	struct bkey_packed *in, *out = dst;
-	struct bkey_i l, r;
-	bool prev = false, l_packed = false;
-	u64 max_packed_size	= bkey_field_max(f, BKEY_FIELD_SIZE);
-	u64 max_packed_offset	= bkey_field_max(f, BKEY_FIELD_OFFSET);
-	u64 new_size;
-
-	max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX);
-
-	sort_iter_sort(iter, sort_extent_whiteouts_cmp);
-
-	while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) {
-		if (bkey_deleted(in))
-			continue;
-
-		EBUG_ON(bkeyp_val_u64s(f, in));
-		EBUG_ON(in->type != KEY_TYPE_DISCARD);
-
-		r.k = bkey_unpack_key(iter->b, in);
-
-		if (prev &&
-		    bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) {
-			if (bkey_cmp(l.k.p, r.k.p) >= 0)
-				continue;
-
-			new_size = l_packed
-				? min(max_packed_size, max_packed_offset -
-				      bkey_start_offset(&l.k))
-				: KEY_SIZE_MAX;
-
-			new_size = min(new_size, r.k.p.offset -
-				       bkey_start_offset(&l.k));
-
-			BUG_ON(new_size < l.k.size);
-
-			bch2_key_resize(&l.k, new_size);
-
-			if (bkey_cmp(l.k.p, r.k.p) >= 0)
-				continue;
-
-			bch2_cut_front(l.k.p, &r);
-		}
-
-		if (prev) {
-			if (!bch2_bkey_pack(out, &l, f)) {
-				BUG_ON(l_packed);
-				bkey_copy(out, &l);
-			}
-			out = bkey_next(out);
-		}
-
-		l = r;
-		prev = true;
-		l_packed = bkey_packed(in);
-	}
-
-	if (prev) {
-		if (!bch2_bkey_pack(out, &l, f)) {
-			BUG_ON(l_packed);
-			bkey_copy(out, &l);
-		}
-		out = bkey_next(out);
-	}
-
-	return (u64 *) out - (u64 *) dst;
-}
-
 static unsigned should_compact_bset(struct btree *b, struct bset_tree *t,
 				    bool compacting,
 				    enum compact_mode mode)
@@ -420,11 +200,10 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
 	BUG_ON((void *) unwritten_whiteouts_start(c, b) <
 	       (void *) btree_bkey_last(b, bset_tree_last(b)));
 
-	u64s = btree_node_is_extents(b)
-		? sort_extent_whiteouts(unwritten_whiteouts_start(c, b),
-					&sort_iter)
-		: sort_key_whiteouts(unwritten_whiteouts_start(c, b),
-				     &sort_iter);
+	u64s = (btree_node_is_extents(b)
+		? bch2_sort_extent_whiteouts
+		: bch2_sort_key_whiteouts)(unwritten_whiteouts_start(c, b),
+					   &sort_iter);
 
 	BUG_ON(u64s > b->whiteout_u64s);
 	BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b));
@@ -499,87 +278,6 @@ static bool bch2_drop_whiteouts(struct btree *b)
 	return ret;
 }
 
-static inline int sort_keys_cmp(struct btree *b,
-				struct bkey_packed *l,
-				struct bkey_packed *r)
-{
-	return bkey_cmp_packed(b, l, r) ?:
-		(int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?:
-		(int) l->needs_whiteout - (int) r->needs_whiteout;
-}
-
-static unsigned sort_keys(struct bkey_packed *dst,
-			  struct sort_iter *iter,
-			  bool filter_whiteouts)
-{
-	const struct bkey_format *f = &iter->b->format;
-	struct bkey_packed *in, *next, *out = dst;
-
-	sort_iter_sort(iter, sort_keys_cmp);
-
-	while ((in = sort_iter_next(iter, sort_keys_cmp))) {
-		if (bkey_whiteout(in) &&
-		    (filter_whiteouts || !in->needs_whiteout))
-			continue;
-
-		if (bkey_whiteout(in) &&
-		    (next = sort_iter_peek(iter)) &&
-		    !bkey_cmp_packed(iter->b, in, next)) {
-			BUG_ON(in->needs_whiteout &&
-			       next->needs_whiteout);
-			/*
-			 * XXX racy, called with read lock from write path
-			 *
-			 * leads to spurious BUG_ON() in bkey_unpack_key() in
-			 * debug mode
-			 */
-			next->needs_whiteout |= in->needs_whiteout;
-			continue;
-		}
-
-		if (bkey_whiteout(in)) {
-			memcpy_u64s(out, in, bkeyp_key_u64s(f, in));
-			set_bkeyp_val_u64s(f, out, 0);
-		} else {
-			bkey_copy(out, in);
-		}
-		out = bkey_next(out);
-	}
-
-	return (u64 *) out - (u64 *) dst;
-}
-
-static inline int sort_extents_cmp(struct btree *b,
-				   struct bkey_packed *l,
-				   struct bkey_packed *r)
-{
-	return bkey_cmp_packed(b, l, r) ?:
-		(int) bkey_deleted(l) - (int) bkey_deleted(r);
-}
-
-static unsigned sort_extents(struct bkey_packed *dst,
-			     struct sort_iter *iter,
-			     bool filter_whiteouts)
-{
-	struct bkey_packed *in, *out = dst;
-
-	sort_iter_sort(iter, sort_extents_cmp);
-
-	while ((in = sort_iter_next(iter, sort_extents_cmp))) {
-		if (bkey_deleted(in))
-			continue;
-
-		if (bkey_whiteout(in) &&
-		    (filter_whiteouts || !in->needs_whiteout))
-			continue;
-
-		bkey_copy(out, in);
-		out = bkey_next(out);
-	}
-
-	return (u64 *) out - (u64 *) dst;
-}
-
 static void btree_node_sort(struct bch_fs *c, struct btree *b,
 			    struct btree_iter *iter,
 			    unsigned start_idx,
@@ -618,9 +316,11 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
 	if (btree_node_is_extents(b))
 		filter_whiteouts = bset_written(b, start_bset);
 
-	u64s = btree_node_is_extents(b)
-		? sort_extents(out->keys.start, &sort_iter, filter_whiteouts)
-		: sort_keys(out->keys.start, &sort_iter, filter_whiteouts);
+	u64s = (btree_node_is_extents(b)
+		? bch2_sort_extents
+		: bch2_sort_keys)(out->keys.start,
+				  &sort_iter,
+				  filter_whiteouts);
 
 	out->keys.u64s = cpu_to_le16(u64s);
 
@@ -678,101 +378,6 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
 	bch2_verify_btree_nr_keys(b);
 }
 
-/* Sort + repack in a new format: */
-static struct btree_nr_keys sort_repack(struct bset *dst,
-					struct btree *src,
-					struct btree_node_iter *src_iter,
-					struct bkey_format *out_f,
-					bool filter_whiteouts)
-{
-	struct bkey_format *in_f = &src->format;
-	struct bkey_packed *in, *out = vstruct_last(dst);
-	struct btree_nr_keys nr;
-
-	memset(&nr, 0, sizeof(nr));
-
-	while ((in = bch2_btree_node_iter_next_all(src_iter, src))) {
-		if (filter_whiteouts && bkey_whiteout(in))
-			continue;
-
-		if (bch2_bkey_transform(out_f, out, bkey_packed(in)
-				       ? in_f : &bch2_bkey_format_current, in))
-			out->format = KEY_FORMAT_LOCAL_BTREE;
-		else
-			bch2_bkey_unpack(src, (void *) out, in);
-
-		btree_keys_account_key_add(&nr, 0, out);
-		out = bkey_next(out);
-	}
-
-	dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
-	return nr;
-}
-
-/* Sort, repack, and merge: */
-static struct btree_nr_keys sort_repack_merge(struct bch_fs *c,
-					      struct bset *dst,
-					      struct btree *src,
-					      struct btree_node_iter *iter,
-					      struct bkey_format *out_f,
-					      bool filter_whiteouts,
-					      key_filter_fn filter,
-					      key_merge_fn merge)
-{
-	struct bkey_packed *k, *prev = NULL, *out;
-	struct btree_nr_keys nr;
-	BKEY_PADDED(k) tmp;
-
-	memset(&nr, 0, sizeof(nr));
-
-	while ((k = bch2_btree_node_iter_next_all(iter, src))) {
-		if (filter_whiteouts && bkey_whiteout(k))
-			continue;
-
-		/*
-		 * The filter might modify pointers, so we have to unpack the
-		 * key and values to &tmp.k:
-		 */
-		bch2_bkey_unpack(src, &tmp.k, k);
-
-		if (filter && filter(c, src, bkey_i_to_s(&tmp.k)))
-			continue;
-
-		/* prev is always unpacked, for key merging: */
-
-		if (prev &&
-		    merge &&
-		    merge(c, src, (void *) prev, &tmp.k) == BCH_MERGE_MERGE)
-			continue;
-
-		/*
-		 * the current key becomes the new prev: advance prev, then
-		 * copy the current key - but first pack prev (in place):
-		 */
-		if (prev) {
-			bch2_bkey_pack(prev, (void *) prev, out_f);
-
-			btree_keys_account_key_add(&nr, 0, prev);
-			prev = bkey_next(prev);
-		} else {
-			prev = vstruct_last(dst);
-		}
-
-		bkey_copy(prev, &tmp.k);
-	}
-
-	if (prev) {
-		bch2_bkey_pack(prev, (void *) prev, out_f);
-		btree_keys_account_key_add(&nr, 0, prev);
-		out = bkey_next(prev);
-	} else {
-		out = vstruct_last(dst);
-	}
-
-	dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
-	return nr;
-}
-
 void bch2_btree_sort_into(struct bch_fs *c,
 			 struct btree *dst,
 			 struct btree *src)
@@ -787,16 +392,13 @@ void bch2_btree_sort_into(struct bch_fs *c,
 
 	bch2_btree_node_iter_init_from_start(&src_iter, src);
 
-	if (btree_node_ops(src)->key_normalize ||
-	    btree_node_ops(src)->key_merge)
-		nr = sort_repack_merge(c, btree_bset_first(dst),
+	if (btree_node_is_extents(src))
+		nr = bch2_sort_repack_merge(c, btree_bset_first(dst),
 				src, &src_iter,
 				&dst->format,
-				true,
-				btree_node_ops(src)->key_normalize,
-				btree_node_ops(src)->key_merge);
+				true);
 	else
-		nr = sort_repack(btree_bset_first(dst),
+		nr = bch2_sort_repack(btree_bset_first(dst),
 				src, &src_iter,
 				&dst->format,
 				true);
@@ -1000,8 +602,8 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
 {
 	struct bkey_packed *k, *prev = NULL;
 	struct bpos prev_pos = POS_MIN;
-	enum bkey_type type = btree_node_type(b);
 	bool seen_non_whiteout = false;
+	unsigned version;
 	const char *err;
 	int ret = 0;
 
@@ -1047,13 +649,12 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
 			     "invalid bkey format: %s", err);
 	}
 
-	if (btree_err_on(le16_to_cpu(i->version) != BCACHE_BSET_VERSION,
-			 BTREE_ERR_FIXABLE, c, b, i,
-			 "unsupported bset version")) {
-		i->version = cpu_to_le16(BCACHE_BSET_VERSION);
-		i->u64s = 0;
-		return 0;
-	}
+	version = le16_to_cpu(i->version);
+	btree_err_on((version != BCH_BSET_VERSION_OLD &&
+		      version < bcachefs_metadata_version_min) ||
+		     version >= bcachefs_metadata_version_max,
+		     BTREE_ERR_FATAL, c, b, i,
+		     "unsupported bset version");
 
 	if (btree_err_on(b->written + sectors > c->opts.btree_node_size,
 			 BTREE_ERR_FIXABLE, c, b, i,
@@ -1102,17 +703,21 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
 		}
 
 		if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
-			bch2_bkey_swab(type, &b->format, k);
+			bch2_bkey_swab(&b->format, k);
+
+		if (!write &&
+		    version < bcachefs_metadata_version_bkey_renumber)
+			bch2_bkey_renumber(btree_node_type(b), k, write);
 
 		u = bkey_disassemble(b, k, &tmp);
 
-		invalid = __bch2_bkey_invalid(c, type, u) ?:
+		invalid = __bch2_bkey_invalid(c, u, btree_node_type(b)) ?:
 			bch2_bkey_in_btree_node(b, u) ?:
-			(write ? bch2_bkey_val_invalid(c, type, u) : NULL);
+			(write ? bch2_bkey_val_invalid(c, u) : NULL);
 		if (invalid) {
 			char buf[160];
 
-			bch2_bkey_val_to_text(&PBUF(buf), c, type, u);
+			bch2_bkey_val_to_text(&PBUF(buf), c, u);
 			btree_err(BTREE_ERR_FIXABLE, c, b, i,
 				  "invalid bkey:\n%s\n%s", invalid, buf);
 
@@ -1122,6 +727,10 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
 			continue;
 		}
 
+		if (write &&
+		    version < bcachefs_metadata_version_bkey_renumber)
+			bch2_bkey_renumber(btree_node_type(b), k, write);
+
 		/*
 		 * with the separate whiteouts thing (used for extents), the
 		 * second set of keys actually can have whiteouts too, so we
@@ -1287,17 +896,16 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
 
 	i = &b->data->keys;
 	for (k = i->start; k != vstruct_last(i);) {
-		enum bkey_type type = btree_node_type(b);
 		struct bkey tmp;
 		struct bkey_s_c u = bkey_disassemble(b, k, &tmp);
-		const char *invalid = bch2_bkey_val_invalid(c, type, u);
+		const char *invalid = bch2_bkey_val_invalid(c, u);
 
 		if (invalid ||
 		    (inject_invalid_keys(c) &&
 		     !bversion_cmp(u.k->version, MAX_VERSION))) {
 			char buf[160];
 
-			bch2_bkey_val_to_text(&PBUF(buf), c, type, u);
+			bch2_bkey_val_to_text(&PBUF(buf), c, u);
 			btree_err(BTREE_ERR_FIXABLE, c, b, i,
 				  "invalid bkey %s: %s", buf, invalid);
 
@@ -1367,7 +975,9 @@ start:
 
 		bch2_mark_io_failure(&failed, &rb->pick);
 
-		can_retry = bch2_btree_pick_ptr(c, b, &failed, &rb->pick) > 0;
+		can_retry = bch2_bkey_pick_read_device(c,
+				bkey_i_to_s_c(&b->key),
+				&failed, &rb->pick) > 0;
 
 		if (!bio->bi_status &&
 		    !bch2_btree_node_read_done(c, b, can_retry))
@@ -1410,7 +1020,8 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
 
 	trace_btree_read(c, b);
 
-	ret = bch2_btree_pick_ptr(c, b, NULL, &pick);
+	ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
+					 NULL, &pick);
 	if (bch2_fs_fatal_err_on(ret <= 0, c,
 			"btree node read error: no device to read from")) {
 		set_btree_node_read_error(b);
@@ -1537,8 +1148,8 @@ static void bch2_btree_node_write_error(struct bch_fs *c,
 {
 	struct btree *b		= wbio->wbio.bio.bi_private;
 	__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
-	struct bkey_i_extent *new_key;
-	struct bkey_s_extent e;
+	struct bkey_i_btree_ptr *new_key;
+	struct bkey_s_btree_ptr bp;
 	struct bch_extent_ptr *ptr;
 	struct btree_iter iter;
 	int ret;
@@ -1562,13 +1173,13 @@ retry:
 
 	bkey_copy(&tmp.k, &b->key);
 
-	new_key = bkey_i_to_extent(&tmp.k);
-	e = extent_i_to_s(new_key);
+	new_key = bkey_i_to_btree_ptr(&tmp.k);
+	bp = btree_ptr_i_to_s(new_key);
 
-	bch2_extent_drop_ptrs(e, ptr,
+	bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr,
 		bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
 
-	if (!bch2_extent_nr_ptrs(e.c))
+	if (!bch2_bkey_nr_ptrs(bp.s_c))
 		goto err;
 
 	ret = bch2_btree_node_update_key(c, &iter, b, new_key);
@@ -1671,12 +1282,11 @@ static void btree_node_write_endio(struct bio *bio)
 static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
 				   struct bset *i, unsigned sectors)
 {
-	const struct bch_extent_ptr *ptr;
 	unsigned whiteout_u64s = 0;
 	int ret;
 
-	extent_for_each_ptr(bkey_i_to_s_c_extent(&b->key), ptr)
-		break;
+	if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE))
+		return -1;
 
 	ret = validate_bset(c, b, i, sectors, &whiteout_u64s, WRITE, false);
 	if (ret)
@@ -1694,7 +1304,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 	struct btree_node *bn = NULL;
 	struct btree_node_entry *bne = NULL;
 	BKEY_PADDED(key) k;
-	struct bkey_s_extent e;
 	struct bch_extent_ptr *ptr;
 	struct sort_iter sort_iter;
 	struct nonce nonce;
@@ -1702,6 +1311,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 	u64 seq = 0;
 	bool used_mempool;
 	unsigned long old, new;
+	bool validate_before_checksum = false;
 	void *data;
 
 	if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
@@ -1815,8 +1425,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 	b->whiteout_u64s = 0;
 
 	u64s = btree_node_is_extents(b)
-		? sort_extents(vstruct_last(i), &sort_iter, false)
-		: sort_keys(i->start, &sort_iter, false);
+		? bch2_sort_extents(vstruct_last(i), &sort_iter, false)
+		: bch2_sort_keys(i->start, &sort_iter, false);
 	le16_add_cpu(&i->u64s, u64s);
 
 	clear_needs_whiteout(i);
@@ -1835,11 +1445,21 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 	BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
 	BUG_ON(i->seq != b->data->keys.seq);
 
-	i->version = cpu_to_le16(BCACHE_BSET_VERSION);
+	i->version = c->sb.version < bcachefs_metadata_version_new_versioning
+		? cpu_to_le16(BCH_BSET_VERSION_OLD)
+		: cpu_to_le16(c->sb.version);
 	SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));
 
+	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)))
+		validate_before_checksum = true;
+
+	/* validate_bset will be modifying: */
+	if (le16_to_cpu(i->version) <
+	    bcachefs_metadata_version_bkey_renumber)
+		validate_before_checksum = true;
+
 	/* if we're going to be encrypting, check metadata validity first: */
-	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
+	if (validate_before_checksum &&
 	    validate_bset_for_write(c, b, i, sectors_to_write))
 		goto err;
 
@@ -1853,7 +1473,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 		bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
 
 	/* if we're not encrypting, check metadata after checksumming: */
-	if (!bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
+	if (!validate_before_checksum &&
 	    validate_bset_for_write(c, b, i, sectors_to_write))
 		goto err;
 
@@ -1907,9 +1527,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 	 */
 
 	bkey_copy(&k.key, &b->key);
-	e = bkey_i_to_s_extent(&k.key);
 
-	extent_for_each_ptr(e, ptr)
+	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&k.key)), ptr)
 		ptr->offset += b->written;
 
 	b->written += sectors_to_write;
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 48833a98..4be3221a 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -142,46 +142,4 @@ void bch2_btree_flush_all_writes(struct bch_fs *);
 void bch2_btree_verify_flushed(struct bch_fs *);
 ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *);
 
-/* Sorting */
-
-struct btree_node_iter_large {
-	u16		used;
-
-	struct btree_node_iter_set data[MAX_BSETS];
-};
-
-void bch2_btree_node_iter_large_advance(struct btree_node_iter_large *,
-					struct btree *);
-
-void bch2_btree_node_iter_large_push(struct btree_node_iter_large *,
-				     struct btree *,
-				     const struct bkey_packed *,
-				     const struct bkey_packed *);
-
-static inline bool bch2_btree_node_iter_large_end(struct btree_node_iter_large *iter)
-{
-	return !iter->used;
-}
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_large_peek_all(struct btree_node_iter_large *iter,
-				    struct btree *b)
-{
-	return bch2_btree_node_iter_large_end(iter)
-		? NULL
-		: __btree_node_offset_to_key(b, iter->data->k);
-}
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_large_next_all(struct btree_node_iter_large *iter,
-				    struct btree *b)
-{
-	struct bkey_packed *ret = bch2_btree_node_iter_large_peek_all(iter, b);
-
-	if (ret)
-		bch2_btree_node_iter_large_advance(iter, b);
-
-	return ret;
-}
-
 #endif /* _BCACHEFS_BTREE_IO_H */
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index ae1d4f85..f4922bce 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -263,10 +263,13 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
 /* Btree iterator locking: */
 
 #ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_iter_verify_locks(struct btree_iter *iter)
+void __bch2_btree_iter_verify_locks(struct btree_iter *iter)
 {
 	unsigned l;
 
+	BUG_ON((iter->flags & BTREE_ITER_NOUNLOCK) &&
+	       !btree_node_locked(iter, 0));
+
 	for (l = 0; btree_iter_node(iter, l); l++) {
 		if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
 		    !btree_node_locked(iter, l))
@@ -276,6 +279,15 @@ void bch2_btree_iter_verify_locks(struct btree_iter *iter)
 		       btree_node_locked_type(iter, l));
 	}
 }
+
+void bch2_btree_iter_verify_locks(struct btree_iter *iter)
+{
+	struct btree_iter *linked;
+
+	for_each_btree_iter(iter, linked)
+		__bch2_btree_iter_verify_locks(linked);
+
+}
 #endif
 
 __flatten
@@ -381,9 +393,9 @@ void __bch2_btree_iter_downgrade(struct btree_iter *iter,
 				break;
 			}
 		}
-
-		bch2_btree_iter_verify_locks(linked);
 	}
+
+	bch2_btree_iter_verify_locks(iter);
 }
 
 int bch2_btree_iter_unlock(struct btree_iter *iter)
@@ -420,7 +432,7 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter,
 	 * whiteouts)
 	 */
 	k = b->level || iter->flags & BTREE_ITER_IS_EXTENTS
-		? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_DISCARD)
+		? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard)
 		: bch2_btree_node_iter_prev_all(&tmp, b);
 	if (k && btree_iter_pos_cmp(iter, b, k) > 0) {
 		char buf[100];
@@ -609,7 +621,7 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
 		 * signal to bch2_btree_iter_peek_slot() that we're currently at
 		 * a hole
 		 */
-		u->type = KEY_TYPE_DELETED;
+		u->type = KEY_TYPE_deleted;
 		return bkey_s_c_null;
 	}
 
@@ -775,9 +787,17 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
 	struct btree_iter *linked;
 	unsigned level = b->level;
 
+	/* caller now responsible for unlocking @b */
+
+	BUG_ON(iter->l[level].b != b);
+	BUG_ON(!btree_node_intent_locked(iter, level));
+
+	iter->l[level].b = BTREE_ITER_NOT_END;
+	mark_btree_node_unlocked(iter, level);
+
 	for_each_btree_iter(iter, linked)
 		if (linked->l[level].b == b) {
-			btree_node_unlock(linked, level);
+			__btree_node_unlock(linked, level);
 			linked->l[level].b = BTREE_ITER_NOT_END;
 		}
 }
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index 9bbed99e..33260a99 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -94,7 +94,7 @@ btree_lock_want(struct btree_iter *iter, int level)
 	return BTREE_NODE_UNLOCKED;
 }
 
-static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
+static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level)
 {
 	int lock_type = btree_node_locked_type(iter, level);
 
@@ -105,6 +105,13 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
 	mark_btree_node_unlocked(iter, level);
 }
 
+static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
+{
+	BUG_ON(!level && iter->flags & BTREE_ITER_NOUNLOCK);
+
+	__btree_node_unlock(iter, level);
+}
+
 static inline void __bch2_btree_iter_unlock(struct btree_iter *iter)
 {
 	btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index a7eda114..a91a37e4 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -191,6 +191,7 @@ enum btree_iter_type {
  */
 #define BTREE_ITER_IS_EXTENTS		(1 << 4)
 #define BTREE_ITER_ERROR		(1 << 5)
+#define BTREE_ITER_NOUNLOCK		(1 << 6)
 
 enum btree_iter_uptodate {
 	BTREE_ITER_UPTODATE		= 0,
@@ -403,20 +404,45 @@ static inline unsigned bset_byte_offset(struct btree *b, void *i)
 	return i - (void *) b->data;
 }
 
-/* Type of keys @b contains: */
-static inline enum bkey_type btree_node_type(struct btree *b)
+enum btree_node_type {
+#define x(kwd, val, name) BKEY_TYPE_##kwd = val,
+	BCH_BTREE_IDS()
+#undef x
+	BKEY_TYPE_BTREE,
+};
+
+/* Type of a key in btree @id at level @level: */
+static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id)
 {
-	return b->level ? BKEY_TYPE_BTREE : b->btree_id;
+	return level ? BKEY_TYPE_BTREE : (enum btree_node_type) id;
 }
 
-static inline const struct bkey_ops *btree_node_ops(struct btree *b)
+/* Type of keys @b contains: */
+static inline enum btree_node_type btree_node_type(struct btree *b)
 {
-	return &bch2_bkey_ops[btree_node_type(b)];
+	return __btree_node_type(b->level, b->btree_id);
+}
+
+static inline bool btree_node_type_is_extents(enum btree_node_type type)
+{
+	return type == BKEY_TYPE_EXTENTS;
 }
 
 static inline bool btree_node_is_extents(struct btree *b)
 {
-	return btree_node_type(b) == BKEY_TYPE_EXTENTS;
+	return btree_node_type_is_extents(btree_node_type(b));
+}
+
+static inline bool btree_node_type_needs_gc(enum btree_node_type type)
+{
+	switch (type) {
+	case BKEY_TYPE_BTREE:
+	case BKEY_TYPE_EXTENTS:
+	case BKEY_TYPE_EC:
+		return true;
+	default:
+		return false;
+	}
 }
 
 struct btree_root {
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index 882e1c27..76836362 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -119,7 +119,7 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
 int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
 			    __le64, unsigned);
 int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
-			       struct btree *, struct bkey_i_extent *);
+			       struct btree *, struct bkey_i_btree_ptr *);
 
 /* new transactional interface: */
 
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 537b8da7..ee19b135 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -131,13 +131,15 @@ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
 /* Btree node freeing/allocation: */
 
 static bool btree_key_matches(struct bch_fs *c,
-			      struct bkey_s_c_extent l,
-			      struct bkey_s_c_extent r)
+			      struct bkey_s_c l,
+			      struct bkey_s_c r)
 {
+	struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(l);
+	struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(r);
 	const struct bch_extent_ptr *ptr1, *ptr2;
 
-	extent_for_each_ptr(l, ptr1)
-		extent_for_each_ptr(r, ptr2)
+	bkey_for_each_ptr(ptrs1, ptr1)
+		bkey_for_each_ptr(ptrs2, ptr2)
 			if (ptr1->dev == ptr2->dev &&
 			    ptr1->gen == ptr2->gen &&
 			    ptr1->offset == ptr2->offset)
@@ -159,17 +161,11 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
 {
 	struct bch_fs *c = as->c;
 	struct pending_btree_node_free *d;
-
-	/*
-	 * btree_update lock is only needed here to avoid racing with
-	 * gc:
-	 */
-	mutex_lock(&c->btree_interior_update_lock);
+	struct gc_pos pos = { 0 };
 
 	for (d = as->pending; d < as->pending + as->nr_pending; d++)
 		if (!bkey_cmp(k.k->p, d->key.k.p) &&
-		    btree_key_matches(c, bkey_s_c_to_extent(k),
-				      bkey_i_to_s_c_extent(&d->key)))
+		    btree_key_matches(c, k, bkey_i_to_s_c(&d->key)))
 			goto found;
 	BUG();
 found:
@@ -200,20 +196,11 @@ found:
 	if (gc_pos_cmp(c->gc_pos, b
 		       ? gc_pos_btree_node(b)
 		       : gc_pos_btree_root(as->btree_id)) >= 0 &&
-	    gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
-		struct gc_pos pos = { 0 };
-
-		bch2_mark_key(c, BKEY_TYPE_BTREE,
+	    gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0)
+		bch2_mark_key_locked(c,
 			      bkey_i_to_s_c(&d->key),
 			      false, 0, pos,
 			      NULL, 0, BCH_BUCKET_MARK_GC);
-		/*
-		 * Don't apply tmp - pending deletes aren't tracked in
-		 * bch_alloc_stats:
-		 */
-	}
-
-	mutex_unlock(&c->btree_interior_update_lock);
 }
 
 static void __btree_node_free(struct bch_fs *c, struct btree *b)
@@ -256,6 +243,11 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
 void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
 				struct btree_iter *iter)
 {
+	struct btree_iter *linked;
+
+	for_each_btree_iter(iter, linked)
+		BUG_ON(linked->l[b->level].b == b);
+
 	/*
 	 * Is this a node that isn't reachable on disk yet?
 	 *
@@ -267,11 +259,10 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
 	 */
 	btree_update_drop_new_node(c, b);
 
-	__bch2_btree_node_lock_write(b, iter);
+	six_lock_write(&b->lock);
 	__btree_node_free(c, b);
 	six_unlock_write(&b->lock);
-
-	bch2_btree_iter_node_drop(iter, b);
+	six_unlock_intent(&b->lock);
 }
 
 static void bch2_btree_node_free_ondisk(struct bch_fs *c,
@@ -279,8 +270,7 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c,
 {
 	BUG_ON(!pending->index_update_done);
 
-	bch2_mark_key(c, BKEY_TYPE_BTREE,
-		      bkey_i_to_s_c(&pending->key),
+	bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
 		      false, 0,
 		      gc_phase(GC_PHASE_PENDING_DELETE),
 		      NULL, 0, 0);
@@ -294,7 +284,6 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
 	struct write_point *wp;
 	struct btree *b;
 	BKEY_PADDED(k) tmp;
-	struct bkey_i_extent *e;
 	struct open_buckets ob = { .nr = 0 };
 	struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
 	unsigned nr_reserve;
@@ -345,8 +334,8 @@ retry:
 		goto retry;
 	}
 
-	e = bkey_extent_init(&tmp.k);
-	bch2_alloc_sectors_append_ptrs(c, wp, e, c->opts.btree_node_size);
+	bkey_btree_ptr_init(&tmp.k);
+	bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size);
 
 	bch2_open_bucket_get(c, wp, &ob);
 	bch2_alloc_sectors_done(c, wp);
@@ -384,7 +373,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
 	b->data->flags = 0;
 	SET_BTREE_NODE_ID(b->data, as->btree_id);
 	SET_BTREE_NODE_LEVEL(b->data, level);
-	b->data->ptr = bkey_i_to_extent(&b->key)->v.start->ptr;
+	b->data->ptr = bkey_i_to_btree_ptr(&b->key)->v.start[0];
 
 	bch2_btree_build_aux_trees(b);
 
@@ -537,8 +526,7 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
 			goto err_free;
 		}
 
-		ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_BTREE,
-					      bkey_i_to_s_c(&b->key));
+		ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key));
 		if (ret)
 			goto err_free;
 
@@ -1078,8 +1066,10 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
 
 	__bch2_btree_set_root_inmem(c, b);
 
-	bch2_mark_key(c, BKEY_TYPE_BTREE,
-		      bkey_i_to_s_c(&b->key),
+	mutex_lock(&c->btree_interior_update_lock);
+	percpu_down_read_preempt_disable(&c->usage_lock);
+
+	bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
 		      true, 0,
 		      gc_pos_btree_root(b->btree_id),
 		      &stats, 0, 0);
@@ -1090,6 +1080,9 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
 					   &stats);
 	bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
 			    gc_pos_btree_root(b->btree_id));
+
+	percpu_up_read_preempt_enable(&c->usage_lock);
+	mutex_unlock(&c->btree_interior_update_lock);
 }
 
 static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
@@ -1166,11 +1159,12 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
 
 	BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, b));
 
-	if (bkey_extent_is_data(&insert->k))
-		bch2_mark_key(c, BKEY_TYPE_BTREE,
-			      bkey_i_to_s_c(insert),
-			      true, 0,
-			      gc_pos_btree_node(b), &stats, 0, 0);
+	mutex_lock(&c->btree_interior_update_lock);
+	percpu_down_read_preempt_disable(&c->usage_lock);
+
+	bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
+			     true, 0,
+			     gc_pos_btree_node(b), &stats, 0, 0);
 
 	while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
 	       bkey_iter_pos_cmp(b, &insert->k.p, k) > 0)
@@ -1188,6 +1182,9 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
 	bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
 			    gc_pos_btree_node(b));
 
+	percpu_up_read_preempt_enable(&c->usage_lock);
+	mutex_unlock(&c->btree_interior_update_lock);
+
 	bch2_btree_bset_insert_key(iter, b, node_iter, insert);
 	set_btree_node_dirty(b);
 	set_btree_node_need_write(b);
@@ -1420,25 +1417,19 @@ static void btree_split(struct btree_update *as, struct btree *b,
 	if (n3)
 		bch2_open_buckets_put(c, &n3->ob);
 
-	/*
-	 * Note - at this point other linked iterators could still have @b read
-	 * locked; we're depending on the bch2_btree_iter_node_replace() calls
-	 * below removing all references to @b so we don't return with other
-	 * iterators pointing to a node they have locked that's been freed.
-	 *
-	 * We have to free the node first because the bch2_iter_node_replace()
-	 * calls will drop _our_ iterator's reference - and intent lock - to @b.
-	 */
-	bch2_btree_node_free_inmem(c, b, iter);
-
 	/* Successful split, update the iterator to point to the new nodes: */
 
+	bch2_btree_iter_node_drop(iter, b);
 	if (n3)
 		bch2_btree_iter_node_replace(iter, n3);
 	if (n2)
 		bch2_btree_iter_node_replace(iter, n2);
 	bch2_btree_iter_node_replace(iter, n1);
 
+	bch2_btree_node_free_inmem(c, b, iter);
+
+	bch2_btree_iter_verify_locks(iter);
+
 	bch2_time_stats_update(&c->times[BCH_TIME_btree_split], start_time);
 }
 
@@ -1734,17 +1725,21 @@ retry:
 	bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
 
 	bch2_open_buckets_put(c, &n->ob);
-	bch2_btree_node_free_inmem(c, b, iter);
-	bch2_btree_node_free_inmem(c, m, iter);
+
+	bch2_btree_iter_node_drop(iter, b);
 	bch2_btree_iter_node_replace(iter, n);
 
 	bch2_btree_iter_verify(iter, n);
 
+	bch2_btree_node_free_inmem(c, b, iter);
+	bch2_btree_node_free_inmem(c, m, iter);
+
 	bch2_btree_update_done(as);
 
-	six_unlock_intent(&m->lock);
 	up_read(&c->gc_lock);
 out:
+	bch2_btree_iter_verify_locks(iter);
+
 	/*
 	 * Don't downgrade locks here: we're called after successful insert,
 	 * and the caller will downgrade locks after a successful insert
@@ -1827,9 +1822,9 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
 
 	bch2_open_buckets_put(c, &n->ob);
 
-	bch2_btree_node_free_inmem(c, b, iter);
-
+	bch2_btree_iter_node_drop(iter, b);
 	bch2_btree_iter_node_replace(iter, n);
+	bch2_btree_node_free_inmem(c, b, iter);
 
 	bch2_btree_update_done(as);
 	return 0;
@@ -1892,7 +1887,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
 					 struct btree_update *as,
 					 struct btree_iter *iter,
 					 struct btree *b, struct btree *new_hash,
-					 struct bkey_i_extent *new_key)
+					 struct bkey_i_btree_ptr *new_key)
 {
 	struct btree *parent;
 	int ret;
@@ -1955,8 +1950,10 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
 
 		bch2_btree_node_lock_write(b, iter);
 
-		bch2_mark_key(c, BKEY_TYPE_BTREE,
-			      bkey_i_to_s_c(&new_key->k_i),
+		mutex_lock(&c->btree_interior_update_lock);
+		percpu_down_read_preempt_disable(&c->usage_lock);
+
+		bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i),
 			      true, 0,
 			      gc_pos_btree_root(b->btree_id),
 			      &stats, 0, 0);
@@ -1966,6 +1963,9 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
 		bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
 				    gc_pos_btree_root(b->btree_id));
 
+		percpu_up_read_preempt_enable(&c->usage_lock);
+		mutex_unlock(&c->btree_interior_update_lock);
+
 		if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
 			mutex_lock(&c->btree_cache.lock);
 			bch2_btree_node_hash_remove(&c->btree_cache, b);
@@ -1986,7 +1986,8 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
 }
 
 int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
-			       struct btree *b, struct bkey_i_extent *new_key)
+			       struct btree *b,
+			       struct bkey_i_btree_ptr *new_key)
 {
 	struct btree *parent = btree_node_parent(iter, b);
 	struct btree_update *as = NULL;
@@ -2052,8 +2053,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
 			goto err;
 	}
 
-	ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_BTREE,
-				      extent_i_to_s_c(new_key).s_c);
+	ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&new_key->k_i));
 	if (ret)
 		goto err_free_update;
 
@@ -2111,9 +2111,9 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
 	b->level	= 0;
 	b->btree_id	= id;
 
-	bkey_extent_init(&b->key);
+	bkey_btree_ptr_init(&b->key);
 	b->key.k.p = POS_MAX;
-	bkey_i_to_extent(&b->key)->v._data[0] = U64_MAX - id;
+	PTR_HASH(&b->key) = U64_MAX - id;
 
 	bch2_bset_init_first(b, &b->data->keys);
 	bch2_btree_build_aux_trees(b);
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index e8d6e078..57c5c7ab 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -70,7 +70,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
 			goto overwrite;
 		}
 
-		k->type = KEY_TYPE_DELETED;
+		k->type = KEY_TYPE_deleted;
 		bch2_btree_node_iter_fix(iter, b, node_iter, k,
 					 k->u64s, k->u64s);
 		bch2_btree_iter_verify(iter, b);
@@ -186,7 +186,6 @@ bch2_insert_fixup_key(struct btree_insert *trans,
 				       insert->k))
 		bch2_btree_journal_key(trans, iter, insert->k);
 
-	trans->did_work = true;
 	return BTREE_INSERT_OK;
 }
 
@@ -312,7 +311,6 @@ btree_key_can_insert(struct btree_insert *trans,
 		return BTREE_INSERT_BTREE_NODE_FULL;
 
 	if (!bch2_bkey_replicas_marked(c,
-			insert->iter->btree_id,
 			bkey_i_to_s_c(insert->k),
 			true))
 		return BTREE_INSERT_NEED_MARK_REPLICAS;
@@ -337,6 +335,7 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
 {
 	struct bch_fs *c = trans->c;
 	struct btree_insert_entry *i;
+	struct btree_iter *linked;
 	unsigned u64s;
 	int ret;
 
@@ -414,12 +413,25 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
 				i->k->k.version = MAX_VERSION;
 	}
 
+	if (trans->flags & BTREE_INSERT_NOUNLOCK) {
+		/*
+		 * linked iterators that weren't being updated may or may not
+		 * have been traversed/locked, depending on what the caller was
+		 * doing:
+		 */
+		for_each_btree_iter(trans->entries[0].iter, linked)
+			if (linked->uptodate < BTREE_ITER_NEED_RELOCK)
+				linked->flags |= BTREE_ITER_NOUNLOCK;
+	}
+	trans->did_work = true;
+
 	trans_for_each_entry(trans, i) {
 		switch (btree_insert_key_leaf(trans, i)) {
 		case BTREE_INSERT_OK:
 			break;
 		case BTREE_INSERT_NEED_TRAVERSE:
-			BUG_ON((trans->flags & BTREE_INSERT_ATOMIC));
+			BUG_ON((trans->flags &
+				(BTREE_INSERT_ATOMIC|BTREE_INSERT_NOUNLOCK)));
 			ret = -EINTR;
 			goto out;
 		default:
@@ -440,8 +452,8 @@ static inline void btree_insert_entry_checks(struct bch_fs *c,
 	BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
 	BUG_ON(debug_check_bkeys(c) &&
 	       !bkey_deleted(&i->k->k) &&
-	       bch2_bkey_invalid(c, i->iter->btree_id,
-				 bkey_i_to_s_c(i->k)));
+	       bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
+				 i->iter->btree_id));
 }
 
 /**
@@ -465,8 +477,7 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
 
 	BUG_ON(!trans->nr);
 
-	for_each_btree_iter(trans->entries[0].iter, linked)
-		bch2_btree_iter_verify_locks(linked);
+	bch2_btree_iter_verify_locks(trans->entries[0].iter);
 
 	/* for the sake of sanity: */
 	BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC));
@@ -508,15 +519,11 @@ retry:
 out:
 	percpu_ref_put(&c->writes);
 
-	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
-		/* make sure we didn't drop or screw up locks: */
-		for_each_btree_iter(trans->entries[0].iter, linked) {
-			bch2_btree_iter_verify_locks(linked);
-			BUG_ON((trans->flags & BTREE_INSERT_NOUNLOCK) &&
-			       trans->did_work &&
-			       !btree_node_locked(linked, 0));
-		}
-	}
+	/* make sure we didn't drop or screw up locks: */
+	bch2_btree_iter_verify_locks(trans->entries[0].iter);
+
+	for_each_btree_iter(trans->entries[0].iter, linked)
+		linked->flags &= ~BTREE_ITER_NOUNLOCK;
 
 	BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR);
 
@@ -581,8 +588,7 @@ err:
 		}
 
 		bch2_btree_iter_unlock(trans->entries[0].iter);
-		ret = bch2_mark_bkey_replicas(c, i->iter->btree_id,
-					      bkey_i_to_s_c(i->k))
+		ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k))
 			?: -EINTR;
 		break;
 	default:
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 60377630..401ff825 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -302,7 +302,7 @@ static inline int is_fragmented_bucket(struct bucket_mark m,
 static inline enum bch_data_type bucket_type(struct bucket_mark m)
 {
 	return m.cached_sectors && !m.dirty_sectors
-		?  BCH_DATA_CACHED
+		? BCH_DATA_CACHED
 		: m.data_type;
 }
 
@@ -322,6 +322,8 @@ void bch2_fs_usage_apply(struct bch_fs *c,
 	s64 added = sum.data + sum.reserved;
 	s64 should_not_have_added;
 
+	percpu_rwsem_assert_held(&c->usage_lock);
+
 	/*
 	 * Not allowed to reduce sectors_available except by getting a
 	 * reservation:
@@ -338,7 +340,6 @@ void bch2_fs_usage_apply(struct bch_fs *c,
 		stats->online_reserved	-= added;
 	}
 
-	percpu_down_read_preempt_disable(&c->usage_lock);
 	/* online_reserved not subject to gc: */
 	this_cpu_ptr(c->usage[0])->online_reserved +=
 		stats->online_reserved;
@@ -350,7 +351,6 @@ void bch2_fs_usage_apply(struct bch_fs *c,
 		bch2_usage_add(this_cpu_ptr(c->usage[1]), stats);
 
 	bch2_fs_stats_verify(c);
-	percpu_up_read_preempt_enable(&c->usage_lock);
 
 	memset(stats, 0, sizeof(*stats));
 }
@@ -372,14 +372,14 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
 
 	dev_usage = this_cpu_ptr(ca->usage[gc]);
 
-	if (bucket_type(old) != bucket_type(new)) {
-		if (bucket_type(old)) {
-			fs_usage->buckets[bucket_type(old)] -= ca->mi.bucket_size;
-			dev_usage->buckets[bucket_type(old)]--;
-		} else {
-			fs_usage->buckets[bucket_type(new)] += ca->mi.bucket_size;
-			dev_usage->buckets[bucket_type(new)]++;
-		}
+	if (bucket_type(old)) {
+		fs_usage->buckets[bucket_type(old)] -= ca->mi.bucket_size;
+		dev_usage->buckets[bucket_type(old)]--;
+	}
+
+	if (bucket_type(new)) {
+		fs_usage->buckets[bucket_type(new)] += ca->mi.bucket_size;
+		dev_usage->buckets[bucket_type(new)]++;
 	}
 
 	dev_usage->buckets_alloc +=
@@ -402,11 +402,28 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
 	bch2_dev_stats_verify(ca);
 }
 
-#define bucket_data_cmpxchg(c, ca, stats, g, new, expr)		\
+void bch2_dev_usage_from_buckets(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct bucket_mark old = { .v.counter = 0 };
+	struct bch_fs_usage *fs_usage;
+	struct bucket_array *buckets;
+	struct bucket *g;
+
+	percpu_down_read_preempt_disable(&c->usage_lock);
+	fs_usage = this_cpu_ptr(c->usage[0]);
+	buckets = bucket_array(ca);
+
+	for_each_bucket(g, buckets)
+		if (g->mark.data_type)
+			bch2_dev_usage_update(c, ca, fs_usage, old, g->mark, false);
+	percpu_up_read_preempt_enable(&c->usage_lock);
+}
+
+#define bucket_data_cmpxchg(c, ca, fs_usage, g, new, expr)	\
 ({								\
 	struct bucket_mark _old = bucket_cmpxchg(g, new, expr);	\
 								\
-	bch2_dev_usage_update(c, ca, stats, _old, new, gc);	\
+	bch2_dev_usage_update(c, ca, fs_usage, _old, new, gc);	\
 	_old;							\
 })
 
@@ -486,12 +503,12 @@ static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
 {
 	struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
 	struct bucket *g = __bucket(ca, b, gc);
-	struct bucket_mark old, new;
+	struct bucket_mark new;
 
 	BUG_ON(type != BCH_DATA_SB &&
 	       type != BCH_DATA_JOURNAL);
 
-	old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
+	bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
 		new.data_type	= type;
 		checked_add(new.dirty_sectors, sectors);
 	}));
@@ -542,7 +559,7 @@ static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors)
 				    crc.uncompressed_size));
 }
 
-static s64 ptr_disk_sectors(struct bkey_s_c_extent e,
+static s64 ptr_disk_sectors(const struct bkey *k,
 			    struct extent_ptr_decoded p,
 			    s64 sectors)
 {
@@ -554,8 +571,8 @@ static s64 ptr_disk_sectors(struct bkey_s_c_extent e,
 			old_sectors = 0;
 			new_sectors = sectors;
 		} else {
-			old_sectors = e.k->size;
-			new_sectors = e.k->size + sectors;
+			old_sectors = k->size;
+			new_sectors = k->size + sectors;
 		}
 
 		sectors = -__disk_sectors(p.crc, old_sectors)
@@ -571,7 +588,6 @@ static s64 ptr_disk_sectors(struct bkey_s_c_extent e,
  * that with the gc pos seqlock held.
  */
 static void bch2_mark_pointer(struct bch_fs *c,
-			      struct bkey_s_c_extent e,
 			      struct extent_ptr_decoded p,
 			      s64 sectors, enum bch_data_type data_type,
 			      struct bch_fs_usage *fs_usage,
@@ -630,23 +646,25 @@ static void bch2_mark_pointer(struct bch_fs *c,
 	BUG_ON(!gc && bucket_became_unavailable(old, new));
 }
 
-static void bch2_mark_stripe_ptr(struct bch_fs *c,
-				 struct bch_extent_stripe_ptr p,
-				 s64 sectors, unsigned flags,
-				 s64 *adjusted_disk_sectors,
-				 unsigned *redundancy)
+static int bch2_mark_stripe_ptr(struct bch_fs *c,
+				struct bch_extent_stripe_ptr p,
+				s64 sectors, unsigned flags,
+				s64 *adjusted_disk_sectors,
+				unsigned *redundancy,
+				bool gc)
 {
-	struct ec_stripe *m;
+	struct stripe *m;
 	unsigned old, new, nr_data;
 	int blocks_nonempty_delta;
 	s64 parity_sectors;
 
-	m = genradix_ptr(&c->ec_stripes, p.idx);
-	if (WARN_ON(!m))
-		return;
+	m = genradix_ptr(&c->stripes[gc], p.idx);
 
-	if (WARN_ON(!m->alive))
-		return;
+	if (!m || !m->alive) {
+		bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
+				    (u64) p.idx);
+		return -1;
+	}
 
 	nr_data = m->nr_blocks - m->nr_redundant;
 
@@ -664,81 +682,74 @@ static void bch2_mark_stripe_ptr(struct bch_fs *c,
 
 	blocks_nonempty_delta = (int) !!new - (int) !!old;
 	if (!blocks_nonempty_delta)
-		return;
+		return 0;
 
 	atomic_add(blocks_nonempty_delta, &m->blocks_nonempty);
 
 	BUG_ON(atomic_read(&m->blocks_nonempty) < 0);
 
-	bch2_stripes_heap_update(c, m, p.idx);
+	if (!gc)
+		bch2_stripes_heap_update(c, m, p.idx);
+
+	return 0;
 }
 
-static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
-			     s64 sectors, enum bch_data_type data_type,
-			     struct bch_fs_usage *stats,
-			     u64 journal_seq, unsigned flags,
-			     bool gc)
+static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
+			    s64 sectors, enum bch_data_type data_type,
+			    struct bch_fs_usage *stats,
+			    u64 journal_seq, unsigned flags,
+			    bool gc)
 {
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	s64 cached_sectors	= 0;
+	s64 dirty_sectors	= 0;
+	s64 ec_sectors		= 0;
+	unsigned replicas	= 0;
+	unsigned ec_redundancy	= 0;
+	unsigned i;
+	int ret;
+
 	BUG_ON(!sectors);
 
-	switch (k.k->type) {
-	case BCH_EXTENT:
-	case BCH_EXTENT_CACHED: {
-		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-		const union bch_extent_entry *entry;
-		struct extent_ptr_decoded p;
-		s64 cached_sectors	= 0;
-		s64 dirty_sectors	= 0;
-		s64 ec_sectors		= 0;
-		unsigned replicas	= 0;
-		unsigned ec_redundancy	= 0;
-		unsigned i;
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		s64 disk_sectors = ptr_disk_sectors(k.k, p, sectors);
+		s64 adjusted_disk_sectors = disk_sectors;
 
-		extent_for_each_ptr_decode(e, p, entry) {
-			s64 disk_sectors = ptr_disk_sectors(e, p, sectors);
-			s64 adjusted_disk_sectors = disk_sectors;
+		bch2_mark_pointer(c, p, disk_sectors, data_type,
+				  stats, journal_seq, flags, gc);
 
-			bch2_mark_pointer(c, e, p, disk_sectors, data_type,
-					  stats, journal_seq, flags, gc);
+		if (!p.ptr.cached)
+			for (i = 0; i < p.ec_nr; i++) {
+				ret = bch2_mark_stripe_ptr(c, p.ec[i],
+						disk_sectors, flags,
+						&adjusted_disk_sectors,
+						&ec_redundancy, gc);
+				if (ret)
+					return ret;
+			}
+		if (!p.ptr.cached)
+			replicas++;
 
-			if (!p.ptr.cached)
-				for (i = 0; i < p.ec_nr; i++)
-					bch2_mark_stripe_ptr(c, p.ec[i],
-							disk_sectors, flags,
-							&adjusted_disk_sectors,
-							&ec_redundancy);
-			if (!p.ptr.cached)
-				replicas++;
-
-			if (p.ptr.cached)
-				cached_sectors	+= adjusted_disk_sectors;
-			else if (!p.ec_nr)
-				dirty_sectors	+= adjusted_disk_sectors;
-			else
-				ec_sectors	+= adjusted_disk_sectors;
-		}
-
-		replicas	= clamp_t(unsigned,	replicas,
-					  1, ARRAY_SIZE(stats->replicas));
-		ec_redundancy	= clamp_t(unsigned,	ec_redundancy,
-					  1, ARRAY_SIZE(stats->replicas));
-
-		stats->replicas[0].data[BCH_DATA_CACHED]	+= cached_sectors;
-		stats->replicas[replicas - 1].data[data_type]	+= dirty_sectors;
-		stats->replicas[ec_redundancy - 1].ec_data	+= ec_sectors;
-		break;
+		if (p.ptr.cached)
+			cached_sectors	+= adjusted_disk_sectors;
+		else if (!p.ec_nr)
+			dirty_sectors	+= adjusted_disk_sectors;
+		else
+			ec_sectors	+= adjusted_disk_sectors;
 	}
-	case BCH_RESERVATION: {
-		unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
 
-		sectors *= replicas;
-		replicas = clamp_t(unsigned, replicas,
-				   1, ARRAY_SIZE(stats->replicas));
+	replicas	= clamp_t(unsigned,	replicas,
+				  1, ARRAY_SIZE(stats->replicas));
+	ec_redundancy	= clamp_t(unsigned,	ec_redundancy,
+				  1, ARRAY_SIZE(stats->replicas));
 
-		stats->replicas[replicas - 1].persistent_reserved += sectors;
-		break;
-	}
-	}
+	stats->replicas[0].data[BCH_DATA_CACHED]	+= cached_sectors;
+	stats->replicas[replicas - 1].data[data_type]	+= dirty_sectors;
+	stats->replicas[ec_redundancy - 1].ec_data	+= ec_sectors;
+
+	return 0;
 }
 
 static void bucket_set_stripe(struct bch_fs *c,
@@ -759,7 +770,7 @@ static void bucket_set_stripe(struct bch_fs *c,
 
 		BUG_ON(ptr_stale(ca, ptr));
 
-		old = bucket_cmpxchg(g, new, ({
+		old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
 			new.stripe			= enabled;
 			if (journal_seq) {
 				new.journal_seq_valid	= 1;
@@ -768,103 +779,143 @@ static void bucket_set_stripe(struct bch_fs *c,
 		}));
 
 		BUG_ON(old.stripe == enabled);
-
-		bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
 	}
 }
 
-static void bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
-			     bool inserting,
-			     struct bch_fs_usage *fs_usage,
-			     u64 journal_seq, unsigned flags,
-			     bool gc)
+static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
+			    bool inserting,
+			    struct bch_fs_usage *fs_usage,
+			    u64 journal_seq, unsigned flags,
+			    bool gc)
 {
-	switch (k.k->type) {
-	case BCH_STRIPE: {
-		struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-		size_t idx = s.k->p.offset;
-		struct ec_stripe *m = genradix_ptr(&c->ec_stripes, idx);
-		unsigned i;
+	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+	size_t idx = s.k->p.offset;
+	struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
+	unsigned i;
 
-		BUG_ON(!m);
-		BUG_ON(m->alive == inserting);
+	if (!m || (!inserting && !m->alive)) {
+		bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
+				    idx);
+		return -1;
+	}
 
-		BUG_ON(atomic_read(&m->blocks_nonempty));
+	if (inserting && m->alive) {
+		bch_err_ratelimited(c, "error marking stripe %zu: already exists",
+				    idx);
+		return -1;
+	}
 
-		for (i = 0; i < EC_STRIPE_MAX; i++)
-			BUG_ON(atomic_read(&m->block_sectors[i]));
+	BUG_ON(atomic_read(&m->blocks_nonempty));
 
-		if (inserting) {
-			m->sectors	= le16_to_cpu(s.v->sectors);
-			m->algorithm	= s.v->algorithm;
-			m->nr_blocks	= s.v->nr_blocks;
-			m->nr_redundant	= s.v->nr_redundant;
-		}
+	for (i = 0; i < EC_STRIPE_MAX; i++)
+		BUG_ON(atomic_read(&m->block_sectors[i]));
 
+	if (inserting) {
+		m->sectors	= le16_to_cpu(s.v->sectors);
+		m->algorithm	= s.v->algorithm;
+		m->nr_blocks	= s.v->nr_blocks;
+		m->nr_redundant	= s.v->nr_redundant;
+	}
+
+	if (!gc) {
 		if (inserting)
 			bch2_stripes_heap_insert(c, m, idx);
 		else
 			bch2_stripes_heap_del(c, m, idx);
+	} else {
+		m->alive = inserting;
+	}
 
-		bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc);
-		break;
-	}
-	}
+	bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc);
+	return 0;
 }
 
-static void __bch2_mark_key(struct bch_fs *c,
-			    enum bkey_type type, struct bkey_s_c k,
-			    bool inserting, s64 sectors,
-			    struct bch_fs_usage *stats,
-			    u64 journal_seq, unsigned flags,
-			    bool gc)
+static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
+			   bool inserting, s64 sectors,
+			   struct bch_fs_usage *stats,
+			   u64 journal_seq, unsigned flags,
+			   bool gc)
 {
-	switch (type) {
-	case BKEY_TYPE_BTREE:
-		bch2_mark_extent(c, k, inserting
-				 ?  c->opts.btree_node_size
-				 : -c->opts.btree_node_size,
-				 BCH_DATA_BTREE,
-				 stats, journal_seq, flags, gc);
+	int ret = 0;
+
+	switch (k.k->type) {
+	case KEY_TYPE_btree_ptr:
+		ret = bch2_mark_extent(c, k, inserting
+				       ?  c->opts.btree_node_size
+				       : -c->opts.btree_node_size,
+				       BCH_DATA_BTREE,
+				       stats, journal_seq, flags, gc);
 		break;
-	case BKEY_TYPE_EXTENTS:
-		bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
-				 stats, journal_seq, flags, gc);
+	case KEY_TYPE_extent:
+		ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
+				       stats, journal_seq, flags, gc);
 		break;
-	case BKEY_TYPE_EC:
-		bch2_mark_stripe(c, k, inserting,
-				 stats, journal_seq, flags, gc);
+	case KEY_TYPE_stripe:
+		ret = bch2_mark_stripe(c, k, inserting,
+				       stats, journal_seq, flags, gc);
 		break;
+	case KEY_TYPE_reservation: {
+		unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
+
+		sectors *= replicas;
+		replicas = clamp_t(unsigned, replicas,
+				   1, ARRAY_SIZE(stats->replicas));
+
+		stats->replicas[replicas - 1].persistent_reserved += sectors;
+		break;
+	}
 	default:
 		break;
 	}
+
+	return ret;
 }
 
-void bch2_mark_key(struct bch_fs *c,
-		   enum bkey_type type, struct bkey_s_c k,
+int bch2_mark_key_locked(struct bch_fs *c,
+		   struct bkey_s_c k,
 		   bool inserting, s64 sectors,
 		   struct gc_pos pos,
 		   struct bch_fs_usage *stats,
 		   u64 journal_seq, unsigned flags)
 {
-	percpu_down_read_preempt_disable(&c->usage_lock);
+	int ret;
 
 	if (!(flags & BCH_BUCKET_MARK_GC)) {
 		if (!stats)
 			stats = this_cpu_ptr(c->usage[0]);
 
-		__bch2_mark_key(c, type, k, inserting, sectors,
-				stats, journal_seq, flags, false);
+		ret = __bch2_mark_key(c, k, inserting, sectors,
+				      stats, journal_seq, flags, false);
+		if (ret)
+			return ret;
 	}
 
 	if ((flags & BCH_BUCKET_MARK_GC) ||
 	    gc_visited(c, pos)) {
-		__bch2_mark_key(c, type, k, inserting, sectors,
-				this_cpu_ptr(c->usage[1]),
-				journal_seq, flags, true);
+		ret = __bch2_mark_key(c, k, inserting, sectors,
+				      this_cpu_ptr(c->usage[1]),
+				      journal_seq, flags, true);
+		if (ret)
+			return ret;
 	}
 
+	return 0;
+}
+
+int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
+		  bool inserting, s64 sectors,
+		  struct gc_pos pos,
+		  struct bch_fs_usage *stats,
+		  u64 journal_seq, unsigned flags)
+{
+	int ret;
+
+	percpu_down_read_preempt_disable(&c->usage_lock);
+	ret = bch2_mark_key_locked(c, k, inserting, sectors,
+				   pos, stats, journal_seq, flags);
 	percpu_up_read_preempt_enable(&c->usage_lock);
+
+	return ret;
 }
 
 void bch2_mark_update(struct btree_insert *trans,
@@ -878,15 +929,19 @@ void bch2_mark_update(struct btree_insert *trans,
 	struct gc_pos		pos = gc_pos_btree_node(b);
 	struct bkey_packed	*_k;
 
+	if (!btree_node_type_needs_gc(iter->btree_id))
+		return;
+
+	percpu_down_read_preempt_disable(&c->usage_lock);
+
 	if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
-		bch2_mark_key(c, btree_node_type(b), bkey_i_to_s_c(insert->k),
-			      true,
-			      bpos_min(insert->k->k.p, b->key.k.p).offset -
-			      bkey_start_offset(&insert->k->k),
-			      pos, &stats, trans->journal_res.seq, 0);
+		bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
+			bpos_min(insert->k->k.p, b->key.k.p).offset -
+			bkey_start_offset(&insert->k->k),
+			pos, &stats, trans->journal_res.seq, 0);
 
 	while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
-						      KEY_TYPE_DISCARD))) {
+						      KEY_TYPE_discard))) {
 		struct bkey		unpacked;
 		struct bkey_s_c		k;
 		s64			sectors = 0;
@@ -915,9 +970,8 @@ void bch2_mark_update(struct btree_insert *trans,
 				sectors = k.k->p.offset - insert->k->k.p.offset;
 				BUG_ON(sectors <= 0);
 
-				bch2_mark_key(c, btree_node_type(b), k,
-					      true, sectors,
-					      pos, &stats, trans->journal_res.seq, 0);
+				bch2_mark_key_locked(c, k, true, sectors,
+					pos, &stats, trans->journal_res.seq, 0);
 
 				sectors = bkey_start_offset(&insert->k->k) -
 					k.k->p.offset;
@@ -927,14 +981,15 @@ void bch2_mark_update(struct btree_insert *trans,
 			BUG_ON(sectors >= 0);
 		}
 
-		bch2_mark_key(c, btree_node_type(b), k,
-			      false, sectors,
-			      pos, &stats, trans->journal_res.seq, 0);
+		bch2_mark_key_locked(c, k, false, sectors,
+			pos, &stats, trans->journal_res.seq, 0);
 
 		bch2_btree_node_iter_advance(&node_iter, b);
 	}
 
 	bch2_fs_usage_apply(c, &stats, trans->disk_res, pos);
+
+	percpu_up_read_preempt_enable(&c->usage_lock);
 }
 
 /* Disk reservations: */
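
The reworked bch2_dev_usage_update() above now adjusts the per-type bucket counters for the old and new marks independently, which also covers a bucket moving from one nonzero type to another. Below is a minimal standalone model of that bookkeeping; the enum values, struct layouts and the 512-sector bucket size are simplified stand-ins for the real bch2 types, not the actual definitions.

#include <stdio.h>

enum data_type { DATA_NONE, DATA_SB, DATA_JOURNAL, DATA_BTREE, DATA_USER, DATA_CACHED, DATA_NR };

struct bucket_mark { unsigned dirty_sectors, cached_sectors; enum data_type data_type; };
struct usage { unsigned long long buckets[DATA_NR]; };

/* cached-only buckets are accounted as DATA_CACHED, like bucket_type() in buckets.c */
static enum data_type bucket_type(struct bucket_mark m)
{
	return m.cached_sectors && !m.dirty_sectors ? DATA_CACHED : m.data_type;
}

/*
 * Move one bucket's worth of sectors from its old type counter to its new
 * type counter. Decrementing old and incrementing new independently (rather
 * than branching on "did the type change") also handles old != new where
 * both are nonzero.
 */
static void dev_usage_update(struct usage *u, unsigned bucket_size,
			     struct bucket_mark old, struct bucket_mark new)
{
	if (bucket_type(old))
		u->buckets[bucket_type(old)] -= bucket_size;
	if (bucket_type(new))
		u->buckets[bucket_type(new)] += bucket_size;
}

int main(void)
{
	struct usage u = { .buckets = { [DATA_CACHED] = 512 } };
	struct bucket_mark old = { .cached_sectors = 8, .data_type = DATA_NONE };
	struct bucket_mark new = { .dirty_sectors  = 8, .data_type = DATA_USER };

	/* a cached bucket becomes dirty user data: 512 sectors move from cached to user */
	dev_usage_update(&u, 512, old, new);
	printf("cached %llu user %llu\n", u.buckets[DATA_CACHED], u.buckets[DATA_USER]);
	return 0;
}
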
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 76ebe2ec..17a9b445 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -219,9 +219,12 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
 #define BCH_BUCKET_MARK_NOATOMIC		(1 << 0)
 #define BCH_BUCKET_MARK_GC			(1 << 1)
 
-void bch2_mark_key(struct bch_fs *, enum bkey_type, struct bkey_s_c,
-		   bool, s64, struct gc_pos,
-		   struct bch_fs_usage *, u64, unsigned);
+int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c,
+		  bool, s64, struct gc_pos,
+		  struct bch_fs_usage *, u64, unsigned);
+int bch2_mark_key(struct bch_fs *, struct bkey_s_c,
+		  bool, s64, struct gc_pos,
+		  struct bch_fs_usage *, u64, unsigned);
 void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *);
 
 void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c
index 77f90f7d..e8a671a1 100644
--- a/libbcachefs/debug.c
+++ b/libbcachefs/debug.c
@@ -55,7 +55,8 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
 	v->btree_id	= b->btree_id;
 	bch2_btree_keys_init(v, &c->expensive_debug_checks);
 
-	if (bch2_btree_pick_ptr(c, b, NULL, &pick) <= 0)
+	if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
+				       NULL, &pick) <= 0)
 		return;
 
 	ca = bch_dev_bkey_exists(c, pick.ptr.dev);
@@ -222,8 +223,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
 	k = bch2_btree_iter_peek(&iter);
 
 	while (k.k && !(err = btree_iter_err(k))) {
-		bch2_bkey_val_to_text(&PBUF(i->buf), i->c,
-				      bkey_type(0, i->id), k);
+		bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k);
 		i->bytes = strlen(i->buf);
 		BUG_ON(i->bytes >= PAGE_SIZE);
 		i->buf[i->bytes] = '\n';
diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c
index 3ec0b4c5..9a400085 100644
--- a/libbcachefs/dirent.c
+++ b/libbcachefs/dirent.c
@@ -64,8 +64,7 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
 
 const struct bch_hash_desc bch2_dirent_hash_desc = {
 	.btree_id	= BTREE_ID_DIRENTS,
-	.key_type	= BCH_DIRENT,
-	.whiteout_type	= BCH_DIRENT_WHITEOUT,
+	.key_type	= KEY_TYPE_dirent,
 	.hash_key	= dirent_hash_key,
 	.hash_bkey	= dirent_hash_bkey,
 	.cmp_key	= dirent_cmp_key,
@@ -74,58 +73,37 @@ const struct bch_hash_desc bch2_dirent_hash_desc = {
 
 const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
-	struct bkey_s_c_dirent d;
+	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
 	unsigned len;
 
-	switch (k.k->type) {
-	case BCH_DIRENT:
-		if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent))
-			return "value too small";
+	if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent))
+		return "value too small";
 
-		d = bkey_s_c_to_dirent(k);
-		len = bch2_dirent_name_bytes(d);
+	len = bch2_dirent_name_bytes(d);
+	if (!len)
+		return "empty name";
 
-		if (!len)
-			return "empty name";
+	/*
+	 * older versions of bcachefs were buggy and creating dirent
+	 * keys that were bigger than necessary:
+	 */
+	if (bkey_val_u64s(k.k) > dirent_val_u64s(len + 7))
+		return "value too big";
 
-		/*
-		 * older versions of bcachefs were buggy and creating dirent
-		 * keys that were bigger than necessary:
-		 */
-		if (bkey_val_u64s(k.k) > dirent_val_u64s(len + 7))
-			return "value too big";
+	if (len > BCH_NAME_MAX)
+		return "dirent name too big";
 
-		if (len > BCH_NAME_MAX)
-			return "dirent name too big";
-
-		return NULL;
-	case BCH_DIRENT_WHITEOUT:
-		return bkey_val_bytes(k.k) != 0
-			? "value size should be zero"
-			: NULL;
-
-	default:
-		return "invalid type";
-	}
+	return NULL;
 }
 
 void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
 			 struct bkey_s_c k)
 {
-	struct bkey_s_c_dirent d;
+	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
 
-	switch (k.k->type) {
-	case BCH_DIRENT:
-		d = bkey_s_c_to_dirent(k);
-
-		bch_scnmemcpy(out, d.v->d_name,
-			      bch2_dirent_name_bytes(d));
-		pr_buf(out, " -> %llu", d.v->d_inum);
-		break;
-	case BCH_DIRENT_WHITEOUT:
-		pr_buf(out, "whiteout");
-		break;
-	}
+	bch_scnmemcpy(out, d.v->d_name,
+		      bch2_dirent_name_bytes(d));
+	pr_buf(out, " -> %llu", d.v->d_inum);
 }
 
 static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
@@ -286,7 +264,7 @@ int bch2_dirent_rename(struct btree_trans *trans,
 				 * overwrite old_dst - just make sure to use a
 				 * whiteout when deleting src:
 				 */
-				new_src->k.type = BCH_DIRENT_WHITEOUT;
+				new_src->k.type = KEY_TYPE_whiteout;
 			}
 		} else {
 			/* Check if we need a whiteout to delete src: */
@@ -297,7 +275,7 @@ int bch2_dirent_rename(struct btree_trans *trans,
 				return ret;
 
 			if (ret)
-				new_src->k.type = BCH_DIRENT_WHITEOUT;
+				new_src->k.type = KEY_TYPE_whiteout;
 		}
 	}
 
@@ -360,7 +338,7 @@ int bch2_empty_dir(struct bch_fs *c, u64 dir_inum)
 		if (k.k->p.inode > dir_inum)
 			break;
 
-		if (k.k->type == BCH_DIRENT) {
+		if (k.k->type == KEY_TYPE_dirent) {
 			ret = -ENOTEMPTY;
 			break;
 		}
@@ -384,7 +362,7 @@ int bch2_readdir(struct bch_fs *c, struct file *file,
 
 	for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
 			   POS(inode->v.i_ino, ctx->pos), 0, k) {
-		if (k.k->type != BCH_DIRENT)
+		if (k.k->type != KEY_TYPE_dirent)
 			continue;
 
 		dirent = bkey_s_c_to_dirent(k);
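
Now that bch2_dirent_invalid() is only invoked for KEY_TYPE_dirent keys, the callback reduces to value-size and name-length sanity checks. The fragment below is a rough standalone version of those checks; the struct layout, the BCH_NAME_MAX value and the dirent_val_u64s() rounding are assumptions for illustration, not the on-disk definitions.

#include <stddef.h>
#include <stdint.h>

#define BCH_NAME_MAX 512		/* assumed limit, for illustration only */

/* simplified stand-in for struct bch_dirent: fixed header followed by the name */
struct dirent_val { uint64_t d_inum; uint8_t d_type; char d_name[]; };

/* assume dirent_val_u64s() rounds header + name up to whole u64s */
static unsigned dirent_val_u64s(unsigned name_len)
{
	return (offsetof(struct dirent_val, d_name) + name_len + 7) / 8;
}

/* returns NULL if valid, else an error string, mirroring bch2_dirent_invalid() */
static const char *dirent_invalid(unsigned val_u64s, unsigned name_len)
{
	if (val_u64s * 8 < sizeof(struct dirent_val))
		return "value too small";
	if (!name_len)
		return "empty name";
	/* older versions created oversized keys, so allow a few bytes of slack */
	if (val_u64s > dirent_val_u64s(name_len + 7))
		return "value too big";
	if (name_len > BCH_NAME_MAX)
		return "dirent name too big";
	return NULL;
}
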
diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h
index a57a5382..ed09d306 100644
--- a/libbcachefs/dirent.h
+++ b/libbcachefs/dirent.h
@@ -8,7 +8,7 @@ extern const struct bch_hash_desc bch2_dirent_hash_desc;
 const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c);
 void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
-#define bch2_bkey_dirent_ops (struct bkey_ops) {	\
+#define bch2_bkey_ops_dirent (struct bkey_ops) {	\
 	.key_invalid	= bch2_dirent_invalid,		\
 	.val_to_text	= bch2_dirent_to_text,		\
 }
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 02c51eaf..c8115f63 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -122,49 +122,39 @@ static void *stripe_csum(struct bch_stripe *s, unsigned dev, unsigned csum_idx)
 	return csums + (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
 }
 
-const char *bch2_ec_key_invalid(const struct bch_fs *c, struct bkey_s_c k)
+const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
+	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+
 	if (k.k->p.inode)
 		return "invalid stripe key";
 
-	switch (k.k->type) {
-	case BCH_STRIPE: {
-		const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+	if (bkey_val_bytes(k.k) < sizeof(*s))
+		return "incorrect value size";
 
-		if (bkey_val_bytes(k.k) < sizeof(*s))
-			return "incorrect value size";
+	if (bkey_val_u64s(k.k) != stripe_val_u64s(s))
+		return "incorrect value size";
 
-		if (bkey_val_u64s(k.k) != stripe_val_u64s(s))
-			return "incorrect value size";
-
-		return NULL;
-	}
-	default:
-		return "invalid type";
-	}
+	return NULL;
 }
 
-void bch2_ec_key_to_text(struct printbuf *out, struct bch_fs *c,
+void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
 			 struct bkey_s_c k)
 {
-	switch (k.k->type) {
-	case BCH_STRIPE: {
-		const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
-		unsigned i;
+	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+	unsigned i;
 
-		pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
-		       s->algorithm,
-		       le16_to_cpu(s->sectors),
-		       s->nr_blocks - s->nr_redundant,
-		       s->nr_redundant,
-		       s->csum_type,
-		       1U << s->csum_granularity_bits);
+	pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
+	       s->algorithm,
+	       le16_to_cpu(s->sectors),
+	       s->nr_blocks - s->nr_redundant,
+	       s->nr_redundant,
+	       s->csum_type,
+	       1U << s->csum_granularity_bits);
 
-		for (i = 0; i < s->nr_blocks; i++)
-			pr_buf(out, " %u:%llu", s->ptrs[i].dev,
-			       (u64) s->ptrs[i].offset);
-	}
-	}
+	for (i = 0; i < s->nr_blocks; i++)
+		pr_buf(out, " %u:%llu", s->ptrs[i].dev,
+		       (u64) s->ptrs[i].offset);
 }
 
 static int ptr_matches_stripe(struct bch_fs *c,
@@ -453,7 +443,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
 			     POS(0, stripe_idx),
 			     BTREE_ITER_SLOTS);
 	k = bch2_btree_iter_peek_slot(&iter);
-	if (btree_iter_err(k) || k.k->type != BCH_STRIPE) {
+	if (btree_iter_err(k) || k.k->type != KEY_TYPE_stripe) {
 		__bcache_io_error(c,
 			"error doing reconstruct read: stripe not found");
 		kfree(buf);
@@ -529,7 +519,7 @@ err:
 	return ret;
 }
 
-/* ec_stripe bucket accounting: */
+/* stripe bucket accounting: */
 
 static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
 {
@@ -550,7 +540,11 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
 		free_heap(&n);
 	}
 
-	if (!genradix_ptr_alloc(&c->ec_stripes, idx, gfp))
+	if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp))
+		return -ENOMEM;
+
+	if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
+	    !genradix_ptr_alloc(&c->stripes[1], idx, gfp))
 		return -ENOMEM;
 
 	return 0;
@@ -591,27 +585,26 @@ static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
 {
 	struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);
 
-	genradix_ptr(&c->ec_stripes, h->data[i].idx)->heap_idx = i;
+	genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i;
 }
 
 static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
 {
 	ec_stripes_heap *h = &c->ec_stripes_heap;
-	struct ec_stripe *m = genradix_ptr(&c->ec_stripes, idx);
+	struct stripe *m = genradix_ptr(&c->stripes[0], idx);
 
 	BUG_ON(!m->alive);
 	BUG_ON(m->heap_idx >= h->used);
 	BUG_ON(h->data[m->heap_idx].idx != idx);
 }
 
-static inline unsigned stripe_entry_blocks(struct ec_stripe *m)
+static inline unsigned stripe_entry_blocks(struct stripe *m)
 {
-	return atomic_read(&m->pin)
-		? UINT_MAX : atomic_read(&m->blocks_nonempty);
+	return atomic_read(&m->blocks_nonempty);
 }
 
 void bch2_stripes_heap_update(struct bch_fs *c,
-			      struct ec_stripe *m, size_t idx)
+			      struct stripe *m, size_t idx)
 {
 	ec_stripes_heap *h = &c->ec_stripes_heap;
 	bool queue_delete;
@@ -645,7 +638,7 @@ void bch2_stripes_heap_update(struct bch_fs *c,
 }
 
 void bch2_stripes_heap_del(struct bch_fs *c,
-			   struct ec_stripe *m, size_t idx)
+			   struct stripe *m, size_t idx)
 {
 	spin_lock(&c->ec_stripes_heap_lock);
 	heap_verify_backpointer(c, idx);
@@ -658,7 +651,7 @@ void bch2_stripes_heap_del(struct bch_fs *c,
 }
 
 void bch2_stripes_heap_insert(struct bch_fs *c,
-			      struct ec_stripe *m, size_t idx)
+			      struct stripe *m, size_t idx)
 {
 	spin_lock(&c->ec_stripes_heap_lock);
 
@@ -677,7 +670,9 @@ void bch2_stripes_heap_insert(struct bch_fs *c,
 	spin_unlock(&c->ec_stripes_heap_lock);
 }
 
-static void ec_stripe_delete(struct bch_fs *c, unsigned idx)
+/* stripe deletion */
+
+static void ec_stripe_delete(struct bch_fs *c, size_t idx)
 {
 	struct btree_iter iter;
 	struct bch_stripe *v = NULL;
@@ -689,7 +684,7 @@ static void ec_stripe_delete(struct bch_fs *c, unsigned idx)
 			     POS(0, idx),
 			     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 	k = bch2_btree_iter_peek_slot(&iter);
-	if (btree_iter_err(k) || k.k->type != BCH_STRIPE)
+	if (btree_iter_err(k) || k.k->type != KEY_TYPE_stripe)
 		goto out;
 
 	v = kmalloc(bkey_val_bytes(k.k), GFP_KERNEL);
@@ -716,6 +711,7 @@ static void ec_stripe_delete_work(struct work_struct *work)
 	ssize_t idx;
 
 	down_read(&c->gc_lock);
+	mutex_lock(&c->ec_stripe_create_lock);
 
 	while (1) {
 		spin_lock(&c->ec_stripes_heap_lock);
@@ -728,13 +724,15 @@ static void ec_stripe_delete_work(struct work_struct *work)
 		ec_stripe_delete(c, idx);
 	}
 
+	mutex_unlock(&c->ec_stripe_create_lock);
 	up_read(&c->gc_lock);
 }
 
+/* stripe creation: */
+
 static int ec_stripe_bkey_insert(struct bch_fs *c,
 				 struct bkey_i_stripe *stripe)
 {
-	struct ec_stripe *m;
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	int ret;
@@ -754,18 +752,13 @@ retry:
 
 	return bch2_btree_iter_unlock(&iter) ?: -ENOSPC;
 found_slot:
-	mutex_lock(&c->ec_stripes_lock);
 	ret = ec_stripe_mem_alloc(c, &iter);
-	mutex_unlock(&c->ec_stripes_lock);
 
 	if (ret == -EINTR)
 		goto retry;
 	if (ret)
 		return ret;
 
-	m = genradix_ptr(&c->ec_stripes, iter.pos.offset);
-	atomic_inc(&m->pin);
-
 	stripe->k.p = iter.pos;
 
 	ret = bch2_btree_insert_at(c, NULL, NULL,
@@ -774,14 +767,9 @@ found_slot:
 				   BTREE_INSERT_ENTRY(&iter, &stripe->k_i));
 	bch2_btree_iter_unlock(&iter);
 
-	if (ret)
-		atomic_dec(&m->pin);
-
 	return ret;
 }
 
-/* stripe creation: */
-
 static void extent_stripe_ptr_add(struct bkey_s_extent e,
 				  struct ec_stripe_buf *s,
 				  struct bch_extent_ptr *ptr,
@@ -857,7 +845,6 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
  */
 static void ec_stripe_create(struct ec_stripe_new *s)
 {
-	struct ec_stripe *ec_stripe;
 	struct bch_fs *c = s->c;
 	struct open_bucket *ob;
 	struct bkey_i *k;
@@ -897,10 +884,12 @@ static void ec_stripe_create(struct ec_stripe_new *s)
 			goto err_put_writes;
 		}
 
+	mutex_lock(&c->ec_stripe_create_lock);
+
 	ret = ec_stripe_bkey_insert(c, &s->stripe.key);
 	if (ret) {
 		bch_err(c, "error creating stripe: error creating stripe key");
-		goto err_put_writes;
+		goto err_unlock;
 	}
 
 	for_each_keylist_key(&s->keys, k) {
@@ -909,12 +898,8 @@ static void ec_stripe_create(struct ec_stripe_new *s)
 			break;
 	}
 
-	ec_stripe = genradix_ptr(&c->ec_stripes, s->stripe.key.k.p.offset);
-
-	atomic_dec(&ec_stripe->pin);
-	bch2_stripes_heap_update(c, ec_stripe,
-				 s->stripe.key.k.p.offset);
-
+err_unlock:
+	mutex_unlock(&c->ec_stripe_create_lock);
 err_put_writes:
 	percpu_ref_put(&c->writes);
 err:
@@ -1221,7 +1206,7 @@ unlock:
 	mutex_unlock(&c->ec_new_stripe_lock);
 }
 
-int bch2_fs_ec_start(struct bch_fs *c)
+int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
 {
 	struct btree_iter iter;
 	struct bkey_s_c k;
@@ -1237,19 +1222,25 @@ int bch2_fs_ec_start(struct bch_fs *c)
 	if (ret)
 		return ret;
 
-	if (!init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx),
+	if (!gc &&
+	    !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx),
 		       GFP_KERNEL))
 		return -ENOMEM;
 #if 0
-	ret = genradix_prealloc(&c->ec_stripes, idx, GFP_KERNEL);
+	ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL);
 #else
 	for (i = 0; i < idx; i++)
-		if (!genradix_ptr_alloc(&c->ec_stripes, i, GFP_KERNEL))
+		if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL))
 			return -ENOMEM;
 #endif
 	return 0;
 }
 
+int bch2_fs_ec_start(struct bch_fs *c)
+{
+	return bch2_ec_mem_alloc(c, false);
+}
+
 void bch2_fs_ec_exit(struct bch_fs *c)
 {
 	struct ec_stripe_head *h;
@@ -1270,7 +1261,7 @@ void bch2_fs_ec_exit(struct bch_fs *c)
 	}
 
 	free_heap(&c->ec_stripes_heap);
-	genradix_free(&c->ec_stripes);
+	genradix_free(&c->stripes[0]);
 	bioset_exit(&c->ec_bioset);
 }
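
The renamed struct stripe keeps a heap_idx backpointer into ec_stripes_heap, and heap_verify_backpointer() asserts that the heap entry and the stripe agree. A toy version of that invariant is sketched below, with fixed-size arrays standing in for the genradix and heap types used by the real code.

#include <assert.h>
#include <stddef.h>

/* toy model: one heap entry per stripe, ordered by blocks_nonempty */
struct stripe     { size_t heap_idx; unsigned blocks_nonempty; int alive; };
struct heap_entry { size_t idx; unsigned blocks_nonempty; };

struct stripes_heap {
	struct heap_entry data[64];
	size_t used;
};

/* like ec_stripes_heap_set_backpointer(): keep stripe->heap_idx in sync */
static void set_backpointer(struct stripes_heap *h, struct stripe *stripes, size_t i)
{
	stripes[h->data[i].idx].heap_idx = i;
}

/* like heap_verify_backpointer(): the stripe and the heap must agree */
static void verify_backpointer(struct stripes_heap *h, struct stripe *stripes, size_t idx)
{
	struct stripe *m = &stripes[idx];

	assert(m->alive);
	assert(m->heap_idx < h->used);
	assert(h->data[m->heap_idx].idx == idx);
}

/* append a stripe to the heap (sifting omitted), updating the backpointer */
static void heap_insert(struct stripes_heap *h, struct stripe *stripes, size_t idx)
{
	size_t i = h->used++;

	h->data[i] = (struct heap_entry) {
		.idx		 = idx,
		.blocks_nonempty = stripes[idx].blocks_nonempty,
	};
	stripes[idx].alive = 1;
	set_backpointer(h, stripes, i);
	verify_backpointer(h, stripes, idx);
}
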
 
diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h
index 13b875a4..c728c52c 100644
--- a/libbcachefs/ec.h
+++ b/libbcachefs/ec.h
@@ -4,13 +4,13 @@
 #include "ec_types.h"
 #include "keylist_types.h"
 
-const char *bch2_ec_key_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_ec_key_to_text(struct printbuf *, struct bch_fs *,
+const char *bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
 			 struct bkey_s_c);
 
-#define bch2_bkey_ec_ops (struct bkey_ops) {		\
-	.key_invalid	= bch2_ec_key_invalid,		\
-	.val_to_text	= bch2_ec_key_to_text,		\
+#define bch2_bkey_ops_stripe (struct bkey_ops) {	\
+	.key_invalid	= bch2_stripe_invalid,		\
+	.val_to_text	= bch2_stripe_to_text,		\
 }
 
 struct bch_read_bio;
@@ -92,14 +92,16 @@ void bch2_ec_stripe_head_put(struct ec_stripe_head *);
 struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned,
 					       unsigned, unsigned);
 
-void bch2_stripes_heap_update(struct bch_fs *, struct ec_stripe *, size_t);
-void bch2_stripes_heap_del(struct bch_fs *, struct ec_stripe *, size_t);
-void bch2_stripes_heap_insert(struct bch_fs *, struct ec_stripe *, size_t);
+void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t);
+void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t);
+void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t);
 
 void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
 
 void bch2_ec_flush_new_stripes(struct bch_fs *);
 
+int bch2_ec_mem_alloc(struct bch_fs *, bool);
+
 int bch2_fs_ec_start(struct bch_fs *);
 
 void bch2_fs_ec_exit(struct bch_fs *);
diff --git a/libbcachefs/ec_types.h b/libbcachefs/ec_types.h
index feb36010..d0429810 100644
--- a/libbcachefs/ec_types.h
+++ b/libbcachefs/ec_types.h
@@ -5,7 +5,7 @@
 
 #define EC_STRIPE_MAX	16
 
-struct ec_stripe {
+struct stripe {
 	size_t			heap_idx;
 
 	u16			sectors;
@@ -15,7 +15,6 @@ struct ec_stripe {
 	u8			nr_redundant;
 
 	u8			alive;
-	atomic_t		pin;
 	atomic_t		blocks_nonempty;
 	atomic_t		block_sectors[EC_STRIPE_MAX];
 };
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index ebaf390f..dc3fbfb6 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -27,88 +27,229 @@
 
 #include <trace/events/bcachefs.h>
 
-static void sort_key_next(struct btree_node_iter_large *iter,
-			  struct btree *b,
-			  struct btree_node_iter_set *i)
+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
 {
-	i->k += __btree_node_offset_to_key(b, i->k)->u64s;
+	struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
+	const struct bch_extent_ptr *ptr;
+	unsigned nr_ptrs = 0;
 
-	if (i->k == i->end)
-		*i = iter->data[--iter->used];
+	bkey_for_each_ptr(p, ptr)
+		nr_ptrs++;
+
+	return nr_ptrs;
+}
+
+unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c k)
+{
+	unsigned nr_ptrs = 0;
+
+	switch (k.k->type) {
+	case KEY_TYPE_btree_ptr:
+	case KEY_TYPE_extent: {
+		struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
+		const struct bch_extent_ptr *ptr;
+
+		bkey_for_each_ptr(p, ptr)
+			nr_ptrs += !ptr->cached;
+		BUG_ON(!nr_ptrs);
+		break;
+	}
+	case KEY_TYPE_reservation:
+		nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas;
+		break;
+	}
+
+	return nr_ptrs;
+}
+
+static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
+					   struct extent_ptr_decoded p)
+{
+	unsigned i, durability = 0;
+	struct bch_dev *ca;
+
+	if (p.ptr.cached)
+		return 0;
+
+	ca = bch_dev_bkey_exists(c, p.ptr.dev);
+
+	if (ca->mi.state != BCH_MEMBER_STATE_FAILED)
+		durability = max_t(unsigned, durability, ca->mi.durability);
+
+	for (i = 0; i < p.ec_nr; i++) {
+		struct stripe *s =
+			genradix_ptr(&c->stripes[0], p.idx);
+
+		if (WARN_ON(!s))
+			continue;
+
+		durability = max_t(unsigned, durability, s->nr_redundant);
+	}
+
+	return durability;
+}
+
+unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	unsigned durability = 0;
+
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+		durability += bch2_extent_ptr_durability(c, p);
+
+	return durability;
+}
+
+static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f,
+						   unsigned dev)
+{
+	struct bch_dev_io_failures *i;
+
+	for (i = f->devs; i < f->devs + f->nr; i++)
+		if (i->dev == dev)
+			return i;
+
+	return NULL;
+}
+
+void bch2_mark_io_failure(struct bch_io_failures *failed,
+			  struct extent_ptr_decoded *p)
+{
+	struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev);
+
+	if (!f) {
+		BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
+
+		f = &failed->devs[failed->nr++];
+		f->dev		= p->ptr.dev;
+		f->idx		= p->idx;
+		f->nr_failed	= 1;
+		f->nr_retries	= 0;
+	} else if (p->idx != f->idx) {
+		f->idx		= p->idx;
+		f->nr_failed	= 1;
+		f->nr_retries	= 0;
+	} else {
+		f->nr_failed++;
+	}
 }
 
 /*
- * Returns true if l > r - unless l == r, in which case returns true if l is
- * older than r.
- *
- * Necessary for btree_sort_fixup() - if there are multiple keys that compare
- * equal in different sets, we have to process them newest to oldest.
+ * returns true if p1 is better than p2:
  */
-#define key_sort_cmp(h, l, r)						\
-({									\
-	bkey_cmp_packed(b,						\
-			__btree_node_offset_to_key(b, (l).k),		\
-			__btree_node_offset_to_key(b, (r).k))		\
-									\
-	?: (l).k - (r).k;						\
-})
-
-static inline bool should_drop_next_key(struct btree_node_iter_large *iter,
-					struct btree *b)
+static inline bool ptr_better(struct bch_fs *c,
+			      const struct extent_ptr_decoded p1,
+			      const struct extent_ptr_decoded p2)
 {
-	struct btree_node_iter_set *l = iter->data, *r = iter->data + 1;
-	struct bkey_packed *k = __btree_node_offset_to_key(b, l->k);
+	if (likely(!p1.idx && !p2.idx)) {
+		struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev);
+		struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev);
 
-	if (bkey_whiteout(k))
-		return true;
+		u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
+		u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
 
-	if (iter->used < 2)
-		return false;
+		/* Pick at random, biased in favor of the faster device: */
 
-	if (iter->used > 2 &&
-	    key_sort_cmp(iter, r[0], r[1]) >= 0)
-		r++;
-
-	/*
-	 * key_sort_cmp() ensures that when keys compare equal the older key
-	 * comes first; so if l->k compares equal to r->k then l->k is older and
-	 * should be dropped.
-	 */
-	return !bkey_cmp_packed(b,
-				__btree_node_offset_to_key(b, l->k),
-				__btree_node_offset_to_key(b, r->k));
-}
-
-struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst,
-					struct btree *b,
-					struct btree_node_iter_large *iter)
-{
-	struct bkey_packed *out = dst->start;
-	struct btree_nr_keys nr;
-
-	memset(&nr, 0, sizeof(nr));
-
-	heap_resort(iter, key_sort_cmp, NULL);
-
-	while (!bch2_btree_node_iter_large_end(iter)) {
-		if (!should_drop_next_key(iter, b)) {
-			struct bkey_packed *k =
-				__btree_node_offset_to_key(b, iter->data->k);
-
-			bkey_copy(out, k);
-			btree_keys_account_key_add(&nr, 0, out);
-			out = bkey_next(out);
-		}
-
-		sort_key_next(iter, b, iter->data);
-		heap_sift_down(iter, 0, key_sort_cmp, NULL);
+		return bch2_rand_range(l1 + l2) > l1;
 	}
 
-	dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
-	return nr;
+	if (force_reconstruct_read(c))
+		return p1.idx > p2.idx;
+
+	return p1.idx < p2.idx;
 }
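
For two pointers that are both directly readable (idx == 0), ptr_better() above picks a device at random, weighted by current read latency, so the faster device ends up serving roughly l2 / (l1 + l2) of the reads. A standalone sketch of that weighting, with rand() standing in for bch2_rand_range():

#include <stdint.h>
#include <stdlib.h>

/*
 * Return nonzero to prefer device 1 over device 2, given their current read
 * latencies: device 1 wins with probability roughly l2 / (l1 + l2), so lower
 * latency means a larger share of reads.
 */
static int prefer_dev1(uint64_t l1, uint64_t l2)
{
	uint64_t r;

	if (l1 + l2 == 0)
		return 1;		/* no latency data yet: either device is fine */

	/* rand() is an illustrative stand-in for bch2_rand_range(l1 + l2) */
	r = (uint64_t) rand() % (l1 + l2);

	return r > l1;
}
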
 
-/* Common among btree and extent ptrs */
+/*
+ * Picks a non-stale pointer to read from: pointers whose devices have
+ * recorded failures in @failed (which may be NULL) are advanced to their
+ * remaining retry/reconstruct paths, or skipped once those are exhausted.
+ * Returns 1 if a pointer was picked, 0 if there was nothing to read, and
+ * -EIO if there was dirty data that could not be read.
+ */
+int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
+			       struct bch_io_failures *failed,
+			       struct extent_ptr_decoded *pick)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	struct bch_dev_io_failures *f;
+	struct bch_dev *ca;
+	int ret = 0;
+
+	if (k.k->type == KEY_TYPE_error)
+		return -EIO;
+
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		ca = bch_dev_bkey_exists(c, p.ptr.dev);
+
+		/*
+		 * If there are any dirty pointers it's an error if we can't
+		 * read:
+		 */
+		if (!ret && !p.ptr.cached)
+			ret = -EIO;
+
+		if (p.ptr.cached && ptr_stale(ca, &p.ptr))
+			continue;
+
+		f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL;
+		if (f)
+			p.idx = f->nr_failed < f->nr_retries
+				? f->idx
+				: f->idx + 1;
+
+		if (!p.idx &&
+		    !bch2_dev_is_readable(ca))
+			p.idx++;
+
+		if (force_reconstruct_read(c) &&
+		    !p.idx && p.ec_nr)
+			p.idx++;
+
+		if (p.idx >= p.ec_nr + 1)
+			continue;
+
+		if (ret > 0 && !ptr_better(c, p, *pick))
+			continue;
+
+		*pick = p;
+		ret = 1;
+	}
+
+	return ret;
+}
+
+void bch2_bkey_append_ptr(struct bkey_i *k,
+			  struct bch_extent_ptr ptr)
+{
+	EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev));
+
+	switch (k->k.type) {
+	case KEY_TYPE_btree_ptr:
+	case KEY_TYPE_extent:
+		EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
+
+		ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
+
+		memcpy((void *) &k->v + bkey_val_bytes(&k->k),
+		       &ptr,
+		       sizeof(ptr));
+		k->u64s++;
+		break;
+	default:
+		BUG();
+	}
+}
+
+void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
+{
+	struct bch_extent_ptr *ptr;
+
+	bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
+}
+
+/* extent specific utility code */
 
 const struct bch_extent_ptr *
 bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev)
@@ -122,13 +263,6 @@ bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev)
 	return NULL;
 }
 
-void bch2_extent_drop_device(struct bkey_s_extent e, unsigned dev)
-{
-	struct bch_extent_ptr *ptr;
-
-	bch2_extent_drop_ptrs(e, ptr, ptr->dev == dev);
-}
-
 const struct bch_extent_ptr *
 bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group)
 {
@@ -159,86 +293,12 @@ bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned targ
 	return NULL;
 }
 
-unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent e)
-{
-	const struct bch_extent_ptr *ptr;
-	unsigned nr_ptrs = 0;
-
-	extent_for_each_ptr(e, ptr)
-		nr_ptrs++;
-
-	return nr_ptrs;
-}
-
-unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
-{
-	struct bkey_s_c_extent e;
-	const struct bch_extent_ptr *ptr;
-	unsigned nr_ptrs = 0;
-
-	switch (k.k->type) {
-	case BCH_EXTENT:
-	case BCH_EXTENT_CACHED:
-		e = bkey_s_c_to_extent(k);
-
-		extent_for_each_ptr(e, ptr)
-			nr_ptrs += !ptr->cached;
-		break;
-
-	case BCH_RESERVATION:
-		nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas;
-		break;
-	}
-
-	return nr_ptrs;
-}
-
-static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
-					   struct extent_ptr_decoded p)
-{
-	unsigned i, durability = 0;
-	struct bch_dev *ca;
-
-	if (p.ptr.cached)
-		return 0;
-
-	ca = bch_dev_bkey_exists(c, p.ptr.dev);
-
-	if (ca->mi.state != BCH_MEMBER_STATE_FAILED)
-		durability = max_t(unsigned, durability, ca->mi.durability);
-
-	for (i = 0; i < p.ec_nr; i++) {
-		struct ec_stripe *s =
-			genradix_ptr(&c->ec_stripes, p.idx);
-
-		if (WARN_ON(!s))
-			continue;
-
-		durability = max_t(unsigned, durability, s->nr_redundant);
-	}
-
-	return durability;
-}
-
-unsigned bch2_extent_durability(struct bch_fs *c, struct bkey_s_c_extent e)
-{
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded p;
-	unsigned durability = 0;
-
-	extent_for_each_ptr_decode(e, p, entry)
-		durability += bch2_extent_ptr_durability(c, p);
-
-	return durability;
-}
-
 unsigned bch2_extent_is_compressed(struct bkey_s_c k)
 {
 	unsigned ret = 0;
 
 	switch (k.k->type) {
-	case BCH_EXTENT:
-	case BCH_EXTENT_CACHED: {
+	case KEY_TYPE_extent: {
 		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
 		const union bch_extent_entry *entry;
 		struct extent_ptr_decoded p;
@@ -270,10 +330,10 @@ bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e,
 	return false;
 }
 
-static union bch_extent_entry *extent_entry_prev(struct bkey_s_extent e,
+static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
 					  union bch_extent_entry *entry)
 {
-	union bch_extent_entry *i = e.v->start;
+	union bch_extent_entry *i = ptrs.start;
 
 	if (i == entry)
 		return NULL;
@@ -283,23 +343,24 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_s_extent e,
 	return i;
 }
 
-union bch_extent_entry *bch2_extent_drop_ptr(struct bkey_s_extent e,
-					     struct bch_extent_ptr *ptr)
+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
+					   struct bch_extent_ptr *ptr)
 {
+	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
 	union bch_extent_entry *dst, *src, *prev;
 	bool drop_crc = true;
 
-	EBUG_ON(ptr < &e.v->start->ptr ||
-		ptr >= &extent_entry_last(e)->ptr);
+	EBUG_ON(ptr < &ptrs.start->ptr ||
+		ptr >= &ptrs.end->ptr);
 	EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
 
 	src = extent_entry_next(to_entry(ptr));
-	if (src != extent_entry_last(e) &&
+	if (src != ptrs.end &&
 	    !extent_entry_is_crc(src))
 		drop_crc = false;
 
 	dst = to_entry(ptr);
-	while ((prev = extent_entry_prev(e, dst))) {
+	while ((prev = extent_entry_prev(ptrs, dst))) {
 		if (extent_entry_is_ptr(prev))
 			break;
 
@@ -313,8 +374,8 @@ union bch_extent_entry *bch2_extent_drop_ptr(struct bkey_s_extent e,
 	}
 
 	memmove_u64s_down(dst, src,
-			  (u64 *) extent_entry_last(e) - (u64 *) src);
-	e.k->u64s -= (u64 *) src - (u64 *) dst;
+			  (u64 *) ptrs.end - (u64 *) src);
+	k.k->u64s -= (u64 *) src - (u64 *) dst;
 
 	return dst;
 }
@@ -381,7 +442,7 @@ found:
 restart_narrow_pointers:
 	extent_for_each_ptr_decode(extent_i_to_s(e), p, i)
 		if (can_narrow_crc(p.crc, n)) {
-			bch2_extent_drop_ptr(extent_i_to_s(e), &i->ptr);
+			bch2_bkey_drop_ptr(extent_i_to_s(e).s, &i->ptr);
 			p.ptr.offset += p.crc.offset;
 			p.crc = n;
 			bch2_extent_ptr_decoded_append(e, &p);
@@ -406,66 +467,47 @@ static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
 		bch2_crc_cmp(l.csum, r.csum));
 }
 
-static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
-{
-	struct bch_extent_ptr *ptr;
-
-	bch2_extent_drop_ptrs(e, ptr,
-		ptr->cached &&
-		ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr));
-}
-
-bool bch2_ptr_normalize(struct bch_fs *c, struct btree *b, struct bkey_s k)
-{
-	return bch2_extent_normalize(c, k);
-}
-
 void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
 {
-	switch (k->type) {
-	case BCH_EXTENT:
-	case BCH_EXTENT_CACHED: {
-		union bch_extent_entry *entry;
-		u64 *d = (u64 *) bkeyp_val(f, k);
-		unsigned i;
+	union bch_extent_entry *entry;
+	u64 *d = (u64 *) bkeyp_val(f, k);
+	unsigned i;
 
-		for (i = 0; i < bkeyp_val_u64s(f, k); i++)
-			d[i] = swab64(d[i]);
+	for (i = 0; i < bkeyp_val_u64s(f, k); i++)
+		d[i] = swab64(d[i]);
 
-		for (entry = (union bch_extent_entry *) d;
-		     entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k));
-		     entry = extent_entry_next(entry)) {
-			switch (extent_entry_type(entry)) {
-			case BCH_EXTENT_ENTRY_ptr:
-				break;
-			case BCH_EXTENT_ENTRY_crc32:
-				entry->crc32.csum = swab32(entry->crc32.csum);
-				break;
-			case BCH_EXTENT_ENTRY_crc64:
-				entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
-				entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
-				break;
-			case BCH_EXTENT_ENTRY_crc128:
-				entry->crc128.csum.hi = (__force __le64)
-					swab64((__force u64) entry->crc128.csum.hi);
-				entry->crc128.csum.lo = (__force __le64)
-					swab64((__force u64) entry->crc128.csum.lo);
-				break;
-			case BCH_EXTENT_ENTRY_stripe_ptr:
-				break;
-			}
+	for (entry = (union bch_extent_entry *) d;
+	     entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k));
+	     entry = extent_entry_next(entry)) {
+		switch (extent_entry_type(entry)) {
+		case BCH_EXTENT_ENTRY_ptr:
+			break;
+		case BCH_EXTENT_ENTRY_crc32:
+			entry->crc32.csum = swab32(entry->crc32.csum);
+			break;
+		case BCH_EXTENT_ENTRY_crc64:
+			entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
+			entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
+			break;
+		case BCH_EXTENT_ENTRY_crc128:
+			entry->crc128.csum.hi = (__force __le64)
+				swab64((__force u64) entry->crc128.csum.hi);
+			entry->crc128.csum.lo = (__force __le64)
+				swab64((__force u64) entry->crc128.csum.lo);
+			break;
+		case BCH_EXTENT_ENTRY_stripe_ptr:
+			break;
 		}
-		break;
-	}
 	}
 }
 
 static const char *extent_ptr_invalid(const struct bch_fs *c,
-				      struct bkey_s_c_extent e,
+				      struct bkey_s_c k,
 				      const struct bch_extent_ptr *ptr,
 				      unsigned size_ondisk,
 				      bool metadata)
 {
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 	const struct bch_extent_ptr *ptr2;
 	struct bch_dev *ca;
 
@@ -477,7 +519,7 @@ static const char *extent_ptr_invalid(const struct bch_fs *c,
 	if (!ca)
 		return "pointer to invalid device";
 
-	extent_for_each_ptr(e, ptr2)
+	bkey_for_each_ptr(ptrs, ptr2)
 		if (ptr != ptr2 && ptr->dev == ptr2->dev)
 			return "multiple pointers to same device";
 
@@ -494,9 +536,10 @@ static const char *extent_ptr_invalid(const struct bch_fs *c,
 	return NULL;
 }
 
-static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c,
-			      struct bkey_s_c_extent e)
+static void bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+			      struct bkey_s_c k)
 {
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 	const union bch_extent_entry *entry;
 	struct bch_extent_crc_unpacked crc;
 	const struct bch_extent_ptr *ptr;
@@ -504,7 +547,7 @@ static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c,
 	struct bch_dev *ca;
 	bool first = true;
 
-	extent_for_each_entry(e, entry) {
+	bkey_extent_entry_for_each(ptrs, entry) {
 		if (!first)
 			pr_buf(out, " ");
 
@@ -524,7 +567,7 @@ static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c,
 		case BCH_EXTENT_ENTRY_crc32:
 		case BCH_EXTENT_ENTRY_crc64:
 		case BCH_EXTENT_ENTRY_crc128:
-			crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
+			crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
 
 			pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u",
 			       crc.compressed_size,
@@ -541,167 +584,48 @@ static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c,
 			break;
 		default:
 			pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
-			goto out;
+			return;
 		}
 
 		first = false;
 	}
-out:
-	if (bkey_extent_is_cached(e.k))
-		pr_buf(out, " cached");
-}
-
-static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f,
-						   unsigned dev)
-{
-	struct bch_dev_io_failures *i;
-
-	for (i = f->devs; i < f->devs + f->nr; i++)
-		if (i->dev == dev)
-			return i;
-
-	return NULL;
-}
-
-void bch2_mark_io_failure(struct bch_io_failures *failed,
-			  struct extent_ptr_decoded *p)
-{
-	struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev);
-
-	if (!f) {
-		BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
-
-		f = &failed->devs[failed->nr++];
-		f->dev		= p->ptr.dev;
-		f->idx		= p->idx;
-		f->nr_failed	= 1;
-		f->nr_retries	= 0;
-	} else if (p->idx != f->idx) {
-		f->idx		= p->idx;
-		f->nr_failed	= 1;
-		f->nr_retries	= 0;
-	} else {
-		f->nr_failed++;
-	}
-}
-
-/*
- * returns true if p1 is better than p2:
- */
-static inline bool ptr_better(struct bch_fs *c,
-			      const struct extent_ptr_decoded p1,
-			      const struct extent_ptr_decoded p2)
-{
-	if (likely(!p1.idx && !p2.idx)) {
-		struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev);
-		struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev);
-
-		u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
-		u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
-
-		/* Pick at random, biased in favor of the faster device: */
-
-		return bch2_rand_range(l1 + l2) > l1;
-	}
-
-	if (force_reconstruct_read(c))
-		return p1.idx > p2.idx;
-
-	return p1.idx < p2.idx;
-}
-
-static int extent_pick_read_device(struct bch_fs *c,
-				   struct bkey_s_c_extent e,
-				   struct bch_io_failures *failed,
-				   struct extent_ptr_decoded *pick)
-{
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded p;
-	struct bch_dev_io_failures *f;
-	struct bch_dev *ca;
-	int ret = 0;
-
-	extent_for_each_ptr_decode(e, p, entry) {
-		ca = bch_dev_bkey_exists(c, p.ptr.dev);
-
-		if (p.ptr.cached && ptr_stale(ca, &p.ptr))
-			continue;
-
-		f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL;
-		if (f)
-			p.idx = f->nr_failed < f->nr_retries
-				? f->idx
-				: f->idx + 1;
-
-		if (!p.idx &&
-		    !bch2_dev_is_readable(ca))
-			p.idx++;
-
-		if (force_reconstruct_read(c) &&
-		    !p.idx && p.ec_nr)
-			p.idx++;
-
-		if (p.idx >= p.ec_nr + 1)
-			continue;
-
-		if (ret && !ptr_better(c, p, *pick))
-			continue;
-
-		*pick = p;
-		ret = 1;
-	}
-
-	return ret;
 }
 
 /* Btree ptrs */
 
 const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
-	if (bkey_extent_is_cached(k.k))
-		return "cached";
-
-	if (k.k->size)
-		return "nonzero key size";
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	const struct bch_extent_ptr *ptr;
+	const char *reason;
 
 	if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
 		return "value too big";
 
-	switch (k.k->type) {
-	case BCH_EXTENT: {
-		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-		const union bch_extent_entry *entry;
-		const struct bch_extent_ptr *ptr;
-		const char *reason;
+	bkey_extent_entry_for_each(ptrs, entry) {
+		if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
+			return "invalid extent entry type";
 
-		extent_for_each_entry(e, entry) {
-			if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
-				return "invalid extent entry type";
-
-			if (!extent_entry_is_ptr(entry))
-				return "has non ptr field";
-		}
-
-		extent_for_each_ptr(e, ptr) {
-			reason = extent_ptr_invalid(c, e, ptr,
-						    c->opts.btree_node_size,
-						    true);
-			if (reason)
-				return reason;
-		}
-
-		return NULL;
+		if (!extent_entry_is_ptr(entry))
+			return "has non ptr field";
 	}
 
-	default:
-		return "invalid value type";
+	bkey_for_each_ptr(ptrs, ptr) {
+		reason = extent_ptr_invalid(c, k, ptr,
+					    c->opts.btree_node_size,
+					    true);
+		if (reason)
+			return reason;
 	}
+
+	return NULL;
 }
 
 void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
 			       struct bkey_s_c k)
 {
-	struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 	const struct bch_extent_ptr *ptr;
 	unsigned seq;
 	const char *err;
@@ -711,7 +635,7 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
 	unsigned replicas = 0;
 	bool bad;
 
-	extent_for_each_ptr(e, ptr) {
+	bkey_for_each_ptr(ptrs, ptr) {
 		ca = bch_dev_bkey_exists(c, ptr->dev);
 		replicas++;
 
@@ -737,9 +661,8 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
 	}
 
 	if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
-	    !bch2_bkey_replicas_marked(c, btree_node_type(b),
-				       e.s_c, false)) {
-		bch2_bkey_val_to_text(&PBUF(buf), c, btree_node_type(b), k);
+	    !bch2_bkey_replicas_marked(c, k, false)) {
+		bch2_bkey_val_to_text(&PBUF(buf), c, k);
 		bch2_fs_bug(c,
 			"btree key bad (replicas not marked in superblock):\n%s",
 			buf);
@@ -748,7 +671,7 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
 
 	return;
 err:
-	bch2_bkey_val_to_text(&PBUF(buf), c, btree_node_type(b), k);
+	bch2_bkey_val_to_text(&PBUF(buf), c, k);
 	bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x",
 		    err, buf, PTR_BUCKET_NR(ca, ptr),
 		    mark.gen, (unsigned) mark.v.counter);
@@ -759,25 +682,16 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
 {
 	const char *invalid;
 
-	if (bkey_extent_is_data(k.k))
-		extent_print_ptrs(out, c, bkey_s_c_to_extent(k));
+	bkey_ptrs_to_text(out, c, k);
 
 	invalid = bch2_btree_ptr_invalid(c, k);
 	if (invalid)
 		pr_buf(out, " invalid: %s", invalid);
 }
 
-int bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b,
-			struct bch_io_failures *failed,
-			struct extent_ptr_decoded *pick)
-{
-	return extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key),
-				       failed, pick);
-}
-
 /* Extents */
 
-static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
+bool __bch2_cut_front(struct bpos where, struct bkey_s k)
 {
 	u64 len = 0;
 
@@ -795,7 +709,7 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
 	 * cause offset to point to the next bucket:
 	 */
 	if (!len)
-		k.k->type = KEY_TYPE_DELETED;
+		k.k->type = KEY_TYPE_deleted;
 	else if (bkey_extent_is_data(k.k)) {
 		struct bkey_s_extent e = bkey_s_to_extent(k);
 		union bch_extent_entry *entry;
@@ -830,11 +744,6 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
 	return true;
 }
 
-bool bch2_cut_front(struct bpos where, struct bkey_i *k)
-{
-	return __bch2_cut_front(where, bkey_i_to_s(k));
-}
-
 bool bch2_cut_back(struct bpos where, struct bkey *k)
 {
 	u64 len = 0;
@@ -852,7 +761,7 @@ bool bch2_cut_back(struct bpos where, struct bkey *k)
 	k->size = len;
 
 	if (!len)
-		k->type = KEY_TYPE_DELETED;
+		k->type = KEY_TYPE_deleted;
 
 	return true;
 }
@@ -870,24 +779,6 @@ void bch2_key_resize(struct bkey *k,
 	k->size = new_size;
 }
 
-/*
- * In extent_sort_fix_overlapping(), insert_fixup_extent(),
- * extent_merge_inline() - we're modifying keys in place that are packed. To do
- * that we have to unpack the key, modify the unpacked key - then this
- * copies/repacks the unpacked to the original as necessary.
- */
-static void extent_save(struct btree *b, struct bkey_packed *dst,
-			struct bkey *src)
-{
-	struct bkey_format *f = &b->format;
-	struct bkey_i *dst_unpacked;
-
-	if ((dst_unpacked = packed_to_bkey(dst)))
-		dst_unpacked->k = *src;
-	else
-		BUG_ON(!bch2_bkey_pack_key(dst, src, f));
-}
-
 static bool extent_i_save(struct btree *b, struct bkey_packed *dst,
 			  struct bkey_i *src)
 {
@@ -906,170 +797,6 @@ static bool extent_i_save(struct btree *b, struct bkey_packed *dst,
 	return true;
 }
 
-/*
- * If keys compare equal, compare by pointer order:
- *
- * Necessary for sort_fix_overlapping() - if there are multiple keys that
- * compare equal in different sets, we have to process them newest to oldest.
- */
-#define extent_sort_cmp(h, l, r)					\
-({									\
-	struct bkey _ul = bkey_unpack_key(b,				\
-				__btree_node_offset_to_key(b, (l).k));	\
-	struct bkey _ur = bkey_unpack_key(b,				\
-				__btree_node_offset_to_key(b, (r).k));	\
-									\
-	bkey_cmp(bkey_start_pos(&_ul),					\
-		 bkey_start_pos(&_ur)) ?: (r).k - (l).k;		\
-})
-
-static inline void extent_sort_sift(struct btree_node_iter_large *iter,
-				    struct btree *b, size_t i)
-{
-	heap_sift_down(iter, i, extent_sort_cmp, NULL);
-}
-
-static inline void extent_sort_next(struct btree_node_iter_large *iter,
-				    struct btree *b,
-				    struct btree_node_iter_set *i)
-{
-	sort_key_next(iter, b, i);
-	heap_sift_down(iter, i - iter->data, extent_sort_cmp, NULL);
-}
-
-static void extent_sort_append(struct bch_fs *c,
-			       struct btree *b,
-			       struct btree_nr_keys *nr,
-			       struct bkey_packed *start,
-			       struct bkey_packed **prev,
-			       struct bkey_packed *k)
-{
-	struct bkey_format *f = &b->format;
-	BKEY_PADDED(k) tmp;
-
-	if (bkey_whiteout(k))
-		return;
-
-	bch2_bkey_unpack(b, &tmp.k, k);
-
-	if (*prev &&
-	    bch2_extent_merge(c, b, (void *) *prev, &tmp.k))
-		return;
-
-	if (*prev) {
-		bch2_bkey_pack(*prev, (void *) *prev, f);
-
-		btree_keys_account_key_add(nr, 0, *prev);
-		*prev = bkey_next(*prev);
-	} else {
-		*prev = start;
-	}
-
-	bkey_copy(*prev, &tmp.k);
-}
-
-struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
-					struct bset *dst,
-					struct btree *b,
-					struct btree_node_iter_large *iter)
-{
-	struct bkey_format *f = &b->format;
-	struct btree_node_iter_set *_l = iter->data, *_r;
-	struct bkey_packed *prev = NULL, *out, *lk, *rk;
-	struct bkey l_unpacked, r_unpacked;
-	struct bkey_s l, r;
-	struct btree_nr_keys nr;
-
-	memset(&nr, 0, sizeof(nr));
-
-	heap_resort(iter, extent_sort_cmp, NULL);
-
-	while (!bch2_btree_node_iter_large_end(iter)) {
-		lk = __btree_node_offset_to_key(b, _l->k);
-
-		if (iter->used == 1) {
-			extent_sort_append(c, b, &nr, dst->start, &prev, lk);
-			extent_sort_next(iter, b, _l);
-			continue;
-		}
-
-		_r = iter->data + 1;
-		if (iter->used > 2 &&
-		    extent_sort_cmp(iter, _r[0], _r[1]) >= 0)
-			_r++;
-
-		rk = __btree_node_offset_to_key(b, _r->k);
-
-		l = __bkey_disassemble(b, lk, &l_unpacked);
-		r = __bkey_disassemble(b, rk, &r_unpacked);
-
-		/* If current key and next key don't overlap, just append */
-		if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
-			extent_sort_append(c, b, &nr, dst->start, &prev, lk);
-			extent_sort_next(iter, b, _l);
-			continue;
-		}
-
-		/* Skip 0 size keys */
-		if (!r.k->size) {
-			extent_sort_next(iter, b, _r);
-			continue;
-		}
-
-		/*
-		 * overlap: keep the newer key and trim the older key so they
-		 * don't overlap. comparing pointers tells us which one is
-		 * newer, since the bsets are appended one after the other.
-		 */
-
-		/* can't happen because of comparison func */
-		BUG_ON(_l->k < _r->k &&
-		       !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k)));
-
-		if (_l->k > _r->k) {
-			/* l wins, trim r */
-			if (bkey_cmp(l.k->p, r.k->p) >= 0) {
-				sort_key_next(iter, b, _r);
-			} else {
-				__bch2_cut_front(l.k->p, r);
-				extent_save(b, rk, r.k);
-			}
-
-			extent_sort_sift(iter, b, _r - iter->data);
-		} else if (bkey_cmp(l.k->p, r.k->p) > 0) {
-			BKEY_PADDED(k) tmp;
-
-			/*
-			 * r wins, but it overlaps in the middle of l - split l:
-			 */
-			bkey_reassemble(&tmp.k, l.s_c);
-			bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k);
-
-			__bch2_cut_front(r.k->p, l);
-			extent_save(b, lk, l.k);
-
-			extent_sort_sift(iter, b, 0);
-
-			extent_sort_append(c, b, &nr, dst->start, &prev,
-					   bkey_to_packed(&tmp.k));
-		} else {
-			bch2_cut_back(bkey_start_pos(r.k), l.k);
-			extent_save(b, lk, l.k);
-		}
-	}
-
-	if (prev) {
-		bch2_bkey_pack(prev, (void *) prev, f);
-		btree_keys_account_key_add(&nr, 0, prev);
-		out = bkey_next(prev);
-	} else {
-		out = dst->start;
-	}
-
-	dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
-	return nr;
-}
-
 struct extent_insert_state {
 	struct btree_insert		*trans;
 	struct btree_insert_entry	*insert;
@@ -1098,13 +825,13 @@ static void verify_extent_nonoverlapping(struct btree *b,
 	struct bkey uk;
 
 	iter = *_iter;
-	k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_DISCARD);
+	k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard);
 	BUG_ON(k &&
 	       (uk = bkey_unpack_key(b, k),
 		bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0));
 
 	iter = *_iter;
-	k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_DISCARD);
+	k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard);
 #if 0
 	BUG_ON(k &&
 	       (uk = bkey_unpack_key(b, k),
@@ -1150,13 +877,13 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
 	verify_extent_nonoverlapping(l->b, &l->iter, insert);
 
 	node_iter = l->iter;
-	k = bch2_btree_node_iter_prev_filter(&node_iter, l->b, KEY_TYPE_DISCARD);
+	k = bch2_btree_node_iter_prev_filter(&node_iter, l->b, KEY_TYPE_discard);
 	if (k && !bkey_written(l->b, k) &&
 	    bch2_extent_merge_inline(c, iter, k, bkey_to_packed(insert), true))
 		return;
 
 	node_iter = l->iter;
-	k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, KEY_TYPE_DISCARD);
+	k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, KEY_TYPE_discard);
 	if (k && !bkey_written(l->b, k) &&
 	    bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), k, false))
 		return;
@@ -1180,7 +907,7 @@ static void extent_insert_committed(struct extent_insert_state *s)
 
 	bkey_copy(&split.k, insert);
 	if (s->deleting)
-		split.k.k.type = KEY_TYPE_DISCARD;
+		split.k.k.type = KEY_TYPE_discard;
 
 	bch2_cut_back(s->committed, &split.k.k);
 
@@ -1202,7 +929,7 @@ static void extent_insert_committed(struct extent_insert_state *s)
 	if (s->update_journal) {
 		bkey_copy(&split.k, !s->deleting ? insert : &s->whiteout);
 		if (s->deleting)
-			split.k.k.type = KEY_TYPE_DISCARD;
+			split.k.k.type = KEY_TYPE_discard;
 
 		bch2_cut_back(s->committed, &split.k.k);
 
@@ -1214,7 +941,6 @@ static void extent_insert_committed(struct extent_insert_state *s)
 	bch2_cut_front(s->committed, insert);
 
 	insert->k.needs_whiteout	= false;
-	s->trans->did_work		= true;
 }
 
 void bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter)
@@ -1254,7 +980,7 @@ bch2_extent_can_insert(struct btree_insert *trans,
 		*u64s += BKEY_U64s;
 
 	_k = bch2_btree_node_iter_peek_filter(&node_iter, l->b,
-					      KEY_TYPE_DISCARD);
+					      KEY_TYPE_discard);
 	if (!_k)
 		return BTREE_INSERT_OK;
 
@@ -1331,7 +1057,7 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
 			btree_account_key_drop(l->b, _k);
 
 		k.k->size = 0;
-		k.k->type = KEY_TYPE_DELETED;
+		k.k->type = KEY_TYPE_deleted;
 
 		if (_k >= btree_bset_last(l->b)->start) {
 			unsigned u64s = _k->u64s;
@@ -1392,7 +1118,7 @@ static void __bch2_insert_fixup_extent(struct extent_insert_state *s)
 
 	while (bkey_cmp(s->committed, insert->k.p) < 0 &&
 	       (_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b,
-						      KEY_TYPE_DISCARD))) {
+						      KEY_TYPE_discard))) {
 		struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked);
 		enum bch_extent_overlap overlap = bch2_extent_overlap(&insert->k, k.k);
 
@@ -1424,7 +1150,7 @@ static void __bch2_insert_fixup_extent(struct extent_insert_state *s)
 		    !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) {
 			if (!bkey_whiteout(k.k)) {
 				btree_account_key_drop(l->b, _k);
-				_k->type = KEY_TYPE_DISCARD;
+				_k->type = KEY_TYPE_discard;
 				reserve_whiteout(l->b, _k);
 			}
 			break;
@@ -1555,88 +1281,66 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
 
 const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
-	if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX)
+	struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+	const union bch_extent_entry *entry;
+	struct bch_extent_crc_unpacked crc;
+	const struct bch_extent_ptr *ptr;
+	unsigned size_ondisk = e.k->size;
+	const char *reason;
+	unsigned nonce = UINT_MAX;
+
+	if (bkey_val_u64s(e.k) > BKEY_EXTENT_VAL_U64s_MAX)
 		return "value too big";
 
-	if (!k.k->size)
-		return "zero key size";
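+	/*
+	 * crc entries apply to the pointers that follow them, so size_ondisk
+	 * tracks the current (possibly compressed) on disk size as we walk
+	 * the entries:
+	 */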
+	extent_for_each_entry(e, entry) {
+		if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
+			return "invalid extent entry type";
 
-	switch (k.k->type) {
-	case BCH_EXTENT:
-	case BCH_EXTENT_CACHED: {
-		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-		const union bch_extent_entry *entry;
-		struct bch_extent_crc_unpacked crc;
-		const struct bch_extent_ptr *ptr;
-		unsigned size_ondisk = e.k->size;
-		const char *reason;
-		unsigned nonce = UINT_MAX;
+		switch (extent_entry_type(entry)) {
+		case BCH_EXTENT_ENTRY_ptr:
+			ptr = entry_to_ptr(entry);
 
-		extent_for_each_entry(e, entry) {
-			if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
-				return "invalid extent entry type";
+			reason = extent_ptr_invalid(c, e.s_c, &entry->ptr,
+						    size_ondisk, false);
+			if (reason)
+				return reason;
+			break;
+		case BCH_EXTENT_ENTRY_crc32:
+		case BCH_EXTENT_ENTRY_crc64:
+		case BCH_EXTENT_ENTRY_crc128:
+			crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
 
-			switch (extent_entry_type(entry)) {
-			case BCH_EXTENT_ENTRY_ptr:
-				ptr = entry_to_ptr(entry);
+			if (crc.offset + e.k->size >
+			    crc.uncompressed_size)
+				return "checksum offset + key size > uncompressed size";
 
-				reason = extent_ptr_invalid(c, e, &entry->ptr,
-							    size_ondisk, false);
-				if (reason)
-					return reason;
-				break;
-			case BCH_EXTENT_ENTRY_crc32:
-			case BCH_EXTENT_ENTRY_crc64:
-			case BCH_EXTENT_ENTRY_crc128:
-				crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
+			size_ondisk = crc.compressed_size;
 
-				if (crc.offset + e.k->size >
-				    crc.uncompressed_size)
-					return "checksum offset + key size > uncompressed size";
+			if (!bch2_checksum_type_valid(c, crc.csum_type))
+				return "invalid checksum type";
 
-				size_ondisk = crc.compressed_size;
+			if (crc.compression_type >= BCH_COMPRESSION_NR)
+				return "invalid compression type";
 
-				if (!bch2_checksum_type_valid(c, crc.csum_type))
-					return "invalid checksum type";
-
-				if (crc.compression_type >= BCH_COMPRESSION_NR)
-					return "invalid compression type";
-
-				if (bch2_csum_type_is_encryption(crc.csum_type)) {
-					if (nonce == UINT_MAX)
-						nonce = crc.offset + crc.nonce;
-					else if (nonce != crc.offset + crc.nonce)
-						return "incorrect nonce";
-				}
-				break;
-			case BCH_EXTENT_ENTRY_stripe_ptr:
-				break;
+			if (bch2_csum_type_is_encryption(crc.csum_type)) {
+				if (nonce == UINT_MAX)
+					nonce = crc.offset + crc.nonce;
+				else if (nonce != crc.offset + crc.nonce)
+					return "incorrect nonce";
 			}
+			break;
+		case BCH_EXTENT_ENTRY_stripe_ptr:
+			break;
 		}
-
-		return NULL;
 	}
 
-	case BCH_RESERVATION: {
-		struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
-
-		if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation))
-			return "incorrect value size";
-
-		if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX)
-			return "invalid nr_replicas";
-
-		return NULL;
-	}
-
-	default:
-		return "invalid value type";
-	}
+	return NULL;
 }
 
-static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
-					  struct bkey_s_c_extent e)
+void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b,
+			    struct bkey_s_c k)
 {
+	struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
 	const struct bch_extent_ptr *ptr;
 	struct bch_dev *ca;
 	struct bucket_mark mark;
@@ -1698,8 +1402,7 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
 	}
 
 	if (replicas > BCH_REPLICAS_MAX) {
-		bch2_bkey_val_to_text(&PBUF(buf), c, btree_node_type(b),
-				      e.s_c);
+		bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c);
 		bch2_fs_bug(c,
 			"extent key bad (too many replicas: %u): %s",
 			replicas, buf);
@@ -1707,10 +1410,8 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
 	}
 
 	if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
-	    !bch2_bkey_replicas_marked(c, btree_node_type(b),
-				       e.s_c, false)) {
-		bch2_bkey_val_to_text(&PBUF(buf), c, btree_node_type(b),
-				      e.s_c);
+	    !bch2_bkey_replicas_marked(c, e.s_c, false)) {
+		bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c);
 		bch2_fs_bug(c,
 			"extent key bad (replicas not marked in superblock):\n%s",
 			buf);
@@ -1720,34 +1421,18 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
 	return;
 
 bad_ptr:
-	bch2_bkey_val_to_text(&PBUF(buf), c, btree_node_type(b),
-			      e.s_c);
+	bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c);
 	bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu "
 		   "gen %i type %u", buf,
 		   PTR_BUCKET_NR(ca, ptr), mark.gen, mark.data_type);
 }
 
-void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
-{
-	switch (k.k->type) {
-	case BCH_EXTENT:
-	case BCH_EXTENT_CACHED:
-		bch2_extent_debugcheck_extent(c, b, bkey_s_c_to_extent(k));
-		break;
-	case BCH_RESERVATION:
-		break;
-	default:
-		BUG();
-	}
-}
-
 void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c,
 			 struct bkey_s_c k)
 {
 	const char *invalid;
 
-	if (bkey_extent_is_data(k.k))
-		extent_print_ptrs(out, c, bkey_s_c_to_extent(k));
+	bkey_ptrs_to_text(out, c, k);
 
 	invalid = bch2_extent_invalid(c, k);
 	if (invalid)
@@ -1843,41 +1528,17 @@ found:
  */
 bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
 {
-	struct bkey_s_extent e;
+	struct bch_extent_ptr *ptr;
 
-	switch (k.k->type) {
-	case KEY_TYPE_ERROR:
-		return false;
+	bch2_bkey_drop_ptrs(k, ptr,
+		ptr->cached &&
+		ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr));
 
-	case KEY_TYPE_DELETED:
-		return true;
-	case KEY_TYPE_DISCARD:
-		return bversion_zero(k.k->version);
-	case KEY_TYPE_COOKIE:
-		return false;
+	/* will only happen if all pointers were cached: */
+	if (!bkey_val_u64s(k.k))
+		k.k->type = KEY_TYPE_deleted;
 
-	case BCH_EXTENT:
-	case BCH_EXTENT_CACHED:
-		e = bkey_s_to_extent(k);
-
-		bch2_extent_drop_stale(c, e);
-
-		if (!bkey_val_u64s(e.k)) {
-			if (bkey_extent_is_cached(e.k)) {
-				k.k->type = KEY_TYPE_DISCARD;
-				if (bversion_zero(k.k->version))
-					return true;
-			} else {
-				k.k->type = KEY_TYPE_ERROR;
-			}
-		}
-
-		return false;
-	case BCH_RESERVATION:
-		return false;
-	default:
-		BUG();
-	}
+	return false;
 }
 
 void bch2_extent_mark_replicas_cached(struct bch_fs *c,
@@ -1887,7 +1548,7 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
 {
 	union bch_extent_entry *entry;
 	struct extent_ptr_decoded p;
-	int extra = bch2_extent_durability(c, e.c) - nr_desired_replicas;
+	int extra = bch2_bkey_durability(c, e.s_c) - nr_desired_replicas;
 
 	if (target && extra > 0)
 		extent_for_each_ptr_decode(e, p, entry) {
@@ -1911,106 +1572,40 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
 		}
 }
 
-/*
- * This picks a non-stale pointer, preferably from a device other than @avoid.
- * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to
- * other devices, it will still pick a pointer from avoid.
- */
-int bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
-			 struct bch_io_failures *failed,
-			 struct extent_ptr_decoded *pick)
-{
-	int ret;
-
-	switch (k.k->type) {
-	case KEY_TYPE_ERROR:
-		return -EIO;
-
-	case BCH_EXTENT:
-	case BCH_EXTENT_CACHED:
-		ret = extent_pick_read_device(c, bkey_s_c_to_extent(k),
-					      failed, pick);
-
-		if (!ret && !bkey_extent_is_cached(k.k))
-			ret = -EIO;
-
-		return ret;
-
-	default:
-		return 0;
-	}
-}
-
-enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b,
+enum merge_result bch2_extent_merge(struct bch_fs *c,
 				    struct bkey_i *l, struct bkey_i *r)
 {
-	struct bkey_s_extent el, er;
+	struct bkey_s_extent el = bkey_i_to_s_extent(l);
+	struct bkey_s_extent er = bkey_i_to_s_extent(r);
 	union bch_extent_entry *en_l, *en_r;
 
-	if (key_merging_disabled(c))
+	if (bkey_val_u64s(&l->k) != bkey_val_u64s(&r->k))
 		return BCH_MERGE_NOMERGE;
 
-	/*
-	 * Generic header checks
-	 * Assumes left and right are in order
-	 * Left and right must be exactly aligned
-	 */
+	extent_for_each_entry(el, en_l) {
+		struct bch_extent_ptr *lp, *rp;
+		struct bch_dev *ca;
 
-	if (l->k.u64s		!= r->k.u64s ||
-	    l->k.type		!= r->k.type ||
-	    bversion_cmp(l->k.version, r->k.version) ||
-	    bkey_cmp(l->k.p, bkey_start_pos(&r->k)))
-		return BCH_MERGE_NOMERGE;
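+		/* the entry in @r at the same offset as @en_l is in @l: */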
+		en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data);
 
-	switch (l->k.type) {
-	case KEY_TYPE_DISCARD:
-	case KEY_TYPE_ERROR:
-		/* These types are mergeable, and no val to check */
-		break;
-
-	case BCH_EXTENT:
-	case BCH_EXTENT_CACHED:
-		el = bkey_i_to_s_extent(l);
-		er = bkey_i_to_s_extent(r);
-
-		extent_for_each_entry(el, en_l) {
-			struct bch_extent_ptr *lp, *rp;
-			struct bch_dev *ca;
-
-			en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data);
-
-			if ((extent_entry_type(en_l) !=
-			     extent_entry_type(en_r)) ||
-			    !extent_entry_is_ptr(en_l))
-				return BCH_MERGE_NOMERGE;
-
-			lp = &en_l->ptr;
-			rp = &en_r->ptr;
-
-			if (lp->offset + el.k->size	!= rp->offset ||
-			    lp->dev			!= rp->dev ||
-			    lp->gen			!= rp->gen)
-				return BCH_MERGE_NOMERGE;
-
-			/* We don't allow extents to straddle buckets: */
-			ca = bch_dev_bkey_exists(c, lp->dev);
-
-			if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
-				return BCH_MERGE_NOMERGE;
-		}
-
-		break;
-	case BCH_RESERVATION: {
-		struct bkey_i_reservation *li = bkey_i_to_reservation(l);
-		struct bkey_i_reservation *ri = bkey_i_to_reservation(r);
-
-		if (li->v.generation != ri->v.generation ||
-		    li->v.nr_replicas != ri->v.nr_replicas)
+		if ((extent_entry_type(en_l) !=
+		     extent_entry_type(en_r)) ||
+		    !extent_entry_is_ptr(en_l))
+			return BCH_MERGE_NOMERGE;
+
+		lp = &en_l->ptr;
+		rp = &en_r->ptr;
+
+		if (lp->offset + el.k->size	!= rp->offset ||
+		    lp->dev			!= rp->dev ||
+		    lp->gen			!= rp->gen)
+			return BCH_MERGE_NOMERGE;
+
+		/* We don't allow extents to straddle buckets: */
+		ca = bch_dev_bkey_exists(c, lp->dev);
+
+		if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
 			return BCH_MERGE_NOMERGE;
-		break;
-	}
-	default:
-		return BCH_MERGE_NOMERGE;
 	}
 
 	l->k.needs_whiteout |= r->k.needs_whiteout;
@@ -2060,7 +1655,7 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
 	bch2_bkey_unpack(b, &li.k, l);
 	bch2_bkey_unpack(b, &ri.k, r);
 
-	ret = bch2_extent_merge(c, b, &li.k, &ri.k);
+	ret = bch2_bkey_merge(c, &li.k, &ri.k);
 	if (ret == BCH_MERGE_NOMERGE)
 		return false;
 
@@ -2128,3 +1723,54 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
 
 	return ret;
 }
+
+/* KEY_TYPE_reservation: */
+
+const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+
+	if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation))
+		return "incorrect value size";
+
+	if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX)
+		return "invalid nr_replicas";
+
+	return NULL;
+}
+
+void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
+			      struct bkey_s_c k)
+{
+	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+
+	pr_buf(out, "generation %u replicas %u",
+	       le32_to_cpu(r.v->generation),
+	       r.v->nr_replicas);
+}
+
+enum merge_result bch2_reservation_merge(struct bch_fs *c,
+					 struct bkey_i *l, struct bkey_i *r)
+{
+	struct bkey_i_reservation *li = bkey_i_to_reservation(l);
+	struct bkey_i_reservation *ri = bkey_i_to_reservation(r);
+
+	if (li->v.generation != ri->v.generation ||
+	    li->v.nr_replicas != ri->v.nr_replicas)
+		return BCH_MERGE_NOMERGE;
+
+	l->k.needs_whiteout |= r->k.needs_whiteout;
+
+	/*
+	 * Keys with no pointers aren't restricted to one bucket and could
+	 * overflow KEY_SIZE
+	 */
+	if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) {
+		bch2_key_resize(&l->k, KEY_SIZE_MAX);
+		bch2_cut_front(l->k.p, r);
+		return BCH_MERGE_PARTIAL;
+	}
+
+	bch2_key_resize(&l->k, l->k.size + r->k.size);
+
+	return BCH_MERGE_MERGE;
+}
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 307abd26..e6e9c307 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -6,141 +6,37 @@
 #include "extents_types.h"
 
 struct bch_fs;
-struct journal_res;
-struct btree_node_iter;
-struct btree_node_iter_large;
 struct btree_insert;
 struct btree_insert_entry;
-struct bch_devs_mask;
-union bch_extent_crc;
 
-const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *,
-			       struct bkey_s_c);
-void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
-			    struct bkey_s_c);
-void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *);
+/* extent entries: */
 
-#define bch2_bkey_btree_ops (struct bkey_ops) {			\
-	.key_invalid	= bch2_btree_ptr_invalid,		\
-	.key_debugcheck	= bch2_btree_ptr_debugcheck,		\
-	.val_to_text	= bch2_btree_ptr_to_text,		\
-	.swab		= bch2_ptr_swab,			\
-}
+#define extent_entry_last(_e)		bkey_val_end(_e)
 
-const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
-void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-bool bch2_ptr_normalize(struct bch_fs *, struct btree *, struct bkey_s);
-enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *,
-				    struct bkey_i *, struct bkey_i *);
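+/* extract the ptr from an extent entry, preserving constness: */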
+#define entry_to_ptr(_entry)						\
+({									\
+	EBUG_ON((_entry) && !extent_entry_is_ptr(_entry));		\
+									\
+	__builtin_choose_expr(						\
+		type_is_exact(_entry, const union bch_extent_entry *),	\
+		(const struct bch_extent_ptr *) (_entry),		\
+		(struct bch_extent_ptr *) (_entry));			\
+})
 
-#define bch2_bkey_extent_ops (struct bkey_ops) {		\
-	.key_invalid	= bch2_extent_invalid,			\
-	.key_debugcheck	= bch2_extent_debugcheck,		\
-	.val_to_text	= bch2_extent_to_text,			\
-	.swab		= bch2_ptr_swab,			\
-	.key_normalize	= bch2_ptr_normalize,			\
-	.key_merge	= bch2_extent_merge,			\
-	.is_extents	= true,					\
-}
-
-struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *,
-						  struct btree *,
-						  struct btree_node_iter_large *);
-struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
-						     struct bset *,
-						     struct btree *,
-						     struct btree_node_iter_large *);
-
-void bch2_mark_io_failure(struct bch_io_failures *,
-			  struct extent_ptr_decoded *);
-int bch2_btree_pick_ptr(struct bch_fs *, const struct btree *,
-			struct bch_io_failures *,
-			struct extent_ptr_decoded *);
-int bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c,
-			 struct bch_io_failures *,
-			 struct extent_ptr_decoded *);
-
-void bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *);
-
-static inline bool bch2_extent_is_atomic(struct bkey *k,
-					 struct btree_iter *iter)
-{
-	struct btree *b = iter->l[0].b;
-
-	return bkey_cmp(k->p, b->key.k.p) <= 0 &&
-		bkey_cmp(bkey_start_pos(k), b->data->min_key) >= 0;
-}
-
-enum btree_insert_ret
-bch2_extent_can_insert(struct btree_insert *, struct btree_insert_entry *,
-		       unsigned *);
-enum btree_insert_ret
-bch2_insert_fixup_extent(struct btree_insert *, struct btree_insert_entry *);
-
-bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
-void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent,
-				      unsigned, unsigned);
-
-const struct bch_extent_ptr *
-bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
-void bch2_extent_drop_device(struct bkey_s_extent, unsigned);
-const struct bch_extent_ptr *
-bch2_extent_has_group(struct bch_fs *, struct bkey_s_c_extent, unsigned);
-const struct bch_extent_ptr *
-bch2_extent_has_target(struct bch_fs *, struct bkey_s_c_extent, unsigned);
-
-unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
-unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
-unsigned bch2_extent_is_compressed(struct bkey_s_c);
-
-unsigned bch2_extent_durability(struct bch_fs *, struct bkey_s_c_extent);
-
-bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
-			     struct bch_extent_ptr, u64);
-
-static inline bool bkey_extent_is_data(const struct bkey *k)
-{
-	switch (k->type) {
-	case BCH_EXTENT:
-	case BCH_EXTENT_CACHED:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static inline bool bkey_extent_is_allocation(const struct bkey *k)
-{
-	switch (k->type) {
-	case BCH_EXTENT:
-	case BCH_EXTENT_CACHED:
-	case BCH_RESERVATION:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static inline bool bch2_extent_is_fully_allocated(struct bkey_s_c k)
-{
-	return bkey_extent_is_allocation(k.k) &&
-		!bch2_extent_is_compressed(k);
-}
-
-static inline bool bkey_extent_is_cached(const struct bkey *k)
-{
-	return k->type == BCH_EXTENT_CACHED;
-}
-
-static inline void bkey_extent_set_cached(struct bkey *k, bool cached)
-{
-	EBUG_ON(k->type != BCH_EXTENT &&
-		k->type != BCH_EXTENT_CACHED);
-
-	k->type = cached ? BCH_EXTENT_CACHED : BCH_EXTENT;
-}
+/* downcast, preserves const */
+#define to_entry(_entry)						\
+({									\
+	BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) &&	\
+		     !type_is(_entry, struct bch_extent_ptr *) &&	\
+		     !type_is(_entry, struct bch_extent_stripe_ptr *));	\
+									\
+	__builtin_choose_expr(						\
+		(type_is_exact(_entry, const union bch_extent_crc *) ||	\
+		 type_is_exact(_entry, const struct bch_extent_ptr *) ||\
+		 type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\
+		(const union bch_extent_entry *) (_entry),		\
+		(union bch_extent_entry *) (_entry));			\
+})
 
 static inline unsigned
 __extent_entry_type(const union bch_extent_entry *e)
@@ -205,21 +101,6 @@ union bch_extent_crc {
 	struct bch_extent_crc128	crc128;
 };
 
-/* downcast, preserves const */
-#define to_entry(_entry)						\
-({									\
-	BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) &&	\
-		     !type_is(_entry, struct bch_extent_ptr *) &&	\
-		     !type_is(_entry, struct bch_extent_stripe_ptr *));	\
-									\
-	__builtin_choose_expr(						\
-		(type_is_exact(_entry, const union bch_extent_crc *) ||	\
-		 type_is_exact(_entry, const struct bch_extent_ptr *) ||\
-		 type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\
-		(const union bch_extent_entry *) (_entry),		\
-		(union bch_extent_entry *) (_entry));			\
-})
-
 #define __entry_to_crc(_entry)						\
 	__builtin_choose_expr(						\
 		type_is_exact(_entry, const union bch_extent_entry *),	\
@@ -233,18 +114,6 @@ union bch_extent_crc {
 	__entry_to_crc(_entry);						\
 })
 
-#define entry_to_ptr(_entry)						\
-({									\
-	EBUG_ON((_entry) && !extent_entry_is_ptr(_entry));		\
-									\
-	__builtin_choose_expr(						\
-		type_is_exact(_entry, const union bch_extent_entry *),	\
-		(const struct bch_extent_ptr *) (_entry),		\
-		(struct bch_extent_ptr *) (_entry));			\
-})
-
-/* checksum entries: */
-
 static inline struct bch_extent_crc_unpacked
 bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
 {
@@ -302,43 +171,317 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
 #undef common_fields
 }
 
-/* Extent entry iteration: */
+/* bkey_ptrs: generically over any key type that has ptrs */
+
+struct bkey_ptrs_c {
+	const union bch_extent_entry	*start;
+	const union bch_extent_entry	*end;
+};
+
+struct bkey_ptrs {
+	union bch_extent_entry	*start;
+	union bch_extent_entry	*end;
+};
+
+/* iterate over bkey ptrs */
 
 #define extent_entry_next(_entry)					\
 	((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
 
-#define extent_entry_last(_e)						\
-	vstruct_idx((_e).v, bkey_val_u64s((_e).k))
+#define __bkey_extent_entry_for_each_from(_start, _end, _entry)		\
+	for ((_entry) = (_start);					\
+	     (_entry) < (_end);						\
+	     (_entry) = extent_entry_next(_entry))
 
-/* Iterate over all entries: */
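+/* find the next ptr entry at or after @_ptr, or NULL if there are none: */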
+#define __bkey_ptr_next(_ptr, _end)					\
+({									\
+	typeof(_end) _entry;						\
+									\
+	__bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry)	\
+		if (extent_entry_is_ptr(_entry))			\
+			break;						\
+									\
+	_entry < (_end) ? entry_to_ptr(_entry) : NULL;			\
+})
+
+#define bkey_extent_entry_for_each_from(_p, _entry, _start)		\
+	__bkey_extent_entry_for_each_from(_start, (_p).end, _entry)
+
+#define bkey_extent_entry_for_each(_p, _entry)				\
+	bkey_extent_entry_for_each_from(_p, _entry, (_p).start)
+
+#define __bkey_for_each_ptr(_start, _end, _ptr)				\
+	for ((_ptr) = (_start);						\
+	     ((_ptr) = __bkey_ptr_next(_ptr, _end));			\
+	     (_ptr)++)
+
+#define bkey_ptr_next(_p, _ptr)						\
+	__bkey_ptr_next(_ptr, (_p).end)
+
+#define bkey_for_each_ptr(_p, _ptr)					\
+	__bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr)
+
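+/*
+ * advance @_entry to the next ptr entry, decoding any crc/stripe_ptr entries
+ * encountered along the way into @_ptr:
+ */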
+#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry)			\
+({									\
+	__label__ out;							\
+									\
+	(_ptr).idx	= 0;						\
+	(_ptr).ec_nr	= 0;						\
+									\
+	__bkey_extent_entry_for_each_from(_entry, _end, _entry)		\
+		switch (extent_entry_type(_entry)) {			\
+		case BCH_EXTENT_ENTRY_ptr:				\
+			(_ptr).ptr		= _entry->ptr;		\
+			goto out;					\
+		case BCH_EXTENT_ENTRY_crc32:				\
+		case BCH_EXTENT_ENTRY_crc64:				\
+		case BCH_EXTENT_ENTRY_crc128:				\
+			(_ptr).crc = bch2_extent_crc_unpack(_k,		\
+					entry_to_crc(_entry));		\
+			break;						\
+		case BCH_EXTENT_ENTRY_stripe_ptr:			\
+			(_ptr).ec[(_ptr).ec_nr++] = _entry->stripe_ptr;	\
+			break;						\
+		}							\
+out:									\
+	_entry < (_end);						\
+})
+
+#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry)	\
+	for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL),		\
+	     (_entry) = _start;						\
+	     __bkey_ptr_next_decode(_k, _end, _ptr, _entry);		\
+	     (_entry) = extent_entry_next(_entry))
+
+#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry)			\
+	__bkey_for_each_ptr_decode(_k, (_p).start, (_p).end,		\
+				   _ptr, _entry)
+
+/* utility code common to all keys with pointers: */
+
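+/*
+ * return the range of extent entries for any key type that has pointers
+ * (empty for other key types):
+ */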
+static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
+{
+	switch (k.k->type) {
+	case KEY_TYPE_btree_ptr: {
+		struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k);
+		return (struct bkey_ptrs_c) {
+			to_entry(&e.v->start[0]),
+			to_entry(bkey_val_end(e))
+		};
+	}
+	case KEY_TYPE_extent: {
+		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+		return (struct bkey_ptrs_c) {
+			e.v->start,
+			extent_entry_last(e)
+		};
+	}
+	case KEY_TYPE_stripe: {
+		struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+		return (struct bkey_ptrs_c) {
+			to_entry(&s.v->ptrs[0]),
+			to_entry(&s.v->ptrs[s.v->nr_blocks]),
+		};
+	}
+	default:
+		return (struct bkey_ptrs_c) { NULL, NULL };
+	}
+}
+
+static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
+{
+	struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c);
+
+	return (struct bkey_ptrs) {
+		(void *) p.start,
+		(void *) p.end
+	};
+}
+
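+/* device lists for a key: all pointers, dirty pointers only, cached only: */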
+static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
+{
+	struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+	struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
+	const struct bch_extent_ptr *ptr;
+
+	bkey_for_each_ptr(p, ptr)
+		ret.devs[ret.nr++] = ptr->dev;
+
+	return ret;
+}
+
+static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
+{
+	struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+	struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
+	const struct bch_extent_ptr *ptr;
+
+	bkey_for_each_ptr(p, ptr)
+		if (!ptr->cached)
+			ret.devs[ret.nr++] = ptr->dev;
+
+	return ret;
+}
+
+static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
+{
+	struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+	struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
+	const struct bch_extent_ptr *ptr;
+
+	bkey_for_each_ptr(p, ptr)
+		if (ptr->cached)
+			ret.devs[ret.nr++] = ptr->dev;
+
+	return ret;
+}
+
+static inline bool bch2_bkey_has_device(struct bkey_s_c k, unsigned dev)
+{
+	struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
+	const struct bch_extent_ptr *ptr;
+
+	bkey_for_each_ptr(p, ptr)
+		if (ptr->dev == dev)
+			return true;
+
+	return false;
+}
+
+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
+unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c);
+unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
+
+void bch2_mark_io_failure(struct bch_io_failures *,
+			  struct extent_ptr_decoded *);
+int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
+			       struct bch_io_failures *,
+			       struct extent_ptr_decoded *);
+
+/* bch_btree_ptr: */
+
+const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *,
+			       struct bkey_s_c);
+void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
+			    struct bkey_s_c);
+void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *);
+
+#define bch2_bkey_ops_btree_ptr (struct bkey_ops) {		\
+	.key_invalid	= bch2_btree_ptr_invalid,		\
+	.key_debugcheck	= bch2_btree_ptr_debugcheck,		\
+	.val_to_text	= bch2_btree_ptr_to_text,		\
+	.swab		= bch2_ptr_swab,			\
+}
+
+/* bch_extent: */
+
+const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
+void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
+enum merge_result bch2_extent_merge(struct bch_fs *,
+				    struct bkey_i *, struct bkey_i *);
+
+#define bch2_bkey_ops_extent (struct bkey_ops) {		\
+	.key_invalid	= bch2_extent_invalid,			\
+	.key_debugcheck	= bch2_extent_debugcheck,		\
+	.val_to_text	= bch2_extent_to_text,			\
+	.swab		= bch2_ptr_swab,			\
+	.key_normalize	= bch2_extent_normalize,		\
+	.key_merge	= bch2_extent_merge,			\
+}
+
+/* bch_reservation: */
+
+const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+enum merge_result bch2_reservation_merge(struct bch_fs *,
+					 struct bkey_i *, struct bkey_i *);
+
+#define bch2_bkey_ops_reservation (struct bkey_ops) {		\
+	.key_invalid	= bch2_reservation_invalid,		\
+	.val_to_text	= bch2_reservation_to_text,		\
+	.key_merge	= bch2_reservation_merge,		\
+}
+
+void bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *);
+
+static inline bool bch2_extent_is_atomic(struct bkey *k,
+					 struct btree_iter *iter)
+{
+	struct btree *b = iter->l[0].b;
+
+	return bkey_cmp(k->p, b->key.k.p) <= 0 &&
+		bkey_cmp(bkey_start_pos(k), b->data->min_key) >= 0;
+}
+
+enum btree_insert_ret
+bch2_extent_can_insert(struct btree_insert *, struct btree_insert_entry *,
+		       unsigned *);
+enum btree_insert_ret
+bch2_insert_fixup_extent(struct btree_insert *, struct btree_insert_entry *);
+
+void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent,
+				      unsigned, unsigned);
+
+const struct bch_extent_ptr *
+bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
+const struct bch_extent_ptr *
+bch2_extent_has_group(struct bch_fs *, struct bkey_s_c_extent, unsigned);
+const struct bch_extent_ptr *
+bch2_extent_has_target(struct bch_fs *, struct bkey_s_c_extent, unsigned);
+
+unsigned bch2_extent_is_compressed(struct bkey_s_c);
+
+bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
+			     struct bch_extent_ptr, u64);
+
+static inline bool bkey_extent_is_data(const struct bkey *k)
+{
+	switch (k->type) {
+	case KEY_TYPE_btree_ptr:
+	case KEY_TYPE_extent:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static inline bool bkey_extent_is_allocation(const struct bkey *k)
+{
+	switch (k->type) {
+	case KEY_TYPE_extent:
+	case KEY_TYPE_reservation:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static inline bool bch2_extent_is_fully_allocated(struct bkey_s_c k)
+{
+	return bkey_extent_is_allocation(k.k) &&
+		!bch2_extent_is_compressed(k);
+}
+
+void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
+void bch2_bkey_drop_device(struct bkey_s, unsigned);
+
+/* Extent entry iteration: */
 
 #define extent_for_each_entry_from(_e, _entry, _start)			\
-	for ((_entry) = _start;						\
-	     (_entry) < extent_entry_last(_e);				\
-	     (_entry) = extent_entry_next(_entry))
+	__bkey_extent_entry_for_each_from(_start,			\
+				extent_entry_last(_e), _entry)
 
 #define extent_for_each_entry(_e, _entry)				\
 	extent_for_each_entry_from(_e, _entry, (_e).v->start)
 
-/* Iterate over pointers only: */
-
 #define extent_ptr_next(_e, _ptr)					\
-({									\
-	typeof(&(_e).v->start[0]) _entry;				\
-									\
-	extent_for_each_entry_from(_e, _entry, to_entry(_ptr))		\
-		if (extent_entry_is_ptr(_entry))			\
-			break;						\
-									\
-	_entry < extent_entry_last(_e) ? entry_to_ptr(_entry) : NULL;	\
-})
+	__bkey_ptr_next(_ptr, extent_entry_last(_e))
 
 #define extent_for_each_ptr(_e, _ptr)					\
-	for ((_ptr) = &(_e).v->start->ptr;				\
-	     ((_ptr) = extent_ptr_next(_e, _ptr));			\
-	     (_ptr)++)
-
-/* Iterate over crcs only: */
+	__bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr)
 
 #define extent_crc_next(_e, _crc, _iter)				\
 ({									\
@@ -357,41 +500,9 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
 	     extent_crc_next(_e, _crc, _iter);				\
 	     (_iter) = extent_entry_next(_iter))
 
-/* Iterate over pointers, with crcs: */
-
-#define __extent_ptr_next_decode(_e, _ptr, _entry)			\
-({									\
-	__label__ out;							\
-									\
-	(_ptr).idx	= 0;						\
-	(_ptr).ec_nr	= 0;						\
-									\
-	extent_for_each_entry_from(_e, _entry, _entry)			\
-		switch (extent_entry_type(_entry)) {			\
-		case BCH_EXTENT_ENTRY_ptr:				\
-			(_ptr).ptr		= _entry->ptr;		\
-			goto out;					\
-		case BCH_EXTENT_ENTRY_crc32:				\
-		case BCH_EXTENT_ENTRY_crc64:				\
-		case BCH_EXTENT_ENTRY_crc128:				\
-			(_ptr).crc = bch2_extent_crc_unpack((_e).k,	\
-					entry_to_crc(_entry));		\
-			break;						\
-		case BCH_EXTENT_ENTRY_stripe_ptr:			\
-			(_ptr).ec[(_ptr).ec_nr++] = _entry->stripe_ptr;	\
-			break;						\
-		}							\
-out:									\
-	_entry < extent_entry_last(_e);					\
-})
-
 #define extent_for_each_ptr_decode(_e, _ptr, _entry)			\
-	for ((_ptr).crc = bch2_extent_crc_unpack((_e).k, NULL),		\
-	     (_entry) = (_e).v->start;					\
-	     __extent_ptr_next_decode(_e, _ptr, _entry);		\
-	     (_entry) = extent_entry_next(_entry))
-
-/* Iterate over pointers backwards: */
+	__bkey_for_each_ptr_decode((_e).k, (_e).v->start,		\
+				   extent_entry_last(_e), _ptr, _entry)
 
 void bch2_extent_crc_append(struct bkey_i_extent *,
 			    struct bch_extent_crc_unpacked);
@@ -420,96 +531,23 @@ static inline void __extent_entry_push(struct bkey_i_extent *e)
 	e->k.u64s += extent_entry_u64s(entry);
 }
 
-static inline void extent_ptr_append(struct bkey_i_extent *e,
-				     struct bch_extent_ptr ptr)
-{
-	ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
-	extent_entry_last(extent_i_to_s(e))->ptr = ptr;
-	__extent_entry_push(e);
-}
-
-static inline struct bch_devs_list bch2_extent_devs(struct bkey_s_c_extent e)
-{
-	struct bch_devs_list ret = (struct bch_devs_list) { 0 };
-	const struct bch_extent_ptr *ptr;
-
-	extent_for_each_ptr(e, ptr)
-		ret.devs[ret.nr++] = ptr->dev;
-
-	return ret;
-}
-
-static inline struct bch_devs_list bch2_extent_dirty_devs(struct bkey_s_c_extent e)
-{
-	struct bch_devs_list ret = (struct bch_devs_list) { 0 };
-	const struct bch_extent_ptr *ptr;
-
-	extent_for_each_ptr(e, ptr)
-		if (!ptr->cached)
-			ret.devs[ret.nr++] = ptr->dev;
-
-	return ret;
-}
-
-static inline struct bch_devs_list bch2_extent_cached_devs(struct bkey_s_c_extent e)
-{
-	struct bch_devs_list ret = (struct bch_devs_list) { 0 };
-	const struct bch_extent_ptr *ptr;
-
-	extent_for_each_ptr(e, ptr)
-		if (ptr->cached)
-			ret.devs[ret.nr++] = ptr->dev;
-
-	return ret;
-}
-
-static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
-{
-	switch (k.k->type) {
-	case BCH_EXTENT:
-	case BCH_EXTENT_CACHED:
-		return bch2_extent_devs(bkey_s_c_to_extent(k));
-	default:
-		return (struct bch_devs_list) { .nr = 0 };
-	}
-}
-
-static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
-{
-	switch (k.k->type) {
-	case BCH_EXTENT:
-	case BCH_EXTENT_CACHED:
-		return bch2_extent_dirty_devs(bkey_s_c_to_extent(k));
-	default:
-		return (struct bch_devs_list) { .nr = 0 };
-	}
-}
-
-static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
-{
-	switch (k.k->type) {
-	case BCH_EXTENT:
-	case BCH_EXTENT_CACHED:
-		return bch2_extent_cached_devs(bkey_s_c_to_extent(k));
-	default:
-		return (struct bch_devs_list) { .nr = 0 };
-	}
-}
-
 bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent,
 				 struct bch_extent_crc_unpacked);
 bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked);
 
-union bch_extent_entry *bch2_extent_drop_ptr(struct bkey_s_extent ,
-					     struct bch_extent_ptr *);
+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
+					   struct bch_extent_ptr *);
 
-#define bch2_extent_drop_ptrs(_e, _ptr, _cond)				\
+#define bch2_bkey_drop_ptrs(_k, _ptr, _cond)				\
 do {									\
-	_ptr = &(_e).v->start->ptr;					\
+	struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k);			\
 									\
-	while ((_ptr = extent_ptr_next(e, _ptr))) {			\
+	_ptr = &_ptrs.start->ptr;					\
+									\
+	while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) {			\
 		if (_cond) {						\
-			_ptr = (void *) bch2_extent_drop_ptr(_e, _ptr);	\
+			_ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr);	\
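+			/* dropping a ptr shifts later entries down - refresh: */ \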
+			_ptrs = bch2_bkey_ptrs(_k);			\
 			continue;					\
 		}							\
 									\
@@ -517,10 +555,34 @@ do {									\
 	}								\
 } while (0)
 
-bool bch2_cut_front(struct bpos, struct bkey_i *);
+bool __bch2_cut_front(struct bpos, struct bkey_s);
+
+static inline bool bch2_cut_front(struct bpos where, struct bkey_i *k)
+{
+	return __bch2_cut_front(where, bkey_i_to_s(k));
+}
+
 bool bch2_cut_back(struct bpos, struct bkey *);
 void bch2_key_resize(struct bkey *, unsigned);
 
+/*
+ * In extent_sort_fix_overlapping(), insert_fixup_extent(),
+ * extent_merge_inline() - we're modifying keys in place that are packed. To do
+ * that we have to unpack the key, modify the unpacked key - then this
+ * copies/repacks the unpacked key back into the original as necessary.
+ */
+static inline void extent_save(struct btree *b, struct bkey_packed *dst,
+			       struct bkey *src)
+{
+	struct bkey_format *f = &b->format;
+	struct bkey_i *dst_unpacked;
+
+	if ((dst_unpacked = packed_to_bkey(dst)))
+		dst_unpacked->k = *src;
+	else
+		BUG_ON(!bch2_bkey_pack_key(dst, src, f));
+}
+
 int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64);
 
 #endif /* _BCACHEFS_EXTENTS_H */
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 34cfd5d6..41ac5d48 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -121,7 +121,7 @@ static void bch2_quota_reservation_put(struct bch_fs *c,
 	BUG_ON(res->sectors > inode->ei_quota_reserved);
 
 	bch2_quota_acct(c, inode->ei_qid, Q_SPC,
-			-((s64) res->sectors), BCH_QUOTA_PREALLOC);
+			-((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
 	inode->ei_quota_reserved -= res->sectors;
 	mutex_unlock(&inode->ei_quota_lock);
 
@@ -138,7 +138,7 @@ static int bch2_quota_reservation_add(struct bch_fs *c,
 
 	mutex_lock(&inode->ei_quota_lock);
 	ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
-			      check_enospc ? BCH_QUOTA_PREALLOC : BCH_QUOTA_NOCHECK);
+			      check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
 	if (likely(!ret)) {
 		inode->ei_quota_reserved += sectors;
 		res->sectors += sectors;
@@ -220,7 +220,7 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
 		quota_res->sectors -= sectors;
 		inode->ei_quota_reserved -= sectors;
 	} else {
-		bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, BCH_QUOTA_WARN);
+		bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
 	}
 #endif
 	inode->v.i_blocks += sectors;
@@ -242,9 +242,15 @@ static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter,
 	bch2_btree_iter_link(_iter, &iter);
 	bch2_btree_iter_copy(&iter, _iter);
 
-	for_each_btree_key_continue(&iter, BTREE_ITER_SLOTS, old) {
-		if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0)
-			break;
+	old = bch2_btree_iter_peek_slot(&iter);
+
+	while (1) {
+		/*
+		 * it should not be possible to get an error here, since we
+		 * never advance past @new, and thus never past the leaf node
+		 * @_iter currently points to:
+		 */
+		BUG_ON(btree_iter_err(old));
 
 		if (allocating &&
 		    !bch2_extent_is_fully_allocated(old))
@@ -256,6 +262,11 @@ static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter,
 			      bkey_start_offset(old.k))) *
 			(bkey_extent_is_allocation(&new->k) -
 			 bkey_extent_is_allocation(old.k));
+
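+		/* stop once we've walked every slot that @new overwrites: */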
+		if (bkey_cmp(old.k->p, new->k.p) >= 0)
+			break;
+
+		old = bch2_btree_iter_next_slot(&iter);
 	}
 
 	bch2_btree_iter_unlink(&iter);
@@ -848,7 +859,7 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
 	struct bvec_iter iter;
 	struct bio_vec bv;
 	unsigned nr_ptrs = !bch2_extent_is_compressed(k)
-		? bch2_extent_nr_dirty_ptrs(k)
+		? bch2_bkey_nr_dirty_ptrs(k)
 		: 0;
 
 	bio_for_each_segment(bv, bio, iter) {
@@ -2397,7 +2408,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
 		BUG_ON(bkey_cmp(dst->pos, bkey_start_pos(&copy.k.k)));
 
 		ret = bch2_disk_reservation_get(c, &disk_res, copy.k.k.size,
-				bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(&copy.k)),
+				bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&copy.k)),
 				BCH_DISK_RESERVATION_NOFAIL);
 		BUG_ON(ret);
 
@@ -2504,7 +2515,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
 			goto btree_iter_err;
 
 		/* already reserved */
-		if (k.k->type == BCH_RESERVATION &&
+		if (k.k->type == KEY_TYPE_reservation &&
 		    bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) {
 			bch2_btree_iter_next_slot(iter);
 			continue;
@@ -2517,7 +2528,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
 		}
 
 		bkey_reservation_init(&reservation.k_i);
-		reservation.k.type	= BCH_RESERVATION;
+		reservation.k.type	= KEY_TYPE_reservation;
 		reservation.k.p		= k.k->p;
 		reservation.k.size	= k.k->size;
 
@@ -2525,7 +2536,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
 		bch2_cut_back(end_pos, &reservation.k);
 
 		sectors = reservation.k.size;
-		reservation.v.nr_replicas = bch2_extent_nr_dirty_ptrs(k);
+		reservation.v.nr_replicas = bch2_bkey_nr_dirty_ptrs(k);
 
 		if (!bkey_extent_is_allocation(k.k)) {
 			ret = bch2_quota_reservation_add(c, inode,
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 9bda6212..67b0dd37 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -281,7 +281,7 @@ __bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
 	if (tmpfile)
 		inode_u.bi_flags |= BCH_INODE_UNLINKED;
 
-	ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, BCH_QUOTA_PREALLOC);
+	ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, KEY_TYPE_QUOTA_PREALLOC);
 	if (ret)
 		return ERR_PTR(ret);
 
@@ -394,7 +394,7 @@ err_trans:
 	make_bad_inode(&inode->v);
 	iput(&inode->v);
 err:
-	bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, BCH_QUOTA_WARN);
+	bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, KEY_TYPE_QUOTA_WARN);
 	inode = ERR_PTR(ret);
 	goto out;
 }
@@ -999,7 +999,7 @@ static int bch2_fill_extent(struct fiemap_extent_info *info,
 		}
 
 		return 0;
-	} else if (k->k.type == BCH_RESERVATION) {
+	} else if (k->k.type == KEY_TYPE_reservation) {
 		return fiemap_fill_next_extent(info,
 					       bkey_start_offset(&k->k) << 9,
 					       0, k->k.size << 9,
@@ -1028,7 +1028,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
 			   POS(ei->v.i_ino, start >> 9), 0, k)
 		if (bkey_extent_is_data(k.k) ||
-		    k.k->type == BCH_RESERVATION) {
+		    k.k->type == KEY_TYPE_reservation) {
 			if (bkey_cmp(bkey_start_pos(k.k),
 				     POS(ei->v.i_ino, (start + len) >> 9)) >= 0)
 				break;
@@ -1329,9 +1329,9 @@ static void bch2_evict_inode(struct inode *vinode)
 
 	if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
 		bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
-				BCH_QUOTA_WARN);
+				KEY_TYPE_QUOTA_WARN);
 		bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
-				BCH_QUOTA_WARN);
+				KEY_TYPE_QUOTA_WARN);
 		bch2_inode_rm(c, inode->v.i_ino);
 
 		WARN_ONCE(atomic_long_dec_return(&c->nr_inodes) < 0,
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 53ee1b0e..5525af8f 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -234,7 +234,6 @@ static int hash_check_duplicates(const struct bch_hash_desc desc,
 				!desc.cmp_bkey(k, k2), c,
 				"duplicate hash table keys:\n%s",
 				(bch2_bkey_val_to_text(&PBUF(buf), c,
-						       bkey_type(0, desc.btree_id),
 						       k), buf))) {
 			ret = fsck_hash_delete_at(desc, &h->info, k_iter);
 			if (ret)
@@ -254,7 +253,7 @@ static bool key_has_correct_hash(const struct bch_hash_desc desc,
 {
 	u64 hash;
 
-	if (k.k->type != desc.whiteout_type &&
+	if (k.k->type != KEY_TYPE_whiteout &&
 	    k.k->type != desc.key_type)
 		return true;
 
@@ -279,7 +278,7 @@ static int hash_check_key(const struct bch_hash_desc desc,
 	u64 hashed;
 	int ret = 0;
 
-	if (k.k->type != desc.whiteout_type &&
+	if (k.k->type != KEY_TYPE_whiteout &&
 	    k.k->type != desc.key_type)
 		return 0;
 
@@ -299,7 +298,6 @@ static int hash_check_key(const struct bch_hash_desc desc,
 			desc.btree_id, k.k->p.offset,
 			hashed, h->chain->pos.offset,
 			(bch2_bkey_val_to_text(&PBUF(buf), c,
-					       bkey_type(0, desc.btree_id),
 					       k), buf))) {
 		ret = hash_redo_key(desc, h, c, k_iter, k, hashed);
 		if (ret) {
@@ -369,7 +367,7 @@ static int check_dirent_hash(struct hash_check *h, struct bch_fs *c,
 
 		*k = bch2_btree_iter_peek(iter);
 
-		BUG_ON(k->k->type != BCH_DIRENT);
+		BUG_ON(k->k->type != KEY_TYPE_dirent);
 	}
 err:
 fsck_err:
@@ -384,7 +382,6 @@ err_redo:
 		     buf, strlen(buf), BTREE_ID_DIRENTS,
 		     k->k->p.offset, hash, h->chain->pos.offset,
 		     (bch2_bkey_val_to_text(&PBUF(buf), c,
-					    bkey_type(0, BTREE_ID_DIRENTS),
 					    *k), buf))) {
 		ret = hash_redo_key(bch2_dirent_hash_desc,
 				    h, c, iter, *k, hash);
@@ -470,7 +467,7 @@ static int check_extents(struct bch_fs *c)
 
 		if (fsck_err_on(w.have_inode &&
 			!(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
-			k.k->type != BCH_RESERVATION &&
+			k.k->type != KEY_TYPE_reservation &&
 			k.k->p.offset > round_up(w.inode.bi_size, PAGE_SIZE) >> 9, c,
 			"extent type %u offset %llu past end of inode %llu, i_size %llu",
 			k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) {
@@ -528,13 +525,11 @@ static int check_dirents(struct bch_fs *c)
 		if (fsck_err_on(!w.have_inode, c,
 				"dirent in nonexisting directory:\n%s",
 				(bch2_bkey_val_to_text(&PBUF(buf), c,
-						       BTREE_ID_DIRENTS,
 						       k), buf)) ||
 		    fsck_err_on(!S_ISDIR(w.inode.bi_mode), c,
 				"dirent in non directory inode type %u:\n%s",
 				mode_to_type(w.inode.bi_mode),
 				(bch2_bkey_val_to_text(&PBUF(buf), c,
-						       BTREE_ID_DIRENTS,
 						       k), buf))) {
 			ret = bch2_btree_delete_at(iter, 0);
 			if (ret)
@@ -556,7 +551,7 @@ static int check_dirents(struct bch_fs *c)
 		if (ret)
 			goto fsck_err;
 
-		if (k.k->type != BCH_DIRENT)
+		if (k.k->type != KEY_TYPE_dirent)
 			continue;
 
 		d = bkey_s_c_to_dirent(k);
@@ -585,7 +580,6 @@ static int check_dirents(struct bch_fs *c)
 		if (fsck_err_on(d_inum == d.k->p.inode, c,
 				"dirent points to own directory:\n%s",
 				(bch2_bkey_val_to_text(&PBUF(buf), c,
-						       BTREE_ID_DIRENTS,
 						       k), buf))) {
 			ret = remove_dirent(c, iter, d);
 			if (ret)
@@ -603,7 +597,6 @@ static int check_dirents(struct bch_fs *c)
 		if (fsck_err_on(!have_target, c,
 				"dirent points to missing inode:\n%s",
 				(bch2_bkey_val_to_text(&PBUF(buf), c,
-						       BTREE_ID_DIRENTS,
 						       k), buf))) {
 			ret = remove_dirent(c, iter, d);
 			if (ret)
@@ -617,7 +610,6 @@ static int check_dirents(struct bch_fs *c)
 				"incorrect d_type: should be %u:\n%s",
 				mode_to_type(target.bi_mode),
 				(bch2_bkey_val_to_text(&PBUF(buf), c,
-						       BTREE_ID_DIRENTS,
 						       k), buf))) {
 			struct bkey_i_dirent *n;
 
@@ -898,7 +890,7 @@ next:
 
 			e->offset = k.k->p.offset;
 
-			if (k.k->type != BCH_DIRENT)
+			if (k.k->type != KEY_TYPE_dirent)
 				continue;
 
 			dirent = bkey_s_c_to_dirent(k);
@@ -941,7 +933,7 @@ up:
 	}
 
 	for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, 0, k) {
-		if (k.k->type != BCH_INODE_FS)
+		if (k.k->type != KEY_TYPE_inode)
 			continue;
 
 		if (!S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode)))
@@ -1029,7 +1021,7 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links,
 
 	for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, 0, k) {
 		switch (k.k->type) {
-		case BCH_DIRENT:
+		case KEY_TYPE_dirent:
 			d = bkey_s_c_to_dirent(k);
 			d_inum = le64_to_cpu(d.v->d_inum);
 
@@ -1309,7 +1301,7 @@ peek_nlinks:	link = genradix_iter_peek(&nlinks_iter, links);
 		if (iter.pos.inode < nlinks_pos || !link)
 			link = &zero_links;
 
-		if (k.k && k.k->type == BCH_INODE_FS) {
+		if (k.k && k.k->type == KEY_TYPE_inode) {
 			/*
 			 * Avoid potential deadlocks with iter for
 			 * truncate/rm/etc.:
@@ -1391,7 +1383,7 @@ static int check_inodes_fast(struct bch_fs *c)
 	int ret = 0;
 
 	for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, 0, k) {
-		if (k.k->type != BCH_INODE_FS)
+		if (k.k->type != KEY_TYPE_inode)
 			continue;
 
 		inode = bkey_s_c_to_inode(k);
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 207ca369..8c3d4431 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -177,76 +177,69 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode,
 
 const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
-	if (k.k->p.offset)
-		return "nonzero offset";
-
-	switch (k.k->type) {
-	case BCH_INODE_FS: {
 		struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
 		struct bch_inode_unpacked unpacked;
 
-		if (bkey_val_bytes(k.k) < sizeof(struct bch_inode))
-			return "incorrect value size";
+	if (k.k->p.offset)
+		return "nonzero offset";
 
-		if (k.k->p.inode < BLOCKDEV_INODE_MAX)
-			return "fs inode in blockdev range";
+	if (bkey_val_bytes(k.k) < sizeof(struct bch_inode))
+		return "incorrect value size";
 
-		if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
-			return "invalid str hash type";
+	if (k.k->p.inode < BLOCKDEV_INODE_MAX)
+		return "fs inode in blockdev range";
 
-		if (bch2_inode_unpack(inode, &unpacked))
-			return "invalid variable length fields";
+	if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
+		return "invalid str hash type";
 
-		if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
-			return "invalid data checksum type";
+	if (bch2_inode_unpack(inode, &unpacked))
+		return "invalid variable length fields";
 
-		if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
-			return "invalid data checksum type";
+	if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
+		return "invalid data checksum type";
 
-		if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
-		    unpacked.bi_nlink != 0)
-			return "flagged as unlinked but bi_nlink != 0";
+	if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
+		return "invalid data compression type";
 
-		return NULL;
-	}
-	case BCH_INODE_BLOCKDEV:
-		if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_blockdev))
-			return "incorrect value size";
+	if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
+	    unpacked.bi_nlink != 0)
+		return "flagged as unlinked but bi_nlink != 0";
 
-		if (k.k->p.inode >= BLOCKDEV_INODE_MAX)
-			return "blockdev inode in fs range";
-
-		return NULL;
-	case BCH_INODE_GENERATION:
-		if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation))
-			return "incorrect value size";
-
-		return NULL;
-	default:
-		return "invalid type";
-	}
+	return NULL;
 }
 
 void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c,
 		       struct bkey_s_c k)
 {
-	struct bkey_s_c_inode inode;
+	struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
 	struct bch_inode_unpacked unpacked;
 
-	switch (k.k->type) {
-	case BCH_INODE_FS:
-		inode = bkey_s_c_to_inode(k);
-		if (bch2_inode_unpack(inode, &unpacked)) {
-			pr_buf(out, "(unpack error)");
-			break;
-		}
+	if (bch2_inode_unpack(inode, &unpacked)) {
+		pr_buf(out, "(unpack error)");
+		return;
+	}
 
 #define BCH_INODE_FIELD(_name, _bits)						\
-		pr_buf(out, #_name ": %llu ", (u64) unpacked._name);
-		BCH_INODE_FIELDS()
+	pr_buf(out, #_name ": %llu ", (u64) unpacked._name);
+	BCH_INODE_FIELDS()
 #undef  BCH_INODE_FIELD
-		break;
-	}
+}
+
+const char *bch2_inode_generation_invalid(const struct bch_fs *c,
+					  struct bkey_s_c k)
+{
+	if (k.k->p.offset)
+		return "nonzero offset";
+
+	if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation))
+		return "incorrect value size";
+
+	return NULL;
+}
+
+void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
+				   struct bkey_s_c k)
+{
 }
 
 void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
@@ -282,10 +275,9 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
 static inline u32 bkey_generation(struct bkey_s_c k)
 {
 	switch (k.k->type) {
-	case BCH_INODE_BLOCKDEV:
-	case BCH_INODE_FS:
+	case KEY_TYPE_inode:
 		BUG();
-	case BCH_INODE_GENERATION:
+	case KEY_TYPE_inode_generation:
 		return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
 	default:
 		return 0;
@@ -331,8 +323,7 @@ again:
 			return ret;
 
 		switch (k.k->type) {
-		case BCH_INODE_BLOCKDEV:
-		case BCH_INODE_FS:
+		case KEY_TYPE_inode:
 			/* slot used */
 			if (iter->pos.inode >= max)
 				goto out;
@@ -406,19 +397,19 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
 			return ret;
 		}
 
-		bch2_fs_inconsistent_on(k.k->type != BCH_INODE_FS, c,
+		bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c,
 					"inode %llu not found when deleting",
 					inode_nr);
 
 		switch (k.k->type) {
-		case BCH_INODE_FS: {
+		case KEY_TYPE_inode: {
 			struct bch_inode_unpacked inode_u;
 
 			if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u))
 				bi_generation = inode_u.bi_generation + 1;
 			break;
 		}
-		case BCH_INODE_GENERATION: {
+		case KEY_TYPE_inode_generation: {
 			struct bkey_s_c_inode_generation g =
 				bkey_s_c_to_inode_generation(k);
 			bi_generation = le32_to_cpu(g.v->bi_generation);
@@ -456,7 +447,7 @@ int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
 			   POS(inode_nr, 0),
 			   BTREE_ITER_SLOTS, k) {
 		switch (k.k->type) {
-		case BCH_INODE_FS:
+		case KEY_TYPE_inode:
 			ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
 			break;
 		default:
@@ -465,7 +456,6 @@ int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
 		}
 
 		break;
-
 	}
 
 	return bch2_btree_iter_unlock(&iter) ?: ret;
diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
index e4495a44..44855e1a 100644
--- a/libbcachefs/inode.h
+++ b/libbcachefs/inode.h
@@ -8,11 +8,21 @@
 const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c);
 void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
-#define bch2_bkey_inode_ops (struct bkey_ops) {		\
+#define bch2_bkey_ops_inode (struct bkey_ops) {		\
 	.key_invalid	= bch2_inode_invalid,		\
 	.val_to_text	= bch2_inode_to_text,		\
 }
 
+const char *bch2_inode_generation_invalid(const struct bch_fs *,
+					  struct bkey_s_c);
+void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *,
+				   struct bkey_s_c);
+
+#define bch2_bkey_ops_inode_generation (struct bkey_ops) {	\
+	.key_invalid	= bch2_inode_generation_invalid,	\
+	.val_to_text	= bch2_inode_generation_to_text,	\
+}
+
 struct bch_inode_unpacked {
 	u64			bi_inum;
 	__le64			bi_hash_seed;
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 12d77ec6..98eca9a0 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -202,20 +202,20 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
 			       enum bch_data_type type,
 			       const struct bkey_i *k)
 {
-	struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
 	const struct bch_extent_ptr *ptr;
 	struct bch_write_bio *n;
 	struct bch_dev *ca;
 
 	BUG_ON(c->opts.nochanges);
 
-	extent_for_each_ptr(e, ptr) {
+	bkey_for_each_ptr(ptrs, ptr) {
 		BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
 		       !c->devs[ptr->dev]);
 
 		ca = bch_dev_bkey_exists(c, ptr->dev);
 
-		if (ptr + 1 < &extent_entry_last(e)->ptr) {
+		if (to_entry(ptr + 1) < ptrs.end) {
 			n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
 						   &ca->replica_set));
 
@@ -300,7 +300,6 @@ static void __bch2_write_index(struct bch_write_op *op)
 {
 	struct bch_fs *c = op->c;
 	struct keylist *keys = &op->insert_keys;
-	struct bkey_s_extent e;
 	struct bch_extent_ptr *ptr;
 	struct bkey_i *src, *dst = keys->keys, *n, *k;
 	unsigned dev;
@@ -310,12 +309,10 @@ static void __bch2_write_index(struct bch_write_op *op)
 		n = bkey_next(src);
 		bkey_copy(dst, src);
 
-		e = bkey_i_to_s_extent(dst);
-
-		bch2_extent_drop_ptrs(e, ptr,
+		bch2_bkey_drop_ptrs(bkey_i_to_s(dst), ptr,
 			test_bit(ptr->dev, op->failed.d));
 
-		if (!bch2_extent_nr_ptrs(e.c)) {
+		if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(dst))) {
 			ret = -EIO;
 			goto err;
 		}
@@ -416,10 +413,10 @@ static void init_append_extent(struct bch_write_op *op,
 	e->k.p = op->pos;
 	e->k.size = crc.uncompressed_size;
 	e->k.version = version;
-	bkey_extent_set_cached(&e->k, op->flags & BCH_WRITE_CACHED);
 
 	bch2_extent_crc_append(e, crc);
-	bch2_alloc_sectors_append_ptrs(op->c, wp, e, crc.compressed_size);
+	bch2_alloc_sectors_append_ptrs(op->c, wp, &e->k_i,
+				       crc.compressed_size);
 
 	bch2_keylist_push(&op->insert_keys);
 }
@@ -1589,7 +1586,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
 	struct bpos pos = bkey_start_pos(k.k);
 	int pick_ret;
 
-	pick_ret = bch2_extent_pick_ptr(c, k, failed, &pick);
+	pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
 
 	/* hole or reservation - just zero fill: */
 	if (!pick_ret)
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index ac1219fc..47cfd50d 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -462,7 +462,7 @@ u64 bch2_journal_last_unwritten_seq(struct journal *j)
 int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	bool need_reclaim = false;
+	int ret;
 retry:
 	spin_lock(&j->lock);
 
@@ -490,14 +490,11 @@ retry:
 
 	BUG_ON(journal_cur_seq(j) < seq);
 
-	if (!journal_entry_open(j)) {
-		need_reclaim = true;
-		goto blocked;
+	ret = journal_entry_open(j);
+	if (ret) {
+		spin_unlock(&j->lock);
+		return ret < 0 ? ret : 0;
 	}
-
-	spin_unlock(&j->lock);
-
-	return 0;
 blocked:
 	if (!j->res_get_blocked_start)
 		j->res_get_blocked_start = local_clock() ?: 1;
@@ -505,8 +502,7 @@ blocked:
 	closure_wait(&j->async_wait, cl);
 	spin_unlock(&j->lock);
 
-	if (need_reclaim)
-		bch2_journal_reclaim_work(&j->reclaim_work.work);
+	bch2_journal_reclaim_work(&j->reclaim_work.work);
 	return -EAGAIN;
 }
 
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 3840764a..05500bf5 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -141,11 +141,12 @@ static void journal_entry_null_range(void *start, void *end)
 
 static int journal_validate_key(struct bch_fs *c, struct jset *jset,
 				struct jset_entry *entry,
-				struct bkey_i *k, enum bkey_type key_type,
+				struct bkey_i *k, enum btree_node_type key_type,
 				const char *type, int write)
 {
 	void *next = vstruct_next(entry);
 	const char *invalid;
+	unsigned version = le32_to_cpu(jset->version);
 	int ret = 0;
 
 	if (journal_entry_err_on(!k->k.u64s, c,
@@ -174,14 +175,17 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset,
 	}
 
 	if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN)
-		bch2_bkey_swab(key_type, NULL, bkey_to_packed(k));
+		bch2_bkey_swab(NULL, bkey_to_packed(k));
 
-	invalid = bch2_bkey_invalid(c, key_type, bkey_i_to_s_c(k));
+	if (!write &&
+	    version < bcachefs_metadata_version_bkey_renumber)
+		bch2_bkey_renumber(key_type, bkey_to_packed(k), write);
+
+	invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), key_type);
 	if (invalid) {
 		char buf[160];
 
-		bch2_bkey_val_to_text(&PBUF(buf), c, key_type,
-				      bkey_i_to_s_c(k));
+		bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k));
 		mustfix_fsck_err(c, "invalid %s in journal: %s\n%s",
 				 type, invalid, buf);
 
@@ -190,6 +194,10 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset,
 		journal_entry_null_range(vstruct_next(entry), next);
 		return 0;
 	}
+
+	if (write &&
+	    version < bcachefs_metadata_version_bkey_renumber)
+		bch2_bkey_renumber(key_type, bkey_to_packed(k), write);
 fsck_err:
 	return ret;
 }
@@ -203,8 +211,8 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c,
 
 	vstruct_for_each(entry, k) {
 		int ret = journal_validate_key(c, jset, entry, k,
-				bkey_type(entry->level,
-					  entry->btree_id),
+				__btree_node_type(entry->level,
+						  entry->btree_id),
 				"key", write);
 		if (ret)
 			return ret;
@@ -351,14 +359,17 @@ static int jset_validate(struct bch_fs *c,
 {
 	size_t bytes = vstruct_bytes(jset);
 	struct bch_csum csum;
+	unsigned version;
 	int ret = 0;
 
 	if (le64_to_cpu(jset->magic) != jset_magic(c))
 		return JOURNAL_ENTRY_NONE;
 
-	if (le32_to_cpu(jset->version) != BCACHE_JSET_VERSION) {
-		bch_err(c, "unknown journal entry version %u",
-			le32_to_cpu(jset->version));
+	version = le32_to_cpu(jset->version);
+	if ((version != BCH_JSET_VERSION_OLD &&
+	     version < bcachefs_metadata_version_min) ||
+	    version >= bcachefs_metadata_version_max) {
+		bch_err(c, "unknown journal entry version %u", version);
 		return BCH_FSCK_UNKNOWN_VERSION;
 	}
 
@@ -929,7 +940,6 @@ static void __journal_write_alloc(struct journal *j,
 				  unsigned replicas_want)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	struct bkey_i_extent *e = bkey_i_to_extent(&w->key);
 	struct journal_device *ja;
 	struct bch_dev *ca;
 	unsigned i;
@@ -951,13 +961,14 @@ static void __journal_write_alloc(struct journal *j,
 		if (!ca->mi.durability ||
 		    ca->mi.state != BCH_MEMBER_STATE_RW ||
 		    !ja->nr ||
-		    bch2_extent_has_device(extent_i_to_s_c(e), ca->dev_idx) ||
+		    bch2_bkey_has_device(bkey_i_to_s_c(&w->key),
+					 ca->dev_idx) ||
 		    sectors > ja->sectors_free)
 			continue;
 
 		bch2_dev_stripe_increment(c, ca, &j->wp.stripe);
 
-		extent_ptr_append(e,
+		bch2_bkey_append_ptr(&w->key,
 			(struct bch_extent_ptr) {
 				  .offset = bucket_to_sector(ca,
 					ja->buckets[ja->cur_idx]) +
@@ -1096,7 +1107,7 @@ static void journal_write_done(struct closure *cl)
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct journal_buf *w = journal_prev_buf(j);
 	struct bch_devs_list devs =
-		bch2_extent_devs(bkey_i_to_s_c_extent(&w->key));
+		bch2_bkey_devs(bkey_i_to_s_c(&w->key));
 	u64 seq = le64_to_cpu(w->data->seq);
 	u64 last_seq = le64_to_cpu(w->data->last_seq);
 
@@ -1158,7 +1169,7 @@ static void journal_write_endio(struct bio *bio)
 		unsigned long flags;
 
 		spin_lock_irqsave(&j->err_lock, flags);
-		bch2_extent_drop_device(bkey_i_to_s_extent(&w->key), ca->dev_idx);
+		bch2_bkey_drop_device(bkey_i_to_s(&w->key), ca->dev_idx);
 		spin_unlock_irqrestore(&j->err_lock, flags);
 	}
 
@@ -1175,6 +1186,7 @@ void bch2_journal_write(struct closure *cl)
 	struct jset *jset;
 	struct bio *bio;
 	struct bch_extent_ptr *ptr;
+	bool validate_before_checksum = false;
 	unsigned i, sectors, bytes;
 
 	journal_buf_realloc(j, w);
@@ -1196,12 +1208,22 @@ void bch2_journal_write(struct closure *cl)
 	jset->read_clock	= cpu_to_le16(c->bucket_clock[READ].hand);
 	jset->write_clock	= cpu_to_le16(c->bucket_clock[WRITE].hand);
 	jset->magic		= cpu_to_le64(jset_magic(c));
-	jset->version		= cpu_to_le32(BCACHE_JSET_VERSION);
+
+	jset->version		= c->sb.version < bcachefs_metadata_version_new_versioning
+		? cpu_to_le32(BCH_JSET_VERSION_OLD)
+		: cpu_to_le32(c->sb.version);
 
 	SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
 	SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
 
-	if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
+	if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
+		validate_before_checksum = true;
+
+	if (le32_to_cpu(jset->version) <
+	    bcachefs_metadata_version_bkey_renumber)
+		validate_before_checksum = true;
+
+	if (validate_before_checksum &&
 	    jset_validate_entries(c, jset, WRITE))
 		goto err;
 
@@ -1212,7 +1234,7 @@ void bch2_journal_write(struct closure *cl)
 	jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
 				  journal_nonce(jset), jset);
 
-	if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
+	if (!validate_before_checksum &&
 	    jset_validate_entries(c, jset, WRITE))
 		goto err;
 
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index df4fbae2..3f26f450 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -14,7 +14,7 @@
 #include "replicas.h"
 #include "super-io.h"
 
-static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
+static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
 			 unsigned dev_idx, int flags, bool metadata)
 {
 	unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
@@ -22,9 +22,9 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
 	unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
 	unsigned nr_good;
 
-	bch2_extent_drop_device(e, dev_idx);
+	bch2_bkey_drop_device(k, dev_idx);
 
-	nr_good = bch2_extent_durability(c, e.c);
+	nr_good = bch2_bkey_durability(c, k.s_c);
 	if ((!nr_good && !(flags & lost)) ||
 	    (nr_good < replicas && !(flags & degraded)))
 		return -EINVAL;
@@ -35,7 +35,6 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
 static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 {
 	struct bkey_s_c k;
-	struct bkey_s_extent e;
 	BKEY_PADDED(key) tmp;
 	struct btree_iter iter;
 	int ret = 0;
@@ -50,7 +49,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 	       !(ret = btree_iter_err(k))) {
 		if (!bkey_extent_is_data(k.k) ||
 		    !bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) {
-			ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_EXTENTS, k);
+			ret = bch2_mark_bkey_replicas(c, k);
 			if (ret)
 				break;
 			bch2_btree_iter_next(&iter);
@@ -58,18 +57,18 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 		}
 
 		bkey_reassemble(&tmp.key, k);
-		e = bkey_i_to_s_extent(&tmp.key);
 
-		ret = drop_dev_ptrs(c, e, dev_idx, flags, false);
+		ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.key),
+				    dev_idx, flags, false);
 		if (ret)
 			break;
 
 		/*
 		 * If the new extent no longer has any pointers, bch2_extent_normalize()
 		 * will do the appropriate thing with it (turning it into a
-		 * KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
+		 * KEY_TYPE_error key, or just a discard if it was a cached extent)
 		 */
-		bch2_extent_normalize(c, e.s);
+		bch2_extent_normalize(c, bkey_i_to_s(&tmp.key));
 
 		iter.pos = bkey_start_pos(&tmp.key.k);
 
@@ -117,7 +116,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 	for (id = 0; id < BTREE_ID_NR; id++) {
 		for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
 			__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
-			struct bkey_i_extent *new_key;
+			struct bkey_i_btree_ptr *new_key;
 retry:
 			if (!bch2_extent_has_device(bkey_i_to_s_c_extent(&b->key),
 						    dev_idx)) {
@@ -129,15 +128,14 @@ retry:
 				 */
 				bch2_btree_iter_downgrade(&iter);
 
-				ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_BTREE,
-							      bkey_i_to_s_c(&b->key));
+				ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key));
 				if (ret)
 					goto err;
 			} else {
 				bkey_copy(&tmp.k, &b->key);
-				new_key = bkey_i_to_extent(&tmp.k);
+				new_key = bkey_i_to_btree_ptr(&tmp.k);
 
-				ret = drop_dev_ptrs(c, extent_i_to_s(new_key),
+				ret = drop_dev_ptrs(c, bkey_i_to_s(&new_key->k_i),
 						    dev_idx, flags, true);
 				if (ret)
 					goto err;
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 885792bd..8c95aa92 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -100,8 +100,8 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
 		bch2_cut_back(insert->k.p, &new->k);
 
 		if (m->data_cmd == DATA_REWRITE)
-			bch2_extent_drop_device(extent_i_to_s(insert),
-						m->data_opts.rewrite_dev);
+			bch2_bkey_drop_device(extent_i_to_s(insert).s,
+					      m->data_opts.rewrite_dev);
 
 		extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
 			if (bch2_extent_has_device(extent_i_to_s_c(insert), p.ptr.dev)) {
@@ -132,8 +132,8 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
 		 * has fewer replicas than when we last looked at it - meaning
 		 * we need to get a disk reservation here:
 		 */
-		nr = bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)) -
-			(bch2_extent_nr_dirty_ptrs(k) + m->nr_ptrs_reserved);
+		nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)) -
+			(bch2_bkey_nr_dirty_ptrs(k) + m->nr_ptrs_reserved);
 		if (nr > 0) {
 			/*
 			 * can't call bch2_disk_reservation_add() with btree
@@ -243,7 +243,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
 	switch (data_cmd) {
 	case DATA_ADD_REPLICAS: {
 		int nr = (int) io_opts.data_replicas -
-			bch2_extent_nr_dirty_ptrs(k);
+			bch2_bkey_nr_dirty_ptrs(k);
 
 		if (nr > 0) {
 			m->op.nr_replicas = m->nr_ptrs_reserved = nr;
@@ -478,7 +478,6 @@ int bch2_move_data(struct bch_fs *c,
 	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
 	BKEY_PADDED(k) tmp;
 	struct bkey_s_c k;
-	struct bkey_s_c_extent e;
 	struct data_opts data_opts;
 	enum data_cmd data_cmd;
 	u64 delay, cur_inum = U64_MAX;
@@ -531,8 +530,6 @@ peek:
 		if (!bkey_extent_is_data(k.k))
 			goto next_nondata;
 
-		e = bkey_s_c_to_extent(k);
-
 		if (cur_inum != k.k->p.inode) {
 			struct bch_inode_unpacked inode;
 
@@ -546,8 +543,7 @@ peek:
 			goto peek;
 		}
 
-		switch ((data_cmd = pred(c, arg, BKEY_TYPE_EXTENTS, e,
-					 &io_opts, &data_opts))) {
+		switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) {
 		case DATA_SKIP:
 			goto next;
 		case DATA_SCRUB:
@@ -582,7 +578,7 @@ peek:
 		if (rate)
 			bch2_ratelimit_increment(rate, k.k->size);
 next:
-		atomic64_add(k.k->size * bch2_extent_nr_dirty_ptrs(k),
+		atomic64_add(k.k->size * bch2_bkey_nr_dirty_ptrs(k),
 			     &stats->sectors_seen);
 next_nondata:
 		bch2_btree_iter_next(&stats->iter);
@@ -614,7 +610,7 @@ static int bch2_gc_data_replicas(struct bch_fs *c)
 
 	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
 			   BTREE_ITER_PREFETCH, k) {
-		ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_EXTENTS, k);
+		ret = bch2_mark_bkey_replicas(c, k);
 		if (ret)
 			break;
 	}
@@ -638,8 +634,7 @@ static int bch2_gc_btree_replicas(struct bch_fs *c)
 
 	for (id = 0; id < BTREE_ID_NR; id++) {
 		for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
-			ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_BTREE,
-						      bkey_i_to_s_c(&b->key));
+			ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key));
 
 			bch2_btree_iter_cond_resched(&iter);
 		}
@@ -669,10 +664,9 @@ static int bch2_move_btree(struct bch_fs *c,
 
 	for (id = 0; id < BTREE_ID_NR; id++) {
 		for_each_btree_node(&stats->iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
-			switch ((cmd = pred(c, arg, BKEY_TYPE_BTREE,
-					    bkey_i_to_s_c_extent(&b->key),
-					    &io_opts,
-					    &data_opts))) {
+			switch ((cmd = pred(c, arg,
+					    bkey_i_to_s_c(&b->key),
+					    &io_opts, &data_opts))) {
 			case DATA_SKIP:
 				goto next;
 			case DATA_SCRUB:
@@ -698,8 +692,7 @@ next:
 
 #if 0
 static enum data_cmd scrub_pred(struct bch_fs *c, void *arg,
-				enum bkey_type type,
-				struct bkey_s_c_extent e,
+				struct bkey_s_c k,
 				struct bch_io_opts *io_opts,
 				struct data_opts *data_opts)
 {
@@ -708,33 +701,38 @@ static enum data_cmd scrub_pred(struct bch_fs *c, void *arg,
 #endif
 
 static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
-				      enum bkey_type type,
-				      struct bkey_s_c_extent e,
+				      struct bkey_s_c k,
 				      struct bch_io_opts *io_opts,
 				      struct data_opts *data_opts)
 {
-	unsigned nr_good = bch2_extent_durability(c, e);
-	unsigned replicas = type == BKEY_TYPE_BTREE
-		? c->opts.metadata_replicas
-		: io_opts->data_replicas;
+	unsigned nr_good = bch2_bkey_durability(c, k);
+	unsigned replicas = 0;
+
+	switch (k.k->type) {
+	case KEY_TYPE_btree_ptr:
+		replicas = c->opts.metadata_replicas;
+		break;
+	case KEY_TYPE_extent:
+		replicas = io_opts->data_replicas;
+		break;
+	}
 
 	if (!nr_good || nr_good >= replicas)
 		return DATA_SKIP;
 
 	data_opts->target		= 0;
-	data_opts->btree_insert_flags = 0;
+	data_opts->btree_insert_flags	= 0;
 	return DATA_ADD_REPLICAS;
 }
 
 static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
-				  enum bkey_type type,
-				  struct bkey_s_c_extent e,
+				  struct bkey_s_c k,
 				  struct bch_io_opts *io_opts,
 				  struct data_opts *data_opts)
 {
 	struct bch_ioctl_data *op = arg;
 
-	if (!bch2_extent_has_device(e, op->migrate.dev))
+	if (!bch2_bkey_has_device(k, op->migrate.dev))
 		return DATA_SKIP;
 
 	data_opts->target		= 0;
diff --git a/libbcachefs/move.h b/libbcachefs/move.h
index bc87e067..b3bee07e 100644
--- a/libbcachefs/move.h
+++ b/libbcachefs/move.h
@@ -46,7 +46,7 @@ int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
 			    struct bkey_s_c);
 
 typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
-				enum bkey_type, struct bkey_s_c_extent,
+				struct bkey_s_c,
 				struct bch_io_opts *, struct data_opts *);
 
 int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 70318f2c..d6890824 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -65,36 +65,42 @@ static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
 }
 
 static bool __copygc_pred(struct bch_dev *ca,
-			  struct bkey_s_c_extent e)
+			  struct bkey_s_c k)
 {
 	copygc_heap *h = &ca->copygc_heap;
-	const struct bch_extent_ptr *ptr =
-		bch2_extent_has_device(e, ca->dev_idx);
 
-	if (ptr) {
-		struct copygc_heap_entry search = { .offset = ptr->offset };
+	switch (k.k->type) {
+	case KEY_TYPE_extent: {
+		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+		const struct bch_extent_ptr *ptr =
+			bch2_extent_has_device(e, ca->dev_idx);
 
-		ssize_t i = eytzinger0_find_le(h->data, h->used,
-					       sizeof(h->data[0]),
-					       bucket_offset_cmp, &search);
+		if (ptr) {
+			struct copygc_heap_entry search = { .offset = ptr->offset };
 
-		return (i >= 0 &&
-			ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
-			ptr->gen == h->data[i].gen);
+			ssize_t i = eytzinger0_find_le(h->data, h->used,
+						       sizeof(h->data[0]),
+						       bucket_offset_cmp, &search);
+
+			return (i >= 0 &&
+				ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
+				ptr->gen == h->data[i].gen);
+		}
+		break;
+	}
 	}
 
 	return false;
 }
 
 static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
-				 enum bkey_type type,
-				 struct bkey_s_c_extent e,
+				 struct bkey_s_c k,
 				 struct bch_io_opts *io_opts,
 				 struct data_opts *data_opts)
 {
 	struct bch_dev *ca = arg;
 
-	if (!__copygc_pred(ca, e))
+	if (!__copygc_pred(ca, k))
 		return DATA_SKIP;
 
 	data_opts->target		= dev_to_target(ca->dev_idx);
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 8ffae3d9..5c744018 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -180,6 +180,9 @@ enum opt_type {
 		OPT_BOOL(),						\
 		NO_SB_OPT,			false)			\
 	BCH_OPT(nostart,		u8,	OPT_INTERNAL,		\
+		OPT_BOOL(),						\
+		NO_SB_OPT,			false)			\
+	BCH_OPT(version_upgrade,	u8,	OPT_MOUNT,		\
 		OPT_BOOL(),						\
 		NO_SB_OPT,			false)
 
diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c
index 75104ea9..95ff0cae 100644
--- a/libbcachefs/quota.c
+++ b/libbcachefs/quota.c
@@ -21,23 +21,13 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = {
 
 const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
-	struct bkey_s_c_quota dq;
-
 	if (k.k->p.inode >= QTYP_NR)
 		return "invalid quota type";
 
-	switch (k.k->type) {
-	case BCH_QUOTA: {
-		dq = bkey_s_c_to_quota(k);
+	if (bkey_val_bytes(k.k) != sizeof(struct bch_quota))
+		return "incorrect value size";
 
-		if (bkey_val_bytes(k.k) != sizeof(struct bch_quota))
-			return "incorrect value size";
-
-		return NULL;
-	}
-	default:
-		return "invalid type";
-	}
+	return NULL;
 }
 
 static const char * const bch2_quota_counters[] = {
@@ -48,20 +38,14 @@ static const char * const bch2_quota_counters[] = {
 void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c,
 			struct bkey_s_c k)
 {
-	struct bkey_s_c_quota dq;
+	struct bkey_s_c_quota dq = bkey_s_c_to_quota(k);
 	unsigned i;
 
-	switch (k.k->type) {
-	case BCH_QUOTA:
-		dq = bkey_s_c_to_quota(k);
-
-		for (i = 0; i < Q_COUNTERS; i++)
-			pr_buf(out, "%s hardlimit %llu softlimit %llu",
-			       bch2_quota_counters[i],
-			       le64_to_cpu(dq.v->c[i].hardlimit),
-			       le64_to_cpu(dq.v->c[i].softlimit));
-		break;
-	}
+	for (i = 0; i < Q_COUNTERS; i++)
+		pr_buf(out, "%s hardlimit %llu softlimit %llu",
+		       bch2_quota_counters[i],
+		       le64_to_cpu(dq.v->c[i].hardlimit),
+		       le64_to_cpu(dq.v->c[i].softlimit));
 }
 
 #ifdef CONFIG_BCACHEFS_QUOTA
@@ -177,7 +161,7 @@ static int bch2_quota_check_limit(struct bch_fs *c,
 
 	BUG_ON((s64) n < 0);
 
-	if (mode == BCH_QUOTA_NOCHECK)
+	if (mode == KEY_TYPE_QUOTA_NOCHECK)
 		return 0;
 
 	if (v <= 0) {
@@ -200,7 +184,7 @@ static int bch2_quota_check_limit(struct bch_fs *c,
 	if (qc->hardlimit &&
 	    qc->hardlimit < n &&
 	    !ignore_hardlimit(q)) {
-		if (mode == BCH_QUOTA_PREALLOC)
+		if (mode == KEY_TYPE_QUOTA_PREALLOC)
 			return -EDQUOT;
 
 		prepare_warning(qc, qtype, counter, msgs, HARDWARN);
@@ -211,7 +195,7 @@ static int bch2_quota_check_limit(struct bch_fs *c,
 	    qc->timer &&
 	    ktime_get_real_seconds() >= qc->timer &&
 	    !ignore_hardlimit(q)) {
-		if (mode == BCH_QUOTA_PREALLOC)
+		if (mode == KEY_TYPE_QUOTA_PREALLOC)
 			return -EDQUOT;
 
 		prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN);
@@ -220,7 +204,7 @@ static int bch2_quota_check_limit(struct bch_fs *c,
 	if (qc->softlimit &&
 	    qc->softlimit < n &&
 	    qc->timer == 0) {
-		if (mode == BCH_QUOTA_PREALLOC)
+		if (mode == KEY_TYPE_QUOTA_PREALLOC)
 			return -EDQUOT;
 
 		prepare_warning(qc, qtype, counter, msgs, SOFTWARN);
@@ -311,13 +295,13 @@ int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
 
 		ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC,
 					     dst_q[i]->c[Q_SPC].v + space,
-					     BCH_QUOTA_PREALLOC);
+					     KEY_TYPE_QUOTA_PREALLOC);
 		if (ret)
 			goto err;
 
 		ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO,
 					     dst_q[i]->c[Q_INO].v + 1,
-					     BCH_QUOTA_PREALLOC);
+					     KEY_TYPE_QUOTA_PREALLOC);
 		if (ret)
 			goto err;
 	}
@@ -346,7 +330,7 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k)
 	BUG_ON(k.k->p.inode >= QTYP_NR);
 
 	switch (k.k->type) {
-	case BCH_QUOTA:
+	case KEY_TYPE_quota:
 		dq = bkey_s_c_to_quota(k);
 		q = &c->quotas[k.k->p.inode];
 
@@ -446,15 +430,15 @@ int bch2_fs_quota_read(struct bch_fs *c)
 	for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN,
 			   BTREE_ITER_PREFETCH, k) {
 		switch (k.k->type) {
-		case BCH_INODE_FS:
+		case KEY_TYPE_inode:
 			ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u);
 			if (ret)
 				return ret;
 
 			bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
-					BCH_QUOTA_NOCHECK);
+					KEY_TYPE_QUOTA_NOCHECK);
 			bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
-					BCH_QUOTA_NOCHECK);
+					KEY_TYPE_QUOTA_NOCHECK);
 		}
 	}
 	return bch2_btree_iter_unlock(&iter) ?: ret;
@@ -699,22 +683,19 @@ static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid,
 	struct bch_fs *c		= sb->s_fs_info;
 	struct bch_memquota_type *q	= &c->quotas[kqid->type];
 	qid_t qid			= from_kqid(&init_user_ns, *kqid);
-	struct genradix_iter iter	= genradix_iter_init(&q->table, qid);
+	struct genradix_iter iter;
 	struct bch_memquota *mq;
 	int ret = 0;
 
 	mutex_lock(&q->lock);
 
-	while ((mq = genradix_iter_peek(&iter, &q->table))) {
+	genradix_for_each_from(&q->table, iter, mq, qid)
 		if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) {
 			__bch2_quota_get(qdq, mq);
 			*kqid = make_kqid(current_user_ns(), kqid->type, iter.pos);
 			goto found;
 		}
 
-		genradix_iter_advance(&iter, &q->table);
-	}
-
 	ret = -ENOENT;
 found:
 	mutex_unlock(&q->lock);
@@ -745,7 +726,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid,
 		return ret;
 
 	switch (k.k->type) {
-	case BCH_QUOTA:
+	case KEY_TYPE_quota:
 		new_quota.v = *bkey_s_c_to_quota(k).v;
 		break;
 	}
diff --git a/libbcachefs/quota.h b/libbcachefs/quota.h
index 325b9fc0..0c3eb697 100644
--- a/libbcachefs/quota.h
+++ b/libbcachefs/quota.h
@@ -9,15 +9,15 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
 const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c);
 void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
-#define bch2_bkey_quota_ops (struct bkey_ops) {		\
+#define bch2_bkey_ops_quota (struct bkey_ops) {		\
 	.key_invalid	= bch2_quota_invalid,		\
 	.val_to_text	= bch2_quota_to_text,		\
 }
 
 enum quota_acct_mode {
-	BCH_QUOTA_PREALLOC,
-	BCH_QUOTA_WARN,
-	BCH_QUOTA_NOCHECK,
+	KEY_TYPE_QUOTA_PREALLOC,
+	KEY_TYPE_QUOTA_WARN,
+	KEY_TYPE_QUOTA_NOCHECK,
 };
 
 static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u)
diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c
index 25d72de6..dc6ca94d 100644
--- a/libbcachefs/rebalance.c
+++ b/libbcachefs/rebalance.c
@@ -69,28 +69,34 @@ void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
 }
 
 static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
-				    enum bkey_type type,
-				    struct bkey_s_c_extent e,
+				    struct bkey_s_c k,
 				    struct bch_io_opts *io_opts,
 				    struct data_opts *data_opts)
 {
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded p;
+	switch (k.k->type) {
+	case KEY_TYPE_extent: {
+		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+		const union bch_extent_entry *entry;
+		struct extent_ptr_decoded p;
+
+		/* Make sure we have room to add a new pointer: */
+		if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
+		    BKEY_EXTENT_VAL_U64s_MAX)
+			return DATA_SKIP;
+
+		extent_for_each_ptr_decode(e, p, entry)
+			if (rebalance_ptr_pred(c, p, io_opts))
+				goto found;
 
-	/* Make sure we have room to add a new pointer: */
-	if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
-	    BKEY_EXTENT_VAL_U64s_MAX)
 		return DATA_SKIP;
-
-	extent_for_each_ptr_decode(e, p, entry)
-		if (rebalance_ptr_pred(c, p, io_opts))
-			goto found;
-
-	return DATA_SKIP;
 found:
-	data_opts->target		= io_opts->background_target;
-	data_opts->btree_insert_flags	= 0;
-	return DATA_ADD_REPLICAS;
+		data_opts->target		= io_opts->background_target;
+		data_opts->btree_insert_flags	= 0;
+		return DATA_ADD_REPLICAS;
+	}
+	default:
+		return DATA_SKIP;
+	}
 }
 
 struct rebalance_work {
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 0e3c321a..e9e4a1ad 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -146,6 +146,10 @@ int bch2_fs_recovery(struct bch_fs *c)
 			mutex_unlock(&c->sb_lock);
 			goto err;
 		}
+
+		if (le16_to_cpu(c->disk_sb.sb->version) <
+		    bcachefs_metadata_version_bkey_renumber)
+			bch2_sb_clean_renumber(clean, READ);
 	}
 	mutex_unlock(&c->sb_lock);
 
@@ -264,12 +268,18 @@ int bch2_fs_recovery(struct bch_fs *c)
 	if (ret)
 		goto err;
 
-	if (!test_bit(BCH_FS_FSCK_UNFIXED_ERRORS, &c->flags)) {
-		mutex_lock(&c->sb_lock);
-		c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK;
-		mutex_unlock(&c->sb_lock);
+	mutex_lock(&c->sb_lock);
+	if (c->opts.version_upgrade) {
+		if (c->sb.version < bcachefs_metadata_version_new_versioning)
+			c->disk_sb.sb->version_min =
+				cpu_to_le16(bcachefs_metadata_version_min);
+		c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
 	}
 
+	if (!test_bit(BCH_FS_FSCK_UNFIXED_ERRORS, &c->flags))
+		c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK;
+	mutex_unlock(&c->sb_lock);
+
 	if (enabled_qtypes(c)) {
 		bch_verbose(c, "reading quotas:");
 		ret = bch2_fs_quota_read(c);
@@ -304,6 +314,9 @@ int bch2_fs_initialize(struct bch_fs *c)
 
 	set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
 
+	for (i = 0; i < BTREE_ID_NR; i++)
+		bch2_btree_root_alloc(c, i);
+
 	ret = bch2_initial_gc(c, &journal);
 	if (ret)
 		goto err;
@@ -315,9 +328,6 @@ int bch2_fs_initialize(struct bch_fs *c)
 			goto err;
 		}
 
-	for (i = 0; i < BTREE_ID_NR; i++)
-		bch2_btree_root_alloc(c, i);
-
 	/*
 	 * journal_res_get() will crash if called before this has
 	 * set up the journal.pin FIFO and journal.cur pointer:
@@ -378,9 +388,12 @@ int bch2_fs_initialize(struct bch_fs *c)
 		goto err;
 
 	mutex_lock(&c->sb_lock);
+	c->disk_sb.sb->version = c->disk_sb.sb->version_min =
+		cpu_to_le16(bcachefs_metadata_version_current);
+	c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK;
+
 	SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
 	SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
-	c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK;
 
 	bch2_write_super(c);
 	mutex_unlock(&c->sb_lock);
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
index 0ba5ce5c..6ab4e36e 100644
--- a/libbcachefs/replicas.c
+++ b/libbcachefs/replicas.c
@@ -72,64 +72,57 @@ void bch2_cpu_replicas_to_text(struct printbuf *out,
 static void extent_to_replicas(struct bkey_s_c k,
 			       struct bch_replicas_entry *r)
 {
-	if (bkey_extent_is_data(k.k)) {
-		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-		const union bch_extent_entry *entry;
-		struct extent_ptr_decoded p;
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
 
-		r->nr_required	= 1;
+	r->nr_required	= 1;
 
-		extent_for_each_ptr_decode(e, p, entry) {
-			if (p.ptr.cached)
-				continue;
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		if (p.ptr.cached)
+			continue;
 
-			if (p.ec_nr) {
-				r->nr_devs = 0;
-				break;
-			}
-
-			r->devs[r->nr_devs++] = p.ptr.dev;
+		if (p.ec_nr) {
+			r->nr_devs = 0;
+			break;
 		}
+
+		r->devs[r->nr_devs++] = p.ptr.dev;
 	}
 }
 
 static void stripe_to_replicas(struct bkey_s_c k,
 			       struct bch_replicas_entry *r)
 {
-	if (k.k->type == BCH_STRIPE) {
-		struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-		const struct bch_extent_ptr *ptr;
+	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+	const struct bch_extent_ptr *ptr;
 
-		r->nr_required	= s.v->nr_blocks - s.v->nr_redundant;
+	r->nr_required	= s.v->nr_blocks - s.v->nr_redundant;
 
-		for (ptr = s.v->ptrs;
-		     ptr < s.v->ptrs + s.v->nr_blocks;
-		     ptr++)
-			r->devs[r->nr_devs++] = ptr->dev;
-	}
+	for (ptr = s.v->ptrs;
+	     ptr < s.v->ptrs + s.v->nr_blocks;
+	     ptr++)
+		r->devs[r->nr_devs++] = ptr->dev;
 }
 
-static void bkey_to_replicas(enum bkey_type type,
-			     struct bkey_s_c k,
+static void bkey_to_replicas(struct bkey_s_c k,
 			     struct bch_replicas_entry *e)
 {
 	e->nr_devs = 0;
 
-	switch (type) {
-	case BKEY_TYPE_BTREE:
+	switch (k.k->type) {
+	case KEY_TYPE_btree_ptr:
 		e->data_type = BCH_DATA_BTREE;
 		extent_to_replicas(k, e);
 		break;
-	case BKEY_TYPE_EXTENTS:
+	case KEY_TYPE_extent:
 		e->data_type = BCH_DATA_USER;
 		extent_to_replicas(k, e);
 		break;
-	case BKEY_TYPE_EC:
+	case KEY_TYPE_stripe:
 		e->data_type = BCH_DATA_USER;
 		stripe_to_replicas(k, e);
 		break;
-	default:
-		break;
 	}
 
 	replicas_entry_sort(e);
@@ -295,26 +288,21 @@ int bch2_mark_replicas(struct bch_fs *c,
 	return __bch2_mark_replicas(c, &search.e);
 }
 
-int bch2_mark_bkey_replicas(struct bch_fs *c,
-			    enum bkey_type type,
-			    struct bkey_s_c k)
+int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
 {
 	struct bch_replicas_entry_padded search;
+	struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+	unsigned i;
 	int ret;
 
 	memset(&search, 0, sizeof(search));
 
-	if (type == BKEY_TYPE_EXTENTS) {
-		struct bch_devs_list cached = bch2_bkey_cached_devs(k);
-		unsigned i;
+	for (i = 0; i < cached.nr; i++)
+		if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
+					      bch2_dev_list_single(cached.devs[i]))))
+			return ret;
 
-		for (i = 0; i < cached.nr; i++)
-			if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
-						bch2_dev_list_single(cached.devs[i]))))
-				return ret;
-	}
-
-	bkey_to_replicas(type, k, &search.e);
+	bkey_to_replicas(k, &search.e);
 
 	return search.e.nr_devs
 		? __bch2_mark_replicas(c, &search.e)
@@ -718,26 +706,22 @@ bool bch2_replicas_marked(struct bch_fs *c,
 }
 
 bool bch2_bkey_replicas_marked(struct bch_fs *c,
-			       enum bkey_type type,
 			       struct bkey_s_c k,
 			       bool check_gc_replicas)
 {
 	struct bch_replicas_entry_padded search;
+	struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+	unsigned i;
 
 	memset(&search, 0, sizeof(search));
 
-	if (type == BKEY_TYPE_EXTENTS) {
-		struct bch_devs_list cached = bch2_bkey_cached_devs(k);
-		unsigned i;
+	for (i = 0; i < cached.nr; i++)
+		if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
+					  bch2_dev_list_single(cached.devs[i]),
+					  check_gc_replicas))
+			return false;
 
-		for (i = 0; i < cached.nr; i++)
-			if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
-					bch2_dev_list_single(cached.devs[i]),
-					check_gc_replicas))
-				return false;
-	}
-
-	bkey_to_replicas(type, k, &search.e);
+	bkey_to_replicas(k, &search.e);
 
 	return search.e.nr_devs
 		? replicas_has_entry(c, &search.e, check_gc_replicas)
diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h
index 7fee927c..87246a04 100644
--- a/libbcachefs/replicas.h
+++ b/libbcachefs/replicas.h
@@ -5,12 +5,11 @@
 
 bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type,
 			  struct bch_devs_list, bool);
-bool bch2_bkey_replicas_marked(struct bch_fs *, enum bkey_type,
+bool bch2_bkey_replicas_marked(struct bch_fs *,
 			       struct bkey_s_c, bool);
 int bch2_mark_replicas(struct bch_fs *, enum bch_data_type,
 		       struct bch_devs_list);
-int bch2_mark_bkey_replicas(struct bch_fs *, enum bkey_type,
-			    struct bkey_s_c);
+int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c);
 
 void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
 
diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h
index 7eff5a42..032b34a1 100644
--- a/libbcachefs/str_hash.h
+++ b/libbcachefs/str_hash.h
@@ -117,7 +117,6 @@ static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx,
 struct bch_hash_desc {
 	enum btree_id	btree_id;
 	u8		key_type;
-	u8		whiteout_type;
 
 	u64		(*hash_key)(const struct bch_hash_info *, const void *);
 	u64		(*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c);
@@ -148,7 +147,7 @@ bch2_hash_lookup(struct btree_trans *trans,
 		if (k.k->type == desc.key_type) {
 			if (!desc.cmp_key(k, key))
 				return iter;
-		} else if (k.k->type == desc.whiteout_type) {
+		} else if (k.k->type == KEY_TYPE_whiteout) {
 			;
 		} else {
 			/* hole, not found */
@@ -201,7 +200,7 @@ static inline int bch2_hash_needs_whiteout(struct btree_trans *trans,
 
 	for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
 		if (k.k->type != desc.key_type &&
-		    k.k->type != desc.whiteout_type)
+		    k.k->type != KEY_TYPE_whiteout)
 			return false;
 
 		if (k.k->type == desc.key_type &&
@@ -244,7 +243,7 @@ static inline int __bch2_hash_set(struct btree_trans *trans,
 				return PTR_ERR(slot);
 		}
 
-		if (k.k->type != desc.whiteout_type)
+		if (k.k->type != KEY_TYPE_whiteout)
 			goto not_found;
 	}
 
@@ -294,7 +293,7 @@ static inline int bch2_hash_delete_at(struct btree_trans *trans,
 
 	bkey_init(&delete->k);
 	delete->k.p = iter->pos;
-	delete->k.type = ret ? desc.whiteout_type : KEY_TYPE_DELETED;
+	delete->k.type = ret ? KEY_TYPE_whiteout : KEY_TYPE_deleted;
 
 	bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, delete));
 	return 0;
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index 71920079..c5eaf155 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -232,21 +232,25 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
 	struct bch_sb_field *f;
 	struct bch_sb_field_members *mi;
 	const char *err;
+	u32 version, version_min;
 	u16 block_size;
 
-	if (le64_to_cpu(sb->version) < BCH_SB_VERSION_MIN ||
-	    le64_to_cpu(sb->version) > BCH_SB_VERSION_MAX)
-		return"Unsupported superblock version";
+	version		= le16_to_cpu(sb->version);
+	version_min	= version >= bcachefs_metadata_version_new_versioning
+		? le16_to_cpu(sb->version_min)
+		: version;
+
+	if (version    >= bcachefs_metadata_version_max ||
+	    version_min < bcachefs_metadata_version_min)
+		return "Unsupported superblock version";
+
+	if (version_min > version)
+		return "Bad minimum version";
 
 	if (sb->features[1] ||
 	    (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR)))
 		return "Filesystem has incompatible features";
 
-	if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX) {
-		SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb, 7);
-		SET_BCH_SB_POSIX_ACL(sb, 1);
-	}
-
 	block_size = le16_to_cpu(sb->block_size);
 
 	if (!is_power_of_2(block_size) ||
@@ -333,13 +337,6 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
 			return err;
 	}
 
-	if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_NONCE_V1 &&
-	    bch2_sb_get_crypt(sb) &&
-	    BCH_SB_INITIALIZED(sb))
-		return "Incompatible extent nonces";
-
-	sb->version = cpu_to_le64(BCH_SB_VERSION_MAX);
-
 	return NULL;
 }
 
@@ -356,6 +353,7 @@ static void bch2_sb_update(struct bch_fs *c)
 
 	c->sb.uuid		= src->uuid;
 	c->sb.user_uuid		= src->user_uuid;
+	c->sb.version		= le16_to_cpu(src->version);
 	c->sb.nr_devices	= src->nr_devices;
 	c->sb.clean		= BCH_SB_CLEAN(src);
 	c->sb.encryption_type	= BCH_SB_ENCRYPTION_TYPE(src);
@@ -377,6 +375,7 @@ static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
 	unsigned i;
 
 	dst->version		= src->version;
+	dst->version_min	= src->version_min;
 	dst->seq		= src->seq;
 	dst->uuid		= src->uuid;
 	dst->user_uuid		= src->user_uuid;
@@ -476,8 +475,8 @@ reread:
 	if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC))
 		return "Not a bcachefs superblock";
 
-	if (le64_to_cpu(sb->sb->version) < BCH_SB_VERSION_MIN ||
-	    le64_to_cpu(sb->sb->version) > BCH_SB_VERSION_MAX)
+	if (le16_to_cpu(sb->sb->version) <  bcachefs_metadata_version_min ||
+	    le16_to_cpu(sb->sb->version) >= bcachefs_metadata_version_max)
 		return "Unsupported superblock version";
 
 	bytes = vstruct_bytes(sb->sb);
@@ -843,12 +842,6 @@ static const char *bch2_sb_validate_members(struct bch_sb *sb,
 			return "bucket size smaller than btree node size";
 	}
 
-	if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX)
-		for (m = mi->members;
-		     m < mi->members + sb->nr_devices;
-		     m++)
-			SET_BCH_MEMBER_DATA_ALLOWED(m, ~0);
-
 	return NULL;
 }
 
@@ -878,6 +871,16 @@ static const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
 
 /* BCH_SB_FIELD_clean: */
 
+void bch2_sb_clean_renumber(struct bch_sb_field_clean *clean, int write)
+{
+	struct jset_entry *entry;
+
+	for (entry = clean->start;
+	     entry < (struct jset_entry *) vstruct_end(&clean->field);
+	     entry = vstruct_next(entry))
+		bch2_bkey_renumber(BKEY_TYPE_BTREE, bkey_to_packed(entry->start), write);
+}
+
 void bch2_fs_mark_clean(struct bch_fs *c, bool clean)
 {
 	struct bch_sb_field_clean *sb_clean;
@@ -932,6 +935,10 @@ void bch2_fs_mark_clean(struct bch_fs *c, bool clean)
 
 	BUG_ON(entry != vstruct_end(&sb_clean->field));
 
+	if (le16_to_cpu(c->disk_sb.sb->version) <
+	    bcachefs_metadata_version_bkey_renumber)
+		bch2_sb_clean_renumber(sb_clean, WRITE);
+
 	mutex_unlock(&c->btree_root_lock);
 write_super:
 	bch2_write_super(c);
diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h
index c66fd974..b493d628 100644
--- a/libbcachefs/super-io.h
+++ b/libbcachefs/super-io.h
@@ -134,6 +134,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
 
 /* BCH_SB_FIELD_clean: */
 
+void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int);
+
 void bch2_fs_mark_clean(struct bch_fs *, bool);
 
 void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 0eb6b7e7..b33117dd 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -9,6 +9,7 @@
 #include "bcachefs.h"
 #include "alloc_background.h"
 #include "alloc_foreground.h"
+#include "bkey_sort.h"
 #include "btree_cache.h"
 #include "btree_gc.h"
 #include "btree_update_interior.h"
@@ -580,7 +581,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 
 	INIT_LIST_HEAD(&c->ec_new_stripe_list);
 	mutex_init(&c->ec_new_stripe_lock);
-	mutex_init(&c->ec_stripes_lock);
+	mutex_init(&c->ec_stripe_create_lock);
 	spin_lock_init(&c->ec_stripes_heap_lock);
 
 	seqcount_init(&c->gc_pos_lock);
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 0c3bdcd1..42e09f5f 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -276,7 +276,7 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
 		return -EPERM;
 
 	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0, k)
-		if (k.k->type == BCH_EXTENT) {
+		if (k.k->type == KEY_TYPE_extent) {
 			struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
 			const union bch_extent_entry *entry;
 			struct extent_ptr_decoded p;
diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c
index 31f3b981..85d8bdd3 100644
--- a/libbcachefs/xattr.c
+++ b/libbcachefs/xattr.c
@@ -61,8 +61,7 @@ static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
 
 const struct bch_hash_desc bch2_xattr_hash_desc = {
 	.btree_id	= BTREE_ID_XATTRS,
-	.key_type	= BCH_XATTR,
-	.whiteout_type	= BCH_XATTR_WHITEOUT,
+	.key_type	= KEY_TYPE_xattr,
 	.hash_key	= xattr_hash_key,
 	.hash_bkey	= xattr_hash_bkey,
 	.cmp_key	= xattr_cmp_key,
@@ -72,71 +71,50 @@ const struct bch_hash_desc bch2_xattr_hash_desc = {
 const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
 	const struct xattr_handler *handler;
-	struct bkey_s_c_xattr xattr;
+	struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
 
-	switch (k.k->type) {
-	case BCH_XATTR:
-		if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr))
-			return "value too small";
+	if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr))
+		return "value too small";
 
-		xattr = bkey_s_c_to_xattr(k);
+	if (bkey_val_u64s(k.k) <
+	    xattr_val_u64s(xattr.v->x_name_len,
+			   le16_to_cpu(xattr.v->x_val_len)))
+		return "value too small";
 
-		if (bkey_val_u64s(k.k) <
-			xattr_val_u64s(xattr.v->x_name_len,
-				       le16_to_cpu(xattr.v->x_val_len)))
-			return "value too small";
+	if (bkey_val_u64s(k.k) >
+	    xattr_val_u64s(xattr.v->x_name_len,
+			   le16_to_cpu(xattr.v->x_val_len) + 4))
+		return "value too big";
 
-		if (bkey_val_u64s(k.k) >
-			xattr_val_u64s(xattr.v->x_name_len,
-				       le16_to_cpu(xattr.v->x_val_len) + 4))
-			return "value too big";
-
-		handler = bch2_xattr_type_to_handler(xattr.v->x_type);
-		if (!handler)
-			return "invalid type";
-
-		if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len))
-			return "xattr name has invalid characters";
-
-		return NULL;
-	case BCH_XATTR_WHITEOUT:
-		return bkey_val_bytes(k.k) != 0
-			? "value size should be zero"
-			: NULL;
-
-	default:
+	handler = bch2_xattr_type_to_handler(xattr.v->x_type);
+	if (!handler)
 		return "invalid type";
-	}
+
+	if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len))
+		return "xattr name has invalid characters";
+
+	return NULL;
 }
 
 void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
 			struct bkey_s_c k)
 {
 	const struct xattr_handler *handler;
-	struct bkey_s_c_xattr xattr;
+	struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
 
-	switch (k.k->type) {
-	case BCH_XATTR:
-		xattr = bkey_s_c_to_xattr(k);
+	handler = bch2_xattr_type_to_handler(xattr.v->x_type);
+	if (handler && handler->prefix)
+		pr_buf(out, "%s", handler->prefix);
+	else if (handler)
+		pr_buf(out, "(type %u)", xattr.v->x_type);
+	else
+		pr_buf(out, "(unknown type %u)", xattr.v->x_type);
 
-		handler = bch2_xattr_type_to_handler(xattr.v->x_type);
-		if (handler && handler->prefix)
-			pr_buf(out, "%s", handler->prefix);
-		else if (handler)
-			pr_buf(out, "(type %u)", xattr.v->x_type);
-		else
-			pr_buf(out, "(unknown type %u)", xattr.v->x_type);
-
-		bch_scnmemcpy(out, xattr.v->x_name,
-			      xattr.v->x_name_len);
-		pr_buf(out, ":");
-		bch_scnmemcpy(out, xattr_val(xattr.v),
-			      le16_to_cpu(xattr.v->x_val_len));
-		break;
-	case BCH_XATTR_WHITEOUT:
-		pr_buf(out, "whiteout");
-		break;
-	}
+	bch_scnmemcpy(out, xattr.v->x_name,
+		      xattr.v->x_name_len);
+	pr_buf(out, ":");
+	bch_scnmemcpy(out, xattr_val(xattr.v),
+		      le16_to_cpu(xattr.v->x_val_len));
 }
 
 int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
@@ -260,7 +238,7 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 		if (k.k->p.inode > inum)
 			break;
 
-		if (k.k->type != BCH_XATTR)
+		if (k.k->type != KEY_TYPE_xattr)
 			continue;
 
 		xattr = bkey_s_c_to_xattr(k).v;
@@ -313,7 +291,7 @@ static const struct xattr_handler bch_xattr_user_handler = {
 	.prefix	= XATTR_USER_PREFIX,
 	.get	= bch2_xattr_get_handler,
 	.set	= bch2_xattr_set_handler,
-	.flags	= BCH_XATTR_INDEX_USER,
+	.flags	= KEY_TYPE_XATTR_INDEX_USER,
 };
 
 static bool bch2_xattr_trusted_list(struct dentry *dentry)
@@ -326,14 +304,14 @@ static const struct xattr_handler bch_xattr_trusted_handler = {
 	.list	= bch2_xattr_trusted_list,
 	.get	= bch2_xattr_get_handler,
 	.set	= bch2_xattr_set_handler,
-	.flags	= BCH_XATTR_INDEX_TRUSTED,
+	.flags	= KEY_TYPE_XATTR_INDEX_TRUSTED,
 };
 
 static const struct xattr_handler bch_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
 	.get	= bch2_xattr_get_handler,
 	.set	= bch2_xattr_set_handler,
-	.flags	= BCH_XATTR_INDEX_SECURITY,
+	.flags	= KEY_TYPE_XATTR_INDEX_SECURITY,
 };
 
 #ifndef NO_BCACHEFS_FS
@@ -471,13 +449,13 @@ const struct xattr_handler *bch2_xattr_handlers[] = {
 };
 
 static const struct xattr_handler *bch_xattr_handler_map[] = {
-	[BCH_XATTR_INDEX_USER]			= &bch_xattr_user_handler,
-	[BCH_XATTR_INDEX_POSIX_ACL_ACCESS]	=
+	[KEY_TYPE_XATTR_INDEX_USER]			= &bch_xattr_user_handler,
+	[KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS]	=
 		&posix_acl_access_xattr_handler,
-	[BCH_XATTR_INDEX_POSIX_ACL_DEFAULT]	=
+	[KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT]	=
 		&posix_acl_default_xattr_handler,
-	[BCH_XATTR_INDEX_TRUSTED]		= &bch_xattr_trusted_handler,
-	[BCH_XATTR_INDEX_SECURITY]		= &bch_xattr_security_handler,
+	[KEY_TYPE_XATTR_INDEX_TRUSTED]		= &bch_xattr_trusted_handler,
+	[KEY_TYPE_XATTR_INDEX_SECURITY]		= &bch_xattr_security_handler,
 };
 
 static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type)
diff --git a/libbcachefs/xattr.h b/libbcachefs/xattr.h
index 42b7ba3e..e9b27767 100644
--- a/libbcachefs/xattr.h
+++ b/libbcachefs/xattr.h
@@ -8,7 +8,7 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc;
 const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c);
 void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
-#define bch2_bkey_xattr_ops (struct bkey_ops) {		\
+#define bch2_bkey_ops_xattr (struct bkey_ops) {		\
 	.key_invalid	= bch2_xattr_invalid,		\
 	.val_to_text	= bch2_xattr_to_text,		\
 }