Update bcachefs sources to 14ce2a2031 bcachefs: fixes for building in userspace

Kent Overstreet 2017-12-21 18:00:30 -05:00
parent 8acc54456e
commit 1cf4d51dc4
61 changed files with 2074 additions and 1442 deletions


@ -1 +1 @@
e57b5958cf4e8530d26f7c36a6e1427fb284cc70
14ce2a2031f3761a4b957aa2e5aac446ce18b87c


@ -293,11 +293,11 @@ int cmd_list(int argc, char *argv[])
list_modes, "list mode");
break;
case 'f':
opts.fix_errors = FSCK_ERR_YES;
opts.norecovery = false;
opt_set(opts, fix_errors, FSCK_OPT_YES);
opt_set(opts, norecovery, false);
break;
case 'v':
opts.verbose_recovery = true;
opt_set(opts, verbose_recovery, true);
break;
case 'h':
list_keys_usage();


@ -28,18 +28,19 @@ int cmd_fsck(int argc, char *argv[])
int opt;
opt_set(opts, degraded, true);
opt_set(opts, fix_errors, FSCK_OPT_ASK);
while ((opt = getopt(argc, argv, "pynfvh")) != -1)
switch (opt) {
case 'p':
opt_set(opts, fix_errors, FSCK_ERR_YES);
opt_set(opts, fix_errors, FSCK_OPT_YES);
break;
case 'y':
opt_set(opts, fix_errors, FSCK_ERR_YES);
opt_set(opts, fix_errors, FSCK_OPT_YES);
break;
case 'n':
opt_set(opts, nochanges, true);
opt_set(opts, fix_errors, FSCK_ERR_NO);
opt_set(opts, fix_errors, FSCK_OPT_NO);
break;
case 'f':
/* force check, even if filesystem marked clean: */


@ -164,7 +164,7 @@ static struct bch_inode_unpacked create_file(struct bch_fs *c,
struct bch_inode_unpacked new_inode;
int ret;
bch2_inode_init(c, &new_inode, uid, gid, mode, rdev);
bch2_inode_init(c, &new_inode, uid, gid, mode, rdev, parent);
ret = bch2_inode_create(c, &new_inode, BLOCKDEV_INODE_MAX, 0,
&c->unused_inode_hint);
@ -247,7 +247,6 @@ static void write_data(struct bch_fs *c,
struct bch_inode_unpacked *dst_inode,
u64 dst_offset, void *buf, size_t len)
{
struct disk_reservation res;
struct bch_write_op op;
struct bio_vec bv;
struct closure cl;
@ -261,12 +260,15 @@ static void write_data(struct bch_fs *c,
op.wbio.bio.bi_iter.bi_size = len;
bch2_bio_map(&op.wbio.bio, buf);
int ret = bch2_disk_reservation_get(c, &res, len >> 9, 0);
bch2_write_op_init(&op, c);
op.write_point = writepoint_hashed(0);
op.pos = POS(dst_inode->bi_inum, dst_offset >> 9);
int ret = bch2_disk_reservation_get(c, &op.res, len >> 9, 0);
if (ret)
die("error reserving space in new filesystem: %s", strerror(-ret));
bch2_write_op_init(&op, c, res, NULL, writepoint_hashed(0),
POS(dst_inode->bi_inum, dst_offset >> 9), NULL, 0);
closure_call(&op.cl, bch2_write, NULL, &cl);
closure_sync(&cl);


@ -243,7 +243,8 @@ static inline void bioset_free(struct bio_set *bs)
static inline int bioset_init(struct bio_set *bs,
unsigned pool_size,
unsigned front_pad)
unsigned front_pad,
int flags)
{
bs->front_pad = front_pad;
return 0;
@ -251,6 +252,10 @@ static inline int bioset_init(struct bio_set *bs,
extern struct bio_set *bioset_create(unsigned int, unsigned int);
extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int);
enum {
BIOSET_NEED_BVECS = 1 << 0,
BIOSET_NEED_RESCUER = 1 << 1,
};
extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
extern void bio_put(struct bio *);
@ -271,13 +276,6 @@ static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask)
}
extern void bio_endio(struct bio *);
extern void bio_endio_nodec(struct bio *);
static inline void bio_io_error(struct bio *bio)
{
bio->bi_error = -EIO;
bio_endio(bio);
}
extern void bio_advance(struct bio *, unsigned);
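For illustration, a minimal caller sketch (not part of this commit) against the updated userspace shim, assuming the four-argument bioset_init() and the BIOSET_NEED_BVECS flag shown above:

static struct bio_set example_bio_set;

static int example_bioset_setup(void)
{
	/* pool of 128 bios, no front padding, bvec pools requested: */
	return bioset_init(&example_bio_set, 128, 0, BIOSET_NEED_BVECS);
}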


@ -13,7 +13,27 @@ struct bio_set;
struct bio;
struct block_device;
typedef void (bio_end_io_t) (struct bio *);
typedef void (bio_destructor_t) (struct bio *);
/*
* Block error status values. See block/blk-core:blk_errors for the details.
*/
typedef u8 __bitwise blk_status_t;
#define BLK_STS_OK 0
#define BLK_STS_NOTSUPP ((__force blk_status_t)1)
#define BLK_STS_TIMEOUT ((__force blk_status_t)2)
#define BLK_STS_NOSPC ((__force blk_status_t)3)
#define BLK_STS_TRANSPORT ((__force blk_status_t)4)
#define BLK_STS_TARGET ((__force blk_status_t)5)
#define BLK_STS_NEXUS ((__force blk_status_t)6)
#define BLK_STS_MEDIUM ((__force blk_status_t)7)
#define BLK_STS_PROTECTION ((__force blk_status_t)8)
#define BLK_STS_RESOURCE ((__force blk_status_t)9)
#define BLK_STS_IOERR ((__force blk_status_t)10)
/* hack for device mapper, don't use elsewhere: */
#define BLK_STS_DM_REQUEUE ((__force blk_status_t)11)
#define BLK_STS_AGAIN ((__force blk_status_t)12)
/*
* main unit of I/O for the block layer and lower layers (ie drivers and
@ -22,7 +42,7 @@ typedef void (bio_destructor_t) (struct bio *);
struct bio {
struct bio *bi_next; /* request queue link */
struct block_device *bi_bdev;
int bi_error;
blk_status_t bi_status;
unsigned int bi_opf; /* bottom bits req flags,
* top bits REQ_OP. Use
* accessors.


@ -197,5 +197,8 @@ static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx)
#define capable(cap) true
int blk_status_to_errno(blk_status_t status);
blk_status_t errno_to_blk_status(int errno);
#endif /* __TOOLS_LINUX_BLKDEV_H */
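As a usage sketch (assumption, not code from this diff), a completion handler built against these shims would translate the new blk_status_t field back into an errno:

static void example_read_endio(struct bio *bio)
{
	int err = blk_status_to_errno(bio->bi_status);

	if (err) {
		/* e.g. -EIO for BLK_STS_IOERR; handle_read_error() is hypothetical */
		handle_read_error(err);
	}

	bio_put(bio);
}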


@ -14,7 +14,7 @@
#define BUG() do { assert(0); unreachable(); } while (0)
#define BUG_ON(cond) assert(!(cond))
#define WARN_ON_ONCE(cond) assert(!(cond))
#define WARN_ON_ONCE(cond) ({ bool _r = (cond); if (_r) assert(0); _r; })
#define WARN_ONCE(cond, msg) ({ bool _r = (cond); if (_r) assert(0); _r; })
#define __WARN() assert(0)
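A hypothetical caller, showing why WARN_ON_ONCE() now evaluates to the condition rather than void — kernel-style code routinely branches on its result:

static int example_check_len(size_t len)
{
	/* asserts in the userspace build, and still returns the condition: */
	if (WARN_ON_ONCE(len > (1U << 20)))
		return -EINVAL;

	return 0;
}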


@ -204,4 +204,19 @@ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns)
extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
const struct timespec64 rhs);
static inline struct timespec timespec_trunc(struct timespec t, unsigned gran)
{
/* Avoid division in the common cases 1 ns and 1 s. */
if (gran == 1) {
/* nothing */
} else if (gran == NSEC_PER_SEC) {
t.tv_nsec = 0;
} else if (gran > 1 && gran < NSEC_PER_SEC) {
t.tv_nsec -= t.tv_nsec % gran;
} else {
WARN(1, "illegal file time granularity: %u", gran);
}
return t;
}
#endif /* _LINUX_TIME64_H */
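A small usage sketch (assumption), mirroring how bch2_inode_init() later in this commit truncates timestamps to the filesystem's time precision:

static struct timespec example_fs_now(unsigned time_precision_ns)
{
	/* gran == 1 leaves the value untouched; NSEC_PER_SEC drops tv_nsec */
	return timespec_trunc(current_kernel_time(), time_precision_ns);
}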


@ -193,8 +193,7 @@ int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
if (ret < 0)
return ret;
else {
inode->v.i_ctime =
current_fs_time(inode->v.i_sb);
inode->v.i_ctime = current_time(&inode->v);
mark_inode_dirty(&inode->v);
if (ret == 0)
acl = NULL;


@ -257,7 +257,7 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
return;
a = bkey_s_c_to_alloc(k);
ca = c->devs[a.k->p.inode];
ca = bch_dev_bkey_exists(c, a.k->p.inode);
if (a.k->p.offset >= ca->mi.nbuckets)
return;
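Throughout this commit, direct c->devs[...] dereferences are replaced with bch_dev_bkey_exists(). The helper is defined in a header that is not part of this diff; conceptually it is a device lookup that asserts the device referenced by an existing key must be present — roughly along these lines (sketch, assumption):

static inline struct bch_dev *example_dev_bkey_exists(struct bch_fs *c, unsigned idx)
{
	/* a device index stored in an on-disk key must refer to a live device: */
	BUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);

	return c->devs[idx];
}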
@ -305,10 +305,12 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
bch2_alloc_read_key(c, bkey_i_to_s_c(k));
}
mutex_lock(&c->bucket_lock);
for_each_member_device(ca, c, i) {
bch2_recalc_min_prio(c, ca, READ);
bch2_recalc_min_prio(c, ca, WRITE);
}
mutex_unlock(&c->bucket_lock);
return 0;
}
@ -368,7 +370,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode])
return 0;
ca = c->devs[pos.inode];
ca = bch_dev_bkey_exists(c, pos.inode);
if (pos.offset >= ca->mi.nbuckets)
return 0;
@ -461,7 +463,7 @@ static void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
/* Bucket heap / gen */
void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw)
static void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw)
{
struct prio_clock *clock = &c->prio_clock[rw];
struct bucket *g;
@ -975,7 +977,7 @@ static int bch2_allocator_thread(void *arg)
void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
struct bch_dev *ca = c->devs[ob->ptr.dev];
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
spin_lock(&ob->lock);
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), false,
@ -1303,7 +1305,7 @@ static void writepoint_drop_ptrs(struct bch_fs *c,
for (i = wp->nr_ptrs - 1; i >= 0; --i) {
struct open_bucket *ob = wp->ptrs[i];
struct bch_dev *ca = c->devs[ob->ptr.dev];
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
if (nr_ptrs_dislike && !test_bit(ob->ptr.dev, devs->d)) {
BUG_ON(ca->open_buckets_partial_nr >=
@ -1331,7 +1333,7 @@ static void verify_not_stale(struct bch_fs *c, const struct write_point *wp)
unsigned i;
writepoint_for_each_ptr(wp, ob, i) {
struct bch_dev *ca = c->devs[ob->ptr.dev];
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
BUG_ON(ptr_stale(ca, &ob->ptr));
}
@ -1537,7 +1539,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
for (i = 0; i < wp->nr_ptrs_can_use; i++) {
struct open_bucket *ob = wp->ptrs[i];
struct bch_dev *ca = c->devs[ob->ptr.dev];
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
struct bch_extent_ptr tmp = ob->ptr;
EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptr.dev));
@ -1589,7 +1591,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
ra_pages += bdi->ra_pages;
}
c->bdi.ra_pages = ra_pages;
bch2_set_ra_pages(c, ra_pages);
/* Find fastest, slowest tiers with devices: */


@ -326,9 +326,9 @@ struct io_count {
struct bch_dev {
struct kobject kobj;
struct percpu_ref ref;
struct completion ref_completion;
struct percpu_ref io_ref;
struct completion stop_complete;
struct completion offline_complete;
struct completion io_ref_completion;
struct bch_fs *fs;
@ -515,12 +515,11 @@ struct bch_fs {
struct closure sb_write;
struct mutex sb_lock;
struct backing_dev_info bdi;
/* BTREE CACHE */
struct bio_set btree_read_bio;
struct btree_root btree_roots[BTREE_ID_NR];
bool btree_roots_dirty;
struct mutex btree_root_lock;
struct btree_cache btree_cache;
@ -710,6 +709,14 @@ struct bch_fs {
#undef BCH_TIME_STAT
};
static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
{
#ifndef NO_BCACHEFS_FS
if (c->vfs_sb)
c->vfs_sb->s_bdi->ra_pages = ra_pages;
#endif
}
static inline bool bch2_fs_running(struct bch_fs *c)
{
return c->state == BCH_FS_RO || c->state == BCH_FS_RW;


@ -593,18 +593,24 @@ struct bch_inode_generation {
} __attribute__((packed, aligned(8)));
BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION);
#define BCH_INODE_FIELDS() \
BCH_INODE_FIELD(bi_atime, 64) \
BCH_INODE_FIELD(bi_ctime, 64) \
BCH_INODE_FIELD(bi_mtime, 64) \
BCH_INODE_FIELD(bi_otime, 64) \
BCH_INODE_FIELD(bi_size, 64) \
BCH_INODE_FIELD(bi_sectors, 64) \
BCH_INODE_FIELD(bi_uid, 32) \
BCH_INODE_FIELD(bi_gid, 32) \
BCH_INODE_FIELD(bi_nlink, 32) \
BCH_INODE_FIELD(bi_generation, 32) \
BCH_INODE_FIELD(bi_dev, 32)
#define BCH_INODE_FIELDS() \
BCH_INODE_FIELD(bi_atime, 64) \
BCH_INODE_FIELD(bi_ctime, 64) \
BCH_INODE_FIELD(bi_mtime, 64) \
BCH_INODE_FIELD(bi_otime, 64) \
BCH_INODE_FIELD(bi_size, 64) \
BCH_INODE_FIELD(bi_sectors, 64) \
BCH_INODE_FIELD(bi_uid, 32) \
BCH_INODE_FIELD(bi_gid, 32) \
BCH_INODE_FIELD(bi_nlink, 32) \
BCH_INODE_FIELD(bi_generation, 32) \
BCH_INODE_FIELD(bi_dev, 32) \
BCH_INODE_FIELD(bi_data_checksum, 8) \
BCH_INODE_FIELD(bi_compression, 8)
#define BCH_INODE_FIELDS_INHERIT() \
BCH_INODE_FIELD(bi_data_checksum) \
BCH_INODE_FIELD(bi_compression)
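These x-macro lists are consumed by defining BCH_INODE_FIELD and expanding the list; for instance, bch2_inode_init() later in this commit uses the inherit list to copy per-inode options from the parent directory:

#define BCH_INODE_FIELD(_name)	inode_u->_name = parent->_name;
	BCH_INODE_FIELDS_INHERIT()
#undef BCH_INODE_FIELD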
enum {
/*
@ -794,7 +800,7 @@ struct bch_sb_layout {
__u8 sb_max_size_bits; /* base 2 of 512 byte sectors */
__u8 nr_superblocks;
__u8 pad[5];
__u64 sb_offset[61];
__le64 sb_offset[61];
} __attribute__((packed, aligned(8)));
#define BCH_SB_LAYOUT_SECTOR 7
@ -1089,6 +1095,11 @@ struct jset_entry {
};
};
struct jset_entry_blacklist {
struct jset_entry entry;
__le64 seq;
};
#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
enum {


@ -1,6 +1,7 @@
#include "bcachefs.h"
#include "bkey.h"
#include "bkey_methods.h"
#include "bset.h"
#include "util.h"
@ -80,37 +81,6 @@ static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed,
const struct bkey_format *format) {}
#endif
int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
{
char *out = buf, *end = buf + size;
#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
p("u64s %u type %u %llu:%llu snap %u len %u ver %llu",
k->u64s, k->type, k->p.inode, k->p.offset,
k->p.snapshot, k->size, k->version.lo);
BUG_ON(bkey_packed(k));
switch (k->type) {
case KEY_TYPE_DELETED:
p(" deleted");
break;
case KEY_TYPE_DISCARD:
p(" discard");
break;
case KEY_TYPE_ERROR:
p(" error");
break;
case KEY_TYPE_COOKIE:
p(" cookie");
break;
}
#undef p
return out - buf;
}
struct pack_state {
const struct bkey_format *format;
unsigned bits; /* bits remaining in current word */
@ -336,7 +306,8 @@ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
* Extents - we have to guarantee that if an extent is packed, a trimmed
* version will also pack:
*/
if (bkey_start_offset(in) < format->field_offset[BKEY_FIELD_OFFSET])
if (bkey_start_offset(in) <
le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET]))
return false;
pack_state_finish(&state, out);
@ -800,7 +771,7 @@ static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
bool *eax_zeroed)
{
unsigned bits = format->bits_per_field[field];
u64 offset = format->field_offset[field];
u64 offset = le64_to_cpu(format->field_offset[field]);
unsigned i, byte, bit_offset, align, shl, shr;
if (!bits && !offset) {


@ -8,7 +8,6 @@
#include "vstructs.h"
void bch2_to_binary(char *, const u64 *, unsigned);
int bch2_bkey_to_text(char *, size_t, const struct bkey *);
#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
@ -377,7 +376,8 @@ static inline u64 bkey_field_max(const struct bkey_format *f,
enum bch_bkey_fields nr)
{
return f->bits_per_field[nr] < 64
? f->field_offset[nr] + ~(~0ULL << f->bits_per_field[nr])
? (le64_to_cpu(f->field_offset[nr]) +
~(~0ULL << f->bits_per_field[nr]))
: U64_MAX;
}


@ -18,28 +18,11 @@ const struct bkey_ops *bch2_bkey_ops[] = {
[BKEY_TYPE_BTREE] = &bch2_bkey_btree_ops,
};
/* Returns string indicating reason for being invalid, or NULL if valid: */
const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k)
const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
if (k.k->u64s < BKEY_U64s)
return "u64s too small";
if (!ops->is_extents) {
if (k.k->size)
return "nonzero size field";
} else {
if ((k.k->size == 0) != bkey_deleted(k.k))
return "bad size field";
}
if (ops->is_extents &&
!k.k->size &&
!bkey_deleted(k.k))
return "zero size field";
switch (k.k->type) {
case KEY_TYPE_DELETED:
case KEY_TYPE_DISCARD:
@ -63,8 +46,41 @@ const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
}
}
const char *bch2_btree_bkey_invalid(struct bch_fs *c, struct btree *b,
struct bkey_s_c k)
const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
if (k.k->u64s < BKEY_U64s)
return "u64s too small";
if (!ops->is_extents) {
if (k.k->size)
return "nonzero size field";
} else {
if ((k.k->size == 0) != bkey_deleted(k.k))
return "bad size field";
}
if (ops->is_extents &&
!k.k->size &&
!bkey_deleted(k.k))
return "zero size field";
if (k.k->p.snapshot)
return "nonzero snapshot";
return NULL;
}
const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k)
{
return __bch2_bkey_invalid(c, type, k) ?:
bch2_bkey_val_invalid(c, type, k);
}
const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
{
if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
return "key before start of btree node";
@ -72,10 +88,7 @@ const char *bch2_btree_bkey_invalid(struct bch_fs *c, struct btree *b,
if (bkey_cmp(k.k->p, b->data->max_key) > 0)
return "key past end of btree node";
if (k.k->p.snapshot)
return "nonzero snapshot";
return bch2_bkey_invalid(c, btree_node_type(b), k);
return NULL;
}
void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
@ -86,7 +99,8 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
BUG_ON(!k.k->u64s);
invalid = bch2_btree_bkey_invalid(c, b, k);
invalid = bch2_bkey_invalid(c, type, k) ?:
bch2_bkey_in_btree_node(b, k);
if (invalid) {
char buf[160];
@ -100,33 +114,62 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
ops->key_debugcheck(c, b, k);
}
char *bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
char *buf, size_t size, struct bkey_s_c k)
#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
char *out = buf, *end = buf + size;
if (k.k->type >= KEY_TYPE_GENERIC_NR &&
ops->val_to_text)
ops->val_to_text(c, buf, size, k);
p("u64s %u type %u ", k->u64s, k->type);
return buf;
if (bkey_cmp(k->p, POS_MAX))
p("%llu:%llu", k->p.inode, k->p.offset);
else
p("POS_MAX");
p(" snap %u len %u ver %llu", k->p.snapshot, k->size, k->version.lo);
return out - buf;
}
char *bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
char *buf, size_t size, struct bkey_s_c k)
int bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
char *buf, size_t size, struct bkey_s_c k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
char *out = buf, *end = buf + size;
out += bch2_bkey_to_text(out, end - out, k.k);
if (k.k->type >= KEY_TYPE_GENERIC_NR &&
ops->val_to_text) {
out += scnprintf(out, end - out, ": ");
ops->val_to_text(c, out, end - out, k);
switch (k.k->type) {
case KEY_TYPE_DELETED:
p(" deleted");
break;
case KEY_TYPE_DISCARD:
p(" discard");
break;
case KEY_TYPE_ERROR:
p(" error");
break;
case KEY_TYPE_COOKIE:
p(" cookie");
break;
default:
if (k.k->type >= KEY_TYPE_GENERIC_NR && ops->val_to_text)
ops->val_to_text(c, buf, size, k);
break;
}
return buf;
return out - buf;
}
int bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
char *buf, size_t size, struct bkey_s_c k)
{
char *out = buf, *end = buf + size;
out += bch2_bkey_to_text(out, end - out, k.k);
out += scnprintf(out, end - out, ": ");
out += bch2_val_to_text(c, type, out, end - out, k);
return out - buf;
}
void bch2_bkey_swab(enum bkey_type type,


@ -64,15 +64,19 @@ struct bkey_ops {
bool is_extents;
};
const char *bch2_bkey_val_invalid(struct bch_fs *, enum bkey_type,
struct bkey_s_c);
const char *__bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
const char *bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
const char *bch2_btree_bkey_invalid(struct bch_fs *, struct btree *,
struct bkey_s_c);
const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);
void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
char *bch2_val_to_text(struct bch_fs *, enum bkey_type,
char *, size_t, struct bkey_s_c);
char *bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
char *, size_t, struct bkey_s_c);
int bch2_bkey_to_text(char *, size_t, const struct bkey *);
int bch2_val_to_text(struct bch_fs *, enum bkey_type,
char *, size_t, struct bkey_s_c);
int bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
char *, size_t, struct bkey_s_c);
void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,
struct bkey_packed *);


@ -96,7 +96,7 @@ u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k)
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
extent_for_each_ptr(e, ptr) {
struct bch_dev *ca = c->devs[ptr->dev];
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
size_t b = PTR_BUCKET_NR(ca, ptr);
if (gen_after(ca->oldest_gens[b], ptr->gen))
@ -159,14 +159,15 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
(!c->opts.nofsck &&
fsck_err_on(!bch2_sb_has_replicas(c, e, data_type), c,
"superblock not marked as containing replicas"))) {
"superblock not marked as containing replicas (type %u)",
data_type))) {
ret = bch2_check_mark_super(c, e, data_type);
if (ret)
return ret;
}
extent_for_each_ptr(e, ptr) {
struct bch_dev *ca = c->devs[ptr->dev];
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g = PTR_BUCKET(ca, ptr);
if (mustfix_fsck_err_on(!g->mark.gen_valid, c,
@ -315,14 +316,14 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
lockdep_assert_held(&c->sb_lock);
for (i = 0; i < layout->nr_superblocks; i++) {
if (layout->sb_offset[i] == BCH_SB_SECTOR)
u64 offset = le64_to_cpu(layout->sb_offset[i]);
if (offset == BCH_SB_SECTOR)
mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
BUCKET_SB, flags);
mark_metadata_sectors(c, ca,
layout->sb_offset[i],
layout->sb_offset[i] +
(1 << layout->sb_max_size_bits),
mark_metadata_sectors(c, ca, offset,
offset + (1 << layout->sb_max_size_bits),
BUCKET_SB, flags);
}
@ -414,7 +415,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
spin_lock(&ob->lock);
if (ob->valid) {
gc_pos_set(c, gc_pos_alloc(c, ob));
ca = c->devs[ob->ptr.dev];
ca = bch_dev_bkey_exists(c, ob->ptr.dev);
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), true,
gc_pos_alloc(c, ob),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
@ -424,7 +425,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
}
}
void bch2_gc_start(struct bch_fs *c)
static void bch2_gc_start(struct bch_fs *c)
{
struct bch_dev *ca;
struct bucket *g;


@ -556,7 +556,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
struct bset_tree *t;
struct bset *start_bset = bset(b, &b->set[start_idx]);
bool used_mempool = false;
u64 start_time;
u64 start_time, seq = 0;
unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
bool sorting_entire_node = start_idx == 0 &&
end_idx == b->nsets;
@ -595,12 +595,9 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
bch2_time_stats_update(&c->btree_sort_time, start_time);
/* Make sure we preserve bset journal_seq: */
for (t = b->set + start_idx + 1;
t < b->set + end_idx;
t++)
start_bset->journal_seq =
max(start_bset->journal_seq,
bset(b, t)->journal_seq);
for (t = b->set + start_idx; t < b->set + end_idx; t++)
seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
start_bset->journal_seq = cpu_to_le64(seq);
if (sorting_entire_node) {
unsigned u64s = le16_to_cpu(out->keys.u64s);
@ -958,6 +955,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
{
struct bkey_packed *k, *prev = NULL;
struct bpos prev_pos = POS_MIN;
enum bkey_type type = btree_node_type(b);
bool seen_non_whiteout = false;
const char *err;
int ret = 0;
@ -1025,7 +1023,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
if (!BSET_SEPARATE_WHITEOUTS(i)) {
seen_non_whiteout = true;
whiteout_u64s = 0;
*whiteout_u64s = 0;
}
for (k = i->start;
@ -1059,16 +1057,17 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
}
if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
bch2_bkey_swab(btree_node_type(b), &b->format, k);
bch2_bkey_swab(type, &b->format, k);
u = bkey_disassemble(b, k, &tmp);
invalid = bch2_btree_bkey_invalid(c, b, u);
invalid = __bch2_bkey_invalid(c, type, u) ?:
bch2_bkey_in_btree_node(b, u) ?:
(write ? bch2_bkey_val_invalid(c, type, u) : NULL);
if (invalid) {
char buf[160];
bch2_bkey_val_to_text(c, btree_node_type(b),
buf, sizeof(buf), u);
bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
btree_err(BTREE_ERR_FIXABLE, c, b, i,
"invalid bkey %s: %s", buf, invalid);
@ -1114,6 +1113,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
struct btree_node_entry *bne;
struct btree_node_iter *iter;
struct btree_node *sorted;
struct bkey_packed *k;
struct bset *i;
bool used_mempool;
unsigned u64s;
int ret, retry_read = 0, write = READ;
@ -1137,7 +1138,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
unsigned sectors, whiteout_u64s = 0;
struct nonce nonce;
struct bch_csum csum;
struct bset *i;
if (!b->written) {
i = &b->data->keys;
@ -1238,6 +1238,31 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
btree_bounce_free(c, btree_page_order(c), used_mempool, sorted);
i = &b->data->keys;
for (k = i->start; k != vstruct_last(i);) {
enum bkey_type type = btree_node_type(b);
struct bkey tmp;
struct bkey_s_c u = bkey_disassemble(b, k, &tmp);
const char *invalid = bch2_bkey_val_invalid(c, type, u);
if (invalid) {
char buf[160];
bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
btree_err(BTREE_ERR_FIXABLE, c, b, i,
"invalid bkey %s: %s", buf, invalid);
btree_keys_account_key_drop(&b->nr, 0, k);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_next(k),
(u64 *) vstruct_end(i) - (u64 *) k);
continue;
}
k = bkey_next(k);
}
bch2_bset_build_aux_tree(b, b->set, false);
set_needs_whiteout(btree_bset_first(b));
@ -1278,13 +1303,13 @@ static void btree_node_read_work(struct work_struct *work)
bio->bi_iter.bi_size = btree_bytes(c);
submit_bio_wait(bio);
start:
bch2_dev_io_err_on(bio->bi_error, rb->pick.ca, "btree read");
bch2_dev_io_err_on(bio->bi_status, rb->pick.ca, "btree read");
percpu_ref_put(&rb->pick.ca->io_ref);
__set_bit(rb->pick.ca->dev_idx, avoid.d);
rb->pick = bch2_btree_pick_ptr(c, b, &avoid);
if (!bio->bi_error &&
if (!bio->bi_status &&
!bch2_btree_node_read_done(c, b, !IS_ERR_OR_NULL(rb->pick.ca)))
goto out;
} while (!IS_ERR_OR_NULL(rb->pick.ca));
@ -1377,17 +1402,24 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
bch2_btree_node_read(c, b, true);
six_unlock_write(&b->lock);
if (btree_node_read_error(b)) {
six_unlock_intent(&b->lock);
return -EIO;
bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_lock(&c->btree_cache.lock);
list_move(&b->list, &c->btree_cache.freeable);
mutex_unlock(&c->btree_cache.lock);
ret = -EIO;
goto err;
}
bch2_btree_set_root_for_read(c, b);
err:
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
return 0;
return ret;
}
void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
@ -1412,35 +1444,57 @@ static void bch2_btree_node_write_error(struct bch_fs *c,
struct closure *cl = wbio->cl;
__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
struct bkey_i_extent *new_key;
struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
struct btree_iter iter;
int ret;
six_lock_read(&b->lock);
bkey_copy(&tmp.k, &b->key);
six_unlock_read(&b->lock);
__bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
BTREE_MAX_DEPTH,
b->level, 0);
retry:
ret = bch2_btree_iter_traverse(&iter);
if (ret)
goto err;
if (!bkey_extent_is_data(&tmp.k.k) || !PTR_HASH(&tmp.k)) {
/* Node has been freed: */
/* has node been freed? */
if (iter.nodes[b->level] != b) {
/* node has been freed: */
if (!btree_node_dying(b))
panic("foo4\n");
goto out;
}
if (!btree_node_hashed(b))
panic("foo5\n");
bkey_copy(&tmp.k, &b->key);
new_key = bkey_i_to_extent(&tmp.k);
e = extent_i_to_s(new_key);
extent_for_each_ptr_backwards(e, ptr)
if (bch2_dev_list_has_dev(wbio->failed, ptr->dev))
bch2_extent_drop_ptr(e, ptr);
while (wbio->replicas_failed) {
unsigned idx = __fls(wbio->replicas_failed);
if (!bch2_extent_nr_ptrs(e.c))
goto err;
bch2_extent_drop_ptr_idx(extent_i_to_s(new_key), idx);
wbio->replicas_failed ^= 1 << idx;
}
if (!bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)) ||
bch2_btree_node_update_key(c, b, new_key)) {
set_btree_node_noevict(b);
bch2_fatal_error(c);
}
ret = bch2_btree_node_update_key(c, &iter, b, new_key);
if (ret == -EINTR)
goto retry;
if (ret)
goto err;
out:
bch2_btree_iter_unlock(&iter);
bio_put(&wbio->bio);
btree_node_write_done(c, b);
if (cl)
closure_put(cl);
return;
err:
set_btree_node_noevict(b);
bch2_fs_fatal_error(c, "fatal error writing btree node");
goto out;
}
void bch2_btree_write_error_work(struct work_struct *work)
@ -1470,12 +1524,17 @@ static void btree_node_write_endio(struct bio *bio)
struct closure *cl = !wbio->split ? wbio->cl : NULL;
struct bch_fs *c = wbio->c;
struct bch_dev *ca = wbio->ca;
unsigned long flags;
bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
if (bch2_dev_io_err_on(bio->bi_error, ca, "btree write") ||
bch2_meta_write_fault("btree"))
set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed);
if (bio->bi_status == BLK_STS_REMOVED ||
bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
bch2_meta_write_fault("btree")) {
spin_lock_irqsave(&c->btree_write_error_lock, flags);
bch2_dev_list_add_dev(&orig->failed, ca->dev_idx);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
}
if (wbio->have_io_ref)
percpu_ref_put(&ca->io_ref);
@ -1491,12 +1550,11 @@ static void btree_node_write_endio(struct bio *bio)
wbio->used_mempool,
wbio->data);
if (wbio->replicas_failed) {
unsigned long flags;
if (wbio->failed.nr) {
spin_lock_irqsave(&c->btree_write_error_lock, flags);
bio_list_add(&c->btree_write_error_list, &wbio->bio);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
queue_work(c->wq, &c->btree_write_error_work);
return;
}
@ -1707,6 +1765,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
wbio = wbio_init(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write));
wbio->cl = parent;
wbio->failed.nr = 0;
wbio->order = order;
wbio->used_mempool = used_mempool;
wbio->data = data;


@ -75,8 +75,8 @@ bool bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
{
struct btree_iter *linked;
struct btree *b = iter->nodes[level];
enum btree_node_locked_type want = btree_lock_want(iter, level);
enum btree_node_locked_type have = btree_node_locked_type(iter, level);
int want = btree_lock_want(iter, level);
int have = btree_node_locked_type(iter, level);
if (want == have)
return true;
@ -108,6 +108,17 @@ success:
return true;
}
bool bch2_btree_iter_relock(struct btree_iter *iter)
{
unsigned l;
for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++)
if (!bch2_btree_node_relock(iter, l))
return false;
return true;
}
/* Slowpath: */
bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
unsigned level,
@ -214,7 +225,6 @@ bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter,
unsigned new_locks_want)
{
struct btree_iter *linked;
unsigned l;
/* Drop locks we don't want anymore: */
if (new_locks_want < iter->locks_want)
@ -228,12 +238,9 @@ bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter,
iter->locks_want = new_locks_want;
btree_iter_drop_extra_locks(iter);
for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++)
if (!bch2_btree_node_relock(iter, l))
goto fail;
if (bch2_btree_iter_relock(iter))
return true;
return true;
fail:
/*
* Just an optimization: ancestor nodes must be locked before child
* nodes, so set locks_want on iterators that might lock ancestors


@ -75,7 +75,7 @@ static inline void mark_btree_node_intent_locked(struct btree_iter *iter,
mark_btree_node_locked(iter, level, SIX_LOCK_intent);
}
static inline int btree_lock_want(struct btree_iter *iter, int level)
static inline enum six_lock_type btree_lock_want(struct btree_iter *iter, int level)
{
return level < iter->locks_want
? SIX_LOCK_intent
@ -111,6 +111,7 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos,
}
bool bch2_btree_node_relock(struct btree_iter *, unsigned);
bool bch2_btree_iter_relock(struct btree_iter *);
void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
void bch2_btree_node_lock_write(struct btree *, struct btree_iter *);


@ -196,6 +196,7 @@ enum btree_flags {
BTREE_NODE_accessed,
BTREE_NODE_write_in_flight,
BTREE_NODE_just_written,
BTREE_NODE_dying,
};
BTREE_FLAG(read_in_flight);
@ -207,6 +208,7 @@ BTREE_FLAG(write_idx);
BTREE_FLAG(accessed);
BTREE_FLAG(write_in_flight);
BTREE_FLAG(just_written);
BTREE_FLAG(dying);
static inline struct btree_write *btree_current_write(struct btree *b)
{


@ -130,7 +130,7 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
__le64, unsigned);
int bch2_btree_node_update_key(struct bch_fs *, struct btree *,
struct bkey_i_extent *);
int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
struct btree *, struct bkey_i_extent *);
#endif /* _BCACHEFS_BTREE_UPDATE_H */


@ -21,7 +21,7 @@
static void btree_node_will_make_reachable(struct btree_update *,
struct btree *);
static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *);
static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int);
/* Debug code: */
@ -686,7 +686,7 @@ retry:
BUG_ON(c->btree_roots[b->btree_id].as != as);
c->btree_roots[b->btree_id].as = NULL;
bch2_btree_set_root_ondisk(c, b);
bch2_btree_set_root_ondisk(c, b, WRITE);
/*
* We don't have to wait anything anything here (before
@ -914,6 +914,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
struct btree_write *w;
struct bset_tree *t;
set_btree_node_dying(b);
btree_interior_update_add_node_reference(as, b);
/*
@ -925,7 +926,8 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
* in with keys that aren't in the journal anymore:
*/
for_each_bset(b, t)
as->journal_seq = max(as->journal_seq, bset(b, t)->journal_seq);
as->journal_seq = max(as->journal_seq,
le64_to_cpu(bset(b, t)->journal_seq));
mutex_lock(&c->btree_interior_update_lock);
@ -1027,6 +1029,10 @@ static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
mutex_unlock(&c->btree_cache.lock);
mutex_lock(&c->btree_root_lock);
BUG_ON(btree_node_root(c, b) &&
(b->level < btree_node_root(c, b)->level ||
!btree_node_dying(btree_node_root(c, b))));
btree_node_root(c, b) = b;
mutex_unlock(&c->btree_root_lock);
@ -1054,7 +1060,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
gc_pos_btree_root(b->btree_id));
}
static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b)
static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
{
struct btree_root *r = &c->btree_roots[b->btree_id];
@ -1064,6 +1070,8 @@ static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b)
bkey_copy(&r->key, &b->key);
r->level = b->level;
r->alive = true;
if (rw == WRITE)
c->btree_roots_dirty = true;
mutex_unlock(&c->btree_root_lock);
}
@ -1787,64 +1795,16 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
return ret;
}
int bch2_btree_node_update_key(struct bch_fs *c, struct btree *b,
struct bkey_i_extent *new_key)
static void __bch2_btree_node_update_key(struct bch_fs *c,
struct btree_update *as,
struct btree_iter *iter,
struct btree *b, struct btree *new_hash,
struct bkey_i_extent *new_key)
{
struct btree_update *as = NULL;
struct btree *parent, *new_hash = NULL;
struct btree_iter iter;
struct closure cl;
struct btree *parent;
bool must_rewrite_parent = false;
int ret;
__bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
BTREE_MAX_DEPTH,
b->level, 0);
closure_init_stack(&cl);
ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE);
if (ret)
return ret;
retry:
down_read(&c->gc_lock);
ret = bch2_btree_iter_traverse(&iter);
if (ret)
goto err;
/* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
if (!new_hash &&
PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
/* bch2_btree_reserve_get will unlock */
do {
ret = bch2_btree_cache_cannibalize_lock(c, &cl);
closure_sync(&cl);
} while (ret == -EAGAIN);
BUG_ON(ret);
new_hash = bch2_btree_node_mem_alloc(c);
}
as = bch2_btree_update_start(c, iter.btree_id,
btree_update_reserve_required(c, b),
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE,
&cl);
if (IS_ERR(as)) {
ret = PTR_ERR(as);
if (ret == -EAGAIN || ret == -EINTR) {
bch2_btree_iter_unlock(&iter);
up_read(&c->gc_lock);
closure_sync(&cl);
goto retry;
}
goto err;
}
mutex_lock(&c->btree_interior_update_lock);
/*
* Two corner cases that need to be thought about here:
*
@ -1869,22 +1829,12 @@ retry:
if (b->will_make_reachable)
must_rewrite_parent = true;
/* other case: btree node being freed */
if (iter.nodes[b->level] != b) {
/* node has been freed: */
BUG_ON(btree_node_hashed(b));
mutex_unlock(&c->btree_interior_update_lock);
goto err;
}
mutex_unlock(&c->btree_interior_update_lock);
if (must_rewrite_parent)
as->flags |= BTREE_INTERIOR_UPDATE_MUST_REWRITE;
btree_interior_update_add_node_reference(as, b);
parent = iter.nodes[b->level + 1];
parent = iter->nodes[b->level + 1];
if (parent) {
if (new_hash) {
bkey_copy(&new_hash->key, &new_key->k_i);
@ -1893,8 +1843,8 @@ retry:
BUG_ON(ret);
}
bch2_btree_insert_node(as, parent, &iter,
&keylist_single(&new_key->k_i));
bch2_keylist_add(&as->parent_keys, &new_key->k_i);
bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
@ -1914,7 +1864,7 @@ retry:
BUG_ON(btree_node_root(c, b) != b);
bch2_btree_node_lock_write(b, &iter);
bch2_btree_node_lock_write(b, iter);
bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i),
c->opts.btree_node_size, true,
@ -1925,14 +1875,94 @@ retry:
&stats);
bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
gc_pos_btree_root(b->btree_id));
bkey_copy(&b->key, &new_key->k_i);
if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
mutex_lock(&c->btree_cache.lock);
bch2_btree_node_hash_remove(&c->btree_cache, b);
bkey_copy(&b->key, &new_key->k_i);
ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
BUG_ON(ret);
mutex_unlock(&c->btree_cache.lock);
} else {
bkey_copy(&b->key, &new_key->k_i);
}
btree_update_updated_root(as);
bch2_btree_node_unlock_write(b, &iter);
bch2_btree_node_unlock_write(b, iter);
}
bch2_btree_update_done(as);
out:
}
int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
struct btree *b, struct bkey_i_extent *new_key)
{
struct btree_update *as = NULL;
struct btree *new_hash = NULL;
struct closure cl;
int ret;
closure_init_stack(&cl);
if (!down_read_trylock(&c->gc_lock)) {
bch2_btree_iter_unlock(iter);
down_read(&c->gc_lock);
if (!bch2_btree_iter_relock(iter)) {
ret = -EINTR;
goto err;
}
}
/* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
/* bch2_btree_reserve_get will unlock */
ret = bch2_btree_cache_cannibalize_lock(c, &cl);
if (ret) {
ret = -EINTR;
bch2_btree_iter_unlock(iter);
up_read(&c->gc_lock);
closure_sync(&cl);
down_read(&c->gc_lock);
if (!bch2_btree_iter_relock(iter))
goto err;
}
new_hash = bch2_btree_node_mem_alloc(c);
}
as = bch2_btree_update_start(c, iter->btree_id,
btree_update_reserve_required(c, b),
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE,
&cl);
if (IS_ERR(as)) {
ret = PTR_ERR(as);
if (ret == -EAGAIN)
ret = -EINTR;
if (ret != -EINTR)
goto err;
bch2_btree_iter_unlock(iter);
up_read(&c->gc_lock);
closure_sync(&cl);
down_read(&c->gc_lock);
if (!bch2_btree_iter_relock(iter))
goto err;
}
ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE);
if (ret)
goto err_free_update;
__bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
err:
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
list_move(&new_hash->list, &c->btree_cache.freeable);
@ -1941,14 +1971,12 @@ out:
six_unlock_write(&new_hash->lock);
six_unlock_intent(&new_hash->lock);
}
bch2_btree_iter_unlock(&iter);
up_read(&c->gc_lock);
closure_sync(&cl);
return ret;
err:
if (as)
bch2_btree_update_free(as);
goto out;
err_free_update:
bch2_btree_update_free(as);
goto err;
}
/* Init code: */
@ -1962,7 +1990,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
BUG_ON(btree_node_root(c, b));
__bch2_btree_set_root_inmem(c, b);
bch2_btree_set_root_ondisk(c, b);
bch2_btree_set_root_ondisk(c, b, READ);
}
int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
@ -1998,7 +2026,7 @@ int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
BUG_ON(btree_node_root(c, b));
bch2_btree_set_root_inmem(as, b);
bch2_btree_set_root_ondisk(c, b);
bch2_btree_set_root_ondisk(c, b, WRITE);
bch2_btree_open_bucket_put(c, b);
six_unlock_intent(&b->lock);


@ -174,9 +174,11 @@ do { \
#define bch2_usage_read_raw(_stats) \
({ \
typeof(*this_cpu_ptr(_stats)) _acc = { 0 }; \
typeof(*this_cpu_ptr(_stats)) _acc; \
int cpu; \
\
memset(&_acc, 0, sizeof(_acc)); \
\
for_each_possible_cpu(cpu) \
bch2_usage_add(&_acc, per_cpu_ptr((_stats), cpu)); \
\
@ -479,7 +481,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
{
struct bucket_mark old, new;
unsigned saturated;
struct bch_dev *ca = c->devs[ptr->dev];
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr);
unsigned data_type = type == S_META
? BUCKET_BTREE : BUCKET_DATA;


@ -68,16 +68,14 @@ struct bch_dev_usage {
struct bch_fs_usage {
/* all fields are in units of 512 byte sectors: */
/* _uncompressed_ sectors: */
u64 online_reserved;
u64 available_cache;
struct {
u64 data[S_ALLOC_NR];
u64 persistent_reserved;
} s[BCH_REPLICAS_MAX];
u64 online_reserved;
u64 available_cache;
};
/*


@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "bcachefs_ioctl.h"
#include "chardev.h"
#include "super.h"
#include "super-io.h"
@ -25,7 +26,7 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
return ERR_PTR(-EINVAL);
rcu_read_lock();
ca = c->devs[dev];
ca = rcu_dereference(c->devs[dev]);
if (ca)
percpu_ref_get(&ca->ref);
rcu_read_unlock();
@ -80,7 +81,7 @@ static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
if (copy_from_user(user_devs, arg.devs,
if (copy_from_user(user_devs, user_arg->devs,
sizeof(u64) * arg.nr_devs))
goto err;


@ -72,14 +72,15 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
}
}
static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c)
static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
unsigned opt)
{
if (c->sb.encryption_type)
return c->opts.wide_macs
? BCH_CSUM_CHACHA20_POLY1305_128
: BCH_CSUM_CHACHA20_POLY1305_80;
return bch2_csum_opt_to_type(c->opts.data_checksum, true);
return bch2_csum_opt_to_type(opt, true);
}
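A hypothetical call site (not in this diff) showing the point of the new argument — write paths can pass a per-inode checksum option instead of always using the filesystem-wide c->opts.data_checksum:

	/* io_opts here is an assumed per-inode options struct: */
	op->csum_type = bch2_data_checksum_type(c, io_opts.data_checksum);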
static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
@ -143,6 +144,14 @@ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
return nonce;
}
static inline struct nonce null_nonce(void)
{
struct nonce ret;
memset(&ret, 0, sizeof(ret));
return ret;
}
static inline struct nonce extent_nonce(struct bversion version,
struct bch_extent_crc_unpacked crc)
{


@ -95,11 +95,17 @@ print:
vscnprintf(buf, sizeof(_buf), fmt, args);
va_end(args);
if (c->opts.fix_errors == FSCK_OPT_EXIT) {
bch_err(c, "%s, exiting", buf);
mutex_unlock(&c->fsck_error_lock);
return FSCK_ERR_EXIT;
}
if (flags & FSCK_CAN_FIX) {
if (c->opts.fix_errors == FSCK_ERR_ASK) {
if (c->opts.fix_errors == FSCK_OPT_ASK) {
printk(KERN_ERR "%s: fix?", buf);
fix = ask_yn();
} else if (c->opts.fix_errors == FSCK_ERR_YES ||
} else if (c->opts.fix_errors == FSCK_OPT_YES ||
(c->opts.nochanges &&
!(flags & FSCK_CAN_IGNORE))) {
if (print)


@ -96,9 +96,10 @@ enum {
};
enum fsck_err_opts {
FSCK_ERR_NO,
FSCK_ERR_YES,
FSCK_ERR_ASK,
FSCK_OPT_EXIT,
FSCK_OPT_YES,
FSCK_OPT_NO,
FSCK_OPT_ASK,
};
enum fsck_err_ret {
@ -217,7 +218,7 @@ do { \
#define bcache_io_error(c, bio, fmt, ...) \
do { \
__bcache_io_error(c, fmt, ##__VA_ARGS__); \
(bio)->bi_error = -EIO; \
(bio)->bi_status = BLK_STS_IOERR; \
} while (0)
#endif /* _BCACHEFS_ERROR_H */


@ -18,6 +18,7 @@
#include "extents.h"
#include "inode.h"
#include "journal.h"
#include "super.h"
#include "super-io.h"
#include "util.h"
#include "xattr.h"
@ -156,6 +157,19 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
return nr_ptrs;
}
unsigned bch2_extent_nr_good_ptrs(struct bch_fs *c, struct bkey_s_c_extent e)
{
const struct bch_extent_ptr *ptr;
unsigned nr_ptrs = 0;
extent_for_each_ptr(e, ptr)
nr_ptrs += (!ptr->cached &&
bch_dev_bkey_exists(c, ptr->dev)->mi.state !=
BCH_MEMBER_STATE_FAILED);
return nr_ptrs;
}
unsigned bch2_extent_is_compressed(struct bkey_s_c k)
{
struct bkey_s_c_extent e;
@ -362,7 +376,7 @@ static bool should_drop_ptr(const struct bch_fs *c,
struct bkey_s_c_extent e,
const struct bch_extent_ptr *ptr)
{
return ptr->cached && ptr_stale(c->devs[ptr->dev], ptr);
return ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr);
}
static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
@ -411,8 +425,10 @@ static void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
break;
case BCH_EXTENT_ENTRY_crc128:
entry->crc128.csum.hi = swab64(entry->crc64.csum_hi);
entry->crc128.csum.lo = swab64(entry->crc64.csum_lo);
entry->crc128.csum.hi = (__force __le64)
swab64((__force u64) entry->crc128.csum.hi);
entry->crc128.csum.lo = (__force __le64)
swab64((__force u64) entry->crc128.csum.lo);
break;
case BCH_EXTENT_ENTRY_ptr:
break;
@ -432,10 +448,11 @@ static const char *extent_ptr_invalid(const struct bch_fs *c,
const struct bch_extent_ptr *ptr2;
struct bch_dev *ca;
if (ptr->dev >= c->sb.nr_devices)
if (ptr->dev >= c->sb.nr_devices ||
!c->devs[ptr->dev])
return "pointer to invalid device";
ca = c->devs[ptr->dev];
ca = bch_dev_bkey_exists(c, ptr->dev);
if (!ca)
return "pointer to invalid device";
@ -487,7 +504,9 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
break;
case BCH_EXTENT_ENTRY_ptr:
ptr = entry_to_ptr(entry);
ca = c->devs[ptr->dev];
ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
? bch_dev_bkey_exists(c, ptr->dev)
: NULL;
p("ptr: %u:%llu gen %u%s", ptr->dev,
(u64) ptr->offset, ptr->gen,
@ -528,7 +547,7 @@ static void extent_pick_read_device(struct bch_fs *c,
struct bch_extent_crc_unpacked crc;
extent_for_each_ptr_crc(e, ptr, crc) {
struct bch_dev *ca = c->devs[ptr->dev];
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (ptr->cached && ptr_stale(ca, ptr))
continue;
@ -621,7 +640,7 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
bool bad;
extent_for_each_ptr(e, ptr) {
ca = c->devs[ptr->dev];
ca = bch_dev_bkey_exists(c, ptr->dev);
g = PTR_BUCKET(ca, ptr);
replicas++;
@ -1730,7 +1749,7 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier));
extent_for_each_ptr(e, ptr) {
ca = c->devs[ptr->dev];
ca = bch_dev_bkey_exists(c, ptr->dev);
g = PTR_BUCKET(ca, ptr);
replicas++;
ptrs_per_tier[ca->mi.tier]++;
@ -1844,7 +1863,7 @@ static void bch2_extent_to_text(struct bch_fs *c, char *buf,
static unsigned PTR_TIER(struct bch_fs *c,
const struct bch_extent_ptr *ptr)
{
return c->devs[ptr->dev]->mi.tier;
return bch_dev_bkey_exists(c, ptr->dev)->mi.tier;
}
static void bch2_extent_crc_init(union bch_extent_crc *crc,
@ -1971,14 +1990,10 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
struct bkey_s_extent e)
{
struct bch_extent_ptr *ptr;
unsigned tier = 0, nr_cached = 0, nr_good = 0;
unsigned tier = 0, nr_cached = 0;
unsigned nr_good = bch2_extent_nr_good_ptrs(c, e.c);
bool have_higher_tier;
extent_for_each_ptr(e, ptr)
if (!ptr->cached &&
c->devs[ptr->dev]->mi.state != BCH_MEMBER_STATE_FAILED)
nr_good++;
if (nr_good <= c->opts.data_replicas)
return;
@ -2103,7 +2118,7 @@ static enum merge_result bch2_extent_merge(struct bch_fs *c,
return BCH_MERGE_NOMERGE;
/* We don't allow extents to straddle buckets: */
ca = c->devs[lp->dev];
ca = bch_dev_bkey_exists(c, lp->dev);
if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
return BCH_MERGE_NOMERGE;
@ -2347,6 +2362,30 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
}
}
int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
{
struct btree_iter iter;
struct bpos end = pos;
struct bkey_s_c k;
int ret = 0;
end.offset += size;
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos,
BTREE_ITER_WITH_HOLES, k) {
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
break;
if (!bch2_extent_is_fully_allocated(k)) {
ret = -ENOSPC;
break;
}
}
bch2_btree_iter_unlock(&iter);
return ret;
}
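A sketch of a likely caller (assumption — the fs-io.c diff that would use this is suppressed below): skip taking a disk reservation when the byte range being overwritten is already fully allocated on disk:

	/* returns 0 if fully allocated, -ENOSPC if any part is a hole: */
	ret = bch2_check_range_allocated(c, POS(inum, offset >> 9), size >> 9);
	if (ret)
		ret = bch2_disk_reservation_get(c, &res, size >> 9, 0);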
const struct bkey_ops bch2_bkey_extent_ops = {
.key_invalid = bch2_extent_invalid,
.key_debugcheck = bch2_extent_debugcheck,


@ -45,6 +45,7 @@ bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
unsigned bch2_extent_nr_good_ptrs(struct bch_fs *, struct bkey_s_c_extent);
unsigned bch2_extent_is_compressed(struct bkey_s_c);
bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
@ -243,14 +244,14 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
case BCH_EXTENT_CRC32:
return (struct bch_extent_crc_unpacked) {
common_fields(crc->crc32),
.csum.lo = crc->crc32.csum,
.csum.lo = (__force __le64) crc->crc32.csum,
};
case BCH_EXTENT_CRC64:
return (struct bch_extent_crc_unpacked) {
common_fields(crc->crc64),
.nonce = crc->crc64.nonce,
.csum.lo = crc->crc64.csum_lo,
.csum.hi = crc->crc64.csum_hi,
.csum.lo = (__force __le64) crc->crc64.csum_lo,
.csum.hi = (__force __le64) crc->crc64.csum_hi,
};
case BCH_EXTENT_CRC128:
return (struct bch_extent_crc_unpacked) {
@ -425,4 +426,6 @@ bool bch2_cut_front(struct bpos, struct bkey_i *);
bool bch2_cut_back(struct bpos, struct bkey *);
void bch2_key_resize(struct bkey *, unsigned);
int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64);
#endif /* _BCACHEFS_EXTENTS_H */

File diff suppressed because it is too large


@ -75,7 +75,7 @@ do { \
/* Set VFS inode flags from bcachefs inode: */
void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
{
set_flags(bch_flags_to_vfs, inode->ei_flags, inode->v.i_flags);
set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
}
static int bch2_inode_flags_set(struct bch_inode_info *inode,
@ -99,13 +99,13 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode,
return -EINVAL;
bi->bi_flags = newflags;
inode->v.i_ctime = current_fs_time(inode->v.i_sb);
inode->v.i_ctime = current_time(&inode->v);
return 0;
}
static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg)
{
unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_flags);
unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags);
return put_user(flags, arg);
}
@ -153,7 +153,7 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
{
struct fsxattr fa = { 0 };
fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_flags);
fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
return copy_to_user(arg, &fa, sizeof(fa));
}


@ -12,6 +12,7 @@
#include "fs-ioctl.h"
#include "fsck.h"
#include "inode.h"
#include "io.h"
#include "journal.h"
#include "keylist.h"
#include "super.h"
@ -130,10 +131,8 @@ int __must_check __bch2_write_inode(struct bch_fs *c,
BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i));
} while (ret == -EINTR);
if (!ret) {
inode->ei_size = inode_u.bi_size;
inode->ei_flags = inode_u.bi_flags;
}
if (!ret)
inode->ei_inode = inode_u;
out:
bch2_btree_iter_unlock(&iter);
@ -146,7 +145,7 @@ int __must_check bch2_write_inode(struct bch_fs *c,
return __bch2_write_inode(c, inode, NULL, NULL);
}
int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
static int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
{
int ret;
@ -158,7 +157,7 @@ int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
return ret;
}
int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *inode)
static int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *inode)
{
int ret = 0;
@ -223,7 +222,9 @@ static struct bch_inode_info *bch2_vfs_inode_create(struct bch_fs *c,
bch2_inode_init(c, &inode_u,
i_uid_read(&inode->v),
i_gid_read(&inode->v),
inode->v.i_mode, rdev);
inode->v.i_mode, rdev,
&dir->ei_inode);
ret = bch2_inode_create(c, &inode_u,
BLOCKDEV_INODE_MAX, 0,
&c->unused_inode_hint);
@ -277,7 +278,7 @@ static int bch2_vfs_dirent_create(struct bch_fs *c,
if (unlikely(ret))
return ret;
dir->v.i_mtime = dir->v.i_ctime = current_fs_time(c->vfs_sb);
dir->v.i_mtime = dir->v.i_ctime = current_time(&dir->v);
mark_inode_dirty_sync(&dir->v);
return 0;
}
@ -344,7 +345,7 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
lockdep_assert_held(&inode->v.i_rwsem);
inode->v.i_ctime = current_fs_time(dir->v.i_sb);
inode->v.i_ctime = current_time(&dir->v);
ret = bch2_inc_nlink(c, inode);
if (ret)
@ -473,7 +474,7 @@ static int bch2_rename(struct bch_fs *c,
{
struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
struct timespec now = current_fs_time(old_dir->v.i_sb);
struct timespec now = current_time(&old_dir->v);
int ret;
lockdep_assert_held(&old_dir->v.i_rwsem);
@ -551,7 +552,7 @@ static int bch2_rename_exchange(struct bch_fs *c,
{
struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
struct timespec now = current_fs_time(old_dir->v.i_sb);
struct timespec now = current_time(&old_dir->v);
int ret;
ret = bch2_dirent_rename(c,
@ -909,10 +910,8 @@ static void bch2_vfs_inode_init(struct bch_fs *c,
inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime);
inode->ei_journal_seq = 0;
inode->ei_size = bi->bi_size;
inode->ei_flags = bi->bi_flags;
atomic64_set(&inode->ei_sectors, bi->bi_sectors);
inode->ei_str_hash = bch2_hash_info_init(c, bi);
inode->ei_inode = *bi;
bch2_inode_flags_to_vfs(inode);
@ -949,8 +948,6 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
inode_init_once(&inode->v);
mutex_init(&inode->ei_update_lock);
inode->ei_journal_seq = 0;
atomic_long_set(&inode->ei_size_dirty_count, 0);
atomic_long_set(&inode->ei_sectors_dirty_count, 0);
return &inode->v;
}
@ -995,12 +992,6 @@ static void bch2_evict_inode(struct inode *vinode)
truncate_inode_pages_final(&inode->v.i_data);
if (!bch2_journal_error(&c->journal) && !is_bad_inode(&inode->v)) {
/* XXX - we want to check this stuff iff there weren't IO errors: */
BUG_ON(atomic_long_read(&inode->ei_sectors_dirty_count));
BUG_ON(atomic64_read(&inode->ei_sectors) != inode->v.i_blocks);
}
clear_inode(&inode->v);
if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
@ -1272,9 +1263,16 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
sb->s_magic = BCACHEFS_STATFS_MAGIC;
sb->s_time_gran = c->sb.time_precision;
c->vfs_sb = sb;
sb->s_bdi = &c->bdi;
strlcpy(sb->s_id, c->name, sizeof(sb->s_id));
ret = super_setup_bdi(sb);
if (ret)
goto err_put_super;
sb->s_bdi->congested_fn = bch2_congested;
sb->s_bdi->congested_data = c;
sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
for_each_online_member(ca, c, i) {
struct block_device *bdev = ca->disk_sb.bdev;


@ -1,6 +1,7 @@
#ifndef _BCACHEFS_FS_H
#define _BCACHEFS_FS_H
#include "opts.h"
#include "str_hash.h"
#include <linux/seqlock.h>
@ -11,22 +12,12 @@ struct bch_inode_info {
struct mutex ei_update_lock;
u64 ei_journal_seq;
atomic_long_t ei_size_dirty_count;
/*
* these are updated whenever we update the inode in the btree - for
* e.g. fsync
*/
u64 ei_size;
u32 ei_flags;
atomic_long_t ei_sectors_dirty_count;
atomic64_t ei_sectors;
unsigned long ei_last_dirtied;
struct bch_hash_info ei_str_hash;
unsigned long ei_last_dirtied;
/* copy of inode in btree: */
struct bch_inode_unpacked ei_inode;
};
#define to_bch_ei(_inode) \


@ -204,7 +204,7 @@ static int hash_check_key(const struct bch_hash_desc desc,
"hash table key at wrong offset: %llu, "
"hashed to %llu chain starts at %llu\n%s",
k.k->p.offset, hashed, h->chain.pos.offset,
bch2_bkey_val_to_text(c, desc.btree_id,
bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id),
buf, sizeof(buf), k))) {
ret = hash_redo_key(desc, h, c, k_iter, k, hashed);
if (ret) {
@ -224,7 +224,7 @@ static int hash_check_key(const struct bch_hash_desc desc,
if (fsck_err_on(k2.k->type == desc.key_type &&
!desc.cmp_bkey(k, k2), c,
"duplicate hash table keys:\n%s",
bch2_bkey_val_to_text(c, desc.btree_id,
bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id),
buf, sizeof(buf), k))) {
ret = bch2_hash_delete_at(desc, &h->info, &h->iter, NULL);
if (ret)
@ -397,9 +397,9 @@ static int check_dirents(struct bch_fs *c)
if (fsck_err_on(have_target &&
d.v->d_type !=
mode_to_type(le16_to_cpu(target.bi_mode)), c,
mode_to_type(target.bi_mode), c,
"incorrect d_type: should be %u:\n%s",
mode_to_type(le16_to_cpu(target.bi_mode)),
mode_to_type(target.bi_mode),
bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
buf, sizeof(buf), k))) {
struct bkey_i_dirent *n;
@ -411,7 +411,7 @@ static int check_dirents(struct bch_fs *c)
}
bkey_reassemble(&n->k_i, d.s_c);
n->v.d_type = mode_to_type(le16_to_cpu(target.bi_mode));
n->v.d_type = mode_to_type(target.bi_mode);
ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
BTREE_INSERT_NOFAIL,
@ -493,7 +493,8 @@ static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
fsck_err:
return ret;
create_root:
bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO,
0, NULL);
root_inode->bi_inum = BCACHEFS_ROOT_INO;
bch2_inode_pack(&packed, root_inode);
@ -545,7 +546,8 @@ create_lostfound:
if (ret)
return ret;
bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO,
0, root_inode);
ret = bch2_inode_create(c, lostfound_inode, BLOCKDEV_INODE_MAX, 0,
&c->unused_inode_hint);

View File

@ -198,6 +198,12 @@ static const char *bch2_inode_invalid(const struct bch_fs *c,
if (bch2_inode_unpack(inode, &unpacked))
return "invalid variable length fields";
if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
return "invalid data checksum type";
if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
return "invalid data checksum type";
return NULL;
}
case BCH_INODE_BLOCKDEV:
@ -221,6 +227,7 @@ static const char *bch2_inode_invalid(const struct bch_fs *c,
static void bch2_inode_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
{
char *out = buf, *end = out + size;
struct bkey_s_c_inode inode;
struct bch_inode_unpacked unpacked;
@ -228,11 +235,14 @@ static void bch2_inode_to_text(struct bch_fs *c, char *buf,
case BCH_INODE_FS:
inode = bkey_s_c_to_inode(k);
if (bch2_inode_unpack(inode, &unpacked)) {
scnprintf(buf, size, "(unpack error)");
out += scnprintf(out, end - out, "(unpack error)");
break;
}
scnprintf(buf, size, "i_size %llu", unpacked.bi_size);
#define BCH_INODE_FIELD(_name, _bits) \
out += scnprintf(out, end - out, #_name ": %llu ", (u64) unpacked._name);
BCH_INODE_FIELDS()
#undef BCH_INODE_FIELD
break;
}
}
@ -243,9 +253,12 @@ const struct bkey_ops bch2_bkey_inode_ops = {
};
void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
uid_t uid, gid_t gid, umode_t mode, dev_t rdev)
uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
struct bch_inode_unpacked *parent)
{
s64 now = timespec_to_bch2_time(c, CURRENT_TIME);
s64 now = timespec_to_bch2_time(c,
timespec_trunc(current_kernel_time(),
c->sb.time_precision));
memset(inode_u, 0, sizeof(*inode_u));
@ -261,6 +274,12 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
inode_u->bi_mtime = now;
inode_u->bi_ctime = now;
inode_u->bi_otime = now;
if (parent) {
#define BCH_INODE_FIELD(_name) inode_u->_name = parent->_name;
BCH_INODE_FIELDS_INHERIT()
#undef BCH_INODE_FIELD
}
}
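With the new parent argument, a freshly created inode inherits its per-inode IO options from the parent directory via the field list above. A minimal sketch of what the expansion amounts to, assuming BCH_INODE_FIELDS_INHERIT() lists bi_data_checksum and bi_compression (the two options defined by BCH_INODE_OPTS() elsewhere in this commit):
/* hypothetical expansion of the if (parent) block above: */
if (parent) {
	inode_u->bi_data_checksum	= parent->bi_data_checksum;
	inode_u->bi_compression		= parent->bi_compression;
}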
int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
@ -416,7 +435,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
struct bch_inode_unpacked inode_u;
if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u))
bi_generation = cpu_to_le32(inode_u.bi_generation) + 1;
bi_generation = inode_u.bi_generation + 1;
break;
}
case BCH_INODE_GENERATION: {

View File

@ -1,6 +1,8 @@
#ifndef _BCACHEFS_INODE_H
#define _BCACHEFS_INODE_H
#include "opts.h"
#include <linux/math64.h>
extern const struct bkey_ops bch2_bkey_inode_ops;
@ -28,7 +30,8 @@ void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *)
int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
uid_t, gid_t, umode_t, dev_t);
uid_t, gid_t, umode_t, dev_t,
struct bch_inode_unpacked *);
int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *,
u64, u64, u64 *);
int bch2_inode_truncate(struct bch_fs *, u64, u64,
@ -55,6 +58,45 @@ static inline u64 timespec_to_bch2_time(struct bch_fs *c, struct timespec ts)
return div_s64(ns, c->sb.time_precision);
}
static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode)
{
struct bch_io_opts ret = { 0 };
#define BCH_INODE_OPT(_name, _bits) \
if (inode->bi_##_name) \
opt_set(ret, _name, inode->bi_##_name - 1);
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
return ret;
}
static inline void __bch2_inode_opt_set(struct bch_inode_unpacked *inode,
enum bch_opt_id id, u64 v)
{
switch (id) {
#define BCH_INODE_OPT(_name, ...) \
case Opt_##_name: \
inode->bi_##_name = v; \
break;
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
default:
BUG();
}
}
static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode,
enum bch_opt_id id, u64 v)
{
return __bch2_inode_opt_set(inode, id, v + 1);
}
static inline void bch2_inode_opt_clear(struct bch_inode_unpacked *inode,
enum bch_opt_id id)
{
return __bch2_inode_opt_set(inode, id, 0);
}
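These helpers rely on a small encoding convention: in struct bch_inode_unpacked, a per-inode option field of 0 means "unset, fall back to the filesystem-wide option", and a set option is stored as its value plus one. A minimal sketch of the round trip (illustrative only, using the compression option):
struct bch_inode_unpacked inode = { 0 };
struct bch_io_opts io;

bch2_inode_opt_set(&inode, Opt_compression, 2);	/* stores 3 in inode.bi_compression */
io = bch2_inode_opts_get(&inode);		/* io.compression == 2 again */
bch2_inode_opt_clear(&inode, Opt_compression);	/* back to 0, i.e. unset */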
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_inode_pack_test(void);
#else

View File

@ -20,6 +20,7 @@
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "super.h"
#include "super-io.h"
#include <linux/blkdev.h>
@ -139,7 +140,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
const struct bch_extent_ptr *ptr;
struct bch_write_bio *n;
struct bch_dev *ca;
unsigned ptr_idx = 0;
BUG_ON(c->opts.nochanges);
@ -147,7 +147,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
!c->devs[ptr->dev]);
ca = c->devs[ptr->dev];
ca = bch_dev_bkey_exists(c, ptr->dev);
if (ptr + 1 < &extent_entry_last(e)->ptr) {
n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
@ -168,7 +168,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
n->c = c;
n->ca = ca;
n->ptr_idx = ptr_idx++;
n->submit_time_us = local_clock_us();
n->bio.bi_iter.bi_sector = ptr->offset;
@ -184,7 +183,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
submit_bio(&n->bio);
} else {
n->have_io_ref = false;
bcache_io_error(c, &n->bio, "device has been removed");
n->bio.bi_status = BLK_STS_REMOVED;
bio_endio(&n->bio);
}
}
@ -201,9 +200,12 @@ static void bch2_write_done(struct closure *cl)
if (!op->error && (op->flags & BCH_WRITE_FLUSH))
op->error = bch2_journal_error(&op->c->journal);
bch2_disk_reservation_put(op->c, &op->res);
if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
bch2_disk_reservation_put(op->c, &op->res);
percpu_ref_put(&op->c->writes);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
op->flags &= ~(BCH_WRITE_DONE|BCH_WRITE_LOOPED);
closure_return(cl);
}
@ -244,9 +246,37 @@ static void bch2_write_index(struct closure *cl)
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
struct keylist *keys = &op->insert_keys;
struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
struct bkey_i *src, *dst = keys->keys, *n;
int ret;
op->flags |= BCH_WRITE_LOOPED;
for (src = keys->keys; src != keys->top; src = n) {
n = bkey_next(src);
bkey_copy(dst, src);
e = bkey_i_to_s_extent(dst);
extent_for_each_ptr_backwards(e, ptr)
if (test_bit(ptr->dev, op->failed.d))
bch2_extent_drop_ptr(e, ptr);
ret = bch2_extent_nr_ptrs(e.c)
? bch2_check_mark_super(c, e.c, BCH_DATA_USER)
: -EIO;
if (ret) {
keys->top = keys->keys;
op->error = ret;
op->flags |= BCH_WRITE_DONE;
goto err;
}
dst = bkey_next(dst);
}
keys->top = dst;
if (!bch2_keylist_empty(keys)) {
u64 sectors_start = keylist_sectors(keys);
int ret = op->index_update_fn(op);
@ -260,7 +290,7 @@ static void bch2_write_index(struct closure *cl)
op->error = ret;
}
}
err:
bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets);
if (!(op->flags & BCH_WRITE_DONE))
@ -276,43 +306,6 @@ static void bch2_write_index(struct closure *cl)
}
}
static void bch2_write_io_error(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct keylist *keys = &op->insert_keys;
struct bch_fs *c = op->c;
struct bch_extent_ptr *ptr;
struct bkey_i *k;
int ret;
for_each_keylist_key(keys, k) {
struct bkey_i *n = bkey_next(k);
struct bkey_s_extent e = bkey_i_to_s_extent(k);
extent_for_each_ptr_backwards(e, ptr)
if (test_bit(ptr->dev, op->failed.d))
bch2_extent_drop_ptr(e, ptr);
memmove(bkey_next(k), n, (void *) keys->top - (void *) n);
keys->top_p -= (u64 *) n - (u64 *) bkey_next(k);
ret = bch2_extent_nr_ptrs(e.c)
? bch2_check_mark_super(c, e.c, BCH_DATA_USER)
: -EIO;
if (ret) {
keys->top = keys->keys;
op->error = ret;
op->flags |= BCH_WRITE_DONE;
break;
}
}
memset(&op->failed, 0, sizeof(op->failed));
bch2_write_index(cl);
return;
}
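With bch2_write_io_error() removed, write error handling is split across two existing paths: bch2_write_endio() only records the failing device in op->failed, and the loop added to bch2_write_index() above drops the matching pointers from each key before the index update, failing the write only if no pointers survive. A condensed sketch of that flow, mirroring the hunks in this commit:
/* endio: remember which device failed */
if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
	set_bit(ca->dev_idx, op->failed.d);

/* index update: drop pointers to failed devices before inserting */
extent_for_each_ptr_backwards(e, ptr)
	if (test_bit(ptr->dev, op->failed.d))
		bch2_extent_drop_ptr(e, ptr);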
static void bch2_write_endio(struct bio *bio)
{
struct closure *cl = bio->bi_private;
@ -324,10 +317,8 @@ static void bch2_write_endio(struct bio *bio)
bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
if (bch2_dev_io_err_on(bio->bi_error, ca, "data write")) {
if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
set_bit(ca->dev_idx, op->failed.d);
set_closure_fn(cl, bch2_write_io_error, index_update_wq(op));
}
if (wbio->have_io_ref)
percpu_ref_put(&ca->io_ref);
@ -706,11 +697,6 @@ do_write:
key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write),
BCH_DATA_USER);
if (ret)
goto err;
dst->bi_end_io = bch2_write_endio;
dst->bi_private = &op->cl;
bio_set_op_attrs(dst, REQ_OP_WRITE, 0);
@ -870,7 +856,8 @@ void bch2_write(struct closure *cl)
!percpu_ref_tryget(&c->writes)) {
__bcache_io_error(c, "read only");
op->error = -EROFS;
bch2_disk_reservation_put(c, &op->res);
if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
bch2_disk_reservation_put(c, &op->res);
closure_return(cl);
}
@ -916,7 +903,10 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
rbio->promote = NULL;
__bch2_write_op_init(&op->write.op, c);
bch2_write_op_init(&op->write.op, c);
op->write.op.csum_type = bch2_data_checksum_type(c, rbio->opts.data_checksum);
op->write.op.compression_type =
bch2_compression_opt_to_type(rbio->opts.compression);
op->write.move_dev = -1;
op->write.op.devs = c->fastest_devs;
@ -1060,7 +1050,7 @@ static void bch2_rbio_retry(struct work_struct *work)
if (rbio->split)
rbio = bch2_rbio_free(rbio);
else
rbio->bio.bi_error = 0;
rbio->bio.bi_status = 0;
if (!(flags & BCH_READ_NODECODE))
flags |= BCH_READ_MUST_CLONE;
@ -1073,7 +1063,8 @@ static void bch2_rbio_retry(struct work_struct *work)
__bch2_read(c, rbio, iter, inode, &avoid, flags);
}
static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error)
static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
blk_status_t error)
{
rbio->retry = retry;
@ -1081,7 +1072,7 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error)
return;
if (retry == READ_ERR) {
bch2_rbio_parent(rbio)->bio.bi_error = error;
bch2_rbio_parent(rbio)->bio.bi_status = error;
bch2_rbio_done(rbio);
} else {
bch2_rbio_punt(rbio, bch2_rbio_retry,
@ -1236,7 +1227,7 @@ csum_err:
*/
if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
rbio->flags |= BCH_READ_MUST_BOUNCE;
bch2_rbio_error(rbio, READ_RETRY, -EIO);
bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
return;
}
@ -1245,13 +1236,13 @@ csum_err:
rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
csum.hi, csum.lo, crc.csum_type);
bch2_rbio_error(rbio, READ_RETRY_AVOID, -EIO);
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
return;
decompression_err:
__bcache_io_error(c, "decompression error, inode %llu offset %llu",
rbio->pos.inode,
(u64) rbio->bvec_iter.bi_sector);
bch2_rbio_error(rbio, READ_ERR, -EIO);
bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
return;
}
@ -1270,8 +1261,8 @@ static void bch2_read_endio(struct bio *bio)
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
if (bch2_dev_io_err_on(bio->bi_error, rbio->pick.ca, "data read")) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_error);
if (bch2_dev_io_err_on(bio->bi_status, rbio->pick.ca, "data read")) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
return;
}
@ -1281,9 +1272,9 @@ static void bch2_read_endio(struct bio *bio)
atomic_long_inc(&c->read_realloc_races);
if (rbio->flags & BCH_READ_RETRY_IF_STALE)
bch2_rbio_error(rbio, READ_RETRY, -EINTR);
bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
else
bch2_rbio_error(rbio, READ_ERR, -EINTR);
bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
return;
}
@ -1360,7 +1351,8 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
DIV_ROUND_UP(sectors, PAGE_SECTORS),
&c->bio_read_split));
&c->bio_read_split),
orig->opts);
bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
split = true;
@ -1374,7 +1366,8 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
* lose the error)
*/
rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
&c->bio_read_split));
&c->bio_read_split),
orig->opts);
rbio->bio.bi_iter = iter;
split = true;
} else {
@ -1428,6 +1421,8 @@ noclone:
bch2_read_endio(&rbio->bio);
ret = rbio->retry;
if (rbio->split)
rbio = bch2_rbio_free(rbio);
if (!ret)
bch2_rbio_done(rbio);
}
@ -1503,7 +1498,7 @@ err:
* possibly bigger than the memory that was
* originally allocated)
*/
rbio->bio.bi_error = -EINTR;
rbio->bio.bi_status = BLK_STS_AGAIN;
bio_endio(&rbio->bio);
return;
}
@ -1561,6 +1556,7 @@ retry:
case READ_RETRY:
goto retry;
case READ_ERR:
rbio->bio.bi_status = BLK_STS_IOERR;
bio_endio(&rbio->bio);
return;
};

View File

@ -21,6 +21,8 @@ void bch2_latency_acct(struct bch_dev *, unsigned, int);
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *);
#define BLK_STS_REMOVED ((__force blk_status_t)128)
enum bch_write_flags {
BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
BCH_WRITE_CACHED = (1 << 1),
@ -29,11 +31,12 @@ enum bch_write_flags {
BCH_WRITE_PAGES_STABLE = (1 << 4),
BCH_WRITE_PAGES_OWNED = (1 << 5),
BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6),
BCH_WRITE_NOPUT_RESERVATION = (1 << 7),
/* Internal: */
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 7),
BCH_WRITE_DONE = (1 << 8),
BCH_WRITE_LOOPED = (1 << 9),
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 8),
BCH_WRITE_DONE = (1 << 9),
BCH_WRITE_LOOPED = (1 << 10),
};
static inline u64 *op_journal_seq(struct bch_write_op *op)
@ -42,6 +45,12 @@ static inline u64 *op_journal_seq(struct bch_write_op *op)
? op->journal_seq_p : &op->journal_seq;
}
static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq)
{
op->journal_seq_p = journal_seq;
op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
}
static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
{
return op->alloc_reserve == RESERVE_MOVINGGC
@ -51,14 +60,14 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
int bch2_write_index_default(struct bch_write_op *);
static inline void __bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c)
static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c)
{
op->c = c;
op->io_wq = index_update_wq(op);
op->flags = 0;
op->written = 0;
op->error = 0;
op->csum_type = bch2_data_checksum_type(c);
op->csum_type = bch2_data_checksum_type(c, c->opts.data_checksum);
op->compression_type =
bch2_compression_opt_to_type(c->opts.compression);
op->nr_replicas = 0;
@ -75,27 +84,6 @@ static inline void __bch2_write_op_init(struct bch_write_op *op, struct bch_fs *
op->index_update_fn = bch2_write_index_default;
}
static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
struct disk_reservation res,
struct bch_devs_mask *devs,
struct write_point_specifier write_point,
struct bpos pos,
u64 *journal_seq, unsigned flags)
{
__bch2_write_op_init(op, c);
op->flags = flags;
op->nr_replicas = res.nr_replicas;
op->pos = pos;
op->res = res;
op->devs = devs;
op->write_point = write_point;
if (journal_seq) {
op->journal_seq_p = journal_seq;
op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
}
}
void bch2_write(struct closure *);
static inline struct bch_write_bio *wbio_init(struct bio *bio)
@ -134,25 +122,27 @@ static inline void bch2_read_extent(struct bch_fs *c,
struct extent_pick_ptr *pick,
unsigned flags)
{
rbio->_state = 0;
__bch2_read_extent(c, rbio, rbio->bio.bi_iter, e, pick, flags);
}
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
u64 inode)
{
rbio->_state = 0;
BUG_ON(rbio->_state);
__bch2_read(c, rbio, rbio->bio.bi_iter, inode, NULL,
BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE|
BCH_READ_USER_MAPPED);
}
static inline struct bch_read_bio *rbio_init(struct bio *bio)
static inline struct bch_read_bio *rbio_init(struct bio *bio,
struct bch_io_opts opts)
{
struct bch_read_bio *rbio = to_rbio(bio);
rbio->_state = 0;
rbio->_state = 0;
rbio->promote = NULL;
rbio->opts = opts;
return rbio;
}

View File

@ -6,6 +6,7 @@
#include "buckets_types.h"
#include "extents_types.h"
#include "keylist_types.h"
#include "opts.h"
#include "super_types.h"
#include <linux/llist.h>
@ -56,6 +57,8 @@ struct bch_read_bio {
struct promote_op *promote;
struct bch_io_opts opts;
struct work_struct work;
struct bio bio;
@ -69,8 +72,7 @@ struct bch_write_bio {
struct closure *cl;
};
u8 ptr_idx;
u8 replicas_failed;
struct bch_devs_list failed;
u8 order;
unsigned split:1,
@ -90,8 +92,8 @@ struct bch_write_op {
struct bch_fs *c;
struct workqueue_struct *io_wq;
unsigned written; /* sectors */
u16 flags;
u16 written; /* sectors */
s8 error;
unsigned csum_type:4;

View File

@ -338,8 +338,8 @@ struct journal_list {
* Given a journal entry we just read, add it to the list of journal entries to
* be replayed:
*/
static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
struct jset *j)
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
struct journal_list *jlist, struct jset *j)
{
struct journal_replay *i, *pos;
struct list_head *where;
@ -347,8 +347,6 @@ static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
__le64 last_seq;
int ret;
mutex_lock(&jlist->lock);
last_seq = !list_empty(jlist->head)
? list_last_entry(jlist->head, struct journal_replay,
list)->j.last_seq
@ -376,9 +374,7 @@ static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
memcmp(j, &i->j, bytes), c,
"found duplicate but non identical journal entries (seq %llu)",
le64_to_cpu(j->seq));
ret = JOURNAL_ENTRY_ADD_OK;
goto out;
goto found;
}
if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
@ -395,12 +391,16 @@ add:
goto out;
}
memcpy(&i->j, j, bytes);
list_add(&i->list, where);
i->devs.nr = 0;
memcpy(&i->j, j, bytes);
found:
if (!fsck_err_on(bch2_dev_list_has_dev(i->devs, ca->dev_idx),
c, "duplicate journal entries on same device"))
bch2_dev_list_add_dev(&i->devs, ca->dev_idx);
ret = JOURNAL_ENTRY_ADD_OK;
out:
fsck_err:
mutex_unlock(&jlist->lock);
return ret;
}
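journal_entry_add() now also records, per sequence number, which devices a copy of that entry was read from, and fsck flags a duplicate copy on a single device. The resulting device list is consumed further down in this commit when journal pins are set up and when new entries are written:
/* from later hunks in this commit: */
p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
atomic_set(&p->count, 1);
p->devs = i->devs;

journal_seq_pin(j, le64_to_cpu(jset->seq))->devs =
	bch2_extent_devs(bkey_i_to_s_c_extent(&j->key));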
@ -496,8 +496,8 @@ fsck_err:
#define journal_entry_err_on(cond, c, msg, ...) \
((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
static int __journal_entry_validate(struct bch_fs *c, struct jset *j,
int write)
static int journal_entry_validate_entries(struct bch_fs *c, struct jset *j,
int write)
{
struct jset_entry *entry;
int ret = 0;
@ -508,7 +508,7 @@ static int __journal_entry_validate(struct bch_fs *c, struct jset *j,
if (journal_entry_err_on(vstruct_next(entry) >
vstruct_last(j), c,
"journal entry extends past end of jset")) {
j->u64s = cpu_to_le64((u64 *) entry - j->_data);
j->u64s = cpu_to_le32((u64 *) entry - j->_data);
break;
}
@ -614,7 +614,7 @@ static int journal_entry_validate(struct bch_fs *c,
"invalid journal entry: last_seq > seq"))
j->last_seq = j->seq;
return __journal_entry_validate(c, j, write);
return 0;
fsck_err:
return ret;
}
@ -722,7 +722,10 @@ reread: sectors_read = min_t(unsigned,
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
ret = journal_entry_add(c, jlist, j);
mutex_lock(&jlist->lock);
ret = journal_entry_add(c, ca, jlist, j);
mutex_unlock(&jlist->lock);
switch (ret) {
case JOURNAL_ENTRY_ADD_OK:
*entries_found = true;
@ -916,7 +919,9 @@ static int journal_seq_blacklist_read(struct journal *j,
for_each_jset_entry_type(entry, &i->j,
JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED) {
seq = le64_to_cpu(entry->_data[0]);
struct jset_entry_blacklist *bl_entry =
container_of(entry, struct jset_entry_blacklist, entry);
seq = le64_to_cpu(bl_entry->seq);
bch_verbose(c, "blacklisting existing journal seq %llu", seq);
@ -982,6 +987,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
fsck_err_on(c->sb.clean && journal_has_keys(list), c,
"filesystem marked clean but journal has keys to replay");
list_for_each_entry(i, list, list) {
ret = journal_entry_validate_entries(c, &i->j, READ);
if (ret)
goto fsck_err;
}
i = list_last_entry(list, struct journal_replay, list);
unfixable_fsck_err_on(le64_to_cpu(i->j.seq) -
@ -1002,6 +1013,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, 0);
p->devs.nr = 0;
}
mutex_lock(&j->blacklist_lock);
@ -1010,6 +1022,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
atomic_set(&p->count, 1);
p->devs = i->devs;
if (journal_seq_blacklist_read(j, i, p)) {
mutex_unlock(&j->blacklist_lock);
@ -1090,7 +1103,7 @@ void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
{
struct journal_buf *w = journal_prev_buf(j);
atomic_dec_bug(&journal_seq_pin(j, w->data->seq)->count);
atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count);
if (!need_write_just_set &&
test_bit(JOURNAL_NEED_WRITE, &j->flags))
@ -1122,6 +1135,7 @@ static void __journal_entry_new(struct journal *j, int count)
INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, count);
p->devs.nr = 0;
}
static void __bch2_journal_next_entry(struct journal *j)
@ -1851,6 +1865,21 @@ void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
bch2_journal_error(j));
}
int bch2_journal_flush_all_pins(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
bool flush;
bch2_journal_flush_pins(j, U64_MAX);
spin_lock(&j->lock);
flush = last_seq(j) != j->last_seq_ondisk ||
c->btree_roots_dirty;
spin_unlock(&j->lock);
return flush ? bch2_journal_meta(j) : 0;
}
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
bool ret;
@ -2002,7 +2031,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
* i.e. whichever device was limiting the current journal entry size.
*/
extent_for_each_ptr_backwards(e, ptr) {
ca = c->devs[ptr->dev];
ca = bch_dev_bkey_exists(c, ptr->dev);
if (ca->mi.state != BCH_MEMBER_STATE_RW ||
ca->journal.sectors_free <= sectors)
@ -2197,7 +2226,7 @@ static void journal_write_endio(struct bio *bio)
struct bch_dev *ca = bio->bi_private;
struct journal *j = &ca->fs->journal;
if (bch2_dev_io_err_on(bio->bi_error, ca, "journal write") ||
if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") ||
bch2_meta_write_fault("journal")) {
/* Was this a flush or an actual journal write? */
if (ca->journal.ptr_idx != U8_MAX) {
@ -2233,6 +2262,7 @@ static void journal_write(struct closure *cl)
if (r->alive)
bch2_journal_add_btree_root(w, i, &r->key, r->level);
}
c->btree_roots_dirty = false;
mutex_unlock(&c->btree_root_lock);
journal_write_compact(jset);
@ -2246,7 +2276,7 @@ static void journal_write(struct closure *cl)
SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
__journal_entry_validate(c, jset, WRITE))
journal_entry_validate_entries(c, jset, WRITE))
goto err;
bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
@ -2257,7 +2287,7 @@ static void journal_write(struct closure *cl)
journal_nonce(jset), jset);
if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
__journal_entry_validate(c, jset, WRITE))
journal_entry_validate_entries(c, jset, WRITE))
goto err;
sectors = vstruct_sectors(jset, c->block_bits);
@ -2277,6 +2307,9 @@ static void journal_write(struct closure *cl)
BCH_DATA_JOURNAL))
goto err;
journal_seq_pin(j, le64_to_cpu(jset->seq))->devs =
bch2_extent_devs(bkey_i_to_s_c_extent(&j->key));
/*
* XXX: we really should just disable the entire journal in nochanges
* mode
@ -2285,7 +2318,7 @@ static void journal_write(struct closure *cl)
goto no_io;
extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) {
ca = c->devs[ptr->dev];
ca = bch_dev_bkey_exists(c, ptr->dev);
if (!percpu_ref_tryget(&ca->io_ref)) {
/* XXX: fix this */
bch_err(c, "missing device for journal write\n");
@ -2693,6 +2726,46 @@ int bch2_journal_flush(struct journal *j)
return bch2_journal_flush_seq(j, seq);
}
int bch2_journal_flush_device(struct journal *j, unsigned dev_idx)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_entry_pin_list *p;
struct bch_devs_list devs;
u64 seq = 0;
unsigned iter;
int ret = 0;
spin_lock(&j->lock);
fifo_for_each_entry_ptr(p, &j->pin, iter)
if (bch2_dev_list_has_dev(p->devs, dev_idx))
seq = journal_pin_seq(j, p);
spin_unlock(&j->lock);
bch2_journal_flush_pins(j, seq);
mutex_lock(&c->replicas_gc_lock);
bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
seq = 0;
spin_lock(&j->lock);
while (!ret && seq < atomic64_read(&j->seq)) {
seq = max(seq, last_seq(j));
devs = journal_seq_pin(j, seq)->devs;
seq++;
spin_unlock(&j->lock);
ret = bch2_check_mark_super_devlist(c, &devs, BCH_DATA_JOURNAL);
spin_lock(&j->lock);
}
spin_unlock(&j->lock);
bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
return ret;
}
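bch2_journal_flush_device() makes two passes: it first flushes every journal pin whose entry still has a copy on the target device, so nothing replayable depends on it, then it re-marks the device lists of the surviving pinned entries under replicas_gc so stale journal replicas entries for that device can be garbage collected. A hypothetical call site, for illustration only, would be device removal or evacuation:
/* hypothetical usage when evacuating a device: */
ret = bch2_journal_flush_device(&c->journal, ca->dev_idx);
if (ret)
	return ret;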
ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
@ -2862,9 +2935,7 @@ void bch2_fs_journal_stop(struct journal *j)
* journal entries, then force a brand new empty journal entry to be
* written:
*/
bch2_journal_flush_pins(j, U64_MAX);
bch2_journal_flush_async(j, NULL);
bch2_journal_meta(j);
bch2_journal_flush_all_pins(j);
cancel_delayed_work_sync(&j->write_work);
cancel_delayed_work_sync(&j->reclaim_work);

View File

@ -118,6 +118,8 @@
*/
struct journal_replay {
struct list_head list;
struct bch_devs_list devs;
/* must be last: */
struct jset j;
};
@ -164,6 +166,7 @@ void bch2_journal_pin_add_if_older(struct journal *,
struct journal_entry_pin *,
journal_pin_flush_fn);
void bch2_journal_flush_pins(struct journal *, u64);
int bch2_journal_flush_all_pins(struct journal *);
struct closure;
struct bch_fs;
@ -356,6 +359,7 @@ void bch2_journal_meta_async(struct journal *, struct closure *);
int bch2_journal_flush_seq(struct journal *, u64);
int bch2_journal_flush(struct journal *);
int bch2_journal_meta(struct journal *);
int bch2_journal_flush_device(struct journal *, unsigned);
void bch2_journal_halt(struct journal *);

View File

@ -34,6 +34,7 @@ struct journal_entry_pin_list {
struct list_head list;
struct list_head flushed;
atomic_t count;
struct bch_devs_list devs;
};
struct journal;

View File

@ -27,23 +27,9 @@ static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
#define MAX_DATA_OFF_ITER 10
/*
* This moves only the data off, leaving the meta-data (if any) in place.
* It walks the key space, and for any key with a valid pointer to the
* relevant device, it copies it elsewhere, updating the key to point to
* the copy.
* The meta-data is moved off by bch_move_meta_data_off_device.
*
* Note: If the number of data replicas desired is > 1, ideally, any
* new copies would not be made in the same device that already has a
* copy (if there are enough devices).
* This is _not_ currently implemented. The multiple replicas can
* land in the same device even if there are others available.
*/
int bch2_move_data_off_device(struct bch_dev *ca)
static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca,
int flags)
{
struct bch_fs *c = ca->fs;
struct btree_iter iter;
struct bkey_s_c k;
u64 keys_moved, sectors_moved;
@ -113,10 +99,6 @@ int bch2_move_data_off_device(struct bch_dev *ca)
return ret;
}
/*
* This walks the btree, and for any node on the relevant device it moves the
* node elsewhere.
*/
static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
enum btree_id id)
{
@ -200,9 +182,9 @@ static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
* is written.
*/
int bch2_move_metadata_off_device(struct bch_dev *ca)
static int bch2_dev_metadata_migrate(struct bch_fs *c, struct bch_dev *ca,
int flags)
{
struct bch_fs *c = ca->fs;
unsigned i;
int ret = 0;
@ -240,37 +222,31 @@ err:
return ret;
}
/*
* Flagging data bad when forcibly removing a device after failing to
* migrate the data off the device.
*/
static int bch2_flag_key_bad(struct btree_iter *iter,
struct bch_dev *ca,
struct bkey_s_c_extent orig)
int bch2_dev_data_migrate(struct bch_fs *c, struct bch_dev *ca, int flags)
{
BKEY_PADDED(key) tmp;
struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
struct bch_fs *c = ca->fs;
return bch2_dev_usrdata_migrate(c, ca, flags) ?:
bch2_dev_metadata_migrate(c, ca, flags);
}
bkey_reassemble(&tmp.key, orig.s_c);
e = bkey_i_to_s_extent(&tmp.key);
static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
unsigned dev_idx, int flags, bool metadata)
{
struct bch_extent_ptr *ptr;
unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
unsigned nr_good;
extent_for_each_ptr_backwards(e, ptr)
if (ptr->dev == ca->dev_idx)
if (ptr->dev == dev_idx)
bch2_extent_drop_ptr(e, ptr);
/*
* If the new extent no longer has any pointers, bch2_extent_normalize()
* will do the appropriate thing with it (turning it into a
* KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
*/
bch2_extent_normalize(c, e.s);
nr_good = bch2_extent_nr_good_ptrs(c, e.c);
if ((!nr_good && !(flags & lost)) ||
(nr_good < replicas && !(flags & degraded)))
return -EINVAL;
return bch2_btree_insert_at(c, NULL, NULL, NULL,
BTREE_INSERT_ATOMIC,
BTREE_INSERT_ENTRY(iter, &tmp.key));
return 0;
}
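drop_dev_ptrs() centralizes the force-removal policy: after dropping the target device's pointer, the extent must either still have the configured number of good replicas, or the caller must have passed the matching BCH_FORCE_IF_* flags. A worked example with data_replicas == 2 (illustrative values):
/*
 * nr_good after drop	flags required to proceed
 * ------------------	-------------------------
 * >= 2			none
 * 1			BCH_FORCE_IF_DATA_DEGRADED
 * 0			BCH_FORCE_IF_DATA_LOST | BCH_FORCE_IF_DATA_DEGRADED
 */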
/*
@ -284,11 +260,11 @@ static int bch2_flag_key_bad(struct btree_iter *iter,
* that we've already tried to move the data MAX_DATA_OFF_ITER times and
* are not likely to succeed if we try again.
*/
int bch2_flag_data_bad(struct bch_dev *ca)
static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
struct bch_fs *c = ca->fs;
struct bkey_s_c k;
struct bkey_s_c_extent e;
struct bkey_s_extent e;
BKEY_PADDED(key) tmp;
struct btree_iter iter;
int ret = 0;
@ -303,11 +279,33 @@ int bch2_flag_data_bad(struct bch_dev *ca)
if (!bkey_extent_is_data(k.k))
goto advance;
e = bkey_s_c_to_extent(k);
if (!bch2_extent_has_device(e, ca->dev_idx))
if (!bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx))
goto advance;
ret = bch2_flag_key_bad(&iter, ca, e);
bkey_reassemble(&tmp.key, k);
e = bkey_i_to_s_extent(&tmp.key);
ret = drop_dev_ptrs(c, e, dev_idx, flags, false);
if (ret)
break;
/*
* If the new extent no longer has any pointers, bch2_extent_normalize()
* will do the appropriate thing with it (turning it into a
* KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
*/
bch2_extent_normalize(c, e.s);
if (bkey_extent_is_data(e.k) &&
(ret = bch2_check_mark_super(c, e.c, BCH_DATA_USER)))
break;
iter.pos = bkey_start_pos(&tmp.key.k);
ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL,
BTREE_INSERT_ENTRY(&iter, &tmp.key));
/*
* don't want to leave ret == -EINTR, since if we raced and
@ -319,26 +317,6 @@ int bch2_flag_data_bad(struct bch_dev *ca)
if (ret)
break;
/*
* If the replica we're dropping was dirty and there is an
* additional cached replica, the cached replica will now be
* considered dirty - upon inserting the new version of the key,
* the bucket accounting will be updated to reflect the fact
* that the cached data is now dirty and everything works out as
* if by magic without us having to do anything.
*
* The one thing we need to be concerned with here is there's a
* race between when we drop any stale pointers from the key
* we're about to insert, and when the key actually gets
* inserted and the cached data is marked as dirty - we could
* end up trying to insert a key with a pointer that should be
* dirty, but points to stale data.
*
* If that happens the insert code just bails out and doesn't do
* the insert - however, it doesn't return an error. Hence we
* need to always recheck the current key before advancing to
* the next:
*/
continue;
advance:
if (bkey_extent_is_data(k.k)) {
@ -357,3 +335,80 @@ advance:
return ret;
}
static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
struct btree_iter iter;
struct closure cl;
struct btree *b;
unsigned id;
int ret;
/* don't handle this yet: */
if (flags & BCH_FORCE_IF_METADATA_LOST)
return -EINVAL;
closure_init_stack(&cl);
mutex_lock(&c->replicas_gc_lock);
bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
for (id = 0; id < BTREE_ID_NR; id++) {
for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
struct bkey_i_extent *new_key;
retry:
if (!bch2_extent_has_device(bkey_i_to_s_c_extent(&b->key),
dev_idx)) {
bch2_btree_iter_set_locks_want(&iter, 0);
ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key),
BCH_DATA_BTREE);
if (ret)
goto err;
} else {
bkey_copy(&tmp.k, &b->key);
new_key = bkey_i_to_extent(&tmp.k);
ret = drop_dev_ptrs(c, extent_i_to_s(new_key),
dev_idx, flags, true);
if (ret)
goto err;
if (!bch2_btree_iter_set_locks_want(&iter, U8_MAX)) {
b = bch2_btree_iter_peek_node(&iter);
goto retry;
}
ret = bch2_btree_node_update_key(c, &iter, b, new_key);
if (ret == -EINTR) {
b = bch2_btree_iter_peek_node(&iter);
goto retry;
}
if (ret)
goto err;
}
}
bch2_btree_iter_unlock(&iter);
/* btree root */
mutex_lock(&c->btree_root_lock);
mutex_unlock(&c->btree_root_lock);
}
ret = 0;
out:
bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
return ret;
err:
bch2_btree_iter_unlock(&iter);
goto out;
}
int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
return bch2_dev_usrdata_drop(c, dev_idx, flags) ?:
bch2_dev_metadata_drop(c, dev_idx, flags);
}

View File

@ -1,8 +1,7 @@
#ifndef _BCACHEFS_MIGRATE_H
#define _BCACHEFS_MIGRATE_H
int bch2_move_data_off_device(struct bch_dev *);
int bch2_move_metadata_off_device(struct bch_dev *);
int bch2_flag_data_bad(struct bch_dev *);
int bch2_dev_data_migrate(struct bch_fs *, struct bch_dev *, int);
int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
#endif /* _BCACHEFS_MIGRATE_H */

View File

@ -3,6 +3,7 @@
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
#include "inode.h"
#include "io.h"
#include "move.h"
#include "super-io.h"
@ -206,7 +207,7 @@ static void move_write(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
if (likely(!io->rbio.bio.bi_error)) {
if (likely(!io->rbio.bio.bi_status)) {
bch2_migrate_write_init(&io->write, &io->rbio);
closure_call(&io->write.op.cl, bch2_write, NULL, cl);
}
@ -240,6 +241,7 @@ static int bch2_move_extent(struct bch_fs *c,
struct write_point_specifier wp,
int btree_insert_flags,
int move_device,
struct bch_io_opts opts,
struct bkey_s_c k)
{
struct extent_pick_ptr pick;
@ -276,6 +278,7 @@ static int bch2_move_extent(struct bch_fs *c,
goto err;
}
io->rbio.opts = opts;
bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
io->rbio.bio.bi_iter.bi_size = sectors << 9;
@ -284,9 +287,13 @@ static int bch2_move_extent(struct bch_fs *c,
io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
io->rbio.bio.bi_end_io = move_read_endio;
__bch2_write_op_init(&io->write.op, c);
io->write.btree_insert_flags = btree_insert_flags;
io->write.move_dev = move_device;
bch2_write_op_init(&io->write.op, c);
io->write.op.csum_type = bch2_data_checksum_type(c, opts.data_checksum);
io->write.op.compression_type =
bch2_compression_opt_to_type(opts.compression);
io->write.op.devs = devs;
io->write.op.write_point = wp;
@ -371,9 +378,11 @@ int bch2_move_data(struct bch_fs *c,
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct moving_context ctxt;
struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
struct btree_iter iter;
BKEY_PADDED(k) tmp;
struct bkey_s_c k;
u64 cur_inum = U64_MAX;
int ret = 0;
bch2_move_ctxt_init(&ctxt);
@ -396,7 +405,7 @@ int bch2_move_data(struct bch_fs *c,
(bch2_btree_iter_unlock(&iter),
(ret = bch2_ratelimit_wait_freezable_stoppable(rate))))
break;
peek:
k = bch2_btree_iter_peek(&iter);
if (!k.k)
break;
@ -404,8 +413,23 @@ int bch2_move_data(struct bch_fs *c,
if (ret)
break;
if (!bkey_extent_is_data(k.k) ||
!pred(arg, bkey_s_c_to_extent(k)))
if (!bkey_extent_is_data(k.k))
goto next;
if (cur_inum != k.k->p.inode) {
struct bch_inode_unpacked inode;
/* don't hold btree locks while looking up inode: */
bch2_btree_iter_unlock(&iter);
opts = bch2_opts_to_inode_opts(c->opts);
if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode))
bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode));
cur_inum = k.k->p.inode;
goto peek;
}
if (!pred(arg, bkey_s_c_to_extent(k)))
goto next;
/* unlock before doing IO: */
@ -415,7 +439,7 @@ int bch2_move_data(struct bch_fs *c,
if (bch2_move_extent(c, &ctxt, devs, wp,
btree_insert_flags,
move_device, k)) {
move_device, opts, k)) {
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(&ctxt);
continue;

View File

@ -76,16 +76,27 @@ void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
#undef BCH_OPT
}
bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id)
{
switch (id) {
#define BCH_OPT(_name, ...) \
case Opt_##_name: \
return opt_defined(*opts, _name);
BCH_OPTS()
#undef BCH_OPT
default:
BUG();
}
}
u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
{
switch (id) {
#define BCH_OPT(_name, ...) \
case Opt_##_name: \
return opts->_name; \
return opts->_name;
BCH_OPTS()
#undef BCH_OPT
default:
BUG();
}
@ -98,10 +109,8 @@ void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
case Opt_##_name: \
opt_set(*opts, _name, v); \
break;
BCH_OPTS()
#undef BCH_OPT
default:
BUG();
}
@ -118,7 +127,6 @@ struct bch_opts bch2_opts_from_sb(struct bch_sb *sb)
#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \
if (_sb_opt != NO_SB_OPT) \
opt_set(opts, _name, _sb_opt(sb));
BCH_OPTS()
#undef BCH_OPT
@ -145,7 +153,7 @@ const struct bch_option bch2_opt_table[] = {
#undef BCH_OPT
};
static int bch2_opt_lookup(const char *name)
int bch2_opt_lookup(const char *name)
{
const struct bch_option *i;
@ -247,3 +255,52 @@ no_val:
pr_err("Mount option %s requires a value", name);
return -1;
}
/* io opts: */
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
{
struct bch_io_opts ret = { 0 };
#define BCH_INODE_OPT(_name, _bits) \
if (opt_defined(src, _name)) \
opt_set(ret, _name, src._name);
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
return ret;
}
struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src)
{
struct bch_opts ret = { 0 };
#define BCH_INODE_OPT(_name, _bits) \
if (opt_defined(src, _name)) \
opt_set(ret, _name, src._name);
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
return ret;
}
void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src)
{
#define BCH_INODE_OPT(_name, _bits) \
if (opt_defined(src, _name)) \
opt_set(*dst, _name, src._name);
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
}
bool bch2_opt_is_inode_opt(enum bch_opt_id id)
{
static const enum bch_opt_id inode_opt_list[] = {
#define BCH_INODE_OPT(_name, _bits) Opt_##_name,
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
};
unsigned i;
for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++)
if (inode_opt_list[i] == id)
return true;
return false;
}
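Together these helpers define the option layering used by the IO paths: filesystem-wide options are turned into a struct bch_io_opts baseline, then whatever the inode actually has set is applied on top. The bch2_move_data() hunk earlier in this commit uses exactly that pattern; a condensed sketch (inum is an illustrative variable, error handling elided):
struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
struct bch_inode_unpacked inode;

if (!bch2_inode_find_by_inum(c, inum, &inode))
	bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode));
/* opts: fs defaults, overridden by any per-inode settings */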

View File

@ -181,10 +181,7 @@ do { \
static inline struct bch_opts bch2_opts_empty(void)
{
struct bch_opts opts;
memset(&opts, 0, sizeof(opts));
return opts;
return (struct bch_opts) { 0 };
}
void bch2_opts_apply(struct bch_opts *, struct bch_opts);
@ -215,12 +212,35 @@ struct bch_option {
extern const struct bch_option bch2_opt_table[];
bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
struct bch_opts bch2_opts_from_sb(struct bch_sb *);
int bch2_opt_lookup(const char *);
int bch2_opt_parse(const struct bch_option *, const char *, u64 *);
int bch2_parse_mount_opts(struct bch_opts *, char *);
/* inode opts: */
#define BCH_INODE_OPTS() \
BCH_INODE_OPT(data_checksum, 8) \
BCH_INODE_OPT(compression, 8)
struct bch_io_opts {
#define BCH_INODE_OPT(_name, _bits) unsigned _name##_defined:1;
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
#define BCH_INODE_OPT(_name, _bits) u##_bits _name;
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
};
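For the two options listed above, the double expansion of BCH_INODE_OPTS() generates a defined bit plus a value field per option, so the struct is equivalent to:
struct bch_io_opts {
	unsigned	data_checksum_defined:1;
	unsigned	compression_defined:1;

	u8		data_checksum;
	u8		compression;
};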
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts);
void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts);
bool bch2_opt_is_inode_opt(enum bch_opt_id);
#endif /* _BCACHEFS_OPTS_H */

View File

@ -12,6 +12,8 @@
#include <linux/sort.h>
static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
struct bch_replicas_cpu *);
static const char *bch2_sb_validate_replicas(struct bch_sb *);
static inline void __bch2_sb_layout_size_assert(void)
@ -157,7 +159,7 @@ struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
return NULL;
f = __bch2_sb_field_resize(sb->sb, f, u64s);
f->type = type;
f->type = cpu_to_le32(type);
return f;
}
@ -188,7 +190,7 @@ struct bch_sb_field *bch2_fs_sb_field_resize(struct bch_fs *c,
}
f = __bch2_sb_field_resize(c->disk_sb, f, u64s);
f->type = type;
f->type = cpu_to_le32(type);
return f;
}
@ -354,7 +356,16 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
return "Invalid number of metadata replicas";
return "Invalid number of data replicas";
if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
return "Invalid metadata checksum type";
if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
return "Invalid metadata checksum type";
if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR)
return "Invalid compression type";
if (!BCH_SB_BTREE_NODE_SIZE(sb))
return "Btree node size not set";
@ -507,7 +518,7 @@ static void __copy_super(struct bch_sb *dst, struct bch_sb *src)
if (src_f->type == BCH_SB_FIELD_journal)
continue;
dst_f = bch2_sb_field_get(dst, src_f->type);
dst_f = bch2_sb_field_get(dst, le32_to_cpu(src_f->type));
dst_f = __bch2_sb_field_resize(dst, dst_f,
le32_to_cpu(src_f->u64s));
@ -601,7 +612,7 @@ reread:
/* XXX: verify MACs */
csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
(struct nonce) { 0 }, sb->sb);
null_nonce(), sb->sb);
if (bch2_crc_cmp(csum, sb->sb->csum))
return "bad checksum reading superblock";
@ -688,9 +699,9 @@ const char *bch2_read_super(const char *path,
got_super:
pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
le64_to_cpu(ret->sb->version),
le64_to_cpu(ret->sb->flags),
le64_to_cpu(ret->sb->flags[0]),
le64_to_cpu(ret->sb->seq),
le16_to_cpu(ret->sb->u64s));
le32_to_cpu(ret->sb->u64s));
err = "Superblock block size smaller than device block size";
if (le16_to_cpu(ret->sb->block_size) << 9 <
@ -711,7 +722,7 @@ static void write_super_endio(struct bio *bio)
/* XXX: return errors directly */
if (bch2_dev_io_err_on(bio->bi_error, ca, "superblock write"))
if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write"))
ca->sb_write_error = 1;
closure_put(&ca->fs->sb_write);
@ -727,7 +738,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
(struct nonce) { 0 }, sb);
null_nonce(), sb);
bio_reset(bio);
bio->bi_bdev = ca->disk_sb.bdev;
@ -830,7 +841,12 @@ out:
bch2_sb_update(c);
}
/* replica information: */
/* Replicas tracking - in memory: */
#define for_each_cpu_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
_i = (void *) (_i) + (_r)->entry_size)
static inline struct bch_replicas_cpu_entry *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
@ -838,6 +854,11 @@ cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
return (void *) r->entries + r->entry_size * i;
}
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
}
static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
unsigned dev)
{
@ -856,6 +877,246 @@ static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
}
static unsigned bkey_to_replicas(struct bkey_s_c_extent e,
enum bch_data_type data_type,
struct bch_replicas_cpu_entry *r,
unsigned *max_dev)
{
const struct bch_extent_ptr *ptr;
unsigned nr = 0;
BUG_ON(!data_type ||
data_type == BCH_DATA_SB ||
data_type >= BCH_DATA_NR);
memset(r, 0, sizeof(*r));
r->data_type = data_type;
*max_dev = 0;
extent_for_each_ptr(e, ptr)
if (!ptr->cached) {
*max_dev = max_t(unsigned, *max_dev, ptr->dev);
replicas_set_dev(r, ptr->dev);
nr++;
}
return nr;
}
static struct bch_replicas_cpu *
cpu_replicas_add_entry(struct bch_replicas_cpu *old,
struct bch_replicas_cpu_entry new_entry,
unsigned max_dev)
{
struct bch_replicas_cpu *new;
unsigned i, nr, entry_size;
entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
DIV_ROUND_UP(max_dev + 1, 8);
entry_size = max(entry_size, old->entry_size);
nr = old->nr + 1;
new = kzalloc(sizeof(struct bch_replicas_cpu) +
nr * entry_size, GFP_NOIO);
if (!new)
return NULL;
new->nr = nr;
new->entry_size = entry_size;
for (i = 0; i < old->nr; i++)
memcpy(cpu_replicas_entry(new, i),
cpu_replicas_entry(old, i),
min(new->entry_size, old->entry_size));
memcpy(cpu_replicas_entry(new, old->nr),
&new_entry,
new->entry_size);
bch2_cpu_replicas_sort(new);
return new;
}
static bool replicas_has_entry(struct bch_replicas_cpu *r,
struct bch_replicas_cpu_entry search,
unsigned max_dev)
{
return max_dev < replicas_dev_slots(r) &&
eytzinger0_find(r->entries, r->nr,
r->entry_size,
memcmp, &search) < r->nr;
}
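The in-memory replicas table is a flat array of fixed-size entries, each a data_type byte followed by a device bitmap sized for the largest device index seen so far; the array is kept in eytzinger order so replicas_has_entry() can search it with a plain memcmp, and cpu_replicas_add_entry() re-sorts after appending. A sizing sketch, assuming devs[] immediately follows the single data_type byte:
/*
 * e.g. with max_dev == 10:
 *	entry_size = offsetof(struct bch_replicas_cpu_entry, devs)
 *		   + DIV_ROUND_UP(10 + 1, 8)
 *		   = 1 + 2 = 3 bytes
 * i.e. one data_type byte plus a two-byte bitmap covering devices 0..15.
 */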
noinline
static int bch2_check_mark_super_slowpath(struct bch_fs *c,
struct bch_replicas_cpu_entry new_entry,
unsigned max_dev)
{
struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r;
int ret = -ENOMEM;
mutex_lock(&c->sb_lock);
old_gc = rcu_dereference_protected(c->replicas_gc,
lockdep_is_held(&c->sb_lock));
if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) {
new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev);
if (!new_gc)
goto err;
}
old_r = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
/* recheck, might have raced */
if (replicas_has_entry(old_r, new_entry, max_dev))
goto out;
new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev);
if (!new_r)
goto err;
ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
if (ret)
goto err;
if (new_gc) {
rcu_assign_pointer(c->replicas_gc, new_gc);
kfree_rcu(old_gc, rcu);
}
rcu_assign_pointer(c->replicas, new_r);
kfree_rcu(old_r, rcu);
bch2_write_super(c);
out:
ret = 0;
err:
mutex_unlock(&c->sb_lock);
return ret;
}
static inline int __bch2_check_mark_super(struct bch_fs *c,
struct bch_replicas_cpu_entry search,
unsigned max_dev)
{
struct bch_replicas_cpu *r, *gc_r;
bool marked;
rcu_read_lock();
r = rcu_dereference(c->replicas);
gc_r = rcu_dereference(c->replicas_gc);
marked = replicas_has_entry(r, search, max_dev) &&
(!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev));
rcu_read_unlock();
return likely(marked) ? 0
: bch2_check_mark_super_slowpath(c, search, max_dev);
}
int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
enum bch_data_type data_type)
{
struct bch_replicas_cpu_entry search;
unsigned max_dev;
if (!bkey_to_replicas(e, data_type, &search, &max_dev))
return 0;
return __bch2_check_mark_super(c, search, max_dev);
}
int bch2_check_mark_super_devlist(struct bch_fs *c,
struct bch_devs_list *devs,
enum bch_data_type data_type)
{
struct bch_replicas_cpu_entry search = { .data_type = data_type };
unsigned i, max_dev = 0;
if (!devs->nr)
return 0;
for (i = 0; i < devs->nr; i++) {
max_dev = max_t(unsigned, max_dev, devs->devs[i]);
replicas_set_dev(&search, devs->devs[i]);
}
return __bch2_check_mark_super(c, search, max_dev);
}
int bch2_replicas_gc_end(struct bch_fs *c, int err)
{
struct bch_replicas_cpu *new_r, *old_r;
int ret = 0;
lockdep_assert_held(&c->replicas_gc_lock);
mutex_lock(&c->sb_lock);
new_r = rcu_dereference_protected(c->replicas_gc,
lockdep_is_held(&c->sb_lock));
if (err) {
rcu_assign_pointer(c->replicas_gc, NULL);
kfree_rcu(new_r, rcu);
goto err;
}
if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
ret = -ENOSPC;
goto err;
}
old_r = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->replicas, new_r);
rcu_assign_pointer(c->replicas_gc, NULL);
kfree_rcu(old_r, rcu);
bch2_write_super(c);
err:
mutex_unlock(&c->sb_lock);
return ret;
}
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
struct bch_replicas_cpu *dst, *src;
struct bch_replicas_cpu_entry *e;
lockdep_assert_held(&c->replicas_gc_lock);
mutex_lock(&c->sb_lock);
BUG_ON(c->replicas_gc);
src = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
dst = kzalloc(sizeof(struct bch_replicas_cpu) +
src->nr * src->entry_size, GFP_NOIO);
if (!dst) {
mutex_unlock(&c->sb_lock);
return -ENOMEM;
}
dst->nr = 0;
dst->entry_size = src->entry_size;
for_each_cpu_replicas_entry(src, e)
if (!((1 << e->data_type) & typemask))
memcpy(cpu_replicas_entry(dst, dst->nr++),
e, dst->entry_size);
bch2_cpu_replicas_sort(dst);
rcu_assign_pointer(c->replicas_gc, dst);
mutex_unlock(&c->sb_lock);
return 0;
}
/* Replicas tracking - superblock: */
static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
unsigned *nr,
unsigned *bytes,
@ -914,10 +1175,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
}
}
eytzinger0_sort(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
memcmp, NULL);
bch2_cpu_replicas_sort(cpu_r);
return cpu_r;
}
@ -926,14 +1184,12 @@ static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_cpu *cpu_r, *old_r;
lockdep_assert_held(&c->sb_lock);
sb_r = bch2_sb_get_replicas(c->disk_sb);
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
if (!cpu_r)
return -ENOMEM;
old_r = c->replicas;
old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->replicas, cpu_r);
if (old_r)
kfree_rcu(old_r, rcu);
@ -941,192 +1197,133 @@ static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
return 0;
}
static void bkey_to_replicas(struct bkey_s_c_extent e,
enum bch_data_type data_type,
struct bch_replicas_cpu_entry *r,
unsigned *max_dev)
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
struct bch_replicas_cpu *r)
{
const struct bch_extent_ptr *ptr;
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_entry *sb_e;
struct bch_replicas_cpu_entry *e;
size_t i, bytes;
BUG_ON(!data_type ||
data_type == BCH_DATA_SB ||
data_type >= BCH_DATA_NR);
bytes = sizeof(struct bch_sb_field_replicas);
memset(r, 0, sizeof(*r));
r->data_type = data_type;
for_each_cpu_replicas_entry(r, e) {
bytes += sizeof(struct bch_replicas_entry);
for (i = 0; i < r->entry_size - 1; i++)
bytes += hweight8(e->devs[i]);
}
*max_dev = 0;
sb_r = bch2_fs_sb_resize_replicas(c,
DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
if (!sb_r)
return -ENOSPC;
extent_for_each_ptr(e, ptr)
if (!ptr->cached) {
*max_dev = max_t(unsigned, *max_dev, ptr->dev);
replicas_set_dev(r, ptr->dev);
}
}
memset(&sb_r->entries, 0,
vstruct_end(&sb_r->field) -
(void *) &sb_r->entries);
/*
* for when gc of replica information is in progress:
*/
static int bch2_update_gc_replicas(struct bch_fs *c,
struct bch_replicas_cpu *gc_r,
struct bkey_s_c_extent e,
enum bch_data_type data_type)
{
struct bch_replicas_cpu_entry new_e;
struct bch_replicas_cpu *new;
unsigned i, nr, entry_size, max_dev;
sb_e = sb_r->entries;
for_each_cpu_replicas_entry(r, e) {
sb_e->data_type = e->data_type;
bkey_to_replicas(e, data_type, &new_e, &max_dev);
for (i = 0; i < replicas_dev_slots(r); i++)
if (replicas_test_dev(e, i))
sb_e->devs[sb_e->nr++] = i;
entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
DIV_ROUND_UP(max_dev + 1, 8);
entry_size = max(entry_size, gc_r->entry_size);
nr = gc_r->nr + 1;
sb_e = replicas_entry_next(sb_e);
new = kzalloc(sizeof(struct bch_replicas_cpu) +
nr * entry_size, GFP_NOIO);
if (!new)
return -ENOMEM;
BUG_ON((void *) sb_e > vstruct_end(&sb_r->field));
}
new->nr = nr;
new->entry_size = entry_size;
for (i = 0; i < gc_r->nr; i++)
memcpy(cpu_replicas_entry(new, i),
cpu_replicas_entry(gc_r, i),
gc_r->entry_size);
memcpy(cpu_replicas_entry(new, nr - 1),
&new_e,
new->entry_size);
eytzinger0_sort(new->entries,
new->nr,
new->entry_size,
memcmp, NULL);
rcu_assign_pointer(c->replicas_gc, new);
kfree_rcu(gc_r, rcu);
return 0;
}
static bool replicas_has_extent(struct bch_replicas_cpu *r,
struct bkey_s_c_extent e,
enum bch_data_type data_type)
static const char *bch2_sb_validate_replicas(struct bch_sb *sb)
{
struct bch_replicas_cpu_entry search;
unsigned max_dev;
struct bch_sb_field_members *mi;
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_cpu *cpu_r = NULL;
struct bch_replicas_entry *e;
const char *err;
unsigned i;
bkey_to_replicas(e, data_type, &search, &max_dev);
mi = bch2_sb_get_members(sb);
sb_r = bch2_sb_get_replicas(sb);
if (!sb_r)
return NULL;
return max_dev < replicas_dev_slots(r) &&
eytzinger0_find(r->entries, r->nr,
r->entry_size,
memcmp, &search) < r->nr;
for_each_replicas_entry(sb_r, e) {
err = "invalid replicas entry: invalid data type";
if (e->data_type >= BCH_DATA_NR)
goto err;
err = "invalid replicas entry: no devices";
if (!e->nr)
goto err;
err = "invalid replicas entry: too many devices";
if (e->nr >= BCH_REPLICAS_MAX)
goto err;
err = "invalid replicas entry: invalid device";
for (i = 0; i < e->nr; i++)
if (!bch2_dev_exists(sb, mi, e->devs[i]))
goto err;
}
err = "cannot allocate memory";
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
if (!cpu_r)
goto err;
sort_cmp_size(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
memcmp, NULL);
for (i = 0; i + 1 < cpu_r->nr; i++) {
struct bch_replicas_cpu_entry *l =
cpu_replicas_entry(cpu_r, i);
struct bch_replicas_cpu_entry *r =
cpu_replicas_entry(cpu_r, i + 1);
BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
err = "duplicate replicas entry";
if (!memcmp(l, r, cpu_r->entry_size))
goto err;
}
err = NULL;
err:
kfree(cpu_r);
return err;
}
/* Query replicas: */
bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e,
enum bch_data_type data_type)
{
struct bch_replicas_cpu_entry search;
unsigned max_dev;
bool ret;
if (!bkey_to_replicas(e, data_type, &search, &max_dev))
return true;
rcu_read_lock();
ret = replicas_has_extent(rcu_dereference(c->replicas),
e, data_type);
ret = replicas_has_entry(rcu_dereference(c->replicas),
search, max_dev);
rcu_read_unlock();
return ret;
}
noinline
static int bch2_check_mark_super_slowpath(struct bch_fs *c,
struct bkey_s_c_extent e,
enum bch_data_type data_type)
{
struct bch_replicas_cpu *gc_r;
const struct bch_extent_ptr *ptr;
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_entry *new_entry;
unsigned new_entry_bytes, new_u64s, nr, bytes, max_dev;
int ret = 0;
mutex_lock(&c->sb_lock);
gc_r = rcu_dereference_protected(c->replicas_gc,
lockdep_is_held(&c->sb_lock));
if (gc_r &&
!replicas_has_extent(gc_r, e, data_type)) {
ret = bch2_update_gc_replicas(c, gc_r, e, data_type);
if (ret)
goto err;
}
/* recheck, might have raced */
if (bch2_sb_has_replicas(c, e, data_type)) {
mutex_unlock(&c->sb_lock);
return 0;
}
new_entry_bytes = sizeof(struct bch_replicas_entry) +
bch2_extent_nr_dirty_ptrs(e.s_c);
sb_r = bch2_sb_get_replicas(c->disk_sb);
bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
new_u64s = DIV_ROUND_UP(bytes + new_entry_bytes, sizeof(u64));
sb_r = bch2_fs_sb_resize_replicas(c,
DIV_ROUND_UP(sizeof(*sb_r) + bytes + new_entry_bytes,
sizeof(u64)));
if (!sb_r) {
ret = -ENOSPC;
goto err;
}
new_entry = (void *) sb_r + bytes;
new_entry->data_type = data_type;
new_entry->nr = 0;
extent_for_each_ptr(e, ptr)
if (!ptr->cached)
new_entry->devs[new_entry->nr++] = ptr->dev;
ret = bch2_sb_replicas_to_cpu_replicas(c);
if (ret) {
memset(new_entry, 0,
vstruct_end(&sb_r->field) - (void *) new_entry);
goto err;
}
bch2_write_super(c);
err:
mutex_unlock(&c->sb_lock);
return ret;
}
int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
enum bch_data_type data_type)
{
struct bch_replicas_cpu *gc_r;
bool marked;
rcu_read_lock();
marked = replicas_has_extent(rcu_dereference(c->replicas),
e, data_type) &&
(!(gc_r = rcu_dereference(c->replicas_gc)) ||
replicas_has_extent(gc_r, e, data_type));
rcu_read_unlock();
if (marked)
return 0;
return bch2_check_mark_super_slowpath(c, e, data_type);
}
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
struct bch_devs_mask online_devs)
struct bch_devs_mask online_devs)
{
struct bch_sb_field_members *mi;
struct bch_replicas_cpu_entry *e;
struct bch_replicas_cpu *r;
unsigned i, dev, dev_slots, nr_online, nr_offline;
@ -1137,14 +1334,15 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c,
for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
ret.replicas[i].nr_online = UINT_MAX;
mi = bch2_sb_get_members(c->disk_sb);
rcu_read_lock();
r = rcu_dereference(c->replicas);
dev_slots = min_t(unsigned, replicas_dev_slots(r), c->sb.nr_devices);
dev_slots = replicas_dev_slots(r);
for (i = 0; i < r->nr; i++) {
e = cpu_replicas_entry(r, i);
BUG_ON(e->data_type >= ARRAY_SIZE(ret.replicas));
for_each_cpu_replicas_entry(r, e) {
if (e->data_type >= ARRAY_SIZE(ret.replicas))
panic("e %p data_type %u\n", e, e->data_type);
nr_online = nr_offline = 0;
@ -1152,6 +1350,8 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c,
if (!replicas_test_dev(e, dev))
continue;
BUG_ON(!bch2_dev_exists(c->disk_sb, mi, dev));
if (test_bit(dev, online_devs.d))
nr_online++;
else
@ -1216,7 +1416,7 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
struct bch_replicas_cpu_entry *e;
struct bch_replicas_cpu *r;
unsigned i, ret = 0;
unsigned ret = 0;
rcu_read_lock();
r = rcu_dereference(c->replicas);
@ -1224,191 +1424,13 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
if (ca->dev_idx >= replicas_dev_slots(r))
goto out;
for (i = 0; i < r->nr; i++) {
e = cpu_replicas_entry(r, i);
for_each_cpu_replicas_entry(r, e)
if (replicas_test_dev(e, ca->dev_idx)) {
ret |= 1 << e->data_type;
break;
}
}
out:
rcu_read_unlock();
return ret;
}
static const char *bch2_sb_validate_replicas(struct bch_sb *sb)
{
struct bch_sb_field_members *mi;
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_cpu *cpu_r = NULL;
struct bch_replicas_entry *e;
const char *err;
unsigned i;
mi = bch2_sb_get_members(sb);
sb_r = bch2_sb_get_replicas(sb);
if (!sb_r)
return NULL;
for_each_replicas_entry(sb_r, e) {
err = "invalid replicas entry: invalid data type";
if (e->data_type >= BCH_DATA_NR)
goto err;
err = "invalid replicas entry: too many devices";
if (e->nr >= BCH_REPLICAS_MAX)
goto err;
err = "invalid replicas entry: invalid device";
for (i = 0; i < e->nr; i++)
if (!bch2_dev_exists(sb, mi, e->devs[i]))
goto err;
}
err = "cannot allocate memory";
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
if (!cpu_r)
goto err;
sort_cmp_size(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
memcmp, NULL);
for (i = 0; i + 1 < cpu_r->nr; i++) {
struct bch_replicas_cpu_entry *l =
cpu_replicas_entry(cpu_r, i);
struct bch_replicas_cpu_entry *r =
cpu_replicas_entry(cpu_r, i + 1);
BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
err = "duplicate replicas entry";
if (!memcmp(l, r, cpu_r->entry_size))
goto err;
}
err = NULL;
err:
kfree(cpu_r);
return err;
}
int bch2_replicas_gc_end(struct bch_fs *c, int err)
{
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_cpu *r, *old_r;
struct bch_replicas_entry *dst_e;
size_t i, j, bytes, dev_slots;
int ret = 0;
lockdep_assert_held(&c->replicas_gc_lock);
mutex_lock(&c->sb_lock);
r = rcu_dereference_protected(c->replicas_gc,
lockdep_is_held(&c->sb_lock));
if (err) {
rcu_assign_pointer(c->replicas_gc, NULL);
kfree_rcu(r, rcu);
goto err;
}
dev_slots = replicas_dev_slots(r);
bytes = sizeof(struct bch_sb_field_replicas);
for (i = 0; i < r->nr; i++) {
struct bch_replicas_cpu_entry *e =
cpu_replicas_entry(r, i);
bytes += sizeof(struct bch_replicas_entry);
for (j = 0; j < r->entry_size - 1; j++)
bytes += hweight8(e->devs[j]);
}
sb_r = bch2_fs_sb_resize_replicas(c,
DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
if (!sb_r) {
ret = -ENOSPC;
goto err;
}
memset(&sb_r->entries, 0,
vstruct_end(&sb_r->field) -
(void *) &sb_r->entries);
dst_e = sb_r->entries;
for (i = 0; i < r->nr; i++) {
struct bch_replicas_cpu_entry *src_e =
cpu_replicas_entry(r, i);
dst_e->data_type = src_e->data_type;
for (j = 0; j < dev_slots; j++)
if (replicas_test_dev(src_e, j))
dst_e->devs[dst_e->nr++] = j;
dst_e = replicas_entry_next(dst_e);
}
old_r = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->replicas, r);
rcu_assign_pointer(c->replicas_gc, NULL);
kfree_rcu(old_r, rcu);
bch2_write_super(c);
err:
mutex_unlock(&c->sb_lock);
return ret;
}
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
struct bch_replicas_cpu *r, *src;
unsigned i;
lockdep_assert_held(&c->replicas_gc_lock);
mutex_lock(&c->sb_lock);
BUG_ON(c->replicas_gc);
src = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
r = kzalloc(sizeof(struct bch_replicas_cpu) +
src->nr * src->entry_size, GFP_NOIO);
if (!r) {
mutex_unlock(&c->sb_lock);
return -ENOMEM;
}
r->entry_size = src->entry_size;
r->nr = 0;
for (i = 0; i < src->nr; i++) {
struct bch_replicas_cpu_entry *dst_e =
cpu_replicas_entry(r, r->nr);
struct bch_replicas_cpu_entry *src_e =
cpu_replicas_entry(src, i);
if (!(src_e->data_type & typemask)) {
memcpy(dst_e, src_e, r->entry_size);
r->nr++;
}
}
eytzinger0_sort(r->entries,
r->nr,
r->entry_size,
memcmp, NULL);
rcu_assign_pointer(c->replicas_gc, r);
mutex_unlock(&c->sb_lock);
return 0;
}

View File

@ -125,23 +125,12 @@ void bch2_write_super(struct bch_fs *);
/* replicas: */
/* iterate over bch_sb_field_replicas: */
static inline struct bch_replicas_entry *
replicas_entry_next(struct bch_replicas_entry *i)
{
return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
}
#define for_each_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
(_i) = replicas_entry_next(_i))
bool bch2_sb_has_replicas(struct bch_fs *, struct bkey_s_c_extent,
enum bch_data_type);
int bch2_check_mark_super(struct bch_fs *, struct bkey_s_c_extent,
enum bch_data_type);
int bch2_check_mark_super_devlist(struct bch_fs *, struct bch_devs_list *,
enum bch_data_type);
struct replicas_status {
struct {
@ -161,4 +150,17 @@ unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
int bch2_replicas_gc_end(struct bch_fs *, int);
int bch2_replicas_gc_start(struct bch_fs *, unsigned);
/* iterate over superblock replicas - used by userspace tools: */
static inline struct bch_replicas_entry *
replicas_entry_next(struct bch_replicas_entry *i)
{
return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
}
#define for_each_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
(_i) = replicas_entry_next(_i))
#endif /* _BCACHEFS_SUPER_IO_H */
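The replicas_entry_next()/for_each_replicas_entry() helpers above walk the variable-length entries packed into the replicas superblock field. A minimal sketch of how a userspace tool might consume them, assuming the bcachefs headers for the struct definitions; the dump helper itself is illustrative, not part of this patch:
#include <stdio.h>
static void dump_replicas(struct bch_sb_field_replicas *r)
{
	struct bch_replicas_entry *e;
	unsigned i;
	/* each entry is data_type, nr, then nr device indexes */
	for_each_replicas_entry(r, e) {
		printf("type %u:", e->data_type);
		for (i = 0; i < e->nr; i++)
			printf(" dev %u", e->devs[i]);
		printf("\n");
	}
}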

View File

@ -140,8 +140,9 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid)
return c;
}
int bch2_congested(struct bch_fs *c, int bdi_bits)
int bch2_congested(void *data, int bdi_bits)
{
struct bch_fs *c = data;
struct backing_dev_info *bdi;
struct bch_dev *ca;
unsigned i;
@ -178,13 +179,6 @@ int bch2_congested(struct bch_fs *c, int bdi_bits)
return ret;
}
static int bch2_congested_fn(void *data, int bdi_bits)
{
struct bch_fs *c = data;
return bch2_congested(c, bdi_bits);
}
/* Filesystem RO/RW: */
/*
@ -218,7 +212,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
* Flush journal before stopping allocators, because flushing journal
* blacklist entries involves allocating new btree nodes:
*/
bch2_journal_flush_pins(&c->journal, U64_MAX);
bch2_journal_flush_all_pins(&c->journal);
if (!bch2_journal_error(&c->journal))
bch2_btree_verify_flushed(c);
@ -379,8 +373,6 @@ static void bch2_fs_free(struct bch_fs *c)
bch2_io_clock_exit(&c->io_clock[WRITE]);
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
if (c->bdi.bdi_list.next)
bdi_destroy(&c->bdi);
lg_lock_free(&c->usage_lock);
free_percpu(c->usage_percpu);
mempool_exit(&c->btree_bounce_pool);
@ -393,7 +385,7 @@ static void bch2_fs_free(struct bch_fs *c)
mempool_exit(&c->btree_reserve_pool);
mempool_exit(&c->fill_iter);
percpu_ref_exit(&c->writes);
kfree(c->replicas);
kfree(rcu_dereference_protected(c->replicas, 1));
if (c->copygc_wq)
destroy_workqueue(c->copygc_wq);
@ -414,7 +406,7 @@ static void bch2_fs_exit(struct bch_fs *c)
for (i = 0; i < c->sb.nr_devices; i++)
if (c->devs[i])
bch2_dev_free(c->devs[i]);
bch2_dev_free(rcu_dereference_protected(c->devs[i], 1));
closure_debug_destroy(&c->cl);
kobject_put(&c->kobj);
@ -576,10 +568,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
sizeof(struct btree_update)) ||
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
bioset_init(&c->btree_read_bio, 1,
offsetof(struct btree_read_bio, bio)) ||
bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio)) ||
bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio)) ||
bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio)) ||
offsetof(struct btree_read_bio, bio),
BIOSET_NEED_BVECS) ||
bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
BIOSET_NEED_BVECS) ||
bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
BIOSET_NEED_BVECS) ||
bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
BIOSET_NEED_BVECS) ||
mempool_init_page_pool(&c->bio_bounce_pages,
max_t(unsigned,
c->opts.btree_node_size,
@ -588,7 +584,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
!(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
lg_lock_init(&c->usage_lock) ||
mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) ||
bdi_setup_and_register(&c->bdi, "bcachefs") ||
bch2_io_clock_init(&c->io_clock[READ]) ||
bch2_io_clock_init(&c->io_clock[WRITE]) ||
bch2_fs_journal_init(&c->journal) ||
@ -599,10 +594,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_fsio_init(c))
goto err;
c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
c->bdi.congested_fn = bch2_congested_fn;
c->bdi.congested_data = c;
mi = bch2_sb_get_members(c->disk_sb);
for (i = 0; i < c->sb.nr_devices; i++)
if (bch2_dev_exists(c->disk_sb, mi, i) &&
@ -729,8 +720,12 @@ static const char *__bch2_fs_start(struct bch_fs *c)
continue;
err = "error reading btree root";
if (bch2_btree_root_read(c, i, k, level))
goto err;
if (bch2_btree_root_read(c, i, k, level)) {
if (i != BTREE_ID_ALLOC)
goto err;
mustfix_fsck_err(c, "error reading btree root");
}
}
err = "error reading allocation information";
@ -830,7 +825,7 @@ static const char *__bch2_fs_start(struct bch_fs *c)
closure_sync(&cl);
bch2_inode_init(c, &inode, 0, 0,
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
inode.bi_inum = BCACHEFS_ROOT_INO;
bch2_inode_pack(&packed_inode, &inode);
@ -877,6 +872,7 @@ out:
bch2_journal_entries_free(&journal);
return err;
err:
fsck_err:
closure_sync(&cl);
switch (ret) {
@ -995,24 +991,20 @@ static void bch2_dev_free(struct bch_dev *ca)
kobject_put(&ca->kobj);
}
static void bch2_dev_io_ref_release(struct percpu_ref *ref)
{
struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
complete(&ca->offline_complete);
}
static void __bch2_dev_offline(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
lockdep_assert_held(&c->state_lock);
if (percpu_ref_is_zero(&ca->io_ref))
return;
__bch2_dev_read_only(c, ca);
reinit_completion(&ca->offline_complete);
reinit_completion(&ca->io_ref_completion);
percpu_ref_kill(&ca->io_ref);
wait_for_completion(&ca->offline_complete);
wait_for_completion(&ca->io_ref_completion);
if (ca->kobj.state_in_sysfs) {
struct kobject *block =
@ -1026,27 +1018,18 @@ static void __bch2_dev_offline(struct bch_dev *ca)
bch2_dev_journal_exit(ca);
}
static void bch2_dev_ref_release(struct percpu_ref *ref)
static void bch2_dev_ref_complete(struct percpu_ref *ref)
{
struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
complete(&ca->stop_complete);
complete(&ca->ref_completion);
}
static void bch2_dev_stop(struct bch_dev *ca)
static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
{
struct bch_fs *c = ca->fs;
struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
lockdep_assert_held(&c->state_lock);
BUG_ON(rcu_access_pointer(c->devs[ca->dev_idx]) != ca);
rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
synchronize_rcu();
reinit_completion(&ca->stop_complete);
percpu_ref_kill(&ca->ref);
wait_for_completion(&ca->stop_complete);
complete(&ca->io_ref_completion);
}
static int bch2_dev_sysfs_online(struct bch_dev *ca)
@ -1095,8 +1078,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
return -ENOMEM;
kobject_init(&ca->kobj, &bch2_dev_ktype);
init_completion(&ca->stop_complete);
init_completion(&ca->offline_complete);
init_completion(&ca->ref_completion);
init_completion(&ca->io_ref_completion);
ca->dev_idx = dev_idx;
__set_bit(ca->dev_idx, ca->self.d);
@ -1132,9 +1115,9 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
DIV_ROUND_UP(BTREE_NODE_RESERVE,
ca->mi.bucket_size / c->opts.btree_node_size);
if (percpu_ref_init(&ca->ref, bch2_dev_ref_release,
if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
0, GFP_KERNEL) ||
percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_release,
percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_BTREE], btree_node_reserve_buckets,
GFP_KERNEL) ||
@ -1155,7 +1138,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
GFP_KERNEL|__GFP_ZERO)) ||
!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) ||
bioset_init(&ca->replica_set, 4,
offsetof(struct bch_write_bio, bio)) ||
offsetof(struct bch_write_bio, bio), 0) ||
!(ca->io_done = alloc_percpu(*ca->io_done)))
goto err;
@ -1180,8 +1163,6 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
struct bch_dev *ca;
int ret;
lockdep_assert_held(&c->sb_lock);
if (le64_to_cpu(sb->sb->seq) >
le64_to_cpu(c->disk_sb->seq))
bch2_sb_to_fs(c, sb->sb);
@ -1189,13 +1170,15 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
!c->devs[sb->sb->dev_idx]);
ca = c->devs[sb->sb->dev_idx];
ca = bch_dev_locked(c, sb->sb->dev_idx);
if (ca->disk_sb.bdev) {
bch_err(c, "already have device online in slot %u",
sb->sb->dev_idx);
return -EINVAL;
}
BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
ret = bch2_dev_journal_init(ca, sb->sb);
if (ret)
return ret;
@ -1222,7 +1205,7 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
if (bch2_dev_sysfs_online(ca))
pr_warn("error creating sysfs objects");
bch2_mark_dev_superblock(c, ca, 0);
bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
if (ca->mi.state == BCH_MEMBER_STATE_RW)
bch2_dev_allocator_add(c, ca);
@ -1293,6 +1276,7 @@ static bool bch2_fs_may_start(struct bch_fs *c)
{
struct replicas_status s;
struct bch_sb_field_members *mi;
struct bch_dev *ca;
unsigned i, flags = c->opts.degraded
? BCH_FORCE_IF_DEGRADED
: 0;
@ -1301,14 +1285,19 @@ static bool bch2_fs_may_start(struct bch_fs *c)
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb);
for (i = 0; i < c->disk_sb->nr_devices; i++)
if (bch2_dev_exists(c->disk_sb, mi, i) &&
!bch2_dev_is_online(c->devs[i]) &&
(c->devs[i]->mi.state == BCH_MEMBER_STATE_RW ||
c->devs[i]->mi.state == BCH_MEMBER_STATE_RO)) {
for (i = 0; i < c->disk_sb->nr_devices; i++) {
if (!bch2_dev_exists(c->disk_sb, mi, i))
continue;
ca = bch_dev_locked(c, i);
if (!bch2_dev_is_online(ca) &&
(ca->mi.state == BCH_MEMBER_STATE_RW ||
ca->mi.state == BCH_MEMBER_STATE_RO)) {
mutex_unlock(&c->sb_lock);
return false;
}
}
mutex_unlock(&c->sb_lock);
}
@ -1419,22 +1408,59 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
*
* flag_data_bad() does not check btree pointers
*/
ret = bch2_flag_data_bad(ca);
ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
if (ret) {
bch_err(ca, "Remove failed");
bch_err(ca, "Remove failed: error %i dropping data", ret);
goto err;
}
ret = bch2_journal_flush_device(&c->journal, ca->dev_idx);
if (ret) {
bch_err(ca, "Remove failed: error %i flushing journal", ret);
goto err;
}
data = bch2_dev_has_data(c, ca);
if (data) {
bch_err(ca, "Remove failed, still has data (%x)", data);
char data_has_str[100];
bch2_scnprint_flag_list(data_has_str,
sizeof(data_has_str),
bch2_data_types,
data);
bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
ret = -EBUSY;
goto err;
}
bch2_journal_meta(&c->journal);
ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC,
POS(ca->dev_idx, 0),
POS(ca->dev_idx + 1, 0),
ZERO_VERSION,
NULL, NULL, NULL);
if (ret) {
bch_err(ca, "Remove failed, error deleting alloc info");
goto err;
}
/*
* must flush all existing journal entries, they might have
* (overwritten) keys that point to the device we're removing:
*/
ret = bch2_journal_flush_all_pins(&c->journal);
if (ret) {
bch_err(ca, "Remove failed, journal error");
goto err;
}
__bch2_dev_offline(ca);
bch2_dev_stop(ca);
mutex_lock(&c->sb_lock);
rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
mutex_unlock(&c->sb_lock);
percpu_ref_kill(&ca->ref);
wait_for_completion(&ca->ref_completion);
bch2_dev_free(ca);
/*
@ -1542,7 +1568,7 @@ have_slot:
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
ca = c->devs[dev_idx];
ca = bch_dev_locked(c, dev_idx);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = "journal alloc failed";
if (bch2_dev_journal_alloc(ca))
@ -1568,7 +1594,7 @@ err:
/* Hot add existing device to running filesystem: */
int bch2_dev_online(struct bch_fs *c, const char *path)
{
struct bch_sb_handle sb = { 0 };
struct bch_sb_handle sb = { NULL };
struct bch_dev *ca;
unsigned dev_idx;
const char *err;
@ -1593,7 +1619,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
}
mutex_unlock(&c->sb_lock);
ca = c->devs[dev_idx];
ca = bch_dev_locked(c, dev_idx);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = __bch2_dev_read_write(c, ca);
if (err)
@ -1619,7 +1645,6 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
return -EINVAL;
}
__bch2_dev_read_only(c, ca);
__bch2_dev_offline(ca);
mutex_unlock(&c->state_lock);
@ -1629,37 +1654,31 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
{
unsigned data;
int ret;
int ret = 0;
mutex_lock(&c->state_lock);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
bch_err(ca, "Cannot migrate data off RW device");
mutex_unlock(&c->state_lock);
return -EINVAL;
ret = -EINVAL;
goto err;
}
mutex_unlock(&c->state_lock);
ret = bch2_move_data_off_device(ca);
ret = bch2_dev_data_migrate(c, ca, 0);
if (ret) {
bch_err(ca, "Error migrating data: %i", ret);
return ret;
}
ret = bch2_move_metadata_off_device(ca);
if (ret) {
bch_err(ca, "Error migrating metadata: %i", ret);
return ret;
goto err;
}
data = bch2_dev_has_data(c, ca);
if (data) {
bch_err(ca, "Migrate error: data still present (%x)", data);
return -EINVAL;
ret = -EINVAL;
goto err;
}
return 0;
err:
mutex_unlock(&c->state_lock);
return ret;
}
/* Filesystem open: */

View File

@ -59,6 +59,14 @@ static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
}
}
static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
unsigned dev)
{
BUG_ON(bch2_dev_list_has_dev(*devs, dev));
BUG_ON(devs->nr >= BCH_REPLICAS_MAX);
devs->devs[devs->nr++] = dev;
}
static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
struct bch_devs_mask *mask)
{
@ -131,6 +139,26 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
__for_each_online_member(ca, c, iter, \
(1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO))
/*
* If a key exists that references a device, the device won't be going away and
* we can omit rcu_read_lock():
*/
static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
{
EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
return rcu_dereference_check(c->devs[idx], 1);
}
static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
{
EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
return rcu_dereference_protected(c->devs[idx],
lockdep_is_held(&c->sb_lock) ||
lockdep_is_held(&c->state_lock));
}
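The two accessors above encode different lifetime guarantees: bch_dev_bkey_exists() relies on an existing key pinning the device, bch_dev_locked() on sb_lock or state_lock being held. A hedged sketch of the first pattern, using only names that appear in this patch (the helper itself is hypothetical):
static unsigned extent_max_tier(struct bch_fs *c, struct bkey_s_c_extent e)
{
	const struct bch_extent_ptr *ptr;
	unsigned max_tier = 0;
	/* the key's pointers pin the devices, so no rcu_read_lock() needed */
	extent_for_each_ptr(e, ptr)
		max_tier = max_t(unsigned, max_tier,
				 bch_dev_bkey_exists(c, ptr->dev)->mi.tier);
	return max_tier;
}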
/* XXX kill, move to struct bch_fs */
static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
{
@ -146,7 +174,7 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
struct bch_fs *bch2_bdev_to_fs(struct block_device *);
struct bch_fs *bch2_uuid_to_fs(uuid_le);
int bch2_congested(struct bch_fs *, int);
int bch2_congested(void *, int);
bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
enum bch_member_state, int);

View File

@ -739,7 +739,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
c->open_buckets_wait.list.first ? "waiting" : "empty");
}
const char * const bch2_rw[] = {
static const char * const bch2_rw[] = {
"read",
"write",
NULL

View File

@ -6,7 +6,6 @@
#include "clock.h"
#include "extents.h"
#include "io.h"
#include "keylist.h"
#include "move.h"
#include "super-io.h"
#include "tier.h"
@ -28,7 +27,7 @@ static bool tiering_pred(void *arg, struct bkey_s_c_extent e)
return false;
extent_for_each_ptr(e, ptr)
if (c->devs[ptr->dev]->mi.tier >= tier->idx)
if (bch_dev_bkey_exists(c, ptr->dev)->mi.tier >= tier->idx)
replicas++;
return replicas < c->opts.data_replicas;

View File

@ -34,8 +34,12 @@ struct closure;
#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0)
#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0)
#define memcpy(_dst, _src, _len) \
#define memcpy(dst, src, len) \
({ \
void *_dst = (dst); \
const void *_src = (src); \
size_t _len = (len); \
\
BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \
(void *) (_dst) + (_len) <= (void *) (_src))); \
memcpy(_dst, _src, _len); \

View File

@ -9,10 +9,10 @@
*/
#define __vstruct_u64s(_s) \
({ \
( type_is((_s)->u64s, u64) ? le64_to_cpu((_s)->u64s) \
: type_is((_s)->u64s, u32) ? le32_to_cpu((_s)->u64s) \
: type_is((_s)->u64s, u16) ? le16_to_cpu((_s)->u64s) \
: ((_s)->u64s)); \
( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \
: type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \
: type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s) \
: ((__force u8) ((_s)->u64s))); \
})
#define __vstruct_bytes(_type, _u64s) \

View File

@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_update.h"
#include "compress.h"
#include "extents.h"
#include "fs.h"
#include "str_hash.h"
@ -358,6 +359,129 @@ static const struct xattr_handler bch_xattr_security_handler = {
.flags = BCH_XATTR_INDEX_SECURITY,
};
#ifndef NO_BCACHEFS_FS
static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
struct dentry *dentry, struct inode *vinode,
const char *name, void *buffer, size_t size)
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_opts opts =
bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode));
const struct bch_option *opt;
int ret, id;
u64 v;
id = bch2_opt_lookup(name);
if (id < 0 || !bch2_opt_is_inode_opt(id))
return -EINVAL;
opt = bch2_opt_table + id;
if (!bch2_opt_defined_by_id(&opts, id))
return -ENODATA;
v = bch2_opt_get_by_id(&opts, id);
if (opt->type == BCH_OPT_STR)
ret = snprintf(buffer, size, "%s", opt->choices[v]);
else
ret = snprintf(buffer, size, "%llu", v);
return ret <= size || !buffer ? ret : -ERANGE;
}
struct inode_opt_set {
int id;
u64 v;
bool defined;
};
static int inode_opt_set_fn(struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
void *p)
{
struct inode_opt_set *s = p;
if (s->defined)
bch2_inode_opt_set(bi, s->id, s->v);
else
bch2_inode_opt_clear(bi, s->id);
return 0;
}
static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
struct dentry *dentry, struct inode *vinode,
const char *name, const void *value,
size_t size, int flags)
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
const struct bch_option *opt;
char *buf;
struct inode_opt_set s;
int ret;
s.id = bch2_opt_lookup(name);
if (s.id < 0 || !bch2_opt_is_inode_opt(s.id))
return -EINVAL;
opt = bch2_opt_table + s.id;
if (value) {
buf = kmalloc(size + 1, GFP_KERNEL);
if (!buf)
return -ENOMEM;
memcpy(buf, value, size);
buf[size] = '\0';
ret = bch2_opt_parse(opt, buf, &s.v);
kfree(buf);
if (ret < 0)
return ret;
if (s.id == Opt_compression) {
mutex_lock(&c->sb_lock);
ret = bch2_check_set_has_compressed_data(c, s.v);
mutex_unlock(&c->sb_lock);
if (ret)
return ret;
}
s.defined = true;
} else {
s.defined = false;
}
mutex_lock(&inode->ei_update_lock);
ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s);
mutex_unlock(&inode->ei_update_lock);
return ret;
}
static const struct xattr_handler bch_xattr_bcachefs_handler = {
.prefix = "bcachefs.",
.get = bch2_xattr_bcachefs_get,
.set = bch2_xattr_bcachefs_set,
};
#endif /* NO_BCACHEFS_FS */
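The "bcachefs." handler above exposes per-inode options through the normal xattr syscalls. A hypothetical userspace caller might look like the sketch below; the option name is an example (any name accepted by bch2_opt_lookup() works), and clearing an option is done with removexattr(), which reaches inode_opt_set_fn() with the option undefined:
#include <string.h>
#include <sys/xattr.h>
/* the value string is parsed by bch2_opt_parse() on the kernel side */
static int set_compression(const char *path, const char *algo)
{
	return setxattr(path, "bcachefs.compression", algo, strlen(algo), 0);
}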
const struct xattr_handler *bch2_xattr_handlers[] = {
&bch_xattr_user_handler,
&posix_acl_access_xattr_handler,
&posix_acl_default_xattr_handler,
&bch_xattr_trusted_handler,
&bch_xattr_security_handler,
#ifndef NO_BCACHEFS_FS
&bch_xattr_bcachefs_handler,
#endif
NULL
};
static const struct xattr_handler *bch_xattr_handler_map[] = {
[BCH_XATTR_INDEX_USER] = &bch_xattr_user_handler,
[BCH_XATTR_INDEX_POSIX_ACL_ACCESS] =
@ -368,15 +492,6 @@ static const struct xattr_handler *bch_xattr_handler_map[] = {
[BCH_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler,
};
const struct xattr_handler *bch2_xattr_handlers[] = {
&bch_xattr_user_handler,
&posix_acl_access_xattr_handler,
&posix_acl_default_xattr_handler,
&bch_xattr_trusted_handler,
&bch_xattr_security_handler,
NULL
};
static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type)
{
return type < ARRAY_SIZE(bch_xattr_handler_map)

View File

@ -19,7 +19,38 @@
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/export.h>
static const struct {
int err;
const char *name;
} blk_errors[] = {
[BLK_STS_OK] = { 0, "" },
[BLK_STS_NOTSUPP] = { -EOPNOTSUPP, "operation not supported" },
[BLK_STS_TIMEOUT] = { -ETIMEDOUT, "timeout" },
[BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" },
[BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" },
[BLK_STS_TARGET] = { -EREMOTEIO, "critical target" },
[BLK_STS_NEXUS] = { -EBADE, "critical nexus" },
[BLK_STS_MEDIUM] = { -ENODATA, "critical medium" },
[BLK_STS_PROTECTION] = { -EILSEQ, "protection" },
[BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" },
[BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" },
/* device mapper special case, should not leak out: */
[BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" },
/* everything else not covered above: */
[BLK_STS_IOERR] = { -EIO, "I/O" },
};
int blk_status_to_errno(blk_status_t status)
{
int idx = (__force int)status;
if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
return -EIO;
return blk_errors[idx].err;
}
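A minimal sketch of a completion handler consuming the mapping above; the handler name and logging are hypothetical, but error reporting via strerror() mirrors what the userspace shim does elsewhere:
#include <stdio.h>
#include <string.h>
static void example_endio(struct bio *bio)
{
	int err = blk_status_to_errno(bio->bi_status);
	if (err)
		fprintf(stderr, "I/O error: %s\n", strerror(-err));
	bio_put(bio);
}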
void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
struct bio *src, struct bvec_iter *src_iter)
@ -199,8 +230,8 @@ static struct bio *__bio_chain_endio(struct bio *bio)
{
struct bio *parent = bio->bi_private;
if (!parent->bi_error)
parent->bi_error = bio->bi_error;
if (!parent->bi_status)
parent->bi_status = bio->bi_status;
bio_put(bio);
return parent;
}
@ -233,27 +264,6 @@ again:
bio->bi_end_io(bio);
}
void bio_endio_nodec(struct bio *bio)
{
goto nodec;
while (bio) {
if (unlikely(!bio_remaining_done(bio)))
break;
nodec:
if (bio->bi_end_io == bio_chain_endio) {
struct bio *parent = bio->bi_private;
parent->bi_error = bio->bi_error;
bio_put(bio);
bio = parent;
} else {
if (bio->bi_end_io)
bio->bi_end_io(bio);
bio = NULL;
}
}
}
void bio_reset(struct bio *bio)
{
unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);

View File

@ -32,7 +32,7 @@ void generic_make_request(struct bio *bio)
ret = fdatasync(bio->bi_bdev->bd_fd);
if (ret) {
fprintf(stderr, "fsync error: %m\n");
bio->bi_error = -EIO;
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
return;
}
@ -106,7 +106,7 @@ int submit_bio_wait(struct bio *bio)
submit_bio(bio);
wait_for_completion(&done);
return bio->bi_error;
return blk_status_to_errno(bio->bi_status);
}
int blkdev_issue_discard(struct block_device *bdev,
@ -235,10 +235,8 @@ static int aio_completion_thread(void *arg)
for (ev = events; ev < events + ret; ev++) {
struct bio *bio = (struct bio *) ev->data;
if (ev->res < 0)
bio->bi_error = ev->res;
else if (ev->res != bio->bi_iter.bi_size)
bio->bi_error = -EIO;
if (ev->res != bio->bi_iter.bi_size)
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
}