diff --git a/.bcachefs_revision b/.bcachefs_revision
index 9f202c51..7d7555ff 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-9736cbbc5cc39f6c666befdd787788b6ce6497f6
+46af7258b951a79a66511172ab8772ad2dfaa4e3
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 2384a5e3..e3c3a9b4 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -10,6 +10,8 @@
 #include <linux/types.h>
 #include <linux/bvec.h>
 #include <linux/kobject.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
 
 struct bio_set;
 struct bio;
@@ -63,6 +65,8 @@ struct block_device {
 	struct gendisk *	bd_disk;
 	struct gendisk		__bd_disk;
 	int			bd_fd;
+
+	struct mutex		bd_holder_lock;
 };
 
 #define bdev_kobj(_bdev) (&((_bdev)->kobj))
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b295bd9a..6964396e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -65,7 +65,10 @@ unsigned bdev_logical_block_size(struct block_device *bdev);
 sector_t get_capacity(struct gendisk *disk);
 
 struct blk_holder_ops {
-        void (*mark_dead)(struct block_device *bdev);
+        void (*mark_dead)(struct block_device *bdev, bool surprise);
+	void (*sync)(struct block_device *bdev);
+	int (*freeze)(struct block_device *bdev);
+	int (*thaw)(struct block_device *bdev);
 };
 
 static inline struct block_device *file_bdev(struct file *file)
@@ -80,8 +83,12 @@ int lookup_bdev(const char *path, dev_t *);
 
 struct super_block {
 	void			*s_fs_info;
+	struct rw_semaphore	s_umount;
 };
 
+static inline void evict_inodes(struct super_block *sb) {}
+static inline int sync_filesystem(struct super_block *) { return 0; }
+
 /*
  * File types
  *
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index f395ce7f..b9d0ea22 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -9,6 +9,8 @@ struct dentry {
 	struct inode *d_inode;
 };
 
+static inline void shrink_dcache_sb(struct super_block *) {}
+
 #define QSTR_INIT(n,l) { { { .len = l } }, .name = n }
 #define QSTR(n) (struct qstr)QSTR_INIT(n, strlen(n))
 
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index d2c3f59a..b432bb6e 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -536,6 +536,7 @@ struct bch_dev {
 	 */
 	struct bch_member_cpu	mi;
 	atomic64_t		errors[BCH_MEMBER_ERROR_NR];
+	unsigned long		write_errors_start;
 
 	__uuid_t		uuid;
 	char			name[BDEVNAME_SIZE];
@@ -1002,15 +1003,11 @@ struct bch_fs {
 	wait_queue_head_t	copygc_running_wq;
 
 	/* STRIPES: */
-	GENRADIX(struct stripe) stripes;
 	GENRADIX(struct gc_stripe) gc_stripes;
 
 	struct hlist_head	ec_stripes_new[32];
 	spinlock_t		ec_stripes_new_lock;
 
-	ec_stripes_heap		ec_stripes_heap;
-	struct mutex		ec_stripes_heap_lock;
-
 	/* ERASURE CODING */
 	struct list_head	ec_stripe_head_list;
 	struct mutex		ec_stripe_head_lock;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 13cc0833..7a5b0d21 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -690,7 +690,8 @@ struct bch_sb_field_ext {
 	x(cached_backpointers,		BCH_VERSION(1, 21))		\
 	x(stripe_backpointers,		BCH_VERSION(1, 22))		\
 	x(stripe_lru,			BCH_VERSION(1, 23))		\
-	x(casefolding,			BCH_VERSION(1, 24))
+	x(casefolding,			BCH_VERSION(1, 24))		\
+	x(extent_flags,			BCH_VERSION(1, 25))
 
 enum bcachefs_metadata_version {
 	bcachefs_metadata_version_min = 9,
@@ -859,6 +860,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT,	struct bch_sb, flags[5], 32, 48);
 LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
 					struct bch_sb, flags[5], 48, 64);
 LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS,	struct bch_sb, flags[6],  0,  4);
+LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6],  4, 14);
 
 static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
 {
@@ -927,7 +929,8 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u
 	 BIT_ULL(BCH_FEATURE_new_siphash)|		\
 	 BIT_ULL(BCH_FEATURE_btree_ptr_v2)|		\
 	 BIT_ULL(BCH_FEATURE_new_varint)|		\
-	 BIT_ULL(BCH_FEATURE_journal_no_flush))
+	 BIT_ULL(BCH_FEATURE_journal_no_flush)|		\
+	 BIT_ULL(BCH_FEATURE_incompat_version_field))
 
 enum bch_sb_feature {
 #define x(f, n) BCH_FEATURE_##f,
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index ca755e8d..1ec1f90e 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -203,7 +203,7 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
 		return NULL;
 	}
 
-	bch2_btree_lock_init(&b->c, 0);
+	bch2_btree_lock_init(&b->c, 0, GFP_KERNEL);
 
 	__bch2_btree_node_to_freelist(bc, b);
 	return b;
@@ -795,17 +795,18 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
 		}
 
 	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
-	if (!b) {
+	if (b) {
+		bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT);
+	} else {
 		mutex_unlock(&bc->lock);
 		bch2_trans_unlock(trans);
 		b = __btree_node_mem_alloc(c, GFP_KERNEL);
 		if (!b)
 			goto err;
+		bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
 		mutex_lock(&bc->lock);
 	}
 
-	bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);
-
 	BUG_ON(!six_trylock_intent(&b->c.lock));
 	BUG_ON(!six_trylock_write(&b->c.lock));
 
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 80a0094b..6638bb1f 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1187,7 +1187,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 			     le64_to_cpu(i->journal_seq),
 			     b->written, b->written + sectors, ptr_written);
 
-		b->written += sectors;
+		b->written = min(b->written + sectors, btree_sectors(c));
 
 		if (blacklisted && !first)
 			continue;
@@ -1329,6 +1329,7 @@ static void btree_node_read_work(struct work_struct *work)
 		bch_info(c, "retrying read");
 		ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ);
 		rb->have_ioref		= ca != NULL;
+		rb->start_time		= local_clock();
 		bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
 		bio->bi_iter.bi_sector	= rb->pick.ptr.offset;
 		bio->bi_iter.bi_size	= btree_buf_bytes(b);
@@ -1339,17 +1340,22 @@ static void btree_node_read_work(struct work_struct *work)
 		} else {
 			bio->bi_status = BLK_STS_REMOVED;
 		}
+
+		bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
+					   rb->start_time, !bio->bi_status);
 start:
 		printbuf_reset(&buf);
 		bch2_btree_pos_to_text(&buf, c, b);
-		bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read,
-				   "btree read error %s for %s",
-				   bch2_blk_status_to_str(bio->bi_status), buf.buf);
+
+		if (ca && bio->bi_status)
+			bch_err_dev_ratelimited(ca,
+					"btree read error %s for %s",
+					bch2_blk_status_to_str(bio->bi_status), buf.buf);
 		if (rb->have_ioref)
 			percpu_ref_put(&ca->io_ref);
 		rb->have_ioref = false;
 
-		bch2_mark_io_failure(&failed, &rb->pick);
+		bch2_mark_io_failure(&failed, &rb->pick, false);
 
 		can_retry = bch2_bkey_pick_read_device(c,
 				bkey_i_to_s_c(&b->key),
@@ -1401,12 +1407,11 @@ static void btree_node_read_endio(struct bio *bio)
 	struct btree_read_bio *rb =
 		container_of(bio, struct btree_read_bio, bio);
 	struct bch_fs *c	= rb->c;
+	struct bch_dev *ca	= rb->have_ioref
+		? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
 
-	if (rb->have_ioref) {
-		struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
-
-		bch2_latency_acct(ca, rb->start_time, READ);
-	}
+	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
+				   rb->start_time, !bio->bi_status);
 
 	queue_work(c->btree_read_complete_wq, &rb->work);
 }
@@ -2075,6 +2080,11 @@ static void btree_node_write_work(struct work_struct *work)
 		container_of(work, struct btree_write_bio, work);
 	struct bch_fs *c	= wbio->wbio.c;
 	struct btree *b		= wbio->wbio.bio.bi_private;
+	unsigned commit_flags =
+		BCH_WATERMARK_interior_updates|
+		BCH_TRANS_COMMIT_journal_reclaim|
+		BCH_TRANS_COMMIT_no_enospc|
+		BCH_TRANS_COMMIT_no_check_rw;
 	u64 start_time		= wbio->start_time;
 	int ret = 0;
 
@@ -2083,38 +2093,24 @@ static void btree_node_write_work(struct work_struct *work)
 		wbio->wbio.used_mempool,
 		wbio->data);
 
-	bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
-		bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
-
-	if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
-		ret = -BCH_ERR_btree_node_write_all_failed;
-		goto err;
-	}
-
-	if (wbio->wbio.first_btree_write) {
-		if (wbio->wbio.failed.nr) {
-
-		}
-	} else {
+	if (wbio->wbio.failed.nr) {
+		ret = bch2_trans_do(c,
+			bch2_btree_node_rewrite_key_get_iter(trans, b,
+					commit_flags));
+	} else if (!wbio->wbio.first_btree_write) {
 		ret = bch2_trans_do(c,
 			bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
-					BCH_WATERMARK_interior_updates|
-					BCH_TRANS_COMMIT_journal_reclaim|
-					BCH_TRANS_COMMIT_no_enospc|
-					BCH_TRANS_COMMIT_no_check_rw,
-					!wbio->wbio.failed.nr));
-		if (ret)
-			goto err;
+					commit_flags, true));
 	}
-out:
+
+	if (ret) {
+		set_btree_node_noevict(b);
+		bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
+				     "writing btree node: %s", bch2_err_str(ret));
+	}
+
 	bio_put(&wbio->wbio.bio);
 	btree_node_write_done(c, b, start_time);
-	return;
-err:
-	set_btree_node_noevict(b);
-	bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
-			     "writing btree node: %s", bch2_err_str(ret));
-	goto out;
 }
 
 static void btree_node_write_endio(struct bio *bio)
@@ -2126,16 +2122,17 @@ static void btree_node_write_endio(struct bio *bio)
 	struct bch_fs *c		= wbio->c;
 	struct btree *b			= wbio->bio.bi_private;
 	struct bch_dev *ca		= wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL;
-	unsigned long flags;
 
-	if (wbio->have_ioref)
-		bch2_latency_acct(ca, wbio->submit_time, WRITE);
+	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
+				   wbio->submit_time, !bio->bi_status);
 
-	if (!ca ||
-	    bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
-			       "btree write error: %s",
-			       bch2_blk_status_to_str(bio->bi_status)) ||
-	    bch2_meta_write_fault("btree")) {
+	if (ca && bio->bi_status)
+		bch_err_dev_ratelimited(ca,
+				   "btree write error: %s",
+				   bch2_blk_status_to_str(bio->bi_status));
+
+	if (bio->bi_status) {
+		unsigned long flags;
 		spin_lock_irqsave(&c->btree_write_error_lock, flags);
 		bch2_dev_list_add_dev(&orig->failed, wbio->dev);
 		spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 1821f40c..edce5943 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -156,7 +156,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
 	}
 
 	if (ck) {
-		bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
+		bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
 		ck->c.cached = true;
 		goto lock;
 	}
diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c
index 10b805a6..caef65ad 100644
--- a/libbcachefs/btree_locking.c
+++ b/libbcachefs/btree_locking.c
@@ -7,9 +7,10 @@
 static struct lock_class_key bch2_btree_node_lock_key;
 
 void bch2_btree_lock_init(struct btree_bkey_cached_common *b,
-			  enum six_lock_init_flags flags)
+			  enum six_lock_init_flags flags,
+			  gfp_t gfp)
 {
-	__six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags);
+	__six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags, gfp);
 	lockdep_set_notrack_class(&b->lock);
 }
 
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index b54ef48e..b33ab7af 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -13,7 +13,7 @@
 #include "btree_iter.h"
 #include "six.h"
 
-void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags);
+void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp);
 
 void bch2_trans_unlock_noassert(struct btree_trans *);
 void bch2_trans_unlock_write(struct btree_trans *);
diff --git a/libbcachefs/btree_node_scan.c b/libbcachefs/btree_node_scan.c
index a7f06dee..67816132 100644
--- a/libbcachefs/btree_node_scan.c
+++ b/libbcachefs/btree_node_scan.c
@@ -166,11 +166,17 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
 	bio->bi_iter.bi_sector	= offset;
 	bch2_bio_map(bio, bn, PAGE_SIZE);
 
+	u64 submit_time = local_clock();
 	submit_bio_wait(bio);
-	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
-			       "IO error in try_read_btree_node() at %llu: %s",
-			       offset, bch2_blk_status_to_str(bio->bi_status)))
+
+	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
+
+	if (bio->bi_status) {
+		bch_err_dev_ratelimited(ca,
+				"IO error in try_read_btree_node() at %llu: %s",
+				offset, bch2_blk_status_to_str(bio->bi_status));
 		return;
+	}
 
 	if (le64_to_cpu(bn->magic) != bset_magic(c))
 		return;
@@ -264,7 +270,7 @@ static int read_btree_nodes_worker(void *p)
 err:
 	bio_put(bio);
 	free_page((unsigned long) buf);
-	percpu_ref_get(&ca->io_ref);
+	percpu_ref_put(&ca->io_ref);
 	closure_put(w->cl);
 	kfree(w);
 	return 0;
@@ -283,29 +289,28 @@ static int read_btree_nodes(struct find_btree_nodes *f)
 			continue;
 
 		struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
-		struct task_struct *t;
-
 		if (!w) {
 			percpu_ref_put(&ca->io_ref);
 			ret = -ENOMEM;
 			goto err;
 		}
 
-		percpu_ref_get(&ca->io_ref);
-		closure_get(&cl);
 		w->cl		= &cl;
 		w->f		= f;
 		w->ca		= ca;
 
-		t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
+		struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
 		ret = PTR_ERR_OR_ZERO(t);
 		if (ret) {
 			percpu_ref_put(&ca->io_ref);
-			closure_put(&cl);
-			f->ret = ret;
-			bch_err(c, "error starting kthread: %i", ret);
+			kfree(w);
+			bch_err_msg(c, ret, "starting kthread");
 			break;
 		}
+
+		closure_get(&cl);
+		percpu_ref_get(&ca->io_ref);
+		wake_up_process(t);
 	}
 err:
 	closure_sync(&cl);
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index aac2947a..d3e0cf01 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -2126,6 +2126,31 @@ err_free_update:
 	goto out;
 }
 
+static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter,
+			    struct btree *b)
+{
+	bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p,
+				  BTREE_MAX_DEPTH, b->c.level,
+				  BTREE_ITER_intent);
+	int ret = bch2_btree_iter_traverse(iter);
+	if (ret)
+		goto err;
+
+	/* has node been freed? */
+	if (btree_iter_path(trans, iter)->l[b->c.level].b != b) {
+		/* node has been freed: */
+		BUG_ON(!btree_node_dying(b));
+		ret = -BCH_ERR_btree_node_dying;
+		goto err;
+	}
+
+	BUG_ON(!btree_node_hashed(b));
+	return 0;
+err:
+	bch2_trans_iter_exit(trans, iter);
+	return ret;
+}
+
 int bch2_btree_node_rewrite(struct btree_trans *trans,
 			    struct btree_iter *iter,
 			    struct btree *b,
@@ -2191,7 +2216,29 @@ err:
 	goto out;
 }
 
-int bch2_btree_node_rewrite_key(struct btree_trans *trans,
+static int bch2_btree_node_rewrite_key(struct btree_trans *trans,
+				       enum btree_id btree, unsigned level,
+				       struct bkey_i *k, unsigned flags)
+{
+	struct btree_iter iter;
+	bch2_trans_node_iter_init(trans, &iter,
+				  btree, k->k.p,
+				  BTREE_MAX_DEPTH, level, 0);
+	struct btree *b = bch2_btree_iter_peek_node(&iter);
+	int ret = PTR_ERR_OR_ZERO(b);
+	if (ret)
+		goto out;
+
+	bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k);
+	ret = found
+		? bch2_btree_node_rewrite(trans, &iter, b, flags)
+		: -ENOENT;
+out:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+int bch2_btree_node_rewrite_pos(struct btree_trans *trans,
 				enum btree_id btree, unsigned level,
 				struct bpos pos, unsigned flags)
 {
@@ -2211,6 +2258,19 @@ err:
 	return ret;
 }
 
+int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans,
+					 struct btree *b, unsigned flags)
+{
+	struct btree_iter iter;
+	int ret = get_iter_to_node(trans, &iter, b);
+	if (ret)
+		return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
+
+	ret = bch2_btree_node_rewrite(trans, &iter, b, flags);
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
 struct async_btree_rewrite {
 	struct bch_fs		*c;
 	struct work_struct	work;
@@ -2220,57 +2280,14 @@ struct async_btree_rewrite {
 	struct bkey_buf		key;
 };
 
-static int async_btree_node_rewrite_trans(struct btree_trans *trans,
-					  struct async_btree_rewrite *a)
-{
-	struct btree_iter iter;
-	bch2_trans_node_iter_init(trans, &iter,
-				  a->btree_id, a->key.k->k.p,
-				  BTREE_MAX_DEPTH, a->level, 0);
-	struct btree *b = bch2_btree_iter_peek_node(&iter);
-	int ret = PTR_ERR_OR_ZERO(b);
-	if (ret)
-		goto out;
-
-	bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k);
-	ret = found
-		? bch2_btree_node_rewrite(trans, &iter, b, 0)
-		: -ENOENT;
-
-#if 0
-	/* Tracepoint... */
-	if (!ret || ret == -ENOENT) {
-		struct bch_fs *c = trans->c;
-		struct printbuf buf = PRINTBUF;
-
-		if (!ret) {
-			prt_printf(&buf, "rewrite node:\n  ");
-			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
-		} else {
-			prt_printf(&buf, "node to rewrite not found:\n  want: ");
-			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
-			prt_printf(&buf, "\n  got:  ");
-			if (b)
-				bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-			else
-				prt_str(&buf, "(null)");
-		}
-		bch_info(c, "%s", buf.buf);
-		printbuf_exit(&buf);
-	}
-#endif
-out:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
 static void async_btree_node_rewrite_work(struct work_struct *work)
 {
 	struct async_btree_rewrite *a =
 		container_of(work, struct async_btree_rewrite, work);
 	struct bch_fs *c = a->c;
 
-	int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a));
+	int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans,
+						a->btree_id, a->level, a->key.k, 0));
 	if (ret != -ENOENT)
 		bch_err_fn_ratelimited(c, ret);
 
@@ -2514,30 +2531,15 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
 					unsigned commit_flags, bool skip_triggers)
 {
 	struct btree_iter iter;
-	int ret;
-
-	bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
-				  BTREE_MAX_DEPTH, b->c.level,
-				  BTREE_ITER_intent);
-	ret = bch2_btree_iter_traverse(&iter);
+	int ret = get_iter_to_node(trans, &iter, b);
 	if (ret)
-		goto out;
-
-	/* has node been freed? */
-	if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) {
-		/* node has been freed: */
-		BUG_ON(!btree_node_dying(b));
-		goto out;
-	}
-
-	BUG_ON(!btree_node_hashed(b));
+		return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
 
 	bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
 			    !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
 
 	ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
 					 commit_flags, skip_triggers);
-out:
 	bch2_trans_iter_exit(trans, &iter);
 	return ret;
 }
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index b5be250b..be71cd73 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -169,9 +169,12 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
 
 int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
 			    struct btree *, unsigned);
-int bch2_btree_node_rewrite_key(struct btree_trans *,
+int bch2_btree_node_rewrite_pos(struct btree_trans *,
 				enum btree_id, unsigned,
 				struct bpos, unsigned);
+int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *,
+					 struct btree *, unsigned);
+
 void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
 
 int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index 7e484afe..522574bc 100644
--- a/libbcachefs/data_update.c
+++ b/libbcachefs/data_update.c
@@ -573,7 +573,6 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
 
 	prt_str_indented(out, "extra replicas:\t");
 	prt_u64(out, data_opts->extra_replicas);
-	prt_newline(out);
 }
 
 void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
@@ -707,6 +706,18 @@ int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
 	return 0;
 }
 
+static bool can_write_extent(struct bch_fs *c,
+			     struct bch_devs_list *devs_have,
+			     unsigned target)
+{
+	struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
+
+	darray_for_each(*devs_have, i)
+		__clear_bit(*i, devs.d);
+
+	return !bch2_is_zero(&devs, sizeof(devs));
+}
+
 int bch2_data_update_init(struct btree_trans *trans,
 			  struct btree_iter *iter,
 			  struct moving_context *ctxt,
@@ -788,6 +799,20 @@ int bch2_data_update_init(struct btree_trans *trans,
 		ptr_bit <<= 1;
 	}
 
+	if (!can_write_extent(c, &m->op.devs_have,
+			      m->op.flags & BCH_WRITE_only_specified_devs ? m->op.target : 0)) {
+		/*
+		 * Check if we have rw devices not in devs_have: this can happen
+		 * if we're trying to move data on a ro or failed device
+		 *
+		 * If we can't move it, we need to clear the rebalance_work bit,
+		 * if applicable
+		 *
+		 * Also, copygc should skip ro/failed devices:
+		 */
+		return -BCH_ERR_data_update_done_no_rw_devs;
+	}
+
 	unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
 
 	/*
diff --git a/libbcachefs/dirent_format.h b/libbcachefs/dirent_format.h
index 2e766032..a46dbddd 100644
--- a/libbcachefs/dirent_format.h
+++ b/libbcachefs/dirent_format.h
@@ -44,9 +44,9 @@ struct bch_dirent {
 		__u8		d_pad;
 		__le16		d_name_len;
 		__le16		d_cf_name_len;
-		__u8		d_names[0];
+		__u8		d_names[];
 	} d_cf_name_block __packed;
-	__u8			d_name[0];
+	__DECLARE_FLEX_ARRAY(__u8, d_name);
 	} __packed;
 } __packed __aligned(8);
 
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 1090cdb7..865cc53a 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -105,6 +105,7 @@ struct ec_bio {
 	struct bch_dev		*ca;
 	struct ec_stripe_buf	*buf;
 	size_t			idx;
+	u64			submit_time;
 	struct bio		bio;
 };
 
@@ -494,38 +495,6 @@ int bch2_trigger_stripe(struct btree_trans *trans,
 			return ret;
 	}
 
-	if (flags & BTREE_TRIGGER_atomic) {
-		struct stripe *m = genradix_ptr(&c->stripes, idx);
-
-		if (!m) {
-			struct printbuf buf1 = PRINTBUF;
-			struct printbuf buf2 = PRINTBUF;
-
-			bch2_bkey_val_to_text(&buf1, c, old);
-			bch2_bkey_val_to_text(&buf2, c, new);
-			bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
-					    "old %s\n"
-					    "new %s", idx, buf1.buf, buf2.buf);
-			printbuf_exit(&buf2);
-			printbuf_exit(&buf1);
-			bch2_inconsistent_error(c);
-			return -1;
-		}
-
-		if (!new_s) {
-			bch2_stripes_heap_del(c, m, idx);
-
-			memset(m, 0, sizeof(*m));
-		} else {
-			stripe_to_mem(m, new_s);
-
-			if (!old_s)
-				bch2_stripes_heap_insert(c, m, idx);
-			else
-				bch2_stripes_heap_update(c, m, idx);
-		}
-	}
-
 	return 0;
 }
 
@@ -748,14 +717,15 @@ static void ec_block_endio(struct bio *bio)
 	struct bch_dev *ca = ec_bio->ca;
 	struct closure *cl = bio->bi_private;
 
-	if (bch2_dev_io_err_on(bio->bi_status, ca,
-			       bio_data_dir(bio)
-			       ? BCH_MEMBER_ERROR_write
-			       : BCH_MEMBER_ERROR_read,
-			       "erasure coding %s error: %s",
+	bch2_account_io_completion(ca, bio_data_dir(bio),
+				   ec_bio->submit_time, !bio->bi_status);
+
+	if (bio->bi_status) {
+		bch_err_dev_ratelimited(ca, "erasure coding %s error: %s",
 			       str_write_read(bio_data_dir(bio)),
-			       bch2_blk_status_to_str(bio->bi_status)))
+			       bch2_blk_status_to_str(bio->bi_status));
 		clear_bit(ec_bio->idx, ec_bio->buf->valid);
+	}
 
 	int stale = dev_ptr_stale(ca, ptr);
 	if (stale) {
@@ -818,6 +788,7 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
 		ec_bio->ca			= ca;
 		ec_bio->buf			= buf;
 		ec_bio->idx			= idx;
+		ec_bio->submit_time		= local_clock();
 
 		ec_bio->bio.bi_iter.bi_sector	= ptr->offset + buf->offset + (offset >> 9);
 		ec_bio->bio.bi_end_io		= ec_block_endio;
@@ -939,26 +910,6 @@ err:
 
 static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
 {
-	ec_stripes_heap n, *h = &c->ec_stripes_heap;
-
-	if (idx >= h->size) {
-		if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
-			return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
-
-		mutex_lock(&c->ec_stripes_heap_lock);
-		if (n.size > h->size) {
-			memcpy(n.data, h->data, h->nr * sizeof(h->data[0]));
-			n.nr = h->nr;
-			swap(*h, n);
-		}
-		mutex_unlock(&c->ec_stripes_heap_lock);
-
-		free_heap(&n);
-	}
-
-	if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
-		return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
-
 	if (c->gc_pos.phase != GC_PHASE_not_running &&
 	    !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
 		return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
@@ -1031,155 +982,26 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
 	s->idx = 0;
 }
 
-/* Heap of all existing stripes, ordered by blocks_nonempty */
-
-static u64 stripe_idx_to_delete(struct bch_fs *c)
-{
-	ec_stripes_heap *h = &c->ec_stripes_heap;
-
-	lockdep_assert_held(&c->ec_stripes_heap_lock);
-
-	if (h->nr &&
-	    h->data[0].blocks_nonempty == 0 &&
-	    !bch2_stripe_is_open(c, h->data[0].idx))
-		return h->data[0].idx;
-
-	return 0;
-}
-
-static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
-						   size_t i)
-{
-	struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);
-
-	genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
-}
-
-static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args)
-{
-	struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
-	struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
-
-	return ((_l->blocks_nonempty > _r->blocks_nonempty) <
-		(_l->blocks_nonempty < _r->blocks_nonempty));
-}
-
-static inline void ec_stripes_heap_swap(void *l, void *r, void *h)
-{
-	struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
-	struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
-	ec_stripes_heap *_h = (ec_stripes_heap *)h;
-	size_t i = _l - _h->data;
-	size_t j = _r - _h->data;
-
-	swap(*_l, *_r);
-
-	ec_stripes_heap_set_backpointer(_h, i);
-	ec_stripes_heap_set_backpointer(_h, j);
-}
-
-static const struct min_heap_callbacks callbacks = {
-	.less = ec_stripes_heap_cmp,
-	.swp = ec_stripes_heap_swap,
-};
-
-static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
-{
-	ec_stripes_heap *h = &c->ec_stripes_heap;
-	struct stripe *m = genradix_ptr(&c->stripes, idx);
-
-	BUG_ON(m->heap_idx >= h->nr);
-	BUG_ON(h->data[m->heap_idx].idx != idx);
-}
-
-void bch2_stripes_heap_del(struct bch_fs *c,
-			   struct stripe *m, size_t idx)
-{
-	mutex_lock(&c->ec_stripes_heap_lock);
-	heap_verify_backpointer(c, idx);
-
-	min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap);
-	mutex_unlock(&c->ec_stripes_heap_lock);
-}
-
-void bch2_stripes_heap_insert(struct bch_fs *c,
-			      struct stripe *m, size_t idx)
-{
-	mutex_lock(&c->ec_stripes_heap_lock);
-	BUG_ON(min_heap_full(&c->ec_stripes_heap));
-
-	genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr;
-	min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) {
-			.idx = idx,
-			.blocks_nonempty = m->blocks_nonempty,
-		}),
-		&callbacks,
-		&c->ec_stripes_heap);
-
-	heap_verify_backpointer(c, idx);
-	mutex_unlock(&c->ec_stripes_heap_lock);
-}
-
-void bch2_stripes_heap_update(struct bch_fs *c,
-			      struct stripe *m, size_t idx)
-{
-	ec_stripes_heap *h = &c->ec_stripes_heap;
-	bool do_deletes;
-	size_t i;
-
-	mutex_lock(&c->ec_stripes_heap_lock);
-	heap_verify_backpointer(c, idx);
-
-	h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
-
-	i = m->heap_idx;
-	min_heap_sift_up(h,	i, &callbacks, &c->ec_stripes_heap);
-	min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap);
-
-	heap_verify_backpointer(c, idx);
-
-	do_deletes = stripe_idx_to_delete(c) != 0;
-	mutex_unlock(&c->ec_stripes_heap_lock);
-
-	if (do_deletes)
-		bch2_do_stripe_deletes(c);
-}
-
 /* stripe deletion */
 
 static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
 {
-	struct bch_fs *c = trans->c;
 	struct btree_iter iter;
-	struct bkey_s_c k;
-	struct bkey_s_c_stripe s;
-	int ret;
-
-	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
-			       BTREE_ITER_intent);
-	ret = bkey_err(k);
+	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
+					       BTREE_ID_stripes, POS(0, idx),
+					       BTREE_ITER_intent);
+	int ret = bkey_err(k);
 	if (ret)
 		goto err;
 
-	if (k.k->type != KEY_TYPE_stripe) {
-		bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx);
-		ret = -EINVAL;
-		goto err;
-	}
-
-	s = bkey_s_c_to_stripe(k);
-	for (unsigned i = 0; i < s.v->nr_blocks; i++)
-		if (stripe_blockcount_get(s.v, i)) {
-			struct printbuf buf = PRINTBUF;
-
-			bch2_bkey_val_to_text(&buf, c, k);
-			bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf);
-			printbuf_exit(&buf);
-			ret = -EINVAL;
-			goto err;
-		}
-
-	ret = bch2_btree_delete_at(trans, &iter, 0);
+	/*
+	 * We expect write buffer races here
+	 * Important: check stripe_is_open with stripe key locked:
+	 */
+	if (k.k->type == KEY_TYPE_stripe &&
+	    !bch2_stripe_is_open(trans->c, idx) &&
+	    stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1)
+		ret = bch2_btree_delete_at(trans, &iter, 0);
 err:
 	bch2_trans_iter_exit(trans, &iter);
 	return ret;
@@ -1194,21 +1016,16 @@ static void ec_stripe_delete_work(struct work_struct *work)
 	struct bch_fs *c =
 		container_of(work, struct bch_fs, ec_stripe_delete_work);
 
-	while (1) {
-		mutex_lock(&c->ec_stripes_heap_lock);
-		u64 idx = stripe_idx_to_delete(c);
-		mutex_unlock(&c->ec_stripes_heap_lock);
-
-		if (!idx)
-			break;
-
-		int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-					ec_stripe_delete(trans, idx));
-		bch_err_fn(c, ret);
-		if (ret)
-			break;
-	}
-
+	bch2_trans_run(c,
+		bch2_btree_write_buffer_tryflush(trans) ?:
+		for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru,
+				lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0),
+				lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX),
+				0, lru_k,
+				NULL, NULL,
+				BCH_TRANS_COMMIT_no_enospc, ({
+			ec_stripe_delete(trans, lru_k.k->p.offset);
+		})));
 	bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
 }
 
@@ -1557,6 +1374,8 @@ static void ec_stripe_create(struct ec_stripe_new *s)
 	if (ret)
 		goto err;
 err:
+	trace_stripe_create(c, s->idx, ret);
+
 	bch2_disk_reservation_put(c, &s->res);
 
 	for (i = 0; i < v->nr_blocks; i++)
@@ -1998,39 +1817,40 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans,
 	return 0;
 }
 
-static s64 get_existing_stripe(struct bch_fs *c,
-			       struct ec_stripe_head *head)
+static int __get_existing_stripe(struct btree_trans *trans,
+				 struct ec_stripe_head *head,
+				 struct ec_stripe_buf *stripe,
+				 u64 idx)
 {
-	ec_stripes_heap *h = &c->ec_stripes_heap;
-	struct stripe *m;
-	size_t heap_idx;
-	u64 stripe_idx;
-	s64 ret = -1;
+	struct bch_fs *c = trans->c;
 
-	if (may_create_new_stripe(c))
-		return -1;
+	struct btree_iter iter;
+	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
+					  BTREE_ID_stripes, POS(0, idx), 0);
+	int ret = bkey_err(k);
+	if (ret)
+		goto err;
 
-	mutex_lock(&c->ec_stripes_heap_lock);
-	for (heap_idx = 0; heap_idx < h->nr; heap_idx++) {
-		/* No blocks worth reusing, stripe will just be deleted: */
-		if (!h->data[heap_idx].blocks_nonempty)
-			continue;
+	/* We expect write buffer races here */
+	if (k.k->type != KEY_TYPE_stripe)
+		goto out;
 
-		stripe_idx = h->data[heap_idx].idx;
+	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+	if (stripe_lru_pos(s.v) <= 1)
+		goto out;
 
-		m = genradix_ptr(&c->stripes, stripe_idx);
-
-		if (m->disk_label	== head->disk_label &&
-		    m->algorithm	== head->algo &&
-		    m->nr_redundant	== head->redundancy &&
-		    m->sectors		== head->blocksize &&
-		    m->blocks_nonempty	< m->nr_blocks - m->nr_redundant &&
-		    bch2_try_open_stripe(c, head->s, stripe_idx)) {
-			ret = stripe_idx;
-			break;
-		}
+	if (s.v->disk_label		== head->disk_label &&
+	    s.v->algorithm		== head->algo &&
+	    s.v->nr_redundant		== head->redundancy &&
+	    le16_to_cpu(s.v->sectors)	== head->blocksize &&
+	    bch2_try_open_stripe(c, head->s, idx)) {
+		bkey_reassemble(&stripe->key, k);
+		ret = 1;
 	}
-	mutex_unlock(&c->ec_stripes_heap_lock);
+out:
+	bch2_set_btree_iter_dontneed(&iter);
+err:
+	bch2_trans_iter_exit(trans, &iter);
 	return ret;
 }
 
@@ -2082,24 +1902,33 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri
 				       struct ec_stripe_new *s)
 {
 	struct bch_fs *c = trans->c;
-	s64 idx;
-	int ret;
 
 	/*
 	 * If we can't allocate a new stripe, and there's no stripes with empty
 	 * blocks for us to reuse, that means we have to wait on copygc:
 	 */
-	idx = get_existing_stripe(c, h);
-	if (idx < 0)
-		return -BCH_ERR_stripe_alloc_blocked;
+	if (may_create_new_stripe(c))
+		return -1;
 
-	ret = get_stripe_key_trans(trans, idx, &s->existing_stripe);
-	bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
-			     "reading stripe key: %s", bch2_err_str(ret));
-	if (ret) {
-		bch2_stripe_close(c, s);
-		return ret;
+	struct btree_iter lru_iter;
+	struct bkey_s_c lru_k;
+	int ret = 0;
+
+	for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru,
+			lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0),
+			lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX),
+			0, lru_k, ret) {
+		ret = __get_existing_stripe(trans, h, &s->existing_stripe, lru_k.k->p.offset);
+		if (ret)
+			break;
 	}
+	bch2_trans_iter_exit(trans, &lru_iter);
+	if (!ret)
+		ret = -BCH_ERR_stripe_alloc_blocked;
+	if (ret == 1)
+		ret = 0;
+	if (ret)
+		return ret;
 
 	return init_new_stripe_from_existing(c, s);
 }
@@ -2397,46 +2226,7 @@ void bch2_fs_ec_flush(struct bch_fs *c)
 
 int bch2_stripes_read(struct bch_fs *c)
 {
-	int ret = bch2_trans_run(c,
-		for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
-				   BTREE_ITER_prefetch, k, ({
-			if (k.k->type != KEY_TYPE_stripe)
-				continue;
-
-			ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
-			if (ret)
-				break;
-
-			struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
-
-			stripe_to_mem(m, bkey_s_c_to_stripe(k).v);
-
-			bch2_stripes_heap_insert(c, m, k.k->p.offset);
-			0;
-		})));
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
-{
-	ec_stripes_heap *h = &c->ec_stripes_heap;
-	struct stripe *m;
-	size_t i;
-
-	mutex_lock(&c->ec_stripes_heap_lock);
-	for (i = 0; i < min_t(size_t, h->nr, 50); i++) {
-		m = genradix_ptr(&c->stripes, h->data[i].idx);
-
-		prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
-		       h->data[i].blocks_nonempty,
-		       m->nr_blocks - m->nr_redundant,
-		       m->nr_redundant);
-		if (bch2_stripe_is_open(c, h->data[i].idx))
-			prt_str(out, " open");
-		prt_newline(out);
-	}
-	mutex_unlock(&c->ec_stripes_heap_lock);
+	return 0;
 }
 
 static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
@@ -2507,15 +2297,12 @@ void bch2_fs_ec_exit(struct bch_fs *c)
 
 	BUG_ON(!list_empty(&c->ec_stripe_new_list));
 
-	free_heap(&c->ec_stripes_heap);
-	genradix_free(&c->stripes);
 	bioset_exit(&c->ec_bioset);
 }
 
 void bch2_fs_ec_init_early(struct bch_fs *c)
 {
 	spin_lock_init(&c->ec_stripes_new_lock);
-	mutex_init(&c->ec_stripes_heap_lock);
 
 	INIT_LIST_HEAD(&c->ec_stripe_head_list);
 	mutex_init(&c->ec_stripe_head_lock);
diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h
index cd1c837e..8f2228e5 100644
--- a/libbcachefs/ec.h
+++ b/libbcachefs/ec.h
@@ -99,15 +99,13 @@ static inline u64 stripe_lru_pos(const struct bch_stripe *s)
 	if (!s)
 		return 0;
 
-	unsigned blocks_empty = 0, blocks_nonempty = 0;
+	unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_empty = 0;
 
-	for (unsigned i = 0; i < s->nr_blocks; i++) {
-		blocks_empty	+=  !stripe_blockcount_get(s, i);
-		blocks_nonempty	+= !!stripe_blockcount_get(s, i);
-	}
+	for (unsigned i = 0; i < nr_data; i++)
+		blocks_empty += !stripe_blockcount_get(s, i);
 
 	/* Will be picked up by the stripe_delete worker */
-	if (!blocks_nonempty)
+	if (blocks_empty == nr_data)
 		return STRIPE_LRU_POS_EMPTY;
 
 	if (!blocks_empty)
@@ -260,10 +258,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
 			unsigned, unsigned, unsigned,
 			enum bch_watermark, struct closure *);
 
-void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t);
-void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t);
-void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t);
-
 void bch2_do_stripe_deletes(struct bch_fs *);
 void bch2_ec_do_stripe_creates(struct bch_fs *);
 void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *);
@@ -300,7 +294,6 @@ void bch2_fs_ec_flush(struct bch_fs *);
 
 int bch2_stripes_read(struct bch_fs *);
 
-void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *);
 void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);
 
 void bch2_fs_ec_exit(struct bch_fs *);
diff --git a/libbcachefs/ec_types.h b/libbcachefs/ec_types.h
index 37558cc2..06144bfd 100644
--- a/libbcachefs/ec_types.h
+++ b/libbcachefs/ec_types.h
@@ -31,11 +31,4 @@ struct gc_stripe {
 	struct bch_replicas_padded r;
 };
 
-struct ec_stripe_heap_entry {
-	size_t			idx;
-	unsigned		blocks_nonempty;
-};
-
-typedef DEFINE_MIN_HEAP(struct ec_stripe_heap_entry, ec_stripes_heap) ec_stripes_heap;
-
 #endif /* _BCACHEFS_EC_TYPES_H */
diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h
index 89df9781..531fe575 100644
--- a/libbcachefs/errcode.h
+++ b/libbcachefs/errcode.h
@@ -119,6 +119,7 @@
 	x(ENOENT,			ENOENT_dev_idx_not_found)		\
 	x(ENOENT,			ENOENT_inode_no_backpointer)		\
 	x(ENOENT,			ENOENT_no_snapshot_tree_subvol)		\
+	x(ENOENT,			btree_node_dying)			\
 	x(ENOTEMPTY,			ENOTEMPTY_dir_not_empty)		\
 	x(ENOTEMPTY,			ENOTEMPTY_subvol_not_empty)		\
 	x(EEXIST,			EEXIST_str_hash_set)			\
@@ -185,6 +186,7 @@
 	x(BCH_ERR_data_update_done,	data_update_done_no_writes_needed)	\
 	x(BCH_ERR_data_update_done,	data_update_done_no_snapshot)		\
 	x(BCH_ERR_data_update_done,	data_update_done_no_dev_refs)		\
+	x(BCH_ERR_data_update_done,	data_update_done_no_rw_devs)		\
 	x(EINVAL,			device_state_not_allowed)		\
 	x(EINVAL,			member_info_missing)			\
 	x(EINVAL,			mismatched_block_size)			\
@@ -205,6 +207,7 @@
 	x(EINVAL,			no_resize_with_buckets_nouse)		\
 	x(EINVAL,			inode_unpack_error)			\
 	x(EINVAL,			varint_decode_error)			\
+	x(EOPNOTSUPP,			may_not_use_incompat_feature)		\
 	x(EROFS,			erofs_trans_commit)			\
 	x(EROFS,			erofs_no_writes)			\
 	x(EROFS,			erofs_journal_err)			\
@@ -269,12 +272,29 @@
 	x(EIO,				mark_stripe)				\
 	x(EIO,				stripe_reconstruct)			\
 	x(EIO,				key_type_error)				\
+	x(EIO,				extent_poisened)			\
 	x(EIO,				no_device_to_read_from)			\
 	x(EIO,				missing_indirect_extent)		\
 	x(EIO,				invalidate_stripe_to_dev)		\
 	x(EIO,				no_encryption_key)			\
 	x(EIO,				insufficient_journal_devices)		\
 	x(EIO,				device_offline)				\
+	x(EIO,				EIO_fault_injected)			\
+	x(EIO,				data_read)				\
+	x(BCH_ERR_data_read,		data_read_retry)			\
+	x(BCH_ERR_data_read_retry,	data_read_retry_avoid)			\
+	x(BCH_ERR_data_read_retry_avoid,data_read_device_offline)		\
+	x(BCH_ERR_data_read_retry_avoid,data_read_io_err)			\
+	x(BCH_ERR_data_read_retry_avoid,data_read_ec_reconstruct_err)		\
+	x(BCH_ERR_data_read_retry_avoid,data_read_csum_err)			\
+	x(BCH_ERR_data_read_retry,	data_read_csum_err_maybe_userspace)	\
+	x(BCH_ERR_data_read,		data_read_decompress_err)		\
+	x(BCH_ERR_data_read,		data_read_decrypt_err)			\
+	x(BCH_ERR_data_read,		data_read_ptr_stale_race)		\
+	x(BCH_ERR_data_read_retry,	data_read_ptr_stale_retry)		\
+	x(BCH_ERR_data_read,		data_read_no_encryption_key)		\
+	x(BCH_ERR_data_read,		data_read_buffer_too_small)		\
+	x(BCH_ERR_data_read,		data_read_key_overwritten)		\
 	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_fixable)		\
 	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_want_retry)		\
 	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_must_retry)		\
diff --git a/libbcachefs/error.c b/libbcachefs/error.c
index 3f93a5a6..6d68c89a 100644
--- a/libbcachefs/error.c
+++ b/libbcachefs/error.c
@@ -54,25 +54,41 @@ void bch2_io_error_work(struct work_struct *work)
 {
 	struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
 	struct bch_fs *c = ca->fs;
-	bool dev;
+
+	/* XXX: if it's reads or checksums that are failing, set it to failed */
 
 	down_write(&c->state_lock);
-	dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro,
-				    BCH_FORCE_IF_DEGRADED);
-	if (dev
-	    ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
-				  BCH_FORCE_IF_DEGRADED)
-	    : bch2_fs_emergency_read_only(c))
+	unsigned long write_errors_start = READ_ONCE(ca->write_errors_start);
+
+	if (write_errors_start &&
+	    time_after(jiffies,
+		       write_errors_start + c->opts.write_error_timeout * HZ)) {
+		if (ca->mi.state >= BCH_MEMBER_STATE_ro)
+			goto out;
+
+		bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
+						 BCH_FORCE_IF_DEGRADED);
+
 		bch_err(ca,
-			"too many IO errors, setting %s RO",
+			"writes erroring for %u seconds, setting %s ro",
+			c->opts.write_error_timeout,
 			dev ? "device" : "filesystem");
+		if (!dev)
+			bch2_fs_emergency_read_only(c);
+
+	}
+out:
 	up_write(&c->state_lock);
 }
 
 void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type)
 {
 	atomic64_inc(&ca->errors[type]);
-	//queue_work(system_long_wq, &ca->io_error_work);
+
+	if (type == BCH_MEMBER_ERROR_write && !ca->write_errors_start)
+		ca->write_errors_start = jiffies;
+
+	queue_work(system_long_wq, &ca->io_error_work);
 }
 
 enum ask_yn {
diff --git a/libbcachefs/error.h b/libbcachefs/error.h
index b3cc69f2..7d3f0e2a 100644
--- a/libbcachefs/error.h
+++ b/libbcachefs/error.h
@@ -216,27 +216,37 @@ void bch2_io_error_work(struct work_struct *);
 /* Does the error handling without logging a message */
 void bch2_io_error(struct bch_dev *, enum bch_member_error_type);
 
-#define bch2_dev_io_err_on(cond, ca, _type, ...)			\
-({									\
-	bool _ret = (cond);						\
-									\
-	if (_ret) {							\
-		bch_err_dev_ratelimited(ca, __VA_ARGS__);		\
-		bch2_io_error(ca, _type);				\
-	}								\
-	_ret;								\
-})
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+void bch2_latency_acct(struct bch_dev *, u64, int);
+#else
+static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
+#endif
 
-#define bch2_dev_inum_io_err_on(cond, ca, _type, ...)			\
-({									\
-	bool _ret = (cond);						\
-									\
-	if (_ret) {							\
-		bch_err_inum_offset_ratelimited(ca, __VA_ARGS__);	\
-		bch2_io_error(ca, _type);				\
-	}								\
-	_ret;								\
-})
+static inline void bch2_account_io_success_fail(struct bch_dev *ca,
+						enum bch_member_error_type type,
+						bool success)
+{
+	if (likely(success)) {
+		if (type == BCH_MEMBER_ERROR_write &&
+		    ca->write_errors_start)
+			ca->write_errors_start = 0;
+	} else {
+		bch2_io_error(ca, type);
+	}
+}
+
+static inline void bch2_account_io_completion(struct bch_dev *ca,
+					      enum bch_member_error_type type,
+					      u64 submit_time, bool success)
+{
+	if (unlikely(!ca))
+		return;
+
+	if (type != BCH_MEMBER_ERROR_checksum)
+		bch2_latency_acct(ca, submit_time, type);
+
+	bch2_account_io_success_fail(ca, type, success);
+}
 
 int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64);
 
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 78a51d96..f62ee96b 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -28,6 +28,13 @@
 #include "trace.h"
 #include "util.h"
 
+static const char * const bch2_extent_flags_strs[] = {
+#define x(n, v)	[BCH_EXTENT_FLAG_##n] = #n,
+	BCH_EXTENT_FLAGS()
+#undef x
+	NULL,
+};
+
 static unsigned bch2_crc_field_size_max[] = {
 	[BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
 	[BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
@@ -51,7 +58,8 @@ struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
 }
 
 void bch2_mark_io_failure(struct bch_io_failures *failed,
-			  struct extent_ptr_decoded *p)
+			  struct extent_ptr_decoded *p,
+			  bool csum_error)
 {
 	struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);
 
@@ -59,25 +67,28 @@ void bch2_mark_io_failure(struct bch_io_failures *failed,
 		BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
 
 		f = &failed->devs[failed->nr++];
-		f->dev		= p->ptr.dev;
-		f->idx		= p->idx;
-		f->nr_failed	= 1;
-		f->nr_retries	= 0;
-	} else if (p->idx != f->idx) {
-		f->idx		= p->idx;
-		f->nr_failed	= 1;
-		f->nr_retries	= 0;
-	} else {
-		f->nr_failed++;
+		memset(f, 0, sizeof(*f));
+		f->dev = p->ptr.dev;
 	}
+
+	if (p->do_ec_reconstruct)
+		f->failed_ec = true;
+	else if (!csum_error)
+		f->failed_io = true;
+	else
+		f->failed_csum_nr++;
 }
 
-static inline u64 dev_latency(struct bch_fs *c, unsigned dev)
+static inline u64 dev_latency(struct bch_dev *ca)
 {
-	struct bch_dev *ca = bch2_dev_rcu(c, dev);
 	return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
 }
 
+static inline int dev_failed(struct bch_dev *ca)
+{
+	return !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
+}
+
 /*
  * returns true if p1 is better than p2:
  */
@@ -85,9 +96,18 @@ static inline bool ptr_better(struct bch_fs *c,
 			      const struct extent_ptr_decoded p1,
 			      const struct extent_ptr_decoded p2)
 {
-	if (likely(!p1.idx && !p2.idx)) {
-		u64 l1 = dev_latency(c, p1.ptr.dev);
-		u64 l2 = dev_latency(c, p2.ptr.dev);
+	if (likely(!p1.do_ec_reconstruct &&
+		   !p2.do_ec_reconstruct)) {
+		struct bch_dev *ca1 = bch2_dev_rcu(c, p1.ptr.dev);
+		struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
+
+		int failed_delta = dev_failed(ca1) - dev_failed(ca2);
+
+		if (failed_delta)
+			return failed_delta < 0;
+
+		u64 l1 = dev_latency(ca1);
+		u64 l2 = dev_latency(ca2);
 
 		/*
 		 * Square the latencies, to bias more in favor of the faster
@@ -103,9 +123,9 @@ static inline bool ptr_better(struct bch_fs *c,
 	}
 
 	if (bch2_force_reconstruct_read)
-		return p1.idx > p2.idx;
+		return p1.do_ec_reconstruct > p2.do_ec_reconstruct;
 
-	return p1.idx < p2.idx;
+	return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
 }
 
 /*
@@ -114,19 +134,24 @@ static inline bool ptr_better(struct bch_fs *c,
  * other devices, it will still pick a pointer from avoid.
  */
 int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
-			      struct bch_io_failures *failed,
-			      struct extent_ptr_decoded *pick,
-			      int dev)
+			       struct bch_io_failures *failed,
+			       struct extent_ptr_decoded *pick,
+			       int dev)
 {
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 	const union bch_extent_entry *entry;
 	struct extent_ptr_decoded p;
 	struct bch_dev_io_failures *f;
+	unsigned csum_retry = 0;
+	bool have_csum_retries = false;
 	int ret = 0;
 
 	if (k.k->type == KEY_TYPE_error)
 		return -BCH_ERR_key_type_error;
 
+	if (bch2_bkey_extent_ptrs_flags(ptrs) & BCH_EXTENT_FLAG_poisoned)
+		return -BCH_ERR_extent_poisened;
+again:
 	rcu_read_lock();
 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
 		/*
@@ -154,20 +179,28 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
 		if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
 			continue;
 
-		f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
-		if (f)
-			p.idx = f->nr_failed < f->nr_retries
-				? f->idx
-				: f->idx + 1;
+		if (unlikely(failed) &&
+		    (f = bch2_dev_io_failures(failed, p.ptr.dev))) {
+			have_csum_retries |= !f->failed_io && f->failed_csum_nr < BCH_MAX_CSUM_RETRIES;
 
-		if (!p.idx && (!ca || !bch2_dev_is_readable(ca)))
-			p.idx++;
+			if (p.has_ec &&
+			    !f->failed_ec &&
+			    (f->failed_io || f->failed_csum_nr))
+				p.do_ec_reconstruct = true;
+			else if (f->failed_io ||
+				 f->failed_csum_nr > csum_retry)
+				continue;
+		}
 
-		if (!p.idx && p.has_ec && bch2_force_reconstruct_read)
-			p.idx++;
+		if (!ca || !bch2_dev_is_online(ca)) {
+			if (p.has_ec)
+				p.do_ec_reconstruct = true;
+			else
+				continue;
+		}
 
-		if (p.idx > (unsigned) p.has_ec)
-			continue;
+		if (p.has_ec && bch2_force_reconstruct_read)
+			p.do_ec_reconstruct = true;
 
 		if (ret > 0 && !ptr_better(c, p, *pick))
 			continue;
@@ -177,6 +210,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
 	}
 	rcu_read_unlock();
 
+	if (unlikely(ret == -BCH_ERR_no_device_to_read_from &&
+		     have_csum_retries &&
+		     csum_retry < BCH_MAX_CSUM_RETRIES)) {
+		csum_retry++;
+		goto again;
+	}
+
 	return ret;
 }
 
@@ -1002,7 +1042,7 @@ static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
 
 	struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
 
-	return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr);
+	return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr);
 }
 
 void bch2_extent_ptr_set_cached(struct bch_fs *c,
@@ -1225,6 +1265,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
 			bch2_extent_rebalance_to_text(out, c, &entry->rebalance);
 			break;
 
+		case BCH_EXTENT_ENTRY_flags:
+			prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags);
+			break;
+
 		default:
 			prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
 			return;
@@ -1386,6 +1430,11 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
 #endif
 			break;
 		}
+		case BCH_EXTENT_ENTRY_flags:
+			bkey_fsck_err_on(entry != ptrs.start,
+					 c, extent_flags_not_at_start,
+					 "extent flags entry not at start");
+			break;
 		}
 	}
 
@@ -1452,6 +1501,28 @@ void bch2_ptr_swab(struct bkey_s k)
 	}
 }
 
+int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags)
+{
+	int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags);
+	if (ret)
+		return ret;
+
+	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+
+	if (ptrs.start != ptrs.end &&
+	    extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) {
+		ptrs.start->flags.flags = flags;
+	} else {
+		struct bch_extent_flags f = {
+			.type	= BIT(BCH_EXTENT_ENTRY_flags),
+			.flags	= flags,
+		};
+		__extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f);
+	}
+
+	return 0;
+}
+
 /* Generic extent code: */
 
 int bch2_cut_front_s(struct bpos where, struct bkey_s k)
@@ -1497,8 +1568,8 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k)
 				entry->crc128.offset += sub;
 				break;
 			case BCH_EXTENT_ENTRY_stripe_ptr:
-				break;
 			case BCH_EXTENT_ENTRY_rebalance:
+			case BCH_EXTENT_ENTRY_flags:
 				break;
 			}
 
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 8fae6b23..b4058502 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -320,8 +320,8 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
 ({									\
 	__label__ out;							\
 									\
-	(_ptr).idx	= 0;						\
 	(_ptr).has_ec	= false;					\
+	(_ptr).do_ec_reconstruct = false;				\
 									\
 	__bkey_extent_entry_for_each_from(_entry, _end, _entry)		\
 		switch (__extent_entry_type(_entry)) {			\
@@ -401,7 +401,7 @@ out:									\
 struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *,
 						 unsigned);
 void bch2_mark_io_failure(struct bch_io_failures *,
-			  struct extent_ptr_decoded *);
+			  struct extent_ptr_decoded *, bool);
 int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
 			       struct bch_io_failures *,
 			       struct extent_ptr_decoded *, int);
@@ -704,7 +704,7 @@ static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1,
 		ptr1.unwritten	== ptr2.unwritten &&
 		ptr1.offset	== ptr2.offset &&
 		ptr1.dev	== ptr2.dev &&
-		ptr1.dev	== ptr2.dev);
+		ptr1.gen	== ptr2.gen);
 }
 
 void bch2_ptr_swab(struct bkey_s);
@@ -753,4 +753,19 @@ static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
 	k->size = new_size;
 }
 
+static inline u64 bch2_bkey_extent_ptrs_flags(struct bkey_ptrs_c ptrs)
+{
+	if (ptrs.start != ptrs.end &&
+	    extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags)
+		return ptrs.start->flags.flags;
+	return 0;
+}
+
+static inline u64 bch2_bkey_extent_flags(struct bkey_s_c k)
+{
+	return bch2_bkey_extent_ptrs_flags(bch2_bkey_ptrs_c(k));
+}
+
+int bch2_bkey_extent_flags_set(struct bch_fs *, struct bkey_i *, u64);
+
 #endif /* _BCACHEFS_EXTENTS_H */
diff --git a/libbcachefs/extents_format.h b/libbcachefs/extents_format.h
index c198dfc3..74c0252c 100644
--- a/libbcachefs/extents_format.h
+++ b/libbcachefs/extents_format.h
@@ -79,8 +79,9 @@
 	x(crc64,		2)		\
 	x(crc128,		3)		\
 	x(stripe_ptr,		4)		\
-	x(rebalance,		5)
-#define BCH_EXTENT_ENTRY_MAX	6
+	x(rebalance,		5)		\
+	x(flags,		6)
+#define BCH_EXTENT_ENTRY_MAX	7
 
 enum bch_extent_entry_type {
 #define x(f, n) BCH_EXTENT_ENTRY_##f = n,
@@ -201,6 +202,25 @@ struct bch_extent_stripe_ptr {
 #endif
 };
 
+#define BCH_EXTENT_FLAGS()		\
+	x(poisoned,		0)
+
+enum bch_extent_flags_e {
+#define x(n, v)	BCH_EXTENT_FLAG_##n = v,
+	BCH_EXTENT_FLAGS()
+#undef x
+};
+
+struct bch_extent_flags {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u64			type:7,
+				flags:57;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	__u64			flags:57,
+				type:7;
+#endif
+};
+
 /* bch_extent_rebalance: */
 #include "rebalance_format.h"
 
diff --git a/libbcachefs/extents_types.h b/libbcachefs/extents_types.h
index 43d6c341..f8b8e598 100644
--- a/libbcachefs/extents_types.h
+++ b/libbcachefs/extents_types.h
@@ -20,21 +20,23 @@ struct bch_extent_crc_unpacked {
 };
 
 struct extent_ptr_decoded {
-	unsigned			idx;
 	bool				has_ec;
+	unsigned			do_ec_reconstruct;
 	struct bch_extent_crc_unpacked	crc;
 	struct bch_extent_ptr		ptr;
 	struct bch_extent_stripe_ptr	ec;
 };
 
+#define BCH_MAX_CSUM_RETRIES		3
+
 struct bch_io_failures {
 	u8			nr;
 	struct bch_dev_io_failures {
 		u8		dev;
-		u8		idx;
-		u8		nr_failed;
-		u8		nr_retries;
-	}			devs[BCH_REPLICAS_MAX];
+		unsigned	failed_csum_nr:4,
+				failed_io:1,
+				failed_ec:1;
+	}			devs[BCH_REPLICAS_MAX + 1];
 };
 
 #endif /* _BCACHEFS_EXTENTS_TYPES_H */
diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c
index ca70a3de..fbc3da59 100644
--- a/libbcachefs/fs-common.c
+++ b/libbcachefs/fs-common.c
@@ -268,16 +268,8 @@ int bch2_unlink_trans(struct btree_trans *trans,
 
 	dir_hash = bch2_hash_info_init(c, dir_u);
 
-	struct bkey_s_c dirent_k =
-		bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
-				 &dir_hash, dir, name, BTREE_ITER_intent);
-	ret = bkey_err(dirent_k);
-	if (ret)
-		goto err;
-
-	ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(dirent_k), &inum);
-	if (ret > 0)
-		ret = -ENOENT;
+	ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
+				       name, &inum, BTREE_ITER_intent);
 	if (ret)
 		goto err;
 
@@ -334,7 +326,6 @@ int bch2_unlink_trans(struct btree_trans *trans,
 
 	dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
 	dir_u->bi_nlink -= is_subdir_for_nlink(inode_u);
-	dir_u->bi_size	-= bkey_bytes(dirent_k.k);
 
 	ret =   bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
 				    &dir_hash, &dirent_iter,
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 94bf34b9..717e7b94 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -466,6 +466,7 @@ int bchfs_truncate(struct mnt_idmap *idmap,
 	ret = bch2_truncate_folio(inode, iattr->ia_size);
 	if (unlikely(ret < 0))
 		goto err;
+	ret = 0;
 
 	truncate_setsize(&inode->v, iattr->ia_size);
 
diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c
index 4465a2a8..5b47b94f 100644
--- a/libbcachefs/fs-ioctl.c
+++ b/libbcachefs/fs-ioctl.c
@@ -69,8 +69,9 @@ static int bch2_inode_flags_set(struct btree_trans *trans,
 		if (ret < 0)
 			return ret;
 
-		if (!bch2_request_incompat_feature(c,bcachefs_metadata_version_casefolding))
-			return -EOPNOTSUPP;
+		ret = bch2_request_incompat_feature(c,bcachefs_metadata_version_casefolding);
+		if (ret)
+			return ret;
 
 		bch2_check_set_feature(c, BCH_FEATURE_casefolding);
 #else
@@ -243,7 +244,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
 	int ret = 0;
 	subvol_inum inum;
 
-	kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL);
+	kname = kmalloc(BCH_NAME_MAX, GFP_KERNEL);
 	if (!kname)
 		return -ENOMEM;
 
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 2c011a46..459ca825 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -2218,9 +2218,10 @@ static int bch2_fs_get_tree(struct fs_context *fc)
 
 	bch2_opts_apply(&c->opts, opts);
 
-	ret = bch2_fs_start(c);
-	if (ret)
-		goto err_stop_fs;
+	/*
+	 * need to initialise sb and set c->vfs_sb _before_ starting fs,
+	 * for blk_holder_ops
+	 */
 
 	sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c);
 	ret = PTR_ERR_OR_ZERO(sb);
@@ -2282,6 +2283,10 @@ got_sb:
 
 	sb->s_shrink->seeks = 0;
 
+	ret = bch2_fs_start(c);
+	if (ret)
+		goto err_put_super;
+
 	vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
 	ret = PTR_ERR_OR_ZERO(vinode);
 	bch_err_msg(c, ret, "mounting: error getting root inode");
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 9bf316e7..0e85131d 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -1978,31 +1978,10 @@ fsck_err:
 	return ret;
 }
 
-static int check_dir_i_size_notnested(struct btree_trans *trans, struct inode_walker *w)
-{
-	struct bch_fs *c = trans->c;
-	int ret = 0;
-
-	darray_for_each(w->inodes, i)
-		if (fsck_err_on(i->inode.bi_size != i->i_size,
-				trans, inode_dir_wrong_nlink,
-				"directory %llu:%u with wrong i_size: got %llu, should be %llu",
-				w->last_pos.inode, i->snapshot, i->inode.bi_size, i->i_size)) {
-			i->inode.bi_size = i->i_size;
-			ret = bch2_fsck_write_inode(trans, &i->inode);
-			if (ret)
-				break;
-		}
-fsck_err:
-	bch_err_fn(c, ret);
-	return ret;
-}
-
 static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w)
 {
 	u32 restart_count = trans->restart_count;
 	return check_subdir_count_notnested(trans, w) ?:
-		check_dir_i_size_notnested(trans, w) ?:
 		trans_was_restarted(trans, restart_count);
 }
 
diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c
index 821ff222..652dbc58 100644
--- a/libbcachefs/io_read.c
+++ b/libbcachefs/io_read.c
@@ -329,10 +329,17 @@ nopromote:
 static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
 				   struct bch_read_bio *rbio, struct bpos read_pos)
 {
-	return lockrestart_do(trans,
+	int ret = lockrestart_do(trans,
 		bch2_inum_offset_err_msg_trans(trans, out,
 				(subvol_inum) { rbio->subvol, read_pos.inode },
 				read_pos.offset << 9));
+	if (ret)
+		return ret;
+
+	if (rbio->flags & BCH_READ_data_update)
+		prt_str(out, "(internal move) ");
+
+	return 0;
 }
 
 static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
@@ -341,10 +348,6 @@ static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
 	bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
 }
 
-#define READ_RETRY_AVOID	1
-#define READ_RETRY		2
-#define READ_ERR		3
-
 enum rbio_context {
 	RBIO_CONTEXT_NULL,
 	RBIO_CONTEXT_HIGHPRI,
@@ -375,6 +378,11 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
 {
 	BUG_ON(rbio->bounce && !rbio->split);
 
+	if (rbio->have_ioref) {
+		struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
+		percpu_ref_put(&ca->io_ref);
+	}
+
 	if (rbio->split) {
 		struct bch_read_bio *parent = rbio->parent;
 
@@ -408,13 +416,90 @@ static void bch2_rbio_done(struct bch_read_bio *rbio)
 	bio_endio(&rbio->bio);
 }
 
-static noinline void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
-				     struct bvec_iter bvec_iter,
-				     struct bch_io_failures *failed,
-				     unsigned flags)
+static struct bkey_s_c get_rbio_extent(struct btree_trans *trans,
+				       struct bch_read_bio *rbio,
+				       struct btree_iter *iter)
+{
+	if (rbio->flags & BCH_READ_data_update) {
+		struct data_update *u = container_of(rbio, struct data_update, rbio);
+
+		return bch2_bkey_get_iter(trans, iter,
+					  u->btree_id, bkey_start_pos(&u->k.k->k), 0);
+	} else {
+		struct bpos pos = rbio->read_pos;
+		int ret = bch2_subvolume_get_snapshot(trans, rbio->subvol, &pos.snapshot);
+		if (ret)
+			return bkey_s_c_err(ret);
+
+		return bch2_bkey_get_iter(trans, iter,
+					  BTREE_ID_extents, pos, 0);
+	}
+}
+
+static void mark_io_failure_if_current_extent_matches(struct btree_trans *trans,
+						      struct bch_read_bio *rbio,
+						      struct bch_io_failures *failed)
+{
+	struct btree_iter iter = {};
+	struct bkey_s_c k;
+	int ret = lockrestart_do(trans,
+				 bkey_err(k = get_rbio_extent(trans, rbio, &iter)));
+
+	if (!ret) {
+		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+		bkey_for_each_ptr(ptrs, ptr)
+			if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr))
+				bch2_mark_io_failure(failed, &rbio->pick,
+					rbio->ret == -BCH_ERR_data_read_csum_err);
+	}
+
+	bch2_trans_iter_exit(trans, &iter);
+}
+
+static noinline int maybe_poison_extent(struct btree_trans *trans, struct btree_iter *iter,
+					struct bkey_s_c k, struct bch_io_failures *failed)
+{
+	u64 flags = bch2_bkey_extent_flags(k);
+	if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
+		return 0;
+
+	struct bch_fs *c = trans->c;
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+	/*
+	 * Make sure we actually attempt to read and got checksum failures from
+	 * every replica
+	 */
+
+	rcu_read_lock();
+	bkey_for_each_ptr(ptrs, ptr) {
+		struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+		if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed)
+			continue;
+
+		struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, ptr->dev);
+		if (!f || f->failed_csum_nr != BCH_MAX_CSUM_RETRIES) {
+			rcu_read_unlock();
+			return 0;
+		}
+	}
+	rcu_read_unlock();
+
+	struct bkey_i *new = __bch2_bkey_make_mut(trans, iter, &k, 0, 0,
+				   bkey_bytes(k.k) + sizeof(struct bch_extent_flags));
+	return  PTR_ERR_OR_ZERO(new) ?:
+		bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?:
+		bch2_trans_commit(trans, NULL, NULL, 0);
+}
+
+static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
+					struct bch_read_bio *rbio,
+					struct bvec_iter bvec_iter,
+					struct bch_io_failures *failed,
+					unsigned flags)
 {
 	struct data_update *u = container_of(rbio, struct data_update, rbio);
-	struct btree_trans *trans = bch2_trans_get(c);
 retry:
 	bch2_trans_begin(trans);
 
@@ -429,7 +514,7 @@ retry:
 
 	if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
 		/* extent we wanted to read no longer exists: */
-		rbio->hole = true;
+		rbio->ret = -BCH_ERR_data_read_key_overwritten;
 		goto err;
 	}
 
@@ -441,14 +526,19 @@ retry:
 err:
 	bch2_trans_iter_exit(trans, &iter);
 
-	if (ret == READ_RETRY)
+	if (bch2_err_matches(ret, BCH_ERR_data_read_retry))
 		goto retry;
-	if (ret)
-		rbio->bio.bi_status = BLK_STS_IOERR;
+
+	if (ret) {
+		if (ret == -BCH_ERR_no_device_to_read_from && failed)
+			maybe_poison_extent(trans, &iter, k, failed);
+
+		rbio->bio.bi_status	= BLK_STS_IOERR;
+		rbio->ret		= ret;
+	}
 
 	BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
-	bch2_rbio_done(rbio);
-	bch2_trans_put(trans);
+	return ret;
 }
 
 static void bch2_rbio_retry(struct work_struct *work)
@@ -463,16 +553,22 @@ static void bch2_rbio_retry(struct work_struct *work)
 		.inum	= rbio->read_pos.inode,
 	};
 	struct bch_io_failures failed = { .nr = 0 };
+	struct btree_trans *trans = bch2_trans_get(c);
 
 	trace_io_read_retry(&rbio->bio);
 	this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
 		     bvec_iter_sectors(rbio->bvec_iter));
 
-	if (rbio->retry == READ_RETRY_AVOID)
-		bch2_mark_io_failure(&failed, &rbio->pick);
+	if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
+		mark_io_failure_if_current_extent_matches(trans, rbio, &failed);
 
-	if (!rbio->split)
-		rbio->bio.bi_status = 0;
+	if (!rbio->split) {
+		rbio->bio.bi_status	= 0;
+		rbio->ret		= 0;
+	}
+
+	unsigned subvol		= rbio->subvol;
+	struct bpos read_pos	= rbio->read_pos;
 
 	rbio = bch2_rbio_free(rbio);
 
@@ -481,29 +577,55 @@ static void bch2_rbio_retry(struct work_struct *work)
 	flags &= ~BCH_READ_last_fragment;
 	flags |= BCH_READ_must_clone;
 
-	if (flags & BCH_READ_data_update)
-		bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
-	else
-		__bch2_read(c, rbio, iter, inum, &failed, flags);
+	int ret = flags & BCH_READ_data_update
+		? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
+		: __bch2_read(trans, rbio, iter, inum, &failed, flags);
+
+	if (ret) {
+		rbio->ret = ret;
+		rbio->bio.bi_status = BLK_STS_IOERR;
+	} else {
+		struct printbuf buf = PRINTBUF;
+
+		lockrestart_do(trans,
+			bch2_inum_offset_err_msg_trans(trans, &buf,
+					(subvol_inum) { subvol, read_pos.inode },
+					read_pos.offset << 9));
+		if (rbio->flags & BCH_READ_data_update)
+			prt_str(&buf, "(internal move) ");
+		prt_str(&buf, "successful retry");
+
+		bch_err_ratelimited(c, "%s", buf.buf);
+		printbuf_exit(&buf);
+	}
+
+	bch2_rbio_done(rbio);
+	bch2_trans_put(trans);
 }
 
-static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
-			    blk_status_t error)
+static void bch2_rbio_error(struct bch_read_bio *rbio,
+			    int ret, blk_status_t blk_error)
 {
-	rbio->retry = retry;
-	rbio->saw_error = true;
+	BUG_ON(ret >= 0);
+
+	rbio->ret		= ret;
+	rbio->bio.bi_status	= blk_error;
+
+	bch2_rbio_parent(rbio)->saw_error = true;
 
 	if (rbio->flags & BCH_READ_in_retry)
 		return;
 
-	if (retry == READ_ERR) {
-		rbio = bch2_rbio_free(rbio);
-
-		rbio->bio.bi_status = error;
-		bch2_rbio_done(rbio);
-	} else {
+	if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
 		bch2_rbio_punt(rbio, bch2_rbio_retry,
 			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+	} else {
+		rbio = bch2_rbio_free(rbio);
+
+		rbio->ret		= ret;
+		rbio->bio.bi_status	= blk_error;
+
+		bch2_rbio_done(rbio);
 	}
 }
 
@@ -519,15 +641,13 @@ static void bch2_read_io_err(struct work_struct *work)
 	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
 	prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status));
 
-	if (ca) {
-		bch2_io_error(ca, BCH_MEMBER_ERROR_read);
+	if (ca)
 		bch_err_ratelimited(ca, "%s", buf.buf);
-	} else {
+	else
 		bch_err_ratelimited(c, "%s", buf.buf);
-	}
 
 	printbuf_exit(&buf);
-	bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
+	bch2_rbio_error(rbio, -BCH_ERR_data_read_io_err, bio->bi_status);
 }
 
 static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
@@ -609,14 +729,12 @@ static void bch2_read_csum_err(struct work_struct *work)
 	bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
 
 	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
-	if (ca) {
-		bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+	if (ca)
 		bch_err_ratelimited(ca, "%s", buf.buf);
-	} else {
+	else
 		bch_err_ratelimited(c, "%s", buf.buf);
-	}
 
-	bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+	bch2_rbio_error(rbio, -BCH_ERR_data_read_csum_err, BLK_STS_IOERR);
 	printbuf_exit(&buf);
 }
 
@@ -636,7 +754,7 @@ static void bch2_read_decompress_err(struct work_struct *work)
 	else
 		bch_err_ratelimited(c, "%s", buf.buf);
 
-	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+	bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
 	printbuf_exit(&buf);
 }
 
@@ -656,16 +774,53 @@ static void bch2_read_decrypt_err(struct work_struct *work)
 	else
 		bch_err_ratelimited(c, "%s", buf.buf);
 
-	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+	bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
 	printbuf_exit(&buf);
 }
 
+#ifdef CONFIG_BCACHEFS_DEBUG
+static unsigned bch2_read_corrupt_ratio;
+module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
+MODULE_PARM_DESC(read_corrupt_ratio, "");
+
+static void corrupt_bio(struct bio *bio)
+{
+	struct bvec_iter iter;
+	struct bio_vec bv;
+	unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64));
+
+	bio_for_each_segment(bv, bio, iter) {
+		unsigned u64s = bv.bv_len / sizeof(u64);
+
+		if (offset < u64s) {
+			u64 *segment = bvec_kmap_local(&bv);
+			segment[offset] = get_random_u64();
+			kunmap_local(segment);
+			return;
+		}
+		offset -= u64s;
+	}
+}
+
+static inline void maybe_corrupt_bio(struct bio *bio)
+{
+	if (bch2_read_corrupt_ratio &&
+	    !get_random_u32_below(bch2_read_corrupt_ratio))
+		corrupt_bio(bio);
+}
+#else
+static inline void maybe_corrupt_bio(struct bio *bio)
+{
+}
+#endif
+
 /* Inner part that may run in process context */
 static void __bch2_read_endio(struct work_struct *work)
 {
 	struct bch_read_bio *rbio =
 		container_of(work, struct bch_read_bio, work);
 	struct bch_fs *c	= rbio->c;
+	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
 	struct bio *src		= &rbio->bio;
 	struct bio *dst		= &bch2_rbio_parent(rbio)->bio;
 	struct bvec_iter dst_iter = rbio->bvec_iter;
@@ -686,8 +841,26 @@ static void __bch2_read_endio(struct work_struct *work)
 		src->bi_iter			= rbio->bvec_iter;
 	}
 
+	maybe_corrupt_bio(src);
+
 	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
-	if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
+	bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;
+
+	/*
+	 * Checksum error: if the bio wasn't bounced, we may have been
+	 * reading into buffers owned by userspace (that userspace can
+	 * scribble over) - retry the read, bouncing it this time:
+	 */
+	if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
+		rbio->flags |= BCH_READ_must_bounce;
+		bch2_rbio_error(rbio, -BCH_ERR_data_read_csum_err_maybe_userspace,
+				BLK_STS_IOERR);
+		goto out;
+	}
+
+	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
+
+	if (!csum_good)
 		goto csum_err;
 
 	/*
@@ -760,17 +933,6 @@ out:
 	memalloc_nofs_restore(nofs_flags);
 	return;
 csum_err:
-	/*
-	 * Checksum error: if the bio wasn't bounced, we may have been
-	 * reading into buffers owned by userspace (that userspace can
-	 * scribble over) - retry the read, bouncing it this time:
-	 */
-	if (!rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
-		rbio->flags |= BCH_READ_must_bounce;
-		bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
-		goto out;
-	}
-
 	bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
 	goto out;
 decompression_err:
@@ -790,10 +952,8 @@ static void bch2_read_endio(struct bio *bio)
 	struct workqueue_struct *wq = NULL;
 	enum rbio_context context = RBIO_CONTEXT_NULL;
 
-	if (rbio->have_ioref) {
-		bch2_latency_acct(ca, rbio->submit_time, READ);
-		percpu_ref_put(&ca->io_ref);
-	}
+	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
+				   rbio->submit_time, !bio->bi_status);
 
 	if (!rbio->split)
 		rbio->bio.bi_end_io = rbio->end_io;
@@ -808,9 +968,9 @@ static void bch2_read_endio(struct bio *bio)
 		trace_and_count(c, io_read_reuse_race, &rbio->bio);
 
 		if (rbio->flags & BCH_READ_retry_if_stale)
-			bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
+			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
 		else
-			bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
+			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
 		return;
 	}
 
@@ -883,7 +1043,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
 	struct bch_read_bio *rbio = NULL;
 	bool bounce = false, read_full = false, narrow_crcs = false;
 	struct bpos data_pos = bkey_start_pos(k.k);
-	int pick_ret;
+	int ret = 0;
 
 	if (bkey_extent_is_inline_data(k.k)) {
 		unsigned bytes = min_t(unsigned, iter.bi_size,
@@ -899,16 +1059,16 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
 		goto out_read_done;
 	}
 retry_pick:
-	pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);
+	ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);
 
 	/* hole or reservation - just zero fill: */
-	if (!pick_ret)
+	if (!ret)
 		goto hole;
 
-	if (unlikely(pick_ret < 0)) {
+	if (unlikely(ret < 0)) {
 		struct printbuf buf = PRINTBUF;
 		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
-		prt_printf(&buf, "no device to read from: %s\n  ", bch2_err_str(pick_ret));
+		prt_printf(&buf, "%s\n  ", bch2_err_str(ret));
 		bch2_bkey_val_to_text(&buf, c, k);
 
 		bch_err_ratelimited(c, "%s", buf.buf);
@@ -924,6 +1084,7 @@ retry_pick:
 
 		bch_err_ratelimited(c, "%s", buf.buf);
 		printbuf_exit(&buf);
+		ret = -BCH_ERR_data_read_no_encryption_key;
 		goto err;
 	}
 
@@ -940,7 +1101,7 @@ retry_pick:
 	    ca &&
 	    unlikely(dev_ptr_stale(ca, &pick.ptr))) {
 		read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
-		bch2_mark_io_failure(failed, &pick);
+		bch2_mark_io_failure(failed, &pick, false);
 		percpu_ref_put(&ca->io_ref);
 		goto retry_pick;
 	}
@@ -984,10 +1145,10 @@ retry_pick:
 		 */
 		struct data_update *u = container_of(orig, struct data_update, rbio);
 		if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
-			BUG();
 			if (ca)
 				percpu_ref_put(&ca->io_ref);
-			goto hole;
+			rbio->ret = -BCH_ERR_data_read_buffer_too_small;
+			goto out_read_done;
 		}
 
 		iter.bi_size	= pick.crc.compressed_size << 9;
@@ -1067,8 +1228,7 @@ retry_pick:
 	rbio->flags		= flags;
 	rbio->have_ioref	= ca != NULL;
 	rbio->narrow_crcs	= narrow_crcs;
-	rbio->hole		= 0;
-	rbio->retry		= 0;
+	rbio->ret		= 0;
 	rbio->context		= 0;
 	rbio->pick		= pick;
 	rbio->subvol		= orig->subvol;
@@ -1104,7 +1264,7 @@ retry_pick:
 		trace_and_count(c, io_read_split, &orig->bio);
 	}
 
-	if (!rbio->pick.idx) {
+	if (likely(!rbio->pick.do_ec_reconstruct)) {
 		if (unlikely(!rbio->have_ioref)) {
 			struct printbuf buf = PRINTBUF;
 			bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
@@ -1114,7 +1274,9 @@ retry_pick:
 			bch_err_ratelimited(c, "%s", buf.buf);
 			printbuf_exit(&buf);
 
-			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+			bch2_rbio_error(rbio,
+					-BCH_ERR_data_read_device_offline,
+					BLK_STS_IOERR);
 			goto out;
 		}
 
@@ -1140,7 +1302,8 @@ retry_pick:
 	} else {
 		/* Attempting reconstruct read: */
 		if (bch2_ec_read_extent(trans, rbio, k)) {
-			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+			bch2_rbio_error(rbio, -BCH_ERR_data_read_ec_reconstruct_err,
+					BLK_STS_IOERR);
 			goto out;
 		}
 
@@ -1156,25 +1319,22 @@ out:
 		rbio->context = RBIO_CONTEXT_UNBOUND;
 		bch2_read_endio(&rbio->bio);
 
-		ret = rbio->retry;
+		ret = rbio->ret;
 		rbio = bch2_rbio_free(rbio);
 
-		if (ret == READ_RETRY_AVOID) {
-			bch2_mark_io_failure(failed, &pick);
-			ret = READ_RETRY;
-		}
-
-		if (!ret)
-			goto out_read_done;
+		if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
+			bch2_mark_io_failure(failed, &pick,
+					ret == -BCH_ERR_data_read_csum_err);
 
 		return ret;
 	}
 
 err:
 	if (flags & BCH_READ_in_retry)
-		return READ_ERR;
+		return ret;
 
-	orig->bio.bi_status = BLK_STS_IOERR;
+	orig->bio.bi_status	= BLK_STS_IOERR;
+	orig->ret		= ret;
 	goto out_read_done;
 
 hole:
@@ -1186,20 +1346,21 @@ hole:
 	 * to read no longer exists we have to signal that:
 	 */
 	if (flags & BCH_READ_data_update)
-		orig->hole = true;
+		orig->ret = -BCH_ERR_data_read_key_overwritten;
 
 	zero_fill_bio_iter(&orig->bio, iter);
 out_read_done:
-	if (flags & BCH_READ_last_fragment)
+	if ((flags & BCH_READ_last_fragment) &&
+	    !(flags & BCH_READ_in_retry))
 		bch2_rbio_done(orig);
 	return 0;
 }
 
-void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
-		 struct bvec_iter bvec_iter, subvol_inum inum,
-		 struct bch_io_failures *failed, unsigned flags)
+int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
+		struct bvec_iter bvec_iter, subvol_inum inum,
+		struct bch_io_failures *failed, unsigned flags)
 {
-	struct btree_trans *trans = bch2_trans_get(c);
+	struct bch_fs *c = trans->c;
 	struct btree_iter iter;
 	struct bkey_buf sk;
 	struct bkey_s_c k;
@@ -1232,6 +1393,23 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
 		if (ret)
 			goto err;
 
+		if (unlikely(flags & BCH_READ_in_retry)) {
+			struct data_update *u = flags & BCH_READ_data_update
+				? container_of(rbio, struct data_update, rbio)
+				: NULL;
+
+			if (u &&
+			    !bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
+				/* extent we wanted to read no longer exists: */
+				ret = -BCH_ERR_data_read_key_overwritten;
+				goto err;
+			}
+
+			if (!bkey_deleted(&sk.k->k) &&
+			    !bkey_and_val_eq(k, bkey_i_to_s_c(sk.k)))
+				failed->nr = 0;
+		}
+
 		s64 offset_into_extent = iter.pos.offset -
 			bkey_start_offset(k.k);
 		unsigned sectors = k.k->size - offset_into_extent;
@@ -1271,28 +1449,32 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
 err:
 		if (ret &&
 		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
-		    ret != READ_RETRY &&
-		    ret != READ_RETRY_AVOID)
+		    !bch2_err_matches(ret, BCH_ERR_data_read_retry))
 			break;
 	}
 
-	bch2_trans_iter_exit(trans, &iter);
+	if (unlikely(ret)) {
+		if (ret == -BCH_ERR_no_device_to_read_from && failed)
+			maybe_poison_extent(trans, &iter, k, failed);
 
-	if (ret) {
 		struct printbuf buf = PRINTBUF;
 		lockrestart_do(trans,
 			bch2_inum_offset_err_msg_trans(trans, &buf, inum,
 						       bvec_iter.bi_sector << 9));
-		prt_printf(&buf, "read error %i from btree lookup", ret);
+		prt_printf(&buf, "read error: %s", bch2_err_str(ret));
 		bch_err_ratelimited(c, "%s", buf.buf);
 		printbuf_exit(&buf);
 
-		rbio->bio.bi_status = BLK_STS_IOERR;
-		bch2_rbio_done(rbio);
+		rbio->bio.bi_status	= BLK_STS_IOERR;
+		rbio->ret		= ret;
+
+		if (!(flags & BCH_READ_in_retry))
+			bch2_rbio_done(rbio);
 	}
 
-	bch2_trans_put(trans);
+	bch2_trans_iter_exit(trans, &iter);
 	bch2_bkey_buf_exit(&sk, c);
+	return ret;
 }
 
 void bch2_fs_io_read_exit(struct bch_fs *c)
diff --git a/libbcachefs/io_read.h b/libbcachefs/io_read.h
index 73275da5..edcf50a4 100644
--- a/libbcachefs/io_read.h
+++ b/libbcachefs/io_read.h
@@ -3,6 +3,7 @@
 #define _BCACHEFS_IO_READ_H
 
 #include "bkey_buf.h"
+#include "btree_iter.h"
 #include "reflink.h"
 
 struct bch_read_bio {
@@ -40,13 +41,12 @@ struct bch_read_bio {
 				split:1,
 				have_ioref:1,
 				narrow_crcs:1,
-				hole:1,
 				saw_error:1,
-				retry:2,
 				context:2;
 	};
 	u16			_state;
 	};
+	s16			ret;
 
 	struct extent_ptr_decoded pick;
 
@@ -141,22 +141,21 @@ static inline void bch2_read_extent(struct btree_trans *trans,
 			   data_btree, k, offset_into_extent, NULL, flags, -1);
 }
 
-void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
-		 subvol_inum, struct bch_io_failures *, unsigned flags);
+int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter,
+		subvol_inum, struct bch_io_failures *, unsigned flags);
 
 static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
 			     subvol_inum inum)
 {
-	struct bch_io_failures failed = { .nr = 0 };
-
 	BUG_ON(rbio->_state);
 
 	rbio->subvol = inum.subvol;
 
-	__bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
-		    BCH_READ_retry_if_stale|
-		    BCH_READ_may_promote|
-		    BCH_READ_user_mapped);
+	bch2_trans_run(c,
+		__bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL,
+			    BCH_READ_retry_if_stale|
+			    BCH_READ_may_promote|
+			    BCH_READ_user_mapped));
 }
 
 static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio,
@@ -166,6 +165,7 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio,
 
 	rbio->c		= orig->c;
 	rbio->_state	= 0;
+	rbio->ret	= 0;
 	rbio->split	= true;
 	rbio->parent	= orig;
 	rbio->opts	= orig->opts;
@@ -182,6 +182,7 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio,
 	rbio->start_time	= local_clock();
 	rbio->c			= c;
 	rbio->_state		= 0;
+	rbio->ret	= 0;
 	rbio->opts		= opts;
 	rbio->bio.bi_end_io	= end_io;
 	return rbio;
diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c
index 738bdbfb..dbfcb28f 100644
--- a/libbcachefs/io_write.c
+++ b/libbcachefs/io_write.c
@@ -716,11 +716,15 @@ static void bch2_write_endio(struct bio *bio)
 		? bch2_dev_have_ref(c, wbio->dev)
 		: NULL;
 
-	if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
+				   wbio->submit_time, !bio->bi_status);
+
+	if (bio->bi_status) {
+		bch_err_inum_offset_ratelimited(ca,
 				    op->pos.inode,
 				    wbio->inode_offset << 9,
 				    "data write error: %s",
-				    bch2_blk_status_to_str(bio->bi_status))) {
+				    bch2_blk_status_to_str(bio->bi_status));
 		set_bit(wbio->dev, op->failed.d);
 		op->flags |= BCH_WRITE_io_error;
 	}
@@ -732,10 +736,8 @@ static void bch2_write_endio(struct bio *bio)
 		set_bit(wbio->dev, op->devs_need_flush->d);
 	}
 
-	if (wbio->have_ioref) {
-		bch2_latency_acct(ca, wbio->submit_time, WRITE);
+	if (wbio->have_ioref)
 		percpu_ref_put(&ca->io_ref);
-	}
 
 	if (wbio->bounce)
 		bch2_bio_free_pages_pool(c, bio);
diff --git a/libbcachefs/io_write.h b/libbcachefs/io_write.h
index bf942566..62773053 100644
--- a/libbcachefs/io_write.h
+++ b/libbcachefs/io_write.h
@@ -11,12 +11,6 @@
 void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
 void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
 
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-void bch2_latency_acct(struct bch_dev *, u64, int);
-#else
-static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
-#endif
-
 void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
 			       enum bch_data_type, const struct bkey_i *, bool);
 
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 3d097de8..8d4f3bfa 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -1096,8 +1096,8 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j,
 
 /* allocate journal on a device: */
 
-static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
-					 bool new_fs, struct closure *cl)
+static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr,
+					    bool new_fs, struct closure *cl)
 {
 	struct bch_fs *c = ca->fs;
 	struct journal_device *ja = &ca->journal;
@@ -1225,26 +1225,20 @@ err_free:
 	return ret;
 }
 
-/*
- * Allocate more journal space at runtime - not currently making use if it, but
- * the code works:
- */
-int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
-				unsigned nr)
+static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca,
+					    unsigned nr, bool new_fs)
 {
 	struct journal_device *ja = &ca->journal;
-	struct closure cl;
 	int ret = 0;
 
+	struct closure cl;
 	closure_init_stack(&cl);
 
-	down_write(&c->state_lock);
-
 	/* don't handle reducing nr of buckets yet: */
 	if (nr < ja->nr)
-		goto unlock;
+		return 0;
 
-	while (ja->nr < nr) {
+	while (!ret && ja->nr < nr) {
 		struct disk_reservation disk_res = { 0, 0, 0 };
 
 		/*
@@ -1257,25 +1251,38 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
 		 * filesystem-wide allocation will succeed, this is a device
 		 * specific allocation - we can hang here:
 		 */
+		if (!new_fs) {
+			ret = bch2_disk_reservation_get(c, &disk_res,
+							bucket_to_sector(ca, nr - ja->nr), 1, 0);
+			if (ret)
+				break;
+		}
 
-		ret = bch2_disk_reservation_get(c, &disk_res,
-						bucket_to_sector(ca, nr - ja->nr), 1, 0);
-		if (ret)
-			break;
+		ret = bch2_set_nr_journal_buckets_iter(ca, nr, new_fs, &cl);
 
-		ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
+		if (ret == -BCH_ERR_bucket_alloc_blocked ||
+		    ret == -BCH_ERR_open_buckets_empty)
+			ret = 0; /* wait and retry */
 
 		bch2_disk_reservation_put(c, &disk_res);
-
 		closure_sync(&cl);
-
-		if (ret && ret != -BCH_ERR_bucket_alloc_blocked)
-			break;
 	}
 
-	bch_err_fn(c, ret);
-unlock:
+	return ret;
+}
+
+/*
+ * Allocate more journal space at runtime - not currently making use of it, but
+ * the code works:
+ */
+int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
+				unsigned nr)
+{
+	down_write(&c->state_lock);
+	int ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, false);
 	up_write(&c->state_lock);
+
+	bch_err_fn(c, ret);
 	return ret;
 }
 
@@ -1301,7 +1308,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
 		     min(1 << 13,
 			 (1 << 24) / ca->mi.bucket_size));
 
-	ret = __bch2_set_nr_journal_buckets(ca, nr, new_fs, NULL);
+	ret = bch2_set_nr_journal_buckets_loop(ca->fs, ca, nr, new_fs);
 err:
 	bch_err_fn(ca, ret);
 	return ret;
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 7d59ccc0..331c9d76 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -1041,13 +1041,19 @@ reread:
 			bio->bi_iter.bi_sector = offset;
 			bch2_bio_map(bio, buf->data, sectors_read << 9);
 
+			u64 submit_time = local_clock();
 			ret = submit_bio_wait(bio);
 			kfree(bio);
 
-			if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read,
-					       "journal read error: sector %llu",
-					       offset) ||
-			    bch2_meta_read_fault("journal")) {
+			if (!ret && bch2_meta_read_fault("journal"))
+				ret = -BCH_ERR_EIO_fault_injected;
+
+			bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
+						   submit_time, !ret);
+
+			if (ret) {
+				bch_err_dev_ratelimited(ca,
+					"journal read error: sector %llu", offset);
 				/*
 				 * We don't error out of the recovery process
 				 * here, since the relevant journal entry may be
@@ -1110,13 +1116,16 @@ reread:
 		struct bch_csum csum;
 		csum_good = jset_csum_good(c, j, &csum);
 
-		if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
-				       "%s",
-				       (printbuf_reset(&err),
-					prt_str(&err, "journal "),
-					bch2_csum_err_msg(&err, csum_type, j->csum, csum),
-					err.buf)))
+		bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
+
+		if (!csum_good) {
+			bch_err_dev_ratelimited(ca, "%s",
+				(printbuf_reset(&err),
+				 prt_str(&err, "journal "),
+				 bch2_csum_err_msg(&err, csum_type, j->csum, csum),
+				 err.buf));
 			saw_bad = true;
+		}
 
 		ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
 			     j->encrypted_start,
@@ -1655,6 +1664,7 @@ static CLOSURE_CALLBACK(journal_write_done)
 	}
 
 	bool completed = false;
+	bool do_discards = false;
 
 	for (seq = journal_last_unwritten_seq(j);
 	     seq <= journal_cur_seq(j);
@@ -1667,7 +1677,7 @@
 			j->flushed_seq_ondisk = seq;
 			j->last_seq_ondisk = w->last_seq;
 
-			bch2_do_discards(c);
+			do_discards = true;
 			closure_wake_up(&c->freelist_wait);
 			bch2_reset_alloc_cursors(c);
 		}
@@ -1718,6 +1727,9 @@ static CLOSURE_CALLBACK(journal_write_done)
 	 */
 	bch2_journal_do_writes(j);
 	spin_unlock(&j->lock);
+
+	if (do_discards)
+		bch2_do_discards(c);
 }
 
 static void journal_write_endio(struct bio *bio)
@@ -1727,13 +1739,16 @@ static void journal_write_endio(struct bio *bio)
 	struct journal *j = &ca->fs->journal;
 	struct journal_buf *w = j->buf + jbio->buf_idx;
 
-	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
+				   jbio->submit_time, !bio->bi_status);
+
+	if (bio->bi_status) {
+		bch_err_dev_ratelimited(ca,
 			       "error writing journal entry %llu: %s",
 			       le64_to_cpu(w->data->seq),
-			       bch2_blk_status_to_str(bio->bi_status)) ||
-	    bch2_meta_write_fault("journal")) {
-		unsigned long flags;
+			       bch2_blk_status_to_str(bio->bi_status));
 
+		unsigned long flags;
 		spin_lock_irqsave(&j->err_lock, flags);
 		bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
 		spin_unlock_irqrestore(&j->err_lock, flags);
@@ -1762,7 +1777,11 @@ static CLOSURE_CALLBACK(journal_write_submit)
 			     sectors);
 
 		struct journal_device *ja = &ca->journal;
-		struct bio *bio = &ja->bio[w->idx]->bio;
+		struct journal_bio *jbio = ja->bio[w->idx];
+		struct bio *bio = &jbio->bio;
+
+		jbio->submit_time	= local_clock();
+
 		bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
 		bio->bi_iter.bi_sector	= ptr->offset;
 		bio->bi_end_io		= journal_write_endio;
@@ -1794,6 +1813,10 @@ static CLOSURE_CALLBACK(journal_write_preflush)
 	struct journal *j = container_of(w, struct journal, buf[w->idx]);
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 
+	/*
+	 * Wait for previous journal writes to complete; they won't necessarily
+	 * be flushed if they're still in flight
+	 */
 	if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
 		spin_lock(&j->lock);
 		if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index a0b17c6e..fd82f5d8 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -175,6 +175,7 @@ typedef DARRAY(u64)		darray_u64;
 struct journal_bio {
 	struct bch_dev		*ca;
 	unsigned		buf_idx;
+	u64			submit_time;
 
 	struct bio		bio;
 };
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index e944f279..a3096e2a 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -125,8 +125,8 @@ static void move_write(struct moving_io *io)
 				     &ctxt->stats->sectors_error_corrected);
 	}
 
-	if (unlikely(io->write.rbio.bio.bi_status ||
-		     io->write.rbio.hole ||
+	if (unlikely(io->write.rbio.ret ||
+		     io->write.rbio.bio.bi_status ||
 		     io->write.data_opts.scrub)) {
 		move_free(io);
 		return;
@@ -816,7 +816,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
 		if (!bp.v->level)
 			ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts);
 		else if (!data_opts.scrub)
-			ret = bch2_btree_node_rewrite_key(trans, bp.v->btree_id, bp.v->level, k.k->p, 0);
+			ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0);
 		else
 			ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev);
 
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 4ecb721c..fa19fc44 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -74,20 +74,14 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
 				  struct move_bucket *b, u64 time)
 {
 	struct bch_fs *c = trans->c;
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	struct bch_alloc_v4 _a;
-	const struct bch_alloc_v4 *a;
-	int ret;
 
-	if (bch2_bucket_is_open(trans->c,
-				b->k.bucket.inode,
-				b->k.bucket.offset))
+	if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset))
 		return 0;
 
-	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
-			       b->k.bucket, BTREE_ITER_cached);
-	ret = bkey_err(k);
+	struct btree_iter iter;
+	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
+				       b->k.bucket, BTREE_ITER_cached);
+	int ret = bkey_err(k);
 	if (ret)
 		return ret;
 
@@ -95,13 +89,18 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
 	if (!ca)
 		goto out;
 
-	a = bch2_alloc_to_v4(k, &_a);
+	if (ca->mi.state != BCH_MEMBER_STATE_rw ||
+	    !bch2_dev_is_online(ca))
+		goto out_put;
+
+	struct bch_alloc_v4 _a;
+	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
 	b->k.gen	= a->gen;
 	b->sectors	= bch2_bucket_sectors_dirty(*a);
 	u64 lru_idx	= alloc_lru_idx_fragmentation(*a, ca);
 
 	ret = lru_idx && lru_idx <= time;
-
+out_put:
 	bch2_dev_put(ca);
 out:
 	bch2_trans_iter_exit(trans, &iter);
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 071a92ec..afb89d31 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -145,6 +145,11 @@ enum fsck_err_opts {
 	  OPT_STR(bch2_error_actions),					\
 	  BCH_SB_ERROR_ACTION,		BCH_ON_ERROR_fix_safe,		\
 	  NULL,		"Action to take on filesystem error")		\
+	x(write_error_timeout,		u16,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,			\
+	  OPT_UINT(1, 300),						\
+	  BCH_SB_WRITE_ERROR_TIMEOUT,	30,				\
+	  NULL,		"Number of consecutive write errors allowed before kicking out a device")\
 	x(metadata_replicas,		u8,				\
 	  OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,			\
 	  OPT_UINT(1, BCH_REPLICAS_MAX),				\
diff --git a/libbcachefs/recovery_passes_types.h b/libbcachefs/recovery_passes_types.h
index 41855796..e89b9c78 100644
--- a/libbcachefs/recovery_passes_types.h
+++ b/libbcachefs/recovery_passes_types.h
@@ -24,7 +24,7 @@
 	x(check_topology,			 4, 0)					\
 	x(accounting_read,			39, PASS_ALWAYS)			\
 	x(alloc_read,				 0, PASS_ALWAYS)			\
-	x(stripes_read,				 1, PASS_ALWAYS)			\
+	x(stripes_read,				 1, 0)					\
 	x(initialize_subvolumes,		 2, 0)					\
 	x(snapshots_read,			 3, PASS_ALWAYS)			\
 	x(check_allocations,			 5, PASS_FSCK)				\
diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c
index 50118661..68172c6e 100644
--- a/libbcachefs/reflink.c
+++ b/libbcachefs/reflink.c
@@ -606,7 +606,7 @@ s64 bch2_remap_range(struct bch_fs *c,
 	u64 dst_done = 0;
 	u32 dst_snapshot, src_snapshot;
 	bool reflink_p_may_update_opts_field =
-		bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts);
+		!bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts);
 	int ret = 0, ret2 = 0;
 
 	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink))
diff --git a/libbcachefs/sb-downgrade.c b/libbcachefs/sb-downgrade.c
index 21130ead..acb5d845 100644
--- a/libbcachefs/sb-downgrade.c
+++ b/libbcachefs/sb-downgrade.c
@@ -91,9 +91,6 @@
 	  BCH_FSCK_ERR_accounting_mismatch,			\
 	  BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0,	\
 	  BCH_FSCK_ERR_accounting_key_junk_at_end)		\
-	x(directory_size,					\
-	  BIT_ULL(BCH_RECOVERY_PASS_check_dirents),		\
-	  BCH_FSCK_ERR_directory_size_mismatch)			\
 	x(cached_backpointers,					\
 	  BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
 	  BCH_FSCK_ERR_ptr_to_missing_backpointer)		\
diff --git a/libbcachefs/sb-errors_format.h b/libbcachefs/sb-errors_format.h
index cdafd877..67455beb 100644
--- a/libbcachefs/sb-errors_format.h
+++ b/libbcachefs/sb-errors_format.h
@@ -179,6 +179,7 @@ enum bch_fsck_flags {
 	x(ptr_crc_redundant,					160,	0)		\
 	x(ptr_crc_nonce_mismatch,				162,	0)		\
 	x(ptr_stripe_redundant,					163,	0)		\
+	x(extent_flags_not_at_start,				306,	0)		\
 	x(reservation_key_nr_replicas_invalid,			164,	0)		\
 	x(reflink_v_refcount_wrong,				165,	FSCK_AUTOFIX)	\
 	x(reflink_v_pos_bad,					292,	0)		\
@@ -316,7 +317,7 @@ enum bch_fsck_flags {
 	x(directory_size_mismatch,				303,	FSCK_AUTOFIX)	\
 	x(dirent_cf_name_too_big,				304,	0)		\
 	x(dirent_stray_data_after_cf_name,			305,	0)		\
-	x(MAX,							306,	0)
+	x(MAX,							307,	0)
 
 enum bch_sb_error_id {
 #define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/libbcachefs/sb-members.h b/libbcachefs/sb-members.h
index b29b6c6c..38261638 100644
--- a/libbcachefs/sb-members.h
+++ b/libbcachefs/sb-members.h
@@ -35,7 +35,7 @@ static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev)
 	return ret;
 }
 
-static inline bool bch2_dev_is_readable(struct bch_dev *ca)
+static inline bool bch2_dev_is_healthy(struct bch_dev *ca)
 {
 	return bch2_dev_is_online(ca) &&
 		ca->mi.state != BCH_MEMBER_STATE_failed;
@@ -283,6 +283,8 @@ static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev
 
 static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw)
 {
+	might_sleep();
+
 	rcu_read_lock();
 	struct bch_dev *ca = bch2_dev_rcu(c, dev);
 	if (ca && !percpu_ref_tryget(&ca->io_ref))
diff --git a/libbcachefs/six.c b/libbcachefs/six.c
index 7e7c66a1..7c403427 100644
--- a/libbcachefs/six.c
+++ b/libbcachefs/six.c
@@ -850,7 +850,8 @@ void six_lock_exit(struct six_lock *lock)
 EXPORT_SYMBOL_GPL(six_lock_exit);
 
 void __six_lock_init(struct six_lock *lock, const char *name,
-		     struct lock_class_key *key, enum six_lock_init_flags flags)
+		     struct lock_class_key *key, enum six_lock_init_flags flags,
+		     gfp_t gfp)
 {
 	atomic_set(&lock->state, 0);
 	raw_spin_lock_init(&lock->wait_lock);
@@ -873,7 +874,7 @@ void __six_lock_init(struct six_lock *lock, const char *name,
 		 * failure if they wish by checking lock->readers, but generally
 		 * will not want to treat it as an error.
 		 */
-		lock->readers = alloc_percpu(unsigned);
+		lock->readers = alloc_percpu_gfp(unsigned, gfp);
 	}
 #endif
 }
diff --git a/libbcachefs/six.h b/libbcachefs/six.h
index c142e06b..59b851cf 100644
--- a/libbcachefs/six.h
+++ b/libbcachefs/six.h
@@ -164,18 +164,19 @@ enum six_lock_init_flags {
 };
 
 void __six_lock_init(struct six_lock *lock, const char *name,
-		     struct lock_class_key *key, enum six_lock_init_flags flags);
+		     struct lock_class_key *key, enum six_lock_init_flags flags,
+		     gfp_t gfp);
 
 /**
  * six_lock_init - initialize a six lock
  * @lock:	lock to initialize
  * @flags:	optional flags, i.e. SIX_LOCK_INIT_PCPU
  */
-#define six_lock_init(lock, flags)					\
+#define six_lock_init(lock, flags, gfp)					\
 do {									\
 	static struct lock_class_key __key;				\
 									\
-	__six_lock_init((lock), #lock, &__key, flags);			\
+	__six_lock_init((lock), #lock, &__key, flags, gfp);			\
 } while (0)
 
 /**
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index 0edc8814..ee32d043 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -25,9 +25,6 @@
 #include <linux/sort.h>
 #include <linux/string_choices.h>
 
-static const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
-};
-
 struct bch2_metadata_version {
 	u16		version;
 	const char	*name;
@@ -69,14 +66,22 @@ enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_meta
 	return v;
 }
 
-void bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version)
+int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version)
 {
-	mutex_lock(&c->sb_lock);
-	SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,
-		max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));
-	c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_FEATURE_incompat_version_field);
-	bch2_write_super(c);
-	mutex_unlock(&c->sb_lock);
+	int ret = ((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) &&
+		   version <= c->sb.version_incompat_allowed)
+		? 0
+		: -BCH_ERR_may_not_use_incompat_feature;
+
+	if (!ret) {
+		mutex_lock(&c->sb_lock);
+		SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,
+			max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));
+		bch2_write_super(c);
+		mutex_unlock(&c->sb_lock);
+	}
+
+	return ret;
 }
 
 const char * const bch2_sb_fields[] = {
@@ -366,7 +371,6 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
 	struct bch_sb *sb = disk_sb->sb;
 	struct bch_sb_field_members_v1 *mi;
 	enum bch_opt_id opt_id;
-	u16 block_size;
 	int ret;
 
 	ret = bch2_sb_compatible(sb, out);
@@ -385,8 +389,6 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
 		return -BCH_ERR_invalid_sb_features;
 	}
 
-	block_size = le16_to_cpu(sb->block_size);
-
 	if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) {
 		prt_printf(out, "Bad user UUID (got zeroes)");
 		return -BCH_ERR_invalid_sb_uuid;
@@ -452,6 +454,9 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
 
 		if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2)
 			SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);
+
+		if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb))
+			SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30);
 	}
 
 #ifdef __KERNEL__
@@ -743,7 +748,7 @@ retry:
 	memset(sb, 0, sizeof(*sb));
 	sb->mode	= BLK_OPEN_READ;
 	sb->have_bio	= true;
-	sb->holder	= kmalloc(1, GFP_KERNEL);
+	sb->holder	= kzalloc(sizeof(*sb->holder), GFP_KERNEL);
 	if (!sb->holder)
 		return -ENOMEM;
 
@@ -906,16 +911,16 @@ static void write_super_endio(struct bio *bio)
 {
 	struct bch_dev *ca = bio->bi_private;
 
+	bch2_account_io_success_fail(ca, bio_data_dir(bio), !bio->bi_status);
+
 	/* XXX: return errors directly */
 
-	if (bch2_dev_io_err_on(bio->bi_status, ca,
-			       bio_data_dir(bio)
-			       ? BCH_MEMBER_ERROR_write
-			       : BCH_MEMBER_ERROR_read,
-			       "superblock %s error: %s",
+	if (bio->bi_status) {
+		bch_err_dev_ratelimited(ca, "superblock %s error: %s",
 			       str_write_read(bio_data_dir(bio)),
-			       bch2_blk_status_to_str(bio->bi_status)))
+			       bch2_blk_status_to_str(bio->bi_status));
 		ca->sb_write_error = 1;
+	}
 
 	closure_put(&ca->fs->sb_write);
 	percpu_ref_put(&ca->io_ref);
@@ -1154,7 +1159,7 @@ int bch2_write_super(struct bch_fs *c)
 				  !can_mount_with_written), c,
 		": Unable to write superblock to sufficient devices (from %ps)",
 		(void *) _RET_IP_))
-		ret = -1;
+		ret = -BCH_ERR_erofs_sb_err;
 out:
 	/* Make new options visible after they're persistent: */
 	bch2_sb_update(c);
@@ -1211,11 +1216,12 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat)
 		bch2_sb_field_resize(&c->disk_sb, downgrade, 0);
 
 	c->disk_sb.sb->version = cpu_to_le16(new_version);
-	c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
 
-	if (incompat)
+	if (incompat) {
+		c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
 		SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb,
 			max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version));
+	}
 }
 
 static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f,
diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h
index f1ab4f94..167dd98f 100644
--- a/libbcachefs/super-io.h
+++ b/libbcachefs/super-io.h
@@ -21,17 +21,14 @@ static inline bool bch2_version_compatible(u16 version)
 void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version);
 enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version);
 
-void bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version);
+int bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version);
 
-static inline bool bch2_request_incompat_feature(struct bch_fs *c,
-						 enum bcachefs_metadata_version version)
+static inline int bch2_request_incompat_feature(struct bch_fs *c,
+						enum bcachefs_metadata_version version)
 {
-	if (unlikely(version > c->sb.version_incompat)) {
-		if (version > c->sb.version_incompat_allowed)
-			return false;
-		bch2_set_version_incompat(c, version);
-	}
-	return true;
+	return likely(version <= c->sb.version_incompat)
+		? 0
+		: bch2_set_version_incompat(c, version);
 }
 
 static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f)
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 10c281ad..cffad3b6 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -1075,6 +1075,7 @@ int bch2_fs_start(struct bch_fs *c)
 	}
 
 	set_bit(BCH_FS_started, &c->flags);
+	wake_up(&c->ro_ref_wait);
 
 	if (c->opts.read_only) {
 		bch2_fs_read_only(c);
@@ -1431,6 +1432,13 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
 	ca->disk_sb = *sb;
 	memset(sb, 0, sizeof(*sb));
 
+	/*
+	 * Stash pointer to the filesystem for blk_holder_ops - note that once
+	 * attached to a filesystem, we will always close the block device
+	 * before tearing down the filesystem object.
+	 */
+	ca->disk_sb.holder->c = ca->fs;
+
 	ca->dev = ca->disk_sb.bdev->bd_dev;
 
 	percpu_ref_reinit(&ca->io_ref);
@@ -2016,6 +2024,102 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
 	return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
 }
 
+/* blk_holder_ops: */
+
+static struct bch_fs *bdev_get_fs(struct block_device *bdev)
+	__releases(&bdev->bd_holder_lock)
+{
+	struct bch_sb_handle_holder *holder = bdev->bd_holder;
+	struct bch_fs *c = holder->c;
+
+	if (c && !bch2_ro_ref_tryget(c))
+		c = NULL;
+
+	mutex_unlock(&bdev->bd_holder_lock);
+
+	if (c)
+		wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags));
+	return c;
+}
+
+/* returns with ref on ca->ref */
+static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev)
+{
+	for_each_member_device(c, ca)
+		if (ca->disk_sb.bdev == bdev)
+			return ca;
+	return NULL;
+}
+
+static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
+{
+	struct bch_fs *c = bdev_get_fs(bdev);
+	if (!c)
+		return;
+
+	struct super_block *sb = c->vfs_sb;
+	if (sb) {
+		/*
+		 * Not necessary, c->ro_ref guards against the filesystem being
+		 * unmounted - we only take this to avoid a warning in
+		 * sync_filesystem:
+		 */
+		down_read(&sb->s_umount);
+	}
+
+	down_write(&c->state_lock);
+	struct bch_dev *ca = bdev_to_bch_dev(c, bdev);
+	if (!ca)
+		goto unlock;
+
+	if (bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, BCH_FORCE_IF_DEGRADED)) {
+		__bch2_dev_offline(c, ca);
+	} else {
+		if (sb) {
+			if (!surprise)
+				sync_filesystem(sb);
+			shrink_dcache_sb(sb);
+			evict_inodes(sb);
+		}
+
+		bch2_journal_flush(&c->journal);
+		bch2_fs_emergency_read_only(c);
+	}
+
+	bch2_dev_put(ca);
+unlock:
+	if (sb)
+		up_read(&sb->s_umount);
+	up_write(&c->state_lock);
+	bch2_ro_ref_put(c);
+}
+
+static void bch2_fs_bdev_sync(struct block_device *bdev)
+{
+	struct bch_fs *c = bdev_get_fs(bdev);
+	if (!c)
+		return;
+
+	struct super_block *sb = c->vfs_sb;
+	if (sb) {
+		/*
+		 * Not necessary, c->ro_ref guards against the filesystem being
+		 * unmounted - we only take this to avoid a warning in
+		 * sync_filesystem:
+		 */
+		down_read(&sb->s_umount);
+		sync_filesystem(sb);
+		up_read(&sb->s_umount);
+	}
+
+	bch2_ro_ref_put(c);
+}
+
+const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
+	.mark_dead		= bch2_fs_bdev_mark_dead,
+	.sync			= bch2_fs_bdev_sync,
+};
+
 /* Filesystem open: */
 
 static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
diff --git a/libbcachefs/super.h b/libbcachefs/super.h
index 04f8287e..23533bce 100644
--- a/libbcachefs/super.h
+++ b/libbcachefs/super.h
@@ -42,4 +42,6 @@ void bch2_fs_stop(struct bch_fs *);
 int bch2_fs_start(struct bch_fs *);
 struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts);
 
+extern const struct blk_holder_ops bch2_sb_handle_bdev_ops;
+
 #endif /* _BCACHEFS_SUPER_H */
diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h
index 368a63d9..3a899f79 100644
--- a/libbcachefs/super_types.h
+++ b/libbcachefs/super_types.h
@@ -2,13 +2,19 @@
 #ifndef _BCACHEFS_SUPER_TYPES_H
 #define _BCACHEFS_SUPER_TYPES_H
 
+struct bch_fs;
+
+struct bch_sb_handle_holder {
+	struct bch_fs		*c;
+};
+
 struct bch_sb_handle {
 	struct bch_sb		*sb;
 	struct file		*s_bdev_file;
 	struct block_device	*bdev;
 	char			*sb_name;
 	struct bio		*bio;
-	void			*holder;
+	struct bch_sb_handle_holder *holder;
 	size_t			buffer_size;
 	blk_mode_t		mode;
 	unsigned		have_layout:1;
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index a9953181..2ed3f755 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -174,7 +174,6 @@ read_attribute(journal_debug);
 read_attribute(btree_cache);
 read_attribute(btree_key_cache);
 read_attribute(btree_reserve_cache);
-read_attribute(stripes_heap);
 read_attribute(open_buckets);
 read_attribute(open_buckets_partial);
 read_attribute(nocow_lock_table);
@@ -355,9 +354,6 @@ SHOW(bch2_fs)
 	if (attr == &sysfs_btree_reserve_cache)
 		bch2_btree_reserve_cache_to_text(out, c);
 
-	if (attr == &sysfs_stripes_heap)
-		bch2_stripes_heap_to_text(out, c);
-
 	if (attr == &sysfs_open_buckets)
 		bch2_open_buckets_to_text(out, c, NULL);
 
@@ -566,7 +562,6 @@ struct attribute *bch2_fs_internal_files[] = {
 	&sysfs_btree_key_cache,
 	&sysfs_btree_reserve_cache,
 	&sysfs_new_stripes,
-	&sysfs_stripes_heap,
 	&sysfs_open_buckets,
 	&sysfs_open_buckets_partial,
 #ifdef BCH_WRITE_REF_DEBUG
diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h
index 5718988d..c8669a6b 100644
--- a/libbcachefs/trace.h
+++ b/libbcachefs/trace.h
@@ -339,6 +339,30 @@ DEFINE_EVENT(bio, io_read_reuse_race,
 	TP_ARGS(bio)
 );
 
+/* ec.c */
+
+TRACE_EVENT(stripe_create,
+	TP_PROTO(struct bch_fs *c, u64 idx, int ret),
+	TP_ARGS(c, idx, ret),
+
+	TP_STRUCT__entry(
+		__field(dev_t,		dev			)
+		__field(u64,		idx			)
+		__field(int,		ret			)
+	),
+
+	TP_fast_assign(
+		__entry->dev			= c->dev;
+		__entry->idx			= idx;
+		__entry->ret			= ret;
+	),
+
+	TP_printk("%d,%d idx %llu ret %i",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->idx,
+		  __entry->ret)
+);
+
 /* Journal */
 
 DEFINE_EVENT(bch_fs, journal_full,
diff --git a/linux/blkdev.c b/linux/blkdev.c
index e496fc11..eb257d8b 100644
--- a/linux/blkdev.c
+++ b/linux/blkdev.c
@@ -208,6 +208,8 @@ struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode,
 	bdev->queue.backing_dev_info = bdev->bd_disk->bdi;
 	bdev->bd_inode		= &bdev->__bd_inode;
 
+	mutex_init(&bdev->bd_holder_lock);
+
 	struct file *file = calloc(sizeof(*file), 1);
 	file->f_inode = bdev->bd_inode;