From c0836924b19ae84ad95d7ec97455c96f61b81201 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sun, 16 Mar 2025 16:08:41 -0400
Subject: [PATCH] Update bcachefs sources to 4d28432bcc5f bcachefs: Validate
 bch_sb.offset field

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 .bcachefs_revision               |   2 +-
 include/crypto/sha2.h            |   6 +
 include/linux/math64.h           |  67 +++++++++
 include/linux/random.h           |  17 +++
 libbcachefs/alloc_background.c   |   2 +-
 libbcachefs/bcachefs.h           |   1 -
 libbcachefs/bcachefs_format.h    |   2 +
 libbcachefs/btree_cache.c        |   1 +
 libbcachefs/btree_io.c           |  45 +++---
 libbcachefs/btree_iter.c         |  14 --
 libbcachefs/btree_update.h       |   8 +
 libbcachefs/btree_write_buffer.c |  21 ++-
 libbcachefs/buckets.h            |   4 +-
 libbcachefs/checksum.c           |  23 +--
 libbcachefs/checksum.h           |   2 +
 libbcachefs/data_update.c        | 104 ++++++-------
 libbcachefs/ec.c                 |  16 +-
 libbcachefs/errcode.h            |  17 ++-
 libbcachefs/extents.c            | 147 +++++++++---------
 libbcachefs/extents.h            |   5 +-
 libbcachefs/extents_types.h      |   7 +-
 libbcachefs/fs-io-buffered.c     |   3 +
 libbcachefs/fs.c                 |  71 ++++-----
 libbcachefs/inode.c              |  14 +-
 libbcachefs/io_read.c            | 249 +++++++++----------------------
 libbcachefs/io_read.h            |  20 +--
 libbcachefs/io_write.c           |  38 ++++-
 libbcachefs/journal_io.c         |   5 -
 libbcachefs/move.c               |  40 +++--
 libbcachefs/movinggc.c           |  11 ++
 libbcachefs/opts.h               |   5 +
 libbcachefs/rebalance.c          |  42 +++++-
 libbcachefs/recovery.c           |   2 +-
 libbcachefs/sb-counters_format.h |   1 +
 libbcachefs/str_hash.h           |   8 +-
 libbcachefs/super-io.c           |  40 +++--
 libbcachefs/super-io.h           |   2 +
 libbcachefs/super.c              |  14 +-
 libbcachefs/sysfs.c              |  70 +++++++++
 libbcachefs/trace.h              |   5 +
 libbcachefs/util.c               |  36 +++--
 libbcachefs/util.h               |  14 +-
 42 files changed, 691 insertions(+), 510 deletions(-)

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 7d7555ff..e778bec6 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-46af7258b951a79a66511172ab8772ad2dfaa4e3
+4d28432bcc5f91caf053f64a1cde1a6286adf4a6
diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h
index 8a46202b..b6183bd0 100644
--- a/include/crypto/sha2.h
+++ b/include/crypto/sha2.h
@@ -7,6 +7,7 @@
 #define _CRYPTO_SHA_H
 
 #include <linux/types.h>
+#include <sodium/crypto_hash_sha256.h>
 
 #define SHA1_DIGEST_SIZE        20
 #define SHA1_BLOCK_SIZE         64
@@ -112,4 +113,9 @@ extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data,
 
 extern int crypto_sha512_finup(struct shash_desc *desc, const u8 *data,
 			       unsigned int len, u8 *hash);
+
+static inline void sha256(const u8 *data, unsigned int len, u8 *out)
+{
+	crypto_hash_sha256(out, data, len);
+}
 #endif
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 5eb6f064..13efcc08 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -82,4 +82,71 @@ static inline s64 div_s64(s64 dividend, s32 divisor)
 	return div_s64_rem(dividend, divisor, &remainder);
 }
 
+#ifndef mul_u32_u32
+/*
+ * Many a GCC version messes this up and generates a 64x64 mult :-(
+ */
+static inline u64 mul_u32_u32(u32 a, u32 b)
+{
+	return (u64)a * b;
+}
+#endif
+
+#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
+
+#ifndef mul_u64_u64_shr
+static __always_inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift)
+{
+	return (u64)(((unsigned __int128)a * mul) >> shift);
+}
+#endif /* mul_u64_u64_shr */
+
+#else
+
+#ifndef mul_u64_u64_shr
+static inline u64 mul_u64_u64_shr(u64 a, u64 b, unsigned int shift)
+{
+	union {
+		u64 ll;
+		struct {
+#ifdef __BIG_ENDIAN
+			u32 high, low;
+#else
+			u32 low, high;
+#endif
+		} l;
+	} rl, rm, rn, rh, a0, b0;
+	u64 c;
+
+	a0.ll = a;
+	b0.ll = b;
+
+	rl.ll = mul_u32_u32(a0.l.low, b0.l.low);
+	rm.ll = mul_u32_u32(a0.l.low, b0.l.high);
+	rn.ll = mul_u32_u32(a0.l.high, b0.l.low);
+	rh.ll = mul_u32_u32(a0.l.high, b0.l.high);
+
+	/*
+	 * Each of these lines computes a 64-bit intermediate result into "c",
+	 * starting at bits 32-95.  The low 32-bits go into the result of the
+	 * multiplication, the high 32-bits are carried into the next step.
+	 */
+	rl.l.high = c = (u64)rl.l.high + rm.l.low + rn.l.low;
+	rh.l.low = c = (c >> 32) + rm.l.high + rn.l.high + rh.l.low;
+	rh.l.high = (c >> 32) + rh.l.high;
+
+	/*
+	 * The 128-bit result of the multiplication is in rl.ll and rh.ll,
+	 * shift it right and throw away the high part of the result.
+	 */
+	if (shift == 0)
+		return rl.ll;
+	if (shift < 64)
+		return (rl.ll >> shift) | (rh.ll << (64 - shift));
+	return rh.ll >> (shift & 63);
+}
+#endif /* mul_u64_u64_shr */
+
+#endif
+
 #endif /* _LINUX_MATH64_H */
diff --git a/include/linux/random.h b/include/linux/random.h
index 3203d13c..9b2bb59a 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -9,7 +9,9 @@
 #include <unistd.h>
 #include <sys/syscall.h>
 #include <linux/bug.h>
+#include <linux/kernel.h>
 #include <linux/log2.h>
+#include <linux/math64.h>
 
 #ifdef SYS_getrandom
 static inline int getrandom(void *buf, size_t buflen, unsigned int flags)
@@ -67,4 +69,19 @@ static inline u32 get_random_u32_below(u32 ceil)
 	}
 }
 
+static inline u64 get_random_u64_below(u64 ceil)
+{
+	if (ceil <= 1)
+		return 0;
+	if (ceil <= U32_MAX)
+		return get_random_u32_below(ceil);
+
+	for (;;) {
+		u64 rand = get_random_u64();
+		u64 mult = ceil * rand;
+		if (likely(mult >= -ceil % ceil))
+			return mul_u64_u64_shr(ceil, rand, 64);
+	}
+}
+
 #endif /* _LINUX_RANDOM_H */
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index ecad4a78..4dfcf3e6 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -232,7 +232,7 @@ int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
 	int ret = 0;
 
 	bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k),
-			 c, alloc_v2_unpack_error,
+			 c, alloc_v3_unpack_error,
 			 "unpack error");
 fsck_err:
 	return ret;
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index b432bb6e..0ea593e8 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -979,7 +979,6 @@ struct bch_fs {
 	mempool_t		compress_workspace[BCH_COMPRESSION_OPT_NR];
 	size_t			zstd_workspace_size;
 
-	struct crypto_shash	*sha256;
 	struct crypto_sync_skcipher *chacha20;
 	struct crypto_shash	*poly1305;
 
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 7a5b0d21..e96d8776 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -842,6 +842,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS,	struct bch_sb, flags[3], 28, 29);
 LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
 LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
 LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
+/* one free bit */
 LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
 LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
 LE64_BITMASK(BCH_SB_NOCOW,		struct bch_sb, flags[4], 33, 34);
@@ -861,6 +862,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
 					struct bch_sb, flags[5], 48, 64);
 LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS,	struct bch_sb, flags[6],  0,  4);
 LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6],  4, 14);
+LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR,	struct bch_sb, flags[6], 14, 20);
 
 static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
 {
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 1ec1f90e..54666027 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -610,6 +610,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
 		       btree_node_write_in_flight(b));
 
 		btree_node_data_free(bc, b);
+		cond_resched();
 	}
 
 	BUG_ON(!bch2_journal_error(&c->journal) &&
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 6638bb1f..6abc9f17 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -2080,11 +2080,6 @@ static void btree_node_write_work(struct work_struct *work)
 		container_of(work, struct btree_write_bio, work);
 	struct bch_fs *c	= wbio->wbio.c;
 	struct btree *b		= wbio->wbio.bio.bi_private;
-	unsigned commit_flags =
-		BCH_WATERMARK_interior_updates|
-		BCH_TRANS_COMMIT_journal_reclaim|
-		BCH_TRANS_COMMIT_no_enospc|
-		BCH_TRANS_COMMIT_no_check_rw;
 	u64 start_time		= wbio->start_time;
 	int ret = 0;
 
@@ -2093,24 +2088,38 @@ static void btree_node_write_work(struct work_struct *work)
 		wbio->wbio.used_mempool,
 		wbio->data);
 
-	if (wbio->wbio.failed.nr) {
-		ret = bch2_trans_do(c,
-			bch2_btree_node_rewrite_key_get_iter(trans, b,
-					commit_flags));
-	} else if (!wbio->wbio.first_btree_write) {
+	bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
+		bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
+
+	if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
+		ret = -BCH_ERR_btree_node_write_all_failed;
+		goto err;
+	}
+
+	if (wbio->wbio.first_btree_write) {
+		if (wbio->wbio.failed.nr) {
+
+		}
+	} else {
 		ret = bch2_trans_do(c,
 			bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
-					commit_flags, true));
+					BCH_WATERMARK_interior_updates|
+					BCH_TRANS_COMMIT_journal_reclaim|
+					BCH_TRANS_COMMIT_no_enospc|
+					BCH_TRANS_COMMIT_no_check_rw,
+					!wbio->wbio.failed.nr));
+		if (ret)
+			goto err;
 	}
-
-	if (ret) {
-		set_btree_node_noevict(b);
-		bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
-				     "writing btree node: %s", bch2_err_str(ret));
-	}
-
+out:
 	bio_put(&wbio->wbio.bio);
 	btree_node_write_done(c, b, start_time);
+	return;
+err:
+	set_btree_node_noevict(b);
+	bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
+			     "writing btree node: %s", bch2_err_str(ret));
+	goto out;
 }
 
 static void btree_node_write_endio(struct bio *bio)
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index e32fce4f..7542c6f9 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -562,20 +562,6 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
 			bch2_btree_node_iter_peek_all(&l->iter, l->b));
 }
 
-static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
-						    struct btree_path *path,
-						    struct btree_path_level *l,
-						    struct bkey *u)
-{
-	struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
-			bch2_btree_node_iter_peek(&l->iter, l->b));
-
-	path->pos = k.k ? k.k->p : l->b->key.k.p;
-	trans->paths_sorted = false;
-	bch2_btree_path_verify_level(trans, path, l - path->l);
-	return k;
-}
-
 static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
 						    struct btree_path *path,
 						    struct btree_path_level *l,
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index 8f22ef9a..47d8690f 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -126,10 +126,18 @@ bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
 
 int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *);
 
+int bch2_btree_write_buffer_insert_err(struct btree_trans *,
+				       enum btree_id, struct bkey_i *);
+
 static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
 					    enum btree_id btree,
 					    struct bkey_i *k)
 {
+	if (unlikely(!btree_type_uses_write_buffer(btree))) {
+		int ret = bch2_btree_write_buffer_insert_err(trans, btree, k);
+		dump_stack();
+		return ret;
+	}
 	/*
 	 * Most updates skip the btree write buffer until journal replay is
 	 * finished because synchronization with journal replay relies on having
diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c
index b56c4987..2c09d19d 100644
--- a/libbcachefs/btree_write_buffer.c
+++ b/libbcachefs/btree_write_buffer.c
@@ -264,6 +264,22 @@ out:
 	BUG_ON(wb->sorted.size < wb->flushing.keys.nr);
 }
 
+int bch2_btree_write_buffer_insert_err(struct btree_trans *trans,
+				       enum btree_id btree, struct bkey_i *k)
+{
+	struct bch_fs *c = trans->c;
+	struct printbuf buf = PRINTBUF;
+
+	prt_printf(&buf, "attempting to do write buffer update on non wb btree=");
+	bch2_btree_id_to_text(&buf, btree);
+	prt_str(&buf, "\n");
+	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+
+	bch2_fs_inconsistent(c, "%s", buf.buf);
+	printbuf_exit(&buf);
+	return -EROFS;
+}
+
 static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 {
 	struct bch_fs *c = trans->c;
@@ -312,7 +328,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 	darray_for_each(wb->sorted, i) {
 		struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
 
-		BUG_ON(!btree_type_uses_write_buffer(k->btree));
+		if (unlikely(!btree_type_uses_write_buffer(k->btree))) {
+			ret = bch2_btree_write_buffer_insert_err(trans, k->btree, &k->k);
+			goto err;
+		}
 
 		for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
 			prefetch(&wb->flushing.keys.data[n->idx]);
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 6aeec1c0..c5363256 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -140,9 +140,7 @@ static inline int gen_cmp(u8 a, u8 b)
 
 static inline int gen_after(u8 a, u8 b)
 {
-	int r = gen_cmp(a, b);
-
-	return r > 0 ? r : 0;
+	return max(0, gen_cmp(a, b));
 }
 
 static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c
index 23a38357..7f9e4c59 100644
--- a/libbcachefs/checksum.c
+++ b/libbcachefs/checksum.c
@@ -693,6 +693,14 @@ static int bch2_alloc_ciphers(struct bch_fs *c)
 	return 0;
 }
 
+#if 0
+
+/*
+ * This seems to be duplicating code in cmd_remove_passphrase() in
+ * bcachefs-tools, but we might want to switch userspace to use this - and
+ * perhaps add an ioctl for calling this at runtime, so we can take the
+ * passphrase off of a mounted filesystem (which has come up).
+ */
 int bch2_disable_encryption(struct bch_fs *c)
 {
 	struct bch_sb_field_crypt *crypt;
@@ -725,6 +733,10 @@ out:
 	return ret;
 }
 
+/*
+ * For enabling encryption on an existing filesystem: not hooked up yet, but it
+ * should be
+ */
 int bch2_enable_encryption(struct bch_fs *c, bool keyed)
 {
 	struct bch_encrypted_key key;
@@ -781,6 +793,7 @@ err:
 	memzero_explicit(&key, sizeof(key));
 	return ret;
 }
+#endif
 
 void bch2_fs_encryption_exit(struct bch_fs *c)
 {
@@ -788,8 +801,6 @@ void bch2_fs_encryption_exit(struct bch_fs *c)
 		crypto_free_shash(c->poly1305);
 	if (c->chacha20)
 		crypto_free_sync_skcipher(c->chacha20);
-	if (c->sha256)
-		crypto_free_shash(c->sha256);
 }
 
 int bch2_fs_encryption_init(struct bch_fs *c)
@@ -798,14 +809,6 @@ int bch2_fs_encryption_init(struct bch_fs *c)
 	struct bch_key key;
 	int ret = 0;
 
-	c->sha256 = crypto_alloc_shash("sha256", 0, 0);
-	ret = PTR_ERR_OR_ZERO(c->sha256);
-	if (ret) {
-		c->sha256 = NULL;
-		bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret));
-		goto out;
-	}
-
 	crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
 	if (!crypt)
 		goto out;
diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h
index 43b9d71f..4ac251c8 100644
--- a/libbcachefs/checksum.h
+++ b/libbcachefs/checksum.h
@@ -103,8 +103,10 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_crypt;
 int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
 			struct bch_key *);
 
+#if 0
 int bch2_disable_encryption(struct bch_fs *);
 int bch2_enable_encryption(struct bch_fs *, bool);
+#endif
 
 void bch2_fs_encryption_exit(struct bch_fs *);
 int bch2_fs_encryption_init(struct bch_fs *);
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index 522574bc..08bb7f30 100644
--- a/libbcachefs/data_update.c
+++ b/libbcachefs/data_update.c
@@ -638,40 +638,6 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
 		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
 }
 
-static bool can_allocate_without_blocking(struct bch_fs *c,
-					  struct data_update *m)
-{
-	if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
-		return false;
-
-	unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
-		? m->op.target
-		: 0;
-	struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
-
-	darray_for_each(m->op.devs_have, i)
-		__clear_bit(*i, devs.d);
-
-	rcu_read_lock();
-	unsigned nr_replicas = 0, i;
-	for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
-		struct bch_dev *ca = bch2_dev_rcu(c, i);
-
-		struct bch_dev_usage usage;
-		bch2_dev_usage_read_fast(ca, &usage);
-
-		if (!dev_buckets_free(ca, usage, m->op.watermark))
-			continue;
-
-		nr_replicas += ca->mi.durability;
-		if (nr_replicas >= m->op.nr_replicas)
-			break;
-	}
-	rcu_read_unlock();
-
-	return nr_replicas >= m->op.nr_replicas;
-}
-
 int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
 			       struct bch_io_opts *io_opts)
 {
@@ -700,22 +666,49 @@ int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
 	}
 
 	rbio_init(&m->rbio.bio, c, *io_opts, NULL);
+	m->rbio.data_update		= true;
 	m->rbio.bio.bi_iter.bi_size	= buf_bytes;
 	m->rbio.bio.bi_iter.bi_sector	= bkey_start_offset(&m->k.k->k);
 	m->op.wbio.bio.bi_ioprio	= IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
 	return 0;
 }
 
-static bool can_write_extent(struct bch_fs *c,
-			     struct bch_devs_list *devs_have,
-			     unsigned target)
+static int can_write_extent(struct bch_fs *c, struct data_update *m)
 {
+	if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
+	    unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
+		return -BCH_ERR_data_update_done_would_block;
+
+	unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
+		? m->op.target
+		: 0;
 	struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
 
-	darray_for_each(*devs_have, i)
+	darray_for_each(m->op.devs_have, i)
 		__clear_bit(*i, devs.d);
 
-	return !bch2_is_zero(&devs, sizeof(devs));
+	rcu_read_lock();
+	unsigned nr_replicas = 0, i;
+	for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
+		struct bch_dev *ca = bch2_dev_rcu(c, i);
+
+		struct bch_dev_usage usage;
+		bch2_dev_usage_read_fast(ca, &usage);
+
+		if (!dev_buckets_free(ca, usage, m->op.watermark))
+			continue;
+
+		nr_replicas += ca->mi.durability;
+		if (nr_replicas >= m->op.nr_replicas)
+			break;
+	}
+	rcu_read_unlock();
+
+	if (!nr_replicas)
+		return -BCH_ERR_data_update_done_no_rw_devs;
+	if (nr_replicas < m->op.nr_replicas)
+		return -BCH_ERR_insufficient_devices;
+	return 0;
 }
 
 int bch2_data_update_init(struct btree_trans *trans,
@@ -799,20 +792,6 @@ int bch2_data_update_init(struct btree_trans *trans,
 		ptr_bit <<= 1;
 	}
 
-	if (!can_write_extent(c, &m->op.devs_have,
-			      m->op.flags & BCH_WRITE_only_specified_devs ? m->op.target : 0)) {
-		/*
-		 * Check if we have rw devices not in devs_have: this can happen
-		 * if we're trying to move data on a ro or failed device
-		 *
-		 * If we can't move it, we need to clear the rebalance_work bit,
-		 * if applicable
-		 *
-		 * Also, copygc should skip ro/failed devices:
-		 */
-		return -BCH_ERR_data_update_done_no_rw_devs;
-	}
-
 	unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
 
 	/*
@@ -852,11 +831,22 @@ int bch2_data_update_init(struct btree_trans *trans,
 		goto out_bkey_buf_exit;
 	}
 
-	if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
-	    !can_allocate_without_blocking(c, m)) {
-		ret = -BCH_ERR_data_update_done_would_block;
+	/*
+	 * Check if the allocation will succeed, to avoid getting an error later
+	 * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless
+	 * read:
+	 *
+	 * This guards against
+	 * - BCH_WRITE_alloc_nowait allocations failing (promotes)
+	 * - Destination target full
+	 * - Device(s) in destination target offline
+	 * - Insufficient durability available in destination target
+	 *   (i.e. trying to move a durability=2 replica to a target with a
+	 *   single durability=2 device)
+	 */
+	ret = can_write_extent(c, m);
+	if (ret)
 		goto out_bkey_buf_exit;
-	}
 
 	if (reserve_sectors) {
 		ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 865cc53a..c73ba73f 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -380,19 +380,6 @@ static int mark_stripe_buckets(struct btree_trans *trans,
 	return 0;
 }
 
-static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s)
-{
-	m->sectors	= le16_to_cpu(s->sectors);
-	m->algorithm	= s->algorithm;
-	m->nr_blocks	= s->nr_blocks;
-	m->nr_redundant	= s->nr_redundant;
-	m->disk_label	= s->disk_label;
-	m->blocks_nonempty = 0;
-
-	for (unsigned i = 0; i < s->nr_blocks; i++)
-		m->blocks_nonempty += !!stripe_blockcount_get(s, i);
-}
-
 int bch2_trigger_stripe(struct btree_trans *trans,
 			enum btree_id btree, unsigned level,
 			struct bkey_s_c old, struct bkey_s _new,
@@ -1320,6 +1307,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
 	if (s->err) {
 		if (!bch2_err_matches(s->err, EROFS))
 			bch_err(c, "error creating stripe: error writing data buckets");
+		ret = s->err;
 		goto err;
 	}
 
@@ -1328,6 +1316,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
 
 		if (ec_do_recov(c, &s->existing_stripe)) {
 			bch_err(c, "error creating stripe: error reading existing stripe");
+			ret = -BCH_ERR_ec_block_read;
 			goto err;
 		}
 
@@ -1353,6 +1342,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
 
 	if (ec_nr_failed(&s->new_stripe)) {
 		bch_err(c, "error creating stripe: error writing redundancy buckets");
+		ret = -BCH_ERR_ec_block_write;
 		goto err;
 	}
 
diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h
index 531fe575..cb27de6f 100644
--- a/libbcachefs/errcode.h
+++ b/libbcachefs/errcode.h
@@ -231,6 +231,7 @@
 	x(BCH_ERR_invalid_sb,		invalid_sb_csum)			\
 	x(BCH_ERR_invalid_sb,		invalid_sb_block_size)			\
 	x(BCH_ERR_invalid_sb,		invalid_sb_uuid)			\
+	x(BCH_ERR_invalid_sb,		invalid_sb_offset)			\
 	x(BCH_ERR_invalid_sb,		invalid_sb_too_many_members)		\
 	x(BCH_ERR_invalid_sb,		invalid_sb_dev_idx)			\
 	x(BCH_ERR_invalid_sb,		invalid_sb_time_precision)		\
@@ -273,21 +274,25 @@
 	x(EIO,				stripe_reconstruct)			\
 	x(EIO,				key_type_error)				\
 	x(EIO,				extent_poisened)			\
-	x(EIO,				no_device_to_read_from)			\
 	x(EIO,				missing_indirect_extent)		\
 	x(EIO,				invalidate_stripe_to_dev)		\
 	x(EIO,				no_encryption_key)			\
 	x(EIO,				insufficient_journal_devices)		\
 	x(EIO,				device_offline)				\
 	x(EIO,				EIO_fault_injected)			\
+	x(EIO,				ec_block_read)				\
+	x(EIO,				ec_block_write)				\
 	x(EIO,				data_read)				\
+	x(BCH_ERR_data_read,		no_device_to_read_from)			\
+	x(BCH_ERR_data_read,		data_read_io_err)			\
+	x(BCH_ERR_data_read,		data_read_csum_err)			\
 	x(BCH_ERR_data_read,		data_read_retry)			\
 	x(BCH_ERR_data_read_retry,	data_read_retry_avoid)			\
-	x(BCH_ERR_data_read_retry_avoid,data_read_device_offline)		\
-	x(BCH_ERR_data_read_retry_avoid,data_read_io_err)			\
-	x(BCH_ERR_data_read_retry_avoid,data_read_ec_reconstruct_err)		\
-	x(BCH_ERR_data_read_retry_avoid,data_read_csum_err)			\
-	x(BCH_ERR_data_read_retry,	data_read_csum_err_maybe_userspace)	\
+	x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline)		\
+	x(BCH_ERR_data_read_retry_avoid,data_read_retry_io_err)			\
+	x(BCH_ERR_data_read_retry_avoid,data_read_retry_ec_reconstruct_err)	\
+	x(BCH_ERR_data_read_retry_avoid,data_read_retry_csum_err)		\
+	x(BCH_ERR_data_read_retry,	data_read_retry_csum_err_maybe_userspace)\
 	x(BCH_ERR_data_read,		data_read_decompress_err)		\
 	x(BCH_ERR_data_read,		data_read_decrypt_err)			\
 	x(BCH_ERR_data_read,		data_read_ptr_stale_race)		\
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index f62ee96b..1da754a8 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -28,6 +28,8 @@
 #include "trace.h"
 #include "util.h"
 
+#include <linux/random.h>
+
 static const char * const bch2_extent_flags_strs[] = {
 #define x(n, v)	[BCH_EXTENT_FLAG_##n] = #n,
 	BCH_EXTENT_FLAGS()
@@ -94,38 +96,30 @@ static inline int dev_failed(struct bch_dev *ca)
  */
 static inline bool ptr_better(struct bch_fs *c,
 			      const struct extent_ptr_decoded p1,
-			      const struct extent_ptr_decoded p2)
+			      u64 p1_latency,
+			      struct bch_dev *ca1,
+			      const struct extent_ptr_decoded p2,
+			      u64 p2_latency)
 {
-	if (likely(!p1.do_ec_reconstruct &&
-		   !p2.do_ec_reconstruct)) {
-		struct bch_dev *ca1 = bch2_dev_rcu(c, p1.ptr.dev);
-		struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
+	struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
 
-		int failed_delta = dev_failed(ca1) - dev_failed(ca2);
+	int failed_delta = dev_failed(ca1) - dev_failed(ca2);
+	if (unlikely(failed_delta))
+		return failed_delta < 0;
 
-		if (failed_delta)
-			return failed_delta < 0;
-
-		u64 l1 = dev_latency(ca1);
-		u64 l2 = dev_latency(ca2);
-
-		/*
-		 * Square the latencies, to bias more in favor of the faster
-		 * device - we never want to stop issuing reads to the slower
-		 * device altogether, so that we can update our latency numbers:
-		 */
-		l1 *= l1;
-		l2 *= l2;
-
-		/* Pick at random, biased in favor of the faster device: */
-
-		return bch2_rand_range(l1 + l2) > l1;
-	}
-
-	if (bch2_force_reconstruct_read)
+	if (unlikely(bch2_force_reconstruct_read))
 		return p1.do_ec_reconstruct > p2.do_ec_reconstruct;
 
-	return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
+	if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct))
+		return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
+
+	int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr;
+	if (unlikely(crc_retry_delta))
+		return crc_retry_delta < 0;
+
+	/* Pick at random, biased in favor of the faster device: */
+
+	return get_random_u64_below(p1_latency + p2_latency) > p1_latency;
 }
 
 /*
@@ -138,86 +132,105 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
 			       struct extent_ptr_decoded *pick,
 			       int dev)
 {
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded p;
-	struct bch_dev_io_failures *f;
-	unsigned csum_retry = 0;
-	bool have_csum_retries = false;
-	int ret = 0;
+	bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false;
+	bool have_dirty_ptrs = false, have_pick = false;
 
 	if (k.k->type == KEY_TYPE_error)
 		return -BCH_ERR_key_type_error;
 
-	if (bch2_bkey_extent_ptrs_flags(ptrs) & BCH_EXTENT_FLAG_poisoned)
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+	if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
 		return -BCH_ERR_extent_poisened;
-again:
+
 	rcu_read_lock();
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	u64 pick_latency;
+
 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		have_dirty_ptrs |= !p.ptr.cached;
+
 		/*
 		 * Unwritten extent: no need to actually read, treat it as a
 		 * hole and return 0s:
 		 */
 		if (p.ptr.unwritten) {
-			ret = 0;
-			break;
+			rcu_read_unlock();
+			return 0;
 		}
 
 		/* Are we being asked to read from a specific device? */
 		if (dev >= 0 && p.ptr.dev != dev)
 			continue;
 
-		/*
-		 * If there are any dirty pointers it's an error if we can't
-		 * read:
-		 */
-		if (!ret && !p.ptr.cached)
-			ret = -BCH_ERR_no_device_to_read_from;
-
 		struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
 
 		if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
 			continue;
 
-		if (unlikely(failed) &&
-		    (f = bch2_dev_io_failures(failed, p.ptr.dev))) {
-			have_csum_retries |= !f->failed_io && f->failed_csum_nr < BCH_MAX_CSUM_RETRIES;
+		struct bch_dev_io_failures *f =
+			unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
+		if (unlikely(f)) {
+			p.crc_retry_nr	   = f->failed_csum_nr;
+			p.has_ec	  &= ~f->failed_ec;
 
-			if (p.has_ec &&
-			    !f->failed_ec &&
-			    (f->failed_io || f->failed_csum_nr))
+			if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) {
+				have_io_errors	|= f->failed_io;
+				have_io_errors	|= f->failed_ec;
+			}
+			have_csum_errors	|= !!f->failed_csum_nr;
+
+			if (p.has_ec && (f->failed_io || f->failed_csum_nr))
 				p.do_ec_reconstruct = true;
 			else if (f->failed_io ||
-				 f->failed_csum_nr > csum_retry)
+				 f->failed_csum_nr > c->opts.checksum_err_retry_nr)
 				continue;
 		}
 
+		have_missing_devs |= ca && !bch2_dev_is_online(ca);
+
 		if (!ca || !bch2_dev_is_online(ca)) {
-			if (p.has_ec)
-				p.do_ec_reconstruct = true;
-			else
+			if (!p.has_ec)
 				continue;
+			p.do_ec_reconstruct = true;
 		}
 
-		if (p.has_ec && bch2_force_reconstruct_read)
+		if (bch2_force_reconstruct_read && p.has_ec)
 			p.do_ec_reconstruct = true;
 
-		if (ret > 0 && !ptr_better(c, p, *pick))
-			continue;
+		u64 p_latency = dev_latency(ca);
+		/*
+		 * Square the latencies, to bias more in favor of the faster
+		 * device - we never want to stop issuing reads to the slower
+		 * device altogether, so that we can update our latency numbers:
+		 */
+		p_latency *= p_latency;
 
-		*pick = p;
-		ret = 1;
+		if (!have_pick ||
+		    ptr_better(c,
+			       p, p_latency, ca,
+			       *pick, pick_latency)) {
+			*pick = p;
+			pick_latency = p_latency;
+			have_pick = true;
+		}
 	}
 	rcu_read_unlock();
 
-	if (unlikely(ret == -BCH_ERR_no_device_to_read_from &&
-		     have_csum_retries &&
-		     csum_retry < BCH_MAX_CSUM_RETRIES)) {
-		csum_retry++;
-		goto again;
-	}
+	if (have_pick)
+		return 1;
+	if (!have_dirty_ptrs)
+		return 0;
+	if (have_missing_devs)
+		return -BCH_ERR_no_device_to_read_from;
+	if (have_csum_errors)
+		return -BCH_ERR_data_read_csum_err;
+	if (have_io_errors)
+		return -BCH_ERR_data_read_io_err;
 
-	return ret;
+	WARN_ONCE(1, "unhandled error case in %s\n", __func__);
+	return -EINVAL;
 }
 
 /* KEY_TYPE_btree_ptr: */
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index b4058502..e78a39e7 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -320,8 +320,9 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
 ({									\
 	__label__ out;							\
 									\
-	(_ptr).has_ec	= false;					\
-	(_ptr).do_ec_reconstruct = false;				\
+	(_ptr).has_ec			= false;			\
+	(_ptr).do_ec_reconstruct	= false;			\
+	(_ptr).crc_retry_nr		= 0;				\
 									\
 	__bkey_extent_entry_for_each_from(_entry, _end, _entry)		\
 		switch (__extent_entry_type(_entry)) {			\
diff --git a/libbcachefs/extents_types.h b/libbcachefs/extents_types.h
index f8b8e598..e51529dc 100644
--- a/libbcachefs/extents_types.h
+++ b/libbcachefs/extents_types.h
@@ -21,19 +21,18 @@ struct bch_extent_crc_unpacked {
 
 struct extent_ptr_decoded {
 	bool				has_ec;
-	unsigned			do_ec_reconstruct;
+	bool				do_ec_reconstruct;
+	u8				crc_retry_nr;
 	struct bch_extent_crc_unpacked	crc;
 	struct bch_extent_ptr		ptr;
 	struct bch_extent_stripe_ptr	ec;
 };
 
-#define BCH_MAX_CSUM_RETRIES		3
-
 struct bch_io_failures {
 	u8			nr;
 	struct bch_dev_io_failures {
 		u8		dev;
-		unsigned	failed_csum_nr:4,
+		unsigned	failed_csum_nr:6,
 				failed_io:1,
 				failed_ec:1;
 	}			devs[BCH_REPLICAS_MAX + 1];
diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c
index 881b3051..5ab1c73c 100644
--- a/libbcachefs/fs-io-buffered.c
+++ b/libbcachefs/fs-io-buffered.c
@@ -117,6 +117,9 @@ static int readpage_bio_extend(struct btree_trans *trans,
 
 			unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS);
 
+			/* ensure proper alignment */
+			order = min(order, __ffs(folio_offset|BIT(31)));
+
 			folio = xa_load(&iter->mapping->i_pages, folio_offset);
 			if (folio && !xa_is_value(folio))
 				break;
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 459ca825..17ac9c55 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -2026,44 +2026,6 @@ static struct bch_fs *bch2_path_to_fs(const char *path)
 	return c ?: ERR_PTR(-ENOENT);
 }
 
-static int bch2_remount(struct super_block *sb, int *flags,
-			struct bch_opts opts)
-{
-	struct bch_fs *c = sb->s_fs_info;
-	int ret = 0;
-
-	opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
-
-	if (opts.read_only != c->opts.read_only) {
-		down_write(&c->state_lock);
-
-		if (opts.read_only) {
-			bch2_fs_read_only(c);
-
-			sb->s_flags |= SB_RDONLY;
-		} else {
-			ret = bch2_fs_read_write(c);
-			if (ret) {
-				bch_err(c, "error going rw: %i", ret);
-				up_write(&c->state_lock);
-				ret = -EINVAL;
-				goto err;
-			}
-
-			sb->s_flags &= ~SB_RDONLY;
-		}
-
-		c->opts.read_only = opts.read_only;
-
-		up_write(&c->state_lock);
-	}
-
-	if (opt_defined(opts, errors))
-		c->opts.errors = opts.errors;
-err:
-	return bch2_err_class(ret);
-}
-
 static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
 {
 	struct bch_fs *c = root->d_sb->s_fs_info;
@@ -2374,8 +2336,39 @@ static int bch2_fs_reconfigure(struct fs_context *fc)
 {
 	struct super_block *sb = fc->root->d_sb;
 	struct bch2_opts_parse *opts = fc->fs_private;
+	struct bch_fs *c = sb->s_fs_info;
+	int ret = 0;
 
-	return bch2_remount(sb, &fc->sb_flags, opts->opts);
+	opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
+
+	if (opts->opts.read_only != c->opts.read_only) {
+		down_write(&c->state_lock);
+
+		if (opts->opts.read_only) {
+			bch2_fs_read_only(c);
+
+			sb->s_flags |= SB_RDONLY;
+		} else {
+			ret = bch2_fs_read_write(c);
+			if (ret) {
+				bch_err(c, "error going rw: %i", ret);
+				up_write(&c->state_lock);
+				ret = -EINVAL;
+				goto err;
+			}
+
+			sb->s_flags &= ~SB_RDONLY;
+		}
+
+		c->opts.read_only = opts->opts.read_only;
+
+		up_write(&c->state_lock);
+	}
+
+	if (opt_defined(opts->opts, errors))
+		c->opts.errors = opts->opts.errors;
+err:
+	return bch2_err_class(ret);
 }
 
 static const struct fs_context_operations bch2_context_ops = {
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 04ec0520..7aca010e 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -868,19 +868,6 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
 			     uid, gid, mode, rdev, parent);
 }
 
-static inline u32 bkey_generation(struct bkey_s_c k)
-{
-	switch (k.k->type) {
-	case KEY_TYPE_inode:
-	case KEY_TYPE_inode_v2:
-		BUG();
-	case KEY_TYPE_inode_generation:
-		return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
-	default:
-		return 0;
-	}
-}
-
 static struct bkey_i_inode_alloc_cursor *
 bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max)
 {
@@ -1198,6 +1185,7 @@ void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
 		opts->_name##_from_inode = true;			\
 	} else {							\
 		opts->_name = c->opts._name;				\
+		opts->_name##_from_inode = false;			\
 	}
 	BCH_INODE_OPTS()
 #undef x
diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c
index 652dbc58..4fb279f1 100644
--- a/libbcachefs/io_read.c
+++ b/libbcachefs/io_read.c
@@ -25,8 +25,15 @@
 #include "subvolume.h"
 #include "trace.h"
 
+#include <linux/random.h>
 #include <linux/sched/mm.h>
 
+#ifdef CONFIG_BCACHEFS_DEBUG
+static unsigned bch2_read_corrupt_ratio;
+module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
+MODULE_PARM_DESC(read_corrupt_ratio, "");
+#endif
+
 #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
 
 static bool bch2_target_congested(struct bch_fs *c, u16 target)
@@ -59,7 +66,7 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target)
 	}
 	rcu_read_unlock();
 
-	return bch2_rand_range(nr * CONGESTED_MAX) < total;
+	return get_random_u32_below(nr * CONGESTED_MAX) < total;
 }
 
 #else
@@ -97,14 +104,21 @@ static inline bool have_io_error(struct bch_io_failures *failed)
 	return failed && failed->nr;
 }
 
-static bool ptr_being_rewritten(struct bch_read_bio *orig,
-				unsigned dev,
-				unsigned flags)
+static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
 {
-	if (!(flags & BCH_READ_data_update))
+	EBUG_ON(rbio->split);
+
+	return rbio->data_update
+		? container_of(rbio, struct data_update, rbio)
+		: NULL;
+}
+
+static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
+{
+	struct data_update *u = rbio_data_update(orig);
+	if (!u)
 		return false;
 
-	struct data_update *u = container_of(orig, struct data_update, rbio);
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
 	unsigned i = 0;
 	bkey_for_each_ptr(ptrs, ptr) {
@@ -193,7 +207,6 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
 					    struct bpos pos,
 					    struct extent_ptr_decoded *pick,
 					    unsigned sectors,
-					    unsigned flags,
 					    struct bch_read_bio *orig,
 					    struct bch_io_failures *failed)
 {
@@ -214,7 +227,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
 		unsigned ptr_bit = 1;
 		bkey_for_each_ptr(ptrs, ptr) {
 			if (bch2_dev_io_failures(failed, ptr->dev) &&
-			    !ptr_being_rewritten(orig, ptr->dev, flags))
+			    !ptr_being_rewritten(orig, ptr->dev))
 				update_opts.rewrite_ptrs |= ptr_bit;
 			ptr_bit <<= 1;
 		}
@@ -308,7 +321,7 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
 				k.k->type == KEY_TYPE_reflink_v
 				? BTREE_ID_reflink
 				: BTREE_ID_extents,
-				k, pos, pick, sectors, flags, orig, failed);
+				k, pos, pick, sectors, orig, failed);
 	if (!promote)
 		return NULL;
 
@@ -336,7 +349,7 @@ static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *o
 	if (ret)
 		return ret;
 
-	if (rbio->flags & BCH_READ_data_update)
+	if (rbio->data_update)
 		prt_str(out, "(internal move) ");
 
 	return 0;
@@ -416,83 +429,6 @@ static void bch2_rbio_done(struct bch_read_bio *rbio)
 	bio_endio(&rbio->bio);
 }
 
-static struct bkey_s_c get_rbio_extent(struct btree_trans *trans,
-				       struct bch_read_bio *rbio,
-				       struct btree_iter *iter)
-{
-	if (rbio->flags & BCH_READ_data_update) {
-		struct data_update *u = container_of(rbio, struct data_update, rbio);
-
-		return bch2_bkey_get_iter(trans, iter,
-					  u->btree_id, bkey_start_pos(&u->k.k->k), 0);
-	} else {
-		struct bpos pos = rbio->read_pos;
-		int ret = bch2_subvolume_get_snapshot(trans, rbio->subvol, &pos.snapshot);
-		if (ret)
-			return bkey_s_c_err(ret);
-
-		return bch2_bkey_get_iter(trans, iter,
-					  BTREE_ID_extents, pos, 0);
-	}
-}
-
-static void mark_io_failure_if_current_extent_matches(struct btree_trans *trans,
-						      struct bch_read_bio *rbio,
-						      struct bch_io_failures *failed)
-{
-	struct btree_iter iter = {};
-	struct bkey_s_c k;
-	int ret = lockrestart_do(trans,
-				 bkey_err(k = get_rbio_extent(trans, rbio, &iter)));
-
-	if (!ret) {
-		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
-		bkey_for_each_ptr(ptrs, ptr)
-			if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr))
-				bch2_mark_io_failure(failed, &rbio->pick,
-					rbio->ret == -BCH_ERR_data_read_csum_err);
-	}
-
-	bch2_trans_iter_exit(trans, &iter);
-}
-
-static noinline int maybe_poison_extent(struct btree_trans *trans, struct btree_iter *iter,
-					struct bkey_s_c k, struct bch_io_failures *failed)
-{
-	u64 flags = bch2_bkey_extent_flags(k);
-	if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
-		return 0;
-
-	struct bch_fs *c = trans->c;
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
-	/*
-	 * Make sure we actually attempt to read and got checksum failures from
-	 * every replica
-	 */
-
-	rcu_read_lock();
-	bkey_for_each_ptr(ptrs, ptr) {
-		struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
-		if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed)
-			continue;
-
-		struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, ptr->dev);
-		if (!f || f->failed_csum_nr != BCH_MAX_CSUM_RETRIES) {
-			rcu_read_unlock();
-			return 0;
-		}
-	}
-	rcu_read_unlock();
-
-	struct bkey_i *new = __bch2_bkey_make_mut(trans, iter, &k, 0, 0,
-				   bkey_bytes(k.k) + sizeof(struct bch_extent_flags));
-	return  PTR_ERR_OR_ZERO(new) ?:
-		bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?:
-		bch2_trans_commit(trans, NULL, NULL, 0);
-}
-
 static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
 					struct bch_read_bio *rbio,
 					struct bvec_iter bvec_iter,
@@ -530,9 +466,6 @@ err:
 		goto retry;
 
 	if (ret) {
-		if (ret == -BCH_ERR_no_device_to_read_from && failed)
-			maybe_poison_extent(trans, &iter, k, failed);
-
 		rbio->bio.bi_status	= BLK_STS_IOERR;
 		rbio->ret		= ret;
 	}
@@ -560,7 +493,8 @@ static void bch2_rbio_retry(struct work_struct *work)
 		     bvec_iter_sectors(rbio->bvec_iter));
 
 	if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
-		mark_io_failure_if_current_extent_matches(trans, rbio, &failed);
+		bch2_mark_io_failure(&failed, &rbio->pick,
+				     rbio->ret == -BCH_ERR_data_read_retry_csum_err);
 
 	if (!rbio->split) {
 		rbio->bio.bi_status	= 0;
@@ -577,7 +511,7 @@ static void bch2_rbio_retry(struct work_struct *work)
 	flags &= ~BCH_READ_last_fragment;
 	flags |= BCH_READ_must_clone;
 
-	int ret = flags & BCH_READ_data_update
+	int ret = rbio->data_update
 		? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
 		: __bch2_read(trans, rbio, iter, inum, &failed, flags);
 
@@ -591,7 +525,7 @@ static void bch2_rbio_retry(struct work_struct *work)
 			bch2_inum_offset_err_msg_trans(trans, &buf,
 					(subvol_inum) { subvol, read_pos.inode },
 					read_pos.offset << 9));
-		if (rbio->flags & BCH_READ_data_update)
+		if (rbio->data_update)
 			prt_str(&buf, "(internal move) ");
 		prt_str(&buf, "successful retry");
 
@@ -647,7 +581,7 @@ static void bch2_read_io_err(struct work_struct *work)
 		bch_err_ratelimited(c, "%s", buf.buf);
 
 	printbuf_exit(&buf);
-	bch2_rbio_error(rbio, -BCH_ERR_data_read_io_err, bio->bi_status);
+	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
 }
 
 static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
@@ -734,7 +668,7 @@ static void bch2_read_csum_err(struct work_struct *work)
 	else
 		bch_err_ratelimited(c, "%s", buf.buf);
 
-	bch2_rbio_error(rbio, -BCH_ERR_data_read_csum_err, BLK_STS_IOERR);
+	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
 	printbuf_exit(&buf);
 }
 
@@ -778,42 +712,6 @@ static void bch2_read_decrypt_err(struct work_struct *work)
 	printbuf_exit(&buf);
 }
 
-#ifdef CONFIG_BCACHEFS_DEBUG
-static unsigned bch2_read_corrupt_ratio;
-module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
-MODULE_PARM_DESC(read_corrupt_ratio, "");
-
-static void corrupt_bio(struct bio *bio)
-{
-	struct bvec_iter iter;
-	struct bio_vec bv;
-	unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64));
-
-	bio_for_each_segment(bv, bio, iter) {
-		unsigned u64s = bv.bv_len / sizeof(u64);
-
-		if (offset < u64s) {
-			u64 *segment = bvec_kmap_local(&bv);
-			segment[offset] = get_random_u64();
-			kunmap_local(segment);
-			return;
-		}
-		offset -= u64s;
-	}
-}
-
-static inline void maybe_corrupt_bio(struct bio *bio)
-{
-	if (bch2_read_corrupt_ratio &&
-	    !get_random_u32_below(bch2_read_corrupt_ratio))
-		corrupt_bio(bio);
-}
-#else
-static inline void maybe_corrupt_bio(struct bio *bio)
-{
-}
-#endif
-
 /* Inner part that may run in process context */
 static void __bch2_read_endio(struct work_struct *work)
 {
@@ -821,9 +719,10 @@ static void __bch2_read_endio(struct work_struct *work)
 		container_of(work, struct bch_read_bio, work);
 	struct bch_fs *c	= rbio->c;
 	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
-	struct bio *src		= &rbio->bio;
-	struct bio *dst		= &bch2_rbio_parent(rbio)->bio;
-	struct bvec_iter dst_iter = rbio->bvec_iter;
+	struct bch_read_bio *parent	= bch2_rbio_parent(rbio);
+	struct bio *src			= &rbio->bio;
+	struct bio *dst			= &parent->bio;
+	struct bvec_iter dst_iter	= rbio->bvec_iter;
 	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
 	struct nonce nonce = extent_nonce(rbio->version, crc);
 	unsigned nofs_flags;
@@ -841,7 +740,7 @@ static void __bch2_read_endio(struct work_struct *work)
 		src->bi_iter			= rbio->bvec_iter;
 	}
 
-	maybe_corrupt_bio(src);
+	bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);
 
 	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
 	bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;
@@ -853,7 +752,7 @@ static void __bch2_read_endio(struct work_struct *work)
 	 */
 	if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
 		rbio->flags |= BCH_READ_must_bounce;
-		bch2_rbio_error(rbio, -BCH_ERR_data_read_csum_err_maybe_userspace,
+		bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
 				BLK_STS_IOERR);
 		goto out;
 	}
@@ -873,7 +772,7 @@ static void __bch2_read_endio(struct work_struct *work)
 	if (unlikely(rbio->narrow_crcs))
 		bch2_rbio_narrow_crcs(rbio);
 
-	if (likely(!(rbio->flags & BCH_READ_data_update))) {
+	if (likely(!parent->data_update)) {
 		/* Adjust crc to point to subset of data we want: */
 		crc.offset     += rbio->offset_into_extent;
 		crc.live_size	= bvec_iter_sectors(rbio->bvec_iter);
@@ -1043,6 +942,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
 	struct bch_read_bio *rbio = NULL;
 	bool bounce = false, read_full = false, narrow_crcs = false;
 	struct bpos data_pos = bkey_start_pos(k.k);
+	struct data_update *u = rbio_data_update(orig);
 	int ret = 0;
 
 	if (bkey_extent_is_inline_data(k.k)) {
@@ -1106,16 +1006,7 @@ retry_pick:
 		goto retry_pick;
 	}
 
-	/*
-	 * Unlock the iterator while the btree node's lock is still in
-	 * cache, before doing the IO:
-	 */
-	if (!(flags & BCH_READ_in_retry))
-		bch2_trans_unlock(trans);
-	else
-		bch2_trans_unlock_long(trans);
-
-	if (!(flags & BCH_READ_data_update)) {
+	if (likely(!u)) {
 		if (!(flags & BCH_READ_last_fragment) ||
 		    bio_flagged(&orig->bio, BIO_CHAIN))
 			flags |= BCH_READ_must_clone;
@@ -1138,12 +1029,10 @@ retry_pick:
 			bounce = true;
 		}
 	} else {
-		read_full = true;
 		/*
 		 * can happen if we retry, and the extent we were going to read
 		 * has been merged in the meantime:
 		 */
-		struct data_update *u = container_of(orig, struct data_update, rbio);
 		if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
 			if (ca)
 				percpu_ref_put(&ca->io_ref);
@@ -1152,6 +1041,7 @@ retry_pick:
 		}
 
 		iter.bi_size	= pick.crc.compressed_size << 9;
+		read_full = true;
 	}
 
 	if (orig->opts.promote_target || have_io_error(failed))
@@ -1242,10 +1132,14 @@ retry_pick:
 	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
 	rbio->bio.bi_end_io	= bch2_read_endio;
 
+	/* XXX: also nvme read recovery level */
+	if (unlikely(failed && bch2_dev_io_failures(failed, pick.ptr.dev)))
+		rbio->bio.bi_opf |= REQ_FUA;
+
 	if (rbio->bounce)
 		trace_and_count(c, io_read_bounce, &rbio->bio);
 
-	if (!(flags & BCH_READ_data_update))
+	if (!u)
 		this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
 	else
 		this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
@@ -1255,7 +1149,7 @@ retry_pick:
 	 * If it's being moved internally, we don't want to flag it as a cache
 	 * hit:
 	 */
-	if (ca && pick.ptr.cached && !(flags & BCH_READ_data_update))
+	if (ca && pick.ptr.cached && !u)
 		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
 			PTR_BUCKET_NR(ca, &pick.ptr), READ);
 
@@ -1264,6 +1158,15 @@ retry_pick:
 		trace_and_count(c, io_read_split, &orig->bio);
 	}
 
+	/*
+	 * Unlock the iterator while the btree node's lock is still in
+	 * cache, before doing the IO:
+	 */
+	if (!(flags & BCH_READ_in_retry))
+		bch2_trans_unlock(trans);
+	else
+		bch2_trans_unlock_long(trans);
+
 	if (likely(!rbio->pick.do_ec_reconstruct)) {
 		if (unlikely(!rbio->have_ioref)) {
 			struct printbuf buf = PRINTBUF;
@@ -1275,7 +1178,7 @@ retry_pick:
 			printbuf_exit(&buf);
 
 			bch2_rbio_error(rbio,
-					-BCH_ERR_data_read_device_offline,
+					-BCH_ERR_data_read_retry_device_offline,
 					BLK_STS_IOERR);
 			goto out;
 		}
@@ -1302,7 +1205,7 @@ retry_pick:
 	} else {
 		/* Attempting reconstruct read: */
 		if (bch2_ec_read_extent(trans, rbio, k)) {
-			bch2_rbio_error(rbio, -BCH_ERR_data_read_ec_reconstruct_err,
+			bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
 					BLK_STS_IOERR);
 			goto out;
 		}
@@ -1314,6 +1217,8 @@ out:
 	if (likely(!(flags & BCH_READ_in_retry))) {
 		return 0;
 	} else {
+		bch2_trans_unlock(trans);
+
 		int ret;
 
 		rbio->context = RBIO_CONTEXT_UNBOUND;
@@ -1324,7 +1229,7 @@ out:
 
 		if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
 			bch2_mark_io_failure(failed, &pick,
-					ret == -BCH_ERR_data_read_csum_err);
+					ret == -BCH_ERR_data_read_retry_csum_err);
 
 		return ret;
 	}
@@ -1341,11 +1246,11 @@ hole:
 	this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
 		     bvec_iter_sectors(iter));
 	/*
-	 * won't normally happen in the BCH_READ_data_update
-	 * (bch2_move_extent()) path, but if we retry and the extent we wanted
-	 * to read no longer exists we have to signal that:
+	 * won't normally happen in the data update (bch2_move_extent()) path,
+	 * but if we retry and the extent we wanted to read no longer exists we
+	 * have to signal that:
 	 */
-	if (flags & BCH_READ_data_update)
+	if (u)
 		orig->ret = -BCH_ERR_data_read_key_overwritten;
 
 	zero_fill_bio_iter(&orig->bio, iter);
@@ -1366,7 +1271,7 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
 	struct bkey_s_c k;
 	int ret;
 
-	BUG_ON(flags & BCH_READ_data_update);
+	EBUG_ON(rbio->data_update);
 
 	bch2_bkey_buf_init(&sk);
 	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
@@ -1393,23 +1298,6 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
 		if (ret)
 			goto err;
 
-		if (unlikely(flags & BCH_READ_in_retry)) {
-			struct data_update *u = flags & BCH_READ_data_update
-				? container_of(rbio, struct data_update, rbio)
-				: NULL;
-
-			if (u &&
-			    !bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
-				/* extent we wanted to read no longer exists: */
-				ret = -BCH_ERR_data_read_key_overwritten;
-				goto err;
-			}
-
-			if (!bkey_deleted(&sk.k->k) &&
-			    !bkey_and_val_eq(k, bkey_i_to_s_c(sk.k)))
-				failed->nr = 0;
-		}
-
 		s64 offset_into_extent = iter.pos.offset -
 			bkey_start_offset(k.k);
 		unsigned sectors = k.k->size - offset_into_extent;
@@ -1447,16 +1335,18 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
 		swap(bvec_iter.bi_size, bytes);
 		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
 err:
+		if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
+			flags |= BCH_READ_must_bounce;
+
 		if (ret &&
 		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
 		    !bch2_err_matches(ret, BCH_ERR_data_read_retry))
 			break;
 	}
 
-	if (unlikely(ret)) {
-		if (ret == -BCH_ERR_no_device_to_read_from && failed)
-			maybe_poison_extent(trans, &iter, k, failed);
+	bch2_trans_iter_exit(trans, &iter);
 
+	if (ret) {
 		struct printbuf buf = PRINTBUF;
 		lockrestart_do(trans,
 			bch2_inum_offset_err_msg_trans(trans, &buf, inum,
@@ -1472,7 +1362,6 @@ err:
 			bch2_rbio_done(rbio);
 	}
 
-	bch2_trans_iter_exit(trans, &iter);
 	bch2_bkey_buf_exit(&sk, c);
 	return ret;
 }
diff --git a/libbcachefs/io_read.h b/libbcachefs/io_read.h
index edcf50a4..cd219504 100644
--- a/libbcachefs/io_read.h
+++ b/libbcachefs/io_read.h
@@ -36,7 +36,8 @@ struct bch_read_bio {
 	u16			flags;
 	union {
 	struct {
-	u16			promote:1,
+	u16			data_update:1,
+				promote:1,
 				bounce:1,
 				split:1,
 				have_ioref:1,
@@ -109,7 +110,6 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans,
 	x(retry_if_stale)		\
 	x(may_promote)			\
 	x(user_mapped)			\
-	x(data_update)			\
 	x(last_fragment)		\
 	x(must_bounce)			\
 	x(must_clone)			\
@@ -163,12 +163,13 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio,
 {
 	struct bch_read_bio *rbio = to_rbio(bio);
 
-	rbio->c		= orig->c;
-	rbio->_state	= 0;
-	rbio->ret	= 0;
-	rbio->split	= true;
-	rbio->parent	= orig;
-	rbio->opts	= orig->opts;
+	rbio->c			= orig->c;
+	rbio->_state		= 0;
+	rbio->flags		= 0;
+	rbio->ret		= 0;
+	rbio->split		= true;
+	rbio->parent		= orig;
+	rbio->opts		= orig->opts;
 	return rbio;
 }
 
@@ -182,7 +183,8 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio,
 	rbio->start_time	= local_clock();
 	rbio->c			= c;
 	rbio->_state		= 0;
-	rbio->ret	= 0;
+	rbio->flags		= 0;
+	rbio->ret		= 0;
 	rbio->opts		= opts;
 	rbio->bio.bi_end_io	= end_io;
 	return rbio;
diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c
index dbfcb28f..a2e6b305 100644
--- a/libbcachefs/io_write.c
+++ b/libbcachefs/io_write.c
@@ -34,6 +34,12 @@
 #include <linux/random.h>
 #include <linux/sched/mm.h>
 
+#ifdef CONFIG_BCACHEFS_DEBUG
+static unsigned bch2_write_corrupt_ratio;
+module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644);
+MODULE_PARM_DESC(write_corrupt_ratio, "");
+#endif
+
 #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
 
 static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
@@ -1005,6 +1011,15 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
 		bounce = true;
 	}
 
+#ifdef CONFIG_BCACHEFS_DEBUG
+	unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio);
+	if (!bounce && write_corrupt_ratio) {
+		dst = bch2_write_bio_alloc(c, wp, src,
+					   &page_alloc_failed,
+					   ec_buf);
+		bounce = true;
+	}
+#endif
 	saved_iter = dst->bi_iter;
 
 	do {
@@ -1114,6 +1129,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
 
 		init_append_extent(op, wp, version, crc);
 
+#ifdef CONFIG_BCACHEFS_DEBUG
+		if (write_corrupt_ratio) {
+			swap(dst->bi_iter.bi_size, dst_len);
+			bch2_maybe_corrupt_bio(dst, write_corrupt_ratio);
+			swap(dst->bi_iter.bi_size, dst_len);
+		}
+#endif
+
 		if (dst != src)
 			bio_advance(dst, dst_len);
 		bio_advance(src, src_len);
@@ -1394,6 +1417,7 @@ retry:
 		bio->bi_private	= &op->cl;
 		bio->bi_opf |= REQ_OP_WRITE;
 		closure_get(&op->cl);
+
 		bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
 					  op->insert_keys.top, true);
 
@@ -1718,20 +1742,26 @@ static const char * const bch2_write_flags[] = {
 
 void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
 {
-	prt_str(out, "pos: ");
+	if (!out->nr_tabstops)
+		printbuf_tabstop_push(out, 32);
+
+	prt_printf(out, "pos:\t");
 	bch2_bpos_to_text(out, op->pos);
 	prt_newline(out);
 	printbuf_indent_add(out, 2);
 
-	prt_str(out, "started: ");
+	prt_printf(out, "started:\t");
 	bch2_pr_time_units(out, local_clock() - op->start_time);
 	prt_newline(out);
 
-	prt_str(out, "flags: ");
+	prt_printf(out, "flags:\t");
 	prt_bitflags(out, bch2_write_flags, op->flags);
 	prt_newline(out);
 
-	prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl));
+	prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas);
+	prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required);
+
+	prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl));
 
 	printbuf_indent_sub(out, 2);
 }
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 331c9d76..cf2700b0 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -1609,11 +1609,6 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
 	kvfree(new_buf);
 }
 
-static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
-{
-	return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
-}
-
 static CLOSURE_CALLBACK(journal_write_done)
 {
 	closure_type(w, struct journal_buf, io);
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index a3096e2a..55e17c2d 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -101,13 +101,25 @@ static void move_free(struct moving_io *io)
 static void move_write_done(struct bch_write_op *op)
 {
 	struct moving_io *io = container_of(op, struct moving_io, write.op);
+	struct bch_fs *c = op->c;
 	struct moving_context *ctxt = io->write.ctxt;
 
-	if (io->write.op.error)
-		ctxt->write_error = true;
+	if (op->error) {
+		if (trace_io_move_write_fail_enabled()) {
+			struct printbuf buf = PRINTBUF;
 
-	atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
-	atomic_dec(&io->write.ctxt->write_ios);
+			bch2_write_op_to_text(&buf, op);
+			prt_printf(&buf, "ret\t%s\n", bch2_err_str(op->error));
+			trace_io_move_write_fail(c, buf.buf);
+			printbuf_exit(&buf);
+		}
+		this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]);
+
+		ctxt->write_error = true;
+	}
+
+	atomic_sub(io->write_sectors, &ctxt->write_sectors);
+	atomic_dec(&ctxt->write_ios);
 	move_free(io);
 	closure_put(&ctxt->cl);
 }
@@ -359,7 +371,6 @@ int bch2_move_extent(struct moving_context *ctxt,
 			   bkey_start_pos(k.k),
 			   iter->btree_id, k, 0,
 			   NULL,
-			   BCH_READ_data_update|
 			   BCH_READ_last_fragment,
 			   data_opts.scrub ?  data_opts.read_dev : -1);
 	return 0;
@@ -580,7 +591,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
 		    k.k->type == KEY_TYPE_reflink_p &&
 		    REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) {
 			struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
-			s64 offset_into_extent	= iter.pos.offset - bkey_start_offset(k.k);
+			s64 offset_into_extent	= 0;
 
 			bch2_trans_iter_exit(trans, &reflink_iter);
 			k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0);
@@ -599,6 +610,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
 			 * pointer - need to fixup iter->k
 			 */
 			extent_iter = &reflink_iter;
+			offset_into_extent = 0;
 		}
 
 		if (!bkey_extent_is_direct_data(k.k))
@@ -712,7 +724,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
 	struct btree_iter iter = {}, bp_iter = {};
 	struct bkey_buf sk;
 	struct bkey_s_c k;
-	unsigned sectors_moved = 0;
 	struct bkey_buf last_flushed;
 	int ret = 0;
 
@@ -834,7 +845,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
 
 		if (ctxt->stats)
 			atomic64_add(sectors, &ctxt->stats->sectors_seen);
-		sectors_moved += sectors;
 next:
 		bch2_btree_iter_advance(&bp_iter);
 	}
@@ -1253,17 +1263,17 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
 	prt_newline(out);
 	printbuf_indent_add(out, 2);
 
-	prt_printf(out, "keys moved:  %llu\n",	atomic64_read(&stats->keys_moved));
-	prt_printf(out, "keys raced:  %llu\n",	atomic64_read(&stats->keys_raced));
-	prt_printf(out, "bytes seen:  ");
+	prt_printf(out, "keys moved:\t%llu\n",	atomic64_read(&stats->keys_moved));
+	prt_printf(out, "keys raced:\t%llu\n",	atomic64_read(&stats->keys_raced));
+	prt_printf(out, "bytes seen:\t");
 	prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
 	prt_newline(out);
 
-	prt_printf(out, "bytes moved: ");
+	prt_printf(out, "bytes moved:\t");
 	prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
 	prt_newline(out);
 
-	prt_printf(out, "bytes raced: ");
+	prt_printf(out, "bytes raced:\t");
 	prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
 	prt_newline(out);
 
@@ -1272,7 +1282,8 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
 
 static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
 {
-	struct moving_io *io;
+	if (!out->nr_tabstops)
+		printbuf_tabstop_push(out, 32);
 
 	bch2_move_stats_to_text(out, ctxt->stats);
 	printbuf_indent_add(out, 2);
@@ -1292,6 +1303,7 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str
 	printbuf_indent_add(out, 2);
 
 	mutex_lock(&ctxt->lock);
+	struct moving_io *io;
 	list_for_each_entry(io, &ctxt->ios, io_list)
 		bch2_data_update_inflight_to_text(out, &io->write);
 	mutex_unlock(&ctxt->lock);
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index fa19fc44..5126c870 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -317,6 +317,17 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
 	prt_printf(out, "Currently calculated wait:\t");
 	prt_human_readable_u64(out, bch2_copygc_wait_amount(c));
 	prt_newline(out);
+
+	rcu_read_lock();
+	struct task_struct *t = rcu_dereference(c->copygc_thread);
+	if (t)
+		get_task_struct(t);
+	rcu_read_unlock();
+
+	if (t) {
+		bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
+		put_task_struct(t);
+	}
 }
 
 static int bch2_copygc_thread(void *arg)
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index afb89d31..baa9c11a 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -186,6 +186,11 @@ enum fsck_err_opts {
 	  OPT_STR(__bch2_csum_opts),					\
 	  BCH_SB_DATA_CSUM_TYPE,	BCH_CSUM_OPT_crc32c,		\
 	  NULL,		NULL)						\
+	x(checksum_err_retry_nr,	u8,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,			\
+	  OPT_UINT(0, 32),						\
+	  BCH_SB_CSUM_ERR_RETRY_NR,	3,				\
+	  NULL,		NULL)						\
 	x(compression,			u8,				\
 	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,		\
 	  OPT_FN(bch2_opt_compression),					\
diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c
index 58f6d97e..29a56938 100644
--- a/libbcachefs/rebalance.c
+++ b/libbcachefs/rebalance.c
@@ -26,9 +26,8 @@
 
 /* bch_extent_rebalance: */
 
-static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs)
 {
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 	const union bch_extent_entry *entry;
 
 	bkey_extent_entry_for_each(ptrs, entry)
@@ -38,6 +37,11 @@ static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s
 	return NULL;
 }
 
+static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+{
+	return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k));
+}
+
 static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
 					   struct bch_io_opts *opts,
 					   struct bkey_s_c k,
@@ -97,11 +101,12 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
 
 u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
 {
-	const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k);
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+	const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
 	if (!opts)
 		return 0;
 
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 	const union bch_extent_entry *entry;
 	struct extent_ptr_decoded p;
 	u64 sectors = 0;
@@ -590,8 +595,19 @@ static int bch2_rebalance_thread(void *arg)
 
 void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
 {
+	printbuf_tabstop_push(out, 32);
+
 	struct bch_fs_rebalance *r = &c->rebalance;
 
+	/* print pending work */
+	struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_rebalance_work, };
+	u64 v;
+	bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
+
+	prt_printf(out, "pending work:\t");
+	prt_human_readable_u64(out, v);
+	prt_printf(out, "\n\n");
+
 	prt_str(out, bch2_rebalance_state_strs[r->state]);
 	prt_newline(out);
 	printbuf_indent_add(out, 2);
@@ -600,15 +616,15 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
 	case BCH_REBALANCE_waiting: {
 		u64 now = atomic64_read(&c->io_clock[WRITE].now);
 
-		prt_str(out, "io wait duration:  ");
+		prt_printf(out, "io wait duration:\t");
 		bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
 		prt_newline(out);
 
-		prt_str(out, "io wait remaining: ");
+		prt_printf(out, "io wait remaining:\t");
 		bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
 		prt_newline(out);
 
-		prt_str(out, "duration waited:   ");
+		prt_printf(out, "duration waited:\t");
 		bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
 		prt_newline(out);
 		break;
@@ -621,6 +637,18 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
 		break;
 	}
 	prt_newline(out);
+
+	rcu_read_lock();
+	struct task_struct *t = rcu_dereference(c->rebalance.thread);
+	if (t)
+		get_task_struct(t);
+	rcu_read_unlock();
+
+	if (t) {
+		bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
+		put_task_struct(t);
+	}
+
 	printbuf_indent_sub(out, 2);
 }
 
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 71c786cd..a6e26733 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -899,7 +899,7 @@ use_clean:
 	 * journal sequence numbers:
 	 */
 	if (!c->sb.clean)
-		journal_seq += 8;
+		journal_seq += JOURNAL_BUF_NR * 4;
 
 	if (blacklist_seq != journal_seq) {
 		ret =   bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
diff --git a/libbcachefs/sb-counters_format.h b/libbcachefs/sb-counters_format.h
index c82a8910..fa27ec59 100644
--- a/libbcachefs/sb-counters_format.h
+++ b/libbcachefs/sb-counters_format.h
@@ -22,6 +22,7 @@ enum counters_flags {
 	x(io_move_write,				36,	TYPE_SECTORS)	\
 	x(io_move_finish,				37,	TYPE_SECTORS)	\
 	x(io_move_fail,					38,	TYPE_COUNTER)	\
+	x(io_move_write_fail,				82,	TYPE_COUNTER)	\
 	x(io_move_start_fail,				39,	TYPE_COUNTER)	\
 	x(bucket_invalidate,				3,	TYPE_COUNTER)	\
 	x(bucket_discard,				4,	TYPE_COUNTER)	\
diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h
index f645a454..575ad1e0 100644
--- a/libbcachefs/str_hash.h
+++ b/libbcachefs/str_hash.h
@@ -12,7 +12,6 @@
 #include "super.h"
 
 #include <linux/crc32c.h>
-#include <crypto/hash.h>
 #include <crypto/sha2.h>
 
 static inline enum bch_str_hash_type
@@ -55,13 +54,10 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
 	};
 
 	if (unlikely(info.type == BCH_STR_HASH_siphash_old)) {
-		SHASH_DESC_ON_STACK(desc, c->sha256);
 		u8 digest[SHA256_DIGEST_SIZE];
 
-		desc->tfm = c->sha256;
-
-		crypto_shash_digest(desc, (void *) &bi->bi_hash_seed,
-				    sizeof(bi->bi_hash_seed), digest);
+		sha256((const u8 *)&bi->bi_hash_seed,
+		       sizeof(bi->bi_hash_seed), digest);
 		memcpy(&info.siphash_key, digest, sizeof(info.siphash_key));
 	}
 
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index ee32d043..f2e44282 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -365,10 +365,9 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
 	return 0;
 }
 
-static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
-			    enum bch_validate_flags flags, struct printbuf *out)
+int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
+		     enum bch_validate_flags flags, struct printbuf *out)
 {
-	struct bch_sb *sb = disk_sb->sb;
 	struct bch_sb_field_members_v1 *mi;
 	enum bch_opt_id opt_id;
 	int ret;
@@ -377,15 +376,27 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
 	if (ret)
 		return ret;
 
-	if (sb->features[1] ||
-	    (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) {
-		prt_printf(out, "Filesystem has incompatible features");
+	u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR);
+	unsigned incompat_bit = 0;
+	if (incompat)
+		incompat_bit = __ffs64(incompat);
+	else if (sb->features[1])
+		incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1]));
+
+	if (incompat_bit) {
+		prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)",
+			   incompat_bit,
+			   bch2_sb_features[BCH_FEATURE_NR - 1],
+			   BCH_FEATURE_NR - 1);
 		return -BCH_ERR_invalid_sb_features;
 	}
 
 	if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) ||
 	    BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) {
-		prt_printf(out, "Filesystem has incompatible version");
+		prt_str(out, "Filesystem has incompatible version ");
+		bch2_version_to_text(out, le16_to_cpu(sb->version));
+		prt_str(out, ", current version ");
+		bch2_version_to_text(out, bcachefs_metadata_version_current);
 		return -BCH_ERR_invalid_sb_features;
 	}
 
@@ -399,6 +410,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
 		return -BCH_ERR_invalid_sb_uuid;
 	}
 
+	if (!(flags & BCH_VALIDATE_write) &&
+	    le64_to_cpu(sb->offset) != read_offset) {
+		prt_printf(out, "Bad sb offset (got %llu, read from %llu)",
+			   le64_to_cpu(sb->offset), read_offset);
+		return -BCH_ERR_invalid_sb_offset;
+	}
+
 	if (!sb->nr_devices ||
 	    sb->nr_devices > BCH_SB_MEMBERS_MAX) {
 		prt_printf(out, "Bad number of member devices %u (max %u)",
@@ -457,6 +475,10 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
 
 		if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb))
 			SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30);
+
+		if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags &&
+		    !BCH_SB_CSUM_ERR_RETRY_NR(sb))
+			SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3);
 	}
 
 #ifdef __KERNEL__
@@ -874,7 +896,7 @@ got_super:
 
 	sb->have_layout = true;
 
-	ret = bch2_sb_validate(sb, 0, &err);
+	ret = bch2_sb_validate(sb->sb, offset, 0, &err);
 	if (ret) {
 		bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
 				path, err.buf);
@@ -1031,7 +1053,7 @@ int bch2_write_super(struct bch_fs *c)
 	darray_for_each(online_devices, ca) {
 		printbuf_reset(&err);
 
-		ret = bch2_sb_validate(&(*ca)->disk_sb, BCH_VALIDATE_write, &err);
+		ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err);
 		if (ret) {
 			bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
 			goto out;
diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h
index 167dd98f..78f708a6 100644
--- a/libbcachefs/super-io.h
+++ b/libbcachefs/super-io.h
@@ -92,6 +92,8 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
 void bch2_free_super(struct bch_sb_handle *);
 int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
 
+int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *);
+
 int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
 int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *);
 int bch2_write_super(struct bch_fs *);
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index cffad3b6..8e928b3d 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -75,9 +75,6 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
 MODULE_DESCRIPTION("bcachefs filesystem");
-MODULE_SOFTDEP("pre: crc32c");
-MODULE_SOFTDEP("pre: crc64");
-MODULE_SOFTDEP("pre: sha256");
 MODULE_SOFTDEP("pre: chacha20");
 MODULE_SOFTDEP("pre: poly1305");
 MODULE_SOFTDEP("pre: xxhash");
@@ -1838,7 +1835,11 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 		goto err_late;
 
 	up_write(&c->state_lock);
-	return 0;
+out:
+	printbuf_exit(&label);
+	printbuf_exit(&errbuf);
+	bch_err_fn(c, ret);
+	return ret;
 
 err_unlock:
 	mutex_unlock(&c->sb_lock);
@@ -1847,10 +1848,7 @@ err:
 	if (ca)
 		bch2_dev_free(ca);
 	bch2_free_super(&sb);
-	printbuf_exit(&label);
-	printbuf_exit(&errbuf);
-	bch_err_fn(c, ret);
-	return ret;
+	goto out;
 err_late:
 	up_write(&c->state_lock);
 	ca = NULL;
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 2ed3f755..5b8463ae 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -148,6 +148,7 @@ write_attribute(trigger_btree_key_cache_shrink);
 write_attribute(trigger_freelist_wakeup);
 write_attribute(trigger_btree_updates);
 read_attribute(gc_gens_pos);
+write_attribute(read_fua_test);
 
 read_attribute(uuid);
 read_attribute(minor);
@@ -395,6 +396,79 @@ SHOW(bch2_fs)
 	return 0;
 }
 
+/*
+ * Ad-hoc benchmark, triggered from sysfs: times 1000 sequential reads,
+ * 1000 sequential REQ_FUA reads, and 1000 random reads against device 0,
+ * reporting the elapsed time of each pass via pr_info().
+ */
+static int read_fua_test(struct bch_fs *c)
+{
+	int ret = 0;
+	unsigned bs = 4096;
+	struct bio *bio = NULL;
+	void *buf = NULL;
+
+	struct bch_dev *ca = bch2_dev_get_ioref(c, 0, READ);
+	if (!ca)
+		return -EINVAL;
+
+	bio = bio_kmalloc(1, GFP_KERNEL);
+	if (!bio) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	buf = kmalloc(bs, GFP_KERNEL);
+	if (!buf) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	u64 start = ktime_get_ns();
+	for (unsigned i = 0; i < 1000; i++) {
+		bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, 1, READ);
+		bch2_bio_map(bio, buf, bs);
+		ret = submit_bio_wait(bio);
+		if (ret)
+			goto err;
+	}
+	u64 ns_nofua = ktime_get_ns() - start;
+
+	start = ktime_get_ns();
+	for (unsigned i = 0; i < 1000; i++) {
+		bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, 1, REQ_FUA|READ);
+		bch2_bio_map(bio, buf, bs);
+		ret = submit_bio_wait(bio);
+		if (ret)
+			goto err;
+	}
+	u64 ns_fua = ktime_get_ns() - start;
+
+	u64 dev_size = ca->mi.nbuckets * bucket_bytes(ca);
+
+	start = ktime_get_ns();
+	for (unsigned i = 0; i < 1000; i++) {
+		bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, 1, READ);
+		/* random block-aligned offset within the device: */
+		bio->bi_iter.bi_sector = (get_random_u64_below(dev_size) & ~((u64) bs - 1)) >> 9;
+		bch2_bio_map(bio, buf, bs);
+		ret = submit_bio_wait(bio);
+		if (ret)
+			goto err;
+	}
+	u64 ns_rand = ktime_get_ns() - start;
+
+	pr_info("ns  nofua %llu", ns_nofua);
+	pr_info("ns    fua %llu", ns_fua);
+	pr_info("ns random %llu", ns_rand);
+err:
+	kfree(buf);
+	kfree(bio);
+	percpu_ref_put(&ca->io_ref);
+	bch_err_fn(c, ret);
+	return ret;
+}
+
 STORE(bch2_fs)
 {
 	struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
@@ -451,6 +517,9 @@ STORE(bch2_fs)
 	if (attr == &sysfs_trigger_freelist_wakeup)
 		closure_wake_up(&c->freelist_wait);
 
+	if (attr == &sysfs_read_fua_test)
+		read_fua_test(c);
+
 #ifdef CONFIG_BCACHEFS_TESTS
 	if (attr == &sysfs_perf_test) {
 		char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
@@ -580,6 +649,7 @@ struct attribute *bch2_fs_internal_files[] = {
 	&sysfs_trigger_btree_key_cache_shrink,
 	&sysfs_trigger_freelist_wakeup,
 	&sysfs_trigger_btree_updates,
+	&sysfs_read_fua_test,
 
 	&sysfs_gc_gens_pos,
 
diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h
index c8669a6b..519d00d6 100644
--- a/libbcachefs/trace.h
+++ b/libbcachefs/trace.h
@@ -846,6 +846,11 @@ DEFINE_EVENT(fs_str, io_move_fail,
 	TP_ARGS(c, str)
 );
 
+DEFINE_EVENT(fs_str, io_move_write_fail,
+	TP_PROTO(struct bch_fs *c, const char *str),
+	TP_ARGS(c, str)
+);
+
 DEFINE_EVENT(fs_str, io_move_start_fail,
 	TP_PROTO(struct bch_fs *c, const char *str),
 	TP_ARGS(c, str)
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 50a90e48..bf555ae7 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -653,21 +653,6 @@ int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
 	return 0;
 }
 
-size_t bch2_rand_range(size_t max)
-{
-	size_t rand;
-
-	if (!max)
-		return 0;
-
-	do {
-		rand = get_random_long();
-		rand &= roundup_pow_of_two(max) - 1;
-	} while (rand >= max);
-
-	return rand;
-}
-
 void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src)
 {
 	struct bio_vec bv;
@@ -698,6 +683,27 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
 	}
 }
 
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_corrupt_bio(struct bio *bio) /* fault injection: overwrite one randomly chosen u64 of @bio's data */
+{
+	struct bvec_iter iter;
+	struct bio_vec bv;
+	unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64)); /* u64 index across the whole bio; assumes bi_size >= sizeof(u64) — TODO confirm callers */
+
+	bio_for_each_segment(bv, bio, iter) {
+		unsigned u64s = bv.bv_len / sizeof(u64);
+
+		if (offset < u64s) { /* target u64 lands in this segment */
+			u64 *segment = bvec_kmap_local(&bv);
+			segment[offset] = get_random_u64();
+			kunmap_local(segment);
+			return;
+		}
+		offset -= u64s; /* not in this segment; skip its u64s */
+	}
+}
+#endif
+
 #if 0
 void eytzinger1_test(void)
 {
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index e7c3541b..f0e360eb 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -401,11 +401,21 @@ do {									\
 	_ret;								\
 })
 
-size_t bch2_rand_range(size_t);
-
 void memcpy_to_bio(struct bio *, struct bvec_iter, const void *);
 void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
 
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_corrupt_bio(struct bio *);
+
+static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio)
+{
+	if (ratio && !get_random_u32_below(ratio)) /* corrupt with probability 1/ratio; ratio == 0 disables injection */
+		bch2_corrupt_bio(bio);
+}
+#else
+#define bch2_maybe_corrupt_bio(...)	do {} while (0)
+#endif
+
 static inline void memcpy_u64s_small(void *dst, const void *src,
 				     unsigned u64s)
 {