diff --git a/.bcachefs_revision b/.bcachefs_revision
index 57c74ea2..0c7b8559 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-e7f62157681d96386dc500609149b9685358a2b0
+f9c612bbf82da87d7d4a005310c5213db00e22de
diff --git a/Makefile.compiler b/Makefile.compiler
index 7aa1fbc4..8fcb4274 100644
--- a/Makefile.compiler
+++ b/Makefile.compiler
@@ -32,13 +32,13 @@ try-run = $(shell set -e;		\
 # Usage: aflags-y += $(call as-option,-Wa$(comma)-isa=foo,)
 
 as-option = $(call try-run,\
-	$(CC) -Werror $(KBUILD_AFLAGS) $(1) -c -x assembler-with-cpp /dev/null -o "$$TMP",$(1),$(2))
+	$(CC) -Werror $(KBUILD_CPPFLAGS) $(KBUILD_AFLAGS) $(1) -c -x assembler-with-cpp /dev/null -o "$$TMP",$(1),$(2))
 
 # as-instr
 # Usage: aflags-y += $(call as-instr,instr,option1,option2)
 
 as-instr = $(call try-run,\
-	printf "%b\n" "$(1)" | $(CC) -Werror $(KBUILD_AFLAGS) -c -x assembler-with-cpp -o "$$TMP" -,$(2),$(3))
+	printf "%b\n" "$(1)" | $(CC) -Werror $(CLANG_FLAGS) $(KBUILD_AFLAGS) -c -x assembler-with-cpp -o "$$TMP" -,$(2),$(3))
 
 # __cc-option
 # Usage: MY_CFLAGS += $(call __cc-option,$(CC),$(MY_CFLAGS),-march=winchip-c6,-march=i586)
@@ -72,7 +72,3 @@ clang-min-version = $(call test-ge, $(CONFIG_CLANG_VERSION), $1)
 # ld-option
 # Usage: KBUILD_LDFLAGS += $(call ld-option, -X, -Y)
 ld-option = $(call try-run, $(LD) $(KBUILD_LDFLAGS) $(1) -v,$(1),$(2),$(3))
-
-# ld-ifversion
-# Usage:  $(call ld-ifversion, -ge, 22252, y)
-ld-ifversion = $(shell [ $(CONFIG_LD_VERSION)0 $(1) $(2)0 ] && echo $(3) || echo $(4))
diff --git a/cmd_dump.c b/cmd_dump.c
index bf570dc6..0d349233 100644
--- a/cmd_dump.c
+++ b/cmd_dump.c
@@ -61,13 +61,11 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd,
 	for (i = 0; i < BTREE_ID_NR; i++) {
 		const struct bch_extent_ptr *ptr;
 		struct bkey_ptrs_c ptrs;
-		struct btree_trans trans;
+		struct btree_trans *trans = bch2_trans_get(c);
 		struct btree_iter iter;
 		struct btree *b;
 
-		bch2_trans_init(&trans, c, 0, 0);
-
-		__for_each_btree_node(&trans, iter, i, POS_MIN, 0, 1, 0, b, ret) {
+		__for_each_btree_node(trans, iter, i, POS_MIN, 0, 1, 0, b, ret) {
 			struct btree_node_iter iter;
 			struct bkey u;
 			struct bkey_s_c k;
@@ -97,8 +95,8 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd,
 						  btree_bytes(c));
 		}
 
-		bch2_trans_iter_exit(&trans, &iter);
-		bch2_trans_exit(&trans);
+		bch2_trans_iter_exit(trans, &iter);
+		bch2_trans_put(trans);
 	}
 
 	qcow2_write_image(ca->disk_sb.bdev->bd_buffered_fd, fd, &data,
diff --git a/cmd_kill_btree_node.c b/cmd_kill_btree_node.c
index e9b8265d..83389bc4 100644
--- a/cmd_kill_btree_node.c
+++ b/cmd_kill_btree_node.c
@@ -64,7 +64,7 @@ int cmd_kill_btree_node(int argc, char *argv[])
 	if (IS_ERR(c))
 		die("error opening %s: %s", argv[0], bch2_err_str(PTR_ERR(c)));
 
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct btree *b;
 	int ret;
@@ -74,9 +74,7 @@ int cmd_kill_btree_node(int argc, char *argv[])
 	if (ret)
 		die("error %s from posix_memalign", bch2_err_str(ret));
 
-	bch2_trans_init(&trans, c, 0, 0);
-
-	__for_each_btree_node(&trans, iter, btree_id, POS_MIN, 0, level, 0, b, ret) {
+	__for_each_btree_node(trans, iter, btree_id, POS_MIN, 0, level, 0, b, ret) {
 		if (b->c.level != level)
 			continue;
 
@@ -113,8 +111,8 @@ int cmd_kill_btree_node(int argc, char *argv[])
 		bch_err(c, "node at specified index not found");
 	ret = EXIT_FAILURE;
 done:
-	bch2_trans_iter_exit(&trans, &iter);
-	bch2_trans_exit(&trans);
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
 
 	bch2_fs_stop(c);
 	return ret;
diff --git a/cmd_migrate.c b/cmd_migrate.c
index 3958ba6b..85ab96c0 100644
--- a/cmd_migrate.c
+++ b/cmd_migrate.c
@@ -33,7 +33,7 @@
 #include "libbcachefs/errcode.h"
 #include "libbcachefs/fs-common.h"
 #include "libbcachefs/inode.h"
-#include "libbcachefs/io.h"
+#include "libbcachefs/io_write.h"
 #include "libbcachefs/replicas.h"
 #include "libbcachefs/str_hash.h"
 #include "libbcachefs/super.h"
@@ -126,7 +126,7 @@ static void update_inode(struct bch_fs *c,
 	bch2_inode_pack(&packed, inode);
 	packed.inode.k.p.snapshot = U32_MAX;
 	ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
-				NULL, NULL, 0);
+				NULL, 0);
 	if (ret)
 		die("error updating inode: %s", bch2_err_str(ret));
 }
@@ -140,7 +140,7 @@ static void create_link(struct bch_fs *c,
 	struct bch_inode_unpacked inode;
 
 	int ret = bch2_trans_do(c, NULL, NULL, 0,
-		bch2_link_trans(&trans,
+		bch2_link_trans(trans,
 				(subvol_inum) { 1, parent->bi_inum }, &parent_u,
 				(subvol_inum) { 1, inum }, &inode, &qstr));
 	if (ret)
@@ -159,7 +159,7 @@ static struct bch_inode_unpacked create_file(struct bch_fs *c,
 	bch2_inode_init_early(c, &new_inode);
 
 	int ret = bch2_trans_do(c, NULL, NULL, 0,
-		bch2_create_trans(&trans,
+		bch2_create_trans(trans,
 				  (subvol_inum) { 1, parent->bi_inum }, parent,
 				  &new_inode, &qstr,
 				  uid, gid, mode, rdev, NULL, NULL,
@@ -232,7 +232,7 @@ static void copy_xattrs(struct bch_fs *c, struct bch_inode_unpacked *dst,
 		struct bch_inode_unpacked inode_u;
 
 		int ret = bch2_trans_do(c, NULL, NULL, 0,
-				bch2_xattr_set(&trans,
+				bch2_xattr_set(trans,
 					       (subvol_inum) { 1, dst->bi_inum },
 					       &inode_u, &hash_info, attr,
 					       val, val_size, h->flags, 0));
@@ -339,8 +339,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
 			die("error reserving space in new filesystem: %s",
 			    bch2_err_str(ret));
 
-		ret = bch2_btree_insert(c, BTREE_ID_extents, &e->k_i,
-					&res, NULL, 0);
+		ret = bch2_btree_insert(c, BTREE_ID_extents, &e->k_i, &res, 0);
 		if (ret)
 			die("btree insert error %s", bch2_err_str(ret));
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7d378ab2..39143117 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -6,6 +6,8 @@
 #include <linux/kobject.h>
 #include <linux/types.h>
 
+#define MAX_LFS_FILESIZE 	((loff_t)LLONG_MAX)
+
 #define BIO_MAX_VECS	256U
 
 typedef unsigned fmode_t;
@@ -21,30 +23,20 @@ struct user_namespace;
 #define MINOR(dev)	((unsigned int) ((dev) & MINORMASK))
 #define MKDEV(ma,mi)	(((ma) << MINORBITS) | (mi))
 
-/* file is open for reading */
-#define FMODE_READ		((__force fmode_t)0x1)
-/* file is open for writing */
-#define FMODE_WRITE		((__force fmode_t)0x2)
-/* file is seekable */
-#define FMODE_LSEEK		((__force fmode_t)0x4)
-/* file can be accessed using pread */
-#define FMODE_PREAD		((__force fmode_t)0x8)
-/* file can be accessed using pwrite */
-#define FMODE_PWRITE		((__force fmode_t)0x10)
-/* File is opened for execution with sys_execve / sys_uselib */
-#define FMODE_EXEC		((__force fmode_t)0x20)
-/* File is opened with O_NDELAY (only set for block devices) */
-#define FMODE_NDELAY		((__force fmode_t)0x40)
-/* File is opened with O_EXCL (only set for block devices) */
-#define FMODE_EXCL		((__force fmode_t)0x80)
-/* File is opened using open(.., 3, ..) and is writeable only for ioctls
-   (specialy hack for floppy.c) */
-#define FMODE_WRITE_IOCTL	((__force fmode_t)0x100)
-/* 32bit hashes as llseek() offset (for directories) */
-#define FMODE_32BITHASH         ((__force fmode_t)0x200)
-/* 64bit hashes as llseek() offset (for directories) */
-#define FMODE_64BITHASH         ((__force fmode_t)0x400)
-#define FMODE_BUFFERED		((__force fmode_t)0x800)
+typedef unsigned int __bitwise blk_mode_t;
+
+/* open for reading */
+#define BLK_OPEN_READ		((__force blk_mode_t)(1 << 0))
+/* open for writing */
+#define BLK_OPEN_WRITE		((__force blk_mode_t)(1 << 1))
+/* open exclusively (vs other exclusive openers) */
+#define BLK_OPEN_EXCL		((__force blk_mode_t)(1 << 2))
+/* opened with O_NDELAY */
+#define BLK_OPEN_NDELAY		((__force blk_mode_t)(1 << 3))
+/* open for "writes" only for ioctls (specialy hack for floppy.c) */
+#define BLK_OPEN_WRITE_IOCTL	((__force blk_mode_t)(1 << 4))
+
+#define BLK_OPEN_BUFFERED	((__force blk_mode_t)(1 << 5))
 
 struct inode {
 	unsigned long		i_ino;
@@ -93,9 +85,14 @@ int blkdev_issue_zeroout(struct block_device *, sector_t, sector_t, gfp_t, unsig
 unsigned bdev_logical_block_size(struct block_device *bdev);
 sector_t get_capacity(struct gendisk *disk);
 
-void blkdev_put(struct block_device *bdev, fmode_t mode);
+struct blk_holder_ops {
+        void (*mark_dead)(struct block_device *bdev);
+};
+
+void blkdev_put(struct block_device *bdev, void *holder);
 void bdput(struct block_device *bdev);
-struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, void *holder);
+struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
+					void *holder, const struct blk_holder_ops *hop);
 int lookup_bdev(const char *path, dev_t *);
 
 struct super_block {
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 39df1f16..b9486dbe 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -65,6 +65,7 @@
 #define unreachable()		__builtin_unreachable()
 #define __same_type(a, b)	__builtin_types_compatible_p(typeof(a), typeof(b))
 #define fallthrough		__attribute__((__fallthrough__))
+#define __noreturn		__attribute__((__noreturn__))
 
 #define ___PASTE(a,b) a##b
 #define __PASTE(a,b) ___PASTE(a,b)
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index ef032531..ec5f478f 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -12,7 +12,7 @@
 #define rcu_access_pointer(p)		READ_ONCE(p)
 
 #define kfree_rcu(ptr, rcu_head)	kfree(ptr) /* XXX */
-#define kvfree_rcu(ptr)			kfree(ptr) /* XXX */
+#define kvfree_rcu_mightsleep(ptr)	kfree(ptr) /* XXX */
 
 #define RCU_INIT_POINTER(p, v)		WRITE_ONCE(p, v)
 
diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c
index b1a48886..f3809897 100644
--- a/libbcachefs/acl.c
+++ b/libbcachefs/acl.c
@@ -1,18 +1,71 @@
 // SPDX-License-Identifier: GPL-2.0
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
 
 #include "bcachefs.h"
 
-#include <linux/fs.h>
+#include "acl.h"
+#include "xattr.h"
+
 #include <linux/posix_acl.h>
+
+static const char * const acl_types[] = {
+	[ACL_USER_OBJ]	= "user_obj",
+	[ACL_USER]	= "user",
+	[ACL_GROUP_OBJ]	= "group_obj",
+	[ACL_GROUP]	= "group",
+	[ACL_MASK]	= "mask",
+	[ACL_OTHER]	= "other",
+	NULL,
+};
+
+void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size)
+{
+	const void *p, *end = value + size;
+
+	if (!value ||
+	    size < sizeof(bch_acl_header) ||
+	    ((bch_acl_header *)value)->a_version != cpu_to_le32(BCH_ACL_VERSION))
+		return;
+
+	p = value + sizeof(bch_acl_header);
+	while (p < end) {
+		const bch_acl_entry *in = p;
+		unsigned tag = le16_to_cpu(in->e_tag);
+
+		prt_str(out, acl_types[tag]);
+
+		switch (tag) {
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			p += sizeof(bch_acl_entry_short);
+			break;
+		case ACL_USER:
+			prt_printf(out, " uid %u", le32_to_cpu(in->e_id));
+			p += sizeof(bch_acl_entry);
+			break;
+		case ACL_GROUP:
+			prt_printf(out, " gid %u", le32_to_cpu(in->e_id));
+			p += sizeof(bch_acl_entry);
+			break;
+		}
+
+		prt_printf(out, " %o", le16_to_cpu(in->e_perm));
+
+		if (p != end)
+			prt_char(out, ' ');
+	}
+}
+
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+
+#include "fs.h"
+
+#include <linux/fs.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 
-#include "acl.h"
-#include "fs.h"
-#include "xattr.h"
-
 static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long)
 {
 	return sizeof(bch_acl_header) +
@@ -226,18 +279,16 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
 	struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
 	struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter = { NULL };
 	struct bkey_s_c_xattr xattr;
 	struct posix_acl *acl = NULL;
 	struct bkey_s_c k;
 	int ret;
-
-	bch2_trans_init(&trans, c, 0, 0);
 retry:
-	bch2_trans_begin(&trans);
+	bch2_trans_begin(trans);
 
-	ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc,
+	ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
 			&hash, inode_inum(inode), &search, 0);
 	if (ret) {
 		if (!bch2_err_matches(ret, ENOENT))
@@ -253,7 +304,7 @@ retry:
 	}
 
 	xattr = bkey_s_c_to_xattr(k);
-	acl = bch2_acl_from_disk(&trans, xattr_val(xattr.v),
+	acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
 			le16_to_cpu(xattr.v->x_val_len));
 
 	if (!IS_ERR(acl))
@@ -262,8 +313,8 @@ out:
 	if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart))
 		goto retry;
 
-	bch2_trans_iter_exit(&trans, &iter);
-	bch2_trans_exit(&trans);
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
 	return acl;
 }
 
@@ -303,7 +354,7 @@ int bch2_set_acl(struct mnt_idmap *idmap,
 {
 	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter inode_iter = { NULL };
 	struct bch_inode_unpacked inode_u;
 	struct posix_acl *acl;
@@ -311,12 +362,11 @@ int bch2_set_acl(struct mnt_idmap *idmap,
 	int ret;
 
 	mutex_lock(&inode->ei_update_lock);
-	bch2_trans_init(&trans, c, 0, 0);
 retry:
-	bch2_trans_begin(&trans);
+	bch2_trans_begin(trans);
 	acl = _acl;
 
-	ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode),
+	ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
 			      BTREE_ITER_INTENT);
 	if (ret)
 		goto btree_err;
@@ -329,30 +379,30 @@ retry:
 			goto btree_err;
 	}
 
-	ret = bch2_set_acl_trans(&trans, inode_inum(inode), &inode_u, acl, type);
+	ret = bch2_set_acl_trans(trans, inode_inum(inode), &inode_u, acl, type);
 	if (ret)
 		goto btree_err;
 
 	inode_u.bi_ctime	= bch2_current_time(c);
 	inode_u.bi_mode		= mode;
 
-	ret =   bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
-		bch2_trans_commit(&trans, NULL, NULL, 0);
+	ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
+		bch2_trans_commit(trans, NULL, NULL, 0);
 btree_err:
-	bch2_trans_iter_exit(&trans, &inode_iter);
+	bch2_trans_iter_exit(trans, &inode_iter);
 
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
 	if (unlikely(ret))
 		goto err;
 
-	bch2_inode_update_after_write(&trans, inode, &inode_u,
+	bch2_inode_update_after_write(trans, inode, &inode_u,
 				      ATTR_CTIME|ATTR_MODE);
 
 	set_cached_acl(&inode->v, type, acl);
 err:
-	bch2_trans_exit(&trans);
 	mutex_unlock(&inode->ei_update_lock);
+	bch2_trans_put(trans);
 
 	return ret;
 }
@@ -367,7 +417,7 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
 	struct btree_iter iter;
 	struct bkey_s_c_xattr xattr;
 	struct bkey_i_xattr *new;
-	struct posix_acl *acl;
+	struct posix_acl *acl = NULL;
 	struct bkey_s_c k;
 	int ret;
 
@@ -377,9 +427,10 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
 		return bch2_err_matches(ret, ENOENT) ? 0 : ret;
 
 	k = bch2_btree_iter_peek_slot(&iter);
-	xattr = bkey_s_c_to_xattr(k);
+	ret = bkey_err(k);
 	if (ret)
 		goto err;
+	xattr = bkey_s_c_to_xattr(k);
 
 	acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
 			le16_to_cpu(xattr.v->x_val_len));
diff --git a/libbcachefs/acl.h b/libbcachefs/acl.h
index bb21d8d6..27e7eec0 100644
--- a/libbcachefs/acl.h
+++ b/libbcachefs/acl.h
@@ -7,8 +7,6 @@ struct bch_hash_info;
 struct bch_inode_info;
 struct posix_acl;
 
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-
 #define BCH_ACL_VERSION	0x0001
 
 typedef struct {
@@ -26,6 +24,10 @@ typedef struct {
 	__le32		a_version;
 } bch_acl_header;
 
+void bch2_acl_to_text(struct printbuf *, const void *, size_t);
+
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+
 struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int);
 
 int bch2_set_acl_trans(struct btree_trans *, subvol_inum,
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 069d98a8..19ef7a44 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -237,13 +237,12 @@ int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
 }
 
 int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
-			  enum bkey_invalid_flags flags,
-			  struct printbuf *err)
+			  enum bkey_invalid_flags flags, struct printbuf *err)
 {
 	struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
 
 	if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) {
-		prt_printf(err, "bad val size (%u > %lu)",
+		prt_printf(err, "bad val size (%u > %zu)",
 		       alloc_v4_u64s(a.v), bkey_val_u64s(k.k));
 		return -BCH_ERR_invalid_bkey;
 	}
@@ -527,7 +526,7 @@ int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k,
 			     struct printbuf *err)
 {
 	if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) {
-		prt_printf(err, "bad val size (%lu != %zu)",
+		prt_printf(err, "bad val size (%zu != %zu)",
 		       bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
 		return -BCH_ERR_invalid_bkey;
 	}
@@ -549,7 +548,7 @@ void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bke
 
 int bch2_bucket_gens_init(struct bch_fs *c)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	struct bch_alloc_v4 a;
@@ -560,9 +559,7 @@ int bch2_bucket_gens_init(struct bch_fs *c)
 	u8 gen;
 	int ret;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
-	for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+	for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
 			   BTREE_ITER_PREFETCH, k, ret) {
 		/*
 		 * Not a fsck error because this is checked/repaired by
@@ -575,10 +572,10 @@ int bch2_bucket_gens_init(struct bch_fs *c)
 		pos = alloc_gens_pos(iter.pos, &offset);
 
 		if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
-			ret = commit_do(&trans, NULL, NULL,
+			ret = commit_do(trans, NULL, NULL,
 					BTREE_INSERT_NOFAIL|
 					BTREE_INSERT_LAZY_RW,
-				__bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0));
+				bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
 			if (ret)
 				break;
 			have_bucket_gens_key = false;
@@ -592,15 +589,15 @@ int bch2_bucket_gens_init(struct bch_fs *c)
 
 		g.v.gens[offset] = gen;
 	}
-	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_iter_exit(trans, &iter);
 
 	if (have_bucket_gens_key && !ret)
-		ret = commit_do(&trans, NULL, NULL,
+		ret = commit_do(trans, NULL, NULL,
 				BTREE_INSERT_NOFAIL|
 				BTREE_INSERT_LAZY_RW,
-			__bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0));
+			bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 
 	if (ret)
 		bch_err_fn(c, ret);
@@ -609,20 +606,19 @@ int bch2_bucket_gens_init(struct bch_fs *c)
 
 int bch2_alloc_read(struct bch_fs *c)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	struct bch_dev *ca;
 	int ret;
 
 	down_read(&c->gc_lock);
-	bch2_trans_init(&trans, c, 0, 0);
 
 	if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
 		const struct bch_bucket_gens *g;
 		u64 b;
 
-		for_each_btree_key(&trans, iter, BTREE_ID_bucket_gens, POS_MIN,
+		for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
 				   BTREE_ITER_PREFETCH, k, ret) {
 			u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
 			u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
@@ -646,11 +642,11 @@ int bch2_alloc_read(struct bch_fs *c)
 			     b++)
 				*bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
 		}
-		bch2_trans_iter_exit(&trans, &iter);
+		bch2_trans_iter_exit(trans, &iter);
 	} else {
 		struct bch_alloc_v4 a;
 
-		for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+		for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
 				   BTREE_ITER_PREFETCH, k, ret) {
 			/*
 			 * Not a fsck error because this is checked/repaired by
@@ -663,10 +659,10 @@ int bch2_alloc_read(struct bch_fs *c)
 
 			*bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
 		}
-		bch2_trans_iter_exit(&trans, &iter);
+		bch2_trans_iter_exit(trans, &iter);
 	}
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	up_read(&c->gc_lock);
 
 	if (ret)
@@ -1201,15 +1197,15 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
 		}
 
 		if (need_update) {
-			struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(g));
+			struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
 
-			ret = PTR_ERR_OR_ZERO(k);
+			ret = PTR_ERR_OR_ZERO(u);
 			if (ret)
 				goto err;
 
-			memcpy(k, &g, sizeof(g));
+			memcpy(u, &g, sizeof(g));
 
-			ret = bch2_trans_update(trans, bucket_gens_iter, k, 0);
+			ret = bch2_trans_update(trans, bucket_gens_iter, u, 0);
 			if (ret)
 				goto err;
 		}
@@ -1286,7 +1282,7 @@ static int bch2_check_discard_freespace_key(struct btree_trans *trans,
 	if (!btree_id_is_extents(iter->btree_id)) {
 		return __bch2_check_discard_freespace_key(trans, iter);
 	} else {
-		int ret;
+		int ret = 0;
 
 		while (!bkey_eq(iter->pos, end) &&
 		       !(ret = btree_trans_too_many_iters(trans) ?:
@@ -1355,15 +1351,14 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
 		}
 
 	if (need_update) {
-		struct bkey_i *k;
+		struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
 
-		k = bch2_trans_kmalloc(trans, sizeof(g));
-		ret = PTR_ERR_OR_ZERO(k);
+		ret = PTR_ERR_OR_ZERO(u);
 		if (ret)
 			goto out;
 
-		memcpy(k, &g, sizeof(g));
-		ret = bch2_trans_update(trans, iter, k, 0);
+		memcpy(u, &g, sizeof(g));
+		ret = bch2_trans_update(trans, iter, u, 0);
 	}
 out:
 fsck_err:
@@ -1373,27 +1368,25 @@ fsck_err:
 
 int bch2_check_alloc_info(struct bch_fs *c)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
 	struct bkey hole;
 	struct bkey_s_c k;
 	int ret = 0;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
-	bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN,
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
 			     BTREE_ITER_PREFETCH);
-	bch2_trans_iter_init(&trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
+	bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
 			     BTREE_ITER_PREFETCH);
-	bch2_trans_iter_init(&trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
+	bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
 			     BTREE_ITER_PREFETCH);
-	bch2_trans_iter_init(&trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
+	bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
 			     BTREE_ITER_PREFETCH);
 
 	while (1) {
 		struct bpos next;
 
-		bch2_trans_begin(&trans);
+		bch2_trans_begin(trans);
 
 		k = bch2_get_key_or_real_bucket_hole(&iter, &hole);
 		ret = bkey_err(k);
@@ -1406,7 +1399,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
 		if (k.k->type) {
 			next = bpos_nosnap_successor(k.k->p);
 
-			ret = bch2_check_alloc_key(&trans,
+			ret = bch2_check_alloc_key(trans,
 						   k, &iter,
 						   &discard_iter,
 						   &freespace_iter,
@@ -1416,11 +1409,11 @@ int bch2_check_alloc_info(struct bch_fs *c)
 		} else {
 			next = k.k->p;
 
-			ret = bch2_check_alloc_hole_freespace(&trans,
+			ret = bch2_check_alloc_hole_freespace(trans,
 						    bkey_start_pos(k.k),
 						    &next,
 						    &freespace_iter) ?:
-				bch2_check_alloc_hole_bucket_gens(&trans,
+				bch2_check_alloc_hole_bucket_gens(trans,
 						    bkey_start_pos(k.k),
 						    &next,
 						    &bucket_gens_iter);
@@ -1428,7 +1421,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
 				goto bkey_err;
 		}
 
-		ret = bch2_trans_commit(&trans, NULL, NULL,
+		ret = bch2_trans_commit(trans, NULL, NULL,
 					BTREE_INSERT_NOFAIL|
 					BTREE_INSERT_LAZY_RW);
 		if (ret)
@@ -1441,29 +1434,29 @@ bkey_err:
 		if (ret)
 			break;
 	}
-	bch2_trans_iter_exit(&trans, &bucket_gens_iter);
-	bch2_trans_iter_exit(&trans, &freespace_iter);
-	bch2_trans_iter_exit(&trans, &discard_iter);
-	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_iter_exit(trans, &bucket_gens_iter);
+	bch2_trans_iter_exit(trans, &freespace_iter);
+	bch2_trans_iter_exit(trans, &discard_iter);
+	bch2_trans_iter_exit(trans, &iter);
 
 	if (ret < 0)
 		goto err;
 
-	ret = for_each_btree_key2(&trans, iter,
+	ret = for_each_btree_key2(trans, iter,
 			BTREE_ID_need_discard, POS_MIN,
 			BTREE_ITER_PREFETCH, k,
-		bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?:
-	      for_each_btree_key2(&trans, iter,
+		bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
+	      for_each_btree_key2(trans, iter,
 			BTREE_ID_freespace, POS_MIN,
 			BTREE_ITER_PREFETCH, k,
-		bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?:
-	      for_each_btree_key_commit(&trans, iter,
+		bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
+	      for_each_btree_key_commit(trans, iter,
 			BTREE_ID_bucket_gens, POS_MIN,
 			BTREE_ITER_PREFETCH, k,
 			NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
-		bch2_check_bucket_gens_key(&trans, &iter, k));
+		bch2_check_bucket_gens_key(trans, &iter, k));
 err:
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	if (ret)
 		bch_err_fn(c, ret);
 	return ret;
@@ -1549,10 +1542,10 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
 	int ret = 0;
 
 	ret = bch2_trans_run(c,
-		for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
+		for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
 				POS_MIN, BTREE_ITER_PREFETCH, k,
 				NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
-			bch2_check_alloc_to_lru_ref(&trans, &iter)));
+			bch2_check_alloc_to_lru_ref(trans, &iter)));
 	if (ret)
 		bch_err_fn(c, ret);
 	return ret;
@@ -1677,29 +1670,25 @@ out:
 static void bch2_do_discards_work(struct work_struct *work)
 {
 	struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
-	struct btree_trans trans;
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;
 	struct bpos discard_pos_done = POS_MAX;
 	int ret;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
 	/*
 	 * We're doing the commit in bch2_discard_one_bucket instead of using
 	 * for_each_btree_key_commit() so that we can increment counters after
 	 * successful commit:
 	 */
-	ret = for_each_btree_key2(&trans, iter,
-			BTREE_ID_need_discard, POS_MIN, 0, k,
-		bch2_discard_one_bucket(&trans, &iter, &discard_pos_done,
-					&seen,
-					&open,
-					&need_journal_commit,
-					&discarded));
-
-	bch2_trans_exit(&trans);
+	ret = bch2_trans_run(c,
+		for_each_btree_key2(trans, iter,
+				BTREE_ID_need_discard, POS_MIN, 0, k,
+			bch2_discard_one_bucket(trans, &iter, &discard_pos_done,
+						&seen,
+						&open,
+						&need_journal_commit,
+						&discarded)));
 
 	if (need_journal_commit * 2 > seen)
 		bch2_journal_flush_async(&c->journal, NULL);
@@ -1805,15 +1794,13 @@ static void bch2_do_invalidates_work(struct work_struct *work)
 {
 	struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
 	struct bch_dev *ca;
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	unsigned i;
 	int ret = 0;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
-	ret = bch2_btree_write_buffer_flush(&trans);
+	ret = bch2_btree_write_buffer_flush(trans);
 	if (ret)
 		goto err;
 
@@ -1821,11 +1808,11 @@ static void bch2_do_invalidates_work(struct work_struct *work)
 		s64 nr_to_invalidate =
 			should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
 
-		ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_lru,
+		ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru,
 				lru_pos(ca->dev_idx, 0, 0),
 				lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
 				BTREE_ITER_INTENT, k,
-			invalidate_one_bucket(&trans, &iter, k, &nr_to_invalidate));
+			invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate));
 
 		if (ret < 0) {
 			percpu_ref_put(&ca->ref);
@@ -1833,7 +1820,7 @@ static void bch2_do_invalidates_work(struct work_struct *work)
 		}
 	}
 err:
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
 }
 
@@ -1847,7 +1834,7 @@ void bch2_do_invalidates(struct bch_fs *c)
 static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
 				   unsigned long *last_updated)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	struct bkey hole;
@@ -1855,9 +1842,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
 	struct bch_member *m;
 	int ret;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
-	bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc,
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
 			     POS(ca->dev_idx, ca->mi.first_bucket),
 			     BTREE_ITER_PREFETCH);
 	/*
@@ -1871,7 +1856,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
 			*last_updated = jiffies;
 		}
 
-		bch2_trans_begin(&trans);
+		bch2_trans_begin(trans);
 
 		if (bkey_ge(iter.pos, end)) {
 			ret = 0;
@@ -1891,8 +1876,8 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
 			struct bch_alloc_v4 a_convert;
 			const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
 
-			ret =   bch2_bucket_do_index(&trans, k, a, true) ?:
-				bch2_trans_commit(&trans, NULL, NULL,
+			ret =   bch2_bucket_do_index(trans, k, a, true) ?:
+				bch2_trans_commit(trans, NULL, NULL,
 						  BTREE_INSERT_LAZY_RW|
 						  BTREE_INSERT_NOFAIL);
 			if (ret)
@@ -1902,7 +1887,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
 		} else {
 			struct bkey_i *freespace;
 
-			freespace = bch2_trans_kmalloc(&trans, sizeof(*freespace));
+			freespace = bch2_trans_kmalloc(trans, sizeof(*freespace));
 			ret = PTR_ERR_OR_ZERO(freespace);
 			if (ret)
 				goto bkey_err;
@@ -1912,8 +1897,8 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
 			freespace->k.p		= k.k->p;
 			freespace->k.size	= k.k->size;
 
-			ret = __bch2_btree_insert(&trans, BTREE_ID_freespace, freespace, 0) ?:
-				bch2_trans_commit(&trans, NULL, NULL,
+			ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
+				bch2_trans_commit(trans, NULL, NULL,
 						  BTREE_INSERT_LAZY_RW|
 						  BTREE_INSERT_NOFAIL);
 			if (ret)
@@ -1928,11 +1913,11 @@ bkey_err:
 			break;
 	}
 
-	bch2_trans_iter_exit(&trans, &iter);
-	bch2_trans_exit(&trans);
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
 
 	if (ret < 0) {
-		bch_err(ca, "error initializing free space: %s", bch2_err_str(ret));
+		bch_err_msg(ca, ret, "initializing free space");
 		return ret;
 	}
 
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index e02749dd..3bc4abd3 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -25,7 +25,7 @@
 #include "disk_groups.h"
 #include "ec.h"
 #include "error.h"
-#include "io.h"
+#include "io_write.h"
 #include "journal.h"
 #include "movinggc.h"
 #include "nocow_locking.h"
@@ -502,9 +502,14 @@ again:
 }
 
 /**
- * bch_bucket_alloc - allocate a single bucket from a specific device
+ * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
+ * @trans:	transaction object
+ * @ca:		device to allocate from
+ * @watermark:	how important is this allocation?
+ * @cl:		if not NULL, closure to be used to wait if buckets not available
+ * @usage:	for secondarily also returning the current device usage
  *
- * Returns index of bucket on success, 0 on failure
+ * Returns:	an open_bucket on success, or an ERR_PTR() on failure.
  */
 static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
 				      struct bch_dev *ca,
@@ -597,7 +602,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
 	struct open_bucket *ob;
 
 	bch2_trans_do(c, NULL, NULL, 0,
-		      PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, watermark,
+		      PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
 							cl, &usage)));
 	return ob;
 }
@@ -775,7 +780,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
 	struct dev_alloc_list devs_sorted;
 	struct ec_stripe_head *h;
 	struct open_bucket *ob;
-	struct bch_dev *ca;
 	unsigned i, ec_idx;
 	int ret = 0;
 
@@ -805,8 +809,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
 		}
 	goto out_put_head;
 got_bucket:
-	ca = bch_dev_bkey_exists(c, ob->dev);
-
 	ob->ec_idx	= ec_idx;
 	ob->ec		= h->s;
 	ec_stripe_new_get(h->s, STRIPE_REF_io);
@@ -1032,10 +1034,13 @@ static int open_bucket_add_buckets(struct btree_trans *trans,
 
 /**
  * should_drop_bucket - check if this is open_bucket should go away
+ * @ob:		open_bucket to predicate on
+ * @c:		filesystem handle
  * @ca:		if set, we're killing buckets for a particular device
  * @ec:		if true, we're shutting down erasure coding and killing all ec
  *		open_buckets
  *		otherwise, return true
+ * Returns: true if we should kill this open_bucket
  *
  * We're killing open_buckets because we're shutting down a device, erasure
  * coding, or the entire filesystem - check if this open_bucket matches:
diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c
index 8747c5e1..cc856150 100644
--- a/libbcachefs/backpointers.c
+++ b/libbcachefs/backpointers.c
@@ -351,20 +351,17 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_
 {
 	struct bch_fs *c = trans->c;
 	struct btree_iter alloc_iter = { NULL };
-	struct bch_dev *ca;
 	struct bkey_s_c alloc_k;
 	struct printbuf buf = PRINTBUF;
 	int ret = 0;
 
 	if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c,
-			"backpointer for mising device:\n%s",
+			"backpointer for missing device:\n%s",
 			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
 		ret = bch2_btree_delete_at(trans, bp_iter, 0);
 		goto out;
 	}
 
-	ca = bch_dev_bkey_exists(c, k.k->p.inode);
-
 	alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc,
 				     bp_pos_to_bucket(c, k.k->p), 0);
 	ret = bkey_err(alloc_k);
@@ -393,10 +390,10 @@ int bch2_check_btree_backpointers(struct bch_fs *c)
 	int ret;
 
 	ret = bch2_trans_run(c,
-		for_each_btree_key_commit(&trans, iter,
+		for_each_btree_key_commit(trans, iter,
 			BTREE_ID_backpointers, POS_MIN, 0, k,
 			NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-		  bch2_check_btree_backpointer(&trans, &iter, k)));
+		  bch2_check_btree_backpointer(trans, &iter, k)));
 	if (ret)
 		bch_err_fn(c, ret);
 	return ret;
@@ -629,7 +626,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
 	struct bch_fs *c = trans->c;
 	struct btree_iter iter;
 	enum btree_id btree_id;
-	struct bpos_level last_flushed = { UINT_MAX };
+	struct bpos_level last_flushed = { UINT_MAX, POS_MIN };
 	int ret = 0;
 
 	for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) {
@@ -706,7 +703,7 @@ static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
 
 		--btree_nodes;
 		if (!btree_nodes) {
-			*end = alloc_k.k->p;
+			*end = alloc_k.k ? alloc_k.k->p : SPOS_MAX;
 			break;
 		}
 
@@ -726,13 +723,12 @@ static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
 
 int bch2_check_extents_to_backpointers(struct bch_fs *c)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct bpos start = POS_MIN, end;
 	int ret;
 
-	bch2_trans_init(&trans, c, 0, 0);
 	while (1) {
-		ret = bch2_get_alloc_in_memory_pos(&trans, start, &end);
+		ret = bch2_get_alloc_in_memory_pos(trans, start, &end);
 		if (ret)
 			break;
 
@@ -752,13 +748,13 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
 			printbuf_exit(&buf);
 		}
 
-		ret = bch2_check_extents_to_backpointers_pass(&trans, start, end);
+		ret = bch2_check_extents_to_backpointers_pass(trans, start, end);
 		if (ret || bpos_eq(end, SPOS_MAX))
 			break;
 
 		start = bpos_successor(end);
 	}
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 
 	if (ret)
 		bch_err_fn(c, ret);
@@ -827,13 +823,12 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
 
 int bch2_check_backpointers_to_extents(struct bch_fs *c)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end;
 	int ret;
 
-	bch2_trans_init(&trans, c, 0, 0);
 	while (1) {
-		ret = bch2_get_btree_in_memory_pos(&trans,
+		ret = bch2_get_btree_in_memory_pos(trans,
 						   (1U << BTREE_ID_extents)|
 						   (1U << BTREE_ID_reflink),
 						   ~0,
@@ -859,13 +854,13 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
 			printbuf_exit(&buf);
 		}
 
-		ret = bch2_check_backpointers_to_extents_pass(&trans, start, end);
+		ret = bch2_check_backpointers_to_extents_pass(trans, start, end);
 		if (ret || !bbpos_cmp(end, BBPOS_MAX))
 			break;
 
 		start = bbpos_successor(end);
 	}
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 
 	if (ret)
 		bch_err_fn(c, ret);
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 30b3d7b9..9ae82254 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -454,6 +454,7 @@ enum gc_phase {
 	GC_PHASE_BTREE_bucket_gens,
 	GC_PHASE_BTREE_snapshot_trees,
 	GC_PHASE_BTREE_deleted_inodes,
+	GC_PHASE_BTREE_logged_ops,
 
 	GC_PHASE_PENDING_DELETE,
 };
@@ -626,8 +627,8 @@ struct journal_keys {
 	size_t			size;
 };
 
-struct btree_path_buf {
-	struct btree_path	*path;
+struct btree_trans_buf {
+	struct btree_trans	*trans;
 };
 
 #define REPLICAS_DELTA_LIST_MAX	(1U << 16)
@@ -786,9 +787,9 @@ struct bch_fs {
 	/* btree_iter.c: */
 	struct seqmutex		btree_trans_lock;
 	struct list_head	btree_trans_list;
-	mempool_t		btree_paths_pool;
+	mempool_t		btree_trans_pool;
 	mempool_t		btree_trans_mem_pool;
-	struct btree_path_buf  __percpu	*btree_paths_bufs;
+	struct btree_trans_buf  __percpu	*btree_trans_bufs;
 
 	struct srcu_struct	btree_trans_barrier;
 	bool			btree_trans_barrier_initialized;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index f17238be..f0d13044 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -83,8 +83,8 @@ typedef uuid_t __uuid_t;
 #endif
 
 #define BITMASK(name, type, field, offset, end)				\
-static const unsigned	name##_OFFSET = offset;				\
-static const unsigned	name##_BITS = (end - offset);			\
+static const __maybe_unused unsigned	name##_OFFSET = offset;		\
+static const __maybe_unused unsigned	name##_BITS = (end - offset);	\
 									\
 static inline __u64 name(const type *k)					\
 {									\
@@ -98,9 +98,9 @@ static inline void SET_##name(type *k, __u64 v)				\
 }
 
 #define LE_BITMASK(_bits, name, type, field, offset, end)		\
-static const unsigned	name##_OFFSET = offset;				\
-static const unsigned	name##_BITS = (end - offset);			\
-static const __u##_bits	name##_MAX = (1ULL << (end - offset)) - 1;	\
+static const __maybe_unused unsigned	name##_OFFSET = offset;		\
+static const __maybe_unused unsigned	name##_BITS = (end - offset);	\
+static const __maybe_unused __u##_bits	name##_MAX = (1ULL << (end - offset)) - 1;\
 									\
 static inline __u64 name(const type *k)					\
 {									\
@@ -370,7 +370,9 @@ static inline void bkey_init(struct bkey *k)
 	x(backpointer,		28)			\
 	x(inode_v3,		29)			\
 	x(bucket_gens,		30)			\
-	x(snapshot_tree,	31)
+	x(snapshot_tree,	31)			\
+	x(logged_op_truncate,	32)			\
+	x(logged_op_finsert,	33)
 
 enum bch_bkey_type {
 #define x(name, nr) KEY_TYPE_##name	= nr,
@@ -723,7 +725,7 @@ struct bch_inode {
 	__le64			bi_hash_seed;
 	__le32			bi_flags;
 	__le16			bi_mode;
-	__u8			fields[0];
+	__u8			fields[];
 } __packed __aligned(8);
 
 struct bch_inode_v2 {
@@ -733,7 +735,7 @@ struct bch_inode_v2 {
 	__le64			bi_hash_seed;
 	__le64			bi_flags;
 	__le16			bi_mode;
-	__u8			fields[0];
+	__u8			fields[];
 } __packed __aligned(8);
 
 struct bch_inode_v3 {
@@ -745,7 +747,7 @@ struct bch_inode_v3 {
 	__le64			bi_sectors;
 	__le64			bi_size;
 	__le64			bi_version;
-	__u8			fields[0];
+	__u8			fields[];
 } __packed __aligned(8);
 
 #define INODEv3_FIELDS_START_INITIAL	6
@@ -847,8 +849,8 @@ enum {
 	__BCH_INODE_NODUMP		= 3,
 	__BCH_INODE_NOATIME		= 4,
 
-	__BCH_INODE_I_SIZE_DIRTY	= 5,
-	__BCH_INODE_I_SECTORS_DIRTY	= 6,
+	__BCH_INODE_I_SIZE_DIRTY	= 5, /* obsolete */
+	__BCH_INODE_I_SECTORS_DIRTY	= 6, /* obsolete */
 	__BCH_INODE_UNLINKED		= 7,
 	__BCH_INODE_BACKPTR_UNTRUSTED	= 8,
 
@@ -1097,20 +1099,20 @@ struct bch_reflink_v {
 	struct bch_val		v;
 	__le64			refcount;
 	union bch_extent_entry	start[0];
-	__u64			_data[0];
+	__u64			_data[];
 } __packed __aligned(8);
 
 struct bch_indirect_inline_data {
 	struct bch_val		v;
 	__le64			refcount;
-	u8			data[0];
+	u8			data[];
 };
 
 /* Inline data */
 
 struct bch_inline_data {
 	struct bch_val		v;
-	u8			data[0];
+	u8			data[];
 };
 
 /* Subvolumes: */
@@ -1183,6 +1185,33 @@ struct bch_lru {
 
 #define LRU_ID_STRIPES		(1U << 16)
 
+/* Logged operations btree: */
+
+struct bch_logged_op_truncate {
+	struct bch_val		v;
+	__le32			subvol;
+	__le32			pad;
+	__le64			inum;
+	__le64			new_i_size;
+};
+
+enum logged_op_finsert_state {
+	LOGGED_OP_FINSERT_start,
+	LOGGED_OP_FINSERT_shift_extents,
+	LOGGED_OP_FINSERT_finish,
+};
+
+struct bch_logged_op_finsert {
+	struct bch_val		v;
+	__u8			state;
+	__u8			pad[3];
+	__le32			subvol;
+	__le64			inum;
+	__le64			dst_offset;
+	__le64			src_offset;
+	__le64			pos;
+};
+
 /* Optional/variable size superblock sections: */
 
 struct bch_sb_field {
@@ -1223,7 +1252,7 @@ enum bch_sb_field_type {
 
 struct bch_sb_field_journal {
 	struct bch_sb_field	field;
-	__le64			buckets[0];
+	__le64			buckets[];
 };
 
 struct bch_sb_field_journal_v2 {
@@ -1232,7 +1261,7 @@ struct bch_sb_field_journal_v2 {
 	struct bch_sb_field_journal_v2_entry {
 		__le64		start;
 		__le64		nr;
-	}			d[0];
+	}			d[];
 };
 
 /* BCH_SB_FIELD_members: */
@@ -1279,7 +1308,7 @@ enum bch_member_state {
 
 struct bch_sb_field_members {
 	struct bch_sb_field	field;
-	struct bch_member	members[0];
+	struct bch_member	members[];
 };
 
 /* BCH_SB_FIELD_crypt: */
@@ -1377,19 +1406,19 @@ static inline bool data_type_is_hidden(enum bch_data_type type)
 struct bch_replicas_entry_v0 {
 	__u8			data_type;
 	__u8			nr_devs;
-	__u8			devs[0];
+	__u8			devs[];
 } __packed;
 
 struct bch_sb_field_replicas_v0 {
 	struct bch_sb_field	field;
-	struct bch_replicas_entry_v0 entries[0];
+	struct bch_replicas_entry_v0 entries[];
 } __packed __aligned(8);
 
 struct bch_replicas_entry {
 	__u8			data_type;
 	__u8			nr_devs;
 	__u8			nr_required;
-	__u8			devs[0];
+	__u8			devs[];
 } __packed;
 
 #define replicas_entry_bytes(_i)					\
@@ -1397,7 +1426,7 @@ struct bch_replicas_entry {
 
 struct bch_sb_field_replicas {
 	struct bch_sb_field	field;
-	struct bch_replicas_entry entries[0];
+	struct bch_replicas_entry entries[];
 } __packed __aligned(8);
 
 /* BCH_SB_FIELD_quota: */
@@ -1432,7 +1461,7 @@ LE64_BITMASK(BCH_GROUP_PARENT,		struct bch_disk_group, flags[0], 6, 24)
 
 struct bch_sb_field_disk_groups {
 	struct bch_sb_field	field;
-	struct bch_disk_group	entries[0];
+	struct bch_disk_group	entries[];
 } __packed __aligned(8);
 
 /* BCH_SB_FIELD_counters */
@@ -1525,7 +1554,7 @@ enum bch_persistent_counters {
 
 struct bch_sb_field_counters {
 	struct bch_sb_field	field;
-	__le64			d[0];
+	__le64			d[];
 };
 
 /*
@@ -1539,10 +1568,8 @@ struct jset_entry {
 	__u8			type; /* designates what this jset holds */
 	__u8			pad[3];
 
-	union {
-		struct bkey_i	start[0];
-		__u64		_data[0];
-	};
+	struct bkey_i		start[0];
+	__u64			_data[];
 };
 
 struct bch_sb_field_clean {
@@ -1553,10 +1580,8 @@ struct bch_sb_field_clean {
 	__le16			_write_clock;
 	__le64			journal_seq;
 
-	union {
-		struct jset_entry start[0];
-		__u64		_data[0];
-	};
+	struct jset_entry	start[0];
+	__u64			_data[];
 };
 
 struct journal_seq_blacklist_entry {
@@ -1567,10 +1592,8 @@ struct journal_seq_blacklist_entry {
 struct bch_sb_field_journal_seq_blacklist {
 	struct bch_sb_field	field;
 
-	union {
-		struct journal_seq_blacklist_entry start[0];
-		__u64		_data[0];
-	};
+	struct journal_seq_blacklist_entry start[0];
+	__u64			_data[];
 };
 
 /* Superblock: */
@@ -1645,7 +1668,8 @@ enum bcachefs_metadata_version {
 	bcachefs_metadata_version_max
 };
 
-static const unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor;
+static const __maybe_unused
+unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor;
 
 #define bcachefs_metadata_version_current	(bcachefs_metadata_version_max - 1)
 
@@ -1706,10 +1730,8 @@ struct bch_sb {
 
 	struct bch_sb_layout	layout;
 
-	union {
-		struct bch_sb_field start[0];
-		__le64		_data[0];
-	};
+	struct bch_sb_field	start[0];
+	__le64			_data[];
 } __packed __aligned(8);
 
 /*
@@ -1954,7 +1976,7 @@ enum bch_csum_type {
 	BCH_CSUM_NR
 };
 
-static const unsigned bch_crc_bytes[] = {
+static const __maybe_unused unsigned bch_crc_bytes[] = {
 	[BCH_CSUM_none]				= 0,
 	[BCH_CSUM_crc32c_nonzero]		= 4,
 	[BCH_CSUM_crc32c]			= 4,
@@ -2186,10 +2208,8 @@ struct jset {
 	__le64			last_seq;
 
 
-	union {
-		struct jset_entry start[0];
-		__u64		_data[0];
-	};
+	struct jset_entry	start[0];
+	__u64			_data[];
 } __packed __aligned(8);
 
 LE32_BITMASK(JSET_CSUM_TYPE,	struct jset, flags, 0, 4);
@@ -2259,7 +2279,10 @@ enum btree_id_flags {
 	x(snapshot_trees,	15,	0,					\
 	  BIT_ULL(KEY_TYPE_snapshot_tree))					\
 	x(deleted_inodes,	16,	BTREE_ID_SNAPSHOTS,			\
-	  BIT_ULL(KEY_TYPE_set))
+	  BIT_ULL(KEY_TYPE_set))						\
+	x(logged_ops,		17,	0,					\
+	  BIT_ULL(KEY_TYPE_logged_op_truncate)|					\
+	  BIT_ULL(KEY_TYPE_logged_op_finsert))
 
 enum btree_id {
 #define x(name, nr, ...) BTREE_ID_##name = nr,
@@ -2294,10 +2317,8 @@ struct bset {
 	__le16			version;
 	__le16			u64s; /* count of d[] in u64s */
 
-	union {
-		struct bkey_packed start[0];
-		__u64		_data[0];
-	};
+	struct bkey_packed	start[0];
+	__u64			_data[];
 } __packed __aligned(8);
 
 LE32_BITMASK(BSET_CSUM_TYPE,	struct bset, flags, 0, 4);
diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c
index 0a5bfe6e..abdb0550 100644
--- a/libbcachefs/bkey.c
+++ b/libbcachefs/bkey.c
@@ -127,7 +127,7 @@ static void pack_state_finish(struct pack_state *state,
 			      struct bkey_packed *k)
 {
 	EBUG_ON(state->p <  k->_data);
-	EBUG_ON(state->p >= k->_data + state->format->key_u64s);
+	EBUG_ON(state->p >= (u64 *) k->_data + state->format->key_u64s);
 
 	*state->p = state->w;
 }
@@ -308,9 +308,14 @@ struct bpos __bkey_unpack_pos(const struct bkey_format *format,
 
 /**
  * bch2_bkey_pack_key -- pack just the key, not the value
+ * @out:	packed result
+ * @in:		key to pack
+ * @format:	format of packed result
+ *
+ * Returns: true on success, false on failure
  */
 bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
-		   const struct bkey_format *format)
+			const struct bkey_format *format)
 {
 	struct pack_state state = pack_state_init(format, out);
 	u64 *w = out->_data;
@@ -336,9 +341,12 @@ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
 
 /**
  * bch2_bkey_unpack -- unpack the key and the value
+ * @b:		btree node of @src key (for packed format)
+ * @dst:	unpacked result
+ * @src:	packed input
  */
 void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst,
-		 const struct bkey_packed *src)
+		      const struct bkey_packed *src)
 {
 	__bkey_unpack_key(b, &dst->k, src);
 
@@ -349,19 +357,24 @@ void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst,
 
 /**
  * bch2_bkey_pack -- pack the key and the value
+ * @dst:	packed result
+ * @src:	unpacked input
+ * @format:	format of packed result
+ *
+ * Returns: true on success, false on failure
  */
-bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in,
-	       const struct bkey_format *format)
+bool bch2_bkey_pack(struct bkey_packed *dst, const struct bkey_i *src,
+		    const struct bkey_format *format)
 {
 	struct bkey_packed tmp;
 
-	if (!bch2_bkey_pack_key(&tmp, &in->k, format))
+	if (!bch2_bkey_pack_key(&tmp, &src->k, format))
 		return false;
 
-	memmove_u64s((u64 *) out + format->key_u64s,
-		     &in->v,
-		     bkey_val_u64s(&in->k));
-	memcpy_u64s_small(out, &tmp, format->key_u64s);
+	memmove_u64s((u64 *) dst + format->key_u64s,
+		     &src->v,
+		     bkey_val_u64s(&src->k));
+	memcpy_u64s_small(dst, &tmp, format->key_u64s);
 
 	return true;
 }
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index 51969a46..51845020 100644
--- a/libbcachefs/bkey.h
+++ b/libbcachefs/bkey.h
@@ -52,7 +52,7 @@ struct bkey_s {
 
 static inline struct bkey_i *bkey_next(struct bkey_i *k)
 {
-	return (struct bkey_i *) (k->_data + k->k.u64s);
+	return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
 }
 
 #define bkey_val_u64s(_k)	((_k)->u64s - BKEY_U64s)
@@ -397,7 +397,7 @@ static inline void set_bkeyp_val_u64s(const struct bkey_format *format,
 }
 
 #define bkeyp_val(_format, _k)						\
-	 ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k)))
+	 ((struct bch_val *) ((u64 *) (_k)->_data + bkeyp_key_u64s(_format, _k)))
 
 extern const struct bkey_format bch2_bkey_format_current;
 
@@ -732,7 +732,7 @@ static inline unsigned high_word_offset(const struct bkey_format *f)
 #error edit for your odd byteorder.
 #endif
 
-#define high_word(f, k)		((k)->_data + high_word_offset(f))
+#define high_word(f, k)		((u64 *) (k)->_data + high_word_offset(f))
 #define next_word(p)		nth_word(p, 1)
 #define prev_word(p)		nth_word(p, -1)
 
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index 6547142d..be9f012f 100644
--- a/libbcachefs/bkey_methods.c
+++ b/libbcachefs/bkey_methods.c
@@ -10,6 +10,7 @@
 #include "error.h"
 #include "extents.h"
 #include "inode.h"
+#include "io_misc.h"
 #include "lru.h"
 #include "quota.h"
 #include "reflink.h"
@@ -25,7 +26,7 @@ const char * const bch2_bkey_types[] = {
 };
 
 static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
-			       unsigned flags, struct printbuf *err)
+			       enum bkey_invalid_flags flags, struct printbuf *err)
 {
 	return 0;
 }
@@ -39,7 +40,7 @@ static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
 })
 
 static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
-				 unsigned flags, struct printbuf *err)
+				 enum bkey_invalid_flags flags, struct printbuf *err)
 {
 	if (bkey_val_bytes(k.k)) {
 		prt_printf(err, "incorrect value size (%zu != 0)",
@@ -55,7 +56,7 @@ static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
 })
 
 static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k,
-				   unsigned flags, struct printbuf *err)
+				   enum bkey_invalid_flags flags, struct printbuf *err)
 {
 	return 0;
 }
@@ -70,7 +71,7 @@ static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k,
 })
 
 static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
-					unsigned flags, struct printbuf *err)
+					enum bkey_invalid_flags flags, struct printbuf *err)
 {
 	return 0;
 }
@@ -91,7 +92,7 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
 })
 
 static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k,
-				unsigned flags, struct printbuf *err)
+				enum bkey_invalid_flags flags, struct printbuf *err)
 {
 	if (bkey_val_bytes(k.k)) {
 		prt_printf(err, "incorrect value size (%zu != %zu)",
@@ -368,7 +369,6 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
 {
 	const struct bkey_ops *ops;
 	struct bkey uk;
-	struct bkey_s u;
 	unsigned nr_compat = 5;
 	int i;
 
@@ -433,7 +433,9 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
 		}
 
 		break;
-	case 4:
+	case 4: {
+		struct bkey_s u;
+
 		if (!bkey_packed(k)) {
 			u = bkey_i_to_s(packed_to_bkey(k));
 		} else {
@@ -450,6 +452,7 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
 		if (ops->compat)
 			ops->compat(btree_id, version, big_endian, write, u);
 		break;
+	}
 	default:
 		BUG();
 	}
diff --git a/libbcachefs/bkey_sort.h b/libbcachefs/bkey_sort.h
index 79cf11d1..7c0f0b16 100644
--- a/libbcachefs/bkey_sort.h
+++ b/libbcachefs/bkey_sort.h
@@ -9,14 +9,24 @@ struct sort_iter {
 
 	struct sort_iter_set {
 		struct bkey_packed *k, *end;
-	} data[MAX_BSETS + 1];
+	} data[];
 };
 
-static inline void sort_iter_init(struct sort_iter *iter, struct btree *b)
+static inline void sort_iter_init(struct sort_iter *iter, struct btree *b, unsigned size)
 {
 	iter->b = b;
 	iter->used = 0;
-	iter->size = ARRAY_SIZE(iter->data);
+	iter->size = size;
+}
+
+struct sort_iter_stack {
+	struct sort_iter	iter;
+	struct sort_iter_set	sets[MAX_BSETS + 1];
+};
+
+static inline void sort_iter_stack_init(struct sort_iter_stack *iter, struct btree *b)
+{
+	sort_iter_init(&iter->iter, b, ARRAY_SIZE(iter->sets));
 }
 
 static inline void sort_iter_add(struct sort_iter *iter,
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index bcdf28f3..bb73ba90 100644
--- a/libbcachefs/bset.c
+++ b/libbcachefs/bset.c
@@ -172,10 +172,10 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
 		printk(KERN_ERR "iter was:");
 
 		btree_node_iter_for_each(_iter, set) {
-			struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
-			struct bset_tree *t = bch2_bkey_to_bset(b, k);
+			struct bkey_packed *k2 = __btree_node_offset_to_key(b, set->k);
+			struct bset_tree *t = bch2_bkey_to_bset(b, k2);
 			printk(" [%zi %zi]", t - b->set,
-			       k->_data - bset(b, t)->_data);
+			       k2->_data - bset(b, t)->_data);
 		}
 		panic("\n");
 	}
@@ -232,7 +232,7 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
 {
 	struct bset_tree *t = bch2_bkey_to_bset(b, where);
 	struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
-	struct bkey_packed *next = (void *) (where->_data + clobber_u64s);
+	struct bkey_packed *next = (void *) ((u64 *) where->_data + clobber_u64s);
 	struct printbuf buf1 = PRINTBUF;
 	struct printbuf buf2 = PRINTBUF;
 #if 0
@@ -300,7 +300,8 @@ static unsigned bkey_float_byte_offset(unsigned idx)
 }
 
 struct ro_aux_tree {
-	struct bkey_float	f[0];
+	u8			nothing[0];
+	struct bkey_float	f[];
 };
 
 struct rw_aux_tree {
@@ -476,7 +477,7 @@ static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
 {
 	unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];
 
-	return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s);
+	return (void *) ((u64 *) tree_to_bkey(b, t, j)->_data - prev_u64s);
 }
 
 static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
@@ -1010,8 +1011,8 @@ void bch2_bset_insert(struct btree *b,
 		btree_keys_account_key_add(&b->nr, t - b->set, src);
 
 	if (src->u64s != clobber_u64s) {
-		u64 *src_p = where->_data + clobber_u64s;
-		u64 *dst_p = where->_data + src->u64s;
+		u64 *src_p = (u64 *) where->_data + clobber_u64s;
+		u64 *dst_p = (u64 *) where->_data + src->u64s;
 
 		EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
 			(int) clobber_u64s - src->u64s);
@@ -1037,7 +1038,7 @@ void bch2_bset_delete(struct btree *b,
 		      unsigned clobber_u64s)
 {
 	struct bset_tree *t = bset_tree_last(b);
-	u64 *src_p = where->_data + clobber_u64s;
+	u64 *src_p = (u64 *) where->_data + clobber_u64s;
 	u64 *dst_p = where->_data;
 
 	bch2_bset_verify_rw_aux_tree(b, t);
@@ -1188,7 +1189,7 @@ struct bkey_packed *__bch2_bset_search(struct btree *b,
 	case BSET_RO_AUX_TREE:
 		return bset_search_tree(b, t, search, lossy_packed_search);
 	default:
-		unreachable();
+		BUG();
 	}
 }
 
@@ -1268,9 +1269,13 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
 }
 
 /**
- * bch_btree_node_iter_init - initialize a btree node iterator, starting from a
+ * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a
  * given position
  *
+ * @iter:	iterator to initialize
+ * @b:		btree node to search
+ * @search:	search key
+ *
  * Main entry point to the lookup code for individual btree nodes:
  *
  * NOTE:
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index a8283fdc..7c6769cd 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -795,7 +795,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
 	six_unlock_intent(&b->c.lock);
 
 	/* Unlock before doing IO: */
-	if (trans && sync)
+	if (path && sync)
 		bch2_trans_unlock_noassert(trans);
 
 	bch2_btree_node_read(c, b, sync);
@@ -934,7 +934,7 @@ retry:
 	}
 
 	if (unlikely(need_relock)) {
-		int ret = bch2_trans_relock(trans) ?:
+		ret = bch2_trans_relock(trans) ?:
 			bch2_btree_path_relock_intent(trans, path);
 		if (ret) {
 			six_unlock_type(&b->c.lock, lock_type);
@@ -965,11 +965,20 @@ retry:
 }
 
 /**
- * bch_btree_node_get - find a btree node in the cache and lock it, reading it
+ * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
  * in from disk if necessary.
  *
+ * @trans:	btree transaction object
+ * @path:	btree_path being traversed
+ * @k:		pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
+ * @level:	level of btree node being looked up (0 == leaf node)
+ * @lock_type:	SIX_LOCK_read or SIX_LOCK_intent
+ * @trace_ip:	ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
+ *
  * The btree node will have either a read or a write lock held, depending on
  * the @write parameter.
+ *
+ * Returns: btree node or ERR_PTR()
  */
 struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
 				  const struct bkey_i *k, unsigned level,
@@ -1016,28 +1025,8 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *
 	}
 
 	if (unlikely(btree_node_read_in_flight(b))) {
-		u32 seq = six_lock_seq(&b->c.lock);
-
 		six_unlock_type(&b->c.lock, lock_type);
-		bch2_trans_unlock(trans);
-
-		bch2_btree_node_wait_on_read(b);
-
-		/*
-		 * should_be_locked is not set on this path yet, so we need to
-		 * relock it specifically:
-		 */
-		if (trans) {
-			int ret = bch2_trans_relock(trans) ?:
-				bch2_btree_path_relock_intent(trans, path);
-			if (ret) {
-				BUG_ON(!trans->restarted);
-				return ERR_PTR(ret);
-			}
-		}
-
-		if (!six_relock_type(&b->c.lock, lock_type, seq))
-			return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
+		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
 	}
 
 	prefetch(b->aux_data);
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 83dcd9eb..97fbd833 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -529,13 +529,11 @@ fsck_err:
 
 int bch2_check_topology(struct bch_fs *c)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree *b;
 	unsigned i;
 	int ret = 0;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
 	for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
 		struct btree_root *r = bch2_btree_id_root(c, i);
 
@@ -546,8 +544,8 @@ int bch2_check_topology(struct bch_fs *c)
 		if (btree_node_fake(b))
 			continue;
 
-		btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
-		ret = bch2_btree_repair_topology_recurse(&trans, b);
+		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+		ret = bch2_btree_repair_topology_recurse(trans, b);
 		six_unlock_read(&b->c.lock);
 
 		if (ret == DROP_THIS_NODE) {
@@ -556,7 +554,7 @@ int bch2_check_topology(struct bch_fs *c)
 		}
 	}
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 
 	return ret;
 }
@@ -566,8 +564,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
 			       struct bkey_s_c *k)
 {
 	struct bch_fs *c = trans->c;
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k);
-	const union bch_extent_entry *entry;
+	struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(*k);
+	const union bch_extent_entry *entry_c;
 	struct extent_ptr_decoded p = { 0 };
 	bool do_update = false;
 	struct printbuf buf = PRINTBUF;
@@ -577,10 +575,10 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
 	 * XXX
 	 * use check_bucket_ref here
 	 */
-	bkey_for_each_ptr_decode(k->k, ptrs, p, entry) {
+	bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) {
 		struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
 		struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
-		enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr);
+		enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr);
 
 		if (!g->gen_valid &&
 		    (c->opts.reconstruct_alloc ||
@@ -1068,15 +1066,13 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
 
 static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	enum btree_id ids[BTREE_ID_NR];
 	unsigned i;
 	int ret = 0;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
 	if (initial)
-		trans.is_initial_gc = true;
+		trans->is_initial_gc = true;
 
 	for (i = 0; i < BTREE_ID_NR; i++)
 		ids[i] = i;
@@ -1084,22 +1080,22 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
 
 	for (i = 0; i < BTREE_ID_NR && !ret; i++)
 		ret = initial
-			? bch2_gc_btree_init(&trans, ids[i], metadata_only)
-			: bch2_gc_btree(&trans, ids[i], initial, metadata_only);
+			? bch2_gc_btree_init(trans, ids[i], metadata_only)
+			: bch2_gc_btree(trans, ids[i], initial, metadata_only);
 
 	for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) {
 		if (!bch2_btree_id_root(c, i)->alive)
 			continue;
 
 		ret = initial
-			? bch2_gc_btree_init(&trans, i, metadata_only)
-			: bch2_gc_btree(&trans, i, initial, metadata_only);
+			? bch2_gc_btree_init(trans, i, metadata_only)
+			: bch2_gc_btree(trans, i, initial, metadata_only);
 	}
 
 	if (ret < 0)
 		bch_err_fn(c, ret);
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	return ret;
 }
 
@@ -1220,14 +1216,6 @@ static int bch2_gc_done(struct bch_fs *c,
 	     fsck_err(c, _msg ": got %llu, should be %llu"		\
 		      , ##__VA_ARGS__, dst->_f, src->_f)))		\
 		dst->_f = src->_f
-#define copy_stripe_field(_f, _msg, ...)				\
-	if (dst->_f != src->_f &&					\
-	    (!verify ||							\
-	     fsck_err(c, "stripe %zu has wrong "_msg			\
-		      ": got %u, should be %u",				\
-		      iter.pos, ##__VA_ARGS__,				\
-		      dst->_f, src->_f)))				\
-		dst->_f = src->_f
 #define copy_dev_field(_f, _msg, ...)					\
 	copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
 #define copy_fs_field(_f, _msg, ...)					\
@@ -1249,7 +1237,7 @@ static int bch2_gc_done(struct bch_fs *c,
 			copy_dev_field(d[i].sectors,	"%s sectors", bch2_data_types[i]);
 			copy_dev_field(d[i].fragmented,	"%s fragmented", bch2_data_types[i]);
 		}
-	};
+	}
 
 	{
 		unsigned nr = fs_usage_u64s(c);
@@ -1469,37 +1457,35 @@ fsck_err:
 
 static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	struct bch_dev *ca;
 	unsigned i;
 	int ret = 0;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
 	for_each_member_device(ca, c, i) {
-		ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
+		ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
 				POS(ca->dev_idx, ca->mi.first_bucket),
 				BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
 				NULL, NULL, BTREE_INSERT_LAZY_RW,
-			bch2_alloc_write_key(&trans, &iter, k, metadata_only));
+			bch2_alloc_write_key(trans, &iter, k, metadata_only));
 
 		if (ret < 0) {
-			bch_err(c, "error writing alloc info: %s", bch2_err_str(ret));
+			bch_err_fn(c, ret);
 			percpu_ref_put(&ca->ref);
 			break;
 		}
 	}
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	return ret < 0 ? ret : 0;
 }
 
 static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
 {
 	struct bch_dev *ca;
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	struct bucket *g;
@@ -1515,17 +1501,16 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
 		if (!buckets) {
 			percpu_ref_put(&ca->ref);
 			bch_err(c, "error allocating ca->buckets[gc]");
-			return -BCH_ERR_ENOMEM_gc_alloc_start;
+			ret = -BCH_ERR_ENOMEM_gc_alloc_start;
+			goto err;
 		}
 
 		buckets->first_bucket	= ca->mi.first_bucket;
 		buckets->nbuckets	= ca->mi.nbuckets;
 		rcu_assign_pointer(ca->buckets_gc, buckets);
-	};
+	}
 
-	bch2_trans_init(&trans, c, 0, 0);
-
-	for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+	for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
 			   BTREE_ITER_PREFETCH, k, ret) {
 		ca = bch_dev_bkey_exists(c, k.k->p.inode);
 		g = gc_bucket(ca, k.k->p.offset);
@@ -1546,13 +1531,11 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
 			g->stripe_redundancy	= a->stripe_redundancy;
 		}
 	}
-	bch2_trans_iter_exit(&trans, &iter);
-
-	bch2_trans_exit(&trans);
-
+	bch2_trans_iter_exit(trans, &iter);
+err:
+	bch2_trans_put(trans);
 	if (ret)
-		bch_err(c, "error reading alloc info at gc start: %s", bch2_err_str(ret));
-
+		bch_err_fn(c, ret);
 	return ret;
 }
 
@@ -1575,7 +1558,7 @@ static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only)
 			g->dirty_sectors = 0;
 			g->cached_sectors = 0;
 		}
-	};
+	}
 }
 
 static int bch2_gc_write_reflink_key(struct btree_trans *trans,
@@ -1627,7 +1610,7 @@ fsck_err:
 
 static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	size_t idx = 0;
@@ -1636,23 +1619,23 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
 	if (metadata_only)
 		return 0;
 
-	bch2_trans_init(&trans, c, 0, 0);
+	trans = bch2_trans_get(c);
 
-	ret = for_each_btree_key_commit(&trans, iter,
+	ret = for_each_btree_key_commit(trans, iter,
 			BTREE_ID_reflink, POS_MIN,
 			BTREE_ITER_PREFETCH, k,
 			NULL, NULL, BTREE_INSERT_NOFAIL,
-		bch2_gc_write_reflink_key(&trans, &iter, k, &idx));
+		bch2_gc_write_reflink_key(trans, &iter, k, &idx));
 
 	c->reflink_gc_nr = 0;
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	return ret;
 }
 
 static int bch2_gc_reflink_start(struct bch_fs *c,
 				 bool metadata_only)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	struct reflink_gc *r;
@@ -1661,10 +1644,10 @@ static int bch2_gc_reflink_start(struct bch_fs *c,
 	if (metadata_only)
 		return 0;
 
-	bch2_trans_init(&trans, c, 0, 0);
+	trans = bch2_trans_get(c);
 	c->reflink_gc_nr = 0;
 
-	for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
+	for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
 			   BTREE_ITER_PREFETCH, k, ret) {
 		const __le64 *refcount = bkey_refcount_c(k);
 
@@ -1682,9 +1665,9 @@ static int bch2_gc_reflink_start(struct bch_fs *c,
 		r->size		= k.k->size;
 		r->refcount	= 0;
 	}
-	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_iter_exit(trans, &iter);
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	return ret;
 }
 
@@ -1751,7 +1734,7 @@ fsck_err:
 
 static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	int ret = 0;
@@ -1759,15 +1742,15 @@ static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
 	if (metadata_only)
 		return 0;
 
-	bch2_trans_init(&trans, c, 0, 0);
+	trans = bch2_trans_get(c);
 
-	ret = for_each_btree_key_commit(&trans, iter,
+	ret = for_each_btree_key_commit(trans, iter,
 			BTREE_ID_stripes, POS_MIN,
 			BTREE_ITER_PREFETCH, k,
 			NULL, NULL, BTREE_INSERT_NOFAIL,
-		bch2_gc_write_stripes_key(&trans, &iter, k));
+		bch2_gc_write_stripes_key(trans, &iter, k));
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	return ret;
 }
 
@@ -1779,6 +1762,12 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
 /**
  * bch2_gc - walk _all_ references to buckets, and recompute them:
  *
+ * @c:			filesystem object
+ * @initial:		are we in recovery?
+ * @metadata_only:	are we just checking metadata references, or everything?
+ *
+ * Returns: 0 on success, or standard errcode on failure
+ *
  * Order matters here:
  *  - Concurrent GC relies on the fact that we have a total ordering for
  *    everything that GC walks - see  gc_will_visit_node(),
@@ -1947,7 +1936,7 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i
 
 int bch2_gc_gens(struct bch_fs *c)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	struct bch_dev *ca;
@@ -1965,7 +1954,7 @@ int bch2_gc_gens(struct bch_fs *c)
 
 	trace_and_count(c, gc_gens_start, c);
 	down_read(&c->gc_lock);
-	bch2_trans_init(&trans, c, 0, 0);
+	trans = bch2_trans_get(c);
 
 	for_each_member_device(ca, c, i) {
 		struct bucket_gens *gens;
@@ -1988,33 +1977,31 @@ int bch2_gc_gens(struct bch_fs *c)
 
 	for (i = 0; i < BTREE_ID_NR; i++)
 		if (btree_type_has_ptrs(i)) {
-			struct btree_iter iter;
-			struct bkey_s_c k;
-
 			c->gc_gens_btree = i;
 			c->gc_gens_pos = POS_MIN;
-			ret = for_each_btree_key_commit(&trans, iter, i,
+
+			ret = for_each_btree_key_commit(trans, iter, i,
 					POS_MIN,
 					BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
 					k,
 					NULL, NULL,
 					BTREE_INSERT_NOFAIL,
-				gc_btree_gens_key(&trans, &iter, k));
+				gc_btree_gens_key(trans, &iter, k));
 			if (ret && !bch2_err_matches(ret, EROFS))
-				bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret));
+				bch_err_fn(c, ret);
 			if (ret)
 				goto err;
 		}
 
-	ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
+	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
 			POS_MIN,
 			BTREE_ITER_PREFETCH,
 			k,
 			NULL, NULL,
 			BTREE_INSERT_NOFAIL,
-		bch2_alloc_write_oldest_gen(&trans, &iter, k));
+		bch2_alloc_write_oldest_gen(trans, &iter, k));
 	if (ret && !bch2_err_matches(ret, EROFS))
-		bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret));
+		bch_err_fn(c, ret);
 	if (ret)
 		goto err;
 
@@ -2031,7 +2018,7 @@ err:
 		ca->oldest_gen = NULL;
 	}
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	up_read(&c->gc_lock);
 	mutex_unlock(&c->gc_gens_lock);
 	return ret;
@@ -2086,7 +2073,7 @@ static int bch2_gc_thread(void *arg)
 		ret = bch2_gc_gens(c);
 #endif
 		if (ret < 0)
-			bch_err(c, "btree gc failed: %s", bch2_err_str(ret));
+			bch_err_fn(c, ret);
 
 		debug_check_no_locks_held();
 	}
@@ -2116,7 +2103,7 @@ int bch2_gc_thread_start(struct bch_fs *c)
 
 	p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
 	if (IS_ERR(p)) {
-		bch_err(c, "error creating gc thread: %s", bch2_err_str(PTR_ERR(p)));
+		bch_err_fn(c, PTR_ERR(p));
 		return PTR_ERR(p);
 	}
 
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 3b654841..a869cf6a 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -14,7 +14,7 @@
 #include "debug.h"
 #include "error.h"
 #include "extents.h"
-#include "io.h"
+#include "io_write.h"
 #include "journal_reclaim.h"
 #include "journal_seq_blacklist.h"
 #include "recovery.h"
@@ -106,8 +106,8 @@ static void btree_bounce_free(struct bch_fs *c, size_t size,
 		vpfree(p, size);
 }
 
-static void *btree_bounce_alloc_noprof(struct bch_fs *c, size_t size,
-				       bool *used_mempool)
+static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
+				bool *used_mempool)
 {
 	unsigned flags = memalloc_nofs_save();
 	void *p;
@@ -115,7 +115,7 @@ static void *btree_bounce_alloc_noprof(struct bch_fs *c, size_t size,
 	BUG_ON(size > btree_bytes(c));
 
 	*used_mempool = false;
-	p = vpmalloc_noprof(size, __GFP_NOWARN|GFP_NOWAIT);
+	p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
 	if (!p) {
 		*used_mempool = true;
 		p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
@@ -123,8 +123,6 @@ static void *btree_bounce_alloc_noprof(struct bch_fs *c, size_t size,
 	memalloc_nofs_restore(flags);
 	return p;
 }
-#define btree_bounce_alloc(_c, _size, _used_mempool)		\
-	alloc_hooks(btree_bounce_alloc_noprof(_c, _size, _used_mempool))
 
 static void sort_bkey_ptrs(const struct btree *bt,
 			   struct bkey_packed **ptrs, unsigned nr)
@@ -294,7 +292,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
 			    bool filter_whiteouts)
 {
 	struct btree_node *out;
-	struct sort_iter sort_iter;
+	struct sort_iter_stack sort_iter;
 	struct bset_tree *t;
 	struct bset *start_bset = bset(b, &b->set[start_idx]);
 	bool used_mempool = false;
@@ -303,13 +301,13 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
 	bool sorting_entire_node = start_idx == 0 &&
 		end_idx == b->nsets;
 
-	sort_iter_init(&sort_iter, b);
+	sort_iter_stack_init(&sort_iter, b);
 
 	for (t = b->set + start_idx;
 	     t < b->set + end_idx;
 	     t++) {
 		u64s += le16_to_cpu(bset(b, t)->u64s);
-		sort_iter_add(&sort_iter,
+		sort_iter_add(&sort_iter.iter,
 			      btree_bkey_first(b, t),
 			      btree_bkey_last(b, t));
 	}
@@ -322,7 +320,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
 
 	start_time = local_clock();
 
-	u64s = bch2_sort_keys(out->keys.start, &sort_iter, filter_whiteouts);
+	u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter, filter_whiteouts);
 
 	out->keys.u64s = cpu_to_le16(u64s);
 
@@ -338,7 +336,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
 	start_bset->journal_seq = cpu_to_le64(seq);
 
 	if (sorting_entire_node) {
-		unsigned u64s = le16_to_cpu(out->keys.u64s);
+		u64s = le16_to_cpu(out->keys.u64s);
 
 		BUG_ON(bytes != btree_bytes(c));
 
@@ -412,8 +410,6 @@ void bch2_btree_sort_into(struct bch_fs *c,
 	bch2_verify_btree_nr_keys(dst);
 }
 
-#define SORT_CRIT	(4096 / sizeof(u64))
-
 /*
  * We're about to add another bset to the btree node, so if there's currently
  * too many bsets - sort some of them together:
@@ -544,6 +540,7 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
 	prt_str(out, ": ");
 }
 
+__printf(8, 9)
 static int __btree_err(int ret,
 		       struct bch_fs *c,
 		       struct bch_dev *ca,
@@ -624,9 +621,6 @@ __cold
 void bch2_btree_node_drop_keys_outside_node(struct btree *b)
 {
 	struct bset_tree *t;
-	struct bkey_s_c k;
-	struct bkey unpacked;
-	struct btree_node_iter iter;
 
 	for_each_bset(b, t) {
 		struct bset *i = bset(b, t);
@@ -662,6 +656,9 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)
 	bch2_bset_set_no_aux_tree(b, b->set);
 	bch2_btree_build_aux_trees(b);
 
+	struct bkey_s_c k;
+	struct bkey unpacked;
+	struct btree_node_iter iter;
 	for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
 		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
 		BUG_ON(bpos_gt(k.k->p, b->data->max_key));
@@ -910,7 +907,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 	bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
 		BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
 	unsigned u64s;
-	unsigned blacklisted_written, nonblacklisted_written = 0;
 	unsigned ptr_written = btree_ptr_sectors_written(&b->key);
 	struct printbuf buf = PRINTBUF;
 	int ret = 0, retry_read = 0, write = READ;
@@ -920,8 +916,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 	b->written = 0;
 
 	iter = mempool_alloc(&c->fill_iter, GFP_NOFS);
-	sort_iter_init(iter, b);
-	iter->size = (btree_blocks(c) + 1) * 2;
+	sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2);
 
 	if (bch2_meta_read_fault("btree"))
 		btree_err(-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
@@ -1045,8 +1040,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 		sort_iter_add(iter,
 			      vstruct_idx(i, 0),
 			      vstruct_last(i));
-
-		nonblacklisted_written = b->written;
 	}
 
 	if (ptr_written) {
@@ -1064,18 +1057,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 								      true),
 				     -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL,
 				     "found bset signature after last bset");
-
-		/*
-		 * Blacklisted bsets are those that were written after the most recent
-		 * (flush) journal write. Since there wasn't a flush, they may not have
-		 * made it to all devices - which means we shouldn't write new bsets
-		 * after them, as that could leave a gap and then reads from that device
-		 * wouldn't find all the bsets in that btree node - which means it's
-		 * important that we start writing new bsets after the most recent _non_
-		 * blacklisted bset:
-		 */
-		blacklisted_written = b->written;
-		b->written = nonblacklisted_written;
 	}
 
 	sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
@@ -1143,9 +1124,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 	btree_node_reset_sib_u64s(b);
 
 	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
-		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+		struct bch_dev *ca2 = bch_dev_bkey_exists(c, ptr->dev);
 
-		if (ca->mi.state != BCH_MEMBER_STATE_rw)
+		if (ca2->mi.state != BCH_MEMBER_STATE_rw)
 			set_btree_node_need_rewrite(b);
 	}
 
@@ -1227,19 +1208,17 @@ start:
 	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
 			       rb->start_time);
 	bio_put(&rb->bio);
-	printbuf_exit(&buf);
 
 	if (saw_error && !btree_node_read_error(b)) {
-		struct printbuf buf = PRINTBUF;
-
+		printbuf_reset(&buf);
 		bch2_bpos_to_text(&buf, b->key.k.p);
 		bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
 			 __func__, bch2_btree_ids[b->c.btree_id], b->c.level, buf.buf);
-		printbuf_exit(&buf);
 
 		bch2_btree_node_rewrite_async(c, b);
 	}
 
+	printbuf_exit(&buf);
 	clear_btree_node_read_in_flight(b);
 	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
 }
@@ -1649,8 +1628,7 @@ err:
 int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
 			const struct bkey_i *k, unsigned level)
 {
-	return bch2_trans_run(c, __bch2_btree_root_read(&trans, id, k, level));
-
+	return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
 }
 
 void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
@@ -1712,15 +1690,13 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
 
 static void btree_node_write_done(struct bch_fs *c, struct btree *b)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 
-	bch2_trans_init(&trans, c, 0, 0);
-
-	btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
 	__btree_node_write_done(c, b);
 	six_unlock_read(&b->c.lock);
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 }
 
 static void btree_node_write_work(struct work_struct *work)
@@ -1749,7 +1725,7 @@ static void btree_node_write_work(struct work_struct *work)
 		}
 	} else {
 		ret = bch2_trans_do(c, NULL, NULL, 0,
-			bch2_btree_node_update_key_get_iter(&trans, b, &wbio->key,
+			bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
 					BCH_WATERMARK_reclaim|
 					BTREE_INSERT_JOURNAL_RECLAIM|
 					BTREE_INSERT_NOFAIL|
@@ -1854,7 +1830,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
 	struct bset *i;
 	struct btree_node *bn = NULL;
 	struct btree_node_entry *bne = NULL;
-	struct sort_iter sort_iter;
+	struct sort_iter_stack sort_iter;
 	struct nonce nonce;
 	unsigned bytes_to_write, sectors_to_write, bytes, u64s;
 	u64 seq = 0;
@@ -1927,7 +1903,7 @@ do_write:
 
 	bch2_sort_whiteouts(c, b);
 
-	sort_iter_init(&sort_iter, b);
+	sort_iter_stack_init(&sort_iter, b);
 
 	bytes = !b->written
 		? sizeof(struct btree_node)
@@ -1942,7 +1918,7 @@ do_write:
 			continue;
 
 		bytes += le16_to_cpu(i->u64s) * sizeof(u64);
-		sort_iter_add(&sort_iter,
+		sort_iter_add(&sort_iter.iter,
 			      btree_bkey_first(b, t),
 			      btree_bkey_last(b, t));
 		seq = max(seq, le64_to_cpu(i->journal_seq));
@@ -1971,14 +1947,14 @@ do_write:
 	i->journal_seq	= cpu_to_le64(seq);
 	i->u64s		= 0;
 
-	sort_iter_add(&sort_iter,
+	sort_iter_add(&sort_iter.iter,
 		      unwritten_whiteouts_start(c, b),
 		      unwritten_whiteouts_end(c, b));
 	SET_BSET_SEPARATE_WHITEOUTS(i, false);
 
 	b->whiteout_u64s = 0;
 
-	u64s = bch2_sort_keys(i->start, &sort_iter, false);
+	u64s = bch2_sort_keys(i->start, &sort_iter.iter, false);
 	le16_add_cpu(&i->u64s, u64s);
 
 	BUG_ON(!b->written && i->u64s != b->data->keys.u64s);
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index cd99bbb0..7e03dd76 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -7,7 +7,7 @@
 #include "btree_locking.h"
 #include "checksum.h"
 #include "extents.h"
-#include "io_types.h"
+#include "io_write_types.h"
 
 struct bch_fs;
 struct btree_write;
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 5216d339..4cee5e6c 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -488,7 +488,6 @@ fixup_done:
 	if (!bch2_btree_node_iter_end(node_iter) &&
 	    iter_current_key_modified &&
 	    b->c.level) {
-		struct bset_tree *t;
 		struct bkey_packed *k, *k2, *p;
 
 		k = bch2_btree_node_iter_peek_all(node_iter, b);
@@ -689,7 +688,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
 			if (t != BTREE_NODE_UNLOCKED) {
 				btree_node_unlock(trans, path, b->c.level);
 				six_lock_increment(&b->c.lock, (enum six_lock_type) t);
-				mark_btree_node_locked(trans, path, b->c.level, (enum six_lock_type) t);
+				mark_btree_node_locked(trans, path, b->c.level, t);
 			}
 
 			bch2_btree_path_level_init(trans, path, b);
@@ -764,7 +763,8 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
 			for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
 				path->l[i].b = NULL;
 
-			mark_btree_node_locked(trans, path, path->level, lock_type);
+			mark_btree_node_locked(trans, path, path->level,
+					       (enum btree_node_locked_type) lock_type);
 			bch2_btree_path_level_init(trans, path, b);
 			return 0;
 		}
@@ -936,7 +936,8 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
 	if (btree_node_read_locked(path, level + 1))
 		btree_node_unlock(trans, path, level + 1);
 
-	mark_btree_node_locked(trans, path, level, lock_type);
+	mark_btree_node_locked(trans, path, level,
+			       (enum btree_node_locked_type) lock_type);
 	path->level = level;
 	bch2_btree_path_level_init(trans, path, b);
 
@@ -1341,14 +1342,14 @@ static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *p
 	__bch2_path_free(trans, path);
 }
 
-void bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
+void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
 {
 	panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
 	      trans->restart_count, restart_count,
 	      (void *) trans->last_begin_ip);
 }
 
-void bch2_trans_in_restart_error(struct btree_trans *trans)
+void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
 {
 	panic("in transaction restart: %s, last restarted by %pS\n",
 	      bch2_err_str(trans->restarted),
@@ -1493,7 +1494,7 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans)
 static noinline void btree_path_overflow(struct btree_trans *trans)
 {
 	bch2_dump_trans_paths_updates(trans);
-	panic("trans path oveflow\n");
+	panic("trans path overflow\n");
 }
 
 static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
@@ -2046,8 +2047,12 @@ out:
 }
 
 /**
- * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
- * current position
+ * bch2_btree_iter_peek_upto() - returns first key greater than or equal to
+ * iterator's current position
+ * @iter:	iterator to peek from
+ * @end:	search limit: returns keys less than or equal to @end
+ *
+ * Returns:	key if found, or an error extractable with bkey_err().
  */
 struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
 {
@@ -2184,10 +2189,13 @@ end:
 }
 
 /**
- * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal
- * to iterator's current position, returning keys from every level of the btree.
- * For keys at different levels of the btree that compare equal, the key from
- * the lower level (leaf) is returned first.
+ * bch2_btree_iter_peek_all_levels() - returns the first key greater than or
+ * equal to iterator's current position, returning keys from every level of the
+ * btree. For keys at different levels of the btree that compare equal, the key
+ * from the lower level (leaf) is returned first.
+ * @iter:	iterator to peek from
+ *
+ * Returns:	key if found, or an error extractable with bkey_err().
  */
 struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
 {
@@ -2278,8 +2286,11 @@ out_no_locked:
 }
 
 /**
- * bch2_btree_iter_next: returns first key greater than iterator's current
+ * bch2_btree_iter_next() - returns first key greater than iterator's current
  * position
+ * @iter:	iterator to peek from
+ *
+ * Returns:	key if found, or an error extractable with bkey_err().
  */
 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
 {
@@ -2290,8 +2301,11 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
 }
 
 /**
- * bch2_btree_iter_peek_prev: returns first key less than or equal to
+ * bch2_btree_iter_peek_prev() - returns first key less than or equal to
  * iterator's current position
+ * @iter:	iterator to peek from
+ *
+ * Returns:	key if found, or an error extractable with bkey_err().
  */
 struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
 {
@@ -2414,8 +2428,11 @@ out_no_locked:
 }
 
 /**
- * bch2_btree_iter_prev: returns first key less than iterator's current
+ * bch2_btree_iter_prev() - returns first key less than iterator's current
  * position
+ * @iter:	iterator to peek from
+ *
+ * Returns:	key if found, or an error extractable with bkey_err().
  */
 struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
 {
@@ -2722,7 +2739,7 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
 
 void bch2_trans_iter_init_outlined(struct btree_trans *trans,
 			  struct btree_iter *iter,
-			  unsigned btree_id, struct bpos pos,
+			  enum btree_id btree_id, struct bpos pos,
 			  unsigned flags)
 {
 	bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
@@ -2830,6 +2847,8 @@ static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans)
  * bch2_trans_begin() - reset a transaction after a interrupted attempt
  * @trans: transaction to reset
  *
+ * Returns:	current restart counter, to be used with trans_was_restarted()
+ *
  * While iterating over nodes or updating nodes a attempt to lock a btree node
  * may return BCH_ERR_transaction_restart when the trylock fails. When this
  * occurs bch2_trans_begin() should be called and the transaction retried.
@@ -2887,28 +2906,23 @@ u32 bch2_trans_begin(struct btree_trans *trans)
 	return trans->restart_count;
 }
 
-static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
+static struct btree_trans *bch2_trans_alloc(struct bch_fs *c)
 {
-	size_t paths_bytes	= sizeof(struct btree_path) * BTREE_ITER_MAX;
-	size_t updates_bytes	= sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
-	void *p = NULL;
+	struct btree_trans *trans;
 
-	BUG_ON(trans->used_mempool);
-
-#ifdef __KERNEL__
-	p = this_cpu_xchg(c->btree_paths_bufs->path, NULL);
-#endif
-	if (!p) {
-		p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS);
-		/*
-		 * paths need to be zeroed, bch2_check_for_deadlock looks at
-		 * paths in other threads
-		 */
-		memset(p, 0, paths_bytes);
+	if (IS_ENABLED(__KERNEL__)) {
+		trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
+		if (trans)
+			return trans;
 	}
 
-	trans->paths		= p; p += paths_bytes;
-	trans->updates		= p; p += updates_bytes;
+	trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
+	/*
+	 * paths need to be zeroed, bch2_check_for_deadlock looks at
+	 * paths in other threads
+	 */
+	memset(&trans->paths, 0, sizeof(trans->paths));
+	return trans;
 }
 
 const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
@@ -2928,13 +2942,16 @@ unsigned bch2_trans_get_fn_idx(const char *fn)
 	return i;
 }
 
-void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_idx)
+struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
 	__acquires(&c->btree_trans_barrier)
 {
+	struct btree_trans *trans;
 	struct btree_transaction_stats *s;
 
 	bch2_assert_btree_nodes_not_locked();
 
+	trans = bch2_trans_alloc(c);
+
 	memset(trans, 0, sizeof(*trans));
 	trans->c		= c;
 	trans->fn		= fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
@@ -2946,8 +2963,6 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_
 		!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
 	closure_init_stack(&trans->ref);
 
-	bch2_trans_alloc_paths(trans, c);
-
 	s = btree_trans_stats(trans);
 	if (s && s->max_mem) {
 		unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
@@ -2993,6 +3008,8 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_
 list_add_done:
 		seqmutex_unlock(&c->btree_trans_lock);
 	}
+
+	return trans;
 }
 
 static void check_btree_paths_leaked(struct btree_trans *trans)
@@ -3017,7 +3034,7 @@ leaked:
 #endif
 }
 
-void bch2_trans_exit(struct btree_trans *trans)
+void bch2_trans_put(struct btree_trans *trans)
 	__releases(&c->btree_trans_barrier)
 {
 	struct btree_insert_entry *i;
@@ -3063,18 +3080,11 @@ void bch2_trans_exit(struct btree_trans *trans)
 	else
 		kfree(trans->mem);
 
-#ifdef __KERNEL__
-	/*
-	 * Userspace doesn't have a real percpu implementation:
-	 */
-	trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths);
-#endif
-
-	if (trans->paths)
-		mempool_free(trans->paths, &c->btree_paths_pool);
-
-	trans->mem	= (void *) 0x1;
-	trans->paths	= (void *) 0x1;
+	/* Userspace doesn't have a real percpu implementation: */
+	if (IS_ENABLED(__KERNEL__))
+		trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
+	if (trans)
+		mempool_free(trans, &c->btree_trans_pool);
 }
 
 static void __maybe_unused
@@ -3152,6 +3162,17 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
 void bch2_fs_btree_iter_exit(struct bch_fs *c)
 {
 	struct btree_transaction_stats *s;
+	struct btree_trans *trans;
+	int cpu;
+
+	trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
+	if (trans)
+		panic("%s leaked btree_trans\n", trans->fn);
+
+	if (c->btree_trans_bufs)
+		for_each_possible_cpu(cpu)
+			kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans);
+	free_percpu(c->btree_trans_bufs);
 
 	for (s = c->btree_transaction_stats;
 	     s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
@@ -3163,13 +3184,12 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
 	if (c->btree_trans_barrier_initialized)
 		cleanup_srcu_struct(&c->btree_trans_barrier);
 	mempool_exit(&c->btree_trans_mem_pool);
-	mempool_exit(&c->btree_paths_pool);
+	mempool_exit(&c->btree_trans_pool);
 }
 
 int bch2_fs_btree_iter_init(struct bch_fs *c)
 {
 	struct btree_transaction_stats *s;
-	unsigned nr = BTREE_ITER_MAX;
 	int ret;
 
 	for (s = c->btree_transaction_stats;
@@ -3182,9 +3202,12 @@ int bch2_fs_btree_iter_init(struct bch_fs *c)
 	INIT_LIST_HEAD(&c->btree_trans_list);
 	seqmutex_init(&c->btree_trans_lock);
 
-	ret   = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1,
-			sizeof(struct btree_path) * nr +
-			sizeof(struct btree_insert_entry) * nr) ?:
+	c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf);
+	if (!c->btree_trans_bufs)
+		return -ENOMEM;
+
+	ret   = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1,
+					  sizeof(struct btree_trans)) ?:
 		mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
 					  BTREE_TRANS_MEM_MAX) ?:
 		init_srcu_struct(&c->btree_trans_barrier);
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 8876f2b8..fbe27345 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -276,12 +276,14 @@ int bch2_trans_relock_notrace(struct btree_trans *);
 void bch2_trans_unlock(struct btree_trans *);
 bool bch2_trans_locked(struct btree_trans *);
 
-static inline bool trans_was_restarted(struct btree_trans *trans, u32 restart_count)
+static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count)
 {
-	return restart_count != trans->restart_count;
+	return restart_count != trans->restart_count
+		? -BCH_ERR_transaction_restart_nested
+		: 0;
 }
 
-void bch2_trans_restart_error(struct btree_trans *, u32);
+void __noreturn bch2_trans_restart_error(struct btree_trans *, u32);
 
 static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans,
 						   u32 restart_count)
@@ -290,7 +292,7 @@ static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans,
 		bch2_trans_restart_error(trans, restart_count);
 }
 
-void bch2_trans_in_restart_error(struct btree_trans *);
+void __noreturn bch2_trans_in_restart_error(struct btree_trans *);
 
 static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans)
 {
@@ -463,7 +465,7 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
 }
 
 void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *,
-			  unsigned, struct bpos, unsigned);
+			  enum btree_id, struct bpos, unsigned);
 
 static inline void bch2_trans_iter_init(struct btree_trans *trans,
 			  struct btree_iter *iter,
@@ -672,17 +674,17 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
 #define lockrestart_do(_trans, _do)					\
 ({									\
 	u32 _restart_count;						\
-	int _ret;							\
+	int _ret2;							\
 									\
 	do {								\
 		_restart_count = bch2_trans_begin(_trans);		\
-		_ret = (_do);						\
-	} while (bch2_err_matches(_ret, BCH_ERR_transaction_restart));	\
+		_ret2 = (_do);						\
+	} while (bch2_err_matches(_ret2, BCH_ERR_transaction_restart));	\
 									\
-	if (!_ret)							\
+	if (!_ret2)							\
 		bch2_trans_verify_not_restarted(_trans, _restart_count);\
 									\
-	_ret;								\
+	_ret2;								\
 })
 
 /*
@@ -697,26 +699,23 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
 #define nested_lockrestart_do(_trans, _do)				\
 ({									\
 	u32 _restart_count, _orig_restart_count;			\
-	int _ret;							\
+	int _ret2;							\
 									\
 	_restart_count = _orig_restart_count = (_trans)->restart_count;	\
 									\
-	while (bch2_err_matches(_ret = (_do), BCH_ERR_transaction_restart))\
+	while (bch2_err_matches(_ret2 = (_do), BCH_ERR_transaction_restart))\
 		_restart_count = bch2_trans_begin(_trans);		\
 									\
-	if (!_ret)							\
+	if (!_ret2)							\
 		bch2_trans_verify_not_restarted(_trans, _restart_count);\
 									\
-	if (!_ret && trans_was_restarted(_trans, _orig_restart_count))	\
-		_ret = -BCH_ERR_transaction_restart_nested;		\
-									\
-	_ret;								\
+	_ret2 ?: trans_was_restarted(_trans, _restart_count);		\
 })
 
 #define for_each_btree_key2(_trans, _iter, _btree_id,			\
 			    _start, _flags, _k, _do)			\
 ({									\
-	int _ret = 0;							\
+	int _ret3 = 0;							\
 									\
 	bch2_trans_iter_init((_trans), &(_iter), (_btree_id),		\
 			     (_start), (_flags));			\
@@ -724,15 +723,15 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
 	while (1) {							\
 		u32 _restart_count = bch2_trans_begin(_trans);		\
 									\
-		_ret = 0;						\
+		_ret3 = 0;						\
 		(_k) = bch2_btree_iter_peek_type(&(_iter), (_flags));	\
 		if (!(_k).k)						\
 			break;						\
 									\
-		_ret = bkey_err(_k) ?: (_do);				\
-		if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+		_ret3 = bkey_err(_k) ?: (_do);				\
+		if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
 			continue;					\
-		if (_ret)						\
+		if (_ret3)						\
 			break;						\
 		bch2_trans_verify_not_restarted(_trans, _restart_count);\
 		if (!bch2_btree_iter_advance(&(_iter)))			\
@@ -740,13 +739,13 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
 	}								\
 									\
 	bch2_trans_iter_exit((_trans), &(_iter));			\
-	_ret;								\
+	_ret3;								\
 })
 
 #define for_each_btree_key2_upto(_trans, _iter, _btree_id,		\
 			    _start, _end, _flags, _k, _do)		\
 ({									\
-	int _ret = 0;							\
+	int _ret3 = 0;							\
 									\
 	bch2_trans_iter_init((_trans), &(_iter), (_btree_id),		\
 			     (_start), (_flags));			\
@@ -754,15 +753,15 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
 	while (1) {							\
 		u32 _restart_count = bch2_trans_begin(_trans);		\
 									\
-		_ret = 0;						\
+		_ret3 = 0;						\
 		(_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, (_flags));\
 		if (!(_k).k)						\
 			break;						\
 									\
-		_ret = bkey_err(_k) ?: (_do);				\
-		if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+		_ret3 = bkey_err(_k) ?: (_do);				\
+		if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
 			continue;					\
-		if (_ret)						\
+		if (_ret3)						\
 			break;						\
 		bch2_trans_verify_not_restarted(_trans, _restart_count);\
 		if (!bch2_btree_iter_advance(&(_iter)))			\
@@ -770,13 +769,13 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
 	}								\
 									\
 	bch2_trans_iter_exit((_trans), &(_iter));			\
-	_ret;								\
+	_ret3;								\
 })
 
 #define for_each_btree_key_reverse(_trans, _iter, _btree_id,		\
 				   _start, _flags, _k, _do)		\
 ({									\
-	int _ret = 0;							\
+	int _ret3 = 0;							\
 									\
 	bch2_trans_iter_init((_trans), &(_iter), (_btree_id),		\
 			     (_start), (_flags));			\
@@ -785,14 +784,14 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
 		u32 _restart_count = bch2_trans_begin(_trans);		\
 		(_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\
 		if (!(_k).k) {						\
-			_ret = 0;					\
+			_ret3 = 0;					\
 			break;						\
 		}							\
 									\
-		_ret = bkey_err(_k) ?: (_do);				\
-		if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+		_ret3 = bkey_err(_k) ?: (_do);				\
+		if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
 			continue;					\
-		if (_ret)						\
+		if (_ret3)						\
 			break;						\
 		bch2_trans_verify_not_restarted(_trans, _restart_count);\
 		if (!bch2_btree_iter_rewind(&(_iter)))			\
@@ -800,7 +799,7 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
 	}								\
 									\
 	bch2_trans_iter_exit((_trans), &(_iter));			\
-	_ret;								\
+	_ret3;								\
 })
 
 #define for_each_btree_key_commit(_trans, _iter, _btree_id,		\
@@ -916,21 +915,21 @@ void bch2_btree_path_to_text(struct printbuf *, struct btree_path *);
 void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
 void bch2_dump_trans_updates(struct btree_trans *);
 void bch2_dump_trans_paths_updates(struct btree_trans *);
-void __bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned);
-void bch2_trans_exit(struct btree_trans *);
+
+struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
+void bch2_trans_put(struct btree_trans *);
 
 extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
 unsigned bch2_trans_get_fn_idx(const char *);
 
-#define bch2_trans_init(_trans, _c, _nr_iters, _mem)			\
-do {									\
+#define bch2_trans_get(_c)						\
+({									\
 	static unsigned trans_fn_idx;					\
 									\
 	if (unlikely(!trans_fn_idx))					\
 		trans_fn_idx = bch2_trans_get_fn_idx(__func__);		\
-									\
-	__bch2_trans_init(_trans, _c, trans_fn_idx);			\
-} while (0)
+	__bch2_trans_get(_c, trans_fn_idx);				\
+})
 
 void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *);
 
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index f7c001d4..1407f691 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -243,8 +243,6 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
 	}
 
 	if (ck) {
-		int ret;
-
 		ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
 		if (unlikely(ret)) {
 			bkey_cached_move_to_freelist(bc, ck);
@@ -253,7 +251,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
 
 		path->l[0].b = (void *) ck;
 		path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
-		mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
+		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
 
 		ret = bch2_btree_node_lock_write(trans, path, &ck->c);
 		if (unlikely(ret)) {
@@ -331,7 +329,7 @@ btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
 			return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
 		}
 
-		mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
+		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
 	}
 
 	ck->c.level		= 0;
@@ -479,7 +477,7 @@ retry:
 		if (!ck)
 			goto retry;
 
-		mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
+		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
 		path->locks_want = 1;
 	} else {
 		enum six_lock_type lock_want = __btree_lock_want(path, 0);
@@ -497,7 +495,8 @@ retry:
 			goto retry;
 		}
 
-		mark_btree_node_locked(trans, path, 0, lock_want);
+		mark_btree_node_locked(trans, path, 0,
+				       (enum btree_node_locked_type) lock_want);
 	}
 
 	path->l[0].lock_seq	= six_lock_seq(&ck->c.lock);
@@ -579,7 +578,8 @@ retry:
 			goto retry;
 		}
 
-		mark_btree_node_locked(trans, path, 0, lock_want);
+		mark_btree_node_locked(trans, path, 0,
+				       (enum btree_node_locked_type) lock_want);
 	}
 
 	path->l[0].lock_seq	= six_lock_seq(&ck->c.lock);
@@ -705,13 +705,11 @@ int bch2_btree_key_cache_journal_flush(struct journal *j,
 	struct bkey_cached *ck =
 		container_of(pin, struct bkey_cached, journal);
 	struct bkey_cached_key key;
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
 	int ret = 0;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
-	btree_node_lock_nopath_nofail(&trans, &ck->c, SIX_LOCK_read);
+	btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
 	key = ck->key;
 
 	if (ck->journal.seq != seq ||
@@ -728,13 +726,13 @@ int bch2_btree_key_cache_journal_flush(struct journal *j,
 	}
 	six_unlock_read(&ck->c.lock);
 
-	ret = commit_do(&trans, NULL, NULL, 0,
-		btree_key_cache_flush_pos(&trans, key, seq,
+	ret = commit_do(trans, NULL, NULL, 0,
+		btree_key_cache_flush_pos(trans, key, seq,
 				BTREE_INSERT_JOURNAL_RECLAIM, false));
 unlock:
 	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	return ret;
 }
 
@@ -1065,7 +1063,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
 
 void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
 {
-	prt_printf(out, "nr_freed:\t%zu",	atomic_long_read(&c->nr_freed));
+	prt_printf(out, "nr_freed:\t%lu",	atomic_long_read(&c->nr_freed));
 	prt_newline(out);
 	prt_printf(out, "nr_keys:\t%lu",	atomic_long_read(&c->nr_keys));
 	prt_newline(out);
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index 22e2cd39..6231e9ff 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -91,7 +91,7 @@ static inline void mark_btree_node_unlocked(struct btree_path *path,
 static inline void mark_btree_node_locked(struct btree_trans *trans,
 					  struct btree_path *path,
 					  unsigned level,
-					  enum six_lock_type type)
+					  enum btree_node_locked_type type)
 {
 	mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type);
 #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c
index eafb0388..04c1f461 100644
--- a/libbcachefs/btree_trans_commit.c
+++ b/libbcachefs/btree_trans_commit.c
@@ -163,13 +163,11 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct btree_write *w = container_of(pin, struct btree_write, journal);
 	struct btree *b = container_of(w, struct btree, writes[i]);
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	unsigned long old, new, v;
 	unsigned idx = w - b->writes;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
-	btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
 	v = READ_ONCE(b->flags);
 
 	do {
@@ -188,7 +186,7 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
 	btree_node_write_if_need(c, b, SIX_LOCK_read);
 	six_unlock_read(&b->c.lock);
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	return 0;
 }
 
@@ -214,7 +212,11 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c,
 }
 
 /**
- * btree_insert_key - insert a key one key into a leaf node
+ * bch2_btree_insert_key_leaf() - insert a key into a leaf node
+ * @trans:		btree transaction object
+ * @path:		path pointing to @insert's pos
+ * @insert:		key to insert
+ * @journal_seq:	sequence number of journal reservation
  */
 inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
 				       struct btree_path *path,
@@ -555,7 +557,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
 	struct btree_write_buffered_key *wb;
 	struct btree_trans_commit_hook *h;
 	unsigned u64s = 0;
-	bool marking = false;
 	int ret;
 
 	if (race_fault()) {
@@ -584,9 +585,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
 			*stopped_at = i;
 			return ret;
 		}
-
-		if (btree_node_type_needs_gc(i->bkey_type))
-			marking = true;
 	}
 
 	if (trans->nr_wb_updates &&
@@ -778,7 +776,6 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans
 		bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p);
 }
 
-#ifdef CONFIG_BCACHEFS_DEBUG
 static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags,
 						   struct btree_insert_entry *i,
 						   struct printbuf *err)
@@ -804,7 +801,6 @@ static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, un
 
 	return -EINVAL;
 }
-#endif
 
 /*
  * Get journal reservation, take write locks, and attempt to do btree update(s):
@@ -1029,7 +1025,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
 	if (ret)
 		goto out_reset;
 
-#ifdef CONFIG_BCACHEFS_DEBUG
 	trans_for_each_update(trans, i) {
 		struct printbuf buf = PRINTBUF;
 		enum bkey_invalid_flags invalid_flags = 0;
@@ -1046,7 +1041,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
 		if (ret)
 			return ret;
 	}
-#endif
 
 	if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) {
 		ret = do_bch2_trans_commit_to_journal_replay(trans);
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 71ad3893..67ecb5e4 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -194,34 +194,34 @@ struct btree_node_iter {
 /*
  * Iterate over all possible positions, synthesizing deleted keys for holes:
  */
-static const u16 BTREE_ITER_SLOTS		= 1 << 0;
-static const u16 BTREE_ITER_ALL_LEVELS		= 1 << 1;
+static const __maybe_unused u16 BTREE_ITER_SLOTS		= 1 << 0;
+static const __maybe_unused u16 BTREE_ITER_ALL_LEVELS		= 1 << 1;
 /*
  * Indicates that intent locks should be taken on leaf nodes, because we expect
  * to be doing updates:
  */
-static const u16 BTREE_ITER_INTENT		= 1 << 2;
+static const __maybe_unused u16 BTREE_ITER_INTENT		= 1 << 2;
 /*
  * Causes the btree iterator code to prefetch additional btree nodes from disk:
  */
-static const u16 BTREE_ITER_PREFETCH		= 1 << 3;
+static const __maybe_unused u16 BTREE_ITER_PREFETCH		= 1 << 3;
 /*
  * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
  * @pos or the first key strictly greater than @pos
  */
-static const u16 BTREE_ITER_IS_EXTENTS		= 1 << 4;
-static const u16 BTREE_ITER_NOT_EXTENTS		= 1 << 5;
-static const u16 BTREE_ITER_CACHED		= 1 << 6;
-static const u16 BTREE_ITER_WITH_KEY_CACHE	= 1 << 7;
-static const u16 BTREE_ITER_WITH_UPDATES	= 1 << 8;
-static const u16 BTREE_ITER_WITH_JOURNAL	= 1 << 9;
-static const u16 __BTREE_ITER_ALL_SNAPSHOTS	= 1 << 10;
-static const u16 BTREE_ITER_ALL_SNAPSHOTS	= 1 << 11;
-static const u16 BTREE_ITER_FILTER_SNAPSHOTS	= 1 << 12;
-static const u16 BTREE_ITER_NOPRESERVE		= 1 << 13;
-static const u16 BTREE_ITER_CACHED_NOFILL	= 1 << 14;
-static const u16 BTREE_ITER_KEY_CACHE_FILL	= 1 << 15;
-#define __BTREE_ITER_FLAGS_END			       16
+static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS		= 1 << 4;
+static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS		= 1 << 5;
+static const __maybe_unused u16 BTREE_ITER_CACHED		= 1 << 6;
+static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE	= 1 << 7;
+static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES		= 1 << 8;
+static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL		= 1 << 9;
+static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS	= 1 << 10;
+static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS	= 1 << 11;
+static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS	= 1 << 12;
+static const __maybe_unused u16 BTREE_ITER_NOPRESERVE		= 1 << 13;
+static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL	= 1 << 14;
+static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL	= 1 << 15;
+#define __BTREE_ITER_FLAGS_END					       16
 
 enum btree_path_uptodate {
 	BTREE_ITER_UPTODATE		= 0,
@@ -459,8 +459,8 @@ struct btree_trans {
 	void			*mem;
 
 	u8			sorted[BTREE_ITER_MAX + 8];
-	struct btree_path	*paths;
-	struct btree_insert_entry *updates;
+	struct btree_path	paths[BTREE_ITER_MAX];
+	struct btree_insert_entry updates[BTREE_ITER_MAX];
 	struct btree_write_buffered_key *wb_updates;
 
 	/* update path: */
diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c
index 880ce743..324767c0 100644
--- a/libbcachefs/btree_update.c
+++ b/libbcachefs/btree_update.c
@@ -124,7 +124,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
 	struct bkey_s_c old_k, new_k;
 	snapshot_id_list s;
 	struct bkey_i *update;
-	int ret;
+	int ret = 0;
 
 	if (!bch2_snapshot_has_children(c, old_pos.snapshot))
 		return 0;
@@ -466,11 +466,49 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
 	return 0;
 }
 
+static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
+						    struct btree_iter *iter,
+						    struct btree_path *path)
+{
+	if (!iter->key_cache_path ||
+	    !iter->key_cache_path->should_be_locked ||
+	    !bpos_eq(iter->key_cache_path->pos, iter->pos)) {
+		struct bkey_cached *ck;
+		int ret;
+
+		if (!iter->key_cache_path)
+			iter->key_cache_path =
+				bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+					      BTREE_ITER_INTENT|
+					      BTREE_ITER_CACHED, _THIS_IP_);
+
+		iter->key_cache_path =
+			bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
+						iter->flags & BTREE_ITER_INTENT,
+						_THIS_IP_);
+
+		ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
+					       BTREE_ITER_CACHED);
+		if (unlikely(ret))
+			return ret;
+
+		ck = (void *) iter->key_cache_path->l[0].b;
+
+		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+			trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
+			return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
+		}
+
+		btree_path_set_should_be_locked(iter->key_cache_path);
+	}
+
+	return 0;
+}
+
 int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
 				   struct bkey_i *k, enum btree_update_flags flags)
 {
 	struct btree_path *path = iter->update_path ?: iter->path;
-	struct bkey_cached *ck;
 	int ret;
 
 	if (iter->flags & BTREE_ITER_IS_EXTENTS)
@@ -494,34 +532,9 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
 	    !path->cached &&
 	    !path->level &&
 	    btree_id_cached(trans->c, path->btree_id)) {
-		if (!iter->key_cache_path ||
-		    !iter->key_cache_path->should_be_locked ||
-		    !bpos_eq(iter->key_cache_path->pos, k->k.p)) {
-			if (!iter->key_cache_path)
-				iter->key_cache_path =
-					bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
-						      BTREE_ITER_INTENT|
-						      BTREE_ITER_CACHED, _THIS_IP_);
-
-			iter->key_cache_path =
-				bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
-							iter->flags & BTREE_ITER_INTENT,
-							_THIS_IP_);
-
-			ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
-						       BTREE_ITER_CACHED);
-			if (unlikely(ret))
-				return ret;
-
-			ck = (void *) iter->key_cache_path->l[0].b;
-
-			if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-				trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
-				return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
-			}
-
-			btree_path_set_should_be_locked(iter->key_cache_path);
-		}
+		ret = bch2_trans_update_get_key_cache(trans, iter, path);
+		if (ret)
+			return ret;
 
 		path = iter->key_cache_path;
 	}
@@ -640,6 +653,7 @@ int bch2_btree_insert_nonextent(struct btree_trans *trans,
 	int ret;
 
 	bch2_trans_iter_init(trans, &iter, btree, k->k.p,
+			     BTREE_ITER_CACHED|
 			     BTREE_ITER_NOT_EXTENTS|
 			     BTREE_ITER_INTENT);
 	ret   = bch2_btree_iter_traverse(&iter) ?:
@@ -648,8 +662,8 @@ int bch2_btree_insert_nonextent(struct btree_trans *trans,
 	return ret;
 }
 
-int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id,
-			struct bkey_i *k, enum btree_update_flags flags)
+int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id,
+			    struct bkey_i *k, enum btree_update_flags flags)
 {
 	struct btree_iter iter;
 	int ret;
@@ -667,16 +681,18 @@ int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id,
  * bch2_btree_insert - insert keys into the extent btree
  * @c:			pointer to struct bch_fs
  * @id:			btree to insert into
- * @insert_keys:	list of keys to insert
- * @hook:		insert callback
+ * @k:			key to insert
+ * @disk_res:		must be non-NULL whenever inserting or potentially
+ *			splitting data extents
+ * @flags:		transaction commit flags
+ *
+ * Returns:		0 on success, error code on failure
  */
-int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
-		      struct bkey_i *k,
-		      struct disk_reservation *disk_res,
-		      u64 *journal_seq, int flags)
+int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k,
+		      struct disk_reservation *disk_res, int flags)
 {
-	return bch2_trans_do(c, disk_res, journal_seq, flags,
-			     __bch2_btree_insert(&trans, id, k, 0));
+	return bch2_trans_do(c, disk_res, NULL, flags,
+			     bch2_btree_insert_trans(trans, id, k, 0));
 }
 
 int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter,
@@ -714,6 +730,23 @@ int bch2_btree_delete_at_buffered(struct btree_trans *trans,
 	return bch2_trans_update_buffered(trans, btree, k);
 }
 
+int bch2_btree_delete(struct btree_trans *trans,
+		      enum btree_id btree, struct bpos pos,
+		      unsigned update_flags)
+{
+	struct btree_iter iter;
+	int ret;
+
+	bch2_trans_iter_init(trans, &iter, btree, pos,
+			     BTREE_ITER_CACHED|
+			     BTREE_ITER_INTENT);
+	ret   = bch2_btree_iter_traverse(&iter) ?:
+		bch2_btree_delete_at(trans, &iter, update_flags);
+	bch2_trans_iter_exit(trans, &iter);
+
+	return ret;
+}
+
 int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
 				  struct bpos start, struct bpos end,
 				  unsigned update_flags,
@@ -777,9 +810,7 @@ err:
 	}
 	bch2_trans_iter_exit(trans, &iter);
 
-	if (!ret && trans_was_restarted(trans, restart_count))
-		ret = -BCH_ERR_transaction_restart_nested;
-	return ret;
+	return ret ?: trans_was_restarted(trans, restart_count);
 }
 
 /*
@@ -793,7 +824,7 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
 			    u64 *journal_seq)
 {
 	int ret = bch2_trans_run(c,
-			bch2_btree_delete_range_trans(&trans, id, start, end,
+			bch2_btree_delete_range_trans(trans, id, start, end,
 						      update_flags, journal_seq));
 	if (ret == -BCH_ERR_transaction_restart_nested)
 		ret = 0;
@@ -818,6 +849,7 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
 	return bch2_trans_update_buffered(trans, btree, k);
 }
 
+__printf(2, 0)
 static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args)
 {
 	struct printbuf buf = PRINTBUF;
@@ -854,6 +886,7 @@ err:
 	return ret;
 }
 
+__printf(3, 0)
 static int
 __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
 		  va_list args)
@@ -865,12 +898,13 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
 	} else {
 		ret = bch2_trans_do(c, NULL, NULL,
 			BTREE_INSERT_LAZY_RW|commit_flags,
-			__bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args));
+			__bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args));
 	}
 
 	return ret;
 }
 
+__printf(2, 3)
 int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...)
 {
 	va_list args;
@@ -886,6 +920,7 @@ int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...)
  * Use for logging messages during recovery to enable reserved space and avoid
  * blocking.
  */
+__printf(2, 3)
 int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...)
 {
 	va_list args;
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index 901c42b5..9816d228 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -4,7 +4,6 @@
 
 #include "btree_iter.h"
 #include "journal.h"
-#include "journal.h"
 
 struct bch_fs;
 struct btree;
@@ -58,14 +57,15 @@ int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
 				unsigned, unsigned);
 int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
 int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos);
+int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned);
 
 int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id,
 				struct bkey_i *, enum btree_update_flags);
 
-int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *,
+int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *,
 			enum btree_update_flags);
 int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
-		     struct disk_reservation *, u64 *, int flags);
+		     struct disk_reservation *, int flags);
 
 int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
 				  struct bpos, struct bpos, unsigned, u64 *);
@@ -114,8 +114,8 @@ void bch2_trans_commit_hook(struct btree_trans *,
 			    struct btree_trans_commit_hook *);
 int __bch2_trans_commit(struct btree_trans *, unsigned);
 
-int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
-int bch2_journal_log_msg(struct bch_fs *, const char *, ...);
+__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
+__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...);
 
 /**
  * bch2_trans_commit - insert keys at given iterator positions
@@ -145,29 +145,16 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
 	nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
 					(_journal_seq), (_flags)))
 
-#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do)		\
+#define bch2_trans_run(_c, _do)						\
 ({									\
-	struct btree_trans trans;					\
-	int _ret;							\
-									\
-	bch2_trans_init(&trans, (_c), 0, 0);				\
-	_ret = commit_do(&trans, _disk_res, _journal_seq, _flags, _do);	\
-	bch2_trans_exit(&trans);					\
-									\
+	struct btree_trans *trans = bch2_trans_get(_c);			\
+	int _ret = (_do);						\
+	bch2_trans_put(trans);						\
 	_ret;								\
 })
 
-#define bch2_trans_run(_c, _do)						\
-({									\
-	struct btree_trans trans;					\
-	int _ret;							\
-									\
-	bch2_trans_init(&trans, (_c), 0, 0);				\
-	_ret = (_do);							\
-	bch2_trans_exit(&trans);					\
-									\
-	_ret;								\
-})
+#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do)		\
+	bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do))
 
 #define trans_for_each_update(_trans, _i)				\
 	for ((_i) = (_trans)->updates;					\
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index c741150e..7dbf6b6c 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -143,10 +143,15 @@ static size_t btree_node_u64s_with_format(struct btree *b,
 }
 
 /**
- * btree_node_format_fits - check if we could rewrite node with a new format
+ * bch2_btree_node_format_fits - check if we could rewrite node with a new format
  *
- * This assumes all keys can pack with the new format -- it just checks if
- * the re-packed keys would fit inside the node itself.
+ * @c:		filesystem handle
+ * @b:		btree node to rewrite
+ * @new_f:	bkey format to translate keys to
+ *
+ * Returns: true if all re-packed keys will be able to fit in a new node.
+ *
+ * Assumes all keys will successfully pack with the new format.
  */
 bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
 				 struct bkey_format *new_f)
@@ -244,7 +249,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
 	struct write_point *wp;
 	struct btree *b;
 	BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
-	struct open_buckets ob = { .nr = 0 };
+	struct open_buckets obs = { .nr = 0 };
 	struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
 	enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
 	unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim
@@ -257,7 +262,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
 		struct btree_alloc *a =
 			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];
 
-		ob = a->ob;
+		obs = a->ob;
 		bkey_copy(&tmp.k, &a->k);
 		mutex_unlock(&c->btree_reserve_cache_lock);
 		goto mem_alloc;
@@ -292,7 +297,7 @@ retry:
 	bkey_btree_ptr_v2_init(&tmp.k);
 	bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false);
 
-	bch2_open_bucket_get(c, wp, &ob);
+	bch2_open_bucket_get(c, wp, &obs);
 	bch2_alloc_sectors_done(c, wp);
 mem_alloc:
 	b = bch2_btree_node_mem_alloc(trans, interior_node);
@@ -304,7 +309,7 @@ mem_alloc:
 	BUG_ON(b->ob.nr);
 
 	bkey_copy(&b->key, &tmp.k);
-	b->ob = ob;
+	b->ob = obs;
 
 	return b;
 }
@@ -592,12 +597,11 @@ static void btree_update_nodes_written(struct btree_update *as)
 {
 	struct bch_fs *c = as->c;
 	struct btree *b;
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	u64 journal_seq = 0;
 	unsigned i;
 	int ret;
 
-	bch2_trans_init(&trans, c, 0, 512);
 	/*
 	 * If we're already in an error state, it might be because a btree node
 	 * was never written, and we might be trying to free that same btree
@@ -618,7 +622,7 @@ static void btree_update_nodes_written(struct btree_update *as)
 
 		b = as->old_nodes[i];
 
-		btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
 		seq = b->data ? b->data->keys.seq : 0;
 		six_unlock_read(&b->c.lock);
 
@@ -640,13 +644,13 @@ static void btree_update_nodes_written(struct btree_update *as)
 	 * journal reclaim does btree updates when flushing bkey_cached entries,
 	 * which may require allocations as well.
 	 */
-	ret = commit_do(&trans, &as->disk_res, &journal_seq,
+	ret = commit_do(trans, &as->disk_res, &journal_seq,
 			BCH_WATERMARK_reclaim|
 			BTREE_INSERT_NOFAIL|
 			BTREE_INSERT_NOCHECK_RW|
 			BTREE_INSERT_JOURNAL_RECLAIM,
-			btree_update_nodes_written_trans(&trans, as));
-	bch2_trans_unlock(&trans);
+			btree_update_nodes_written_trans(trans, as));
+	bch2_trans_unlock(trans);
 
 	bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
 			     "%s(): error %s", __func__, bch2_err_str(ret));
@@ -655,7 +659,7 @@ err:
 		struct btree_path *path;
 
 		b = as->b;
-		path = get_unlocked_mut_path(&trans, as->btree_id, b->c.level, b->key.k.p);
+		path = get_unlocked_mut_path(trans, as->btree_id, b->c.level, b->key.k.p);
 		/*
 		 * @b is the node we did the final insert into:
 		 *
@@ -678,13 +682,13 @@ err:
 		 * we may rarely end up with a locked path besides the one we
 		 * have here:
 		 */
-		bch2_trans_unlock(&trans);
-		btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent);
-		mark_btree_node_locked(&trans, path, b->c.level, SIX_LOCK_intent);
+		bch2_trans_unlock(trans);
+		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+		mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED);
 		path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
 		path->l[b->c.level].b = b;
 
-		bch2_btree_node_lock_write_nofail(&trans, path, &b->c);
+		bch2_btree_node_lock_write_nofail(trans, path, &b->c);
 
 		mutex_lock(&c->btree_interior_update_lock);
 
@@ -697,15 +701,15 @@ err:
 		 * btree_interior_update_lock:
 		 */
 		if (as->b == b) {
-			struct bset *i = btree_bset_last(b);
-
 			BUG_ON(!b->c.level);
 			BUG_ON(!btree_node_dirty(b));
 
 			if (!ret) {
-				i->journal_seq = cpu_to_le64(
+				struct bset *last = btree_bset_last(b);
+
+				last->journal_seq = cpu_to_le64(
 							     max(journal_seq,
-								 le64_to_cpu(i->journal_seq)));
+								 le64_to_cpu(last->journal_seq)));
 
 				bch2_btree_add_journal_pin(c, b, journal_seq);
 			} else {
@@ -724,8 +728,8 @@ err:
 		six_unlock_write(&b->c.lock);
 
 		btree_node_write_if_need(c, b, SIX_LOCK_intent);
-		btree_node_unlock(&trans, path, b->c.level);
-		bch2_path_put(&trans, path, true);
+		btree_node_unlock(trans, path, b->c.level);
+		bch2_path_put(trans, path, true);
 	}
 
 	bch2_journal_pin_drop(&c->journal, &as->journal);
@@ -745,7 +749,7 @@ err:
 	for (i = 0; i < as->nr_new_nodes; i++) {
 		b = as->new_nodes[i];
 
-		btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
 		btree_node_write_if_need(c, b, SIX_LOCK_read);
 		six_unlock_read(&b->c.lock);
 	}
@@ -753,8 +757,8 @@ err:
 	for (i = 0; i < as->nr_open_buckets; i++)
 		bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
 
-	bch2_btree_update_free(as, &trans);
-	bch2_trans_exit(&trans);
+	bch2_btree_update_free(as, trans);
+	bch2_trans_put(trans);
 }
 
 static void btree_interior_update_work(struct work_struct *work)
@@ -1216,18 +1220,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
 	bch2_recalc_btree_reserve(c);
 }
 
-/**
- * bch_btree_set_root - update the root in memory and on disk
- *
- * To ensure forward progress, the current task must not be holding any
- * btree node write locks. However, you must hold an intent lock on the
- * old root.
- *
- * Note: This allocates a journal entry but doesn't add any keys to
- * it.  All the btree roots are part of every journal write, so there
- * is nothing new to be done.  This just guarantees that there is a
- * journal write.
- */
 static void bch2_btree_set_root(struct btree_update *as,
 				struct btree_trans *trans,
 				struct btree_path *path,
@@ -1341,12 +1333,12 @@ __bch2_btree_insert_keys_interior(struct btree_update *as,
 		;
 
 	while (!bch2_keylist_empty(keys)) {
-		struct bkey_i *k = bch2_keylist_front(keys);
+		insert = bch2_keylist_front(keys);
 
-		if (bpos_gt(k->k.p, b->key.k.p))
+		if (bpos_gt(insert->k.p, b->key.k.p))
 			break;
 
-		bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, k);
+		bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
 		bch2_keylist_pop_front(keys);
 	}
 }
@@ -1513,12 +1505,12 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
 
 		path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
 		six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
-		mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent);
+		mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
 		bch2_btree_path_level_init(trans, path1, n1);
 
 		path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p);
 		six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
-		mark_btree_node_locked(trans, path2, n2->c.level, SIX_LOCK_intent);
+		mark_btree_node_locked(trans, path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
 		bch2_btree_path_level_init(trans, path2, n2);
 
 		/*
@@ -1539,7 +1531,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
 			path2->locks_want++;
 			BUG_ON(btree_node_locked(path2, n3->c.level));
 			six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
-			mark_btree_node_locked(trans, path2, n3->c.level, SIX_LOCK_intent);
+			mark_btree_node_locked(trans, path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
 			bch2_btree_path_level_init(trans, path2, n3);
 
 			n3->sib_u64s[0] = U16_MAX;
@@ -1563,7 +1555,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
 
 		path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
 		six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
-		mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent);
+		mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
 		bch2_btree_path_level_init(trans, path1, n1);
 
 		if (parent)
@@ -1661,12 +1653,16 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
 }
 
 /**
- * bch_btree_insert_node - insert bkeys into a given btree node
+ * bch2_btree_insert_node - insert bkeys into a given btree node
  *
- * @iter:		btree iterator
+ * @as:			btree_update object
+ * @trans:		btree_trans object
+ * @path:		path that points to current node
+ * @b:			node to insert keys into
  * @keys:		list of keys to insert
- * @hook:		insert callback
- * @persistent:		if not null, @persistent will wait on journal write
+ * @flags:		transaction commit flags
+ *
+ * Returns: 0 on success, typically transaction restart error on failure
  *
  * Inserts as many keys as it can into a given btree node, splitting it if full.
  * If a split occurred, this function will return early. This can only happen
@@ -1890,7 +1886,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
 
 	new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p);
 	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
-	mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent);
+	mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
 	bch2_btree_path_level_init(trans, new_path, n);
 
 	bkey_init(&delete.k);
@@ -1934,9 +1930,6 @@ err_free_update:
 	goto out;
 }
 
-/**
- * bch_btree_node_rewrite - Rewrite/move a btree node
- */
 int bch2_btree_node_rewrite(struct btree_trans *trans,
 			    struct btree_iter *iter,
 			    struct btree *b,
@@ -1967,7 +1960,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
 
 	new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p);
 	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
-	mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent);
+	mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
 	bch2_btree_path_level_init(trans, new_path, n);
 
 	trace_and_count(c, btree_node_rewrite, c, b);
@@ -2055,9 +2048,9 @@ static void async_btree_node_rewrite_work(struct work_struct *work)
 	int ret;
 
 	ret = bch2_trans_do(c, NULL, NULL, 0,
-		      async_btree_node_rewrite_trans(&trans, a));
+		      async_btree_node_rewrite_trans(trans, a));
 	if (ret)
-		bch_err(c, "%s: error %s", __func__, bch2_err_str(ret));
+		bch_err_fn(c, ret);
 	bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
 	kfree(a);
 }
@@ -2096,8 +2089,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
 
 		ret = bch2_fs_read_write_early(c);
 		if (ret) {
-			bch_err(c, "%s: error going read-write: %s",
-				__func__, bch2_err_str(ret));
+			bch_err_msg(c, ret, "going read-write");
 			kfree(a);
 			return;
 		}
@@ -2372,7 +2364,7 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
 
 void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
 {
-	bch2_trans_run(c, __bch2_btree_root_alloc(&trans, id));
+	bch2_trans_run(c, __bch2_btree_root_alloc(trans, id));
 }
 
 void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c
index 6d2d43b6..4e6241db 100644
--- a/libbcachefs/btree_write_buffer.c
+++ b/libbcachefs/btree_write_buffer.c
@@ -296,7 +296,7 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j,
 	mutex_lock(&wb->flush_lock);
 
 	return bch2_trans_run(c,
-			__bch2_btree_write_buffer_flush(&trans, BTREE_INSERT_NOCHECK_RW, true));
+			__bch2_btree_write_buffer_flush(trans, BTREE_INSERT_NOCHECK_RW, true));
 }
 
 static inline u64 btree_write_buffer_ref(int idx)
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index c02c8c91..e7f4506f 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -680,7 +680,7 @@ static int check_bucket_ref(struct btree_trans *trans,
 	struct bch_fs *c = trans->c;
 	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
 	size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
-	u16 bucket_sectors = !ptr->cached
+	u32 bucket_sectors = !ptr->cached
 		? dirty_sectors
 		: cached_sectors;
 	struct printbuf buf = PRINTBUF;
@@ -752,9 +752,9 @@ static int check_bucket_ref(struct btree_trans *trans,
 		goto err;
 	}
 
-	if ((unsigned) (bucket_sectors + sectors) > U32_MAX) {
+	if ((u64) bucket_sectors + sectors > U32_MAX) {
 		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
-			"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n"
+			"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
 			"while marking %s",
 			ptr->dev, bucket_nr, b_gen,
 			bch2_data_types[bucket_data_type ?: ptr_data_type],
@@ -1201,7 +1201,7 @@ not_found:
 		new->k.p		= bkey_start_pos(p.k);
 		new->k.p.offset += *idx - start;
 		bch2_key_resize(&new->k, next_idx - *idx);
-		ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new->k_i,
+		ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i,
 					  BTREE_TRIGGER_NORUN);
 	}
 
@@ -1300,7 +1300,7 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans,
 	static int warned_disk_usage = 0;
 	bool warn = false;
 	unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
-	struct replicas_delta *d = deltas->d, *d2;
+	struct replicas_delta *d, *d2;
 	struct replicas_delta *top = (void *) deltas->d + deltas->used;
 	struct bch_fs_usage *dst;
 	s64 added = 0, should_not_have_added;
@@ -1923,7 +1923,7 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
 
 int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
 {
-	int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca));
+	int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca));
 
 	if (ret)
 		bch_err_fn(c, ret);
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index f192809f..ecbeb728 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -40,15 +40,42 @@ static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, secto
 	for (_b = (_buckets)->b + (_buckets)->first_bucket;	\
 	     _b < (_buckets)->b + (_buckets)->nbuckets; _b++)
 
+/*
+ * Ugly hack alert:
+ *
+ * We need to cram a spinlock in a single byte, because that's what we have left
+ * in struct bucket, and we care about the size of these - during fsck, we need
+ * in memory state for every single bucket on every device.
+ *
+ * We used to do
+ *   while (xchg(&b->lock, 1)) cpu_relax();
+ * but, it turns out not all architectures support xchg on a single byte.
+ *
+ * So now we use bit_spin_lock(), with fun games since we can't burn a whole
+ * ulong for this - we just need to make sure the lock bit always ends up in the
+ * first byte.
+ */
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define BUCKET_LOCK_BITNR	0
+#else
+#define BUCKET_LOCK_BITNR	(BITS_PER_LONG - 1)
+#endif
+
+union ulong_byte_assert {
+	ulong	ulong;
+	u8	byte;
+};
+
 static inline void bucket_unlock(struct bucket *b)
 {
-	smp_store_release(&b->lock, 0);
+	BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
+	bit_spin_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock);
 }
 
 static inline void bucket_lock(struct bucket *b)
 {
-	while (xchg(&b->lock, 1))
-		cpu_relax();
+	bit_spin_lock(BUCKET_LOCK_BITNR, (void *) &b->lock);
 }
 
 static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca)
@@ -180,7 +207,7 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_waterma
 
 	switch (watermark) {
 	case BCH_WATERMARK_NR:
-		unreachable();
+		BUG();
 	case BCH_WATERMARK_stripe:
 		reserved += ca->mi.nbuckets >> 6;
 		fallthrough;
diff --git a/libbcachefs/buckets_waiting_for_journal.c b/libbcachefs/buckets_waiting_for_journal.c
index 81ab685c..ec1b636e 100644
--- a/libbcachefs/buckets_waiting_for_journal.c
+++ b/libbcachefs/buckets_waiting_for_journal.c
@@ -133,7 +133,7 @@ retry_rehash:
 	b->t = n;
 	kvfree(t);
 
-	pr_debug("took %zu rehashes, table at %zu/%zu elements",
+	pr_debug("took %zu rehashes, table at %zu/%lu elements",
 		 nr_rehashes, nr_elements, 1UL << b->t->bits);
 out:
 	mutex_unlock(&b->lock);
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index fb603df0..f69e15dc 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -86,10 +86,9 @@ static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
 		devs[i] = strndup_user((const char __user *)(unsigned long)
 				       user_devs[i],
 				       PATH_MAX);
-		if (!devs[i]) {
-			ret = -ENOMEM;
+		ret = PTR_ERR_OR_ZERO(devs[i]);
+		if (ret)
 			goto err;
-		}
 	}
 
 	c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty());
@@ -117,8 +116,9 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg
 		return -EINVAL;
 
 	path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
-	if (!path)
-		return -ENOMEM;
+	ret = PTR_ERR_OR_ZERO(path);
+	if (ret)
+		return ret;
 
 	err = bch2_fs_open_incremental(path);
 	kfree(path);
@@ -149,9 +149,10 @@ static long bch2_global_ioctl(unsigned cmd, void __user *arg)
 static long bch2_ioctl_query_uuid(struct bch_fs *c,
 			struct bch_ioctl_query_uuid __user *user_arg)
 {
-	return copy_to_user(&user_arg->uuid,
-			    &c->sb.user_uuid,
-			    sizeof(c->sb.user_uuid));
+	if (copy_to_user(&user_arg->uuid, &c->sb.user_uuid,
+			 sizeof(c->sb.user_uuid)))
+		return -EFAULT;
+	return 0;
 }
 
 #if 0
@@ -188,8 +189,9 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
 		return -EINVAL;
 
 	path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
-	if (!path)
-		return -ENOMEM;
+	ret = PTR_ERR_OR_ZERO(path);
+	if (ret)
+		return ret;
 
 	ret = bch2_dev_add(c, path);
 	kfree(path);
@@ -230,8 +232,9 @@ static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg)
 		return -EINVAL;
 
 	path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
-	if (!path)
-		return -ENOMEM;
+	ret = PTR_ERR_OR_ZERO(path);
+	if (ret)
+		return ret;
 
 	ret = bch2_dev_online(c, path);
 	kfree(path);
@@ -338,7 +341,10 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
 	if (len < sizeof(e))
 		return -EINVAL;
 
-	return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e);
+	if (copy_to_user(buf, &e, sizeof(e)))
+		return -EFAULT;
+
+	return sizeof(e);
 }
 
 static const struct file_operations bcachefs_data_ops = {
@@ -417,7 +423,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
 	if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes))
 		return -EFAULT;
 
-	arg = kzalloc(sizeof(*arg) + replica_entries_bytes, GFP_KERNEL);
+	arg = kzalloc(size_add(sizeof(*arg), replica_entries_bytes), GFP_KERNEL);
 	if (!arg)
 		return -ENOMEM;
 
@@ -466,9 +472,11 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
 	percpu_up_read(&c->mark_lock);
 	kfree(src);
 
-	if (!ret)
-		ret = copy_to_user(user_arg, arg,
-			sizeof(*arg) + arg->replica_entries_bytes);
+	if (ret)
+		goto err;
+	if (copy_to_user(user_arg, arg,
+			 sizeof(*arg) + arg->replica_entries_bytes))
+		ret = -EFAULT;
 err:
 	kfree(arg);
 	return ret;
@@ -513,7 +521,10 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
 
 	percpu_ref_put(&ca->ref);
 
-	return copy_to_user(user_arg, &arg, sizeof(arg));
+	if (copy_to_user(user_arg, &arg, sizeof(arg)))
+		return -EFAULT;
+
+	return 0;
 }
 
 static long bch2_ioctl_read_super(struct bch_fs *c,
@@ -550,8 +561,9 @@ static long bch2_ioctl_read_super(struct bch_fs *c,
 		goto err;
 	}
 
-	ret = copy_to_user((void __user *)(unsigned long)arg.sb,
-			   sb, vstruct_bytes(sb));
+	if (copy_to_user((void __user *)(unsigned long)arg.sb, sb,
+			 vstruct_bytes(sb)))
+		ret = -EFAULT;
 err:
 	if (!IS_ERR_OR_NULL(ca))
 		percpu_ref_put(&ca->ref);
@@ -617,6 +629,9 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
 	    arg.pad)
 		return -EINVAL;
 
+	if (arg.nbuckets > U32_MAX)
+		return -EINVAL;
+
 	ca = bch2_device_lookup(c, arg.dev, arg.flags);
 	if (IS_ERR(ca))
 		return PTR_ERR(ca);
diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c
index 4c87c596..1948119e 100644
--- a/libbcachefs/checksum.c
+++ b/libbcachefs/checksum.c
@@ -139,7 +139,7 @@ static inline int do_encrypt(struct crypto_sync_skcipher *tfm,
 
 		for (i = 0; i < pages; i++) {
 			unsigned offset = offset_in_page(buf);
-			unsigned pg_len = min(len, PAGE_SIZE - offset);
+			unsigned pg_len = min_t(size_t, len, PAGE_SIZE - offset);
 
 			sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset);
 			buf += pg_len;
@@ -159,15 +159,16 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
 		crypto_alloc_sync_skcipher("chacha20", 0, 0);
 	int ret;
 
-	if (!chacha20) {
-		pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20));
-		return PTR_ERR(chacha20);
+	ret = PTR_ERR_OR_ZERO(chacha20);
+	if (ret) {
+		pr_err("error requesting chacha20 cipher: %s", bch2_err_str(ret));
+		return ret;
 	}
 
 	ret = crypto_skcipher_setkey(&chacha20->base,
 				     (void *) key, sizeof(*key));
 	if (ret) {
-		pr_err("crypto_skcipher_setkey() error: %i", ret);
+		pr_err("error from crypto_skcipher_setkey(): %s", bch2_err_str(ret));
 		goto err;
 	}
 
@@ -366,11 +367,11 @@ struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
 	BUG_ON(!bch2_checksum_mergeable(type));
 
 	while (b_len) {
-		unsigned b = min_t(unsigned, b_len, PAGE_SIZE);
+		unsigned page_len = min_t(unsigned, b_len, PAGE_SIZE);
 
 		bch2_checksum_update(&state,
-				page_address(ZERO_PAGE(0)), b);
-		b_len -= b;
+				page_address(ZERO_PAGE(0)), page_len);
+		b_len -= page_len;
 	}
 	a.lo = (__le64 __force) bch2_checksum_final(&state);
 	a.lo ^= b.lo;
@@ -395,9 +396,9 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
 		unsigned			csum_type;
 		struct bch_csum			csum;
 	} splits[3] = {
-		{ crc_a, len_a, new_csum_type },
-		{ crc_b, len_b, new_csum_type },
-		{ NULL,	 bio_sectors(bio) - len_a - len_b, new_csum_type },
+		{ crc_a, len_a, new_csum_type, { 0 } },
+		{ crc_b, len_b, new_csum_type, { 0 } },
+		{ NULL,	 bio_sectors(bio) - len_a - len_b, new_csum_type, { 0 } },
 	}, *i;
 	bool mergeable = crc_old.csum_type == new_csum_type &&
 		bch2_checksum_mergeable(new_csum_type);
@@ -558,6 +559,7 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
 	return ret;
 }
 
+#ifndef __KERNEL__
 int bch2_revoke_key(struct bch_sb *sb)
 {
 	key_serial_t key_id;
@@ -575,6 +577,7 @@ int bch2_revoke_key(struct bch_sb *sb)
 
 	return 0;
 }
+#endif
 
 int bch2_decrypt_sb_key(struct bch_fs *c,
 			struct bch_sb_field_crypt *crypt,
@@ -596,7 +599,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c,
 
 	/* decrypt real key: */
 	ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
-			     &sb_key, sizeof(sb_key));
+				      &sb_key, sizeof(sb_key));
 	if (ret)
 		goto err;
 
diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h
index 9a4898db..13998388 100644
--- a/libbcachefs/checksum.h
+++ b/libbcachefs/checksum.h
@@ -40,15 +40,16 @@ struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
  */
 #define csum_vstruct(_c, _type, _nonce, _i)				\
 ({									\
-	const void *start = ((const void *) (_i)) + sizeof((_i)->csum);	\
-	const void *end = vstruct_end(_i);				\
+	const void *_start = ((const void *) (_i)) + sizeof((_i)->csum);\
 									\
-	bch2_checksum(_c, _type, _nonce, start, end - start);		\
+	bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\
 })
 
 int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
 int bch2_request_key(struct bch_sb *, struct bch_key *);
+#ifndef __KERNEL__
 int bch2_revoke_key(struct bch_sb *);
+#endif
 
 int bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
 		 void *data, size_t);
diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c
index 6b17f7cc..1480b645 100644
--- a/libbcachefs/compress.c
+++ b/libbcachefs/compress.c
@@ -3,7 +3,6 @@
 #include "checksum.h"
 #include "compress.h"
 #include "extents.h"
-#include "io.h"
 #include "super-io.h"
 
 #include <linux/lz4.h>
@@ -571,7 +570,6 @@ void bch2_fs_compress_exit(struct bch_fs *c)
 static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
 {
 	size_t decompress_workspace_size = 0;
-	bool decompress_workspace_needed;
 	ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
 						 c->opts.encoded_extent_max);
 	struct {
@@ -581,7 +579,8 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
 		size_t				decompress_workspace;
 	} compression_types[] = {
 		{ BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4,
-			max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) },
+			max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS),
+			0 },
 		{ BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip,
 			zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
 			zlib_inflate_workspacesize(), },
@@ -620,9 +619,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
 		if (!(features & (1 << i->feature)))
 			continue;
 
-		if (i->decompress_workspace)
-			decompress_workspace_needed = true;
-
 		if (mempool_initialized(&c->compress_workspace[i->type]))
 			continue;
 
diff --git a/libbcachefs/counters.c b/libbcachefs/counters.c
index 442a9b80..26eb3d82 100644
--- a/libbcachefs/counters.c
+++ b/libbcachefs/counters.c
@@ -43,7 +43,7 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
 		prt_tab(out);
 		prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i]));
 		prt_newline(out);
-	};
+	}
 };
 
 int bch2_sb_counters_to_cpu(struct bch_fs *c)
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index 81518f20..899ff46d 100644
--- a/libbcachefs/data_update.c
+++ b/libbcachefs/data_update.c
@@ -9,7 +9,7 @@
 #include "ec.h"
 #include "error.h"
 #include "extents.h"
-#include "io.h"
+#include "io_write.h"
 #include "keylist.h"
 #include "move.h"
 #include "nocow_locking.h"
@@ -49,10 +49,6 @@ static void trace_move_extent_fail2(struct data_update *m,
 	if (insert) {
 		i = 0;
 		bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
-			struct bkey_s new_s;
-			new_s.k = (void *) new.k;
-			new_s.v = (void *) new.v;
-
 			if (((1U << i) & m->data_opts.rewrite_ptrs) &&
 			    (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
 			    !ptr->cached)
@@ -307,7 +303,7 @@ out:
 
 int bch2_data_update_index_update(struct bch_write_op *op)
 {
-	return bch2_trans_run(op->c, __bch2_data_update_index_update(&trans, op));
+	return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op));
 }
 
 void bch2_data_update_read_done(struct data_update *m,
diff --git a/libbcachefs/data_update.h b/libbcachefs/data_update.h
index 49e9055c..7ca1f98d 100644
--- a/libbcachefs/data_update.h
+++ b/libbcachefs/data_update.h
@@ -4,7 +4,7 @@
 #define _BCACHEFS_DATA_UPDATE_H
 
 #include "bkey_buf.h"
-#include "io_types.h"
+#include "io_write_types.h"
 
 struct moving_context;
 
diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c
index ae47e185..75a3dc7c 100644
--- a/libbcachefs/debug.c
+++ b/libbcachefs/debug.c
@@ -19,7 +19,6 @@
 #include "extents.h"
 #include "fsck.h"
 #include "inode.h"
-#include "io.h"
 #include "super.h"
 
 #include <linux/console.h>
@@ -154,10 +153,8 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
 	BUG_ON(b->nsets != 1);
 
 	for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k))
-		if (k->type == KEY_TYPE_btree_ptr_v2) {
-			struct bch_btree_ptr_v2 *v = (void *) bkeyp_val(&b->format, k);
-			v->mem_ptr = 0;
-		}
+		if (k->type == KEY_TYPE_btree_ptr_v2)
+			((struct bch_btree_ptr_v2 *) bkeyp_val(&b->format, k))->mem_ptr = 0;
 
 	v = c->verify_data;
 	bkey_copy(&v->key, &b->key);
@@ -322,16 +319,16 @@ static ssize_t flush_buf(struct dump_iter *i)
 {
 	if (i->buf.pos) {
 		size_t bytes = min_t(size_t, i->buf.pos, i->size);
-		int err = copy_to_user(i->ubuf, i->buf.buf, bytes);
+		int copied = bytes - copy_to_user(i->ubuf, i->buf.buf, bytes);
 
-		if (err)
-			return err;
+		i->ret	 += copied;
+		i->ubuf	 += copied;
+		i->size	 -= copied;
+		i->buf.pos -= copied;
+		memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos);
 
-		i->ret	 += bytes;
-		i->ubuf	 += bytes;
-		i->size	 -= bytes;
-		i->buf.pos -= bytes;
-		memmove(i->buf.buf, i->buf.buf + bytes, i->buf.pos);
+		if (copied != bytes)
+			return -EFAULT;
 	}
 
 	return i->size ? 0 : i->ret;
@@ -369,7 +366,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
 			       size_t size, loff_t *ppos)
 {
 	struct dump_iter *i = file->private_data;
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	ssize_t ret;
@@ -382,17 +379,17 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
 	if (ret)
 		return ret;
 
-	bch2_trans_init(&trans, i->c, 0, 0);
-	ret = for_each_btree_key2(&trans, iter, i->id, i->from,
+	trans = bch2_trans_get(i->c);
+	ret = for_each_btree_key2(trans, iter, i->id, i->from,
 				  BTREE_ITER_PREFETCH|
 				  BTREE_ITER_ALL_SNAPSHOTS, k, ({
 		bch2_bkey_val_to_text(&i->buf, i->c, k);
 		prt_newline(&i->buf);
-		drop_locks_do(&trans, flush_buf(i));
+		drop_locks_do(trans, flush_buf(i));
 	}));
 	i->from = iter.pos;
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 
 	if (!ret)
 		ret = flush_buf(i);
@@ -411,7 +408,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
 				       size_t size, loff_t *ppos)
 {
 	struct dump_iter *i = file->private_data;
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	struct btree_iter iter;
 	struct btree *b;
 	ssize_t ret;
@@ -427,26 +424,26 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
 	if (bpos_eq(SPOS_MAX, i->from))
 		return i->ret;
 
-	bch2_trans_init(&trans, i->c, 0, 0);
+	trans = bch2_trans_get(i->c);
 retry:
-	bch2_trans_begin(&trans);
+	bch2_trans_begin(trans);
 
-	for_each_btree_node(&trans, iter, i->id, i->from, 0, b, ret) {
+	for_each_btree_node(trans, iter, i->id, i->from, 0, b, ret) {
 		bch2_btree_node_to_text(&i->buf, i->c, b);
 		i->from = !bpos_eq(SPOS_MAX, b->key.k.p)
 			? bpos_successor(b->key.k.p)
 			: b->key.k.p;
 
-		ret = drop_locks_do(&trans, flush_buf(i));
+		ret = drop_locks_do(trans, flush_buf(i));
 		if (ret)
 			break;
 	}
-	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_iter_exit(trans, &iter);
 
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 
 	if (!ret)
 		ret = flush_buf(i);
@@ -465,7 +462,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
 				       size_t size, loff_t *ppos)
 {
 	struct dump_iter *i = file->private_data;
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	ssize_t ret;
@@ -478,9 +475,9 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
 	if (ret)
 		return ret;
 
-	bch2_trans_init(&trans, i->c, 0, 0);
+	trans = bch2_trans_get(i->c);
 
-	ret = for_each_btree_key2(&trans, iter, i->id, i->from,
+	ret = for_each_btree_key2(trans, iter, i->id, i->from,
 				  BTREE_ITER_PREFETCH|
 				  BTREE_ITER_ALL_SNAPSHOTS, k, ({
 		struct btree_path_level *l = &iter.path->l[0];
@@ -493,11 +490,11 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
 		}
 
 		bch2_bfloat_to_text(&i->buf, l->b, _k);
-		drop_locks_do(&trans, flush_buf(i));
+		drop_locks_do(trans, flush_buf(i));
 	}));
 	i->from = iter.pos;
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 
 	if (!ret)
 		ret = flush_buf(i);
diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c
index a7559ab0..6c6c8d57 100644
--- a/libbcachefs/dirent.c
+++ b/libbcachefs/dirent.c
@@ -479,21 +479,19 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
 		       const struct bch_hash_info *hash_info,
 		       const struct qstr *name, subvol_inum *inum)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	int ret;
-
-	bch2_trans_init(&trans, c, 0, 0);
 retry:
-	bch2_trans_begin(&trans);
+	bch2_trans_begin(trans);
 
-	ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info,
+	ret = __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info,
 					  name, inum, 0);
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
 	if (!ret)
-		bch2_trans_iter_exit(&trans, &iter);
-	bch2_trans_exit(&trans);
+		bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
 	return ret;
 }
 
@@ -522,7 +520,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
 
 int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	struct bkey_s_c_dirent dirent;
@@ -533,15 +531,14 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
 	int ret;
 
 	bch2_bkey_buf_init(&sk);
-	bch2_trans_init(&trans, c, 0, 0);
 retry:
-	bch2_trans_begin(&trans);
+	bch2_trans_begin(trans);
 
-	ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
 	if (ret)
 		goto err;
 
-	for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_dirents,
+	for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
 			   SPOS(inum.inum, ctx->pos, snapshot),
 			   POS(inum.inum, U64_MAX), 0, k, ret) {
 		if (k.k->type != KEY_TYPE_dirent)
@@ -549,7 +546,7 @@ retry:
 
 		dirent = bkey_s_c_to_dirent(k);
 
-		ret = bch2_dirent_read_target(&trans, inum, dirent, &target);
+		ret = bch2_dirent_read_target(trans, inum, dirent, &target);
 		if (ret < 0)
 			break;
 		if (ret)
@@ -558,7 +555,7 @@ retry:
 		/* dir_emit() can fault and block: */
 		bch2_bkey_buf_reassemble(&sk, c, k);
 		dirent = bkey_i_to_s_c_dirent(sk.k);
-		bch2_trans_unlock(&trans);
+		bch2_trans_unlock(trans);
 
 		name = bch2_dirent_get_name(dirent);
 
@@ -574,16 +571,16 @@ retry:
 		 * read_target looks up subvolumes, we can overflow paths if the
 		 * directory has many subvolumes in it
 		 */
-		ret = btree_trans_too_many_iters(&trans);
+		ret = btree_trans_too_many_iters(trans);
 		if (ret)
 			break;
 	}
-	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_iter_exit(trans, &iter);
 err:
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	bch2_bkey_buf_exit(&sk, c);
 
 	return ret;
diff --git a/libbcachefs/disk_groups.c b/libbcachefs/disk_groups.c
index f36472c4..b292dbef 100644
--- a/libbcachefs/disk_groups.c
+++ b/libbcachefs/disk_groups.c
@@ -32,21 +32,21 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb,
 
 	for (i = 0; i < sb->nr_devices; i++) {
 		struct bch_member *m = mi->members + i;
-		unsigned g;
+		unsigned group_id;
 
 		if (!BCH_MEMBER_GROUP(m))
 			continue;
 
-		g = BCH_MEMBER_GROUP(m) - 1;
+		group_id = BCH_MEMBER_GROUP(m) - 1;
 
-		if (g >= nr_groups) {
+		if (group_id >= nr_groups) {
 			prt_printf(err, "disk %u has invalid label %u (have %u)",
-			       i, g, nr_groups);
+				   i, group_id, nr_groups);
 			return -BCH_ERR_invalid_sb_disk_groups;
 		}
 
-		if (BCH_GROUP_DELETED(&groups->entries[g])) {
-			prt_printf(err, "disk %u has deleted label %u", i, g);
+		if (BCH_GROUP_DELETED(&groups->entries[group_id])) {
+			prt_printf(err, "disk %u has deleted label %u", i, group_id);
 			return -BCH_ERR_invalid_sb_disk_groups;
 		}
 	}
@@ -183,8 +183,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
 
 	for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
 		struct bch_member *m = mi->members + i;
-		struct bch_disk_group_cpu *dst =
-			&cpu_g->entries[BCH_MEMBER_GROUP(m)];
+		struct bch_disk_group_cpu *dst;
 
 		if (!bch2_member_exists(m))
 			continue;
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index f58e84a2..8646856e 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -11,10 +11,11 @@
 #include "btree_update.h"
 #include "btree_write_buffer.h"
 #include "buckets.h"
+#include "checksum.h"
 #include "disk_groups.h"
 #include "ec.h"
 #include "error.h"
-#include "io.h"
+#include "io_read.h"
 #include "keylist.h"
 #include "recovery.h"
 #include "replicas.h"
@@ -475,7 +476,7 @@ err:
 
 static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
 {
-	return bch2_trans_run(c, get_stripe_key_trans(&trans, idx, stripe));
+	return bch2_trans_run(c, get_stripe_key_trans(trans, idx, stripe));
 }
 
 /* recovery read path: */
@@ -787,12 +788,10 @@ static void ec_stripe_delete_work(struct work_struct *work)
 {
 	struct bch_fs *c =
 		container_of(work, struct bch_fs, ec_stripe_delete_work);
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	int ret;
 	u64 idx;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
 	while (1) {
 		mutex_lock(&c->ec_stripes_heap_lock);
 		idx = stripe_idx_to_delete(c);
@@ -801,15 +800,15 @@ static void ec_stripe_delete_work(struct work_struct *work)
 		if (!idx)
 			break;
 
-		ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL,
-				ec_stripe_delete(&trans, idx));
+		ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+				ec_stripe_delete(trans, idx));
 		if (ret) {
 			bch_err_fn(c, ret);
 			break;
 		}
 	}
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 
 	bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
 }
@@ -998,24 +997,22 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
 
 static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
 	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
 	int ret = 0;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
-	ret = bch2_btree_write_buffer_flush(&trans);
+	ret = bch2_btree_write_buffer_flush(trans);
 	if (ret)
 		goto err;
 
 	for (i = 0; i < nr_data; i++) {
-		ret = ec_stripe_update_bucket(&trans, s, i);
+		ret = ec_stripe_update_bucket(trans, s, i);
 		if (ret)
 			break;
 	}
 err:
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 
 	return ret;
 }
@@ -1123,7 +1120,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
 	ret = bch2_trans_do(c, &s->res, NULL,
 			    BTREE_INSERT_NOCHECK_RW|
 			    BTREE_INSERT_NOFAIL,
-			    ec_stripe_key_update(&trans,
+			    ec_stripe_key_update(trans,
 					bkey_i_to_stripe(&s->new_stripe.key),
 					!s->have_existing_stripe));
 	if (ret) {
@@ -1133,8 +1130,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
 
 	ret = ec_stripe_update_extents(c, &s->new_stripe);
 	if (ret) {
-		bch_err(c, "error creating stripe: error updating pointers: %s",
-			bch2_err_str(ret));
+		bch_err_msg(c, ret, "creating stripe: error updating pointers");
 		goto err;
 	}
 err:
@@ -1822,7 +1818,7 @@ void bch2_fs_ec_flush(struct bch_fs *c)
 
 int bch2_stripes_read(struct bch_fs *c)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	const struct bch_stripe *s;
@@ -1830,9 +1826,7 @@ int bch2_stripes_read(struct bch_fs *c)
 	unsigned i;
 	int ret;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
-	for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
+	for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
 			   BTREE_ITER_PREFETCH, k, ret) {
 		if (k.k->type != KEY_TYPE_stripe)
 			continue;
@@ -1855,9 +1849,9 @@ int bch2_stripes_read(struct bch_fs *c)
 
 		bch2_stripes_heap_insert(c, m, k.k->p.offset);
 	}
-	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_iter_exit(trans, &iter);
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 
 	if (ret)
 		bch_err_fn(c, ret);
diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h
index 885ae5d5..966d165a 100644
--- a/libbcachefs/ec.h
+++ b/libbcachefs/ec.h
@@ -240,7 +240,7 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
 			bch2_ec_do_stripe_creates(c);
 			break;
 		default:
-			unreachable();
+			BUG();
 		}
 }
 
diff --git a/libbcachefs/errcode.c b/libbcachefs/errcode.c
index dc906fc9..d260ff9b 100644
--- a/libbcachefs/errcode.c
+++ b/libbcachefs/errcode.c
@@ -12,8 +12,6 @@ static const char * const bch2_errcode_strs[] = {
 	NULL
 };
 
-#define BCH_ERR_0	0
-
 static unsigned bch2_errcode_parents[] = {
 #define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class,
 	BCH_ERRCODES()
@@ -61,3 +59,10 @@ int __bch2_err_class(int err)
 
 	return -err;
 }
+
+const char *bch2_blk_status_to_str(blk_status_t status)
+{
+	if (status == BLK_STS_REMOVED)
+		return "device removed";
+	return blk_status_to_str(status);
+}
diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h
index f7fa8744..64f7176c 100644
--- a/libbcachefs/errcode.h
+++ b/libbcachefs/errcode.h
@@ -99,6 +99,7 @@
 	x(ENOENT,			ENOENT_str_hash_set_must_replace)	\
 	x(ENOENT,			ENOENT_inode)				\
 	x(ENOENT,			ENOENT_not_subvol)			\
+	x(ENOENT,			ENOENT_not_directory)			\
 	x(ENOENT,			ENOENT_directory_dead)			\
 	x(ENOENT,			ENOENT_subvolume)			\
 	x(ENOENT,			ENOENT_snapshot_tree)			\
@@ -218,7 +219,14 @@
 	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_want_retry)		\
 	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_must_retry)		\
 	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_bad_node)		\
-	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_incompatible)
+	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_incompatible)	\
+	x(0,				nopromote)				\
+	x(BCH_ERR_nopromote,		nopromote_may_not)			\
+	x(BCH_ERR_nopromote,		nopromote_already_promoted)		\
+	x(BCH_ERR_nopromote,		nopromote_unwritten)			\
+	x(BCH_ERR_nopromote,		nopromote_congested)			\
+	x(BCH_ERR_nopromote,		nopromote_in_flight)			\
+	x(BCH_ERR_nopromote,		nopromote_enomem)
 
 enum bch_errcode {
 	BCH_ERR_START		= 2048,
@@ -249,4 +257,8 @@ static inline long bch2_err_class(long err)
 	return err < 0 ? __bch2_err_class(err) : err;
 }
 
+#define BLK_STS_REMOVED		((__force blk_status_t)128)
+
+const char *bch2_blk_status_to_str(blk_status_t);
+
 #endif /* _BCACHFES_ERRCODE_H */
diff --git a/libbcachefs/error.c b/libbcachefs/error.c
index 39009cf0..2a5af887 100644
--- a/libbcachefs/error.c
+++ b/libbcachefs/error.c
@@ -1,7 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
 #include "error.h"
-#include "io.h"
 #include "super.h"
 
 #define FSCK_ERR_RATELIMIT_NR	10
diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c
index cbfb5b21..58ccc7b9 100644
--- a/libbcachefs/fs-io-buffered.c
+++ b/libbcachefs/fs-io-buffered.c
@@ -8,7 +8,8 @@
 #include "fs-io-buffered.h"
 #include "fs-io-direct.h"
 #include "fs-io-pagecache.h"
-#include "io.h"
+#include "io_read.h"
+#include "io_write.h"
 
 #include <linux/backing-dev.h>
 #include <linux/pagemap.h>
@@ -269,7 +270,7 @@ void bch2_readahead(struct readahead_control *ractl)
 	struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
 	struct bch_io_opts opts;
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct folio *folio;
 	struct readpages_iter readpages_iter;
 	int ret;
@@ -279,8 +280,6 @@ void bch2_readahead(struct readahead_control *ractl)
 	ret = readpages_iter_init(&readpages_iter, ractl);
 	BUG_ON(ret);
 
-	bch2_trans_init(&trans, c, 0, 0);
-
 	bch2_pagecache_add_get(inode);
 
 	while ((folio = readpage_iter_peek(&readpages_iter))) {
@@ -299,31 +298,27 @@ void bch2_readahead(struct readahead_control *ractl)
 		rbio->bio.bi_end_io = bch2_readpages_end_io;
 		BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
 
-		bchfs_read(&trans, rbio, inode_inum(inode),
+		bchfs_read(trans, rbio, inode_inum(inode),
 			   &readpages_iter);
-		bch2_trans_unlock(&trans);
+		bch2_trans_unlock(trans);
 	}
 
 	bch2_pagecache_add_put(inode);
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	darray_exit(&readpages_iter.folios);
 }
 
 static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio,
 			     subvol_inum inum, struct folio *folio)
 {
-	struct btree_trans trans;
-
 	bch2_folio_create(folio, __GFP_NOFAIL);
 
 	rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
 	rbio->bio.bi_iter.bi_sector = folio_sector(folio);
 	BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
 
-	bch2_trans_init(&trans, c, 0, 0);
-	bchfs_read(&trans, rbio, inum, NULL);
-	bch2_trans_exit(&trans);
+	bch2_trans_run(c, (bchfs_read(trans, rbio, inum, NULL), 0));
 }
 
 static void bch2_read_single_folio_end_io(struct bio *bio)
@@ -694,12 +689,12 @@ int bch2_write_begin(struct file *file, struct address_space *mapping,
 	if (IS_ERR_OR_NULL(folio))
 		goto err_unlock;
 
-	if (folio_test_uptodate(folio))
-		goto out;
-
 	offset = pos - folio_pos(folio);
 	len = min_t(size_t, len, folio_end_pos(folio) - pos);
 
+	if (folio_test_uptodate(folio))
+		goto out;
+
 	/* If we're writing entire folio, don't need to read it in first: */
 	if (!offset && len == folio_size(folio))
 		goto out;
@@ -800,10 +795,10 @@ int bch2_write_end(struct file *file, struct address_space *mapping,
 	return copied;
 }
 
-static noinline void folios_trunc(folios *folios, struct folio **fi)
+static noinline void folios_trunc(folios *fs, struct folio **fi)
 {
-	while (folios->data + folios->nr > fi) {
-		struct folio *f = darray_pop(folios);
+	while (fs->data + fs->nr > fi) {
+		struct folio *f = darray_pop(fs);
 
 		folio_unlock(f);
 		folio_put(f);
@@ -817,35 +812,35 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
 {
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
 	struct bch2_folio_reservation res;
-	folios folios;
+	folios fs;
 	struct folio **fi, *f;
-	unsigned copied = 0, f_offset;
-	u64 end = pos + len, f_pos;
+	unsigned copied = 0, f_offset, f_copied;
+	u64 end = pos + len, f_pos, f_len;
 	loff_t last_folio_pos = inode->v.i_size;
 	int ret = 0;
 
 	BUG_ON(!len);
 
 	bch2_folio_reservation_init(c, inode, &res);
-	darray_init(&folios);
+	darray_init(&fs);
 
 	ret = bch2_filemap_get_contig_folios_d(mapping, pos, end,
 				   FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT,
 				   mapping_gfp_mask(mapping),
-				   &folios);
+				   &fs);
 	if (ret)
 		goto out;
 
-	BUG_ON(!folios.nr);
+	BUG_ON(!fs.nr);
 
-	f = darray_first(folios);
+	f = darray_first(fs);
 	if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
 		ret = bch2_read_single_folio(f, mapping);
 		if (ret)
 			goto out;
 	}
 
-	f = darray_last(folios);
+	f = darray_last(fs);
 	end = min(end, folio_end_pos(f));
 	last_folio_pos = folio_pos(f);
 	if (end != folio_end_pos(f) && !folio_test_uptodate(f)) {
@@ -858,15 +853,15 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
 		}
 	}
 
-	ret = bch2_folio_set(c, inode_inum(inode), folios.data, folios.nr);
+	ret = bch2_folio_set(c, inode_inum(inode), fs.data, fs.nr);
 	if (ret)
 		goto out;
 
 	f_pos = pos;
-	f_offset = pos - folio_pos(darray_first(folios));
-	darray_for_each(folios, fi) {
-		struct folio *f = *fi;
-		u64 f_len = min(end, folio_end_pos(f)) - f_pos;
+	f_offset = pos - folio_pos(darray_first(fs));
+	darray_for_each(fs, fi) {
+		f = *fi;
+		f_len = min(end, folio_end_pos(f)) - f_pos;
 
 		/*
 		 * XXX: per POSIX and fstests generic/275, on -ENOSPC we're
@@ -878,11 +873,11 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
 		 */
 		ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len);
 		if (unlikely(ret)) {
-			folios_trunc(&folios, fi);
-			if (!folios.nr)
+			folios_trunc(&fs, fi);
+			if (!fs.nr)
 				goto out;
 
-			end = min(end, folio_end_pos(darray_last(folios)));
+			end = min(end, folio_end_pos(darray_last(fs)));
 			break;
 		}
 
@@ -891,18 +886,17 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
 	}
 
 	if (mapping_writably_mapped(mapping))
-		darray_for_each(folios, fi)
+		darray_for_each(fs, fi)
 			flush_dcache_folio(*fi);
 
 	f_pos = pos;
-	f_offset = pos - folio_pos(darray_first(folios));
-	darray_for_each(folios, fi) {
-		struct folio *f = *fi;
-		u64 f_len = min(end, folio_end_pos(f)) - f_pos;
-		unsigned f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter);
-
+	f_offset = pos - folio_pos(darray_first(fs));
+	darray_for_each(fs, fi) {
+		f = *fi;
+		f_len = min(end, folio_end_pos(f)) - f_pos;
+		f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter);
 		if (!f_copied) {
-			folios_trunc(&folios, fi);
+			folios_trunc(&fs, fi);
 			break;
 		}
 
@@ -911,7 +905,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
 		    pos + copied + f_copied < inode->v.i_size) {
 			iov_iter_revert(iter, f_copied);
 			folio_zero_range(f, 0, folio_size(f));
-			folios_trunc(&folios, fi);
+			folios_trunc(&fs, fi);
 			break;
 		}
 
@@ -919,7 +913,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
 		copied += f_copied;
 
 		if (f_copied != f_len) {
-			folios_trunc(&folios, fi + 1);
+			folios_trunc(&fs, fi + 1);
 			break;
 		}
 
@@ -938,10 +932,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
 	spin_unlock(&inode->v.i_lock);
 
 	f_pos = pos;
-	f_offset = pos - folio_pos(darray_first(folios));
-	darray_for_each(folios, fi) {
-		struct folio *f = *fi;
-		u64 f_len = min(end, folio_end_pos(f)) - f_pos;
+	f_offset = pos - folio_pos(darray_first(fs));
+	darray_for_each(fs, fi) {
+		f = *fi;
+		f_len = min(end, folio_end_pos(f)) - f_pos;
 
 		if (!folio_test_uptodate(f))
 			folio_mark_uptodate(f);
@@ -954,7 +948,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
 
 	inode->ei_last_dirtied = (unsigned long) current;
 out:
-	darray_for_each(folios, fi) {
+	darray_for_each(fs, fi) {
 		folio_unlock(*fi);
 		folio_put(*fi);
 	}
@@ -967,7 +961,7 @@ out:
 	if (last_folio_pos >= inode->v.i_size)
 		truncate_pagecache(&inode->v, inode->v.i_size);
 
-	darray_exit(&folios);
+	darray_exit(&fs);
 	bch2_folio_reservation_put(c, inode, &res);
 
 	return copied ?: ret;
@@ -1055,8 +1049,6 @@ ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		goto out;
 	}
 
-	/* We can write back this queue in page reclaim */
-	current->backing_dev_info = inode_to_bdi(&inode->v);
 	inode_lock(&inode->v);
 
 	ret = generic_write_checks(iocb, from);
@@ -1076,7 +1068,6 @@ ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		iocb->ki_pos += ret;
 unlock:
 	inode_unlock(&inode->v);
-	current->backing_dev_info = NULL;
 
 	if (ret > 0)
 		ret = generic_write_sync(iocb, ret);
diff --git a/libbcachefs/fs-io-direct.c b/libbcachefs/fs-io-direct.c
index 2b29abd2..6a9557e7 100644
--- a/libbcachefs/fs-io-direct.c
+++ b/libbcachefs/fs-io-direct.c
@@ -7,10 +7,12 @@
 #include "fs-io.h"
 #include "fs-io-direct.h"
 #include "fs-io-pagecache.h"
-#include "io.h"
+#include "io_read.h"
+#include "io_write.h"
 
 #include <linux/kthread.h>
 #include <linux/pagemap.h>
+#include <linux/prefetch.h>
 #include <linux/task_io_accounting_ops.h>
 
 /* O_DIRECT reads */
@@ -232,23 +234,21 @@ static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
 				       u64 offset, u64 size,
 				       unsigned nr_replicas, bool compressed)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	u64 end = offset + size;
 	u32 snapshot;
 	bool ret = true;
 	int err;
-
-	bch2_trans_init(&trans, c, 0, 0);
 retry:
-	bch2_trans_begin(&trans);
+	bch2_trans_begin(trans);
 
-	err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+	err = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
 	if (err)
 		goto err;
 
-	for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
+	for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
 			   SPOS(inum.inum, offset, snapshot),
 			   BTREE_ITER_SLOTS, k, err) {
 		if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
@@ -263,11 +263,11 @@ retry:
 	}
 
 	offset = iter.pos.offset;
-	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_iter_exit(trans, &iter);
 err:
 	if (bch2_err_matches(err, BCH_ERR_transaction_restart))
 		goto retry;
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 
 	return err ? false : ret;
 }
diff --git a/libbcachefs/fs-io-pagecache.c b/libbcachefs/fs-io-pagecache.c
index 1e60eead..8bd9bcdd 100644
--- a/libbcachefs/fs-io-pagecache.c
+++ b/libbcachefs/fs-io-pagecache.c
@@ -14,7 +14,7 @@
 int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
 				     loff_t start, u64 end,
 				     int fgp_flags, gfp_t gfp,
-				     folios *folios)
+				     folios *fs)
 {
 	struct folio *f;
 	u64 pos = start;
@@ -24,7 +24,7 @@ int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
 		if ((u64) pos >= (u64) start + (1ULL << 20))
 			fgp_flags &= ~FGP_CREAT;
 
-		ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL);
+		ret = darray_make_room_gfp(fs, 1, gfp & GFP_KERNEL);
 		if (ret)
 			break;
 
@@ -32,16 +32,16 @@ int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
 		if (IS_ERR_OR_NULL(f))
 			break;
 
-		BUG_ON(folios->nr && folio_pos(f) != pos);
+		BUG_ON(fs->nr && folio_pos(f) != pos);
 
 		pos = folio_end_pos(f);
-		darray_push(folios, f);
+		darray_push(fs, f);
 	}
 
-	if (!folios->nr && !ret && (fgp_flags & FGP_CREAT))
+	if (!fs->nr && !ret && (fgp_flags & FGP_CREAT))
 		ret = -ENOMEM;
 
-	return folios->nr ? 0 : ret;
+	return fs->nr ? 0 : ret;
 }
 
 /* pagecache_block must be held */
@@ -73,12 +73,15 @@ int bch2_write_invalidate_inode_pages_range(struct address_space *mapping,
 	return ret;
 }
 
+#if 0
+/* Useful for debug tracing: */
 static const char * const bch2_folio_sector_states[] = {
 #define x(n)	#n,
 	BCH_FOLIO_SECTOR_STATE()
 #undef x
 	NULL
 };
+#endif
 
 static inline enum bch_folio_sector_state
 folio_sector_dirty(enum bch_folio_sector_state state)
@@ -177,20 +180,20 @@ static void __bch2_folio_set(struct folio *folio,
  * extents btree:
  */
 int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
-		   struct folio **folios, unsigned nr_folios)
+		   struct folio **fs, unsigned nr_folios)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	struct bch_folio *s;
-	u64 offset = folio_sector(folios[0]);
+	u64 offset = folio_sector(fs[0]);
 	unsigned folio_idx;
 	u32 snapshot;
 	bool need_set = false;
 	int ret;
 
 	for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
-		s = bch2_folio_create(folios[folio_idx], GFP_KERNEL);
+		s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
 		if (!s)
 			return -ENOMEM;
 
@@ -201,22 +204,22 @@ int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
 		return 0;
 
 	folio_idx = 0;
-	bch2_trans_init(&trans, c, 0, 0);
+	trans = bch2_trans_get(c);
 retry:
-	bch2_trans_begin(&trans);
+	bch2_trans_begin(trans);
 
-	ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
 	if (ret)
 		goto err;
 
-	for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
+	for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
 			   SPOS(inum.inum, offset, snapshot),
 			   BTREE_ITER_SLOTS, k, ret) {
 		unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
 		unsigned state = bkey_to_sector_state(k);
 
 		while (folio_idx < nr_folios) {
-			struct folio *folio = folios[folio_idx];
+			struct folio *folio = fs[folio_idx];
 			u64 folio_start	= folio_sector(folio);
 			u64 folio_end	= folio_end_sector(folio);
 			unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
@@ -240,11 +243,11 @@ retry:
 	}
 
 	offset = iter.pos.offset;
-	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_iter_exit(trans, &iter);
 err:
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 
 	return ret;
 }
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 4804e5a4..b0e8144e 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -3,6 +3,7 @@
 
 #include "bcachefs.h"
 #include "alloc_foreground.h"
+#include "bkey_buf.h"
 #include "btree_update.h"
 #include "buckets.h"
 #include "clock.h"
@@ -16,7 +17,7 @@
 #include "fsck.h"
 #include "inode.h"
 #include "journal.h"
-#include "io.h"
+#include "io_misc.h"
 #include "keylist.h"
 #include "quota.h"
 #include "reflink.h"
@@ -164,7 +165,6 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
 #endif
 }
 
-
 /* fsync: */
 
 /*
@@ -207,31 +207,29 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol,
 				 struct bpos start,
 				 struct bpos end)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	int ret = 0;
-
-	bch2_trans_init(&trans, c, 0, 0);
 retry:
-	bch2_trans_begin(&trans);
+	bch2_trans_begin(trans);
 
-	ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot);
+	ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot);
 	if (ret)
 		goto err;
 
-	for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents, start, end, 0, k, ret)
+	for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret)
 		if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) {
 			ret = 1;
 			break;
 		}
 	start = iter.pos;
-	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_iter_exit(trans, &iter);
 err:
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	return ret;
 }
 
@@ -241,8 +239,8 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode,
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
 	struct address_space *mapping = inode->v.i_mapping;
 	struct bch_folio *s;
-	unsigned start_offset = start & (PAGE_SIZE - 1);
-	unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
+	unsigned start_offset;
+	unsigned end_offset;
 	unsigned i;
 	struct folio *folio;
 	s64 i_sectors_delta = 0;
@@ -391,33 +389,12 @@ static int bch2_extend(struct mnt_idmap *idmap,
 	return bch2_setattr_nonsize(idmap, inode, iattr);
 }
 
-static int bch2_truncate_finish_fn(struct btree_trans *trans,
-				   struct bch_inode_info *inode,
-				   struct bch_inode_unpacked *bi,
-				   void *p)
-{
-	bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
-	return 0;
-}
-
-static int bch2_truncate_start_fn(struct btree_trans *trans,
-				  struct bch_inode_info *inode,
-				  struct bch_inode_unpacked *bi, void *p)
-{
-	u64 *new_i_size = p;
-
-	bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY;
-	bi->bi_size = *new_i_size;
-	return 0;
-}
-
-int bch2_truncate(struct mnt_idmap *idmap,
+int bchfs_truncate(struct mnt_idmap *idmap,
 		  struct bch_inode_info *inode, struct iattr *iattr)
 {
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
 	struct address_space *mapping = inode->v.i_mapping;
 	struct bch_inode_unpacked inode_u;
-	u64 new_i_size = iattr->ia_size;
 	s64 i_sectors_delta = 0;
 	int ret = 0;
 
@@ -466,6 +443,8 @@ int bch2_truncate(struct mnt_idmap *idmap,
 	if (unlikely(ret < 0))
 		goto err;
 
+	truncate_setsize(&inode->v, iattr->ia_size);
+
 	/*
 	 * When extending, we're going to write the new i_size to disk
 	 * immediately so we need to flush anything above the current on disk
@@ -487,32 +466,22 @@ int bch2_truncate(struct mnt_idmap *idmap,
 	if (ret)
 		goto err;
 
-	mutex_lock(&inode->ei_update_lock);
-	ret = bch2_write_inode(c, inode, bch2_truncate_start_fn,
-			       &new_i_size, 0);
-	mutex_unlock(&inode->ei_update_lock);
-
-	if (unlikely(ret))
-		goto err;
-
-	truncate_setsize(&inode->v, iattr->ia_size);
-
-	ret = bch2_fpunch(c, inode_inum(inode),
-			round_up(iattr->ia_size, block_bytes(c)) >> 9,
-			U64_MAX, &i_sectors_delta);
+	ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta);
 	bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
 
+	if (unlikely(ret)) {
+		/*
+		 * If we error here, VFS caches are now inconsistent with btree
+		 */
+		set_bit(EI_INODE_ERROR, &inode->ei_flags);
+		goto err;
+	}
+
 	bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks &&
 				!bch2_journal_error(&c->journal), c,
 				"inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
 				inode->v.i_ino, (u64) inode->v.i_blocks,
 				inode->ei_inode.bi_sectors);
-	if (unlikely(ret))
-		goto err;
-
-	mutex_lock(&inode->ei_update_lock);
-	ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, 0);
-	mutex_unlock(&inode->ei_update_lock);
 
 	ret = bch2_setattr_nonsize(idmap, inode, iattr);
 err:
@@ -577,175 +546,33 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
 {
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
 	struct address_space *mapping = inode->v.i_mapping;
-	struct bkey_buf copy;
-	struct btree_trans trans;
-	struct btree_iter src, dst, del;
-	loff_t shift, new_size;
-	u64 src_start;
+	s64 i_sectors_delta = 0;
 	int ret = 0;
 
 	if ((offset | len) & (block_bytes(c) - 1))
 		return -EINVAL;
 
 	if (insert) {
-		if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len)
-			return -EFBIG;
-
 		if (offset >= inode->v.i_size)
 			return -EINVAL;
-
-		src_start	= U64_MAX;
-		shift		= len;
 	} else {
 		if (offset + len >= inode->v.i_size)
 			return -EINVAL;
-
-		src_start	= offset + len;
-		shift		= -len;
 	}
 
-	new_size = inode->v.i_size + shift;
-
 	ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
 	if (ret)
 		return ret;
 
-	if (insert) {
-		i_size_write(&inode->v, new_size);
-		mutex_lock(&inode->ei_update_lock);
-		ret = bch2_write_inode_size(c, inode, new_size,
-					    ATTR_MTIME|ATTR_CTIME);
-		mutex_unlock(&inode->ei_update_lock);
-	} else {
-		s64 i_sectors_delta = 0;
+	if (insert)
+		i_size_write(&inode->v, inode->v.i_size + len);
 
-		ret = bch2_fpunch(c, inode_inum(inode),
-				  offset >> 9, (offset + len) >> 9,
-				  &i_sectors_delta);
-		bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+	ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9,
+				     insert, &i_sectors_delta);
+	if (!ret && !insert)
+		i_size_write(&inode->v, inode->v.i_size - len);
+	bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
 
-		if (ret)
-			return ret;
-	}
-
-	bch2_bkey_buf_init(&copy);
-	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-	bch2_trans_iter_init(&trans, &src, BTREE_ID_extents,
-			POS(inode->v.i_ino, src_start >> 9),
-			BTREE_ITER_INTENT);
-	bch2_trans_copy_iter(&dst, &src);
-	bch2_trans_copy_iter(&del, &src);
-
-	while (ret == 0 ||
-	       bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-		struct disk_reservation disk_res =
-			bch2_disk_reservation_init(c, 0);
-		struct bkey_i delete;
-		struct bkey_s_c k;
-		struct bpos next_pos;
-		struct bpos move_pos = POS(inode->v.i_ino, offset >> 9);
-		struct bpos atomic_end;
-		unsigned trigger_flags = 0;
-		u32 snapshot;
-
-		bch2_trans_begin(&trans);
-
-		ret = bch2_subvolume_get_snapshot(&trans,
-					inode->ei_subvol, &snapshot);
-		if (ret)
-			continue;
-
-		bch2_btree_iter_set_snapshot(&src, snapshot);
-		bch2_btree_iter_set_snapshot(&dst, snapshot);
-		bch2_btree_iter_set_snapshot(&del, snapshot);
-
-		bch2_trans_begin(&trans);
-
-		k = insert
-			? bch2_btree_iter_peek_prev(&src)
-			: bch2_btree_iter_peek_upto(&src, POS(inode->v.i_ino, U64_MAX));
-		if ((ret = bkey_err(k)))
-			continue;
-
-		if (!k.k || k.k->p.inode != inode->v.i_ino)
-			break;
-
-		if (insert &&
-		    bkey_le(k.k->p, POS(inode->v.i_ino, offset >> 9)))
-			break;
-reassemble:
-		bch2_bkey_buf_reassemble(&copy, c, k);
-
-		if (insert &&
-		    bkey_lt(bkey_start_pos(k.k), move_pos))
-			bch2_cut_front(move_pos, copy.k);
-
-		copy.k->k.p.offset += shift >> 9;
-		bch2_btree_iter_set_pos(&dst, bkey_start_pos(&copy.k->k));
-
-		ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end);
-		if (ret)
-			continue;
-
-		if (!bkey_eq(atomic_end, copy.k->k.p)) {
-			if (insert) {
-				move_pos = atomic_end;
-				move_pos.offset -= shift >> 9;
-				goto reassemble;
-			} else {
-				bch2_cut_back(atomic_end, copy.k);
-			}
-		}
-
-		bkey_init(&delete.k);
-		delete.k.p = copy.k->k.p;
-		delete.k.size = copy.k->k.size;
-		delete.k.p.offset -= shift >> 9;
-		bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k));
-
-		next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
-
-		if (copy.k->k.size != k.k->size) {
-			/* We might end up splitting compressed extents: */
-			unsigned nr_ptrs =
-				bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k));
-
-			ret = bch2_disk_reservation_get(c, &disk_res,
-					copy.k->k.size, nr_ptrs,
-					BCH_DISK_RESERVATION_NOFAIL);
-			BUG_ON(ret);
-		}
-
-		ret =   bch2_btree_iter_traverse(&del) ?:
-			bch2_trans_update(&trans, &del, &delete, trigger_flags) ?:
-			bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?:
-			bch2_trans_commit(&trans, &disk_res, NULL,
-					  BTREE_INSERT_NOFAIL);
-		bch2_disk_reservation_put(c, &disk_res);
-
-		if (!ret)
-			bch2_btree_iter_set_pos(&src, next_pos);
-	}
-	bch2_trans_iter_exit(&trans, &del);
-	bch2_trans_iter_exit(&trans, &dst);
-	bch2_trans_iter_exit(&trans, &src);
-	bch2_trans_exit(&trans);
-	bch2_bkey_buf_exit(&copy, c);
-
-	if (ret)
-		return ret;
-
-	mutex_lock(&inode->ei_update_lock);
-	if (!insert) {
-		i_size_write(&inode->v, new_size);
-		ret = bch2_write_inode_size(c, inode, new_size,
-					    ATTR_MTIME|ATTR_CTIME);
-	} else {
-		/* We need an inode update to update bi_journal_seq for fsync: */
-		ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
-				       ATTR_MTIME|ATTR_CTIME);
-	}
-	mutex_unlock(&inode->ei_update_lock);
 	return ret;
 }
 
@@ -753,16 +580,15 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
 			     u64 start_sector, u64 end_sector)
 {
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bpos end_pos = POS(inode->v.i_ino, end_sector);
 	struct bch_io_opts opts;
 	int ret = 0;
 
 	bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512);
 
-	bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
 			POS(inode->v.i_ino, start_sector),
 			BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
@@ -775,9 +601,9 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
 		u64 hole_start, hole_end;
 		u32 snapshot;
 
-		bch2_trans_begin(&trans);
+		bch2_trans_begin(trans);
 
-		ret = bch2_subvolume_get_snapshot(&trans,
+		ret = bch2_subvolume_get_snapshot(trans,
 					inode->ei_subvol, &snapshot);
 		if (ret)
 			goto bkey_err;
@@ -814,7 +640,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
 						 &hole_start,
 						 &hole_end,
 						 opts.data_replicas, true))
-				ret = drop_locks_do(&trans,
+				ret = drop_locks_do(trans,
 					(bch2_clamp_data_hole(&inode->v,
 							      &hole_start,
 							      &hole_end,
@@ -837,7 +663,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
 				goto bkey_err;
 		}
 
-		ret = bch2_extent_fallocate(&trans, inode_inum(inode), &iter,
+		ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter,
 					    sectors, opts, &i_sectors_delta,
 					    writepoint_hashed((unsigned long) current));
 		if (ret)
@@ -845,7 +671,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
 
 		bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
 
-		drop_locks_do(&trans,
+		drop_locks_do(trans,
 			(bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0));
 bkey_err:
 		bch2_quota_reservation_put(c, inode, &quota_res);
@@ -857,14 +683,14 @@ bkey_err:
 		struct quota_res quota_res = { 0 };
 		s64 i_sectors_delta = 0;
 
-		bch2_fpunch_at(&trans, &iter, inode_inum(inode),
+		bch2_fpunch_at(trans, &iter, inode_inum(inode),
 			       end_sector, &i_sectors_delta);
 		bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
 		bch2_quota_reservation_put(c, inode, &quota_res);
 	}
 
-	bch2_trans_iter_exit(&trans, &iter);
-	bch2_trans_exit(&trans);
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
 	return ret;
 }
 
@@ -970,26 +796,24 @@ static int quota_reserve_range(struct bch_inode_info *inode,
 			       u64 start, u64 end)
 {
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	u32 snapshot;
 	u64 sectors = end - start;
 	u64 pos = start;
 	int ret;
-
-	bch2_trans_init(&trans, c, 0, 0);
 retry:
-	bch2_trans_begin(&trans);
+	bch2_trans_begin(trans);
 
-	ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot);
+	ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
 	if (ret)
 		goto err;
 
-	bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
 			     SPOS(inode->v.i_ino, pos, snapshot), 0);
 
-	while (!(ret = btree_trans_too_many_iters(&trans)) &&
+	while (!(ret = btree_trans_too_many_iters(trans)) &&
 	       (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k &&
 	       !(ret = bkey_err(k))) {
 		if (bkey_extent_is_allocation(k.k)) {
@@ -1001,17 +825,14 @@ retry:
 		bch2_btree_iter_advance(&iter);
 	}
 	pos = iter.pos.offset;
-	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_iter_exit(trans, &iter);
 err:
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 
-	if (ret)
-		return ret;
-
-	return bch2_quota_reservation_add(c, inode, res, sectors, true);
+	return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
 }
 
 loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
@@ -1104,7 +925,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
 {
 	struct bch_inode_info *inode = file_bch_inode(file);
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	subvol_inum inum = inode_inum(inode);
@@ -1116,15 +937,15 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
 	if (offset >= isize)
 		return -ENXIO;
 
-	bch2_trans_init(&trans, c, 0, 0);
+	trans = bch2_trans_get(c);
 retry:
-	bch2_trans_begin(&trans);
+	bch2_trans_begin(trans);
 
-	ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
 	if (ret)
 		goto err;
 
-	for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents,
+	for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents,
 			   SPOS(inode->v.i_ino, offset >> 9, snapshot),
 			   POS(inode->v.i_ino, U64_MAX),
 			   0, k, ret) {
@@ -1134,12 +955,12 @@ retry:
 		} else if (k.k->p.offset >> 9 > isize)
 			break;
 	}
-	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_iter_exit(trans, &iter);
 err:
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	if (ret)
 		return ret;
 
@@ -1157,7 +978,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
 {
 	struct bch_inode_info *inode = file_bch_inode(file);
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	subvol_inum inum = inode_inum(inode);
@@ -1169,15 +990,15 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
 	if (offset >= isize)
 		return -ENXIO;
 
-	bch2_trans_init(&trans, c, 0, 0);
+	trans = bch2_trans_get(c);
 retry:
-	bch2_trans_begin(&trans);
+	bch2_trans_begin(trans);
 
-	ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
 	if (ret)
 		goto err;
 
-	for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
+	for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
 			   SPOS(inode->v.i_ino, offset >> 9, snapshot),
 			   BTREE_ITER_SLOTS, k, ret) {
 		if (k.k->p.inode != inode->v.i_ino) {
@@ -1195,12 +1016,12 @@ retry:
 			offset = max(offset, bkey_start_offset(k.k) << 9);
 		}
 	}
-	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_iter_exit(trans, &iter);
 err:
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	if (ret)
 		return ret;
 
diff --git a/libbcachefs/fs-io.h b/libbcachefs/fs-io.h
index bb5b709f..ca70346e 100644
--- a/libbcachefs/fs-io.h
+++ b/libbcachefs/fs-io.h
@@ -6,7 +6,7 @@
 
 #include "buckets.h"
 #include "fs.h"
-#include "io_types.h"
+#include "io_write_types.h"
 #include "quota.h"
 
 #include <linux/uio.h>
@@ -165,7 +165,7 @@ int __must_check bch2_write_inode_size(struct bch_fs *,
 
 int bch2_fsync(struct file *, loff_t, loff_t, int);
 
-int bch2_truncate(struct mnt_idmap *,
+int bchfs_truncate(struct mnt_idmap *,
 		  struct bch_inode_info *, struct iattr *);
 long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
 
diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c
index 141bcced..0679b2f7 100644
--- a/libbcachefs/fs-ioctl.c
+++ b/libbcachefs/fs-ioctl.c
@@ -122,7 +122,10 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
 
 	fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];
 
-	return copy_to_user(arg, &fa, sizeof(fa));
+	if (copy_to_user(arg, &fa, sizeof(fa)))
+		return -EFAULT;
+
+	return 0;
 }
 
 static int fssetxattr_inode_update_fn(struct btree_trans *trans,
diff --git a/libbcachefs/fs-ioctl.h b/libbcachefs/fs-ioctl.h
index f201980e..54a9c21a 100644
--- a/libbcachefs/fs-ioctl.h
+++ b/libbcachefs/fs-ioctl.h
@@ -5,7 +5,7 @@
 /* Inode flags: */
 
 /* bcachefs inode flags -> vfs inode flags: */
-static const unsigned bch_flags_to_vfs[] = {
+static const __maybe_unused unsigned bch_flags_to_vfs[] = {
 	[__BCH_INODE_SYNC]	= S_SYNC,
 	[__BCH_INODE_IMMUTABLE]	= S_IMMUTABLE,
 	[__BCH_INODE_APPEND]	= S_APPEND,
@@ -13,7 +13,7 @@ static const unsigned bch_flags_to_vfs[] = {
 };
 
 /* bcachefs inode flags -> FS_IOC_GETFLAGS: */
-static const unsigned bch_flags_to_uflags[] = {
+static const __maybe_unused unsigned bch_flags_to_uflags[] = {
 	[__BCH_INODE_SYNC]	= FS_SYNC_FL,
 	[__BCH_INODE_IMMUTABLE]	= FS_IMMUTABLE_FL,
 	[__BCH_INODE_APPEND]	= FS_APPEND_FL,
@@ -22,7 +22,7 @@ static const unsigned bch_flags_to_uflags[] = {
 };
 
 /* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
-static const unsigned bch_flags_to_xflags[] = {
+static const __maybe_unused unsigned bch_flags_to_xflags[] = {
 	[__BCH_INODE_SYNC]	= FS_XFLAG_SYNC,
 	[__BCH_INODE_IMMUTABLE]	= FS_XFLAG_IMMUTABLE,
 	[__BCH_INODE_APPEND]	= FS_XFLAG_APPEND,
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 80dcda43..1354af2c 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -19,7 +19,7 @@
 #include "fs-io-pagecache.h"
 #include "fsck.h"
 #include "inode.h"
-#include "io.h"
+#include "io_read.h"
 #include "journal.h"
 #include "keylist.h"
 #include "quota.h"
@@ -82,29 +82,27 @@ int __must_check bch2_write_inode(struct bch_fs *c,
 				  inode_set_fn set,
 				  void *p, unsigned fields)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter = { NULL };
 	struct bch_inode_unpacked inode_u;
 	int ret;
-
-	bch2_trans_init(&trans, c, 0, 512);
 retry:
-	bch2_trans_begin(&trans);
+	bch2_trans_begin(trans);
 
-	ret   = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode),
+	ret   = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
 				BTREE_ITER_INTENT) ?:
-		(set ? set(&trans, inode, &inode_u, p) : 0) ?:
-		bch2_inode_write(&trans, &iter, &inode_u) ?:
-		bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+		(set ? set(trans, inode, &inode_u, p) : 0) ?:
+		bch2_inode_write(trans, &iter, &inode_u) ?:
+		bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
 
 	/*
 	 * the btree node lock protects inode->ei_inode, not ei_update_lock;
 	 * this is important for inode updates via bchfs_write_index_update
 	 */
 	if (!ret)
-		bch2_inode_update_after_write(&trans, inode, &inode_u, fields);
+		bch2_inode_update_after_write(trans, inode, &inode_u, fields);
 
-	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_iter_exit(trans, &iter);
 
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
@@ -114,7 +112,7 @@ retry:
 			     inode_inum(inode).subvol,
 			     inode_inum(inode).inum);
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	return ret < 0 ? ret : 0;
 }
 
@@ -182,7 +180,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
 {
 	struct bch_inode_unpacked inode_u;
 	struct bch_inode_info *inode;
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	struct bch_subvolume subvol;
 	int ret;
 
@@ -196,14 +194,14 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
 	if (!(inode->v.i_state & I_NEW))
 		return &inode->v;
 
-	bch2_trans_init(&trans, c, 8, 0);
-	ret = lockrestart_do(&trans,
-		bch2_subvolume_get(&trans, inum.subvol, true, 0, &subvol) ?:
-		bch2_inode_find_by_inum_trans(&trans, inum, &inode_u));
+	trans = bch2_trans_get(c);
+	ret = lockrestart_do(trans,
+		bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
+		bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
 
 	if (!ret)
-		bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol);
-	bch2_trans_exit(&trans);
+		bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
+	bch2_trans_put(trans);
 
 	if (ret) {
 		iget_failed(&inode->v);
@@ -226,7 +224,7 @@ __bch2_create(struct mnt_idmap *idmap,
 	      unsigned flags)
 {
 	struct bch_fs *c = dir->v.i_sb->s_fs_info;
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	struct bch_inode_unpacked dir_u;
 	struct bch_inode_info *inode, *old;
 	struct bch_inode_unpacked inode_u;
@@ -256,13 +254,11 @@ __bch2_create(struct mnt_idmap *idmap,
 	if (!(flags & BCH_CREATE_TMPFILE))
 		mutex_lock(&dir->ei_update_lock);
 
-	bch2_trans_init(&trans, c, 8,
-			2048 + (!(flags & BCH_CREATE_TMPFILE)
-				? dentry->d_name.len : 0));
+	trans = bch2_trans_get(c);
 retry:
-	bch2_trans_begin(&trans);
+	bch2_trans_begin(trans);
 
-	ret   = bch2_create_trans(&trans,
+	ret   = bch2_create_trans(trans,
 				  inode_inum(dir), &dir_u, &inode_u,
 				  !(flags & BCH_CREATE_TMPFILE)
 				  ? &dentry->d_name : NULL,
@@ -278,9 +274,9 @@ retry:
 	inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
 	inum.inum = inode_u.bi_inum;
 
-	ret   = bch2_subvolume_get(&trans, inum.subvol, true,
+	ret   = bch2_subvolume_get(trans, inum.subvol, true,
 				   BTREE_ITER_WITH_UPDATES, &subvol) ?:
-		bch2_trans_commit(&trans, NULL, &journal_seq, 0);
+		bch2_trans_commit(trans, NULL, &journal_seq, 0);
 	if (unlikely(ret)) {
 		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
 				KEY_TYPE_QUOTA_WARN);
@@ -291,13 +287,13 @@ err_before_quota:
 	}
 
 	if (!(flags & BCH_CREATE_TMPFILE)) {
-		bch2_inode_update_after_write(&trans, dir, &dir_u,
+		bch2_inode_update_after_write(trans, dir, &dir_u,
 					      ATTR_MTIME|ATTR_CTIME);
 		mutex_unlock(&dir->ei_update_lock);
 	}
 
 	bch2_iget5_set(&inode->v, &inum);
-	bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol);
+	bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
 
 	set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
 	set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
@@ -337,7 +333,7 @@ err_before_quota:
 		unlock_new_inode(&inode->v);
 	}
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 err:
 	posix_acl_release(default_acl);
 	posix_acl_release(acl);
@@ -346,7 +342,7 @@ err_trans:
 	if (!(flags & BCH_CREATE_TMPFILE))
 		mutex_unlock(&dir->ei_update_lock);
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	make_bad_inode(&inode->v);
 	iput(&inode->v);
 	inode = ERR_PTR(ret);
@@ -401,26 +397,25 @@ static int __bch2_link(struct bch_fs *c,
 		       struct bch_inode_info *dir,
 		       struct dentry *dentry)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct bch_inode_unpacked dir_u, inode_u;
 	int ret;
 
 	mutex_lock(&inode->ei_update_lock);
-	bch2_trans_init(&trans, c, 4, 1024);
 
-	ret = commit_do(&trans, NULL, NULL, 0,
-			bch2_link_trans(&trans,
+	ret = commit_do(trans, NULL, NULL, 0,
+			bch2_link_trans(trans,
 					inode_inum(dir),   &dir_u,
 					inode_inum(inode), &inode_u,
 					&dentry->d_name));
 
 	if (likely(!ret)) {
-		bch2_inode_update_after_write(&trans, dir, &dir_u,
+		bch2_inode_update_after_write(trans, dir, &dir_u,
 					      ATTR_MTIME|ATTR_CTIME);
-		bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME);
+		bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
 	}
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	mutex_unlock(&inode->ei_update_lock);
 	return ret;
 }
@@ -451,24 +446,23 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
 	struct bch_inode_info *dir = to_bch_ei(vdir);
 	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
 	struct bch_inode_unpacked dir_u, inode_u;
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	int ret;
 
 	bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
-	bch2_trans_init(&trans, c, 4, 1024);
 
-	ret = commit_do(&trans, NULL, NULL,
+	ret = commit_do(trans, NULL, NULL,
 			BTREE_INSERT_NOFAIL,
-		bch2_unlink_trans(&trans,
+		bch2_unlink_trans(trans,
 				  inode_inum(dir), &dir_u,
 				  &inode_u, &dentry->d_name,
 				  deleting_snapshot));
 	if (unlikely(ret))
 		goto err;
 
-	bch2_inode_update_after_write(&trans, dir, &dir_u,
+	bch2_inode_update_after_write(trans, dir, &dir_u,
 				      ATTR_MTIME|ATTR_CTIME);
-	bch2_inode_update_after_write(&trans, inode, &inode_u,
+	bch2_inode_update_after_write(trans, inode, &inode_u,
 				      ATTR_MTIME);
 
 	if (inode_u.bi_subvol) {
@@ -479,8 +473,8 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
 		set_nlink(&inode->v, 0);
 	}
 err:
-	bch2_trans_exit(&trans);
 	bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
+	bch2_trans_put(trans);
 
 	return ret;
 }
@@ -543,7 +537,7 @@ static int bch2_rename2(struct mnt_idmap *idmap,
 	struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
 	struct bch_inode_unpacked dst_dir_u, src_dir_u;
 	struct bch_inode_unpacked src_inode_u, dst_inode_u;
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	enum bch_rename_mode mode = flags & RENAME_EXCHANGE
 		? BCH_RENAME_EXCHANGE
 		: dst_dentry->d_inode
@@ -560,7 +554,7 @@ static int bch2_rename2(struct mnt_idmap *idmap,
 			return ret;
 	}
 
-	bch2_trans_init(&trans, c, 8, 2048);
+	trans = bch2_trans_get(c);
 
 	bch2_lock_inodes(INODE_UPDATE_LOCK,
 			 src_dir,
@@ -587,8 +581,8 @@ static int bch2_rename2(struct mnt_idmap *idmap,
 			goto err;
 	}
 
-	ret = commit_do(&trans, NULL, NULL, 0,
-			bch2_rename_trans(&trans,
+	ret = commit_do(trans, NULL, NULL, 0,
+			bch2_rename_trans(trans,
 					  inode_inum(src_dir), &src_dir_u,
 					  inode_inum(dst_dir), &dst_dir_u,
 					  &src_inode_u,
@@ -603,21 +597,21 @@ static int bch2_rename2(struct mnt_idmap *idmap,
 	BUG_ON(dst_inode &&
 	       dst_inode->v.i_ino != dst_inode_u.bi_inum);
 
-	bch2_inode_update_after_write(&trans, src_dir, &src_dir_u,
+	bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
 				      ATTR_MTIME|ATTR_CTIME);
 
 	if (src_dir != dst_dir)
-		bch2_inode_update_after_write(&trans, dst_dir, &dst_dir_u,
+		bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
 					      ATTR_MTIME|ATTR_CTIME);
 
-	bch2_inode_update_after_write(&trans, src_inode, &src_inode_u,
+	bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
 				      ATTR_CTIME);
 
 	if (dst_inode)
-		bch2_inode_update_after_write(&trans, dst_inode, &dst_inode_u,
+		bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
 					      ATTR_CTIME);
 err:
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 
 	bch2_fs_quota_transfer(c, src_inode,
 			       bch_qid(&src_inode->ei_inode),
@@ -680,7 +674,7 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap,
 {
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
 	struct bch_qid qid;
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	struct btree_iter inode_iter = { NULL };
 	struct bch_inode_unpacked inode_u;
 	struct posix_acl *acl = NULL;
@@ -701,13 +695,13 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap,
 	if (ret)
 		goto err;
 
-	bch2_trans_init(&trans, c, 0, 0);
+	trans = bch2_trans_get(c);
 retry:
-	bch2_trans_begin(&trans);
+	bch2_trans_begin(trans);
 	kfree(acl);
 	acl = NULL;
 
-	ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode),
+	ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
 			      BTREE_ITER_INTENT);
 	if (ret)
 		goto btree_err;
@@ -715,29 +709,29 @@ retry:
 	bch2_setattr_copy(idmap, inode, &inode_u, attr);
 
 	if (attr->ia_valid & ATTR_MODE) {
-		ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u,
+		ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
 				     inode_u.bi_mode, &acl);
 		if (ret)
 			goto btree_err;
 	}
 
-	ret =   bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
-		bch2_trans_commit(&trans, NULL, NULL,
+	ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
+		bch2_trans_commit(trans, NULL, NULL,
 				  BTREE_INSERT_NOFAIL);
 btree_err:
-	bch2_trans_iter_exit(&trans, &inode_iter);
+	bch2_trans_iter_exit(trans, &inode_iter);
 
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
 	if (unlikely(ret))
 		goto err_trans;
 
-	bch2_inode_update_after_write(&trans, inode, &inode_u, attr->ia_valid);
+	bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);
 
 	if (acl)
 		set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
 err_trans:
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 err:
 	mutex_unlock(&inode->ei_update_lock);
 
@@ -798,7 +792,7 @@ static int bch2_setattr(struct mnt_idmap *idmap,
 		return ret;
 
 	return iattr->ia_valid & ATTR_SIZE
-		? bch2_truncate(idmap, inode, iattr)
+		? bchfs_truncate(idmap, inode, iattr)
 		: bch2_setattr_nonsize(idmap, inode, iattr);
 }
 
@@ -879,7 +873,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 {
 	struct bch_fs *c = vinode->i_sb->s_fs_info;
 	struct bch_inode_info *ei = to_bch_ei(vinode);
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	struct bkey_buf cur, prev;
@@ -900,18 +894,18 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 
 	bch2_bkey_buf_init(&cur);
 	bch2_bkey_buf_init(&prev);
-	bch2_trans_init(&trans, c, 0, 0);
+	trans = bch2_trans_get(c);
 retry:
-	bch2_trans_begin(&trans);
+	bch2_trans_begin(trans);
 
-	ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot);
+	ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot);
 	if (ret)
 		goto err;
 
-	bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
 			     SPOS(ei->v.i_ino, start, snapshot), 0);
 
-	while (!(ret = btree_trans_too_many_iters(&trans)) &&
+	while (!(ret = btree_trans_too_many_iters(trans)) &&
 	       (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
 	       !(ret = bkey_err(k))) {
 		enum btree_id data_btree = BTREE_ID_extents;
@@ -928,7 +922,7 @@ retry:
 
 		bch2_bkey_buf_reassemble(&cur, c, k);
 
-		ret = bch2_read_indirect_extent(&trans, &data_btree,
+		ret = bch2_read_indirect_extent(trans, &data_btree,
 					&offset_into_extent, &cur);
 		if (ret)
 			break;
@@ -947,7 +941,7 @@ retry:
 		cur.k->k.p.offset += cur.k->k.size;
 
 		if (have_extent) {
-			bch2_trans_unlock(&trans);
+			bch2_trans_unlock(trans);
 			ret = bch2_fill_extent(c, info,
 					bkey_i_to_s_c(prev.k), 0);
 			if (ret)
@@ -961,18 +955,18 @@ retry:
 			POS(iter.pos.inode, iter.pos.offset + sectors));
 	}
 	start = iter.pos.offset;
-	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_iter_exit(trans, &iter);
 err:
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
 
 	if (!ret && have_extent) {
-		bch2_trans_unlock(&trans);
+		bch2_trans_unlock(trans);
 		ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
 				       FIEMAP_EXTENT_LAST);
 	}
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	bch2_bkey_buf_exit(&cur, c);
 	bch2_bkey_buf_exit(&prev, c);
 	return ret < 0 ? ret : 0;
@@ -1230,7 +1224,7 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child
 	struct bch_inode_info *inode	= to_bch_ei(child->d_inode);
 	struct bch_inode_info *dir	= to_bch_ei(parent->d_inode);
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	struct btree_iter iter1;
 	struct btree_iter iter2;
 	struct bkey_s_c k;
@@ -1245,23 +1239,23 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child
 	if (!S_ISDIR(dir->v.i_mode))
 		return -EINVAL;
 
-	bch2_trans_init(&trans, c, 0, 0);
+	trans = bch2_trans_get(c);
 
-	bch2_trans_iter_init(&trans, &iter1, BTREE_ID_dirents,
+	bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
 			     POS(dir->ei_inode.bi_inum, 0), 0);
-	bch2_trans_iter_init(&trans, &iter2, BTREE_ID_dirents,
+	bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
 			     POS(dir->ei_inode.bi_inum, 0), 0);
 retry:
-	bch2_trans_begin(&trans);
+	bch2_trans_begin(trans);
 
-	ret = bch2_subvolume_get_snapshot(&trans, dir->ei_subvol, &snapshot);
+	ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot);
 	if (ret)
 		goto err;
 
 	bch2_btree_iter_set_snapshot(&iter1, snapshot);
 	bch2_btree_iter_set_snapshot(&iter2, snapshot);
 
-	ret = bch2_inode_find_by_inum_trans(&trans, inode_inum(inode), &inode_u);
+	ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
 	if (ret)
 		goto err;
 
@@ -1279,7 +1273,7 @@ retry:
 		}
 
 		d = bkey_s_c_to_dirent(k);
-		ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target);
+		ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
 		if (ret > 0)
 			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
 		if (ret)
@@ -1301,7 +1295,7 @@ retry:
 				continue;
 
 			d = bkey_s_c_to_dirent(k);
-			ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target);
+			ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
 			if (ret < 0)
 				break;
 			if (ret)
@@ -1325,9 +1319,9 @@ err:
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
 
-	bch2_trans_iter_exit(&trans, &iter1);
-	bch2_trans_iter_exit(&trans, &iter2);
-	bch2_trans_exit(&trans);
+	bch2_trans_iter_exit(trans, &iter1);
+	bch2_trans_iter_exit(trans, &iter2);
+	bch2_trans_put(trans);
 
 	return ret;
 }
@@ -1661,7 +1655,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data)
 		up_write(&c->state_lock);
 	}
 
-	if (opts.errors >= 0)
+	if (opt_defined(opts, errors))
 		c->opts.errors = opts.errors;
 err:
 	return bch2_err_class(ret);
@@ -1722,6 +1716,35 @@ static void bch2_put_super(struct super_block *sb)
 	__bch2_fs_stop(c);
 }
 
+/*
+ * bcachefs doesn't currently integrate intwrite freeze protection but the
+ * internal write references serve the same purpose. Therefore reuse the
+ * read-only transition code to perform the quiesce. The caveat is that we don't
+ * currently have the ability to block tasks that want a write reference while
+ * the superblock is frozen. This is fine for now, but we should either add
+ * blocking support or find a way to integrate sb_start_intwrite() and friends.
+ */
+static int bch2_freeze(struct super_block *sb)
+{
+	struct bch_fs *c = sb->s_fs_info;
+
+	down_write(&c->state_lock);
+	bch2_fs_read_only(c);
+	up_write(&c->state_lock);
+	return 0;
+}
+
+static int bch2_unfreeze(struct super_block *sb)
+{
+	struct bch_fs *c = sb->s_fs_info;
+	int ret;
+
+	down_write(&c->state_lock);
+	ret = bch2_fs_read_write(c);
+	up_write(&c->state_lock);
+	return ret;
+}
+
 static const struct super_operations bch_super_operations = {
 	.alloc_inode	= bch2_alloc_inode,
 	.destroy_inode	= bch2_destroy_inode,
@@ -1733,10 +1756,8 @@ static const struct super_operations bch_super_operations = {
 	.show_options	= bch2_show_options,
 	.remount_fs	= bch2_remount,
 	.put_super	= bch2_put_super,
-#if 0
 	.freeze_fs	= bch2_freeze,
 	.unfreeze_fs	= bch2_unfreeze,
-#endif
 };
 
 static int bch2_set_super(struct super_block *s, void *data)
@@ -1890,7 +1911,7 @@ got_sb:
 	vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
 	ret = PTR_ERR_OR_ZERO(vinode);
 	if (ret) {
-		bch_err(c, "error mounting: error getting root inode: %s", bch2_err_str(ret));
+		bch_err_msg(c, ret, "mounting: error getting root inode");
 		goto err_put_super;
 	}
 
diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h
index 10e11119..5edf1d4b 100644
--- a/libbcachefs/fs.h
+++ b/libbcachefs/fs.h
@@ -197,7 +197,7 @@ int bch2_vfs_init(void);
 
 #else
 
-#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields)	do {} while (0)
+#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields)	({ do {} while (0); })
 
 static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
 					       snapshot_id_list *s) {}
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 57b3dfab..206302b0 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -80,7 +80,7 @@ static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
 	if (!ret)
 		*subvol = le32_to_cpu(s.subvol);
 	else if (bch2_err_matches(ret, ENOENT))
-		bch_err(trans->c, "snapshot %u not fonud", snapshot);
+		bch_err(trans->c, "snapshot %u not found", snapshot);
 	return ret;
 
 }
@@ -127,8 +127,7 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
 	ret = bch2_inode_unpack(k, inode);
 err:
 	if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		bch_err(trans->c, "error fetching inode %llu: %s",
-			inode_nr, bch2_err_str(ret));
+		bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
 	bch2_trans_iter_exit(trans, &iter);
 	return ret;
 }
@@ -154,8 +153,7 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
 		*snapshot = iter.pos.snapshot;
 err:
 	if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		bch_err(trans->c, "error fetching inode %llu:%u: %s",
-			inode_nr, *snapshot, bch2_err_str(ret));
+		bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot);
 	bch2_trans_iter_exit(trans, &iter);
 	return ret;
 }
@@ -206,17 +204,16 @@ static int __write_inode(struct btree_trans *trans,
 				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
 }
 
-static int write_inode(struct btree_trans *trans,
-		       struct bch_inode_unpacked *inode,
-		       u32 snapshot)
+static int fsck_write_inode(struct btree_trans *trans,
+			    struct bch_inode_unpacked *inode,
+			    u32 snapshot)
 {
 	int ret = commit_do(trans, NULL, NULL,
 				  BTREE_INSERT_NOFAIL|
 				  BTREE_INSERT_LAZY_RW,
 				  __write_inode(trans, inode, snapshot));
 	if (ret)
-		bch_err(trans->c, "error in fsck: error updating inode: %s",
-			bch2_err_str(ret));
+		bch_err_fn(trans->c, ret);
 	return ret;
 }
 
@@ -278,13 +275,13 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
 	}
 
 	if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		bch_err(c, "error looking up lost+found: %s", bch2_err_str(ret));
+		bch_err_fn(c, ret);
 	if (ret)
 		return ret;
 
 	if (d_type != DT_DIR) {
 		bch_err(c, "error looking up lost+found: not a directory");
-		return ret;
+		return -BCH_ERR_ENOENT_not_directory;
 	}
 
 	/*
@@ -301,7 +298,7 @@ create_lostfound:
 				0, 0, S_IFDIR|0700, 0, NULL, NULL,
 				(subvol_inum) { }, 0);
 	if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		bch_err(c, "error creating lost+found: %s", bch2_err_str(ret));
+		bch_err_msg(c, ret, "creating lost+found");
 	return ret;
 }
 
@@ -365,8 +362,7 @@ static int reattach_inode(struct btree_trans *trans,
 				  BTREE_INSERT_NOFAIL,
 			__reattach_inode(trans, inode, inode_snapshot));
 	if (ret) {
-		bch_err(trans->c, "error reattaching inode %llu: %s",
-			inode->bi_inum, bch2_err_str(ret));
+		bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum);
 		return ret;
 	}
 
@@ -475,7 +471,12 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
  * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
  * and @ancestor hasn't been overwritten in @seen
  *
- * That is, returns whether key in @ancestor snapshot is visible in @id snapshot
+ * @c:		filesystem handle
+ * @seen:	list of snapshot ids already seen at current position
+ * @id:		descendent snapshot id
+ * @ancestor:	ancestor snapshot id
+ *
+ * Returns:	whether key in @ancestor snapshot is visible in @id snapshot
  */
 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
 				    u32 id, u32 ancestor)
@@ -520,14 +521,16 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see
  * snapshot id @dst, test whether there is some snapshot in which @dst is
  * visible.
  *
- * This assumes we're visiting @src keys in natural key order.
+ * @c:		filesystem handle
+ * @s:		list of snapshot IDs already seen at @src
+ * @src:	snapshot ID of src key
+ * @dst:	snapshot ID of dst key
+ * Returns:	true if there is some snapshot in which @dst is visible
  *
- * @s	- list of snapshot IDs already seen at @src
- * @src	- snapshot ID of src key
- * @dst	- snapshot ID of dst key
+ * Assumes we're visiting @src keys in natural key order
  */
-static int ref_visible(struct bch_fs *c, struct snapshots_seen *s,
-		       u32 src, u32 dst)
+static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
+			u32 src, u32 dst)
 {
 	return dst <= src
 		? key_visible_in_snapshot(c, s, dst, src)
@@ -618,10 +621,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
 
 	w->first_this_inode = true;
 
-	if (trans_was_restarted(trans, restart_count))
-		return -BCH_ERR_transaction_restart_nested;
-
-	return 0;
+	return trans_was_restarted(trans, restart_count);
 }
 
 static struct inode_walker_entry *
@@ -822,7 +822,7 @@ bad_hash:
 		      bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
 		ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
 		if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			bch_err(c, "hash_redo_key err %s", bch2_err_str(ret));
+			bch_err_fn(c, ret);
 		if (ret)
 			return ret;
 		ret = -BCH_ERR_transaction_restart_nested;
@@ -886,7 +886,8 @@ static int check_inode(struct btree_trans *trans,
 
 		ret = __write_inode(trans, &u, iter->pos.snapshot);
 		if (ret) {
-			bch_err_msg(c, ret, "in fsck: error updating inode");
+			if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
+				bch_err_msg(c, ret, "in fsck updating inode");
 			return ret;
 		}
 
@@ -904,8 +905,7 @@ static int check_inode(struct btree_trans *trans,
 
 		ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
 		if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			bch_err(c, "error in fsck: error while deleting inode: %s",
-				bch2_err_str(ret));
+			bch_err_msg(c, ret, "in fsck deleting inode");
 		return ret;
 	}
 
@@ -928,8 +928,7 @@ static int check_inode(struct btree_trans *trans,
 				POS(u.bi_inum, U64_MAX),
 				0, NULL);
 		if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			bch_err(c, "error in fsck: error truncating inode: %s",
-				bch2_err_str(ret));
+			bch_err_msg(c, ret, "in fsck truncating inode");
 		if (ret)
 			return ret;
 
@@ -954,8 +953,7 @@ static int check_inode(struct btree_trans *trans,
 
 		sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
 		if (sectors < 0) {
-			bch_err(c, "error in fsck: error recounting inode sectors: %s",
-				bch2_err_str(sectors));
+			bch_err_msg(c, sectors, "fsck recounting inode sectors");
 			return sectors;
 		}
 
@@ -974,13 +972,13 @@ static int check_inode(struct btree_trans *trans,
 	if (do_update) {
 		ret = __write_inode(trans, &u, iter->pos.snapshot);
 		if (ret) {
-			bch_err_msg(c, ret, "in fsck: error updating inode");
+			bch_err_msg(c, ret, "in fsck updating inode");
 			return ret;
 		}
 	}
 err:
 fsck_err:
-	if (ret)
+	if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		bch_err_fn(c, ret);
 	return ret;
 }
@@ -989,7 +987,7 @@ noinline_for_stack
 int bch2_check_inodes(struct bch_fs *c)
 {
 	bool full = c->opts.fsck;
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bch_inode_unpacked prev = { 0 };
 	struct snapshots_seen s;
@@ -997,16 +995,15 @@ int bch2_check_inodes(struct bch_fs *c)
 	int ret;
 
 	snapshots_seen_init(&s);
-	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-	ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes,
+	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
 			POS_MIN,
 			BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
 			NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-		check_inode(&trans, &iter, k, &prev, &s, full));
+		check_inode(trans, &iter, k, &prev, &s, full));
 
-	bch2_trans_exit(&trans);
 	snapshots_seen_exit(&s);
+	bch2_trans_put(trans);
 	if (ret)
 		bch_err_fn(c, ret);
 	return ret;
@@ -1081,7 +1078,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
 			    w->last_pos.inode, i->snapshot,
 			    i->inode.bi_sectors, i->count)) {
 			i->inode.bi_sectors = i->count;
-			ret = write_inode(trans, &i->inode, i->snapshot);
+			ret = fsck_write_inode(trans, &i->inode, i->snapshot);
 			if (ret)
 				break;
 		}
@@ -1089,9 +1086,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
 fsck_err:
 	if (ret)
 		bch_err_fn(c, ret);
-	if (!ret && trans_was_restarted(trans, restart_count))
-		ret = -BCH_ERR_transaction_restart_nested;
-	return ret;
+	return ret ?: trans_was_restarted(trans, restart_count);
 }
 
 struct extent_end {
@@ -1441,7 +1436,7 @@ int bch2_check_extents(struct bch_fs *c)
 {
 	struct inode_walker w = inode_walker_init();
 	struct snapshots_seen s;
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	struct extent_ends extent_ends;
@@ -1450,23 +1445,22 @@ int bch2_check_extents(struct bch_fs *c)
 
 	snapshots_seen_init(&s);
 	extent_ends_init(&extent_ends);
-	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
 
-	ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents,
+	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
 			POS(BCACHEFS_ROOT_INO, 0),
 			BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
 			&res, NULL,
 			BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
 		bch2_disk_reservation_put(c, &res);
-		check_extent(&trans, &iter, k, &w, &s, &extent_ends);
+		check_extent(trans, &iter, k, &w, &s, &extent_ends);
 	})) ?:
-	check_i_sectors(&trans, &w);
+	check_i_sectors(trans, &w);
 
 	bch2_disk_reservation_put(c, &res);
 	extent_ends_exit(&extent_ends);
 	inode_walker_exit(&w);
-	bch2_trans_exit(&trans);
 	snapshots_seen_exit(&s);
+	bch2_trans_put(trans);
 
 	if (ret)
 		bch_err_fn(c, ret);
@@ -1501,7 +1495,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
 				"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
 				w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
 			i->inode.bi_nlink = i->count;
-			ret = write_inode(trans, &i->inode, i->snapshot);
+			ret = fsck_write_inode(trans, &i->inode, i->snapshot);
 			if (ret)
 				break;
 		}
@@ -1509,9 +1503,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
 fsck_err:
 	if (ret)
 		bch_err_fn(c, ret);
-	if (!ret && trans_was_restarted(trans, restart_count))
-		ret = -BCH_ERR_transaction_restart_nested;
-	return ret;
+	return ret ?: trans_was_restarted(trans, restart_count);
 }
 
 static int check_dirent_target(struct btree_trans *trans,
@@ -1809,23 +1801,22 @@ int bch2_check_dirents(struct bch_fs *c)
 	struct inode_walker target = inode_walker_init();
 	struct snapshots_seen s;
 	struct bch_hash_info hash_info;
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	int ret = 0;
 
 	snapshots_seen_init(&s);
-	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-	ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_dirents,
+	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
 			POS(BCACHEFS_ROOT_INO, 0),
 			BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
 			k,
 			NULL, NULL,
 			BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-		check_dirent(&trans, &iter, k, &hash_info, &dir, &target, &s));
+		check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s));
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	snapshots_seen_exit(&s);
 	inode_walker_exit(&dir);
 	inode_walker_exit(&target);
@@ -1879,23 +1870,18 @@ int bch2_check_xattrs(struct bch_fs *c)
 {
 	struct inode_walker inode = inode_walker_init();
 	struct bch_hash_info hash_info;
-	struct btree_trans trans;
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	int ret = 0;
 
-	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
-	ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs,
+	ret = bch2_trans_run(c,
+		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
 			POS(BCACHEFS_ROOT_INO, 0),
 			BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
 			k,
 			NULL, NULL,
 			BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-		check_xattr(&trans, &iter, k, &hash_info, &inode));
-
-	bch2_trans_exit(&trans);
-
+		check_xattr(trans, &iter, k, &hash_info, &inode)));
 	if (ret)
 		bch_err_fn(c, ret);
 	return ret;
@@ -1927,10 +1913,10 @@ static int check_root_trans(struct btree_trans *trans)
 		ret = commit_do(trans, NULL, NULL,
 				      BTREE_INSERT_NOFAIL|
 				      BTREE_INSERT_LAZY_RW,
-			__bch2_btree_insert(trans, BTREE_ID_subvolumes,
+			bch2_btree_insert_trans(trans, BTREE_ID_subvolumes,
 					    &root_subvol.k_i, 0));
 		if (ret) {
-			bch_err(c, "error writing root subvol: %s", bch2_err_str(ret));
+			bch_err_msg(c, ret, "writing root subvol");
 			goto err;
 		}
 
@@ -1949,7 +1935,7 @@ static int check_root_trans(struct btree_trans *trans)
 
 		ret = __write_inode(trans, &root_inode, snapshot);
 		if (ret)
-			bch_err(c, "error writing root inode: %s", bch2_err_str(ret));
+			bch_err_msg(c, ret, "writing root inode");
 	}
 err:
 fsck_err:
@@ -1964,7 +1950,7 @@ int bch2_check_root(struct bch_fs *c)
 	ret = bch2_trans_do(c, NULL, NULL,
 			     BTREE_INSERT_NOFAIL|
 			     BTREE_INSERT_LAZY_RW,
-		check_root_trans(&trans));
+		check_root_trans(trans));
 
 	if (ret)
 		bch_err_fn(c, ret);
@@ -2116,16 +2102,14 @@ fsck_err:
  */
 int bch2_check_directory_structure(struct bch_fs *c)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	struct bch_inode_unpacked u;
 	pathbuf path = { 0, };
 	int ret;
 
-	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
-	for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
+	for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN,
 			   BTREE_ITER_INTENT|
 			   BTREE_ITER_PREFETCH|
 			   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
@@ -2142,12 +2126,12 @@ int bch2_check_directory_structure(struct bch_fs *c)
 		if (u.bi_flags & BCH_INODE_UNLINKED)
 			continue;
 
-		ret = check_path(&trans, &path, &u, iter.pos.snapshot);
+		ret = check_path(trans, &path, &u, iter.pos.snapshot);
 		if (ret)
 			break;
 	}
-	bch2_trans_iter_exit(&trans, &iter);
-	bch2_trans_exit(&trans);
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
 	darray_exit(&path);
 
 	if (ret)
@@ -2155,8 +2139,6 @@ int bch2_check_directory_structure(struct bch_fs *c)
 	return ret;
 }
 
-/* check_nlink pass: */
-
 struct nlink_table {
 	size_t		nr;
 	size_t		size;
@@ -2238,15 +2220,13 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
 				       struct nlink_table *t,
 				       u64 start, u64 *end)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	struct bch_inode_unpacked u;
 	int ret = 0;
 
-	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
-	for_each_btree_key(&trans, iter, BTREE_ID_inodes,
+	for_each_btree_key(trans, iter, BTREE_ID_inodes,
 			   POS(0, start),
 			   BTREE_ITER_INTENT|
 			   BTREE_ITER_PREFETCH|
@@ -2275,8 +2255,8 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
 		}
 
 	}
-	bch2_trans_iter_exit(&trans, &iter);
-	bch2_trans_exit(&trans);
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
 
 	if (ret)
 		bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
@@ -2288,7 +2268,7 @@ noinline_for_stack
 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
 				     u64 range_start, u64 range_end)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct snapshots_seen s;
 	struct btree_iter iter;
 	struct bkey_s_c k;
@@ -2297,9 +2277,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
 
 	snapshots_seen_init(&s);
 
-	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
-	for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN,
+	for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
 			   BTREE_ITER_INTENT|
 			   BTREE_ITER_PREFETCH|
 			   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
@@ -2319,12 +2297,12 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
 			break;
 		}
 	}
-	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_iter_exit(trans, &iter);
 
 	if (ret)
 		bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	snapshots_seen_exit(&s);
 	return ret;
 }
@@ -2375,22 +2353,17 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
 			       struct nlink_table *links,
 			       u64 range_start, u64 range_end)
 {
-	struct btree_trans trans;
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	size_t idx = 0;
 	int ret = 0;
 
-	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
-	ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes,
-			POS(0, range_start),
-			BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
-			NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-		check_nlinks_update_inode(&trans, &iter, k, links, &idx, range_end));
-
-	bch2_trans_exit(&trans);
-
+	ret = bch2_trans_run(c,
+		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
+				POS(0, range_start),
+				BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+				NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+			check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
 	if (ret < 0) {
 		bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
 		return ret;
@@ -2472,13 +2445,12 @@ int bch2_fix_reflink_p(struct bch_fs *c)
 		return 0;
 
 	ret = bch2_trans_run(c,
-		for_each_btree_key_commit(&trans, iter,
+		for_each_btree_key_commit(trans, iter,
 				BTREE_ID_extents, POS_MIN,
 				BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
 				BTREE_ITER_ALL_SNAPSHOTS, k,
 				NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
-			fix_reflink_p_key(&trans, &iter, k)));
-
+			fix_reflink_p_key(trans, &iter, k)));
 	if (ret)
 		bch_err_fn(c, ret);
 	return ret;
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 8114b6e4..8bfd99cb 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -120,8 +120,7 @@ static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed,
 	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
 		struct bch_inode_unpacked unpacked;
 
-		int ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i),
-					   &unpacked);
+		ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked);
 		BUG_ON(ret);
 		BUG_ON(unpacked.bi_inum		!= inode->bi_inum);
 		BUG_ON(unpacked.bi_hash_seed	!= inode->bi_hash_seed);
@@ -318,7 +317,7 @@ int bch2_inode_unpack(struct bkey_s_c k,
 	return bch2_inode_unpack_slowpath(k, unpacked);
 }
 
-int bch2_inode_peek(struct btree_trans *trans,
+static int bch2_inode_peek_nowarn(struct btree_trans *trans,
 		    struct btree_iter *iter,
 		    struct bch_inode_unpacked *inode,
 		    subvol_inum inum, unsigned flags)
@@ -349,7 +348,17 @@ int bch2_inode_peek(struct btree_trans *trans,
 	return 0;
 err:
 	bch2_trans_iter_exit(trans, iter);
-	if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
+	return ret;
+}
+
+int bch2_inode_peek(struct btree_trans *trans,
+		    struct btree_iter *iter,
+		    struct bch_inode_unpacked *inode,
+		    subvol_inum inum, unsigned flags)
+{
+	int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags);
+
+	if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum);
 	return ret;
 }
@@ -817,7 +826,7 @@ err:
 
 int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter = { NULL };
 	struct bkey_i_inode_generation delete;
 	struct bch_inode_unpacked inode_u;
@@ -825,8 +834,6 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
 	u32 snapshot;
 	int ret;
 
-	bch2_trans_init(&trans, c, 0, 1024);
-
 	/*
 	 * If this was a directory, there shouldn't be any real dirents left -
 	 * but there could be whiteouts (from hash collisions) that we should
@@ -835,19 +842,19 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
 	 * XXX: the dirent could ideally would delete whiteouts when they're no
 	 * longer needed
 	 */
-	ret   = bch2_inode_delete_keys(&trans, inum, BTREE_ID_extents) ?:
-		bch2_inode_delete_keys(&trans, inum, BTREE_ID_xattrs) ?:
-		bch2_inode_delete_keys(&trans, inum, BTREE_ID_dirents);
+	ret   = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?:
+		bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?:
+		bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents);
 	if (ret)
 		goto err;
 retry:
-	bch2_trans_begin(&trans);
+	bch2_trans_begin(trans);
 
-	ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
 	if (ret)
 		goto err;
 
-	k = bch2_bkey_get_iter(&trans, &iter, BTREE_ID_inodes,
+	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
 			       SPOS(0, inum.inum, snapshot),
 			       BTREE_ITER_INTENT|BTREE_ITER_CACHED);
 	ret = bkey_err(k);
@@ -855,7 +862,7 @@ retry:
 		goto err;
 
 	if (!bkey_is_inode(k.k)) {
-		bch2_fs_inconsistent(trans.c,
+		bch2_fs_inconsistent(c,
 				     "inode %llu:%u not found when deleting",
 				     inum.inum, snapshot);
 		ret = -EIO;
@@ -868,15 +875,28 @@ retry:
 	delete.k.p = iter.pos;
 	delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
 
-	ret   = bch2_trans_update(&trans, &iter, &delete.k_i, 0) ?:
-		bch2_trans_commit(&trans, NULL, NULL,
+	ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
+		bch2_trans_commit(trans, NULL, NULL,
 				BTREE_INSERT_NOFAIL);
 err:
-	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_iter_exit(trans, &iter);
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
+	return ret;
+}
+
+int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
+				  subvol_inum inum,
+				  struct bch_inode_unpacked *inode)
+{
+	struct btree_iter iter;
+	int ret;
+
+	ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
+	if (!ret)
+		bch2_trans_iter_exit(trans, &iter);
 	return ret;
 }
 
@@ -897,7 +917,7 @@ int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
 			    struct bch_inode_unpacked *inode)
 {
 	return bch2_trans_do(c, NULL, NULL, 0,
-		bch2_inode_find_by_inum_trans(&trans, inum, inode));
+		bch2_inode_find_by_inum_trans(trans, inum, inode));
 }
 
 int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
@@ -1069,14 +1089,12 @@ delete:
 
 int bch2_delete_dead_inodes(struct bch_fs *c)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	int ret;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
-	ret = bch2_btree_write_buffer_flush_sync(&trans);
+	ret = bch2_btree_write_buffer_flush_sync(trans);
 	if (ret)
 		goto err;
 
@@ -1086,26 +1104,26 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
 	 * but we can't retry because the btree write buffer won't have been
 	 * flushed and we'd spin:
 	 */
-	for_each_btree_key(&trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
+	for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
 			   BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
-		ret = lockrestart_do(&trans, may_delete_deleted_inode(&trans, k.k->p));
+		ret = lockrestart_do(trans, may_delete_deleted_inode(trans, k.k->p));
 		if (ret < 0)
 			break;
 
 		if (ret) {
 			if (!test_bit(BCH_FS_RW, &c->flags)) {
-				bch2_trans_unlock(&trans);
+				bch2_trans_unlock(trans);
 				bch2_fs_lazy_rw(c);
 			}
 
-			ret = bch2_inode_rm_snapshot(&trans, k.k->p.offset, k.k->p.snapshot);
+			ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
 			if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
 				break;
 		}
 	}
-	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_iter_exit(trans, &iter);
 err:
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 
 	return ret;
 }
diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
index 22b24405..a7464e1b 100644
--- a/libbcachefs/inode.h
+++ b/libbcachefs/inode.h
@@ -118,6 +118,9 @@ int bch2_inode_create(struct btree_trans *, struct btree_iter *,
 
 int bch2_inode_rm(struct bch_fs *, subvol_inum);
 
+int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *,
+				  subvol_inum,
+				  struct bch_inode_unpacked *);
 int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
 				  struct bch_inode_unpacked *);
 int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
diff --git a/libbcachefs/io.h b/libbcachefs/io.h
deleted file mode 100644
index 831e3f1b..00000000
--- a/libbcachefs/io.h
+++ /dev/null
@@ -1,202 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_H
-#define _BCACHEFS_IO_H
-
-#include "checksum.h"
-#include "bkey_buf.h"
-#include "io_types.h"
-
-#define to_wbio(_bio)			\
-	container_of((_bio), struct bch_write_bio, bio)
-
-#define to_rbio(_bio)			\
-	container_of((_bio), struct bch_read_bio, bio)
-
-void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
-void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-void bch2_latency_acct(struct bch_dev *, u64, int);
-#else
-static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
-#endif
-
-void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
-			       enum bch_data_type, const struct bkey_i *, bool);
-
-#define BLK_STS_REMOVED		((__force blk_status_t)128)
-
-const char *bch2_blk_status_to_str(blk_status_t);
-
-#define BCH_WRITE_FLAGS()		\
-	x(ALLOC_NOWAIT)			\
-	x(CACHED)			\
-	x(DATA_ENCODED)			\
-	x(PAGES_STABLE)			\
-	x(PAGES_OWNED)			\
-	x(ONLY_SPECIFIED_DEVS)		\
-	x(WROTE_DATA_INLINE)		\
-	x(FROM_INTERNAL)		\
-	x(CHECK_ENOSPC)			\
-	x(SYNC)				\
-	x(MOVE)				\
-	x(IN_WORKER)			\
-	x(DONE)				\
-	x(IO_ERROR)			\
-	x(CONVERT_UNWRITTEN)
-
-enum __bch_write_flags {
-#define x(f)	__BCH_WRITE_##f,
-	BCH_WRITE_FLAGS()
-#undef x
-};
-
-enum bch_write_flags {
-#define x(f)	BCH_WRITE_##f = BIT(__BCH_WRITE_##f),
-	BCH_WRITE_FLAGS()
-#undef x
-};
-
-static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
-{
-	return op->watermark == BCH_WATERMARK_copygc
-		? op->c->copygc_wq
-		: op->c->btree_update_wq;
-}
-
-int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
-			       struct bkey_i *, bool *, s64 *, s64 *);
-int bch2_extent_update(struct btree_trans *, subvol_inum,
-		       struct btree_iter *, struct bkey_i *,
-		       struct disk_reservation *, u64, s64 *, bool);
-int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
-			  unsigned, struct bch_io_opts, s64 *,
-			  struct write_point_specifier);
-
-int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
-		   subvol_inum, u64, s64 *);
-int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
-
-static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
-				      struct bch_io_opts opts)
-{
-	op->c			= c;
-	op->end_io		= NULL;
-	op->flags		= 0;
-	op->written		= 0;
-	op->error		= 0;
-	op->csum_type		= bch2_data_checksum_type(c, opts);
-	op->compression_opt	= opts.compression;
-	op->nr_replicas		= 0;
-	op->nr_replicas_required = c->opts.data_replicas_required;
-	op->watermark		= BCH_WATERMARK_normal;
-	op->incompressible	= 0;
-	op->open_buckets.nr	= 0;
-	op->devs_have.nr	= 0;
-	op->target		= 0;
-	op->opts		= opts;
-	op->subvol		= 0;
-	op->pos			= POS_MAX;
-	op->version		= ZERO_VERSION;
-	op->write_point		= (struct write_point_specifier) { 0 };
-	op->res			= (struct disk_reservation) { 0 };
-	op->new_i_size		= U64_MAX;
-	op->i_sectors_delta	= 0;
-	op->devs_need_flush	= NULL;
-}
-
-void bch2_write(struct closure *);
-
-void bch2_write_point_do_index_updates(struct work_struct *);
-
-static inline struct bch_write_bio *wbio_init(struct bio *bio)
-{
-	struct bch_write_bio *wbio = to_wbio(bio);
-
-	memset(&wbio->wbio, 0, sizeof(wbio->wbio));
-	return wbio;
-}
-
-void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *);
-
-struct bch_devs_mask;
-struct cache_promote_op;
-struct extent_ptr_decoded;
-
-int __bch2_read_indirect_extent(struct btree_trans *, unsigned *,
-				struct bkey_buf *);
-
-static inline int bch2_read_indirect_extent(struct btree_trans *trans,
-					    enum btree_id *data_btree,
-					    unsigned *offset_into_extent,
-					    struct bkey_buf *k)
-{
-	if (k->k->k.type != KEY_TYPE_reflink_p)
-		return 0;
-
-	*data_btree = BTREE_ID_reflink;
-	return __bch2_read_indirect_extent(trans, offset_into_extent, k);
-}
-
-enum bch_read_flags {
-	BCH_READ_RETRY_IF_STALE		= 1 << 0,
-	BCH_READ_MAY_PROMOTE		= 1 << 1,
-	BCH_READ_USER_MAPPED		= 1 << 2,
-	BCH_READ_NODECODE		= 1 << 3,
-	BCH_READ_LAST_FRAGMENT		= 1 << 4,
-
-	/* internal: */
-	BCH_READ_MUST_BOUNCE		= 1 << 5,
-	BCH_READ_MUST_CLONE		= 1 << 6,
-	BCH_READ_IN_RETRY		= 1 << 7,
-};
-
-int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *,
-		       struct bvec_iter, struct bpos, enum btree_id,
-		       struct bkey_s_c, unsigned,
-		       struct bch_io_failures *, unsigned);
-
-static inline void bch2_read_extent(struct btree_trans *trans,
-			struct bch_read_bio *rbio, struct bpos read_pos,
-			enum btree_id data_btree, struct bkey_s_c k,
-			unsigned offset_into_extent, unsigned flags)
-{
-	__bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
-			   data_btree, k, offset_into_extent, NULL, flags);
-}
-
-void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
-		 subvol_inum, struct bch_io_failures *, unsigned flags);
-
-static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
-			     subvol_inum inum)
-{
-	struct bch_io_failures failed = { .nr = 0 };
-
-	BUG_ON(rbio->_state);
-
-	rbio->c = c;
-	rbio->start_time = local_clock();
-	rbio->subvol = inum.subvol;
-
-	__bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
-		    BCH_READ_RETRY_IF_STALE|
-		    BCH_READ_MAY_PROMOTE|
-		    BCH_READ_USER_MAPPED);
-}
-
-static inline struct bch_read_bio *rbio_init(struct bio *bio,
-					     struct bch_io_opts opts)
-{
-	struct bch_read_bio *rbio = to_rbio(bio);
-
-	rbio->_state	= 0;
-	rbio->promote	= NULL;
-	rbio->opts	= opts;
-	return rbio;
-}
-
-void bch2_fs_io_exit(struct bch_fs *);
-int bch2_fs_io_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_IO_H */
diff --git a/libbcachefs/io_misc.c b/libbcachefs/io_misc.c
new file mode 100644
index 00000000..32432bdd
--- /dev/null
+++ b/libbcachefs/io_misc.c
@@ -0,0 +1,497 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * io_misc.c - fallocate, fpunch, truncate:
+ */
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "clock.h"
+#include "error.h"
+#include "extents.h"
+#include "extent_update.h"
+#include "inode.h"
+#include "io_misc.h"
+#include "io_write.h"
+#include "logged_ops.h"
+#include "subvolume.h"
+
+/* Overwrites whatever was present with zeroes: */
+int bch2_extent_fallocate(struct btree_trans *trans,
+			  subvol_inum inum,
+			  struct btree_iter *iter,
+			  unsigned sectors,
+			  struct bch_io_opts opts,
+			  s64 *i_sectors_delta,
+			  struct write_point_specifier write_point)
+{
+	struct bch_fs *c = trans->c;
+	struct disk_reservation disk_res = { 0 };
+	struct closure cl;
+	struct open_buckets open_buckets = { 0 };
+	struct bkey_s_c k;
+	struct bkey_buf old, new;
+	unsigned sectors_allocated = 0;
+	bool have_reservation = false;
+	bool unwritten = opts.nocow &&
+	    c->sb.version >= bcachefs_metadata_version_unwritten_extents;
+	int ret;
+
+	bch2_bkey_buf_init(&old);
+	bch2_bkey_buf_init(&new);
+	closure_init_stack(&cl);
+
+	k = bch2_btree_iter_peek_slot(iter);
+	ret = bkey_err(k);
+	if (ret)
+		return ret;
+
+	sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
+
+	if (!have_reservation) {
+		unsigned new_replicas =
+			max(0, (int) opts.data_replicas -
+			    (int) bch2_bkey_nr_ptrs_fully_allocated(k));
+		/*
+		 * Get a disk reservation before (in the nocow case) calling
+		 * into the allocator:
+		 */
+		ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
+		if (unlikely(ret))
+			goto err;
+
+		bch2_bkey_buf_reassemble(&old, c, k);
+	}
+
+	if (have_reservation) {
+		if (!bch2_extents_match(k, bkey_i_to_s_c(old.k)))
+			goto err;
+
+		bch2_key_resize(&new.k->k, sectors);
+	} else if (!unwritten) {
+		struct bkey_i_reservation *reservation;
+
+		bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
+		reservation = bkey_reservation_init(new.k);
+		reservation->k.p = iter->pos;
+		bch2_key_resize(&reservation->k, sectors);
+		reservation->v.nr_replicas = opts.data_replicas;
+	} else {
+		struct bkey_i_extent *e;
+		struct bch_devs_list devs_have;
+		struct write_point *wp;
+		struct bch_extent_ptr *ptr;
+
+		devs_have.nr = 0;
+
+		bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX);
+
+		e = bkey_extent_init(new.k);
+		e->k.p = iter->pos;
+
+		ret = bch2_alloc_sectors_start_trans(trans,
+				opts.foreground_target,
+				false,
+				write_point,
+				&devs_have,
+				opts.data_replicas,
+				opts.data_replicas,
+				BCH_WATERMARK_normal, 0, &cl, &wp);
+		if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
+			ret = -BCH_ERR_transaction_restart_nested;
+		if (ret)
+			goto err;
+
+		sectors = min(sectors, wp->sectors_free);
+		sectors_allocated = sectors;
+
+		bch2_key_resize(&e->k, sectors);
+
+		bch2_open_bucket_get(c, wp, &open_buckets);
+		bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
+		bch2_alloc_sectors_done(c, wp);
+
+		extent_for_each_ptr(extent_i_to_s(e), ptr)
+			ptr->unwritten = true;
+	}
+
+	have_reservation = true;
+
+	ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
+				 0, i_sectors_delta, true);
+err:
+	if (!ret && sectors_allocated)
+		bch2_increment_clock(c, sectors_allocated, WRITE);
+
+	bch2_open_buckets_put(c, &open_buckets);
+	bch2_disk_reservation_put(c, &disk_res);
+	bch2_bkey_buf_exit(&new, c);
+	bch2_bkey_buf_exit(&old, c);
+
+	if (closure_nr_remaining(&cl) != 1) {
+		bch2_trans_unlock(trans);
+		closure_sync(&cl);
+	}
+
+	return ret;
+}
+
+/*
+ * Returns -BCH_ERR_transaction_restart if we had to drop locks:
+ */
+int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
+		   subvol_inum inum, u64 end,
+		   s64 *i_sectors_delta)
+{
+	struct bch_fs *c	= trans->c;
+	unsigned max_sectors	= KEY_SIZE_MAX & (~0 << c->block_bits);
+	struct bpos end_pos = POS(inum.inum, end);
+	struct bkey_s_c k;
+	int ret = 0, ret2 = 0;
+	u32 snapshot;
+
+	while (!ret ||
+	       bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+		struct disk_reservation disk_res =
+			bch2_disk_reservation_init(c, 0);
+		struct bkey_i delete;
+
+		if (ret)
+			ret2 = ret;
+
+		bch2_trans_begin(trans);
+
+		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+		if (ret)
+			continue;
+
+		bch2_btree_iter_set_snapshot(iter, snapshot);
+
+		/*
+		 * peek_upto() doesn't have ideal semantics for extents:
+		 */
+		k = bch2_btree_iter_peek_upto(iter, end_pos);
+		if (!k.k)
+			break;
+
+		ret = bkey_err(k);
+		if (ret)
+			continue;
+
+		bkey_init(&delete.k);
+		delete.k.p = iter->pos;
+
+		/* create the biggest key we can */
+		bch2_key_resize(&delete.k, max_sectors);
+		bch2_cut_back(end_pos, &delete);
+
+		ret = bch2_extent_update(trans, inum, iter, &delete,
+				&disk_res, 0, i_sectors_delta, false);
+		bch2_disk_reservation_put(c, &disk_res);
+	}
+
+	return ret ?: ret2;
+}
+
+int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
+		s64 *i_sectors_delta)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	int ret;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+			     POS(inum.inum, start),
+			     BTREE_ITER_INTENT);
+
+	ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta);
+
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
+
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+		ret = 0;
+
+	return ret;
+}
+
+/* truncate: */
+
+void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_s_c_logged_op_truncate op = bkey_s_c_to_logged_op_truncate(k);
+
+	prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol));
+	prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum));
+	prt_printf(out, " new_i_size=%llu", le64_to_cpu(op.v->new_i_size));
+}
+
+static int truncate_set_isize(struct btree_trans *trans,
+			      subvol_inum inum,
+			      u64 new_i_size)
+{
+	struct btree_iter iter = { NULL };
+	struct bch_inode_unpacked inode_u;
+	int ret;
+
+	ret   = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT) ?:
+		(inode_u.bi_size = new_i_size, 0) ?:
+		bch2_inode_write(trans, &iter, &inode_u);
+
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
+					    struct bkey_i *op_k,
+					    u64 *i_sectors_delta)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter fpunch_iter;
+	struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k);
+	subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
+	u64 new_i_size = le64_to_cpu(op->v.new_i_size);
+	int ret;
+
+	ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+			truncate_set_isize(trans, inum, new_i_size));
+	if (ret)
+		goto err;
+
+	bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents,
+			     POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9),
+			     BTREE_ITER_INTENT);
+	ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta);
+	bch2_trans_iter_exit(trans, &fpunch_iter);
+
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+		ret = 0;
+err:
+	bch2_logged_op_finish(trans, op_k);
+	return ret;
+}
+
+int bch2_resume_logged_op_truncate(struct btree_trans *trans, struct bkey_i *op_k)
+{
+	return __bch2_resume_logged_op_truncate(trans, op_k, NULL);
+}
+
+int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta)
+{
+	struct bkey_i_logged_op_truncate op;
+
+	bkey_logged_op_truncate_init(&op.k_i);
+	op.v.subvol	= cpu_to_le32(inum.subvol);
+	op.v.inum	= cpu_to_le64(inum.inum);
+	op.v.new_i_size	= cpu_to_le64(new_i_size);
+
+	return bch2_trans_run(c,
+		bch2_logged_op_start(trans, &op.k_i) ?:
+		__bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta));
+}
+
+/* finsert/fcollapse: */
+
+void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k);
+
+	prt_printf(out, "subvol=%u",		le32_to_cpu(op.v->subvol));
+	prt_printf(out, " inum=%llu",		le64_to_cpu(op.v->inum));
+	prt_printf(out, " dst_offset=%lli",	le64_to_cpu(op.v->dst_offset));
+	prt_printf(out, " src_offset=%llu",	le64_to_cpu(op.v->src_offset));
+}
+
+static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset, s64 len)
+{
+	struct btree_iter iter;
+	struct bch_inode_unpacked inode_u;
+	int ret;
+
+	offset	<<= 9;
+	len	<<= 9;
+
+	ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
+	if (ret)
+		return ret;
+
+	if (len > 0) {
+		if (MAX_LFS_FILESIZE - inode_u.bi_size < len) {
+			ret = -EFBIG;
+			goto err;
+		}
+
+		if (offset >= inode_u.bi_size) {
+			ret = -EINVAL;
+			goto err;
+		}
+	}
+
+	inode_u.bi_size += len;
+	inode_u.bi_mtime = inode_u.bi_ctime = bch2_current_time(trans->c);
+
+	ret = bch2_inode_write(trans, &iter, &inode_u);
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
+					   struct bkey_i *op_k,
+					   u64 *i_sectors_delta)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k);
+	subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
+	u64 dst_offset = le64_to_cpu(op->v.dst_offset);
+	u64 src_offset = le64_to_cpu(op->v.src_offset);
+	s64 shift = dst_offset - src_offset;
+	u64 len = abs(shift);
+	u64 pos = le64_to_cpu(op->v.pos);
+	bool insert = shift > 0;
+	int ret = 0;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+			     POS(inum.inum, 0),
+			     BTREE_ITER_INTENT);
+
+	switch (op->v.state) {
+case LOGGED_OP_FINSERT_start:
+	op->v.state = LOGGED_OP_FINSERT_shift_extents;
+
+	if (insert) {
+		ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+				adjust_i_size(trans, inum, src_offset, len) ?:
+				bch2_logged_op_update(trans, &op->k_i));
+		if (ret)
+			goto err;
+	} else {
+		bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset));
+
+		ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta);
+		if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			goto err;
+
+		ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+				bch2_logged_op_update(trans, &op->k_i));
+	}
+
+	fallthrough;
+case LOGGED_OP_FINSERT_shift_extents:
+	while (1) {
+		struct disk_reservation disk_res =
+			bch2_disk_reservation_init(c, 0);
+		struct bkey_i delete, *copy;
+		struct bkey_s_c k;
+		struct bpos src_pos = POS(inum.inum, src_offset);
+		u32 snapshot;
+
+		bch2_trans_begin(trans);
+
+		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+		if (ret)
+			goto btree_err;
+
+		bch2_btree_iter_set_snapshot(&iter, snapshot);
+		bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot));
+
+		k = insert
+			? bch2_btree_iter_peek_prev(&iter)
+			: bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX));
+		if ((ret = bkey_err(k)))
+			goto btree_err;
+
+		if (!k.k ||
+		    k.k->p.inode != inum.inum ||
+		    bkey_le(k.k->p, POS(inum.inum, src_offset)))
+			break;
+
+		copy = bch2_bkey_make_mut_noupdate(trans, k);
+		if ((ret = PTR_ERR_OR_ZERO(copy)))
+			goto btree_err;
+
+		if (insert &&
+		    bkey_lt(bkey_start_pos(k.k), src_pos)) {
+			bch2_cut_front(src_pos, copy);
+
+			/* Splitting compressed extent? */
+			bch2_disk_reservation_add(c, &disk_res,
+					copy->k.size *
+					bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy)),
+					BCH_DISK_RESERVATION_NOFAIL);
+		}
+
+		bkey_init(&delete.k);
+		delete.k.p = copy->k.p;
+		delete.k.p.snapshot = snapshot;
+		delete.k.size = copy->k.size;
+
+		copy->k.p.offset += shift;
+		copy->k.p.snapshot = snapshot;
+
+		op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
+
+		ret =   bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
+			bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
+			bch2_logged_op_update(trans, &op->k_i) ?:
+			bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL);
+btree_err:
+		bch2_disk_reservation_put(c, &disk_res);
+
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			continue;
+		if (ret)
+			goto err;
+
+		pos = le64_to_cpu(op->v.pos);
+	}
+
+	op->v.state = LOGGED_OP_FINSERT_finish;
+
+	if (!insert) {
+		ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+				adjust_i_size(trans, inum, src_offset, shift) ?:
+				bch2_logged_op_update(trans, &op->k_i));
+	} else {
+		/* We need an inode update to update bi_journal_seq for fsync: */
+		ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+				adjust_i_size(trans, inum, 0, 0) ?:
+				bch2_logged_op_update(trans, &op->k_i));
+	}
+
+	break;
+case LOGGED_OP_FINSERT_finish:
+	break;
+	}
+err:
+	bch2_logged_op_finish(trans, op_k);
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k)
+{
+	return __bch2_resume_logged_op_finsert(trans, op_k, NULL);
+}
+
+int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum,
+			   u64 offset, u64 len, bool insert,
+			   s64 *i_sectors_delta)
+{
+	struct bkey_i_logged_op_finsert op;
+	s64 shift = insert ? len : -len;
+
+	bkey_logged_op_finsert_init(&op.k_i);
+	op.v.subvol	= cpu_to_le32(inum.subvol);
+	op.v.inum	= cpu_to_le64(inum.inum);
+	op.v.dst_offset	= cpu_to_le64(offset + shift);
+	op.v.src_offset	= cpu_to_le64(offset);
+	op.v.pos	= cpu_to_le64(insert ? U64_MAX : offset);
+
+	return bch2_trans_run(c,
+		bch2_logged_op_start(trans, &op.k_i) ?:
+		__bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta));
+}
diff --git a/libbcachefs/io_misc.h b/libbcachefs/io_misc.h
new file mode 100644
index 00000000..c9e6ed40
--- /dev/null
+++ b/libbcachefs/io_misc.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_MISC_H
+#define _BCACHEFS_IO_MISC_H
+
+int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
+			  unsigned, struct bch_io_opts, s64 *,
+			  struct write_point_specifier);
+int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
+		   subvol_inum, u64, s64 *);
+int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
+
+void bch2_logged_op_truncate_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_logged_op_truncate ((struct bkey_ops) {	\
+	.val_to_text	= bch2_logged_op_truncate_to_text,	\
+	.min_val_size	= 24,					\
+})
+
+int bch2_resume_logged_op_truncate(struct btree_trans *, struct bkey_i *);
+
+int bch2_truncate(struct bch_fs *, subvol_inum, u64, u64 *);
+
+void bch2_logged_op_finsert_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_logged_op_finsert ((struct bkey_ops) {	\
+	.val_to_text	= bch2_logged_op_finsert_to_text,	\
+	.min_val_size	= 24,					\
+})
+
+int bch2_resume_logged_op_finsert(struct btree_trans *, struct bkey_i *);
+
+int bch2_fcollapse_finsert(struct bch_fs *, subvol_inum, u64, u64, bool, s64 *);
+
+#endif /* _BCACHEFS_IO_MISC_H */
diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c
new file mode 100644
index 00000000..443c3ea6
--- /dev/null
+++ b/libbcachefs/io_read.c
@@ -0,0 +1,1210 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Some low level IO code, and hacks for various block layer limitations
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "clock.h"
+#include "compress.h"
+#include "data_update.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "io_read.h"
+#include "io_misc.h"
+#include "io_write.h"
+#include "subvolume.h"
+#include "trace.h"
+
+#include <linux/sched/mm.h>
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+
+static bool bch2_target_congested(struct bch_fs *c, u16 target)
+{
+	const struct bch_devs_mask *devs;
+	unsigned d, nr = 0, total = 0;
+	u64 now = local_clock(), last;
+	s64 congested;
+	struct bch_dev *ca;
+
+	if (!target)
+		return false;
+
+	rcu_read_lock();
+	devs = bch2_target_to_mask(c, target) ?:
+		&c->rw_devs[BCH_DATA_user];
+
+	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
+		ca = rcu_dereference(c->devs[d]);
+		if (!ca)
+			continue;
+
+		congested = atomic_read(&ca->congested);
+		last = READ_ONCE(ca->congested_last);
+		if (time_after64(now, last))
+			congested -= (now - last) >> 12;
+
+		total += max(congested, 0LL);
+		nr++;
+	}
+	rcu_read_unlock();
+
+	return bch2_rand_range(nr * CONGESTED_MAX) < total;
+}
+
+#else
+
+static bool bch2_target_congested(struct bch_fs *c, u16 target)
+{
+	return false;
+}
+
+#endif
+
+/* Cache promotion on read */
+
+struct promote_op {
+	struct rcu_head		rcu;
+	u64			start_time;
+
+	struct rhash_head	hash;
+	struct bpos		pos;
+
+	struct data_update	write;
+	struct bio_vec		bi_inline_vecs[0]; /* must be last */
+};
+
+static const struct rhashtable_params bch_promote_params = {
+	.head_offset	= offsetof(struct promote_op, hash),
+	.key_offset	= offsetof(struct promote_op, pos),
+	.key_len	= sizeof(struct bpos),
+};
+
+static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
+				  struct bpos pos,
+				  struct bch_io_opts opts,
+				  unsigned flags)
+{
+	BUG_ON(!opts.promote_target);
+
+	if (!(flags & BCH_READ_MAY_PROMOTE))
+		return -BCH_ERR_nopromote_may_not;
+
+	if (bch2_bkey_has_target(c, k, opts.promote_target))
+		return -BCH_ERR_nopromote_already_promoted;
+
+	if (bkey_extent_is_unwritten(k))
+		return -BCH_ERR_nopromote_unwritten;
+
+	if (bch2_target_congested(c, opts.promote_target))
+		return -BCH_ERR_nopromote_congested;
+
+	if (rhashtable_lookup_fast(&c->promote_table, &pos,
+				   bch_promote_params))
+		return -BCH_ERR_nopromote_in_flight;
+
+	return 0;
+}
+
+static void promote_free(struct bch_fs *c, struct promote_op *op)
+{
+	int ret;
+
+	bch2_data_update_exit(&op->write);
+
+	ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
+				     bch_promote_params);
+	BUG_ON(ret);
+	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
+	kfree_rcu(op, rcu);
+}
+
+static void promote_done(struct bch_write_op *wop)
+{
+	struct promote_op *op =
+		container_of(wop, struct promote_op, write.op);
+	struct bch_fs *c = op->write.op.c;
+
+	bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
+			       op->start_time);
+	promote_free(c, op);
+}
+
+static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
+{
+	struct bio *bio = &op->write.op.wbio.bio;
+
+	trace_and_count(op->write.op.c, read_promote, &rbio->bio);
+
+	/* we now own pages: */
+	BUG_ON(!rbio->bounce);
+	BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
+
+	memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
+	       sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
+	swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
+
+	bch2_data_update_read_done(&op->write, rbio->pick.crc);
+}
+
+static struct promote_op *__promote_alloc(struct btree_trans *trans,
+					  enum btree_id btree_id,
+					  struct bkey_s_c k,
+					  struct bpos pos,
+					  struct extent_ptr_decoded *pick,
+					  struct bch_io_opts opts,
+					  unsigned sectors,
+					  struct bch_read_bio **rbio)
+{
+	struct bch_fs *c = trans->c;
+	struct promote_op *op = NULL;
+	struct bio *bio;
+	unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
+	int ret;
+
+	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
+		return NULL;
+
+	op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS);
+	if (!op)
+		goto err;
+
+	op->start_time = local_clock();
+	op->pos = pos;
+
+	/*
+	 * We don't use the mempool here because extents that aren't
+	 * checksummed or compressed can be too big for the mempool:
+	 */
+	*rbio = kzalloc(sizeof(struct bch_read_bio) +
+			sizeof(struct bio_vec) * pages,
+			GFP_NOFS);
+	if (!*rbio)
+		goto err;
+
+	rbio_init(&(*rbio)->bio, opts);
+	bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
+
+	if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
+				 GFP_NOFS))
+		goto err;
+
+	(*rbio)->bounce		= true;
+	(*rbio)->split		= true;
+	(*rbio)->kmalloc	= true;
+
+	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
+					  bch_promote_params))
+		goto err;
+
+	bio = &op->write.op.wbio.bio;
+	bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
+
+	ret = bch2_data_update_init(trans, NULL, &op->write,
+			writepoint_hashed((unsigned long) current),
+			opts,
+			(struct data_update_opts) {
+				.target		= opts.promote_target,
+				.extra_replicas	= 1,
+				.write_flags	= BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
+			},
+			btree_id, k);
+	/*
+	 * possible errors: -BCH_ERR_nocow_lock_blocked,
+	 * -BCH_ERR_ENOSPC_disk_reservation:
+	 */
+	if (ret) {
+		ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
+					bch_promote_params);
+		BUG_ON(ret);
+		goto err;
+	}
+
+	op->write.op.end_io = promote_done;
+
+	return op;
+err:
+	if (*rbio)
+		bio_free_pages(&(*rbio)->bio);
+	kfree(*rbio);
+	*rbio = NULL;
+	kfree(op);
+	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
+	return NULL;
+}
+
+noinline
+static struct promote_op *promote_alloc(struct btree_trans *trans,
+					struct bvec_iter iter,
+					struct bkey_s_c k,
+					struct extent_ptr_decoded *pick,
+					struct bch_io_opts opts,
+					unsigned flags,
+					struct bch_read_bio **rbio,
+					bool *bounce,
+					bool *read_full)
+{
+	struct bch_fs *c = trans->c;
+	bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
+	/* data might have to be decompressed in the write path: */
+	unsigned sectors = promote_full
+		? max(pick->crc.compressed_size, pick->crc.live_size)
+		: bvec_iter_sectors(iter);
+	struct bpos pos = promote_full
+		? bkey_start_pos(k.k)
+		: POS(k.k->p.inode, iter.bi_sector);
+	struct promote_op *promote;
+	int ret;
+
+	ret = should_promote(c, k, pos, opts, flags);
+	if (ret)
+		goto nopromote;
+
+	promote = __promote_alloc(trans,
+				  k.k->type == KEY_TYPE_reflink_v
+				  ? BTREE_ID_reflink
+				  : BTREE_ID_extents,
+				  k, pos, pick, opts, sectors, rbio);
+	if (!promote) {
+		ret = -BCH_ERR_nopromote_enomem;
+		goto nopromote;
+	}
+
+	*bounce		= true;
+	*read_full	= promote_full;
+	return promote;
+nopromote:
+	trace_read_nopromote(c, ret);
+	return NULL;
+}
+
+/* Read */
+
+#define READ_RETRY_AVOID	1
+#define READ_RETRY		2
+#define READ_ERR		3
+
+enum rbio_context {
+	RBIO_CONTEXT_NULL,
+	RBIO_CONTEXT_HIGHPRI,
+	RBIO_CONTEXT_UNBOUND,
+};
+
+static inline struct bch_read_bio *
+bch2_rbio_parent(struct bch_read_bio *rbio)
+{
+	return rbio->split ? rbio->parent : rbio;
+}
+
+__always_inline
+static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
+			   enum rbio_context context,
+			   struct workqueue_struct *wq)
+{
+	if (context <= rbio->context) {
+		fn(&rbio->work);
+	} else {
+		rbio->work.func		= fn;
+		rbio->context		= context;
+		queue_work(wq, &rbio->work);
+	}
+}
+
+static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
+{
+	BUG_ON(rbio->bounce && !rbio->split);
+
+	if (rbio->promote)
+		promote_free(rbio->c, rbio->promote);
+	rbio->promote = NULL;
+
+	if (rbio->bounce)
+		bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
+
+	if (rbio->split) {
+		struct bch_read_bio *parent = rbio->parent;
+
+		if (rbio->kmalloc)
+			kfree(rbio);
+		else
+			bio_put(&rbio->bio);
+
+		rbio = parent;
+	}
+
+	return rbio;
+}
+
+/*
+ * Only called on a top level bch_read_bio to complete an entire read request,
+ * not a split:
+ */
+static void bch2_rbio_done(struct bch_read_bio *rbio)
+{
+	if (rbio->start_time)
+		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
+				       rbio->start_time);
+	bio_endio(&rbio->bio);
+}
+
+static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
+				     struct bvec_iter bvec_iter,
+				     struct bch_io_failures *failed,
+				     unsigned flags)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_buf sk;
+	struct bkey_s_c k;
+	int ret;
+
+	flags &= ~BCH_READ_LAST_FRAGMENT;
+	flags |= BCH_READ_MUST_CLONE;
+
+	bch2_bkey_buf_init(&sk);
+
+	bch2_trans_iter_init(trans, &iter, rbio->data_btree,
+			     rbio->read_pos, BTREE_ITER_SLOTS);
+retry:
+	rbio->bio.bi_status = 0;
+
+	k = bch2_btree_iter_peek_slot(&iter);
+	if (bkey_err(k))
+		goto err;
+
+	bch2_bkey_buf_reassemble(&sk, c, k);
+	k = bkey_i_to_s_c(sk.k);
+	bch2_trans_unlock(trans);
+
+	if (!bch2_bkey_matches_ptr(c, k,
+				   rbio->pick.ptr,
+				   rbio->data_pos.offset -
+				   rbio->pick.crc.offset)) {
+		/* extent we wanted to read no longer exists: */
+		rbio->hole = true;
+		goto out;
+	}
+
+	ret = __bch2_read_extent(trans, rbio, bvec_iter,
+				 rbio->read_pos,
+				 rbio->data_btree,
+				 k, 0, failed, flags);
+	if (ret == READ_RETRY)
+		goto retry;
+	if (ret)
+		goto err;
+out:
+	bch2_rbio_done(rbio);
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
+	bch2_bkey_buf_exit(&sk, c);
+	return;
+err:
+	rbio->bio.bi_status = BLK_STS_IOERR;
+	goto out;
+}
+
+static void bch2_rbio_retry(struct work_struct *work)
+{
+	struct bch_read_bio *rbio =
+		container_of(work, struct bch_read_bio, work);
+	struct bch_fs *c	= rbio->c;
+	struct bvec_iter iter	= rbio->bvec_iter;
+	unsigned flags		= rbio->flags;
+	subvol_inum inum = {
+		.subvol = rbio->subvol,
+		.inum	= rbio->read_pos.inode,
+	};
+	struct bch_io_failures failed = { .nr = 0 };
+
+	trace_and_count(c, read_retry, &rbio->bio);
+
+	if (rbio->retry == READ_RETRY_AVOID)
+		bch2_mark_io_failure(&failed, &rbio->pick);
+
+	rbio->bio.bi_status = 0;
+
+	rbio = bch2_rbio_free(rbio);
+
+	flags |= BCH_READ_IN_RETRY;
+	flags &= ~BCH_READ_MAY_PROMOTE;
+
+	if (flags & BCH_READ_NODECODE) {
+		bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
+	} else {
+		flags &= ~BCH_READ_LAST_FRAGMENT;
+		flags |= BCH_READ_MUST_CLONE;
+
+		__bch2_read(c, rbio, iter, inum, &failed, flags);
+	}
+}
+
+static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
+			    blk_status_t error)
+{
+	rbio->retry = retry;
+
+	if (rbio->flags & BCH_READ_IN_RETRY)
+		return;
+
+	if (retry == READ_ERR) {
+		rbio = bch2_rbio_free(rbio);
+
+		rbio->bio.bi_status = error;
+		bch2_rbio_done(rbio);
+	} else {
+		bch2_rbio_punt(rbio, bch2_rbio_retry,
+			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+	}
+}
+
+static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
+				   struct bch_read_bio *rbio)
+{
+	struct bch_fs *c = rbio->c;
+	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
+	struct bch_extent_crc_unpacked new_crc;
+	struct btree_iter iter;
+	struct bkey_i *new;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	if (crc_is_compressed(rbio->pick.crc))
+		return 0;
+
+	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
+			       BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+	if ((ret = bkey_err(k)))
+		goto out;
+
+	if (bversion_cmp(k.k->version, rbio->version) ||
+	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
+		goto out;
+
+	/* Extent was merged? */
+	if (bkey_start_offset(k.k) < data_offset ||
+	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
+		goto out;
+
+	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
+			rbio->pick.crc, NULL, &new_crc,
+			bkey_start_offset(k.k) - data_offset, k.k->size,
+			rbio->pick.crc.csum_type)) {
+		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
+		ret = 0;
+		goto out;
+	}
+
+	/*
+	 * going to be temporarily appending another checksum entry:
+	 */
+	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
+				 sizeof(struct bch_extent_crc128));
+	if ((ret = PTR_ERR_OR_ZERO(new)))
+		goto out;
+
+	bkey_reassemble(new, k);
+
+	if (!bch2_bkey_narrow_crcs(new, new_crc))
+		goto out;
+
+	ret = bch2_trans_update(trans, &iter, new,
+				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+out:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
+{
+	bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL,
+		      __bch2_rbio_narrow_crcs(trans, rbio));
+}
+
+/* Inner part that may run in process context */
+static void __bch2_read_endio(struct work_struct *work)
+{
+	struct bch_read_bio *rbio =
+		container_of(work, struct bch_read_bio, work);
+	struct bch_fs *c	= rbio->c;
+	struct bch_dev *ca	= bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
+	struct bio *src		= &rbio->bio;
+	struct bio *dst		= &bch2_rbio_parent(rbio)->bio;
+	struct bvec_iter dst_iter = rbio->bvec_iter;
+	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
+	struct nonce nonce = extent_nonce(rbio->version, crc);
+	unsigned nofs_flags;
+	struct bch_csum csum;
+	int ret;
+
+	nofs_flags = memalloc_nofs_save();
+
+	/* Reset iterator for checksumming and copying bounced data: */
+	if (rbio->bounce) {
+		src->bi_iter.bi_size		= crc.compressed_size << 9;
+		src->bi_iter.bi_idx		= 0;
+		src->bi_iter.bi_bvec_done	= 0;
+	} else {
+		src->bi_iter			= rbio->bvec_iter;
+	}
+
+	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
+	if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
+		goto csum_err;
+
+	/*
+	 * XXX
+	 * We need to rework the narrow_crcs path to deliver the read completion
+	 * first, and then punt to a different workqueue, otherwise we're
+	 * holding up reads while doing btree updates which is bad for memory
+	 * reclaim.
+	 */
+	if (unlikely(rbio->narrow_crcs))
+		bch2_rbio_narrow_crcs(rbio);
+
+	if (rbio->flags & BCH_READ_NODECODE)
+		goto nodecode;
+
+	/* Adjust crc to point to subset of data we want: */
+	crc.offset     += rbio->offset_into_extent;
+	crc.live_size	= bvec_iter_sectors(rbio->bvec_iter);
+
+	if (crc_is_compressed(crc)) {
+		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+		if (ret)
+			goto decrypt_err;
+
+		if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
+		    !c->opts.no_data_io)
+			goto decompression_err;
+	} else {
+		/* don't need to decrypt the entire bio: */
+		nonce = nonce_add(nonce, crc.offset << 9);
+		bio_advance(src, crc.offset << 9);
+
+		BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
+		src->bi_iter.bi_size = dst_iter.bi_size;
+
+		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+		if (ret)
+			goto decrypt_err;
+
+		if (rbio->bounce) {
+			struct bvec_iter src_iter = src->bi_iter;
+
+			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
+		}
+	}
+
+	if (rbio->promote) {
+		/*
+		 * Re encrypt data we decrypted, so it's consistent with
+		 * rbio->crc:
+		 */
+		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+		if (ret)
+			goto decrypt_err;
+
+		promote_start(rbio->promote, rbio);
+		rbio->promote = NULL;
+	}
+nodecode:
+	if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
+		rbio = bch2_rbio_free(rbio);
+		bch2_rbio_done(rbio);
+	}
+out:
+	memalloc_nofs_restore(nofs_flags);
+	return;
+csum_err:
+	/*
+	 * Checksum error: if the bio wasn't bounced, we may have been
+	 * reading into buffers owned by userspace (that userspace can
+	 * scribble over) - retry the read, bouncing it this time:
+	 */
+	if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
+		rbio->flags |= BCH_READ_MUST_BOUNCE;
+		bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
+		goto out;
+	}
+
+	bch_err_inum_offset_ratelimited(ca,
+		rbio->read_pos.inode,
+		rbio->read_pos.offset << 9,
+		"data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)",
+		rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
+		csum.hi, csum.lo, bch2_csum_types[crc.csum_type]);
+	bch2_io_error(ca);
+	bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+	goto out;
+decompression_err:
+	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
+					rbio->read_pos.offset << 9,
+					"decompression error");
+	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+	goto out;
+decrypt_err:
+	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
+					rbio->read_pos.offset << 9,
+					"decrypt error");
+	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+	goto out;
+}
+
+static void bch2_read_endio(struct bio *bio)
+{
+	struct bch_read_bio *rbio =
+		container_of(bio, struct bch_read_bio, bio);
+	struct bch_fs *c	= rbio->c;
+	struct bch_dev *ca	= bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
+	struct workqueue_struct *wq = NULL;
+	enum rbio_context context = RBIO_CONTEXT_NULL;
+
+	if (rbio->have_ioref) {
+		bch2_latency_acct(ca, rbio->submit_time, READ);
+		percpu_ref_put(&ca->io_ref);
+	}
+
+	if (!rbio->split)
+		rbio->bio.bi_end_io = rbio->end_io;
+
+	if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
+				    rbio->read_pos.inode,
+				    rbio->read_pos.offset,
+				    "data read error: %s",
+			       bch2_blk_status_to_str(bio->bi_status))) {
+		bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
+		return;
+	}
+
+	if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
+	    ptr_stale(ca, &rbio->pick.ptr)) {
+		trace_and_count(c, read_reuse_race, &rbio->bio);
+
+		if (rbio->flags & BCH_READ_RETRY_IF_STALE)
+			bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
+		else
+			bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
+		return;
+	}
+
+	if (rbio->narrow_crcs ||
+	    rbio->promote ||
+	    crc_is_compressed(rbio->pick.crc) ||
+	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
+		context = RBIO_CONTEXT_UNBOUND,	wq = system_unbound_wq;
+	else if (rbio->pick.crc.csum_type)
+		context = RBIO_CONTEXT_HIGHPRI,	wq = system_highpri_wq;
+
+	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
+}
+
+int __bch2_read_indirect_extent(struct btree_trans *trans,
+				unsigned *offset_into_extent,
+				struct bkey_buf *orig_k)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	u64 reflink_offset;
+	int ret;
+
+	reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
+		*offset_into_extent;
+
+	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
+			       POS(0, reflink_offset), 0);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	if (k.k->type != KEY_TYPE_reflink_v &&
+	    k.k->type != KEY_TYPE_indirect_inline_data) {
+		bch_err_inum_offset_ratelimited(trans->c,
+			orig_k->k->k.p.inode,
+			orig_k->k->k.p.offset << 9,
+			"%llu len %u points to nonexistent indirect extent %llu",
+			orig_k->k->k.p.offset,
+			orig_k->k->k.size,
+			reflink_offset);
+		bch2_inconsistent_error(trans->c);
+		ret = -EIO;
+		goto err;
+	}
+
+	*offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
+	bch2_bkey_buf_reassemble(orig_k, trans->c, k);
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
+						   struct bkey_s_c k,
+						   struct bch_extent_ptr ptr)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
+	struct btree_iter iter;
+	struct printbuf buf = PRINTBUF;
+	int ret;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+			     PTR_BUCKET_POS(c, &ptr),
+			     BTREE_ITER_CACHED);
+
+	prt_printf(&buf, "Attempting to read from stale dirty pointer:");
+	printbuf_indent_add(&buf, 2);
+	prt_newline(&buf);
+
+	bch2_bkey_val_to_text(&buf, c, k);
+	prt_newline(&buf);
+
+	prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
+
+	ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+	if (!ret) {
+		prt_newline(&buf);
+		bch2_bkey_val_to_text(&buf, c, k);
+	}
+
+	bch2_fs_inconsistent(c, "%s", buf.buf);
+
+	bch2_trans_iter_exit(trans, &iter);
+	printbuf_exit(&buf);
+}
+
+int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
+		       struct bvec_iter iter, struct bpos read_pos,
+		       enum btree_id data_btree, struct bkey_s_c k,
+		       unsigned offset_into_extent,
+		       struct bch_io_failures *failed, unsigned flags)
+{
+	struct bch_fs *c = trans->c;
+	struct extent_ptr_decoded pick;
+	struct bch_read_bio *rbio = NULL;
+	struct bch_dev *ca = NULL;
+	struct promote_op *promote = NULL;
+	bool bounce = false, read_full = false, narrow_crcs = false;
+	struct bpos data_pos = bkey_start_pos(k.k);
+	int pick_ret;
+
+	if (bkey_extent_is_inline_data(k.k)) {
+		unsigned bytes = min_t(unsigned, iter.bi_size,
+				       bkey_inline_data_bytes(k.k));
+
+		swap(iter.bi_size, bytes);
+		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
+		swap(iter.bi_size, bytes);
+		bio_advance_iter(&orig->bio, &iter, bytes);
+		zero_fill_bio_iter(&orig->bio, iter);
+		goto out_read_done;
+	}
+retry_pick:
+	pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
+
+	/* hole or reservation - just zero fill: */
+	if (!pick_ret)
+		goto hole;
+
+	if (pick_ret < 0) {
+		bch_err_inum_offset_ratelimited(c,
+				read_pos.inode, read_pos.offset << 9,
+				"no device to read from");
+		goto err;
+	}
+
+	ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+
+	/*
+	 * Stale dirty pointers are treated as IO errors, but @failed isn't
+	 * allocated unless we're in the retry path - so if we're not in the
+	 * retry path, don't check here, it'll be caught in bch2_read_endio()
+	 * and we'll end up in the retry path:
+	 */
+	if ((flags & BCH_READ_IN_RETRY) &&
+	    !pick.ptr.cached &&
+	    unlikely(ptr_stale(ca, &pick.ptr))) {
+		read_from_stale_dirty_pointer(trans, k, pick.ptr);
+		bch2_mark_io_failure(failed, &pick);
+		goto retry_pick;
+	}
+
+	/*
+	 * Unlock the iterator while the btree node's lock is still in
+	 * cache, before doing the IO:
+	 */
+	bch2_trans_unlock(trans);
+
+	if (flags & BCH_READ_NODECODE) {
+		/*
+		 * can happen if we retry, and the extent we were going to read
+		 * has been merged in the meantime:
+		 */
+		if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
+			goto hole;
+
+		iter.bi_size	= pick.crc.compressed_size << 9;
+		goto get_bio;
+	}
+
+	if (!(flags & BCH_READ_LAST_FRAGMENT) ||
+	    bio_flagged(&orig->bio, BIO_CHAIN))
+		flags |= BCH_READ_MUST_CLONE;
+
+	narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
+		bch2_can_narrow_extent_crcs(k, pick.crc);
+
+	if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
+		flags |= BCH_READ_MUST_BOUNCE;
+
+	EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
+
+	if (crc_is_compressed(pick.crc) ||
+	    (pick.crc.csum_type != BCH_CSUM_none &&
+	     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+	      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
+	       (flags & BCH_READ_USER_MAPPED)) ||
+	      (flags & BCH_READ_MUST_BOUNCE)))) {
+		read_full = true;
+		bounce = true;
+	}
+
+	if (orig->opts.promote_target)
+		promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
+					&rbio, &bounce, &read_full);
+
+	if (!read_full) {
+		EBUG_ON(crc_is_compressed(pick.crc));
+		EBUG_ON(pick.crc.csum_type &&
+			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+			 bvec_iter_sectors(iter) != pick.crc.live_size ||
+			 pick.crc.offset ||
+			 offset_into_extent));
+
+		data_pos.offset += offset_into_extent;
+		pick.ptr.offset += pick.crc.offset +
+			offset_into_extent;
+		offset_into_extent		= 0;
+		pick.crc.compressed_size	= bvec_iter_sectors(iter);
+		pick.crc.uncompressed_size	= bvec_iter_sectors(iter);
+		pick.crc.offset			= 0;
+		pick.crc.live_size		= bvec_iter_sectors(iter);
+	}
+get_bio:
+	if (rbio) {
+		/*
+		 * promote already allocated bounce rbio:
+		 * promote needs to allocate a bio big enough for uncompressing
+		 * data in the write path, but we're not going to use it all
+		 * here:
+		 */
+		EBUG_ON(rbio->bio.bi_iter.bi_size <
+		       pick.crc.compressed_size << 9);
+		rbio->bio.bi_iter.bi_size =
+			pick.crc.compressed_size << 9;
+	} else if (bounce) {
+		unsigned sectors = pick.crc.compressed_size;
+
+		rbio = rbio_init(bio_alloc_bioset(NULL,
+						  DIV_ROUND_UP(sectors, PAGE_SECTORS),
+						  0,
+						  GFP_NOFS,
+						  &c->bio_read_split),
+				 orig->opts);
+
+		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
+		rbio->bounce	= true;
+		rbio->split	= true;
+	} else if (flags & BCH_READ_MUST_CLONE) {
+		/*
+		 * Have to clone if there were any splits, due to error
+		 * reporting issues (if a split errored, and retrying didn't
+		 * work, when it reports the error to its parent (us) we don't
+		 * know if the error was from our bio, and we should retry, or
+		 * from the whole bio, in which case we don't want to retry and
+		 * lose the error)
+		 */
+		rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
+						 &c->bio_read_split),
+				 orig->opts);
+		rbio->bio.bi_iter = iter;
+		rbio->split	= true;
+	} else {
+		rbio = orig;
+		rbio->bio.bi_iter = iter;
+		EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
+	}
+
+	EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
+
+	rbio->c			= c;
+	rbio->submit_time	= local_clock();
+	if (rbio->split)
+		rbio->parent	= orig;
+	else
+		rbio->end_io	= orig->bio.bi_end_io;
+	rbio->bvec_iter		= iter;
+	rbio->offset_into_extent= offset_into_extent;
+	rbio->flags		= flags;
+	rbio->have_ioref	= pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
+	rbio->narrow_crcs	= narrow_crcs;
+	rbio->hole		= 0;
+	rbio->retry		= 0;
+	rbio->context		= 0;
+	/* XXX: only initialize this if needed */
+	rbio->devs_have		= bch2_bkey_devs(k);
+	rbio->pick		= pick;
+	rbio->subvol		= orig->subvol;
+	rbio->read_pos		= read_pos;
+	rbio->data_btree	= data_btree;
+	rbio->data_pos		= data_pos;
+	rbio->version		= k.k->version;
+	rbio->promote		= promote;
+	INIT_WORK(&rbio->work, NULL);
+
+	rbio->bio.bi_opf	= orig->bio.bi_opf;
+	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
+	rbio->bio.bi_end_io	= bch2_read_endio;
+
+	if (rbio->bounce)
+		trace_and_count(c, read_bounce, &rbio->bio);
+
+	this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
+	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
+
+	/*
+	 * If it's being moved internally, we don't want to flag it as a cache
+	 * hit:
+	 */
+	if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
+		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
+			PTR_BUCKET_NR(ca, &pick.ptr), READ);
+
+	if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
+		bio_inc_remaining(&orig->bio);
+		trace_and_count(c, read_split, &orig->bio);
+	}
+
+	if (!rbio->pick.idx) {
+		if (!rbio->have_ioref) {
+			bch_err_inum_offset_ratelimited(c,
+					read_pos.inode,
+					read_pos.offset << 9,
+					"no device to read from");
+			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+			goto out;
+		}
+
+		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
+			     bio_sectors(&rbio->bio));
+		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
+
+		if (unlikely(c->opts.no_data_io)) {
+			if (likely(!(flags & BCH_READ_IN_RETRY)))
+				bio_endio(&rbio->bio);
+		} else {
+			if (likely(!(flags & BCH_READ_IN_RETRY)))
+				submit_bio(&rbio->bio);
+			else
+				submit_bio_wait(&rbio->bio);
+		}
+
+		/*
+		 * We just submitted IO which may block, we expect relock fail
+		 * events and shouldn't count them:
+		 */
+		trans->notrace_relock_fail = true;
+	} else {
+		/* Attempting reconstruct read: */
+		if (bch2_ec_read_extent(c, rbio)) {
+			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+			goto out;
+		}
+
+		if (likely(!(flags & BCH_READ_IN_RETRY)))
+			bio_endio(&rbio->bio);
+	}
+out:
+	if (likely(!(flags & BCH_READ_IN_RETRY))) {
+		return 0;
+	} else {
+		int ret;
+
+		rbio->context = RBIO_CONTEXT_UNBOUND;
+		bch2_read_endio(&rbio->bio);
+
+		ret = rbio->retry;
+		rbio = bch2_rbio_free(rbio);
+
+		if (ret == READ_RETRY_AVOID) {
+			bch2_mark_io_failure(failed, &pick);
+			ret = READ_RETRY;
+		}
+
+		if (!ret)
+			goto out_read_done;
+
+		return ret;
+	}
+
+err:
+	if (flags & BCH_READ_IN_RETRY)
+		return READ_ERR;
+
+	orig->bio.bi_status = BLK_STS_IOERR;
+	goto out_read_done;
+
+hole:
+	/*
+	 * won't normally happen in the BCH_READ_NODECODE
+	 * (bch2_move_extent()) path, but if we retry and the extent we wanted
+	 * to read no longer exists we have to signal that:
+	 */
+	if (flags & BCH_READ_NODECODE)
+		orig->hole = true;
+
+	zero_fill_bio_iter(&orig->bio, iter);
+out_read_done:
+	if (flags & BCH_READ_LAST_FRAGMENT)
+		bch2_rbio_done(orig);
+	return 0;
+}
+
+void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
+		 struct bvec_iter bvec_iter, subvol_inum inum,
+		 struct bch_io_failures *failed, unsigned flags)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_buf sk;
+	struct bkey_s_c k;
+	u32 snapshot;
+	int ret;
+
+	BUG_ON(flags & BCH_READ_NODECODE);
+
+	bch2_bkey_buf_init(&sk);
+retry:
+	bch2_trans_begin(trans);
+	iter = (struct btree_iter) { NULL };
+
+	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+	if (ret)
+		goto err;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+			     SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
+			     BTREE_ITER_SLOTS);
+	while (1) {
+		unsigned bytes, sectors, offset_into_extent;
+		enum btree_id data_btree = BTREE_ID_extents;
+
+		/*
+		 * read_extent -> io_time_reset may cause a transaction restart
+		 * without returning an error, we need to check for that here:
+		 */
+		ret = bch2_trans_relock(trans);
+		if (ret)
+			break;
+
+		bch2_btree_iter_set_pos(&iter,
+				POS(inum.inum, bvec_iter.bi_sector));
+
+		k = bch2_btree_iter_peek_slot(&iter);
+		ret = bkey_err(k);
+		if (ret)
+			break;
+
+		offset_into_extent = iter.pos.offset -
+			bkey_start_offset(k.k);
+		sectors = k.k->size - offset_into_extent;
+
+		bch2_bkey_buf_reassemble(&sk, c, k);
+
+		ret = bch2_read_indirect_extent(trans, &data_btree,
+					&offset_into_extent, &sk);
+		if (ret)
+			break;
+
+		k = bkey_i_to_s_c(sk.k);
+
+		/*
+		 * With indirect extents, the amount of data to read is the min
+		 * of the original extent and the indirect extent:
+		 */
+		sectors = min(sectors, k.k->size - offset_into_extent);
+
+		bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
+		swap(bvec_iter.bi_size, bytes);
+
+		if (bvec_iter.bi_size == bytes)
+			flags |= BCH_READ_LAST_FRAGMENT;
+
+		ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
+					 data_btree, k,
+					 offset_into_extent, failed, flags);
+		if (ret)
+			break;
+
+		if (flags & BCH_READ_LAST_FRAGMENT)
+			break;
+
+		swap(bvec_iter.bi_size, bytes);
+		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
+
+		ret = btree_trans_too_many_iters(trans);
+		if (ret)
+			break;
+	}
+err:
+	bch2_trans_iter_exit(trans, &iter);
+
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+	    ret == READ_RETRY ||
+	    ret == READ_RETRY_AVOID)
+		goto retry;
+
+	bch2_trans_put(trans);
+	bch2_bkey_buf_exit(&sk, c);
+
+	if (ret) {
+		bch_err_inum_offset_ratelimited(c, inum.inum,
+						bvec_iter.bi_sector << 9,
+						"read error %i from btree lookup", ret);
+		rbio->bio.bi_status = BLK_STS_IOERR;
+		bch2_rbio_done(rbio);
+	}
+}
+
+void bch2_fs_io_read_exit(struct bch_fs *c)
+{
+	if (c->promote_table.tbl)
+		rhashtable_destroy(&c->promote_table);
+	bioset_exit(&c->bio_read_split);
+	bioset_exit(&c->bio_read);
+}
+
+int bch2_fs_io_read_init(struct bch_fs *c)
+{
+	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
+			BIOSET_NEED_BVECS))
+		return -BCH_ERR_ENOMEM_bio_read_init;
+
+	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
+			BIOSET_NEED_BVECS))
+		return -BCH_ERR_ENOMEM_bio_read_split_init;
+
+	if (rhashtable_init(&c->promote_table, &bch_promote_params))
+		return -BCH_ERR_ENOMEM_promote_table_init;
+
+	return 0;
+}
diff --git a/libbcachefs/io_read.h b/libbcachefs/io_read.h
new file mode 100644
index 00000000..d9c18bb7
--- /dev/null
+++ b/libbcachefs/io_read.h
@@ -0,0 +1,158 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_READ_H
+#define _BCACHEFS_IO_READ_H
+
+#include "bkey_buf.h"
+
+struct bch_read_bio {
+	struct bch_fs		*c;
+	u64			start_time;
+	u64			submit_time;
+
+	/*
+	 * Reads will often have to be split, and if the extent being read from
+	 * was checksummed or compressed we'll also have to allocate bounce
+	 * buffers and copy the data back into the original bio.
+	 *
+	 * If we didn't have to split, we have to save and restore the original
+	 * bi_end_io - @split below indicates which:
+	 */
+	union {
+	struct bch_read_bio	*parent;
+	bio_end_io_t		*end_io;
+	};
+
+	/*
+	 * Saved copy of bio->bi_iter, from submission time - allows us to
+	 * resubmit on IO error, and also to copy data back to the original bio
+	 * when we're bouncing:
+	 */
+	struct bvec_iter	bvec_iter;
+
+	unsigned		offset_into_extent;
+
+	u16			flags;
+	union {
+	struct {
+	u16			bounce:1,
+				split:1,
+				kmalloc:1,
+				have_ioref:1,
+				narrow_crcs:1,
+				hole:1,
+				retry:2,
+				context:2;
+	};
+	u16			_state;
+	};
+
+	struct bch_devs_list	devs_have;
+
+	struct extent_ptr_decoded pick;
+
+	/*
+	 * pos we read from - different from data_pos for indirect extents:
+	 */
+	u32			subvol;
+	struct bpos		read_pos;
+
+	/*
+	 * start pos of data we read (may not be pos of data we want) - for
+	 * promote, narrow extents paths:
+	 */
+	enum btree_id		data_btree;
+	struct bpos		data_pos;
+	struct bversion		version;
+
+	struct promote_op	*promote;
+
+	struct bch_io_opts	opts;
+
+	struct work_struct	work;
+
+	struct bio		bio;
+};
+
+#define to_rbio(_bio)		container_of((_bio), struct bch_read_bio, bio)
+
+struct bch_devs_mask;
+struct cache_promote_op;
+struct extent_ptr_decoded;
+
+int __bch2_read_indirect_extent(struct btree_trans *, unsigned *,
+				struct bkey_buf *);
+
+static inline int bch2_read_indirect_extent(struct btree_trans *trans,
+					    enum btree_id *data_btree,
+					    unsigned *offset_into_extent,
+					    struct bkey_buf *k)
+{
+	if (k->k->k.type != KEY_TYPE_reflink_p)
+		return 0;
+
+	*data_btree = BTREE_ID_reflink;
+	return __bch2_read_indirect_extent(trans, offset_into_extent, k);
+}
+
+enum bch_read_flags {
+	BCH_READ_RETRY_IF_STALE		= 1 << 0,
+	BCH_READ_MAY_PROMOTE		= 1 << 1,
+	BCH_READ_USER_MAPPED		= 1 << 2,
+	BCH_READ_NODECODE		= 1 << 3,
+	BCH_READ_LAST_FRAGMENT		= 1 << 4,
+
+	/* internal: */
+	BCH_READ_MUST_BOUNCE		= 1 << 5,
+	BCH_READ_MUST_CLONE		= 1 << 6,
+	BCH_READ_IN_RETRY		= 1 << 7,
+};
+
+int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *,
+		       struct bvec_iter, struct bpos, enum btree_id,
+		       struct bkey_s_c, unsigned,
+		       struct bch_io_failures *, unsigned);
+
+static inline void bch2_read_extent(struct btree_trans *trans,
+			struct bch_read_bio *rbio, struct bpos read_pos,
+			enum btree_id data_btree, struct bkey_s_c k,
+			unsigned offset_into_extent, unsigned flags)
+{
+	__bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
+			   data_btree, k, offset_into_extent, NULL, flags);
+}
+
+void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
+		 subvol_inum, struct bch_io_failures *, unsigned flags);
+
+static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
+			     subvol_inum inum)
+{
+	struct bch_io_failures failed = { .nr = 0 };
+
+	BUG_ON(rbio->_state);
+
+	rbio->c = c;
+	rbio->start_time = local_clock();
+	rbio->subvol = inum.subvol;
+
+	__bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
+		    BCH_READ_RETRY_IF_STALE|
+		    BCH_READ_MAY_PROMOTE|
+		    BCH_READ_USER_MAPPED);
+}
+
+static inline struct bch_read_bio *rbio_init(struct bio *bio,
+					     struct bch_io_opts opts)
+{
+	struct bch_read_bio *rbio = to_rbio(bio);
+
+	rbio->_state	= 0;
+	rbio->promote	= NULL;
+	rbio->opts	= opts;
+	return rbio;
+}
+
+void bch2_fs_io_read_exit(struct bch_fs *);
+int bch2_fs_io_read_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_IO_READ_H */
diff --git a/libbcachefs/io.c b/libbcachefs/io_write.c
similarity index 53%
rename from libbcachefs/io.c
rename to libbcachefs/io_write.c
index 3c614c86..d2a0de88 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io_write.c
@@ -1,29 +1,24 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Some low level IO code, and hacks for various block layer limitations
- *
  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
  * Copyright 2012 Google, Inc.
  */
 
 #include "bcachefs.h"
-#include "alloc_background.h"
 #include "alloc_foreground.h"
 #include "bkey_buf.h"
 #include "bset.h"
 #include "btree_update.h"
 #include "buckets.h"
 #include "checksum.h"
-#include "compress.h"
 #include "clock.h"
-#include "data_update.h"
+#include "compress.h"
 #include "debug.h"
-#include "disk_groups.h"
 #include "ec.h"
 #include "error.h"
 #include "extent_update.h"
 #include "inode.h"
-#include "io.h"
+#include "io_write.h"
 #include "journal.h"
 #include "keylist.h"
 #include "move.h"
@@ -39,48 +34,8 @@
 #include <linux/random.h>
 #include <linux/sched/mm.h>
 
-const char *bch2_blk_status_to_str(blk_status_t status)
-{
-	if (status == BLK_STS_REMOVED)
-		return "device removed";
-	return blk_status_to_str(status);
-}
-
 #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
 
-static bool bch2_target_congested(struct bch_fs *c, u16 target)
-{
-	const struct bch_devs_mask *devs;
-	unsigned d, nr = 0, total = 0;
-	u64 now = local_clock(), last;
-	s64 congested;
-	struct bch_dev *ca;
-
-	if (!target)
-		return false;
-
-	rcu_read_lock();
-	devs = bch2_target_to_mask(c, target) ?:
-		&c->rw_devs[BCH_DATA_user];
-
-	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
-		ca = rcu_dereference(c->devs[d]);
-		if (!ca)
-			continue;
-
-		congested = atomic_read(&ca->congested);
-		last = READ_ONCE(ca->congested_last);
-		if (time_after64(now, last))
-			congested -= (now - last) >> 12;
-
-		total += max(congested, 0LL);
-		nr++;
-	}
-	rcu_read_unlock();
-
-	return bch2_rand_range(nr * CONGESTED_MAX) < total;
-}
-
 static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
 				       u64 now, int rw)
 {
@@ -136,13 +91,6 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
 	__bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
 }
 
-#else
-
-static bool bch2_target_congested(struct bch_fs *c, u16 target)
-{
-	return false;
-}
-
 #endif
 
 /* Allocate, free from mempool: */
@@ -368,213 +316,13 @@ int bch2_extent_update(struct btree_trans *trans,
 	return 0;
 }
 
-/* Overwrites whatever was present with zeroes: */
-int bch2_extent_fallocate(struct btree_trans *trans,
-			  subvol_inum inum,
-			  struct btree_iter *iter,
-			  unsigned sectors,
-			  struct bch_io_opts opts,
-			  s64 *i_sectors_delta,
-			  struct write_point_specifier write_point)
-{
-	struct bch_fs *c = trans->c;
-	struct disk_reservation disk_res = { 0 };
-	struct closure cl;
-	struct open_buckets open_buckets = { 0 };
-	struct bkey_s_c k;
-	struct bkey_buf old, new;
-	unsigned sectors_allocated = 0;
-	bool have_reservation = false;
-	bool unwritten = opts.nocow &&
-	    c->sb.version >= bcachefs_metadata_version_unwritten_extents;
-	int ret;
-
-	bch2_bkey_buf_init(&old);
-	bch2_bkey_buf_init(&new);
-	closure_init_stack(&cl);
-
-	k = bch2_btree_iter_peek_slot(iter);
-	ret = bkey_err(k);
-	if (ret)
-		return ret;
-
-	sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
-
-	if (!have_reservation) {
-		unsigned new_replicas =
-			max(0, (int) opts.data_replicas -
-			    (int) bch2_bkey_nr_ptrs_fully_allocated(k));
-		/*
-		 * Get a disk reservation before (in the nocow case) calling
-		 * into the allocator:
-		 */
-		ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
-		if (unlikely(ret))
-			goto err;
-
-		bch2_bkey_buf_reassemble(&old, c, k);
-	}
-
-	if (have_reservation) {
-		if (!bch2_extents_match(k, bkey_i_to_s_c(old.k)))
-			goto err;
-
-		bch2_key_resize(&new.k->k, sectors);
-	} else if (!unwritten) {
-		struct bkey_i_reservation *reservation;
-
-		bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
-		reservation = bkey_reservation_init(new.k);
-		reservation->k.p = iter->pos;
-		bch2_key_resize(&reservation->k, sectors);
-		reservation->v.nr_replicas = opts.data_replicas;
-	} else {
-		struct bkey_i_extent *e;
-		struct bch_devs_list devs_have;
-		struct write_point *wp;
-		struct bch_extent_ptr *ptr;
-
-		devs_have.nr = 0;
-
-		bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX);
-
-		e = bkey_extent_init(new.k);
-		e->k.p = iter->pos;
-
-		ret = bch2_alloc_sectors_start_trans(trans,
-				opts.foreground_target,
-				false,
-				write_point,
-				&devs_have,
-				opts.data_replicas,
-				opts.data_replicas,
-				BCH_WATERMARK_normal, 0, &cl, &wp);
-		if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
-			ret = -BCH_ERR_transaction_restart_nested;
-		if (ret)
-			goto err;
-
-		sectors = min(sectors, wp->sectors_free);
-		sectors_allocated = sectors;
-
-		bch2_key_resize(&e->k, sectors);
-
-		bch2_open_bucket_get(c, wp, &open_buckets);
-		bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
-		bch2_alloc_sectors_done(c, wp);
-
-		extent_for_each_ptr(extent_i_to_s(e), ptr)
-			ptr->unwritten = true;
-	}
-
-	have_reservation = true;
-
-	ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
-				 0, i_sectors_delta, true);
-err:
-	if (!ret && sectors_allocated)
-		bch2_increment_clock(c, sectors_allocated, WRITE);
-
-	bch2_open_buckets_put(c, &open_buckets);
-	bch2_disk_reservation_put(c, &disk_res);
-	bch2_bkey_buf_exit(&new, c);
-	bch2_bkey_buf_exit(&old, c);
-
-	if (closure_nr_remaining(&cl) != 1) {
-		bch2_trans_unlock(trans);
-		closure_sync(&cl);
-	}
-
-	return ret;
-}
-
-/*
- * Returns -BCH_ERR_transacton_restart if we had to drop locks:
- */
-int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
-		   subvol_inum inum, u64 end,
-		   s64 *i_sectors_delta)
-{
-	struct bch_fs *c	= trans->c;
-	unsigned max_sectors	= KEY_SIZE_MAX & (~0 << c->block_bits);
-	struct bpos end_pos = POS(inum.inum, end);
-	struct bkey_s_c k;
-	int ret = 0, ret2 = 0;
-	u32 snapshot;
-
-	while (!ret ||
-	       bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-		struct disk_reservation disk_res =
-			bch2_disk_reservation_init(c, 0);
-		struct bkey_i delete;
-
-		if (ret)
-			ret2 = ret;
-
-		bch2_trans_begin(trans);
-
-		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-		if (ret)
-			continue;
-
-		bch2_btree_iter_set_snapshot(iter, snapshot);
-
-		/*
-		 * peek_upto() doesn't have ideal semantics for extents:
-		 */
-		k = bch2_btree_iter_peek_upto(iter, end_pos);
-		if (!k.k)
-			break;
-
-		ret = bkey_err(k);
-		if (ret)
-			continue;
-
-		bkey_init(&delete.k);
-		delete.k.p = iter->pos;
-
-		/* create the biggest key we can */
-		bch2_key_resize(&delete.k, max_sectors);
-		bch2_cut_back(end_pos, &delete);
-
-		ret = bch2_extent_update(trans, inum, iter, &delete,
-				&disk_res, 0, i_sectors_delta, false);
-		bch2_disk_reservation_put(c, &disk_res);
-	}
-
-	return ret ?: ret2;
-}
-
-int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
-		s64 *i_sectors_delta)
-{
-	struct btree_trans trans;
-	struct btree_iter iter;
-	int ret;
-
-	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-	bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
-			     POS(inum.inum, start),
-			     BTREE_ITER_INTENT);
-
-	ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta);
-
-	bch2_trans_iter_exit(&trans, &iter);
-	bch2_trans_exit(&trans);
-
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-		ret = 0;
-
-	return ret;
-}
-
 static int bch2_write_index_default(struct bch_write_op *op)
 {
 	struct bch_fs *c = op->c;
 	struct bkey_buf sk;
 	struct keylist *keys = &op->insert_keys;
 	struct bkey_i *k = bch2_keylist_front(keys);
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	subvol_inum inum = {
 		.subvol = op->subvol,
@@ -585,30 +333,29 @@ static int bch2_write_index_default(struct bch_write_op *op)
 	BUG_ON(!inum.subvol);
 
 	bch2_bkey_buf_init(&sk);
-	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
 
 	do {
-		bch2_trans_begin(&trans);
+		bch2_trans_begin(trans);
 
 		k = bch2_keylist_front(keys);
 		bch2_bkey_buf_copy(&sk, c, k);
 
-		ret = bch2_subvolume_get_snapshot(&trans, inum.subvol,
+		ret = bch2_subvolume_get_snapshot(trans, inum.subvol,
 						  &sk.k->k.p.snapshot);
 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 			continue;
 		if (ret)
 			break;
 
-		bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
 				     bkey_start_pos(&sk.k->k),
 				     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
-		ret = bch2_extent_update(&trans, inum, &iter, sk.k,
+		ret = bch2_extent_update(trans, inum, &iter, sk.k,
 					 &op->res,
 					 op->new_i_size, &op->i_sectors_delta,
 					 op->flags & BCH_WRITE_CHECK_ENOSPC);
-		bch2_trans_iter_exit(&trans, &iter);
+		bch2_trans_iter_exit(trans, &iter);
 
 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 			continue;
@@ -621,7 +368,7 @@ static int bch2_write_index_default(struct bch_write_op *op)
 			bch2_cut_front(iter.pos, k);
 	} while (!bch2_keylist_empty(keys));
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	bch2_bkey_buf_exit(&sk, c);
 
 	return ret;
@@ -741,7 +488,8 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
 }
 
 /**
- * bch_write_index - after a write, update index to point to new data
+ * __bch2_write_index - after a write, update index to point to new data
+ * @op:		bch_write_op to process
  */
 static void __bch2_write_index(struct bch_write_op *op)
 {
@@ -778,10 +526,10 @@ static void __bch2_write_index(struct bch_write_op *op)
 		op->written += sectors_start - keylist_sectors(keys);
 
 		if (ret && !bch2_err_matches(ret, EROFS)) {
-			struct bkey_i *k = bch2_keylist_front(&op->insert_keys);
+			struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
 
 			bch_err_inum_offset_ratelimited(c,
-				k->k.p.inode, k->k.p.offset << 9,
+				insert->k.p.inode, insert->k.p.offset << 9,
 				"write error while doing btree update: %s",
 				bch2_err_str(ret));
 		}
@@ -1182,7 +930,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
 	do {
 		struct bch_extent_crc_unpacked crc = { 0 };
 		struct bversion version = op->version;
-		size_t dst_len, src_len;
+		size_t dst_len = 0, src_len = 0;
 
 		if (page_alloc_failed &&
 		    dst->bi_iter.bi_size  < (wp->sectors_free << 9) &&
@@ -1414,27 +1162,25 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
 static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
 {
 	struct bch_fs *c = op->c;
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_i *orig;
 	struct bkey_s_c k;
 	int ret;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
 	for_each_keylist_key(&op->insert_keys, orig) {
-		ret = for_each_btree_key_upto_commit(&trans, iter, BTREE_ID_extents,
+		ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
 				     bkey_start_pos(&orig->k), orig->k.p,
 				     BTREE_ITER_INTENT, k,
 				     NULL, NULL, BTREE_INSERT_NOFAIL, ({
-			bch2_nocow_write_convert_one_unwritten(&trans, &iter, orig, k, op->new_i_size);
+			bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
 		}));
 
 		if (ret && !bch2_err_matches(ret, EROFS)) {
-			struct bkey_i *k = bch2_keylist_front(&op->insert_keys);
+			struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
 
 			bch_err_inum_offset_ratelimited(c,
-				k->k.p.inode, k->k.p.offset << 9,
+				insert->k.p.inode, insert->k.p.offset << 9,
 				"write error while doing btree update: %s",
 				bch2_err_str(ret));
 		}
@@ -1445,7 +1191,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
 		}
 	}
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 }
 
 static void __bch2_nocow_write_done(struct bch_write_op *op)
@@ -1469,7 +1215,7 @@ static void bch2_nocow_write_done(struct closure *cl)
 static void bch2_nocow_write(struct bch_write_op *op)
 {
 	struct bch_fs *c = op->c;
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	struct bkey_ptrs_c ptrs;
@@ -1486,15 +1232,15 @@ static void bch2_nocow_write(struct bch_write_op *op)
 	if (op->flags & BCH_WRITE_MOVE)
 		return;
 
-	bch2_trans_init(&trans, c, 0, 0);
+	trans = bch2_trans_get(c);
 retry:
-	bch2_trans_begin(&trans);
+	bch2_trans_begin(trans);
 
-	ret = bch2_subvolume_get_snapshot(&trans, op->subvol, &snapshot);
+	ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot);
 	if (unlikely(ret))
 		goto err;
 
-	bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
 			     SPOS(op->pos.inode, op->pos.offset, snapshot),
 			     BTREE_ITER_SLOTS);
 	while (1) {
@@ -1540,7 +1286,7 @@ retry:
 
 		/* Unlock before taking nocow locks, doing IO: */
 		bkey_reassemble(op->insert_keys.top, k);
-		bch2_trans_unlock(&trans);
+		bch2_trans_unlock(trans);
 
 		bch2_cut_front(op->pos, op->insert_keys.top);
 		if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
@@ -1589,7 +1335,7 @@ retry:
 		bch2_btree_iter_advance(&iter);
 	}
 out:
-	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_iter_exit(trans, &iter);
 err:
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
@@ -1604,7 +1350,7 @@ err:
 		op->flags |= BCH_WRITE_DONE;
 	}
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 
 	/* fallback to cow write path? */
 	if (!(op->flags & BCH_WRITE_DONE)) {
@@ -1682,7 +1428,7 @@ again:
 		 * allocations for specific disks may hang arbitrarily long:
 		 */
 		ret = bch2_trans_do(c, NULL, NULL, 0,
-			bch2_alloc_sectors_start_trans(&trans,
+			bch2_alloc_sectors_start_trans(trans,
 				op->target,
 				op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
 				op->write_point,
@@ -1798,7 +1544,8 @@ err:
 }
 
 /**
- * bch_write - handle a write to a cache device or flash only volume
+ * bch2_write() - handle a write to a cache device or flash only volume
+ * @cl:		&bch_write_op->cl
  *
  * This is the starting point for any data to end up in a cache device; it could
  * be from a normal write, or a writeback write, or a write to a flash only
@@ -1899,1140 +1646,14 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
 	printbuf_indent_sub(out, 2);
 }
 
-/* Cache promotion on read */
-
-struct promote_op {
-	struct rcu_head		rcu;
-	u64			start_time;
-
-	struct rhash_head	hash;
-	struct bpos		pos;
-
-	struct data_update	write;
-	struct bio_vec		bi_inline_vecs[0]; /* must be last */
-};
-
-static const struct rhashtable_params bch_promote_params = {
-	.head_offset	= offsetof(struct promote_op, hash),
-	.key_offset	= offsetof(struct promote_op, pos),
-	.key_len	= sizeof(struct bpos),
-};
-
-static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
-				  struct bpos pos,
-				  struct bch_io_opts opts,
-				  unsigned flags)
+void bch2_fs_io_write_exit(struct bch_fs *c)
 {
-	if (!(flags & BCH_READ_MAY_PROMOTE))
-		return false;
-
-	if (!opts.promote_target)
-		return false;
-
-	if (bch2_bkey_has_target(c, k, opts.promote_target))
-		return false;
-
-	if (bkey_extent_is_unwritten(k))
-		return false;
-
-	if (bch2_target_congested(c, opts.promote_target)) {
-		/* XXX trace this */
-		return false;
-	}
-
-	if (rhashtable_lookup_fast(&c->promote_table, &pos,
-				   bch_promote_params))
-		return false;
-
-	return true;
-}
-
-static void promote_free(struct bch_fs *c, struct promote_op *op)
-{
-	int ret;
-
-	bch2_data_update_exit(&op->write);
-
-	ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
-				     bch_promote_params);
-	BUG_ON(ret);
-	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
-	kfree_rcu(op, rcu);
-}
-
-static void promote_done(struct bch_write_op *wop)
-{
-	struct promote_op *op =
-		container_of(wop, struct promote_op, write.op);
-	struct bch_fs *c = op->write.op.c;
-
-	bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
-			       op->start_time);
-	promote_free(c, op);
-}
-
-static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
-{
-	struct bio *bio = &op->write.op.wbio.bio;
-
-	trace_and_count(op->write.op.c, read_promote, &rbio->bio);
-
-	/* we now own pages: */
-	BUG_ON(!rbio->bounce);
-	BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
-
-	memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
-	       sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
-	swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
-
-	bch2_data_update_read_done(&op->write, rbio->pick.crc);
-}
-
-static struct promote_op *__promote_alloc(struct btree_trans *trans,
-					  enum btree_id btree_id,
-					  struct bkey_s_c k,
-					  struct bpos pos,
-					  struct extent_ptr_decoded *pick,
-					  struct bch_io_opts opts,
-					  unsigned sectors,
-					  struct bch_read_bio **rbio)
-{
-	struct bch_fs *c = trans->c;
-	struct promote_op *op = NULL;
-	struct bio *bio;
-	unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
-	int ret;
-
-	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
-		return NULL;
-
-	op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS);
-	if (!op)
-		goto err;
-
-	op->start_time = local_clock();
-	op->pos = pos;
-
-	/*
-	 * We don't use the mempool here because extents that aren't
-	 * checksummed or compressed can be too big for the mempool:
-	 */
-	*rbio = kzalloc(sizeof(struct bch_read_bio) +
-			sizeof(struct bio_vec) * pages,
-			GFP_NOFS);
-	if (!*rbio)
-		goto err;
-
-	rbio_init(&(*rbio)->bio, opts);
-	bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
-
-	if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
-				 GFP_NOFS))
-		goto err;
-
-	(*rbio)->bounce		= true;
-	(*rbio)->split		= true;
-	(*rbio)->kmalloc	= true;
-
-	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
-					  bch_promote_params))
-		goto err;
-
-	bio = &op->write.op.wbio.bio;
-	bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
-
-	ret = bch2_data_update_init(trans, NULL, &op->write,
-			writepoint_hashed((unsigned long) current),
-			opts,
-			(struct data_update_opts) {
-				.target		= opts.promote_target,
-				.extra_replicas	= 1,
-				.write_flags	= BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
-			},
-			btree_id, k);
-	/*
-	 * possible errors: -BCH_ERR_nocow_lock_blocked,
-	 * -BCH_ERR_ENOSPC_disk_reservation:
-	 */
-	if (ret) {
-		ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
-					bch_promote_params);
-		BUG_ON(ret);
-		goto err;
-	}
-
-	op->write.op.end_io = promote_done;
-
-	return op;
-err:
-	if (*rbio)
-		bio_free_pages(&(*rbio)->bio);
-	kfree(*rbio);
-	*rbio = NULL;
-	kfree(op);
-	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
-	return NULL;
-}
-
-noinline
-static struct promote_op *promote_alloc(struct btree_trans *trans,
-					struct bvec_iter iter,
-					struct bkey_s_c k,
-					struct extent_ptr_decoded *pick,
-					struct bch_io_opts opts,
-					unsigned flags,
-					struct bch_read_bio **rbio,
-					bool *bounce,
-					bool *read_full)
-{
-	struct bch_fs *c = trans->c;
-	bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
-	/* data might have to be decompressed in the write path: */
-	unsigned sectors = promote_full
-		? max(pick->crc.compressed_size, pick->crc.live_size)
-		: bvec_iter_sectors(iter);
-	struct bpos pos = promote_full
-		? bkey_start_pos(k.k)
-		: POS(k.k->p.inode, iter.bi_sector);
-	struct promote_op *promote;
-
-	if (!should_promote(c, k, pos, opts, flags))
-		return NULL;
-
-	promote = __promote_alloc(trans,
-				  k.k->type == KEY_TYPE_reflink_v
-				  ? BTREE_ID_reflink
-				  : BTREE_ID_extents,
-				  k, pos, pick, opts, sectors, rbio);
-	if (!promote)
-		return NULL;
-
-	*bounce		= true;
-	*read_full	= promote_full;
-	return promote;
-}
-
-/* Read */
-
-#define READ_RETRY_AVOID	1
-#define READ_RETRY		2
-#define READ_ERR		3
-
-enum rbio_context {
-	RBIO_CONTEXT_NULL,
-	RBIO_CONTEXT_HIGHPRI,
-	RBIO_CONTEXT_UNBOUND,
-};
-
-static inline struct bch_read_bio *
-bch2_rbio_parent(struct bch_read_bio *rbio)
-{
-	return rbio->split ? rbio->parent : rbio;
-}
-
-__always_inline
-static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
-			   enum rbio_context context,
-			   struct workqueue_struct *wq)
-{
-	if (context <= rbio->context) {
-		fn(&rbio->work);
-	} else {
-		rbio->work.func		= fn;
-		rbio->context		= context;
-		queue_work(wq, &rbio->work);
-	}
-}
-
-static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
-{
-	BUG_ON(rbio->bounce && !rbio->split);
-
-	if (rbio->promote)
-		promote_free(rbio->c, rbio->promote);
-	rbio->promote = NULL;
-
-	if (rbio->bounce)
-		bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
-
-	if (rbio->split) {
-		struct bch_read_bio *parent = rbio->parent;
-
-		if (rbio->kmalloc)
-			kfree(rbio);
-		else
-			bio_put(&rbio->bio);
-
-		rbio = parent;
-	}
-
-	return rbio;
-}
-
-/*
- * Only called on a top level bch_read_bio to complete an entire read request,
- * not a split:
- */
-static void bch2_rbio_done(struct bch_read_bio *rbio)
-{
-	if (rbio->start_time)
-		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
-				       rbio->start_time);
-	bio_endio(&rbio->bio);
-}
-
-static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
-				     struct bvec_iter bvec_iter,
-				     struct bch_io_failures *failed,
-				     unsigned flags)
-{
-	struct btree_trans trans;
-	struct btree_iter iter;
-	struct bkey_buf sk;
-	struct bkey_s_c k;
-	int ret;
-
-	flags &= ~BCH_READ_LAST_FRAGMENT;
-	flags |= BCH_READ_MUST_CLONE;
-
-	bch2_bkey_buf_init(&sk);
-	bch2_trans_init(&trans, c, 0, 0);
-
-	bch2_trans_iter_init(&trans, &iter, rbio->data_btree,
-			     rbio->read_pos, BTREE_ITER_SLOTS);
-retry:
-	rbio->bio.bi_status = 0;
-
-	k = bch2_btree_iter_peek_slot(&iter);
-	if (bkey_err(k))
-		goto err;
-
-	bch2_bkey_buf_reassemble(&sk, c, k);
-	k = bkey_i_to_s_c(sk.k);
-	bch2_trans_unlock(&trans);
-
-	if (!bch2_bkey_matches_ptr(c, k,
-				   rbio->pick.ptr,
-				   rbio->data_pos.offset -
-				   rbio->pick.crc.offset)) {
-		/* extent we wanted to read no longer exists: */
-		rbio->hole = true;
-		goto out;
-	}
-
-	ret = __bch2_read_extent(&trans, rbio, bvec_iter,
-				 rbio->read_pos,
-				 rbio->data_btree,
-				 k, 0, failed, flags);
-	if (ret == READ_RETRY)
-		goto retry;
-	if (ret)
-		goto err;
-out:
-	bch2_rbio_done(rbio);
-	bch2_trans_iter_exit(&trans, &iter);
-	bch2_trans_exit(&trans);
-	bch2_bkey_buf_exit(&sk, c);
-	return;
-err:
-	rbio->bio.bi_status = BLK_STS_IOERR;
-	goto out;
-}
-
-static void bch2_rbio_retry(struct work_struct *work)
-{
-	struct bch_read_bio *rbio =
-		container_of(work, struct bch_read_bio, work);
-	struct bch_fs *c	= rbio->c;
-	struct bvec_iter iter	= rbio->bvec_iter;
-	unsigned flags		= rbio->flags;
-	subvol_inum inum = {
-		.subvol = rbio->subvol,
-		.inum	= rbio->read_pos.inode,
-	};
-	struct bch_io_failures failed = { .nr = 0 };
-
-	trace_and_count(c, read_retry, &rbio->bio);
-
-	if (rbio->retry == READ_RETRY_AVOID)
-		bch2_mark_io_failure(&failed, &rbio->pick);
-
-	rbio->bio.bi_status = 0;
-
-	rbio = bch2_rbio_free(rbio);
-
-	flags |= BCH_READ_IN_RETRY;
-	flags &= ~BCH_READ_MAY_PROMOTE;
-
-	if (flags & BCH_READ_NODECODE) {
-		bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
-	} else {
-		flags &= ~BCH_READ_LAST_FRAGMENT;
-		flags |= BCH_READ_MUST_CLONE;
-
-		__bch2_read(c, rbio, iter, inum, &failed, flags);
-	}
-}
-
-static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
-			    blk_status_t error)
-{
-	rbio->retry = retry;
-
-	if (rbio->flags & BCH_READ_IN_RETRY)
-		return;
-
-	if (retry == READ_ERR) {
-		rbio = bch2_rbio_free(rbio);
-
-		rbio->bio.bi_status = error;
-		bch2_rbio_done(rbio);
-	} else {
-		bch2_rbio_punt(rbio, bch2_rbio_retry,
-			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
-	}
-}
-
-static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
-				   struct bch_read_bio *rbio)
-{
-	struct bch_fs *c = rbio->c;
-	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
-	struct bch_extent_crc_unpacked new_crc;
-	struct btree_iter iter;
-	struct bkey_i *new;
-	struct bkey_s_c k;
-	int ret = 0;
-
-	if (crc_is_compressed(rbio->pick.crc))
-		return 0;
-
-	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
-			       BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-	if ((ret = bkey_err(k)))
-		goto out;
-
-	if (bversion_cmp(k.k->version, rbio->version) ||
-	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
-		goto out;
-
-	/* Extent was merged? */
-	if (bkey_start_offset(k.k) < data_offset ||
-	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
-		goto out;
-
-	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
-			rbio->pick.crc, NULL, &new_crc,
-			bkey_start_offset(k.k) - data_offset, k.k->size,
-			rbio->pick.crc.csum_type)) {
-		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
-		ret = 0;
-		goto out;
-	}
-
-	/*
-	 * going to be temporarily appending another checksum entry:
-	 */
-	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
-				 sizeof(struct bch_extent_crc128));
-	if ((ret = PTR_ERR_OR_ZERO(new)))
-		goto out;
-
-	bkey_reassemble(new, k);
-
-	if (!bch2_bkey_narrow_crcs(new, new_crc))
-		goto out;
-
-	ret = bch2_trans_update(trans, &iter, new,
-				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
-out:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
-{
-	bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL,
-		      __bch2_rbio_narrow_crcs(&trans, rbio));
-}
-
-/* Inner part that may run in process context */
-static void __bch2_read_endio(struct work_struct *work)
-{
-	struct bch_read_bio *rbio =
-		container_of(work, struct bch_read_bio, work);
-	struct bch_fs *c	= rbio->c;
-	struct bch_dev *ca	= bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
-	struct bio *src		= &rbio->bio;
-	struct bio *dst		= &bch2_rbio_parent(rbio)->bio;
-	struct bvec_iter dst_iter = rbio->bvec_iter;
-	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
-	struct nonce nonce = extent_nonce(rbio->version, crc);
-	unsigned nofs_flags;
-	struct bch_csum csum;
-	int ret;
-
-	nofs_flags = memalloc_nofs_save();
-
-	/* Reset iterator for checksumming and copying bounced data: */
-	if (rbio->bounce) {
-		src->bi_iter.bi_size		= crc.compressed_size << 9;
-		src->bi_iter.bi_idx		= 0;
-		src->bi_iter.bi_bvec_done	= 0;
-	} else {
-		src->bi_iter			= rbio->bvec_iter;
-	}
-
-	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
-	if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
-		goto csum_err;
-
-	/*
-	 * XXX
-	 * We need to rework the narrow_crcs path to deliver the read completion
-	 * first, and then punt to a different workqueue, otherwise we're
-	 * holding up reads while doing btree updates which is bad for memory
-	 * reclaim.
-	 */
-	if (unlikely(rbio->narrow_crcs))
-		bch2_rbio_narrow_crcs(rbio);
-
-	if (rbio->flags & BCH_READ_NODECODE)
-		goto nodecode;
-
-	/* Adjust crc to point to subset of data we want: */
-	crc.offset     += rbio->offset_into_extent;
-	crc.live_size	= bvec_iter_sectors(rbio->bvec_iter);
-
-	if (crc_is_compressed(crc)) {
-		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
-		if (ret)
-			goto decrypt_err;
-
-		if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
-		    !c->opts.no_data_io)
-			goto decompression_err;
-	} else {
-		/* don't need to decrypt the entire bio: */
-		nonce = nonce_add(nonce, crc.offset << 9);
-		bio_advance(src, crc.offset << 9);
-
-		BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
-		src->bi_iter.bi_size = dst_iter.bi_size;
-
-		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
-		if (ret)
-			goto decrypt_err;
-
-		if (rbio->bounce) {
-			struct bvec_iter src_iter = src->bi_iter;
-
-			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
-		}
-	}
-
-	if (rbio->promote) {
-		/*
-		 * Re encrypt data we decrypted, so it's consistent with
-		 * rbio->crc:
-		 */
-		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
-		if (ret)
-			goto decrypt_err;
-
-		promote_start(rbio->promote, rbio);
-		rbio->promote = NULL;
-	}
-nodecode:
-	if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
-		rbio = bch2_rbio_free(rbio);
-		bch2_rbio_done(rbio);
-	}
-out:
-	memalloc_nofs_restore(nofs_flags);
-	return;
-csum_err:
-	/*
-	 * Checksum error: if the bio wasn't bounced, we may have been
-	 * reading into buffers owned by userspace (that userspace can
-	 * scribble over) - retry the read, bouncing it this time:
-	 */
-	if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
-		rbio->flags |= BCH_READ_MUST_BOUNCE;
-		bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
-		goto out;
-	}
-
-	bch_err_inum_offset_ratelimited(ca,
-		rbio->read_pos.inode,
-		rbio->read_pos.offset << 9,
-		"data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)",
-		rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
-		csum.hi, csum.lo, bch2_csum_types[crc.csum_type]);
-	bch2_io_error(ca);
-	bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
-	goto out;
-decompression_err:
-	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
-					rbio->read_pos.offset << 9,
-					"decompression error");
-	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
-	goto out;
-decrypt_err:
-	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
-					rbio->read_pos.offset << 9,
-					"decrypt error");
-	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
-	goto out;
-}
-
-static void bch2_read_endio(struct bio *bio)
-{
-	struct bch_read_bio *rbio =
-		container_of(bio, struct bch_read_bio, bio);
-	struct bch_fs *c	= rbio->c;
-	struct bch_dev *ca	= bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
-	struct workqueue_struct *wq = NULL;
-	enum rbio_context context = RBIO_CONTEXT_NULL;
-
-	if (rbio->have_ioref) {
-		bch2_latency_acct(ca, rbio->submit_time, READ);
-		percpu_ref_put(&ca->io_ref);
-	}
-
-	if (!rbio->split)
-		rbio->bio.bi_end_io = rbio->end_io;
-
-	if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
-				    rbio->read_pos.inode,
-				    rbio->read_pos.offset,
-				    "data read error: %s",
-			       bch2_blk_status_to_str(bio->bi_status))) {
-		bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
-		return;
-	}
-
-	if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
-	    ptr_stale(ca, &rbio->pick.ptr)) {
-		trace_and_count(c, read_reuse_race, &rbio->bio);
-
-		if (rbio->flags & BCH_READ_RETRY_IF_STALE)
-			bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
-		else
-			bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
-		return;
-	}
-
-	if (rbio->narrow_crcs ||
-	    rbio->promote ||
-	    crc_is_compressed(rbio->pick.crc) ||
-	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
-		context = RBIO_CONTEXT_UNBOUND,	wq = system_unbound_wq;
-	else if (rbio->pick.crc.csum_type)
-		context = RBIO_CONTEXT_HIGHPRI,	wq = system_highpri_wq;
-
-	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
-}
-
-int __bch2_read_indirect_extent(struct btree_trans *trans,
-				unsigned *offset_into_extent,
-				struct bkey_buf *orig_k)
-{
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	u64 reflink_offset;
-	int ret;
-
-	reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
-		*offset_into_extent;
-
-	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
-			       POS(0, reflink_offset), 0);
-	ret = bkey_err(k);
-	if (ret)
-		goto err;
-
-	if (k.k->type != KEY_TYPE_reflink_v &&
-	    k.k->type != KEY_TYPE_indirect_inline_data) {
-		bch_err_inum_offset_ratelimited(trans->c,
-			orig_k->k->k.p.inode,
-			orig_k->k->k.p.offset << 9,
-			"%llu len %u points to nonexistent indirect extent %llu",
-			orig_k->k->k.p.offset,
-			orig_k->k->k.size,
-			reflink_offset);
-		bch2_inconsistent_error(trans->c);
-		ret = -EIO;
-		goto err;
-	}
-
-	*offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
-	bch2_bkey_buf_reassemble(orig_k, trans->c, k);
-err:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
-						   struct bkey_s_c k,
-						   struct bch_extent_ptr ptr)
-{
-	struct bch_fs *c = trans->c;
-	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
-	struct btree_iter iter;
-	struct printbuf buf = PRINTBUF;
-	int ret;
-
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
-			     PTR_BUCKET_POS(c, &ptr),
-			     BTREE_ITER_CACHED);
-
-	prt_printf(&buf, "Attempting to read from stale dirty pointer:");
-	printbuf_indent_add(&buf, 2);
-	prt_newline(&buf);
-
-	bch2_bkey_val_to_text(&buf, c, k);
-	prt_newline(&buf);
-
-	prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
-
-	ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
-	if (!ret) {
-		prt_newline(&buf);
-		bch2_bkey_val_to_text(&buf, c, k);
-	}
-
-	bch2_fs_inconsistent(c, "%s", buf.buf);
-
-	bch2_trans_iter_exit(trans, &iter);
-	printbuf_exit(&buf);
-}
-
-int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
-		       struct bvec_iter iter, struct bpos read_pos,
-		       enum btree_id data_btree, struct bkey_s_c k,
-		       unsigned offset_into_extent,
-		       struct bch_io_failures *failed, unsigned flags)
-{
-	struct bch_fs *c = trans->c;
-	struct extent_ptr_decoded pick;
-	struct bch_read_bio *rbio = NULL;
-	struct bch_dev *ca = NULL;
-	struct promote_op *promote = NULL;
-	bool bounce = false, read_full = false, narrow_crcs = false;
-	struct bpos data_pos = bkey_start_pos(k.k);
-	int pick_ret;
-
-	if (bkey_extent_is_inline_data(k.k)) {
-		unsigned bytes = min_t(unsigned, iter.bi_size,
-				       bkey_inline_data_bytes(k.k));
-
-		swap(iter.bi_size, bytes);
-		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
-		swap(iter.bi_size, bytes);
-		bio_advance_iter(&orig->bio, &iter, bytes);
-		zero_fill_bio_iter(&orig->bio, iter);
-		goto out_read_done;
-	}
-retry_pick:
-	pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
-
-	/* hole or reservation - just zero fill: */
-	if (!pick_ret)
-		goto hole;
-
-	if (pick_ret < 0) {
-		bch_err_inum_offset_ratelimited(c,
-				read_pos.inode, read_pos.offset << 9,
-				"no device to read from");
-		goto err;
-	}
-
-	ca = bch_dev_bkey_exists(c, pick.ptr.dev);
-
-	/*
-	 * Stale dirty pointers are treated as IO errors, but @failed isn't
-	 * allocated unless we're in the retry path - so if we're not in the
-	 * retry path, don't check here, it'll be caught in bch2_read_endio()
-	 * and we'll end up in the retry path:
-	 */
-	if ((flags & BCH_READ_IN_RETRY) &&
-	    !pick.ptr.cached &&
-	    unlikely(ptr_stale(ca, &pick.ptr))) {
-		read_from_stale_dirty_pointer(trans, k, pick.ptr);
-		bch2_mark_io_failure(failed, &pick);
-		goto retry_pick;
-	}
-
-	/*
-	 * Unlock the iterator while the btree node's lock is still in
-	 * cache, before doing the IO:
-	 */
-	bch2_trans_unlock(trans);
-
-	if (flags & BCH_READ_NODECODE) {
-		/*
-		 * can happen if we retry, and the extent we were going to read
-		 * has been merged in the meantime:
-		 */
-		if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
-			goto hole;
-
-		iter.bi_size	= pick.crc.compressed_size << 9;
-		goto get_bio;
-	}
-
-	if (!(flags & BCH_READ_LAST_FRAGMENT) ||
-	    bio_flagged(&orig->bio, BIO_CHAIN))
-		flags |= BCH_READ_MUST_CLONE;
-
-	narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
-		bch2_can_narrow_extent_crcs(k, pick.crc);
-
-	if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
-		flags |= BCH_READ_MUST_BOUNCE;
-
-	EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
-
-	if (crc_is_compressed(pick.crc) ||
-	    (pick.crc.csum_type != BCH_CSUM_none &&
-	     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
-	      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
-	       (flags & BCH_READ_USER_MAPPED)) ||
-	      (flags & BCH_READ_MUST_BOUNCE)))) {
-		read_full = true;
-		bounce = true;
-	}
-
-	if (orig->opts.promote_target)
-		promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
-					&rbio, &bounce, &read_full);
-
-	if (!read_full) {
-		EBUG_ON(crc_is_compressed(pick.crc));
-		EBUG_ON(pick.crc.csum_type &&
-			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
-			 bvec_iter_sectors(iter) != pick.crc.live_size ||
-			 pick.crc.offset ||
-			 offset_into_extent));
-
-		data_pos.offset += offset_into_extent;
-		pick.ptr.offset += pick.crc.offset +
-			offset_into_extent;
-		offset_into_extent		= 0;
-		pick.crc.compressed_size	= bvec_iter_sectors(iter);
-		pick.crc.uncompressed_size	= bvec_iter_sectors(iter);
-		pick.crc.offset			= 0;
-		pick.crc.live_size		= bvec_iter_sectors(iter);
-		offset_into_extent		= 0;
-	}
-get_bio:
-	if (rbio) {
-		/*
-		 * promote already allocated bounce rbio:
-		 * promote needs to allocate a bio big enough for uncompressing
-		 * data in the write path, but we're not going to use it all
-		 * here:
-		 */
-		EBUG_ON(rbio->bio.bi_iter.bi_size <
-		       pick.crc.compressed_size << 9);
-		rbio->bio.bi_iter.bi_size =
-			pick.crc.compressed_size << 9;
-	} else if (bounce) {
-		unsigned sectors = pick.crc.compressed_size;
-
-		rbio = rbio_init(bio_alloc_bioset(NULL,
-						  DIV_ROUND_UP(sectors, PAGE_SECTORS),
-						  0,
-						  GFP_NOFS,
-						  &c->bio_read_split),
-				 orig->opts);
-
-		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
-		rbio->bounce	= true;
-		rbio->split	= true;
-	} else if (flags & BCH_READ_MUST_CLONE) {
-		/*
-		 * Have to clone if there were any splits, due to error
-		 * reporting issues (if a split errored, and retrying didn't
-		 * work, when it reports the error to its parent (us) we don't
-		 * know if the error was from our bio, and we should retry, or
-		 * from the whole bio, in which case we don't want to retry and
-		 * lose the error)
-		 */
-		rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
-						 &c->bio_read_split),
-				 orig->opts);
-		rbio->bio.bi_iter = iter;
-		rbio->split	= true;
-	} else {
-		rbio = orig;
-		rbio->bio.bi_iter = iter;
-		EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
-	}
-
-	EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
-
-	rbio->c			= c;
-	rbio->submit_time	= local_clock();
-	if (rbio->split)
-		rbio->parent	= orig;
-	else
-		rbio->end_io	= orig->bio.bi_end_io;
-	rbio->bvec_iter		= iter;
-	rbio->offset_into_extent= offset_into_extent;
-	rbio->flags		= flags;
-	rbio->have_ioref	= pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
-	rbio->narrow_crcs	= narrow_crcs;
-	rbio->hole		= 0;
-	rbio->retry		= 0;
-	rbio->context		= 0;
-	/* XXX: only initialize this if needed */
-	rbio->devs_have		= bch2_bkey_devs(k);
-	rbio->pick		= pick;
-	rbio->subvol		= orig->subvol;
-	rbio->read_pos		= read_pos;
-	rbio->data_btree	= data_btree;
-	rbio->data_pos		= data_pos;
-	rbio->version		= k.k->version;
-	rbio->promote		= promote;
-	INIT_WORK(&rbio->work, NULL);
-
-	rbio->bio.bi_opf	= orig->bio.bi_opf;
-	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
-	rbio->bio.bi_end_io	= bch2_read_endio;
-
-	if (rbio->bounce)
-		trace_and_count(c, read_bounce, &rbio->bio);
-
-	this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
-	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
-
-	/*
-	 * If it's being moved internally, we don't want to flag it as a cache
-	 * hit:
-	 */
-	if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
-		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
-			PTR_BUCKET_NR(ca, &pick.ptr), READ);
-
-	if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
-		bio_inc_remaining(&orig->bio);
-		trace_and_count(c, read_split, &orig->bio);
-	}
-
-	if (!rbio->pick.idx) {
-		if (!rbio->have_ioref) {
-			bch_err_inum_offset_ratelimited(c,
-					read_pos.inode,
-					read_pos.offset << 9,
-					"no device to read from");
-			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
-			goto out;
-		}
-
-		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
-			     bio_sectors(&rbio->bio));
-		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
-
-		if (unlikely(c->opts.no_data_io)) {
-			if (likely(!(flags & BCH_READ_IN_RETRY)))
-				bio_endio(&rbio->bio);
-		} else {
-			if (likely(!(flags & BCH_READ_IN_RETRY)))
-				submit_bio(&rbio->bio);
-			else
-				submit_bio_wait(&rbio->bio);
-		}
-
-		/*
-		 * We just submitted IO which may block, we expect relock fail
-		 * events and shouldn't count them:
-		 */
-		trans->notrace_relock_fail = true;
-	} else {
-		/* Attempting reconstruct read: */
-		if (bch2_ec_read_extent(c, rbio)) {
-			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
-			goto out;
-		}
-
-		if (likely(!(flags & BCH_READ_IN_RETRY)))
-			bio_endio(&rbio->bio);
-	}
-out:
-	if (likely(!(flags & BCH_READ_IN_RETRY))) {
-		return 0;
-	} else {
-		int ret;
-
-		rbio->context = RBIO_CONTEXT_UNBOUND;
-		bch2_read_endio(&rbio->bio);
-
-		ret = rbio->retry;
-		rbio = bch2_rbio_free(rbio);
-
-		if (ret == READ_RETRY_AVOID) {
-			bch2_mark_io_failure(failed, &pick);
-			ret = READ_RETRY;
-		}
-
-		if (!ret)
-			goto out_read_done;
-
-		return ret;
-	}
-
-err:
-	if (flags & BCH_READ_IN_RETRY)
-		return READ_ERR;
-
-	orig->bio.bi_status = BLK_STS_IOERR;
-	goto out_read_done;
-
-hole:
-	/*
-	 * won't normally happen in the BCH_READ_NODECODE
-	 * (bch2_move_extent()) path, but if we retry and the extent we wanted
-	 * to read no longer exists we have to signal that:
-	 */
-	if (flags & BCH_READ_NODECODE)
-		orig->hole = true;
-
-	zero_fill_bio_iter(&orig->bio, iter);
-out_read_done:
-	if (flags & BCH_READ_LAST_FRAGMENT)
-		bch2_rbio_done(orig);
-	return 0;
-}
-
-void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
-		 struct bvec_iter bvec_iter, subvol_inum inum,
-		 struct bch_io_failures *failed, unsigned flags)
-{
-	struct btree_trans trans;
-	struct btree_iter iter;
-	struct bkey_buf sk;
-	struct bkey_s_c k;
-	u32 snapshot;
-	int ret;
-
-	BUG_ON(flags & BCH_READ_NODECODE);
-
-	bch2_bkey_buf_init(&sk);
-	bch2_trans_init(&trans, c, 0, 0);
-retry:
-	bch2_trans_begin(&trans);
-	iter = (struct btree_iter) { NULL };
-
-	ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
-	if (ret)
-		goto err;
-
-	bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
-			     SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
-			     BTREE_ITER_SLOTS);
-	while (1) {
-		unsigned bytes, sectors, offset_into_extent;
-		enum btree_id data_btree = BTREE_ID_extents;
-
-		/*
-		 * read_extent -> io_time_reset may cause a transaction restart
-		 * without returning an error, we need to check for that here:
-		 */
-		ret = bch2_trans_relock(&trans);
-		if (ret)
-			break;
-
-		bch2_btree_iter_set_pos(&iter,
-				POS(inum.inum, bvec_iter.bi_sector));
-
-		k = bch2_btree_iter_peek_slot(&iter);
-		ret = bkey_err(k);
-		if (ret)
-			break;
-
-		offset_into_extent = iter.pos.offset -
-			bkey_start_offset(k.k);
-		sectors = k.k->size - offset_into_extent;
-
-		bch2_bkey_buf_reassemble(&sk, c, k);
-
-		ret = bch2_read_indirect_extent(&trans, &data_btree,
-					&offset_into_extent, &sk);
-		if (ret)
-			break;
-
-		k = bkey_i_to_s_c(sk.k);
-
-		/*
-		 * With indirect extents, the amount of data to read is the min
-		 * of the original extent and the indirect extent:
-		 */
-		sectors = min(sectors, k.k->size - offset_into_extent);
-
-		bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
-		swap(bvec_iter.bi_size, bytes);
-
-		if (bvec_iter.bi_size == bytes)
-			flags |= BCH_READ_LAST_FRAGMENT;
-
-		ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos,
-					 data_btree, k,
-					 offset_into_extent, failed, flags);
-		if (ret)
-			break;
-
-		if (flags & BCH_READ_LAST_FRAGMENT)
-			break;
-
-		swap(bvec_iter.bi_size, bytes);
-		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
-
-		ret = btree_trans_too_many_iters(&trans);
-		if (ret)
-			break;
-	}
-err:
-	bch2_trans_iter_exit(&trans, &iter);
-
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
-	    ret == READ_RETRY ||
-	    ret == READ_RETRY_AVOID)
-		goto retry;
-
-	bch2_trans_exit(&trans);
-	bch2_bkey_buf_exit(&sk, c);
-
-	if (ret) {
-		bch_err_inum_offset_ratelimited(c, inum.inum,
-						bvec_iter.bi_sector << 9,
-						"read error %i from btree lookup", ret);
-		rbio->bio.bi_status = BLK_STS_IOERR;
-		bch2_rbio_done(rbio);
-	}
-}
-
-void bch2_fs_io_exit(struct bch_fs *c)
-{
-	if (c->promote_table.tbl)
-		rhashtable_destroy(&c->promote_table);
 	mempool_exit(&c->bio_bounce_pages);
 	bioset_exit(&c->bio_write);
-	bioset_exit(&c->bio_read_split);
-	bioset_exit(&c->bio_read);
 }
 
-int bch2_fs_io_init(struct bch_fs *c)
+int bch2_fs_io_write_init(struct bch_fs *c)
 {
-	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
-			BIOSET_NEED_BVECS))
-		return -BCH_ERR_ENOMEM_bio_read_init;
-
-	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
-			BIOSET_NEED_BVECS))
-		return -BCH_ERR_ENOMEM_bio_read_split_init;
-
 	if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
 			BIOSET_NEED_BVECS))
 		return -BCH_ERR_ENOMEM_bio_write_init;
@@ -3044,8 +1665,5 @@ int bch2_fs_io_init(struct bch_fs *c)
 				   PAGE_SIZE, 0))
 		return -BCH_ERR_ENOMEM_bio_bounce_pages_init;
 
-	if (rhashtable_init(&c->promote_table, &bch_promote_params))
-		return -BCH_ERR_ENOMEM_promote_table_init;
-
 	return 0;
 }
diff --git a/libbcachefs/io_write.h b/libbcachefs/io_write.h
new file mode 100644
index 00000000..93231672
--- /dev/null
+++ b/libbcachefs/io_write.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_WRITE_H
+#define _BCACHEFS_IO_WRITE_H
+
+#include "checksum.h"
+#include "io_write_types.h"
+
+#define to_wbio(_bio)			\
+	container_of((_bio), struct bch_write_bio, bio)
+
+void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
+void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+void bch2_latency_acct(struct bch_dev *, u64, int);
+#else
+static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
+#endif
+
+void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
+			       enum bch_data_type, const struct bkey_i *, bool);
+
+#define BCH_WRITE_FLAGS()		\
+	x(ALLOC_NOWAIT)			\
+	x(CACHED)			\
+	x(DATA_ENCODED)			\
+	x(PAGES_STABLE)			\
+	x(PAGES_OWNED)			\
+	x(ONLY_SPECIFIED_DEVS)		\
+	x(WROTE_DATA_INLINE)		\
+	x(FROM_INTERNAL)		\
+	x(CHECK_ENOSPC)			\
+	x(SYNC)				\
+	x(MOVE)				\
+	x(IN_WORKER)			\
+	x(DONE)				\
+	x(IO_ERROR)			\
+	x(CONVERT_UNWRITTEN)
+
+enum __bch_write_flags {
+#define x(f)	__BCH_WRITE_##f,
+	BCH_WRITE_FLAGS()
+#undef x
+};
+
+enum bch_write_flags {
+#define x(f)	BCH_WRITE_##f = BIT(__BCH_WRITE_##f),
+	BCH_WRITE_FLAGS()
+#undef x
+};
+
+static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
+{
+	return op->watermark == BCH_WATERMARK_copygc
+		? op->c->copygc_wq
+		: op->c->btree_update_wq;
+}
+
+int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
+			       struct bkey_i *, bool *, s64 *, s64 *);
+int bch2_extent_update(struct btree_trans *, subvol_inum,
+		       struct btree_iter *, struct bkey_i *,
+		       struct disk_reservation *, u64, s64 *, bool);
+
+static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
+				      struct bch_io_opts opts)
+{
+	op->c			= c;
+	op->end_io		= NULL;
+	op->flags		= 0;
+	op->written		= 0;
+	op->error		= 0;
+	op->csum_type		= bch2_data_checksum_type(c, opts);
+	op->compression_opt	= opts.compression;
+	op->nr_replicas		= 0;
+	op->nr_replicas_required = c->opts.data_replicas_required;
+	op->watermark		= BCH_WATERMARK_normal;
+	op->incompressible	= 0;
+	op->open_buckets.nr	= 0;
+	op->devs_have.nr	= 0;
+	op->target		= 0;
+	op->opts		= opts;
+	op->subvol		= 0;
+	op->pos			= POS_MAX;
+	op->version		= ZERO_VERSION;
+	op->write_point		= (struct write_point_specifier) { 0 };
+	op->res			= (struct disk_reservation) { 0 };
+	op->new_i_size		= U64_MAX;
+	op->i_sectors_delta	= 0;
+	op->devs_need_flush	= NULL;
+}
+
+void bch2_write(struct closure *);
+
+void bch2_write_point_do_index_updates(struct work_struct *);
+
+static inline struct bch_write_bio *wbio_init(struct bio *bio)
+{
+	struct bch_write_bio *wbio = to_wbio(bio);
+
+	memset(&wbio->wbio, 0, sizeof(wbio->wbio));
+	return wbio;
+}
+
+void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *);
+
+void bch2_fs_io_write_exit(struct bch_fs *);
+int bch2_fs_io_write_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_IO_WRITE_H */
diff --git a/libbcachefs/io_types.h b/libbcachefs/io_write_types.h
similarity index 54%
rename from libbcachefs/io_types.h
rename to libbcachefs/io_write_types.h
index 737f16d7..c7f97c2c 100644
--- a/libbcachefs/io_types.h
+++ b/libbcachefs/io_write_types.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_TYPES_H
-#define _BCACHEFS_IO_TYPES_H
+#ifndef _BCACHEFS_IO_WRITE_TYPES_H
+#define _BCACHEFS_IO_WRITE_TYPES_H
 
 #include "alloc_types.h"
 #include "btree_types.h"
@@ -13,75 +13,6 @@
 #include <linux/llist.h>
 #include <linux/workqueue.h>
 
-struct bch_read_bio {
-	struct bch_fs		*c;
-	u64			start_time;
-	u64			submit_time;
-
-	/*
-	 * Reads will often have to be split, and if the extent being read from
-	 * was checksummed or compressed we'll also have to allocate bounce
-	 * buffers and copy the data back into the original bio.
-	 *
-	 * If we didn't have to split, we have to save and restore the original
-	 * bi_end_io - @split below indicates which:
-	 */
-	union {
-	struct bch_read_bio	*parent;
-	bio_end_io_t		*end_io;
-	};
-
-	/*
-	 * Saved copy of bio->bi_iter, from submission time - allows us to
-	 * resubmit on IO error, and also to copy data back to the original bio
-	 * when we're bouncing:
-	 */
-	struct bvec_iter	bvec_iter;
-
-	unsigned		offset_into_extent;
-
-	u16			flags;
-	union {
-	struct {
-	u16			bounce:1,
-				split:1,
-				kmalloc:1,
-				have_ioref:1,
-				narrow_crcs:1,
-				hole:1,
-				retry:2,
-				context:2;
-	};
-	u16			_state;
-	};
-
-	struct bch_devs_list	devs_have;
-
-	struct extent_ptr_decoded pick;
-
-	/*
-	 * pos we read from - different from data_pos for indirect extents:
-	 */
-	u32			subvol;
-	struct bpos		read_pos;
-
-	/*
-	 * start pos of data we read (may not be pos of data we want) - for
-	 * promote, narrow extents paths:
-	 */
-	enum btree_id		data_btree;
-	struct bpos		data_pos;
-	struct bversion		version;
-
-	struct promote_op	*promote;
-
-	struct bch_io_opts	opts;
-
-	struct work_struct	work;
-
-	struct bio		bio;
-};
-
 struct bch_write_bio {
 	struct_group(wbio,
 	struct bch_fs		*c;
@@ -162,4 +93,4 @@ struct bch_write_op {
 	struct bch_write_bio	wbio;
 };
 
-#endif /* _BCACHEFS_IO_TYPES_H */
+#endif /* _BCACHEFS_IO_WRITE_TYPES_H */
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 055920c2..fc3dd5be 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -132,13 +132,21 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
 	return stuck;
 }
 
-/* journal entry close/open: */
-
-void __bch2_journal_buf_put(struct journal *j)
+/*
+ * Final processing when the last reference of a journal buffer has been
+ * dropped. Drop the pin list reference acquired at journal entry open and write
+ * the buffer, if requested.
+ */
+void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 
-	closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
+	lockdep_assert_held(&j->lock);
+
+	if (__bch2_journal_pin_put(j, seq))
+		bch2_journal_reclaim_fast(j);
+	if (write)
+		closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
 }
 
 /*
@@ -204,13 +212,11 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val)
 	buf->data->last_seq	= cpu_to_le64(buf->last_seq);
 	BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq));
 
-	__bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq));
-
 	cancel_delayed_work(&j->write_work);
 
 	bch2_journal_space_available(j);
 
-	bch2_journal_buf_put(j, old.idx);
+	__bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq));
 }
 
 void bch2_journal_halt(struct journal *j)
@@ -588,8 +594,13 @@ out:
 
 /**
  * bch2_journal_flush_seq_async - wait for a journal entry to be written
+ * @j:		journal object
+ * @seq:	seq to flush
+ * @parent:	closure object to wait with
+ * Returns:	1 if @seq has already been flushed, 0 if @seq is being flushed,
+ *		-EIO if @seq will never be flushed
  *
- * like bch2_journal_wait_on_seq, except that it triggers a write immediately if
+ * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if
  * necessary
  */
 int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
@@ -829,12 +840,12 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
 				break;
 
 			ret = bch2_trans_run(c,
-				bch2_trans_mark_metadata_bucket(&trans, ca,
+				bch2_trans_mark_metadata_bucket(trans, ca,
 						ob[nr_got]->bucket, BCH_DATA_journal,
 						ca->mi.bucket_size));
 			if (ret) {
 				bch2_open_bucket_put(c, ob[nr_got]);
-				bch_err(c, "error marking new journal buckets: %s", bch2_err_str(ret));
+				bch_err_msg(c, ret, "marking new journal buckets");
 				break;
 			}
 
@@ -910,7 +921,7 @@ err_unblock:
 	if (ret && !new_fs)
 		for (i = 0; i < nr_got; i++)
 			bch2_trans_run(c,
-				bch2_trans_mark_metadata_bucket(&trans, ca,
+				bch2_trans_mark_metadata_bucket(trans, ca,
 						bu[i], BCH_DATA_free, 0));
 err_free:
 	if (!new_fs)
@@ -944,7 +955,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
 		goto unlock;
 
 	while (ja->nr < nr) {
-		struct disk_reservation disk_res = { 0, 0 };
+		struct disk_reservation disk_res = { 0, 0, 0 };
 
 		/*
 		 * note: journal buckets aren't really counted as _sectors_ used yet, so
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index 008a2e25..491133cc 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -252,9 +252,10 @@ static inline bool journal_entry_empty(struct jset *j)
 	return true;
 }
 
-void __bch2_journal_buf_put(struct journal *);
-
-static inline void bch2_journal_buf_put(struct journal *j, unsigned idx)
+/*
+ * Drop a reference on buffer @idx and return the resulting reservation state;
+ */
+static inline union journal_res_state journal_state_buf_put(struct journal *j, unsigned idx)
 {
 	union journal_res_state s;
 
@@ -264,9 +265,30 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx)
 				    .buf2_count = idx == 2,
 				    .buf3_count = idx == 3,
 				    }).v, &j->reservations.counter);
+	return s;
+}
 
-	if (!journal_state_count(s, idx) && idx == s.unwritten_idx)
-		__bch2_journal_buf_put(j);
+void bch2_journal_buf_put_final(struct journal *, u64, bool);
+
+static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
+{
+	union journal_res_state s;
+
+	s = journal_state_buf_put(j, idx);
+	if (!journal_state_count(s, idx))
+		bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+}
+
+static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
+{
+	union journal_res_state s;
+
+	s = journal_state_buf_put(j, idx);
+	if (!journal_state_count(s, idx)) {
+		spin_lock(&j->lock);
+		bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+		spin_unlock(&j->lock);
+	}
 }
 
 /*
@@ -286,7 +308,7 @@ static inline void bch2_journal_res_put(struct journal *j,
 				       BCH_JSET_ENTRY_btree_keys,
 				       0, 0, 0);
 
-	bch2_journal_buf_put(j, res->idx);
+	bch2_journal_buf_put(j, res->idx, res->seq);
 
 	res->ref = 0;
 }
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 34740dca..6a3d6a37 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -8,7 +8,6 @@
 #include "checksum.h"
 #include "disk_groups.h"
 #include "error.h"
-#include "io.h"
 #include "journal.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
@@ -238,17 +237,17 @@ static void journal_entry_err_msg(struct printbuf *out,
 
 #define journal_entry_err(c, version, jset, entry, msg, ...)		\
 ({									\
-	struct printbuf buf = PRINTBUF;					\
+	struct printbuf _buf = PRINTBUF;				\
 									\
-	journal_entry_err_msg(&buf, version, jset, entry);		\
-	prt_printf(&buf, msg, ##__VA_ARGS__);				\
+	journal_entry_err_msg(&_buf, version, jset, entry);		\
+	prt_printf(&_buf, msg, ##__VA_ARGS__);				\
 									\
 	switch (flags & BKEY_INVALID_WRITE) {				\
 	case READ:							\
-		mustfix_fsck_err(c, "%s", buf.buf);			\
+		mustfix_fsck_err(c, "%s", _buf.buf);			\
 		break;							\
 	case WRITE:							\
-		bch_err(c, "corrupt metadata before write: %s\n", buf.buf);\
+		bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\
 		if (bch2_fs_inconsistent(c)) {				\
 			ret = -BCH_ERR_fsck_errors_not_fixed;		\
 			goto fsck_err;					\
@@ -256,7 +255,7 @@ static void journal_entry_err_msg(struct printbuf *out,
 		break;							\
 	}								\
 									\
-	printbuf_exit(&buf);						\
+	printbuf_exit(&_buf);						\
 	true;								\
 })
 
@@ -1282,7 +1281,7 @@ int bch2_journal_read(struct bch_fs *c,
 			continue;
 
 		for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
-			struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
+			ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
 
 			if (!i->ptrs[ptr].csum_good)
 				bch_err_dev_offset(ca, i->ptrs[ptr].sector,
@@ -1380,16 +1379,21 @@ static void __journal_write_alloc(struct journal *j,
 }
 
 /**
- * journal_next_bucket - move on to the next journal bucket if possible
+ * journal_write_alloc - decide where to write next journal entry
+ *
+ * @j:		journal object
+ * @w:		journal buf (entry to be written)
+ *
+ * Returns: 0 on success, or -EROFS on failure
  */
-static int journal_write_alloc(struct journal *j, struct journal_buf *w,
-			       unsigned sectors)
+static int journal_write_alloc(struct journal *j, struct journal_buf *w)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct bch_devs_mask devs;
 	struct journal_device *ja;
 	struct bch_dev *ca;
 	struct dev_alloc_list devs_sorted;
+	unsigned sectors = vstruct_sectors(w->data, c->block_bits);
 	unsigned target = c->opts.metadata_target ?:
 		c->opts.foreground_target;
 	unsigned i, replicas = 0, replicas_want =
@@ -1550,6 +1554,7 @@ static void journal_write_done(struct closure *cl)
 
 	if (!journal_state_count(new, new.unwritten_idx) &&
 	    journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
+		spin_unlock(&j->lock);
 		closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
 	} else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
 		   new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
@@ -1562,10 +1567,11 @@ static void journal_write_done(struct closure *cl)
 		 * might want to be written now:
 		 */
 
+		spin_unlock(&j->lock);
 		mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
+	} else {
+		spin_unlock(&j->lock);
 	}
-
-	spin_unlock(&j->lock);
 }
 
 static void journal_write_endio(struct bio *bio)
@@ -1813,7 +1819,7 @@ void bch2_journal_write(struct closure *cl)
 
 retry_alloc:
 	spin_lock(&j->lock);
-	ret = journal_write_alloc(j, w, sectors);
+	ret = journal_write_alloc(j, w);
 
 	if (ret && j->can_discard) {
 		spin_unlock(&j->lock);
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index 10e1860d..9a584aaa 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -290,9 +290,8 @@ void bch2_journal_do_discards(struct journal *j)
  * entry, holding it open to ensure it gets replayed during recovery:
  */
 
-static void bch2_journal_reclaim_fast(struct journal *j)
+void bch2_journal_reclaim_fast(struct journal *j)
 {
-	struct journal_entry_pin_list temp;
 	bool popped = false;
 
 	lockdep_assert_held(&j->lock);
@@ -303,7 +302,7 @@ static void bch2_journal_reclaim_fast(struct journal *j)
 	 */
 	while (!fifo_empty(&j->pin) &&
 	       !atomic_read(&fifo_peek_front(&j->pin).count)) {
-		fifo_pop(&j->pin, temp);
+		j->pin.front++;
 		popped = true;
 	}
 
@@ -311,19 +310,16 @@ static void bch2_journal_reclaim_fast(struct journal *j)
 		bch2_journal_space_available(j);
 }
 
-void __bch2_journal_pin_put(struct journal *j, u64 seq)
+bool __bch2_journal_pin_put(struct journal *j, u64 seq)
 {
 	struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
 
-	if (atomic_dec_and_test(&pin_list->count))
-		bch2_journal_reclaim_fast(j);
+	return atomic_dec_and_test(&pin_list->count);
 }
 
 void bch2_journal_pin_put(struct journal *j, u64 seq)
 {
-	struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
-
-	if (atomic_dec_and_test(&pin_list->count)) {
+	if (__bch2_journal_pin_put(j, seq)) {
 		spin_lock(&j->lock);
 		bch2_journal_reclaim_fast(j);
 		spin_unlock(&j->lock);
@@ -419,6 +415,8 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
 
 /**
  * bch2_journal_pin_flush: ensure journal pin callback is no longer running
+ * @j:		journal object
+ * @pin:	pin to flush
  */
 void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
 {
@@ -579,7 +577,11 @@ static u64 journal_seq_to_flush(struct journal *j)
 }
 
 /**
- * bch2_journal_reclaim - free up journal buckets
+ * __bch2_journal_reclaim - free up journal buckets
+ * @j:		journal object
+ * @direct:	direct or background reclaim?
+ * @kicked:	requested to run since we last ran?
+ * Returns:	0 on success, or -EIO if the journal has been shutdown
  *
  * Background journal reclaim writes out btree nodes. It should be run
  * early enough so that we never completely run out of journal buckets.
@@ -758,7 +760,7 @@ int bch2_journal_reclaim_start(struct journal *j)
 			   "bch-reclaim/%s", c->name);
 	ret = PTR_ERR_OR_ZERO(p);
 	if (ret) {
-		bch_err(c, "error creating journal reclaim thread: %s", bch2_err_str(ret));
+		bch_err_msg(c, ret, "creating journal reclaim thread");
 		return ret;
 	}
 
diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h
index 0fd1af12..494d1a6e 100644
--- a/libbcachefs/journal_reclaim.h
+++ b/libbcachefs/journal_reclaim.h
@@ -31,7 +31,8 @@ journal_seq_pin(struct journal *j, u64 seq)
 	return &j->pin.data[seq & j->pin.mask];
 }
 
-void __bch2_journal_pin_put(struct journal *, u64);
+void bch2_journal_reclaim_fast(struct journal *);
+bool __bch2_journal_pin_put(struct journal *, u64);
 void bch2_journal_pin_put(struct journal *, u64);
 void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
 
diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c
index d6b9f2cd..1e1a7940 100644
--- a/libbcachefs/journal_seq_blacklist.c
+++ b/libbcachefs/journal_seq_blacklist.c
@@ -250,20 +250,18 @@ void bch2_blacklist_entries_gc(struct work_struct *work)
 	struct journal_seq_blacklist_table *t;
 	struct bch_sb_field_journal_seq_blacklist *bl;
 	struct journal_seq_blacklist_entry *src, *dst;
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	unsigned i, nr, new_nr;
 	int ret;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
 	for (i = 0; i < BTREE_ID_NR; i++) {
 		struct btree_iter iter;
 		struct btree *b;
 
-		bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN,
+		bch2_trans_node_iter_init(trans, &iter, i, POS_MIN,
 					  0, 0, BTREE_ITER_PREFETCH);
 retry:
-		bch2_trans_begin(&trans);
+		bch2_trans_begin(trans);
 
 		b = bch2_btree_iter_peek_node(&iter);
 
@@ -275,10 +273,10 @@ retry:
 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 			goto retry;
 
-		bch2_trans_iter_exit(&trans, &iter);
+		bch2_trans_iter_exit(trans, &iter);
 	}
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	if (ret)
 		return;
 
diff --git a/libbcachefs/logged_ops.c b/libbcachefs/logged_ops.c
new file mode 100644
index 00000000..1bf19aaa
--- /dev/null
+++ b/libbcachefs/logged_ops.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "error.h"
+#include "io_misc.h"
+#include "logged_ops.h"
+
+struct bch_logged_op_fn {
+	u8		type;
+	int		(*resume)(struct btree_trans *, struct bkey_i *);
+};
+
+static const struct bch_logged_op_fn logged_op_fns[] = {
+#define x(n)		{					\
+	.type		= KEY_TYPE_logged_op_##n,		\
+	.resume		= bch2_resume_logged_op_##n,		\
+},
+	BCH_LOGGED_OPS()
+#undef x
+};
+
+static const struct bch_logged_op_fn *logged_op_fn(enum bch_bkey_type type)
+{
+	for (unsigned i = 0; i < ARRAY_SIZE(logged_op_fns); i++)
+		if (logged_op_fns[i].type == type)
+			return logged_op_fns + i;
+	return NULL;
+}
+
+static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
+			    struct bkey_s_c k)
+{
+	struct bch_fs *c = trans->c;
+	const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type);
+	struct bkey_buf sk;
+	u32 restart_count = trans->restart_count;
+	int ret;
+
+	if (!fn)
+		return 0;
+
+	bch2_bkey_buf_init(&sk);
+	bch2_bkey_buf_reassemble(&sk, c, k);
+
+	ret = fn->resume(trans, sk.k) ?: trans_was_restarted(trans, restart_count);
+
+	bch2_bkey_buf_exit(&sk, c);
+	return ret;
+}
+
+int bch2_resume_logged_ops(struct bch_fs *c)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	ret = bch2_trans_run(c,
+		for_each_btree_key2(trans, iter,
+				BTREE_ID_logged_ops, POS_MIN, BTREE_ITER_PREFETCH, k,
+			resume_logged_op(trans, &iter, k)));
+	if (ret)
+		bch_err_fn(c, ret);
+	return ret;
+}
+
+static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
+{
+	struct btree_iter iter;
+	int ret;
+
+	ret = bch2_bkey_get_empty_slot(trans, &iter, BTREE_ID_logged_ops, POS_MAX);
+	if (ret)
+		return ret;
+
+	k->k.p = iter.pos;
+
+	ret = bch2_trans_update(trans, &iter, k, 0);
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
+{
+	return commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+			 __bch2_logged_op_start(trans, k));
+}
+
+void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
+{
+	int ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+			    bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0));
+	/*
+	 * This needs to be a fatal error because we've left an unfinished
+	 * operation in the logged ops btree.
+	 *
+	 * We should only ever see an error here if the filesystem has already
+	 * been shut down, but make sure of that here:
+	 */
+	if (ret) {
+		struct bch_fs *c = trans->c;
+		struct printbuf buf = PRINTBUF;
+
+		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+		bch2_fs_fatal_error(c, "%s: error deleting logged operation %s: %s",
+				     __func__, buf.buf, bch2_err_str(ret));
+		printbuf_exit(&buf);
+	}
+}
diff --git a/libbcachefs/logged_ops.h b/libbcachefs/logged_ops.h
new file mode 100644
index 00000000..4d1e786a
--- /dev/null
+++ b/libbcachefs/logged_ops.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LOGGED_OPS_H
+#define _BCACHEFS_LOGGED_OPS_H
+
+#include "bkey.h"
+
+#define BCH_LOGGED_OPS()			\
+	x(truncate)				\
+	x(finsert)
+
+static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i *op)
+{
+	return bch2_btree_insert_nonextent(trans, BTREE_ID_logged_ops, op, 0);
+}
+
+int bch2_resume_logged_ops(struct bch_fs *);
+int bch2_logged_op_start(struct btree_trans *, struct bkey_i *);
+void bch2_logged_op_finish(struct btree_trans *, struct bkey_i *);
+
+#endif /* _BCACHEFS_LOGGED_OPS_H */
diff --git a/libbcachefs/lru.c b/libbcachefs/lru.c
index 3e8b8f2f..215a6533 100644
--- a/libbcachefs/lru.c
+++ b/libbcachefs/lru.c
@@ -151,10 +151,10 @@ int bch2_check_lrus(struct bch_fs *c)
 	int ret = 0;
 
 	ret = bch2_trans_run(c,
-		for_each_btree_key_commit(&trans, iter,
+		for_each_btree_key_commit(trans, iter,
 				BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k,
 				NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
-			bch2_check_lru_key(&trans, &iter, k, &last_flushed_pos)));
+			bch2_check_lru_key(trans, &iter, k, &last_flushed_pos)));
 	if (ret)
 		bch_err_fn(c, ret);
 	return ret;
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index 81c8cdba..e3a51f6d 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -10,7 +10,7 @@
 #include "buckets.h"
 #include "errcode.h"
 #include "extents.h"
-#include "io.h"
+#include "io_write.h"
 #include "journal.h"
 #include "keylist.h"
 #include "migrate.h"
@@ -78,34 +78,32 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
 
 static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	enum btree_id id;
 	int ret = 0;
 
-	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
 	for (id = 0; id < BTREE_ID_NR; id++) {
 		if (!btree_type_has_ptrs(id))
 			continue;
 
-		ret = for_each_btree_key_commit(&trans, iter, id, POS_MIN,
+		ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
 				BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
 				NULL, NULL, BTREE_INSERT_NOFAIL,
-			bch2_dev_usrdata_drop_key(&trans, &iter, k, dev_idx, flags));
+			bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags));
 		if (ret)
 			break;
 	}
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 
 	return ret;
 }
 
 static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	struct btree_iter iter;
 	struct closure cl;
 	struct btree *b;
@@ -117,16 +115,16 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 	if (flags & BCH_FORCE_IF_METADATA_LOST)
 		return -EINVAL;
 
+	trans = bch2_trans_get(c);
 	bch2_bkey_buf_init(&k);
-	bch2_trans_init(&trans, c, 0, 0);
 	closure_init_stack(&cl);
 
 	for (id = 0; id < BTREE_ID_NR; id++) {
-		bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0,
+		bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
 					  BTREE_ITER_PREFETCH);
 retry:
 		ret = 0;
-		while (bch2_trans_begin(&trans),
+		while (bch2_trans_begin(trans),
 		       (b = bch2_btree_iter_peek_node(&iter)) &&
 		       !(ret = PTR_ERR_OR_ZERO(b))) {
 			if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx))
@@ -141,15 +139,14 @@ retry:
 				break;
 			}
 
-			ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, 0, false);
+			ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false);
 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
 				ret = 0;
 				continue;
 			}
 
 			if (ret) {
-				bch_err(c, "Error updating btree node key: %s",
-					bch2_err_str(ret));
+				bch_err_msg(c, ret, "updating btree node key");
 				break;
 			}
 next:
@@ -158,7 +155,7 @@ next:
 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 			goto retry;
 
-		bch2_trans_iter_exit(&trans, &iter);
+		bch2_trans_iter_exit(trans, &iter);
 
 		if (ret)
 			goto err;
@@ -167,8 +164,8 @@ next:
 	bch2_btree_interior_updates_flush(c);
 	ret = 0;
 err:
-	bch2_trans_exit(&trans);
 	bch2_bkey_buf_exit(&k, c);
+	bch2_trans_put(trans);
 
 	BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
 
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index fb76a1da..39a14e32 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -14,7 +14,8 @@
 #include "errcode.h"
 #include "error.h"
 #include "inode.h"
-#include "io.h"
+#include "io_read.h"
+#include "io_write.h"
 #include "journal_reclaim.h"
 #include "keylist.h"
 #include "move.h"
@@ -524,7 +525,7 @@ static int __bch2_move_data(struct moving_context *ctxt,
 	struct bch_fs *c = ctxt->c;
 	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
 	struct bkey_buf sk;
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	struct data_update_opts data_opts;
@@ -532,7 +533,6 @@ static int __bch2_move_data(struct moving_context *ctxt,
 	int ret = 0, ret2;
 
 	bch2_bkey_buf_init(&sk);
-	bch2_trans_init(&trans, c, 0, 0);
 
 	if (ctxt->stats) {
 		ctxt->stats->data_type	= BCH_DATA_user;
@@ -540,15 +540,15 @@ static int __bch2_move_data(struct moving_context *ctxt,
 		ctxt->stats->pos	= start;
 	}
 
-	bch2_trans_iter_init(&trans, &iter, btree_id, start,
+	bch2_trans_iter_init(trans, &iter, btree_id, start,
 			     BTREE_ITER_PREFETCH|
 			     BTREE_ITER_ALL_SNAPSHOTS);
 
 	if (ctxt->rate)
 		bch2_ratelimit_reset(ctxt->rate);
 
-	while (!move_ratelimit(&trans, ctxt)) {
-		bch2_trans_begin(&trans);
+	while (!move_ratelimit(trans, ctxt)) {
+		bch2_trans_begin(trans);
 
 		k = bch2_btree_iter_peek(&iter);
 		if (!k.k)
@@ -569,7 +569,7 @@ static int __bch2_move_data(struct moving_context *ctxt,
 		if (!bkey_extent_is_direct_data(k.k))
 			goto next_nondata;
 
-		ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum);
+		ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
 		if (ret)
 			continue;
 
@@ -584,7 +584,7 @@ static int __bch2_move_data(struct moving_context *ctxt,
 		bch2_bkey_buf_reassemble(&sk, c, k);
 		k = bkey_i_to_s_c(sk.k);
 
-		ret2 = bch2_move_extent(&trans, &iter, ctxt, NULL,
+		ret2 = bch2_move_extent(trans, &iter, ctxt, NULL,
 					io_opts, btree_id, k, data_opts);
 		if (ret2) {
 			if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
@@ -592,7 +592,7 @@ static int __bch2_move_data(struct moving_context *ctxt,
 
 			if (ret2 == -ENOMEM) {
 				/* memory allocation failure, wait for some IO to finish */
-				bch2_move_ctxt_wait_for_io(ctxt, &trans);
+				bch2_move_ctxt_wait_for_io(ctxt, trans);
 				continue;
 			}
 
@@ -609,8 +609,8 @@ next_nondata:
 		bch2_btree_iter_advance(&iter);
 	}
 
-	bch2_trans_iter_exit(&trans, &iter);
-	bch2_trans_exit(&trans);
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
 	bch2_bkey_buf_exit(&sk, c);
 
 	return ret;
@@ -627,7 +627,7 @@ int bch2_move_data(struct bch_fs *c,
 {
 	struct moving_context ctxt;
 	enum btree_id id;
-	int ret;
+	int ret = 0;
 
 	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
 
@@ -723,7 +723,6 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
 
 		if (!bp.level) {
 			const struct bch_extent_ptr *ptr;
-			struct bkey_s_c k;
 			unsigned i = 0;
 
 			k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
@@ -826,15 +825,14 @@ int bch2_evacuate_bucket(struct bch_fs *c,
 			 struct write_point_specifier wp,
 			 bool wait_on_copygc)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct moving_context ctxt;
 	int ret;
 
-	bch2_trans_init(&trans, c, 0, 0);
 	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
-	ret = __bch2_evacuate_bucket(&trans, &ctxt, NULL, bucket, gen, data_opts);
+	ret = __bch2_evacuate_bucket(trans, &ctxt, NULL, bucket, gen, data_opts);
 	bch2_moving_ctxt_exit(&ctxt);
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 
 	return ret;
 }
@@ -851,14 +849,13 @@ static int bch2_move_btree(struct bch_fs *c,
 {
 	bool kthread = (current->flags & PF_KTHREAD) != 0;
 	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct btree *b;
 	enum btree_id id;
 	struct data_update_opts data_opts;
 	int ret = 0;
 
-	bch2_trans_init(&trans, c, 0, 0);
 	progress_list_add(c, stats);
 
 	stats->data_type = BCH_DATA_btree;
@@ -871,11 +868,11 @@ static int bch2_move_btree(struct bch_fs *c,
 		if (!bch2_btree_id_root(c, id)->b)
 			continue;
 
-		bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0,
+		bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
 					  BTREE_ITER_PREFETCH);
 retry:
 		ret = 0;
-		while (bch2_trans_begin(&trans),
+		while (bch2_trans_begin(trans),
 		       (b = bch2_btree_iter_peek_node(&iter)) &&
 		       !(ret = PTR_ERR_OR_ZERO(b))) {
 			if (kthread && kthread_should_stop())
@@ -890,7 +887,7 @@ retry:
 			if (!pred(c, arg, b, &io_opts, &data_opts))
 				goto next;
 
-			ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret;
+			ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 				continue;
 			if (ret)
@@ -901,13 +898,13 @@ next:
 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 			goto retry;
 
-		bch2_trans_iter_exit(&trans, &iter);
+		bch2_trans_iter_exit(trans, &iter);
 
 		if (kthread && kthread_should_stop())
 			break;
 	}
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 
 	if (ret)
 		bch_err_fn(c, ret);
diff --git a/libbcachefs/move.h b/libbcachefs/move.h
index c3136abe..cbdd58db 100644
--- a/libbcachefs/move.h
+++ b/libbcachefs/move.h
@@ -2,6 +2,7 @@
 #ifndef _BCACHEFS_MOVE_H
 #define _BCACHEFS_MOVE_H
 
+#include "bcachefs_ioctl.h"
 #include "btree_iter.h"
 #include "buckets.h"
 #include "data_update.h"
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 256431a6..4017120b 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -13,25 +13,17 @@
 #include "btree_write_buffer.h"
 #include "buckets.h"
 #include "clock.h"
-#include "disk_groups.h"
 #include "errcode.h"
 #include "error.h"
-#include "extents.h"
-#include "eytzinger.h"
-#include "io.h"
-#include "keylist.h"
 #include "lru.h"
 #include "move.h"
 #include "movinggc.h"
-#include "super-io.h"
 #include "trace.h"
 
-#include <linux/bsearch.h>
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/math64.h>
 #include <linux/sched/task.h>
-#include <linux/sort.h>
 #include <linux/wait.h>
 
 struct buckets_in_flight {
@@ -156,7 +148,7 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans,
 	struct bch_fs *c = trans->c;
 	struct btree_iter iter;
 	struct bkey_s_c k;
-	size_t nr_to_get = max(16UL, buckets_in_flight->nr / 4);
+	size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
 	size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
 	int ret;
 
@@ -172,7 +164,7 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans,
 				  lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
 				  0, k, ({
 		struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
-		int ret = 0;
+		int ret2 = 0;
 
 		saw++;
 
@@ -181,11 +173,11 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans,
 		else if (bucket_in_flight(buckets_in_flight, b.k))
 			in_flight++;
 		else {
-			ret = darray_push(buckets, b) ?: buckets->nr >= nr_to_get;
-			if (ret >= 0)
+			ret2 = darray_push(buckets, b) ?: buckets->nr >= nr_to_get;
+			if (ret2 >= 0)
 				sectors += b.sectors;
 		}
-		ret;
+		ret2;
 	}));
 
 	pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i",
@@ -242,7 +234,7 @@ err:
 		ret = 0;
 
 	if (ret < 0 && !bch2_err_matches(ret, EROFS))
-		bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret));
+		bch_err_msg(c, ret, "from bch2_move_data()");
 
 	moved = atomic64_read(&ctxt->stats->sectors_moved) - moved;
 	trace_and_count(c, copygc, c, moved, 0, 0, 0);
@@ -308,25 +300,24 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
 static int bch2_copygc_thread(void *arg)
 {
 	struct bch_fs *c = arg;
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	struct moving_context ctxt;
 	struct bch_move_stats move_stats;
 	struct io_clock *clock = &c->io_clock[WRITE];
-	struct buckets_in_flight move_buckets;
+	struct buckets_in_flight buckets;
 	u64 last, wait;
 	int ret = 0;
 
-	memset(&move_buckets, 0, sizeof(move_buckets));
+	memset(&buckets, 0, sizeof(buckets));
 
-	ret = rhashtable_init(&move_buckets.table, &bch_move_bucket_params);
+	ret = rhashtable_init(&buckets.table, &bch_move_bucket_params);
 	if (ret) {
-		bch_err(c, "error allocating copygc buckets in flight: %s",
-			bch2_err_str(ret));
+		bch_err_msg(c, ret, "allocating copygc buckets in flight");
 		return ret;
 	}
 
 	set_freezable();
-	bch2_trans_init(&trans, c, 0, 0);
+	trans = bch2_trans_get(c);
 
 	bch2_move_stats_init(&move_stats, "copygc");
 	bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
@@ -334,16 +325,16 @@ static int bch2_copygc_thread(void *arg)
 			      false);
 
 	while (!ret && !kthread_should_stop()) {
-		bch2_trans_unlock(&trans);
+		bch2_trans_unlock(trans);
 		cond_resched();
 
 		if (!c->copy_gc_enabled) {
-			move_buckets_wait(&trans, &ctxt, &move_buckets, true);
+			move_buckets_wait(trans, &ctxt, &buckets, true);
 			kthread_wait_freezable(c->copy_gc_enabled);
 		}
 
 		if (unlikely(freezing(current))) {
-			move_buckets_wait(&trans, &ctxt, &move_buckets, true);
+			move_buckets_wait(trans, &ctxt, &buckets, true);
 			__refrigerator(false);
 			continue;
 		}
@@ -354,7 +345,7 @@ static int bch2_copygc_thread(void *arg)
 		if (wait > clock->max_slop) {
 			c->copygc_wait_at = last;
 			c->copygc_wait = last + wait;
-			move_buckets_wait(&trans, &ctxt, &move_buckets, true);
+			move_buckets_wait(trans, &ctxt, &buckets, true);
 			trace_and_count(c, copygc_wait, c, wait, last + wait);
 			bch2_kthread_io_clock_wait(clock, last + wait,
 					MAX_SCHEDULE_TIMEOUT);
@@ -364,15 +355,15 @@ static int bch2_copygc_thread(void *arg)
 		c->copygc_wait = 0;
 
 		c->copygc_running = true;
-		ret = bch2_copygc(&trans, &ctxt, &move_buckets);
+		ret = bch2_copygc(trans, &ctxt, &buckets);
 		c->copygc_running = false;
 
 		wake_up(&c->copygc_running_wq);
 	}
 
-	move_buckets_wait(&trans, &ctxt, &move_buckets, true);
-	rhashtable_destroy(&move_buckets.table);
-	bch2_trans_exit(&trans);
+	move_buckets_wait(trans, &ctxt, &buckets, true);
+	rhashtable_destroy(&buckets.table);
+	bch2_trans_put(trans);
 	bch2_moving_ctxt_exit(&ctxt);
 
 	return 0;
@@ -404,7 +395,7 @@ int bch2_copygc_start(struct bch_fs *c)
 	t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
 	ret = PTR_ERR_OR_ZERO(t);
 	if (ret) {
-		bch_err(c, "error creating copygc thread: %s", bch2_err_str(ret));
+		bch_err_msg(c, ret, "creating copygc thread");
 		return ret;
 	}
 
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index 960bb247..739a2ef8 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -471,8 +471,9 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
 			val = "0";
 		}
 
+		/* Unknown options are ignored: */
 		if (id < 0)
-			goto bad_opt;
+			continue;
 
 		if (!(bch2_opt_table[id].flags & OPT_MOUNT))
 			goto bad_opt;
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 8a9db110..c21c258e 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -469,7 +469,7 @@ struct bch_opts {
 #undef x
 };
 
-static const struct bch_opts bch2_opts_default = {
+static const __maybe_unused struct bch_opts bch2_opts_default = {
 #define x(_name, _bits, _mode, _type, _sb_opt, _default, ...)		\
 	._name##_defined = true,					\
 	._name = _default,						\
diff --git a/libbcachefs/printbuf.c b/libbcachefs/printbuf.c
index c41daa18..de41f9a1 100644
--- a/libbcachefs/printbuf.c
+++ b/libbcachefs/printbuf.c
@@ -81,8 +81,10 @@ void bch2_prt_printf(struct printbuf *out, const char *fmt, ...)
 }
 
 /**
- * printbuf_str - returns printbuf's buf as a C string, guaranteed to be null
- * terminated
+ * bch2_printbuf_str() - returns printbuf's buf as a C string, guaranteed to be
+ * null terminated
+ * @buf:	printbuf to terminate
+ * Returns:	Printbuf contents, as a nul terminated C string
  */
 const char *bch2_printbuf_str(const struct printbuf *buf)
 {
@@ -97,8 +99,9 @@ const char *bch2_printbuf_str(const struct printbuf *buf)
 }
 
 /**
- * printbuf_exit - exit a printbuf, freeing memory it owns and poisoning it
+ * bch2_printbuf_exit() - exit a printbuf, freeing memory it owns and poisoning it
  * against accidental use.
+ * @buf:	printbuf to exit
  */
 void bch2_printbuf_exit(struct printbuf *buf)
 {
@@ -120,7 +123,7 @@ void bch2_printbuf_tabstop_pop(struct printbuf *buf)
 }
 
 /*
- * printbuf_tabstop_set - add a tabstop, n spaces from the previous tabstop
+ * bch2_printbuf_tabstop_set() - add a tabstop, n spaces from the previous tabstop
  *
  * @buf: printbuf to control
  * @spaces: number of spaces from previous tabpstop
@@ -144,7 +147,7 @@ int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces)
 }
 
 /**
- * printbuf_indent_add - add to the current indent level
+ * bch2_printbuf_indent_add() - add to the current indent level
  *
  * @buf: printbuf to control
  * @spaces: number of spaces to add to the current indent level
@@ -164,7 +167,7 @@ void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces)
 }
 
 /**
- * printbuf_indent_sub - subtract from the current indent level
+ * bch2_printbuf_indent_sub() - subtract from the current indent level
  *
  * @buf: printbuf to control
  * @spaces: number of spaces to subtract from the current indent level
@@ -227,9 +230,8 @@ static void __prt_tab(struct printbuf *out)
 }
 
 /**
- * prt_tab - Advance printbuf to the next tabstop
- *
- * @buf: printbuf to control
+ * bch2_prt_tab() - Advance printbuf to the next tabstop
+ * @out:	printbuf to control
  *
  * Advance output to the next tabstop by printing spaces.
  */
@@ -267,7 +269,7 @@ static void __prt_tab_rjust(struct printbuf *buf)
 }
 
 /**
- * prt_tab_rjust - Advance printbuf to the next tabstop, right justifying
+ * bch2_prt_tab_rjust() - Advance printbuf to the next tabstop, right justifying
  * previous output
  *
  * @buf: printbuf to control
@@ -284,11 +286,11 @@ void bch2_prt_tab_rjust(struct printbuf *buf)
 }
 
 /**
- * prt_bytes_indented - Print an array of chars, handling embedded control characters
+ * bch2_prt_bytes_indented() - Print an array of chars, handling embedded control characters
  *
- * @out: printbuf to output to
- * @str: string to print
- * @count: number of bytes to print
+ * @out:	output printbuf
+ * @str:	string to print
+ * @count:	number of bytes to print
  *
  * The following contol characters are handled as so:
  *   \n: prt_newline	newline that obeys current indent level
@@ -335,32 +337,38 @@ void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned cou
 }
 
 /**
- * prt_human_readable_u64 - Print out a u64 in human readable units
+ * bch2_prt_human_readable_u64() - Print out a u64 in human readable units
+ * @out:	output printbuf
+ * @v:		integer to print
  *
- * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units
+ * Units of 2^10 (default) or 10^3 are controlled via @out->si_units
  */
-void bch2_prt_human_readable_u64(struct printbuf *buf, u64 v)
+void bch2_prt_human_readable_u64(struct printbuf *out, u64 v)
 {
-	bch2_printbuf_make_room(buf, 10);
-	buf->pos += string_get_size(v, 1, !buf->si_units,
-				    buf->buf + buf->pos,
-				    printbuf_remaining_size(buf));
+	bch2_printbuf_make_room(out, 10);
+	out->pos += string_get_size(v, 1, !out->si_units,
+				    out->buf + out->pos,
+				    printbuf_remaining_size(out));
 }
 
 /**
- * prt_human_readable_s64 - Print out a s64 in human readable units
+ * bch2_prt_human_readable_s64() - Print out a s64 in human readable units
+ * @out:	output printbuf
+ * @v:		integer to print
  *
- * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units
+ * Units of 2^10 (default) or 10^3 are controlled via @out->si_units
  */
-void bch2_prt_human_readable_s64(struct printbuf *buf, s64 v)
+void bch2_prt_human_readable_s64(struct printbuf *out, s64 v)
 {
 	if (v < 0)
-		prt_char(buf, '-');
-	bch2_prt_human_readable_u64(buf, abs(v));
+		prt_char(out, '-');
+	bch2_prt_human_readable_u64(out, abs(v));
 }
 
 /**
- * prt_units_u64 - Print out a u64 according to printbuf unit options
+ * bch2_prt_units_u64() - Print out a u64 according to printbuf unit options
+ * @out:	output printbuf
+ * @v:		integer to print
  *
  * Units are either raw (default), or human reabable units (controlled via
  * @buf->human_readable_units)
@@ -374,7 +382,9 @@ void bch2_prt_units_u64(struct printbuf *out, u64 v)
 }
 
 /**
- * prt_units_s64 - Print out a s64 according to printbuf unit options
+ * bch2_prt_units_s64() - Print out a s64 according to printbuf unit options
+ * @out:	output printbuf
+ * @v:		integer to print
  *
  * Units are either raw (default), or human reabable units (controlled via
  * @buf->human_readable_units)
diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c
index ca99772a..36de2f07 100644
--- a/libbcachefs/quota.c
+++ b/libbcachefs/quota.c
@@ -572,7 +572,7 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans,
 	if (!s_t.master_subvol)
 		goto advance;
 
-	ret = bch2_inode_find_by_inum_trans(trans,
+	ret = bch2_inode_find_by_inum_nowarn_trans(trans,
 				(subvol_inum) {
 					le32_to_cpu(s_t.master_subvol),
 					k.k->p.offset,
@@ -599,7 +599,7 @@ advance:
 int bch2_fs_quota_read(struct bch_fs *c)
 {
 	struct bch_sb_field_quota *sb_quota;
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	int ret;
@@ -614,16 +614,16 @@ int bch2_fs_quota_read(struct bch_fs *c)
 	bch2_sb_quota_read(c);
 	mutex_unlock(&c->sb_lock);
 
-	bch2_trans_init(&trans, c, 0, 0);
+	trans = bch2_trans_get(c);
 
-	ret = for_each_btree_key2(&trans, iter, BTREE_ID_quotas,
+	ret = for_each_btree_key2(trans, iter, BTREE_ID_quotas,
 			POS_MIN, BTREE_ITER_PREFETCH, k,
 		__bch2_quota_set(c, k, NULL)) ?:
-	      for_each_btree_key2(&trans, iter, BTREE_ID_inodes,
+	      for_each_btree_key2(trans, iter, BTREE_ID_inodes,
 			POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
-		bch2_fs_quota_read_inode(&trans, &iter, k));
+		bch2_fs_quota_read_inode(trans, &iter, k));
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 
 	if (ret)
 		bch_err_fn(c, ret);
@@ -786,7 +786,6 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
 {
 	struct bch_fs *c = sb->s_fs_info;
 	struct bch_sb_field_quota *sb_quota;
-	struct bch_memquota_type *q;
 	int ret = 0;
 
 	if (0) {
@@ -810,8 +809,6 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
 	    ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS))
 		return -EINVAL;
 
-	q = &c->quotas[type];
-
 	mutex_lock(&c->sb_lock);
 	sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
 	if (!sb_quota) {
@@ -959,7 +956,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid,
 	new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
 
 	ret = bch2_trans_do(c, NULL, NULL, 0,
-			    bch2_set_quota_trans(&trans, &new_quota, qdq)) ?:
+			    bch2_set_quota_trans(trans, &new_quota, qdq)) ?:
 		__bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq);
 
 	return bch2_err_class(ret);
diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c
index 15ce3ecb..568f1e8e 100644
--- a/libbcachefs/rebalance.c
+++ b/libbcachefs/rebalance.c
@@ -8,8 +8,6 @@
 #include "compress.h"
 #include "disk_groups.h"
 #include "errcode.h"
-#include "extents.h"
-#include "io.h"
 #include "move.h"
 #include "rebalance.h"
 #include "super-io.h"
@@ -350,7 +348,7 @@ int bch2_rebalance_start(struct bch_fs *c)
 	p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
 	ret = PTR_ERR_OR_ZERO(p);
 	if (ret) {
-		bch_err(c, "error creating rebalance thread: %s", bch2_err_str(ret));
+		bch_err_msg(c, ret, "creating rebalance thread");
 		return ret;
 	}
 
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 30efb3c9..1dceb7ee 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -20,6 +20,7 @@
 #include "journal_reclaim.h"
 #include "journal_seq_blacklist.h"
 #include "lru.h"
+#include "logged_ops.h"
 #include "move.h"
 #include "quota.h"
 #include "recovery.h"
@@ -164,7 +165,7 @@ static int bch2_journal_replay(struct bch_fs *c)
 				    (!k->allocated
 				     ? BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim
 				     : 0),
-			     bch2_journal_replay_key(&trans, k));
+			     bch2_journal_replay_key(trans, k));
 		if (ret) {
 			bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s",
 				bch2_btree_ids[k->btree_id], k->level, bch2_err_str(ret));
@@ -422,15 +423,9 @@ static int bch2_initialize_subvolumes(struct bch_fs *c)
 	root_volume.v.snapshot	= cpu_to_le32(U32_MAX);
 	root_volume.v.inode	= cpu_to_le64(BCACHEFS_ROOT_INO);
 
-	ret =   bch2_btree_insert(c, BTREE_ID_snapshot_trees,
-				  &root_tree.k_i,
-				  NULL, NULL, 0) ?:
-		bch2_btree_insert(c, BTREE_ID_snapshots,
-				  &root_snapshot.k_i,
-				  NULL, NULL, 0) ?:
-		bch2_btree_insert(c, BTREE_ID_subvolumes,
-				  &root_volume.k_i,
-				  NULL, NULL, 0);
+	ret =   bch2_btree_insert(c, BTREE_ID_snapshot_trees,	&root_tree.k_i, NULL, 0) ?:
+		bch2_btree_insert(c, BTREE_ID_snapshots,	&root_snapshot.k_i, NULL, 0) ?:
+		bch2_btree_insert(c, BTREE_ID_subvolumes,	&root_volume.k_i, NULL, 0);
 	if (ret)
 		bch_err_fn(c, ret);
 	return ret;
@@ -471,7 +466,7 @@ noinline_for_stack
 static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
 {
 	int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
-				__bch2_fs_upgrade_for_subvolumes(&trans));
+				__bch2_fs_upgrade_for_subvolumes(trans));
 	if (ret)
 		bch_err_fn(c, ret);
 	return ret;
@@ -561,7 +556,7 @@ static void check_version_upgrade(struct bch_fs *c)
 			if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK)
 				prt_str(&buf, "fsck required");
 			else {
-				prt_str(&buf, "running recovery passses: ");
+				prt_str(&buf, "running recovery passes: ");
 				prt_bitflags(&buf, bch2_recovery_passes, recovery_passes);
 			}
 
@@ -1009,9 +1004,7 @@ int bch2_fs_initialize(struct bch_fs *c)
 	bch2_inode_pack(&packed_inode, &root_inode);
 	packed_inode.inode.k.p.snapshot = U32_MAX;
 
-	ret = bch2_btree_insert(c, BTREE_ID_inodes,
-				&packed_inode.inode.k_i,
-				NULL, NULL, 0);
+	ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0);
 	if (ret) {
 		bch_err_msg(c, ret, "creating root directory");
 		goto err;
@@ -1020,7 +1013,7 @@ int bch2_fs_initialize(struct bch_fs *c)
 	bch2_inode_init_early(c, &lostfound_inode);
 
 	ret = bch2_trans_do(c, NULL, NULL, 0,
-		bch2_create_trans(&trans,
+		bch2_create_trans(trans,
 				  BCACHEFS_ROOT_SUBVOL_INUM,
 				  &root_inode, &lostfound_inode,
 				  &lostfound,
diff --git a/libbcachefs/recovery_types.h b/libbcachefs/recovery_types.h
index abf1f834..f3c9ea77 100644
--- a/libbcachefs/recovery_types.h
+++ b/libbcachefs/recovery_types.h
@@ -24,6 +24,7 @@
 	x(check_alloc_to_lru_refs,	PASS_FSCK)						\
 	x(fs_freespace_init,		PASS_ALWAYS|PASS_SILENT)				\
 	x(bucket_gens_init,		0)							\
+	x(resume_logged_ops,		PASS_ALWAYS)						\
 	x(check_snapshot_trees,		PASS_FSCK)						\
 	x(check_snapshots,		PASS_FSCK)						\
 	x(check_subvols,		PASS_FSCK)						\
diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c
index 39f711d5..d77d0ea9 100644
--- a/libbcachefs/reflink.c
+++ b/libbcachefs/reflink.c
@@ -5,9 +5,11 @@
 #include "buckets.h"
 #include "extents.h"
 #include "inode.h"
-#include "io.h"
+#include "io_misc.h"
+#include "io_write.h"
 #include "reflink.h"
 #include "subvolume.h"
+#include "super-io.h"
 
 #include <linux/sched/signal.h>
 
@@ -89,6 +91,9 @@ void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
 	bch2_bkey_ptrs_to_text(out, c, k);
 }
 
+#if 0
+Currently disabled, needs to be debugged:
+
 bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
 {
 	struct bkey_s_reflink_v   l = bkey_s_to_reflink_v(_l);
@@ -96,6 +101,7 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
 
 	return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r);
 }
+#endif
 
 int bch2_trans_mark_reflink_v(struct btree_trans *trans,
 			      enum btree_id btree_id, unsigned level,
@@ -247,7 +253,7 @@ s64 bch2_remap_range(struct bch_fs *c,
 		     u64 remap_sectors,
 		     u64 new_i_size, s64 *i_sectors_delta)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	struct btree_iter dst_iter, src_iter;
 	struct bkey_s_c src_k;
 	struct bkey_buf new_dst, new_src;
@@ -269,11 +275,11 @@ s64 bch2_remap_range(struct bch_fs *c,
 
 	bch2_bkey_buf_init(&new_dst);
 	bch2_bkey_buf_init(&new_src);
-	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
+	trans = bch2_trans_get(c);
 
-	bch2_trans_iter_init(&trans, &src_iter, BTREE_ID_extents, src_start,
+	bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
 			     BTREE_ITER_INTENT);
-	bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start,
+	bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
 			     BTREE_ITER_INTENT);
 
 	while ((ret == 0 ||
@@ -281,21 +287,21 @@ s64 bch2_remap_range(struct bch_fs *c,
 	       bkey_lt(dst_iter.pos, dst_end)) {
 		struct disk_reservation disk_res = { 0 };
 
-		bch2_trans_begin(&trans);
+		bch2_trans_begin(trans);
 
 		if (fatal_signal_pending(current)) {
 			ret = -EINTR;
 			break;
 		}
 
-		ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol,
+		ret = bch2_subvolume_get_snapshot(trans, src_inum.subvol,
 						  &src_snapshot);
 		if (ret)
 			continue;
 
 		bch2_btree_iter_set_snapshot(&src_iter, src_snapshot);
 
-		ret = bch2_subvolume_get_snapshot(&trans, dst_inum.subvol,
+		ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol,
 						  &dst_snapshot);
 		if (ret)
 			continue;
@@ -312,7 +318,7 @@ s64 bch2_remap_range(struct bch_fs *c,
 			continue;
 
 		if (bkey_lt(src_want, src_iter.pos)) {
-			ret = bch2_fpunch_at(&trans, &dst_iter, dst_inum,
+			ret = bch2_fpunch_at(trans, &dst_iter, dst_inum,
 					min(dst_end.offset,
 					    dst_iter.pos.offset +
 					    src_iter.pos.offset - src_want.offset),
@@ -326,7 +332,7 @@ s64 bch2_remap_range(struct bch_fs *c,
 			bch2_bkey_buf_reassemble(&new_src, c, src_k);
 			src_k = bkey_i_to_s_c(new_src.k);
 
-			ret = bch2_make_extent_indirect(&trans, &src_iter,
+			ret = bch2_make_extent_indirect(trans, &src_iter,
 						new_src.k);
 			if (ret)
 				continue;
@@ -354,14 +360,14 @@ s64 bch2_remap_range(struct bch_fs *c,
 				min(src_k.k->p.offset - src_want.offset,
 				    dst_end.offset - dst_iter.pos.offset));
 
-		ret = bch2_extent_update(&trans, dst_inum, &dst_iter,
+		ret = bch2_extent_update(trans, dst_inum, &dst_iter,
 					 new_dst.k, &disk_res,
 					 new_i_size, i_sectors_delta,
 					 true);
 		bch2_disk_reservation_put(c, &disk_res);
 	}
-	bch2_trans_iter_exit(&trans, &dst_iter);
-	bch2_trans_iter_exit(&trans, &src_iter);
+	bch2_trans_iter_exit(trans, &dst_iter);
+	bch2_trans_iter_exit(trans, &src_iter);
 
 	BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end));
 	BUG_ON(bkey_gt(dst_iter.pos, dst_end));
@@ -373,23 +379,23 @@ s64 bch2_remap_range(struct bch_fs *c,
 		struct bch_inode_unpacked inode_u;
 		struct btree_iter inode_iter = { NULL };
 
-		bch2_trans_begin(&trans);
+		bch2_trans_begin(trans);
 
-		ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u,
+		ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u,
 				       dst_inum, BTREE_ITER_INTENT);
 
 		if (!ret2 &&
 		    inode_u.bi_size < new_i_size) {
 			inode_u.bi_size = new_i_size;
-			ret2  = bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
-				bch2_trans_commit(&trans, NULL, NULL,
+			ret2  = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
+				bch2_trans_commit(trans, NULL, NULL,
 						  BTREE_INSERT_NOFAIL);
 		}
 
-		bch2_trans_iter_exit(&trans, &inode_iter);
+		bch2_trans_iter_exit(trans, &inode_iter);
 	} while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	bch2_bkey_buf_exit(&new_src, c);
 	bch2_bkey_buf_exit(&new_dst, c);
 
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
index 5b591c59..dbef41cd 100644
--- a/libbcachefs/replicas.c
+++ b/libbcachefs/replicas.c
@@ -429,7 +429,7 @@ out:
 
 	return ret;
 err:
-	bch_err(c, "error adding replicas entry: %s", bch2_err_str(ret));
+	bch_err_msg(c, ret, "adding replicas entry");
 	goto out;
 }
 
diff --git a/libbcachefs/six.c b/libbcachefs/six.c
index 14cffa68..458a1de0 100644
--- a/libbcachefs/six.c
+++ b/libbcachefs/six.c
@@ -31,7 +31,6 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type);
 #define SIX_LOCK_HELD_intent		(1U << 26)
 #define SIX_LOCK_HELD_write		(1U << 27)
 #define SIX_LOCK_WAITING_read		(1U << (28 + SIX_LOCK_read))
-#define SIX_LOCK_WAITING_intent		(1U << (28 + SIX_LOCK_intent))
 #define SIX_LOCK_WAITING_write		(1U << (28 + SIX_LOCK_write))
 #define SIX_LOCK_NOSPIN			(1U << 31)
 
diff --git a/libbcachefs/snapshot.c b/libbcachefs/snapshot.c
index 03ae280a..cdf9eda2 100644
--- a/libbcachefs/snapshot.c
+++ b/libbcachefs/snapshot.c
@@ -163,8 +163,7 @@ static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
 
 	rcu_assign_pointer(c->snapshots, new);
 	c->snapshot_table_size = new_size;
-	if (old)
-		kvfree_rcu(old);
+	kvfree_rcu_mightsleep(old);
 
 	return &rcu_dereference_protected(c->snapshots, true)->s[idx];
 }
@@ -344,7 +343,7 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
 				       BTREE_ITER_WITH_UPDATES, snapshot, s);
 }
 
-int bch2_snapshot_live(struct btree_trans *trans, u32 id)
+static int bch2_snapshot_live(struct btree_trans *trans, u32 id)
 {
 	struct bch_snapshot v;
 	int ret;
@@ -371,7 +370,7 @@ int bch2_snapshot_live(struct btree_trans *trans, u32 id)
  * it's part of such a linear chain: this correctly sets equivalence classes on
  * startup if we run leaf to root (i.e. in natural key order).
  */
-int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
+static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
 {
 	struct bch_fs *c = trans->c;
 	unsigned i, nr_live = 0, live_idx = 0;
@@ -488,18 +487,18 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
 	bch2_trans_iter_exit(trans, &iter);
 
 	if (!ret && !found) {
-		struct bkey_i_subvolume *s;
+		struct bkey_i_subvolume *u;
 
 		*subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);
 
-		s = bch2_bkey_get_mut_typed(trans, &iter,
+		u = bch2_bkey_get_mut_typed(trans, &iter,
 					    BTREE_ID_subvolumes, POS(0, *subvol_id),
 					    0, subvolume);
-		ret = PTR_ERR_OR_ZERO(s);
+		ret = PTR_ERR_OR_ZERO(u);
 		if (ret)
 			return ret;
 
-		SET_BCH_SUBVOLUME_SNAP(&s->v, false);
+		SET_BCH_SUBVOLUME_SNAP(&u->v, false);
 	}
 
 	return ret;
@@ -591,11 +590,11 @@ int bch2_check_snapshot_trees(struct bch_fs *c)
 	int ret;
 
 	ret = bch2_trans_run(c,
-		for_each_btree_key_commit(&trans, iter,
+		for_each_btree_key_commit(trans, iter,
 			BTREE_ID_snapshot_trees, POS_MIN,
 			BTREE_ITER_PREFETCH, k,
 			NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-		check_snapshot_tree(&trans, &iter, k)));
+		check_snapshot_tree(trans, &iter, k)));
 
 	if (ret)
 		bch_err(c, "error %i checking snapshot trees", ret);
@@ -864,11 +863,11 @@ int bch2_check_snapshots(struct bch_fs *c)
 	 * the parent's depth already be correct:
 	 */
 	ret = bch2_trans_run(c,
-		for_each_btree_key_reverse_commit(&trans, iter,
+		for_each_btree_key_reverse_commit(trans, iter,
 			BTREE_ID_snapshots, POS_MAX,
 			BTREE_ITER_PREFETCH, k,
 			NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-		check_snapshot(&trans, &iter, k)));
+		check_snapshot(trans, &iter, k)));
 	if (ret)
 		bch_err_fn(c, ret);
 	return ret;
@@ -911,7 +910,7 @@ static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
 		swap(s->children[0], s->children[1]);
 }
 
-int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
+static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
 {
 	struct bch_fs *c = trans->c;
 	struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
@@ -1072,6 +1071,10 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
 			goto err;
 
 		new_snapids[i]	= iter.pos.offset;
+
+		mutex_lock(&c->snapshot_table_lock);
+		snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i];
+		mutex_unlock(&c->snapshot_table_lock);
 	}
 err:
 	bch2_trans_iter_exit(trans, &iter);
@@ -1354,7 +1357,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
 
 int bch2_delete_dead_snapshots(struct bch_fs *c)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	struct bkey_s_c_snapshot snap;
@@ -1366,35 +1369,35 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
 	if (!test_bit(BCH_FS_STARTED, &c->flags)) {
 		ret = bch2_fs_read_write_early(c);
 		if (ret) {
-			bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret));
+			bch_err_msg(c, ret, "deleting dead snapshots: error going rw");
 			return ret;
 		}
 	}
 
-	bch2_trans_init(&trans, c, 0, 0);
+	trans = bch2_trans_get(c);
 
 	/*
 	 * For every snapshot node: If we have no live children and it's not
 	 * pointed to by a subvolume, delete it:
 	 */
-	ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots,
+	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots,
 			POS_MIN, 0, k,
 			NULL, NULL, 0,
-		bch2_delete_redundant_snapshot(&trans, &iter, k));
+		bch2_delete_redundant_snapshot(trans, &iter, k));
 	if (ret) {
-		bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret));
+		bch_err_msg(c, ret, "deleting redundant snapshots");
 		goto err;
 	}
 
-	for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
-			   POS_MIN, 0, k,
-		bch2_snapshot_set_equiv(&trans, k));
+	ret = for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
+				  POS_MIN, 0, k,
+		bch2_snapshot_set_equiv(trans, k));
 	if (ret) {
-		bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret));
+		bch_err_msg(c, ret, "in bch2_snapshots_set_equiv");
 		goto err;
 	}
 
-	for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+	for_each_btree_key(trans, iter, BTREE_ID_snapshots,
 			   POS_MIN, 0, k, ret) {
 		if (k.k->type != KEY_TYPE_snapshot)
 			continue;
@@ -1406,7 +1409,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
 				break;
 		}
 	}
-	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_iter_exit(trans, &iter);
 
 	if (ret) {
 		bch_err_msg(c, ret, "walking snapshots");
@@ -1421,16 +1424,16 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
 		if (!btree_type_has_snapshots(id))
 			continue;
 
-		ret = for_each_btree_key_commit(&trans, iter,
+		ret = for_each_btree_key_commit(trans, iter,
 				id, POS_MIN,
 				BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
 				&res, NULL, BTREE_INSERT_NOFAIL,
-			snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
-		      for_each_btree_key_commit(&trans, iter,
+			snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
+		      for_each_btree_key_commit(trans, iter,
 				id, POS_MIN,
 				BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
 				&res, NULL, BTREE_INSERT_NOFAIL,
-			move_key_to_correct_snapshot(&trans, &iter, k));
+			move_key_to_correct_snapshot(trans, &iter, k));
 
 		bch2_disk_reservation_put(c, &res);
 		darray_exit(&equiv_seen);
@@ -1441,7 +1444,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
 		}
 	}
 
-	for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+	for_each_btree_key(trans, iter, BTREE_ID_snapshots,
 			   POS_MIN, 0, k, ret) {
 		u32 snapshot = k.k->p.offset;
 		u32 equiv = bch2_snapshot_equiv(c, snapshot);
@@ -1449,23 +1452,23 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
 		if (equiv != snapshot)
 			snapshot_list_add(c, &deleted_interior, snapshot);
 	}
-	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_iter_exit(trans, &iter);
 
 	/*
 	 * Fixing children of deleted snapshots can't be done completely
 	 * atomically, if we crash between here and when we delete the interior
 	 * nodes some depth fields will be off:
 	 */
-	ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, POS_MIN,
+	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
 				  BTREE_ITER_INTENT, k,
 				  NULL, NULL, BTREE_INSERT_NOFAIL,
-		bch2_fix_child_of_deleted_snapshot(&trans, &iter, k, &deleted_interior));
+		bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
 	if (ret)
 		goto err;
 
 	darray_for_each(deleted, i) {
-		ret = commit_do(&trans, NULL, NULL, 0,
-			bch2_snapshot_node_delete(&trans, *i));
+		ret = commit_do(trans, NULL, NULL, 0,
+			bch2_snapshot_node_delete(trans, *i));
 		if (ret) {
 			bch_err_msg(c, ret, "deleting snapshot %u", *i);
 			goto err;
@@ -1473,8 +1476,8 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
 	}
 
 	darray_for_each(deleted_interior, i) {
-		ret = commit_do(&trans, NULL, NULL, 0,
-			bch2_snapshot_node_delete(&trans, *i));
+		ret = commit_do(trans, NULL, NULL, 0,
+			bch2_snapshot_node_delete(trans, *i));
 		if (ret) {
 			bch_err_msg(c, ret, "deleting snapshot %u", *i);
 			goto err;
@@ -1485,7 +1488,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
 err:
 	darray_exit(&deleted_interior);
 	darray_exit(&deleted);
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	if (ret)
 		bch_err_fn(c, ret);
 	return ret;
@@ -1618,7 +1621,8 @@ int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans,
 {
 	struct bch_fs *c = trans->c;
 	struct bkey_buf sk;
-	int ret;
+	u32 restart_count = trans->restart_count;
+	int ret = 0;
 
 	bch2_bkey_buf_init(&sk);
 	bch2_bkey_buf_reassemble(&sk, c, k);
@@ -1640,7 +1644,8 @@ int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans,
 	}
 
 	bch2_bkey_buf_exit(&sk, c);
-	return ret;
+
+	return ret ?: trans_was_restarted(trans, restart_count);
 }
 
 int bch2_snapshots_read(struct bch_fs *c)
@@ -1650,11 +1655,11 @@ int bch2_snapshots_read(struct bch_fs *c)
 	int ret = 0;
 
 	ret = bch2_trans_run(c,
-		for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
+		for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
 			   POS_MIN, 0, k,
-			bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
-			bch2_snapshot_set_equiv(&trans, k)) ?:
-		for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
+			bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
+			bch2_snapshot_set_equiv(trans, k)) ?:
+		for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
 			   POS_MIN, 0, k,
 			   (set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
 	if (ret)
diff --git a/libbcachefs/snapshot.h b/libbcachefs/snapshot.h
index dabc9b9d..de215d9d 100644
--- a/libbcachefs/snapshot.h
+++ b/libbcachefs/snapshot.h
@@ -235,8 +235,6 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
 			 struct bch_snapshot *s);
 int bch2_snapshot_get_subvol(struct btree_trans *, u32,
 			     struct bch_subvolume *);
-int bch2_snapshot_live(struct btree_trans *trans, u32 id);
-int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k);
 
 /* only exported for tests: */
 int bch2_snapshot_node_create(struct btree_trans *, u32,
diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c
index 0214a98d..caf2dd7d 100644
--- a/libbcachefs/subvolume.c
+++ b/libbcachefs/subvolume.c
@@ -41,8 +41,7 @@ static int check_subvol(struct btree_trans *trans,
 
 		ret = bch2_subvolume_delete(trans, iter->pos.offset);
 		if (ret)
-			bch_err(c, "error deleting subvolume %llu: %s",
-				iter->pos.offset, bch2_err_str(ret));
+			bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
 		return ret ?: -BCH_ERR_transaction_restart_nested;
 	}
 
@@ -87,10 +86,10 @@ int bch2_check_subvols(struct bch_fs *c)
 	int ret;
 
 	ret = bch2_trans_run(c,
-		for_each_btree_key_commit(&trans, iter,
+		for_each_btree_key_commit(trans, iter,
 			BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
 			NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-		check_subvol(&trans, &iter, k)));
+		check_subvol(trans, &iter, k)));
 	if (ret)
 		bch_err_fn(c, ret);
 	return ret;
@@ -99,7 +98,7 @@ int bch2_check_subvols(struct bch_fs *c)
 /* Subvolumes: */
 
 int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k,
-			   unsigned flags, struct printbuf *err)
+			   enum bkey_invalid_flags flags, struct printbuf *err)
 {
 	if (bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
 	    bkey_gt(k.k->p, SUBVOL_POS_MAX)) {
@@ -294,9 +293,9 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor
 		bch2_evict_subvolume_inodes(c, &s);
 
 		for (id = s.data; id < s.data + s.nr; id++) {
-			ret = bch2_trans_run(c, bch2_subvolume_delete(&trans, *id));
+			ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id));
 			if (ret) {
-				bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret));
+				bch_err_msg(c, ret, "deleting subvolume %u", *id);
 				break;
 			}
 		}
diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h
index 8d4c50f4..bb14f92e 100644
--- a/libbcachefs/subvolume.h
+++ b/libbcachefs/subvolume.h
@@ -10,7 +10,7 @@ enum bkey_invalid_flags;
 int bch2_check_subvols(struct bch_fs *);
 
 int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c,
-			   unsigned, struct printbuf *);
+			   enum bkey_invalid_flags, struct printbuf *);
 void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_subvolume ((struct bkey_ops) {		\
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index b6021b73..c9bf342d 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -6,7 +6,6 @@
 #include "disk_groups.h"
 #include "ec.h"
 #include "error.h"
-#include "io.h"
 #include "journal.h"
 #include "journal_sb.h"
 #include "journal_seq_blacklist.h"
@@ -23,6 +22,9 @@
 #include <linux/backing-dev.h>
 #include <linux/sort.h>
 
+static const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
+};
+
 struct bch2_metadata_version {
 	u16		version;
 	const char	*name;
@@ -161,7 +163,8 @@ void bch2_free_super(struct bch_sb_handle *sb)
 {
 	kfree(sb->bio);
 	if (!IS_ERR_OR_NULL(sb->bdev))
-		blkdev_put(sb->bdev, sb->mode);
+		blkdev_put(sb->bdev, sb->holder);
+	kfree(sb->holder);
 
 	kfree(sb->sb);
 	memset(sb, 0, sizeof(*sb));
@@ -182,7 +185,7 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
 	if (sb->sb && sb->buffer_size >= new_buffer_size)
 		return 0;
 
-	if (sb->have_layout) {
+	if (sb->sb && sb->have_layout) {
 		u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
 
 		if (new_bytes > max_bytes) {
@@ -243,9 +246,9 @@ struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
 		/* XXX: we're not checking that offline device have enough space */
 
 		for_each_online_member(ca, c, i) {
-			struct bch_sb_handle *sb = &ca->disk_sb;
+			struct bch_sb_handle *dev_sb = &ca->disk_sb;
 
-			if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
+			if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
 				percpu_ref_put(&ca->ref);
 				return NULL;
 			}
@@ -381,7 +384,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
 	}
 
 	if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) {
-		prt_printf(out, "Bad intenal UUID (got zeroes)");
+		prt_printf(out, "Bad internal UUID (got zeroes)");
 		return -BCH_ERR_invalid_sb_uuid;
 	}
 
@@ -664,27 +667,30 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
 retry:
 #endif
 	memset(sb, 0, sizeof(*sb));
-	sb->mode	= FMODE_READ;
+	sb->mode	= BLK_OPEN_READ;
 	sb->have_bio	= true;
+	sb->holder	= kmalloc(1, GFP_KERNEL);
+	if (!sb->holder)
+		return -ENOMEM;
 
 #ifndef __KERNEL__
 	if (opt_get(*opts, direct_io) == false)
-		sb->mode |= FMODE_BUFFERED;
+		sb->mode |= BLK_OPEN_BUFFERED;
 #endif
 
 	if (!opt_get(*opts, noexcl))
-		sb->mode |= FMODE_EXCL;
+		sb->mode |= BLK_OPEN_EXCL;
 
 	if (!opt_get(*opts, nochanges))
-		sb->mode |= FMODE_WRITE;
+		sb->mode |= BLK_OPEN_WRITE;
 
-	sb->bdev = blkdev_get_by_path(path, sb->mode, sb);
+	sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
 	if (IS_ERR(sb->bdev) &&
 	    PTR_ERR(sb->bdev) == -EACCES &&
 	    opt_get(*opts, read_only)) {
-		sb->mode &= ~FMODE_WRITE;
+		sb->mode &= ~BLK_OPEN_WRITE;
 
-		sb->bdev = blkdev_get_by_path(path, sb->mode, sb);
+		sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
 		if (!IS_ERR(sb->bdev))
 			opt_set(*opts, nochanges, true);
 	}
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index e7dbc31b..e94a63a2 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -35,7 +35,8 @@
 #include "fs-io-direct.h"
 #include "fsck.h"
 #include "inode.h"
-#include "io.h"
+#include "io_read.h"
+#include "io_write.h"
 #include "journal.h"
 #include "journal_reclaim.h"
 #include "journal_seq_blacklist.h"
@@ -68,6 +69,7 @@
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+MODULE_DESCRIPTION("bcachefs filesystem");
 
 #define KTYPE(type)							\
 static const struct attribute_group type ## _group = {			\
@@ -421,6 +423,10 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
 		return ret;
 	}
 
+	ret = bch2_journal_reclaim_start(&c->journal);
+	if (ret)
+		goto err;
+
 	if (!early) {
 		ret = bch2_fs_read_write_late(c);
 		if (ret)
@@ -430,7 +436,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
 #ifndef BCH_WRITE_REF_DEBUG
 	percpu_ref_reinit(&c->writes);
 #else
-	for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) {
+	for (i = 0; i < BCH_WRITE_REF_NR; i++) {
 		BUG_ON(atomic_long_read(&c->writes[i]));
 		atomic_long_inc(&c->writes[i]);
 	}
@@ -465,7 +471,6 @@ int bch2_fs_read_write_early(struct bch_fs *c)
 static void __bch2_fs_free(struct bch_fs *c)
 {
 	unsigned i;
-	int cpu;
 
 	for (i = 0; i < BCH_TIME_STAT_NR; i++)
 		bch2_time_stats_exit(&c->times[i]);
@@ -479,7 +484,8 @@ static void __bch2_fs_free(struct bch_fs *c)
 	bch2_fs_fsio_exit(c);
 	bch2_fs_ec_exit(c);
 	bch2_fs_encryption_exit(c);
-	bch2_fs_io_exit(c);
+	bch2_fs_io_write_exit(c);
+	bch2_fs_io_read_exit(c);
 	bch2_fs_buckets_waiting_for_journal_exit(c);
 	bch2_fs_btree_interior_update_exit(c);
 	bch2_fs_btree_iter_exit(c);
@@ -496,12 +502,7 @@ static void __bch2_fs_free(struct bch_fs *c)
 	percpu_free_rwsem(&c->mark_lock);
 	free_percpu(c->online_reserved);
 
-	if (c->btree_paths_bufs)
-		for_each_possible_cpu(cpu)
-			kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path);
-
 	darray_exit(&c->btree_roots_extra);
-	free_percpu(c->btree_paths_bufs);
 	free_percpu(c->pcpu);
 	mempool_exit(&c->large_bkey_pool);
 	mempool_exit(&c->btree_bounce_pool);
@@ -581,8 +582,6 @@ void bch2_fs_free(struct bch_fs *c)
 {
 	unsigned i;
 
-	BUG_ON(!test_bit(BCH_FS_STOPPING, &c->flags));
-
 	mutex_lock(&bch_fs_list_lock);
 	list_del(&c->list);
 	mutex_unlock(&bch_fs_list_lock);
@@ -787,6 +786,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
 	if (c->opts.inodes_use_key_cache)
 		c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
+	c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops;
 
 	c->block_bits		= ilog2(block_sectors(c));
 	c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
@@ -824,7 +824,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 			BIOSET_NEED_BVECS) ||
 	    !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
 	    !(c->online_reserved = alloc_percpu(u64)) ||
-	    !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) ||
 	    mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
 					btree_bytes(c)) ||
 	    mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
@@ -846,13 +845,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	    bch2_fs_buckets_waiting_for_journal_init(c) ?:
 	    bch2_fs_btree_write_buffer_init(c) ?:
 	    bch2_fs_subvolumes_init(c) ?:
-	    bch2_fs_io_init(c) ?:
+	    bch2_fs_io_read_init(c) ?:
+	    bch2_fs_io_write_init(c) ?:
 	    bch2_fs_nocow_locking_init(c) ?:
 	    bch2_fs_encryption_init(c) ?:
 	    bch2_fs_compress_init(c) ?:
 	    bch2_fs_ec_init(c) ?:
 	    bch2_fs_fsio_init(c) ?:
-	    bch2_fs_fs_io_buffered_init(c);
+	    bch2_fs_fs_io_buffered_init(c) ?:
 	    bch2_fs_fs_io_direct_init(c);
 	if (ret)
 		goto err;
@@ -990,7 +990,7 @@ out:
 	up_write(&c->state_lock);
 	return ret;
 err:
-	bch_err(c, "error starting filesystem: %s", bch2_err_str(ret));
+	bch_err_msg(c, ret, "starting filesystem");
 	goto out;
 }
 
@@ -1237,8 +1237,6 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
 
 	/* Commit: */
 	ca->disk_sb = *sb;
-	if (sb->mode & FMODE_EXCL)
-		ca->disk_sb.bdev->bd_holder = ca;
 	memset(sb, 0, sizeof(*sb));
 
 	ca->dev = ca->disk_sb.bdev->bd_dev;
@@ -1457,7 +1455,7 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
 		bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
 					BTREE_TRIGGER_NORUN, NULL);
 	if (ret)
-		bch_err(c, "error removing dev alloc info: %s", bch2_err_str(ret));
+		bch_err_msg(c, ret, "removing dev alloc info");
 
 	return ret;
 }
@@ -1486,31 +1484,31 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
 
 	ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
 	if (ret) {
-		bch_err(ca, "Remove failed: error dropping data: %s", bch2_err_str(ret));
+		bch_err_msg(ca, ret, "dropping data");
 		goto err;
 	}
 
 	ret = bch2_dev_remove_alloc(c, ca);
 	if (ret) {
-		bch_err(ca, "Remove failed, error deleting alloc info");
+		bch_err_msg(ca, ret, "deleting alloc info");
 		goto err;
 	}
 
 	ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
 	if (ret) {
-		bch_err(ca, "Remove failed: error flushing journal: %s", bch2_err_str(ret));
+		bch_err_msg(ca, ret, "flushing journal");
 		goto err;
 	}
 
 	ret = bch2_journal_flush(&c->journal);
 	if (ret) {
-		bch_err(ca, "Remove failed, journal error");
+		bch_err(ca, "journal error");
 		goto err;
 	}
 
 	ret = bch2_replicas_gc2(c);
 	if (ret) {
-		bch_err(ca, "Remove failed: error from replicas gc: %s", bch2_err_str(ret));
+		bch_err_msg(ca, ret, "in replicas_gc2()");
 		goto err;
 	}
 
@@ -1585,7 +1583,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 
 	ret = bch2_read_super(path, &opts, &sb);
 	if (ret) {
-		bch_err(c, "device add error: error reading super: %s", bch2_err_str(ret));
+		bch_err_msg(c, ret, "reading super");
 		goto err;
 	}
 
@@ -1601,13 +1599,12 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 
 	ret = bch2_dev_may_add(sb.sb, c);
 	if (ret) {
-		bch_err(c, "device add error: %s", bch2_err_str(ret));
+		bch_err_fn(c, ret);
 		goto err;
 	}
 
 	ca = __bch2_dev_alloc(c, &dev_mi);
 	if (!ca) {
-		bch2_free_super(&sb);
 		ret = -ENOMEM;
 		goto err;
 	}
@@ -1615,14 +1612,12 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 	bch2_dev_usage_init(ca);
 
 	ret = __bch2_dev_attach_bdev(ca, &sb);
-	if (ret) {
-		bch2_dev_free(ca);
+	if (ret)
 		goto err;
-	}
 
 	ret = bch2_dev_journal_alloc(ca);
 	if (ret) {
-		bch_err(c, "device add error: journal alloc failed");
+		bch_err_msg(c, ret, "allocating journal");
 		goto err;
 	}
 
@@ -1631,7 +1626,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 
 	ret = bch2_sb_from_fs(c, ca);
 	if (ret) {
-		bch_err(c, "device add error: new device superblock too small");
+		bch_err_msg(c, ret, "setting up new superblock");
 		goto err_unlock;
 	}
 
@@ -1640,8 +1635,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 	if (!bch2_sb_resize_members(&ca->disk_sb,
 				le32_to_cpu(mi->field.u64s) +
 				sizeof(dev_mi) / sizeof(u64))) {
-		bch_err(c, "device add error: new device superblock too small");
 		ret = -BCH_ERR_ENOSPC_sb_members;
+		bch_err_msg(c, ret, "setting up new superblock");
 		goto err_unlock;
 	}
 
@@ -1653,8 +1648,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 		if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx))
 			goto have_slot;
 no_slot:
-	bch_err(c, "device add error: already have maximum number of devices");
 	ret = -BCH_ERR_ENOSPC_sb_members;
+	bch_err_msg(c, ret, "setting up new superblock");
 	goto err_unlock;
 
 have_slot:
@@ -1664,8 +1659,8 @@ have_slot:
 
 	mi = bch2_sb_resize_members(&c->disk_sb, u64s);
 	if (!mi) {
-		bch_err(c, "device add error: no room in superblock for member info");
 		ret = -BCH_ERR_ENOSPC_sb_members;
+		bch_err_msg(c, ret, "setting up new superblock");
 		goto err_unlock;
 	}
 
@@ -1681,7 +1676,7 @@ have_slot:
 	if (BCH_MEMBER_GROUP(&dev_mi)) {
 		ret = __bch2_dev_group_set(c, ca, label.buf);
 		if (ret) {
-			bch_err(c, "device add error: error setting label");
+			bch_err_msg(c, ret, "creating new label");
 			goto err_unlock;
 		}
 	}
@@ -1693,13 +1688,13 @@ have_slot:
 
 	ret = bch2_trans_mark_dev_sb(c, ca);
 	if (ret) {
-		bch_err(c, "device add error: error marking new superblock: %s", bch2_err_str(ret));
+		bch_err_msg(c, ret, "marking new superblock");
 		goto err_late;
 	}
 
 	ret = bch2_fs_freespace_init(c);
 	if (ret) {
-		bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret));
+		bch_err_msg(c, ret, "initializing free space");
 		goto err_late;
 	}
 
@@ -1749,7 +1744,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
 
 	ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
 	if (ret) {
-		bch_err(c, "error bringing %s online: %s", path, bch2_err_str(ret));
+		bch_err_msg(c, ret, "bringing %s online", path);
 		goto err;
 	}
 
@@ -1761,8 +1756,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
 
 	ret = bch2_trans_mark_dev_sb(c, ca);
 	if (ret) {
-		bch_err(c, "error bringing %s online: error from bch2_trans_mark_dev_sb: %s",
-			path, bch2_err_str(ret));
+		bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
 		goto err;
 	}
 
@@ -1780,7 +1774,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
 
 	ret = bch2_fs_freespace_init(c);
 	if (ret)
-		bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret));
+		bch_err_msg(c, ret, "initializing free space");
 
 	up_write(&c->state_lock);
 	return 0;
@@ -1835,7 +1829,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 
 	ret = bch2_dev_buckets_resize(c, ca, nbuckets);
 	if (ret) {
-		bch_err(ca, "Resize error: %s", bch2_err_str(ret));
+		bch_err_msg(ca, ret, "resizing buckets");
 		goto err;
 	}
 
diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h
index 89419fc7..597a8db7 100644
--- a/libbcachefs/super_types.h
+++ b/libbcachefs/super_types.h
@@ -6,8 +6,9 @@ struct bch_sb_handle {
 	struct bch_sb		*sb;
 	struct block_device	*bdev;
 	struct bio		*bio;
+	void			*holder;
 	size_t			buffer_size;
-	fmode_t			mode;
+	blk_mode_t		mode;
 	unsigned		have_layout:1;
 	unsigned		have_bio:1;
 	unsigned		fs_sb:1;
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 941f4bcb..1abc61cb 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -113,10 +113,6 @@ do {									\
 		prt_human_readable_s64(out, val);			\
 } while (0)
 
-#define var_printf(_var, fmt)	sysfs_printf(_var, fmt, var(_var))
-#define var_print(_var)		sysfs_print(_var, var(_var))
-#define var_hprint(_var)	sysfs_hprint(_var, var(_var))
-
 #define sysfs_strtoul(file, var)					\
 do {									\
 	if (attr == &sysfs_ ## file)					\
@@ -139,30 +135,6 @@ do {									\
 	_v;								\
 })
 
-#define strtoul_restrict_or_return(cp, min, max)			\
-({									\
-	unsigned long __v = 0;						\
-	int _r = strtoul_safe_restrict(cp, __v, min, max);		\
-	if (_r)								\
-		return _r;						\
-	__v;								\
-})
-
-#define strtoi_h_or_return(cp)						\
-({									\
-	u64 _v;								\
-	int _r = strtoi_h(cp, &_v);					\
-	if (_r)								\
-		return _r;						\
-	_v;								\
-})
-
-#define sysfs_hatoi(file, var)						\
-do {									\
-	if (attr == &sysfs_ ## file)					\
-		return strtoi_h(buf, &var) ?: (ssize_t) size;		\
-} while (0)
-
 write_attribute(trigger_gc);
 write_attribute(trigger_discards);
 write_attribute(trigger_invalidates);
@@ -280,7 +252,7 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
 
 static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	enum btree_id id;
@@ -291,18 +263,18 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
 	    incompressible_sectors = 0,
 	    compressed_sectors_compressed = 0,
 	    compressed_sectors_uncompressed = 0;
-	int ret;
+	int ret = 0;
 
 	if (!test_bit(BCH_FS_STARTED, &c->flags))
 		return -EPERM;
 
-	bch2_trans_init(&trans, c, 0, 0);
+	trans = bch2_trans_get(c);
 
 	for (id = 0; id < BTREE_ID_NR; id++) {
 		if (!btree_type_has_ptrs(id))
 			continue;
 
-		for_each_btree_key(&trans, iter, id, POS_MIN,
+		for_each_btree_key(trans, iter, id, POS_MIN,
 				   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
 			struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 			const union bch_extent_entry *entry;
@@ -336,10 +308,10 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
 			else if (compressed)
 				nr_compressed_extents++;
 		}
-		bch2_trans_iter_exit(&trans, &iter);
+		bch2_trans_iter_exit(trans, &iter);
 	}
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 
 	if (ret)
 		return ret;
@@ -1005,7 +977,7 @@ STORE(bch2_dev)
 		mutex_lock(&c->sb_lock);
 		mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
 
-		if (v != BCH_MEMBER_DURABILITY(mi)) {
+		if (v + 1 != BCH_MEMBER_DURABILITY(mi)) {
 			SET_BCH_MEMBER_DURABILITY(mi, v + 1);
 			bch2_write_super(c);
 		}
diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c
index 72389c73..c907b3e0 100644
--- a/libbcachefs/tests.c
+++ b/libbcachefs/tests.c
@@ -31,7 +31,7 @@ static void delete_test_keys(struct bch_fs *c)
 
 static int test_delete(struct bch_fs *c, u64 nr)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_i_cookie k;
 	int ret;
@@ -39,44 +39,43 @@ static int test_delete(struct bch_fs *c, u64 nr)
 	bkey_cookie_init(&k.k_i);
 	k.k.p.snapshot = U32_MAX;
 
-	bch2_trans_init(&trans, c, 0, 0);
-	bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p,
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
 			     BTREE_ITER_INTENT);
 
-	ret = commit_do(&trans, NULL, NULL, 0,
+	ret = commit_do(trans, NULL, NULL, 0,
 		bch2_btree_iter_traverse(&iter) ?:
-		bch2_trans_update(&trans, &iter, &k.k_i, 0));
+		bch2_trans_update(trans, &iter, &k.k_i, 0));
 	if (ret) {
 		bch_err_msg(c, ret, "update error");
 		goto err;
 	}
 
 	pr_info("deleting once");
-	ret = commit_do(&trans, NULL, NULL, 0,
+	ret = commit_do(trans, NULL, NULL, 0,
 		bch2_btree_iter_traverse(&iter) ?:
-		bch2_btree_delete_at(&trans, &iter, 0));
+		bch2_btree_delete_at(trans, &iter, 0));
 	if (ret) {
 		bch_err_msg(c, ret, "delete error (first)");
 		goto err;
 	}
 
 	pr_info("deleting twice");
-	ret = commit_do(&trans, NULL, NULL, 0,
+	ret = commit_do(trans, NULL, NULL, 0,
 		bch2_btree_iter_traverse(&iter) ?:
-		bch2_btree_delete_at(&trans, &iter, 0));
+		bch2_btree_delete_at(trans, &iter, 0));
 	if (ret) {
 		bch_err_msg(c, ret, "delete error (second)");
 		goto err;
 	}
 err:
-	bch2_trans_iter_exit(&trans, &iter);
-	bch2_trans_exit(&trans);
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
 	return ret;
 }
 
 static int test_delete_written(struct bch_fs *c, u64 nr)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_i_cookie k;
 	int ret;
@@ -84,58 +83,53 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
 	bkey_cookie_init(&k.k_i);
 	k.k.p.snapshot = U32_MAX;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
-	bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p,
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
 			     BTREE_ITER_INTENT);
 
-	ret = commit_do(&trans, NULL, NULL, 0,
+	ret = commit_do(trans, NULL, NULL, 0,
 		bch2_btree_iter_traverse(&iter) ?:
-		bch2_trans_update(&trans, &iter, &k.k_i, 0));
+		bch2_trans_update(trans, &iter, &k.k_i, 0));
 	if (ret) {
 		bch_err_msg(c, ret, "update error");
 		goto err;
 	}
 
-	bch2_trans_unlock(&trans);
+	bch2_trans_unlock(trans);
 	bch2_journal_flush_all_pins(&c->journal);
 
-	ret = commit_do(&trans, NULL, NULL, 0,
+	ret = commit_do(trans, NULL, NULL, 0,
 		bch2_btree_iter_traverse(&iter) ?:
-		bch2_btree_delete_at(&trans, &iter, 0));
+		bch2_btree_delete_at(trans, &iter, 0));
 	if (ret) {
 		bch_err_msg(c, ret, "delete error");
 		goto err;
 	}
 err:
-	bch2_trans_iter_exit(&trans, &iter);
-	bch2_trans_exit(&trans);
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
 	return ret;
 }
 
 static int test_iterate(struct bch_fs *c, u64 nr)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter = { NULL };
 	struct bkey_s_c k;
 	u64 i;
 	int ret = 0;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
 	delete_test_keys(c);
 
 	pr_info("inserting test keys");
 
 	for (i = 0; i < nr; i++) {
-		struct bkey_i_cookie k;
+		struct bkey_i_cookie ck;
 
-		bkey_cookie_init(&k.k_i);
-		k.k.p.offset = i;
-		k.k.p.snapshot = U32_MAX;
+		bkey_cookie_init(&ck.k_i);
+		ck.k.p.offset = i;
+		ck.k.p.snapshot = U32_MAX;
 
-		ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i,
-					NULL, NULL, 0);
+		ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
 		if (ret) {
 			bch_err_msg(c, ret, "insert error");
 			goto err;
@@ -146,7 +140,7 @@ static int test_iterate(struct bch_fs *c, u64 nr)
 
 	i = 0;
 
-	ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs,
+	ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
 				  SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
 				  0, k, ({
 		BUG_ON(k.k->p.offset != i++);
@@ -161,7 +155,7 @@ static int test_iterate(struct bch_fs *c, u64 nr)
 
 	pr_info("iterating backwards");
 
-	ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_xattrs,
+	ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs,
 					 SPOS(0, U64_MAX, U32_MAX), 0, k,
 		({
 			BUG_ON(k.k->p.offset != --i);
@@ -174,35 +168,32 @@ static int test_iterate(struct bch_fs *c, u64 nr)
 
 	BUG_ON(i);
 err:
-	bch2_trans_iter_exit(&trans, &iter);
-	bch2_trans_exit(&trans);
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
 	return ret;
 }
 
 static int test_iterate_extents(struct bch_fs *c, u64 nr)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter = { NULL };
 	struct bkey_s_c k;
 	u64 i;
 	int ret = 0;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
 	delete_test_keys(c);
 
 	pr_info("inserting test extents");
 
 	for (i = 0; i < nr; i += 8) {
-		struct bkey_i_cookie k;
+		struct bkey_i_cookie ck;
 
-		bkey_cookie_init(&k.k_i);
-		k.k.p.offset = i + 8;
-		k.k.p.snapshot = U32_MAX;
-		k.k.size = 8;
+		bkey_cookie_init(&ck.k_i);
+		ck.k.p.offset = i + 8;
+		ck.k.p.snapshot = U32_MAX;
+		ck.k.size = 8;
 
-		ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
-					NULL, NULL, 0);
+		ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
 		if (ret) {
 			bch_err_msg(c, ret, "insert error");
 			goto err;
@@ -213,7 +204,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
 
 	i = 0;
 
-	ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents,
+	ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
 				  SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
 				  0, k, ({
 		BUG_ON(bkey_start_offset(k.k) != i);
@@ -229,7 +220,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
 
 	pr_info("iterating backwards");
 
-	ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_extents,
+	ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_extents,
 					 SPOS(0, U64_MAX, U32_MAX), 0, k,
 		({
 			BUG_ON(k.k->p.offset != i);
@@ -243,34 +234,31 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
 
 	BUG_ON(i);
 err:
-	bch2_trans_iter_exit(&trans, &iter);
-	bch2_trans_exit(&trans);
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
 	return ret;
 }
 
 static int test_iterate_slots(struct bch_fs *c, u64 nr)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter = { NULL };
 	struct bkey_s_c k;
 	u64 i;
 	int ret = 0;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
 	delete_test_keys(c);
 
 	pr_info("inserting test keys");
 
 	for (i = 0; i < nr; i++) {
-		struct bkey_i_cookie k;
+		struct bkey_i_cookie ck;
 
-		bkey_cookie_init(&k.k_i);
-		k.k.p.offset = i * 2;
-		k.k.p.snapshot = U32_MAX;
+		bkey_cookie_init(&ck.k_i);
+		ck.k.p.offset = i * 2;
+		ck.k.p.snapshot = U32_MAX;
 
-		ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i,
-					NULL, NULL, 0);
+		ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
 		if (ret) {
 			bch_err_msg(c, ret, "insert error");
 			goto err;
@@ -281,7 +269,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
 
 	i = 0;
 
-	ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs,
+	ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
 				  SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
 				  0, k, ({
 		BUG_ON(k.k->p.offset != i);
@@ -299,7 +287,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
 
 	i = 0;
 
-	ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs,
+	ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
 				  SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
 				  BTREE_ITER_SLOTS, k, ({
 		if (i >= nr * 2)
@@ -317,34 +305,31 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
 	}
 	ret = 0;
 err:
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	return ret;
 }
 
 static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter = { NULL };
 	struct bkey_s_c k;
 	u64 i;
 	int ret = 0;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
 	delete_test_keys(c);
 
 	pr_info("inserting test keys");
 
 	for (i = 0; i < nr; i += 16) {
-		struct bkey_i_cookie k;
+		struct bkey_i_cookie ck;
 
-		bkey_cookie_init(&k.k_i);
-		k.k.p.offset = i + 16;
-		k.k.p.snapshot = U32_MAX;
-		k.k.size = 8;
+		bkey_cookie_init(&ck.k_i);
+		ck.k.p.offset = i + 16;
+		ck.k.p.snapshot = U32_MAX;
+		ck.k.size = 8;
 
-		ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
-					NULL, NULL, 0);
+		ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
 		if (ret) {
 			bch_err_msg(c, ret, "insert error");
 			goto err;
@@ -355,7 +340,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
 
 	i = 0;
 
-	ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents,
+	ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
 				  SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
 				  0, k, ({
 		BUG_ON(bkey_start_offset(k.k) != i + 8);
@@ -374,7 +359,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
 
 	i = 0;
 
-	ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents,
+	ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
 				 SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
 				 BTREE_ITER_SLOTS, k, ({
 		if (i == nr)
@@ -392,7 +377,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
 	}
 	ret = 0;
 err:
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	return 0;
 }
 
@@ -402,43 +387,41 @@ err:
  */
 static int test_peek_end(struct bch_fs *c, u64 nr)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_s_c k;
 
-	bch2_trans_init(&trans, c, 0, 0);
-	bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
 			     SPOS(0, 0, U32_MAX), 0);
 
-	lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+	lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
 	BUG_ON(k.k);
 
-	lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+	lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
 	BUG_ON(k.k);
 
-	bch2_trans_iter_exit(&trans, &iter);
-	bch2_trans_exit(&trans);
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
 	return 0;
 }
 
 static int test_peek_end_extents(struct bch_fs *c, u64 nr)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_s_c k;
 
-	bch2_trans_init(&trans, c, 0, 0);
-	bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
 			     SPOS(0, 0, U32_MAX), 0);
 
-	lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+	lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
 	BUG_ON(k.k);
 
-	lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+	lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
 	BUG_ON(k.k);
 
-	bch2_trans_iter_exit(&trans, &iter);
-	bch2_trans_exit(&trans);
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
 	return 0;
 }
 
@@ -458,8 +441,7 @@ static int insert_test_extent(struct bch_fs *c,
 	k.k_i.k.size = end - start;
 	k.k_i.k.version.lo = test_version++;
 
-	ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
-				NULL, NULL, 0);
+	ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0);
 	if (ret)
 		bch_err_fn(c, ret);
 	return ret;
@@ -515,7 +497,7 @@ static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start,
 	k.k_i.k.size = len;
 
 	ret = bch2_trans_do(c, NULL, NULL, 0,
-		bch2_btree_insert_nonextent(&trans, BTREE_ID_extents, &k.k_i,
+		bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i,
 					    BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
 	if (ret)
 		bch_err_fn(c, ret);
@@ -538,7 +520,7 @@ static int test_extent_create_overlapping(struct bch_fs *c, u64 inum)
 /* Test skipping over keys in unrelated snapshots: */
 static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans;
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	struct bkey_i_cookie cookie;
@@ -546,20 +528,19 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
 
 	bkey_cookie_init(&cookie.k_i);
 	cookie.k.p.snapshot = snapid_hi;
-	ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i,
-				NULL, NULL, 0);
+	ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0);
 	if (ret)
 		return ret;
 
-	bch2_trans_init(&trans, c, 0, 0);
-	bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+	trans = bch2_trans_get(c);
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
 			     SPOS(0, 0, snapid_lo), 0);
-	lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+	lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
 
 	BUG_ON(k.k->p.snapshot != U32_MAX);
 
-	bch2_trans_iter_exit(&trans, &iter);
-	bch2_trans_exit(&trans);
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
 	return ret;
 }
 
@@ -572,13 +553,12 @@ static int test_snapshots(struct bch_fs *c, u64 nr)
 
 	bkey_cookie_init(&cookie.k_i);
 	cookie.k.p.snapshot = U32_MAX;
-	ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i,
-				NULL, NULL, 0);
+	ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0);
 	if (ret)
 		return ret;
 
 	ret = bch2_trans_do(c, NULL, NULL, 0,
-		      bch2_snapshot_node_create(&trans, U32_MAX,
+		      bch2_snapshot_node_create(trans, U32_MAX,
 						snapids,
 						snapid_subvols,
 						2));
@@ -609,38 +589,34 @@ static u64 test_rand(void)
 
 static int rand_insert(struct bch_fs *c, u64 nr)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct bkey_i_cookie k;
 	int ret = 0;
 	u64 i;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
 	for (i = 0; i < nr; i++) {
 		bkey_cookie_init(&k.k_i);
 		k.k.p.offset = test_rand();
 		k.k.p.snapshot = U32_MAX;
 
-		ret = commit_do(&trans, NULL, NULL, 0,
-			__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i, 0));
+		ret = commit_do(trans, NULL, NULL, 0,
+			bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k.k_i, 0));
 		if (ret)
 			break;
 	}
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	return ret;
 }
 
 static int rand_insert_multi(struct bch_fs *c, u64 nr)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct bkey_i_cookie k[8];
 	int ret = 0;
 	unsigned j;
 	u64 i;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
 	for (i = 0; i < nr; i += ARRAY_SIZE(k)) {
 		for (j = 0; j < ARRAY_SIZE(k); j++) {
 			bkey_cookie_init(&k[j].k_i);
@@ -648,46 +624,45 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr)
 			k[j].k.p.snapshot = U32_MAX;
 		}
 
-		ret = commit_do(&trans, NULL, NULL, 0,
-			__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?:
-			__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?:
-			__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?:
-			__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?:
-			__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?:
-			__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?:
-			__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?:
-			__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i, 0));
+		ret = commit_do(trans, NULL, NULL, 0,
+			bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?:
+			bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?:
+			bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?:
+			bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?:
+			bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?:
+			bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?:
+			bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?:
+			bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[7].k_i, 0));
 		if (ret)
 			break;
 	}
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	return ret;
 }
 
 static int rand_lookup(struct bch_fs *c, u64 nr)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	int ret = 0;
 	u64 i;
 
-	bch2_trans_init(&trans, c, 0, 0);
-	bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
 			     SPOS(0, 0, U32_MAX), 0);
 
 	for (i = 0; i < nr; i++) {
 		bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX));
 
-		lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
+		lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
 		ret = bkey_err(k);
 		if (ret)
 			break;
 	}
 
-	bch2_trans_iter_exit(&trans, &iter);
-	bch2_trans_exit(&trans);
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
 	return ret;
 }
 
@@ -719,26 +694,25 @@ static int rand_mixed_trans(struct btree_trans *trans,
 
 static int rand_mixed(struct bch_fs *c, u64 nr)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_i_cookie cookie;
 	int ret = 0;
 	u64 i, rand;
 
-	bch2_trans_init(&trans, c, 0, 0);
-	bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
 			     SPOS(0, 0, U32_MAX), 0);
 
 	for (i = 0; i < nr; i++) {
 		rand = test_rand();
-		ret = commit_do(&trans, NULL, NULL, 0,
-			rand_mixed_trans(&trans, &iter, &cookie, i, rand));
+		ret = commit_do(trans, NULL, NULL, 0,
+			rand_mixed_trans(trans, &iter, &cookie, i, rand));
 		if (ret)
 			break;
 	}
 
-	bch2_trans_iter_exit(&trans, &iter);
-	bch2_trans_exit(&trans);
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
 	return ret;
 }
 
@@ -766,22 +740,20 @@ err:
 
 static int rand_delete(struct bch_fs *c, u64 nr)
 {
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	int ret = 0;
 	u64 i;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
 	for (i = 0; i < nr; i++) {
 		struct bpos pos = SPOS(0, test_rand(), U32_MAX);
 
-		ret = commit_do(&trans, NULL, NULL, 0,
-			__do_delete(&trans, pos));
+		ret = commit_do(trans, NULL, NULL, 0,
+			__do_delete(trans, pos));
 		if (ret)
 			break;
 	}
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 	return ret;
 }
 
@@ -794,14 +766,14 @@ static int seq_insert(struct bch_fs *c, u64 nr)
 	bkey_cookie_init(&insert.k_i);
 
 	return bch2_trans_run(c,
-		for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs,
+		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
 					SPOS(0, 0, U32_MAX),
 					BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k,
 					NULL, NULL, 0, ({
 			if (iter.pos.offset >= nr)
 				break;
 			insert.k.p = iter.pos;
-			bch2_trans_update(&trans, &iter, &insert.k_i, 0);
+			bch2_trans_update(trans, &iter, &insert.k_i, 0);
 		})));
 }
 
@@ -811,7 +783,7 @@ static int seq_lookup(struct bch_fs *c, u64 nr)
 	struct bkey_s_c k;
 
 	return bch2_trans_run(c,
-		for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs,
+		for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
 				  SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
 				  0, k,
 		0));
@@ -823,14 +795,14 @@ static int seq_overwrite(struct bch_fs *c, u64 nr)
 	struct bkey_s_c k;
 
 	return bch2_trans_run(c,
-		for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs,
+		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
 					SPOS(0, 0, U32_MAX),
 					BTREE_ITER_INTENT, k,
 					NULL, NULL, 0, ({
 			struct bkey_i_cookie u;
 
 			bkey_reassemble(&u.k_i, k);
-			bch2_trans_update(&trans, &iter, &u.k_i, 0);
+			bch2_trans_update(trans, &iter, &u.k_i, 0);
 		})));
 }
 
diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h
index 97fe7742..19264492 100644
--- a/libbcachefs/trace.h
+++ b/libbcachefs/trace.h
@@ -137,6 +137,25 @@ DEFINE_EVENT(bio, read_promote,
 	TP_ARGS(bio)
 );
 
+TRACE_EVENT(read_nopromote,
+	TP_PROTO(struct bch_fs *c, int ret),
+	TP_ARGS(c, ret),
+
+	TP_STRUCT__entry(
+		__field(dev_t,		dev		)
+		__array(char,		ret, 32		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= c->dev;
+		strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
+	),
+
+	TP_printk("%d,%d ret %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ret)
+);
+
 DEFINE_EVENT(bio, read_bounce,
 	TP_PROTO(struct bio *bio),
 	TP_ARGS(bio)
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 80a6c566..adeec805 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -112,10 +112,10 @@ got_unit:
 
 #define parse_or_ret(cp, _f)			\
 do {						\
-	int ret = _f;				\
-	if (ret < 0)				\
-		return ret;			\
-	cp += ret;				\
+	int _ret = _f;				\
+	if (_ret < 0)				\
+		return _ret;			\
+	cp += _ret;				\
 } while (0)
 
 static int __bch2_strtou64_h(const char *cp, u64 *res)
@@ -605,11 +605,9 @@ void bch2_time_stats_init(struct bch2_time_stats *stats)
 
 /**
  * bch2_ratelimit_delay() - return how long to delay until the next time to do
- * some work
- *
- * @d - the struct bch_ratelimit to update
- *
- * Returns the amount of time to delay by, in jiffies
+ *		some work
+ * @d:		the struct bch_ratelimit to update
+ * Returns:	the amount of time to delay by, in jiffies
  */
 u64 bch2_ratelimit_delay(struct bch_ratelimit *d)
 {
@@ -622,9 +620,8 @@ u64 bch2_ratelimit_delay(struct bch_ratelimit *d)
 
 /**
  * bch2_ratelimit_increment() - increment @d by the amount of work done
- *
- * @d - the struct bch_ratelimit to update
- * @done - the amount of work done, in arbitrary units
+ * @d:		the struct bch_ratelimit to update
+ * @done:	the amount of work done, in arbitrary units
  */
 void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done)
 {
@@ -761,10 +758,10 @@ void bch2_bio_map(struct bio *bio, void *base, size_t size)
 	}
 }
 
-int bch2_bio_alloc_pages_noprof(struct bio *bio, size_t size, gfp_t gfp_mask)
+int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
 {
 	while (size) {
-		struct page *page = alloc_pages_noprof(gfp_mask, 0);
+		struct page *page = alloc_pages(gfp_mask, 0);
 		unsigned len = min_t(size_t, PAGE_SIZE, size);
 
 		if (!page)
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index d06671a0..67f1a1d2 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -60,13 +60,12 @@ static inline void vpfree(void *p, size_t size)
 		free_pages((unsigned long) p, get_order(size));
 }
 
-static inline void *vpmalloc_noprof(size_t size, gfp_t gfp_mask)
+static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
 {
-	return (void *) get_free_pages_noprof(gfp_mask|__GFP_NOWARN,
-					      get_order(size)) ?:
-		__vmalloc_noprof(size, gfp_mask);
+	return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
+					 get_order(size)) ?:
+		__vmalloc(size, gfp_mask);
 }
-#define vpmalloc(_size, _gfp)	alloc_hooks(vpmalloc_noprof(_size, _gfp))
 
 static inline void kvpfree(void *p, size_t size)
 {
@@ -76,13 +75,12 @@ static inline void kvpfree(void *p, size_t size)
 		vpfree(p, size);
 }
 
-static inline void *kvpmalloc_noprof(size_t size, gfp_t gfp_mask)
+static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
 {
 	return size < PAGE_SIZE
-		? kmalloc_noprof(size, gfp_mask)
-		: vpmalloc_noprof(size, gfp_mask);
+		? kmalloc(size, gfp_mask)
+		: vpmalloc(size, gfp_mask);
 }
-#define kvpmalloc(_size, _gfp)	alloc_hooks(kvpmalloc_noprof(_size, _gfp))
 
 int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
 
@@ -534,9 +532,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
 }
 
 void bch2_bio_map(struct bio *bio, void *base, size_t);
-int bch2_bio_alloc_pages_noprof(struct bio *, size_t, gfp_t);
-#define bch2_bio_alloc_pages(_bio, _size, _gfp)				\
-	alloc_hooks(bch2_bio_alloc_pages_noprof(_bio, _size, _gfp))
+int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
 
 static inline sector_t bdev_sectors(struct block_device *bdev)
 {
@@ -779,12 +775,12 @@ static inline void __move_gap(void *array, size_t element_size,
 
 #define bubble_sort(_base, _nr, _cmp)					\
 do {									\
-	ssize_t _i, _end;						\
+	ssize_t _i, _last;						\
 	bool _swapped = true;						\
 									\
-	for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\
+	for (_last = (ssize_t) (_nr) - 1; _last > 0 && _swapped; --_last) {\
 		_swapped = false;					\
-		for (_i = 0; _i < _end; _i++)				\
+		for (_i = 0; _i < _last; _i++)				\
 			if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) {	\
 				swap((_base)[_i], (_base)[_i + 1]);	\
 				_swapped = true;			\
diff --git a/libbcachefs/varint.c b/libbcachefs/varint.c
index 2a2ab86e..cb4f33ed 100644
--- a/libbcachefs/varint.c
+++ b/libbcachefs/varint.c
@@ -13,10 +13,9 @@
 
 /**
  * bch2_varint_encode - encode a variable length integer
- * @out - destination to encode to
- * @v	- unsigned integer to encode
- *
- * Returns the size in bytes of the encoded integer - at most 9 bytes
+ * @out:	destination to encode to
+ * @v:		unsigned integer to encode
+ * Returns:	size in bytes of the encoded integer - at most 9 bytes
  */
 int bch2_varint_encode(u8 *out, u64 v)
 {
@@ -40,11 +39,10 @@ int bch2_varint_encode(u8 *out, u64 v)
 
 /**
  * bch2_varint_decode - encode a variable length integer
- * @in	- varint to decode
- * @end	- end of buffer to decode from
- * @out	- on success, decoded integer
- *
- * Returns the size in bytes of the decoded integer - or -1 on failure (would
+ * @in:		varint to decode
+ * @end:	end of buffer to decode from
+ * @out:	on success, decoded integer
+ * Returns:	size in bytes of the decoded integer - or -1 on failure (would
  * have read past the end of the buffer)
  */
 int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
@@ -73,6 +71,9 @@ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
 
 /**
  * bch2_varint_encode_fast - fast version of bch2_varint_encode
+ * @out:	destination to encode to
+ * @v:		unsigned integer to encode
+ * Returns:	size in bytes of the encoded integer - at most 9 bytes
  *
  * This version assumes it's always safe to write 8 bytes to @out, even if the
  * encoded integer would be smaller.
@@ -96,6 +97,11 @@ int bch2_varint_encode_fast(u8 *out, u64 v)
 
 /**
  * bch2_varint_decode_fast - fast version of bch2_varint_decode
+ * @in:		varint to decode
+ * @end:	end of buffer to decode from
+ * @out:	on success, decoded integer
+ * Returns:	size in bytes of the decoded integer - or -1 on failure (would
+ * have read past the end of the buffer)
  *
  * This version assumes that it is safe to read at most 8 bytes past the end of
  * @end (we still return an error if the varint extends past @end).
diff --git a/libbcachefs/vstructs.h b/libbcachefs/vstructs.h
index 53a694d7..a6561b4b 100644
--- a/libbcachefs/vstructs.h
+++ b/libbcachefs/vstructs.h
@@ -41,11 +41,11 @@
 	(round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9)
 
 #define vstruct_next(_s)						\
-	((typeof(_s))			((_s)->_data + __vstruct_u64s(_s)))
+	((typeof(_s))			((u64 *) (_s)->_data + __vstruct_u64s(_s)))
 #define vstruct_last(_s)						\
-	((typeof(&(_s)->start[0]))	((_s)->_data + __vstruct_u64s(_s)))
+	((typeof(&(_s)->start[0]))	((u64 *) (_s)->_data + __vstruct_u64s(_s)))
 #define vstruct_end(_s)							\
-	((void *)			((_s)->_data + __vstruct_u64s(_s)))
+	((void *)			((u64 *) (_s)->_data + __vstruct_u64s(_s)))
 
 #define vstruct_for_each(_s, _i)					\
 	for (_i = (_s)->start;						\
diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c
index 6f6b3caf..b069b1a6 100644
--- a/libbcachefs/xattr.c
+++ b/libbcachefs/xattr.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "acl.h"
 #include "bkey_methods.h"
 #include "btree_update.h"
 #include "extents.h"
@@ -130,6 +131,13 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
 	       xattr.v->x_name,
 	       le16_to_cpu(xattr.v->x_val_len),
 	       (char *) xattr_val(xattr.v));
+
+	if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS ||
+	    xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) {
+		prt_char(out, ' ');
+		bch2_acl_to_text(out, xattr_val(xattr.v),
+				 le16_to_cpu(xattr.v->x_val_len));
+	}
 }
 
 static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode,
@@ -299,24 +307,22 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
 	struct bch_fs *c = dentry->d_sb->s_fs_info;
 	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
-	struct btree_trans trans;
+	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
 	u64 offset = 0, inum = inode->ei_inode.bi_inum;
 	u32 snapshot;
 	int ret;
-
-	bch2_trans_init(&trans, c, 0, 0);
 retry:
-	bch2_trans_begin(&trans);
+	bch2_trans_begin(trans);
 	iter = (struct btree_iter) { NULL };
 
-	ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot);
+	ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
 	if (ret)
 		goto err;
 
-	for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_xattrs,
+	for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_xattrs,
 			   SPOS(inum, offset, snapshot),
 			   POS(inum, U64_MAX), 0, k, ret) {
 		if (k.k->type != KEY_TYPE_xattr)
@@ -328,12 +334,12 @@ retry:
 	}
 
 	offset = iter.pos.offset;
-	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_iter_exit(trans, &iter);
 err:
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
 
-	bch2_trans_exit(&trans);
+	bch2_trans_put(trans);
 
 	if (ret)
 		goto out;
@@ -358,7 +364,7 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler,
 	struct bch_inode_info *inode = to_bch_ei(vinode);
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
 	int ret = bch2_trans_do(c, NULL, NULL, 0,
-		bch2_xattr_get_trans(&trans, inode, name, buffer, size, handler->flags));
+		bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags));
 
 	return bch2_err_class(ret);
 }
@@ -373,18 +379,14 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler,
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
 	struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
 	struct bch_inode_unpacked inode_u;
-	struct btree_trans trans;
 	int ret;
 
-	bch2_trans_init(&trans, c, 0, 0);
-
-	ret = commit_do(&trans, NULL, NULL, 0,
-			bch2_xattr_set(&trans, inode_inum(inode), &inode_u,
+	ret = bch2_trans_run(c,
+		commit_do(trans, NULL, NULL, 0,
+			bch2_xattr_set(trans, inode_inum(inode), &inode_u,
 				       &hash, name, value, size,
-				       handler->flags, flags));
-	if (!ret)
-		bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME);
-	bch2_trans_exit(&trans);
+				       handler->flags, flags)) ?:
+		(bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME), 0));
 
 	return bch2_err_class(ret);
 }
diff --git a/linux/blkdev.c b/linux/blkdev.c
index ea901a46..54af9f87 100644
--- a/linux/blkdev.c
+++ b/linux/blkdev.c
@@ -162,7 +162,7 @@ sector_t get_capacity(struct gendisk *disk)
 	return bytes >> 9;
 }
 
-void blkdev_put(struct block_device *bdev, fmode_t mode)
+void blkdev_put(struct block_device *bdev, void *holder)
 {
 	fdatasync(bdev->bd_fd);
 	close(bdev->bd_sync_fd);
@@ -170,25 +170,25 @@ void blkdev_put(struct block_device *bdev, fmode_t mode)
 	free(bdev);
 }
 
-struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
-					void *holder)
+struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
+					void *holder, const struct blk_holder_ops *hop)
 {
 	struct block_device *bdev;
 	int fd, sync_fd, buffered_fd, flags = 0;
 
-	if ((mode & (FMODE_READ|FMODE_WRITE)) == (FMODE_READ|FMODE_WRITE))
+	if ((mode & (BLK_OPEN_READ|BLK_OPEN_WRITE)) == (BLK_OPEN_READ|BLK_OPEN_WRITE))
 		flags = O_RDWR;
-	else if (mode & FMODE_READ)
+	else if (mode & BLK_OPEN_READ)
 		flags = O_RDONLY;
-	else if (mode & FMODE_WRITE)
+	else if (mode & BLK_OPEN_WRITE)
 		flags = O_WRONLY;
 
-	if (!(mode & FMODE_BUFFERED))
+	if (!(mode & BLK_OPEN_BUFFERED))
 		flags |= O_DIRECT;
 
 #if 0
 	/* using O_EXCL doesn't work with opening twice for an O_SYNC fd: */
-	if (mode & FMODE_EXCL)
+	if (mode & BLK_OPEN_EXCL)
 		flags |= O_EXCL;
 #endif
 	buffered_fd = open(path, flags & ~O_DIRECT);
diff --git a/rust-src/bch_bindgen/src/bkey.rs b/rust-src/bch_bindgen/src/bkey.rs
index 64697ea6..d4830839 100644
--- a/rust-src/bch_bindgen/src/bkey.rs
+++ b/rust-src/bch_bindgen/src/bkey.rs
@@ -47,6 +47,8 @@ pub enum BkeyValC<'a> {
     inode_v3(&'a c::bch_inode_v3),
     bucket_gens(&'a c::bch_bucket_gens),
     snapshot_tree(&'a c::bch_snapshot_tree),
+    logged_op_truncate(&'a c::bch_logged_op_truncate),
+    logged_op_finsert(&'a c::bch_logged_op_finsert),
 }
 
 impl<'a, 'b> BkeySC<'a> {
@@ -96,6 +98,8 @@ impl<'a, 'b> BkeySC<'a> {
             KEY_TYPE_inode_v3               => inode_v3(unsafe { transmute(self.v) }),
             KEY_TYPE_bucket_gens            => bucket_gens(unsafe { transmute(self.v) }),
             KEY_TYPE_snapshot_tree          => snapshot_tree(unsafe { transmute(self.v) }),
+            KEY_TYPE_logged_op_truncate     => logged_op_truncate(unsafe { transmute(self.v) }),
+            KEY_TYPE_logged_op_finsert      => logged_op_finsert(unsafe { transmute(self.v) }),
             KEY_TYPE_MAX                    => unreachable!(),
         }
     }
diff --git a/rust-src/bch_bindgen/src/btree.rs b/rust-src/bch_bindgen/src/btree.rs
index 32b4e743..f738a466 100644
--- a/rust-src/bch_bindgen/src/btree.rs
+++ b/rust-src/bch_bindgen/src/btree.rs
@@ -11,24 +11,21 @@ use std::ptr;
 use bitflags::bitflags;
 
 pub struct BtreeTrans<'f> {
-    raw:    c::btree_trans,
+    raw:    *mut c::btree_trans,
     fs:     PhantomData<&'f Fs>
 }
 
 impl<'f> BtreeTrans<'f> {
     pub fn new(fs: &'f Fs) -> BtreeTrans {
         unsafe {
-            let mut trans: MaybeUninit<c::btree_trans> = MaybeUninit::uninit();
-
-            c::__bch2_trans_init(&mut (*trans.as_mut_ptr()), fs.raw, 0);
-            BtreeTrans { raw: trans.assume_init(), fs: PhantomData }
+            BtreeTrans { raw: &mut *c::__bch2_trans_get(fs.raw, 0), fs: PhantomData }
         }
     }
 }
 
 impl<'f> Drop for BtreeTrans<'f> {
     fn drop(&mut self) {
-        unsafe { c::bch2_trans_exit(&mut self.raw) }
+        unsafe { c::bch2_trans_put(&mut *self.raw) }
     }             
 }
 
@@ -64,9 +61,9 @@ impl<'t> BtreeIter<'t> {
             let mut iter: MaybeUninit<c::btree_iter> = MaybeUninit::uninit();
 
             c::bch2_trans_iter_init_outlined(
-                ptr::addr_of!(trans.raw).cast_mut(),
+                trans.raw,
                 iter.as_mut_ptr(),
-                btree as u32,
+                btree,
                 pos,
                 flags.bits as u32);
 
@@ -123,7 +120,7 @@ impl<'t> BtreeNodeIter<'t> {
         unsafe {
             let mut iter: MaybeUninit<c::btree_iter> = MaybeUninit::uninit();
             c::bch2_trans_node_iter_init(
-                ptr::addr_of!(trans.raw).cast_mut(),
+                trans.raw,
                 iter.as_mut_ptr(),
                 btree,
                 pos,
diff --git a/rust-src/bch_bindgen/src/libbcachefs_wrapper.h b/rust-src/bch_bindgen/src/libbcachefs_wrapper.h
index e7bcfcfb..e68de664 100644
--- a/rust-src/bch_bindgen/src/libbcachefs_wrapper.h
+++ b/rust-src/bch_bindgen/src/libbcachefs_wrapper.h
@@ -13,8 +13,8 @@
 #include "../include/linux/blkdev.h"
 
 
-#define MARK_FIX_753(req_name) const fmode_t Fix753_##req_name = req_name;
+#define MARK_FIX_753(req_name) const blk_mode_t Fix753_##req_name = req_name;
 
-MARK_FIX_753(FMODE_READ);
-MARK_FIX_753(FMODE_WRITE);
-MARK_FIX_753(FMODE_EXCL);
\ No newline at end of file
+MARK_FIX_753(BLK_OPEN_READ);
+MARK_FIX_753(BLK_OPEN_WRITE);
+MARK_FIX_753(BLK_OPEN_EXCL);