Update bcachefs sources to e57b5958cf bcachefs: fix for building in userspace

Kent Overstreet 2017-12-13 16:01:18 -05:00
parent f2feceddae
commit ea83a3985d
50 changed files with 3422 additions and 3284 deletions

View File

@ -1 +1 @@
192d759a491f50d92c89c2e842639d2307c815a5
e57b5958cf4e8530d26f7c36a6e1427fb284cc70

View File

@ -265,7 +265,7 @@ static void write_data(struct bch_fs *c,
if (ret)
die("error reserving space in new filesystem: %s", strerror(-ret));
bch2_write_op_init(&op, c, res, NULL, 0,
bch2_write_op_init(&op, c, res, NULL, writepoint_hashed(0),
POS(dst_inode->bi_inum, dst_offset >> 9), NULL, 0);
closure_call(&op.cl, bch2_write, NULL, &cl);
closure_sync(&cl);

View File

@ -98,23 +98,6 @@ DECLARE_EVENT_CLASS(bio,
(unsigned long long)__entry->sector, __entry->nr_sector)
);
DECLARE_EVENT_CLASS(page_alloc_fail,
TP_PROTO(struct bch_fs *c, u64 size),
TP_ARGS(c, size),
TP_STRUCT__entry(
__array(char, uuid, 16 )
__field(u64, size )
),
TP_fast_assign(
memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
__entry->size = size;
),
TP_printk("%pU size %llu", __entry->uuid, __entry->size)
);
/* io.c: */
DEFINE_EVENT(bio, read_split,
@ -137,34 +120,6 @@ DEFINE_EVENT(bio, promote,
TP_ARGS(bio)
);
TRACE_EVENT(write_throttle,
TP_PROTO(struct bch_fs *c, u64 inode, struct bio *bio, u64 delay),
TP_ARGS(c, inode, bio, delay),
TP_STRUCT__entry(
__array(char, uuid, 16 )
__field(u64, inode )
__field(sector_t, sector )
__field(unsigned int, nr_sector )
__array(char, rwbs, 6 )
__field(u64, delay )
),
TP_fast_assign(
memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
__entry->inode = inode;
__entry->sector = bio->bi_iter.bi_sector;
__entry->nr_sector = bio->bi_iter.bi_size >> 9;
blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
__entry->delay = delay;
),
TP_printk("%pU inode %llu %s %llu + %u delay %llu",
__entry->uuid, __entry->inode,
__entry->rwbs, (unsigned long long)__entry->sector,
__entry->nr_sector, __entry->delay)
);
/* Journal */
DEFINE_EVENT(bch_fs, journal_full,
@ -439,16 +394,6 @@ TRACE_EVENT(alloc_batch,
__entry->uuid, __entry->free, __entry->total)
);
DEFINE_EVENT(bch_dev, prio_write_start,
TP_PROTO(struct bch_dev *ca),
TP_ARGS(ca)
);
DEFINE_EVENT(bch_dev, prio_write_end,
TP_PROTO(struct bch_dev *ca),
TP_ARGS(ca)
);
TRACE_EVENT(invalidate,
TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors),
TP_ARGS(ca, offset, sectors),
@ -502,151 +447,29 @@ DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
TP_ARGS(ca, reserve)
);
TRACE_EVENT(freelist_empty_fail,
TP_PROTO(struct bch_fs *c, enum alloc_reserve reserve,
struct closure *cl),
TP_ARGS(c, reserve, cl),
TP_STRUCT__entry(
__array(char, uuid, 16 )
__field(enum alloc_reserve, reserve )
__field(struct closure *, cl )
),
TP_fast_assign(
memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
__entry->reserve = reserve;
__entry->cl = cl;
),
TP_printk("%pU reserve %d cl %p", __entry->uuid, __entry->reserve,
__entry->cl)
);
DECLARE_EVENT_CLASS(open_bucket_alloc,
TP_PROTO(struct bch_fs *c, struct closure *cl),
TP_ARGS(c, cl),
TP_STRUCT__entry(
__array(char, uuid, 16 )
__field(struct closure *, cl )
),
TP_fast_assign(
memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
__entry->cl = cl;
),
TP_printk("%pU cl %p",
__entry->uuid, __entry->cl)
);
DEFINE_EVENT(open_bucket_alloc, open_bucket_alloc,
TP_PROTO(struct bch_fs *c, struct closure *cl),
TP_ARGS(c, cl)
);
DEFINE_EVENT(open_bucket_alloc, open_bucket_alloc_fail,
TP_PROTO(struct bch_fs *c, struct closure *cl),
TP_ARGS(c, cl)
DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail,
TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
TP_ARGS(ca, reserve)
);
/* Moving IO */
DECLARE_EVENT_CLASS(moving_io,
TP_PROTO(struct bkey *k),
TP_ARGS(k),
TP_STRUCT__entry(
__field(__u32, inode )
__field(__u64, offset )
__field(__u32, sectors )
),
TP_fast_assign(
__entry->inode = k->p.inode;
__entry->offset = k->p.offset;
__entry->sectors = k->size;
),
TP_printk("%u:%llu sectors %u",
__entry->inode, __entry->offset, __entry->sectors)
);
DEFINE_EVENT(moving_io, move_read,
TP_PROTO(struct bkey *k),
TP_ARGS(k)
);
DEFINE_EVENT(moving_io, move_read_done,
TP_PROTO(struct bkey *k),
TP_ARGS(k)
);
DEFINE_EVENT(moving_io, move_write,
TP_PROTO(struct bkey *k),
TP_ARGS(k)
);
DEFINE_EVENT(moving_io, copy_collision,
TP_PROTO(struct bkey *k),
TP_ARGS(k)
);
/* Copy GC */
DEFINE_EVENT(page_alloc_fail, moving_gc_alloc_fail,
TP_PROTO(struct bch_fs *c, u64 size),
TP_ARGS(c, size)
);
DEFINE_EVENT(bch_dev, moving_gc_start,
TP_PROTO(struct bch_dev *ca),
TP_ARGS(ca)
);
TRACE_EVENT(moving_gc_end,
TP_PROTO(struct bch_dev *ca, u64 sectors_moved, u64 keys_moved,
u64 buckets_moved),
TP_ARGS(ca, sectors_moved, keys_moved, buckets_moved),
TP_STRUCT__entry(
__array(char, uuid, 16 )
__field(u64, sectors_moved )
__field(u64, keys_moved )
__field(u64, buckets_moved )
),
TP_fast_assign(
memcpy(__entry->uuid, ca->uuid.b, 16);
__entry->sectors_moved = sectors_moved;
__entry->keys_moved = keys_moved;
__entry->buckets_moved = buckets_moved;
),
TP_printk("%pU sectors_moved %llu keys_moved %llu buckets_moved %llu",
__entry->uuid, __entry->sectors_moved, __entry->keys_moved,
__entry->buckets_moved)
);
DEFINE_EVENT(bkey, gc_copy,
DEFINE_EVENT(bkey, move_extent,
TP_PROTO(const struct bkey *k),
TP_ARGS(k)
);
/* Tiering */
DEFINE_EVENT(page_alloc_fail, tiering_alloc_fail,
TP_PROTO(struct bch_fs *c, u64 size),
TP_ARGS(c, size)
DEFINE_EVENT(bkey, move_alloc_fail,
TP_PROTO(const struct bkey *k),
TP_ARGS(k)
);
DEFINE_EVENT(bch_fs, tiering_start,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c)
DEFINE_EVENT(bkey, move_race,
TP_PROTO(const struct bkey *k),
TP_ARGS(k)
);
TRACE_EVENT(tiering_end,
TRACE_EVENT(move_data,
TP_PROTO(struct bch_fs *c, u64 sectors_moved,
u64 keys_moved),
TP_ARGS(c, sectors_moved, keys_moved),
@ -667,9 +490,34 @@ TRACE_EVENT(tiering_end,
__entry->uuid, __entry->sectors_moved, __entry->keys_moved)
);
DEFINE_EVENT(bkey, tiering_copy,
TP_PROTO(const struct bkey *k),
TP_ARGS(k)
TRACE_EVENT(copygc,
TP_PROTO(struct bch_dev *ca,
u64 sectors_moved, u64 sectors_not_moved,
u64 buckets_moved, u64 buckets_not_moved),
TP_ARGS(ca,
sectors_moved, sectors_not_moved,
buckets_moved, buckets_not_moved),
TP_STRUCT__entry(
__array(char, uuid, 16 )
__field(u64, sectors_moved )
__field(u64, sectors_not_moved )
__field(u64, buckets_moved )
__field(u64, buckets_not_moved )
),
TP_fast_assign(
memcpy(__entry->uuid, ca->uuid.b, 16);
__entry->sectors_moved = sectors_moved;
__entry->sectors_not_moved = sectors_not_moved;
__entry->buckets_moved = buckets_moved;
__entry->buckets_not_moved = buckets_moved;
),
TP_printk("%pU sectors moved %llu remain %llu buckets moved %llu remain %llu",
__entry->uuid,
__entry->sectors_moved, __entry->sectors_not_moved,
__entry->buckets_moved, __entry->buckets_not_moved)
);
#endif /* _TRACE_BCACHE_H */

File diff suppressed because it is too large

View File

@ -8,7 +8,7 @@ struct bkey;
struct bucket;
struct bch_dev;
struct bch_fs;
struct dev_group;
struct bch_devs_list;
struct dev_alloc_list {
unsigned nr;
@ -24,33 +24,61 @@ void bch2_wp_rescale(struct bch_fs *, struct bch_dev *,
int bch2_alloc_read(struct bch_fs *, struct list_head *);
int bch2_alloc_replay_key(struct bch_fs *, struct bpos);
long bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve);
enum bucket_alloc_ret {
ALLOC_SUCCESS = 0,
OPEN_BUCKETS_EMPTY = -1,
FREELIST_EMPTY = -2, /* Allocator thread not keeping up */
NO_DEVICES = -3, /* -EROFS */
};
void bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool,
struct closure *);
void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
if (atomic_dec_and_test(&ob->pin))
__bch2_open_bucket_put(c, ob);
}
static inline void bch2_open_bucket_put_refs(struct bch_fs *c, u8 *nr, u8 *refs)
{
unsigned i;
for (i = 0; i < *nr; i++)
bch2_open_bucket_put(c, c->open_buckets + refs[i]);
*nr = 0;
}
static inline void bch2_open_bucket_get(struct bch_fs *c,
struct write_point *wp,
u8 *nr, u8 *refs)
{
unsigned i;
for (i = 0; i < wp->nr_ptrs_can_use; i++) {
struct open_bucket *ob = wp->ptrs[i];
atomic_inc(&ob->pin);
refs[(*nr)++] = ob - c->open_buckets;
}
}
struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
enum bch_data_type,
struct bch_devs_mask *,
unsigned long,
struct write_point_specifier,
struct bch_devs_list *,
unsigned, unsigned,
enum alloc_reserve,
unsigned,
struct closure *);
void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct bkey_i_extent *,
unsigned, struct open_bucket *, unsigned);
void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
struct bkey_i_extent *, unsigned);
void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
struct open_bucket *bch2_alloc_sectors(struct bch_fs *,
enum bch_data_type,
struct bch_devs_mask *,
unsigned long,
struct bkey_i_extent *,
unsigned, unsigned,
enum alloc_reserve,
unsigned,
struct closure *);
static inline void bch2_wake_allocator(struct bch_dev *ca)
{
struct task_struct *p;
@ -61,10 +89,20 @@ static inline void bch2_wake_allocator(struct bch_dev *ca)
rcu_read_unlock();
}
#define open_bucket_for_each_ptr(_ob, _ptr) \
for ((_ptr) = (_ob)->ptrs; \
(_ptr) < (_ob)->ptrs + (_ob)->nr_ptrs; \
(_ptr)++)
#define writepoint_for_each_ptr(_wp, _ob, _i) \
for ((_i) = 0; \
(_i) < (_wp)->nr_ptrs && ((_ob) = (_wp)->ptrs[_i], true); \
(_i)++)
static inline struct write_point_specifier writepoint_hashed(unsigned long v)
{
return (struct write_point_specifier) { .v = v | 1 };
}
static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
{
return (struct write_point_specifier) { .v = (unsigned long) wp };
}
void bch2_recalc_capacity(struct bch_fs *);
@ -74,6 +112,13 @@ void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);
static inline void writepoint_init(struct write_point *wp,
enum bch_data_type type)
{
mutex_init(&wp->lock);
wp->type = type;
}
void bch2_fs_allocator_init(struct bch_fs *);
extern const struct bkey_ops bch2_bkey_alloc_ops;
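
The new interface replaces the old all-in-one bch2_alloc_sectors() with a start/append/done sequence keyed by a struct write_point_specifier: writepoint_hashed() tags an arbitrary value (forcing the low bit on), while writepoint_ptr() wraps a specific write point such as c->btree_write_point; the write_data() hunk near the top now passes writepoint_hashed(0) into bch2_write_op_init(). Below is a minimal userspace sketch of just the specifier encoding, reusing the two helpers shown in this header; the dummy write_point type and the low-bit interpretation are assumptions made for the sketch, not bcachefs code.

/* Illustrative sketch only - mirrors the two helpers shown above. */
#include <stdio.h>

struct write_point { int dummy; };

struct write_point_specifier {
        unsigned long v;
};

static inline struct write_point_specifier writepoint_hashed(unsigned long v)
{
        return (struct write_point_specifier) { .v = v | 1 };
}

static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
{
        return (struct write_point_specifier) { .v = (unsigned long) wp };
}

int main(void)
{
        static struct write_point btree_write_point;

        /*
         * Assumption: write_point structs are always at least 2-byte aligned,
         * so the low bit can distinguish hashed specifiers (bit set) from
         * direct write_point pointers (bit clear).
         */
        struct write_point_specifier a = writepoint_hashed(0);
        struct write_point_specifier b = writepoint_ptr(&btree_write_point);

        printf("hashed(0): v=%#lx low bit=%lu\n", a.v, a.v & 1);
        printf("ptr(wp):   v=%#lx low bit=%lu\n", b.v, b.v & 1);
        return 0;
}

Presumably the hashed form is what indexes the per-filesystem write_points_hash table declared in the bch_fs struct elsewhere in this commit, while the pointer form selects a dedicated write point (btree, copygc, tiering) directly.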

View File

@ -47,19 +47,14 @@ enum alloc_reserve {
#define OPEN_BUCKETS_COUNT 256
#define WRITE_POINT_COUNT 32
struct open_bucket_ptr {
struct bch_extent_ptr ptr;
unsigned sectors_free;
};
struct open_bucket {
spinlock_t lock;
atomic_t pin;
u8 freelist;
u8 new_ob;
u8 nr_ptrs;
struct open_bucket_ptr ptrs[BCH_REPLICAS_MAX * 2];
bool valid;
bool on_partial_list;
unsigned sectors_free;
struct bch_extent_ptr ptr;
};
struct write_point {
@ -69,13 +64,23 @@ struct write_point {
unsigned long write_point;
enum bch_data_type type;
u8 nr_ptrs;
/*
* number of pointers in @ob we can't use, because we already had
* pointers to those devices:
*/
u8 nr_ptrs_can_use;
/* calculated based on how many pointers we're actually going to use: */
unsigned sectors_free;
struct open_bucket *ob;
struct open_bucket *ptrs[BCH_REPLICAS_MAX * 2];
u64 next_alloc[BCH_SB_MEMBERS_MAX];
};
struct write_point_specifier {
unsigned long v;
};
struct alloc_heap_entry {
size_t bucket;
unsigned long key;

View File

@ -251,9 +251,6 @@ do { \
BCH_DEBUG_PARAM(debug_check_bkeys, \
"Run bkey_debugcheck (primarily checking GC/allocation "\
"information) when iterating over keys") \
BCH_DEBUG_PARAM(version_stress_test, \
"Assigns random version numbers to newly written " \
"extents, to test overlapping extent cases") \
BCH_DEBUG_PARAM(verify_btree_ondisk, \
"Reread btree nodes at various points to verify the " \
"mergesort in the read path against modifications " \
@ -310,8 +307,9 @@ struct crypto_blkcipher;
struct crypto_ahash;
enum gc_phase {
GC_PHASE_SB_METADATA = BTREE_ID_NR + 1,
GC_PHASE_SB = BTREE_ID_NR + 1,
GC_PHASE_PENDING_DELETE,
GC_PHASE_ALLOC,
GC_PHASE_DONE
};
@ -321,30 +319,6 @@ struct gc_pos {
unsigned level;
};
struct bch_member_cpu {
u64 nbuckets; /* device size */
u16 first_bucket; /* index of first bucket used */
u16 bucket_size; /* sectors */
u8 state;
u8 tier;
u8 replacement;
u8 discard;
u8 data_allowed;
u8 valid;
};
struct bch_replicas_cpu_entry {
u8 data_type;
u8 devs[BCH_SB_MEMBERS_MAX / 8];
};
struct bch_replicas_cpu {
struct rcu_head rcu;
unsigned nr;
unsigned entry_size;
struct bch_replicas_cpu_entry entries[];
};
struct io_count {
u64 sectors[2][BCH_DATA_NR];
};
@ -372,7 +346,7 @@ struct bch_dev {
struct bch_devs_mask self;
/* biosets used in cloned bios for replicas and moving_gc */
/* biosets used in cloned bios for writing multiple replicas */
struct bio_set replica_set;
struct task_struct *alloc_thread;
@ -392,7 +366,7 @@ struct bch_dev {
unsigned nr_invalidated;
bool alloc_thread_started;
struct open_bucket_ptr open_buckets_partial[BCH_REPLICAS_MAX * WRITE_POINT_COUNT];
u8 open_buckets_partial[OPEN_BUCKETS_COUNT];
unsigned open_buckets_partial_nr;
size_t fifo_last_bucket;
@ -422,18 +396,20 @@ struct bch_dev {
bool allocator_invalidating_data;
alloc_heap alloc_heap;
bucket_heap copygc_heap;
/* Moving GC: */
struct task_struct *moving_gc_read;
struct bch_pd_controller moving_gc_pd;
/* Copying GC: */
struct task_struct *copygc_thread;
copygc_heap copygc_heap;
struct bch_pd_controller copygc_pd;
struct write_point copygc_write_point;
struct journal_device journal;
struct work_struct io_error_work;
/* The rest of this all shows up in sysfs */
atomic_t latency[2];
struct io_count __percpu *io_done;
};
@ -473,6 +449,7 @@ struct bch_tier {
struct bch_pd_controller pd;
struct bch_devs_mask devs;
struct write_point wp;
};
enum bch_fs_state {
@ -557,10 +534,7 @@ struct bch_fs {
* when allocating btree reserves fail halfway through) - instead, we
* can stick them here:
*/
struct btree_alloc {
struct open_bucket *ob;
BKEY_PADDED(k);
} btree_reserve_cache[BTREE_NODE_RESERVE * 2];
struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2];
unsigned btree_reserve_cache_nr;
struct mutex btree_reserve_cache_lock;
@ -573,15 +547,9 @@ struct bch_fs {
struct workqueue_struct *copygc_wq;
/* ALLOCATION */
struct rw_semaphore alloc_gc_lock;
struct bch_pd_controller foreground_write_pd;
struct delayed_work pd_controllers_update;
unsigned pd_controllers_update_seconds;
spinlock_t foreground_write_pd_lock;
struct bch_write_op *write_wait_head;
struct bch_write_op *write_wait_tail;
struct timer_list foreground_write_wakeup;
/*
* These contain all r/w devices - i.e. devices we can currently
@ -622,8 +590,8 @@ struct bch_fs {
struct io_clock io_clock[2];
/* SECTOR ALLOCATOR */
spinlock_t open_buckets_lock;
/* ALLOCATOR */
spinlock_t freelist_lock;
u8 open_buckets_freelist;
u8 open_buckets_nr_free;
struct closure_waitlist open_buckets_wait;
@ -635,15 +603,6 @@ struct bch_fs {
struct hlist_head write_points_hash[WRITE_POINT_COUNT];
struct mutex write_points_hash_lock;
/*
* This write point is used for migrating data off a device
* and can point to any other device.
* We can't use the normal write points because those will
* gang up n replicas, and for migration we want only one new
* replica.
*/
struct write_point migration_write_point;
/* GARBAGE COLLECTION */
struct task_struct *gc_thread;
atomic_t kick_gc;
@ -688,6 +647,11 @@ struct bch_fs {
atomic64_t key_version;
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
struct bio_set dio_write_bioset;
struct bio_set dio_read_bioset;
struct bio_list btree_write_error_list;
struct work_struct btree_write_error_work;
spinlock_t btree_write_error_lock;
@ -728,19 +692,14 @@ struct bch_fs {
/* The rest of this all shows up in sysfs */
atomic_long_t read_realloc_races;
atomic_long_t extent_migrate_done;
atomic_long_t extent_migrate_raced;
unsigned btree_gc_periodic:1;
unsigned foreground_write_ratelimit_enabled:1;
unsigned copy_gc_enabled:1;
unsigned tiering_enabled:1;
unsigned tiering_percent;
/*
* foreground writes will be throttled when the number of free
* buckets is below this percentage
*/
unsigned foreground_target_percent;
#define BCH_DEBUG_PARAM(name, description) bool name;
BCH_DEBUG_PARAMS_ALL()
#undef BCH_DEBUG_PARAM

View File

@ -344,11 +344,13 @@ struct bch_csum {
enum bch_csum_type {
BCH_CSUM_NONE = 0,
BCH_CSUM_CRC32C = 1,
BCH_CSUM_CRC64 = 2,
BCH_CSUM_CRC32C_NONZERO = 1,
BCH_CSUM_CRC64_NONZERO = 2,
BCH_CSUM_CHACHA20_POLY1305_80 = 3,
BCH_CSUM_CHACHA20_POLY1305_128 = 4,
BCH_CSUM_NR = 5,
BCH_CSUM_CRC32C = 5,
BCH_CSUM_CRC64 = 6,
BCH_CSUM_NR = 7,
};
static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
@ -550,7 +552,7 @@ BKEY_VAL_TYPE(reservation, BCH_RESERVATION);
/* Maximum possible size of an entire extent value: */
/* There's a hack in the keylist code that needs to be fixed.. */
#define BKEY_EXTENT_VAL_U64s_MAX \
(BKEY_EXTENT_PTR_U64s_MAX * BCH_REPLICAS_MAX)
(BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
/* Maximum possible size of an entire extent, key + value: */
#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
@ -734,11 +736,13 @@ BKEY_VAL_TYPE(alloc, BCH_ALLOC);
/*
* Version 8: BCH_SB_ENCODED_EXTENT_MAX_BITS
* BCH_MEMBER_DATA_ALLOWED
* Version 9: incompatible extent nonce change
*/
#define BCH_SB_VERSION_MIN 7
#define BCH_SB_VERSION_EXTENT_MAX 8
#define BCH_SB_VERSION_MAX 8
#define BCH_SB_VERSION_EXTENT_NONCE_V1 9
#define BCH_SB_VERSION_MAX 9
#define BCH_SB_SECTOR 8
#define BCH_SB_LABEL_SIZE 32
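
The checksum enum now distinguishes the old CRC variants (renamed BCH_CSUM_CRC32C_NONZERO / BCH_CSUM_CRC64_NONZERO) from new plain CRC32C/CRC64 types, and BCH_SB_VERSION_EXTENT_NONCE_V1 bumps the superblock version for the incompatible extent nonce change. Judging by the checksum hunks at the end of this diff, the only difference between the variants is the seed and the final xor. A standalone sketch of that convention follows; the bitwise CRC-32C helper and the sample buffer are illustrative, not bcachefs code.

/* Sketch of the _NONZERO vs plain CRC32C seed/finalization convention. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* bitwise CRC-32C (Castagnoli), reflected polynomial 0x82F63B78 */
static uint32_t crc32c_update(uint32_t crc, const void *data, size_t len)
{
        const uint8_t *p = data;

        while (len--) {
                crc ^= *p++;
                for (int i = 0; i < 8; i++)
                        crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
        }
        return crc;
}

int main(void)
{
        const char *buf = "bcachefs";
        size_t len = strlen(buf);

        /* BCH_CSUM_CRC32C_NONZERO: init U32_MAX, final xor with U32_MAX */
        uint32_t nonzero = crc32c_update(~0u, buf, len) ^ ~0u;

        /* BCH_CSUM_CRC32C: init 0, no final xor */
        uint32_t plain = crc32c_update(0, buf, len);

        printf("crc32c_nonzero=%#x crc32c=%#x\n", nonzero, plain);
        return 0;
}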

View File

@ -4,6 +4,14 @@
#include "bset.h"
#include "util.h"
#undef EBUG_ON
#ifdef DEBUG_BKEYS
#define EBUG_ON(cond) BUG_ON(cond)
#else
#define EBUG_ON(cond)
#endif
const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT;
struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,

View File

@ -146,6 +146,17 @@
* first key in that range of bytes again.
*/
extern bool bch2_expensive_debug_checks;
static inline bool btree_keys_expensive_checks(const struct btree *b)
{
#ifdef CONFIG_BCACHEFS_DEBUG
return bch2_expensive_debug_checks || *b->expensive_debug_checks;
#else
return false;
#endif
}
struct btree_node_iter;
struct btree_node_iter_set;
@ -188,7 +199,7 @@ bkey_unpack_key_format_checked(const struct btree *b,
compiled_unpack_fn unpack_fn = b->aux_data;
unpack_fn(&dst, src);
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
if (btree_keys_expensive_checks(b)) {
struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
/*
@ -260,17 +271,6 @@ static inline struct bkey_s __bkey_disassemble(struct btree *b,
#define for_each_bset(_b, _t) \
for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
extern bool bch2_expensive_debug_checks;
static inline bool btree_keys_expensive_checks(struct btree *b)
{
#ifdef CONFIG_BCACHEFS_DEBUG
return bch2_expensive_debug_checks || *b->expensive_debug_checks;
#else
return false;
#endif
}
static inline bool bset_has_ro_aux_tree(struct bset_tree *t)
{
return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;

View File

@ -24,6 +24,7 @@
#include <linux/bitops.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <trace/events/bcachefs.h>
@ -111,19 +112,35 @@ u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k)
/*
* For runtime mark and sweep:
*/
static u8 bch2_btree_mark_key(struct bch_fs *c, enum bkey_type type,
static u8 bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k, unsigned flags)
{
struct gc_pos pos = { 0 };
struct bch_fs_usage *stats;
u8 ret = 0;
preempt_disable();
stats = this_cpu_ptr(c->usage_percpu);
switch (type) {
case BKEY_TYPE_BTREE:
bch2_gc_mark_key(c, k, c->opts.btree_node_size, true, flags);
return 0;
bch2_mark_key(c, k, c->opts.btree_node_size, true, pos, stats,
0, flags|
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
break;
case BKEY_TYPE_EXTENTS:
bch2_gc_mark_key(c, k, k.k->size, false, flags);
return bch2_btree_key_recalc_oldest_gen(c, k);
bch2_mark_key(c, k, k.k->size, false, pos, stats,
0, flags|
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
ret = bch2_btree_key_recalc_oldest_gen(c, k);
break;
default:
BUG();
}
preempt_enable();
return ret;
}
int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
@ -182,7 +199,7 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
max_t(u64, k.k->version.lo,
atomic64_read(&c->key_version)));
bch2_btree_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC);
bch2_gc_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC);
fsck_err:
return ret;
}
@ -200,7 +217,7 @@ static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b)
btree_node_is_extents(b),
&unpacked) {
bch2_bkey_debugcheck(c, b, k);
stale = max(stale, bch2_btree_mark_key(c, type, k, 0));
stale = max(stale, bch2_gc_mark_key(c, type, k, 0));
}
return stale;
@ -267,123 +284,79 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
mutex_lock(&c->btree_root_lock);
b = c->btree_roots[btree_id].b;
bch2_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0);
bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0);
gc_pos_set(c, gc_pos_btree_root(b->btree_id));
mutex_unlock(&c->btree_root_lock);
return 0;
}
static void bch2_mark_allocator_buckets(struct bch_fs *c)
{
struct bch_dev *ca;
struct open_bucket *ob;
const struct open_bucket_ptr *ptr;
size_t i, j, iter;
unsigned ci;
down_write(&c->alloc_gc_lock);
for_each_member_device(ca, c, ci) {
spin_lock(&ca->freelist_lock);
fifo_for_each_entry(i, &ca->free_inc, iter)
bch2_mark_alloc_bucket(ca, &ca->buckets[i], true);
for (j = 0; j < RESERVE_NR; j++)
fifo_for_each_entry(i, &ca->free[j], iter)
bch2_mark_alloc_bucket(ca, &ca->buckets[i], true);
for (ptr = ca->open_buckets_partial;
ptr < ca->open_buckets_partial + ca->open_buckets_partial_nr;
ptr++)
bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), true);
spin_unlock(&ca->freelist_lock);
}
for (ob = c->open_buckets;
ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
ob++) {
spin_lock(&ob->lock);
open_bucket_for_each_ptr(ob, ptr) {
ca = c->devs[ptr->ptr.dev];
bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), true);
}
spin_unlock(&ob->lock);
}
up_write(&c->alloc_gc_lock);
}
static void mark_metadata_sectors(struct bch_dev *ca, u64 start, u64 end,
enum bucket_data_type type)
static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
u64 start, u64 end,
enum bucket_data_type type,
unsigned flags)
{
u64 b = sector_to_bucket(ca, start);
do {
bch2_mark_metadata_bucket(ca, ca->buckets + b, type, true);
bch2_mark_metadata_bucket(c, ca, ca->buckets + b, type,
gc_phase(GC_PHASE_SB), flags);
b++;
} while (b < sector_to_bucket(ca, end));
}
static void bch2_dev_mark_superblocks(struct bch_dev *ca)
void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
unsigned flags)
{
struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
unsigned i;
for (i = 0; i < layout->nr_superblocks; i++) {
if (layout->sb_offset[i] == BCH_SB_SECTOR)
mark_metadata_sectors(ca, 0, BCH_SB_SECTOR,
BUCKET_SB);
mark_metadata_sectors(ca,
layout->sb_offset[i],
layout->sb_offset[i] +
(1 << layout->sb_max_size_bits),
BUCKET_SB);
}
}
/*
* Mark non btree metadata - prios, journal
*/
void bch2_mark_dev_metadata(struct bch_fs *c, struct bch_dev *ca)
{
unsigned i;
u64 b;
lockdep_assert_held(&c->sb_lock);
bch2_dev_mark_superblocks(ca);
for (i = 0; i < layout->nr_superblocks; i++) {
if (layout->sb_offset[i] == BCH_SB_SECTOR)
mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
BUCKET_SB, flags);
mark_metadata_sectors(c, ca,
layout->sb_offset[i],
layout->sb_offset[i] +
(1 << layout->sb_max_size_bits),
BUCKET_SB, flags);
}
spin_lock(&c->journal.lock);
for (i = 0; i < ca->journal.nr; i++) {
b = ca->journal.buckets[i];
bch2_mark_metadata_bucket(ca, ca->buckets + b,
BUCKET_JOURNAL, true);
bch2_mark_metadata_bucket(c, ca, ca->buckets + b,
BUCKET_JOURNAL,
gc_phase(GC_PHASE_SB), flags);
}
spin_unlock(&c->journal.lock);
}
static void bch2_mark_metadata(struct bch_fs *c)
static void bch2_mark_superblocks(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
mutex_lock(&c->sb_lock);
gc_pos_set(c, gc_phase(GC_PHASE_SB_METADATA));
gc_pos_set(c, gc_phase(GC_PHASE_SB));
for_each_online_member(ca, c, i)
bch2_mark_dev_metadata(c, ca);
bch2_mark_dev_superblock(c, ca,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
mutex_unlock(&c->sb_lock);
}
/* Also see bch2_pending_btree_node_free_insert_done() */
static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
{
struct gc_pos pos = { 0 };
struct bch_fs_usage stats = { 0 };
struct btree_update *as;
struct pending_btree_node_free *d;
@ -393,10 +366,11 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
for_each_pending_btree_node_free(c, as, d)
if (d->index_update_done)
__bch2_mark_key(c, bkey_i_to_s_c(&d->key),
c->opts.btree_node_size, true,
bch2_mark_key(c, bkey_i_to_s_c(&d->key),
c->opts.btree_node_size, true, pos,
&stats, 0,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
/*
* Don't apply stats - pending deletes aren't tracked in
* bch_alloc_stats:
@ -405,6 +379,51 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
mutex_unlock(&c->btree_interior_update_lock);
}
static void bch2_mark_allocator_buckets(struct bch_fs *c)
{
struct bch_dev *ca;
struct open_bucket *ob;
size_t i, j, iter;
unsigned ci;
spin_lock(&c->freelist_lock);
gc_pos_set(c, gc_pos_alloc(c, NULL));
for_each_member_device(ca, c, ci) {
fifo_for_each_entry(i, &ca->free_inc, iter)
bch2_mark_alloc_bucket(c, ca, &ca->buckets[i], true,
gc_pos_alloc(c, NULL),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
for (j = 0; j < RESERVE_NR; j++)
fifo_for_each_entry(i, &ca->free[j], iter)
bch2_mark_alloc_bucket(c, ca, &ca->buckets[i], true,
gc_pos_alloc(c, NULL),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
}
spin_unlock(&c->freelist_lock);
for (ob = c->open_buckets;
ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
ob++) {
spin_lock(&ob->lock);
if (ob->valid) {
gc_pos_set(c, gc_pos_alloc(c, ob));
ca = c->devs[ob->ptr.dev];
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), true,
gc_pos_alloc(c, ob),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
}
spin_unlock(&ob->lock);
}
}
void bch2_gc_start(struct bch_fs *c)
{
struct bch_dev *ca;
@ -495,9 +514,6 @@ void bch2_gc(struct bch_fs *c)
bch2_gc_start(c);
/* Walk allocator's references: */
bch2_mark_allocator_buckets(c);
/* Walk btree: */
while (c->gc_pos.phase < (int) BTREE_ID_NR) {
int ret = c->btree_roots[c->gc_pos.phase].b
@ -513,8 +529,9 @@ void bch2_gc(struct bch_fs *c)
gc_pos_set(c, gc_phase(c->gc_pos.phase + 1));
}
bch2_mark_metadata(c);
bch2_mark_superblocks(c);
bch2_mark_pending_btree_node_frees(c);
bch2_mark_allocator_buckets(c);
for_each_member_device(ca, c, i)
atomic_long_set(&ca->saturated_count, 0);
@ -570,7 +587,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
struct bkey_format new_format;
memset(new_nodes, 0, sizeof(new_nodes));
bch2_keylist_init(&keylist, NULL, 0);
bch2_keylist_init(&keylist, NULL);
/* Count keys that are not deleted */
for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++)
@ -1023,8 +1040,6 @@ again:
if (ret)
return ret;
bch2_mark_metadata(c);
if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
if (iter++ > 2) {
bch_info(c, "Unable to fix bucket gens, looping");
@ -1043,6 +1058,8 @@ again:
if (c->sb.encryption_type)
atomic64_add(1 << 16, &c->key_version);
bch2_mark_superblocks(c);
gc_pos_set(c, gc_phase(GC_PHASE_DONE));
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);

View File

@ -13,7 +13,7 @@ int bch2_initial_gc(struct bch_fs *, struct list_head *);
u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *, struct bkey_s_c);
int bch2_btree_mark_key_initial(struct bch_fs *, enum bkey_type,
struct bkey_s_c);
void bch2_mark_dev_metadata(struct bch_fs *, struct bch_dev *);
void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned);
/*
* For concurrent mark and sweep (with other index updates), we define a total
@ -88,6 +88,14 @@ static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
};
}
static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob)
{
return (struct gc_pos) {
.phase = GC_PHASE_ALLOC,
.pos = POS(ob ? ob - c->open_buckets : 0, 0),
};
}
static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos)
{
unsigned seq;
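
gc_pos_alloc() gives each allocator reference (keyed by open bucket index) a slot in the total order that mark and sweep walks, which is what lets bch2_mark_alloc_bucket() decide whether GC will still visit it. The comparison itself is not shown in this hunk; the sketch below is one plausible lexicographic ordering (phase, then position, then level) over simplified stand-in types - the helper name, the enum values and the reduced struct bpos are all assumptions, not the kernel definitions.

/* Illustrative only - minimal stand-ins for the kernel types. */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;

struct bpos { u64 inode, offset; };     /* simplified */

/* values illustrative; the real enum starts at BTREE_ID_NR + 1 */
enum gc_phase_sketch { GC_PHASE_SB = 1, GC_PHASE_PENDING_DELETE, GC_PHASE_ALLOC, GC_PHASE_DONE };

struct gc_pos {
        enum gc_phase_sketch phase;
        struct bpos pos;
        unsigned level;
};

/* Hypothetical: order by phase, then key position, then btree level. */
static int gc_pos_cmp_sketch(struct gc_pos l, struct gc_pos r)
{
        if (l.phase != r.phase)
                return l.phase < r.phase ? -1 : 1;
        if (l.pos.inode != r.pos.inode)
                return l.pos.inode < r.pos.inode ? -1 : 1;
        if (l.pos.offset != r.pos.offset)
                return l.pos.offset < r.pos.offset ? -1 : 1;
        if (l.level != r.level)
                return l.level < r.level ? -1 : 1;
        return 0;
}

int main(void)
{
        struct gc_pos gc_cur = { .phase = GC_PHASE_SB };
        struct gc_pos alloc  = { .phase = GC_PHASE_ALLOC, .pos = { 0, 0 } };

        /* GC is still marking superblocks, so it will reach the allocator buckets later: */
        printf("gc will visit allocator buckets: %d\n",
               gc_pos_cmp_sketch(gc_cur, alloc) < 0);
        return 0;
}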

View File

@ -146,9 +146,7 @@ static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
BUG_ON(iter->data->k > iter->data->end);
if (iter->data->k == iter->data->end)
memmove(&iter->data[0],
&iter->data[1],
sizeof(iter->data[0]) * --iter->used);
array_remove_item(iter->data, iter->used, 0);
else
sort_iter_sift(iter, cmp);
}
@ -1307,6 +1305,8 @@ static void btree_node_read_endio(struct bio *bio)
struct btree_read_bio *rb =
container_of(bio, struct btree_read_bio, bio);
bch2_latency_acct(rb->pick.ca, rb->start_time >> 10, READ);
INIT_WORK(&rb->work, btree_node_read_work);
schedule_work(&rb->work);
}
@ -1471,6 +1471,8 @@ static void btree_node_write_endio(struct bio *bio)
struct bch_fs *c = wbio->c;
struct bch_dev *ca = wbio->ca;
bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
if (bch2_dev_io_err_on(bio->bi_error, ca, "btree write") ||
bch2_meta_write_fault("btree"))
set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed);

View File

@ -10,6 +10,7 @@ struct btree_iter;
struct btree_read_bio {
struct bch_fs *c;
unsigned submit_time_us;
u64 start_time;
struct extent_pick_ptr pick;
struct work_struct work;

View File

@ -91,7 +91,7 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
{
int lock_type = btree_node_locked_type(iter, level);
EBUG_ON(iter->flags & BTREE_ITER_UPTODATE);
EBUG_ON(!level && iter->flags & BTREE_ITER_UPTODATE);
if (lock_type != BTREE_NODE_UNLOCKED)
six_unlock_type(&iter->nodes[level]->lock, lock_type);

View File

@ -55,6 +55,16 @@ struct btree_write {
struct closure_waitlist wait;
};
struct btree_ob_ref {
u8 nr;
u8 refs[BCH_REPLICAS_MAX];
};
struct btree_alloc {
struct btree_ob_ref ob;
BKEY_PADDED(k);
};
struct btree {
/* Hottest entries first */
struct rhash_head hash;
@ -118,7 +128,7 @@ struct btree {
*/
struct btree_update *will_make_reachable;
struct open_bucket *ob;
struct btree_ob_ref ob;
/* lru list */
struct list_head list;
@ -317,18 +327,6 @@ struct btree_root {
struct btree_iter;
struct btree_node_iter;
enum extent_insert_hook_ret {
BTREE_HOOK_DO_INSERT,
BTREE_HOOK_NO_INSERT,
BTREE_HOOK_RESTART_TRANS,
};
struct extent_insert_hook {
enum extent_insert_hook_ret
(*fn)(struct extent_insert_hook *, struct bpos, struct bpos,
struct bkey_s_c, const struct bkey_i *);
};
enum btree_insert_ret {
BTREE_INSERT_OK,
/* extent spanned multiple leaf nodes: have to traverse to next node: */
@ -342,6 +340,12 @@ enum btree_insert_ret {
BTREE_INSERT_NEED_GC_LOCK,
};
struct extent_insert_hook {
enum btree_insert_ret
(*fn)(struct extent_insert_hook *, struct bpos, struct bpos,
struct bkey_s_c, const struct bkey_i *);
};
enum btree_gc_coalesce_fail_reason {
BTREE_GC_COALESCE_FAIL_RESERVE_GET,
BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
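
struct btree_ob_ref replaces the btree node's single struct open_bucket pointer with a count plus a small array of indices into c->open_buckets, released via bch2_open_bucket_put_refs() as the btree interior update hunks below show. A toy userspace version of that index-plus-refcount pattern follows; the plain int pin stands in for the kernel's atomic_t, and BCH_REPLICAS_MAX is assumed to be 4 here.

/* Sketch of the index-plus-refcount scheme, not the kernel implementation. */
#include <stdio.h>

#define OPEN_BUCKETS_COUNT      256
#define BCH_REPLICAS_MAX        4       /* assumed for the sketch */

struct open_bucket { int pin; };        /* atomic_t in the real code */

struct btree_ob_ref {
        unsigned char nr;
        unsigned char refs[BCH_REPLICAS_MAX];   /* indices into c->open_buckets */
};

static struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];

/* mirrors bch2_open_bucket_put_refs(): drop each pin, then clear the count */
static void put_refs(struct btree_ob_ref *ob)
{
        unsigned i;

        for (i = 0; i < ob->nr; i++)
                open_buckets[ob->refs[i]].pin--;
        ob->nr = 0;
}

int main(void)
{
        struct btree_ob_ref ob = { .nr = 2, .refs = { 3, 7 } };

        open_buckets[3].pin = open_buckets[7].pin = 1;
        put_refs(&ob);
        printf("pins now: %d %d, nr=%d\n",
               open_buckets[3].pin, open_buckets[7].pin, ob.nr);
        return 0;
}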

View File

@ -211,7 +211,7 @@ found:
-c->opts.btree_node_size, true, b
? gc_pos_btree_node(b)
: gc_pos_btree_root(as->btree_id),
&tmp, 0);
&tmp, 0, 0);
/*
* Don't apply tmp - pending deletes aren't tracked in
* bch_alloc_stats:
@ -229,7 +229,7 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
BUG_ON(btree_node_dirty(b));
BUG_ON(btree_node_need_write(b));
BUG_ON(b == btree_node_root(c, b));
BUG_ON(b->ob);
BUG_ON(b->ob.nr);
BUG_ON(!list_empty(&b->write_blocked));
BUG_ON(b->will_make_reachable);
@ -254,17 +254,17 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
{
struct open_bucket *ob = b->ob;
struct btree_ob_ref ob = b->ob;
btree_update_drop_new_node(c, b);
b->ob = NULL;
b->ob.nr = 0;
clear_btree_node_dirty(b);
__btree_node_free(c, b, NULL);
bch2_open_bucket_put(c, ob);
bch2_open_bucket_put_refs(c, &ob.nr, ob.refs);
}
void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
@ -287,7 +287,7 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c,
bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
-c->opts.btree_node_size, true,
gc_phase(GC_PHASE_PENDING_DELETE),
&stats, 0);
&stats, 0, 0);
/*
* Don't apply stats - pending deletes aren't tracked in
* bch_alloc_stats:
@ -296,8 +296,7 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c,
void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *b)
{
bch2_open_bucket_put(c, b->ob);
b->ob = NULL;
bch2_open_bucket_put_refs(c, &b->ob.nr, b->ob.refs);
}
static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
@ -305,9 +304,12 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
struct closure *cl,
unsigned flags)
{
BKEY_PADDED(k) tmp;
struct open_bucket *ob;
struct write_point *wp;
struct btree *b;
BKEY_PADDED(k) tmp;
struct bkey_i_extent *e;
struct btree_ob_ref ob;
struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
unsigned nr_reserve;
enum alloc_reserve alloc_reserve;
@ -335,31 +337,41 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
mutex_unlock(&c->btree_reserve_cache_lock);
retry:
/* alloc_sectors is weird, I suppose */
bkey_extent_init(&tmp.k);
tmp.k.k.size = c->opts.btree_node_size,
ob = bch2_alloc_sectors(c, BCH_DATA_BTREE, 0, 0,
bkey_i_to_extent(&tmp.k),
wp = bch2_alloc_sectors_start(c, NULL,
writepoint_ptr(&c->btree_write_point),
&devs_have,
res->nr_replicas,
c->opts.metadata_replicas_required,
alloc_reserve, 0, cl);
if (IS_ERR(ob))
return ERR_CAST(ob);
if (IS_ERR(wp))
return ERR_CAST(wp);
if (tmp.k.k.size < c->opts.btree_node_size) {
bch2_open_bucket_put(c, ob);
if (wp->sectors_free < c->opts.btree_node_size) {
struct open_bucket *ob;
unsigned i;
writepoint_for_each_ptr(wp, ob, i)
if (ob->sectors_free < c->opts.btree_node_size)
ob->sectors_free = 0;
bch2_alloc_sectors_done(c, wp);
goto retry;
}
e = bkey_extent_init(&tmp.k);
bch2_alloc_sectors_append_ptrs(c, wp, e, c->opts.btree_node_size);
ob.nr = 0;
bch2_open_bucket_get(c, wp, &ob.nr, ob.refs);
bch2_alloc_sectors_done(c, wp);
mem_alloc:
b = bch2_btree_node_mem_alloc(c);
/* we hold cannibalize_lock: */
BUG_ON(IS_ERR(b));
BUG_ON(b->ob);
BUG_ON(b->ob.nr);
bkey_copy(&b->key, &tmp.k);
b->key.k.size = 0;
b->ob = ob;
return b;
@ -466,11 +478,10 @@ static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reser
&c->btree_reserve_cache[c->btree_reserve_cache_nr++];
a->ob = b->ob;
b->ob = NULL;
b->ob.nr = 0;
bkey_copy(&a->k, &b->key);
} else {
bch2_open_bucket_put(c, b->ob);
b->ob = NULL;
bch2_btree_open_bucket_put(c, b);
}
__btree_node_free(c, b, NULL);
@ -857,10 +868,7 @@ static void __btree_interior_update_drop_new_node(struct btree *b)
BUG();
found:
as->nr_new_nodes--;
memmove(&as->new_nodes[i],
&as->new_nodes[i + 1],
sizeof(struct btree *) * (as->nr_new_nodes - i));
array_remove_item(as->new_nodes, as->nr_new_nodes, i);
b->will_make_reachable = NULL;
}
@ -1000,8 +1008,7 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
as->reserve = reserve;
INIT_LIST_HEAD(&as->write_blocked_list);
bch2_keylist_init(&as->parent_keys, as->inline_keys,
ARRAY_SIZE(as->inline_keys));
bch2_keylist_init(&as->parent_keys, as->inline_keys);
mutex_lock(&c->btree_interior_update_lock);
list_add(&as->list, &c->btree_interior_update_list);
@ -1037,7 +1044,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
bch2_mark_key(c, bkey_i_to_s_c(&b->key),
c->opts.btree_node_size, true,
gc_pos_btree_root(b->btree_id),
&stats, 0);
&stats, 0, 0);
if (old)
bch2_btree_node_free_index(as, NULL,
@ -1121,7 +1128,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
if (bkey_extent_is_data(&insert->k))
bch2_mark_key(c, bkey_i_to_s_c(insert),
c->opts.btree_node_size, true,
gc_pos_btree_node(b), &stats, 0);
gc_pos_btree_node(b), &stats, 0, 0);
while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
!btree_iter_pos_cmp_packed(b, &insert->k.p, k, false))
@ -1479,6 +1486,13 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
struct closure cl;
int ret = 0;
/*
* We already have a disk reservation and open buckets pinned; this
* allocation must not block:
*/
if (iter->btree_id == BTREE_ID_EXTENTS)
btree_reserve_flags |= BTREE_INSERT_USE_RESERVE;
closure_init_stack(&cl);
/* Hack, because gc and splitting nodes doesn't mix yet: */
@ -1519,6 +1533,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
bch2_btree_iter_set_locks_want(iter, 1);
out:
up_read(&c->gc_lock);
closure_sync(&cl);
return ret;
}
@ -1904,7 +1919,7 @@ retry:
bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i),
c->opts.btree_node_size, true,
gc_pos_btree_root(b->btree_id),
&stats, 0);
&stats, 0, 0);
bch2_btree_node_free_index(as, NULL,
bkey_i_to_s_c(&b->key),
&stats);
@ -1928,6 +1943,7 @@ out:
}
bch2_btree_iter_unlock(&iter);
up_read(&c->gc_lock);
closure_sync(&cl);
return ret;
err:
if (as)
@ -1965,13 +1981,13 @@ int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE,
&cl);
closure_sync(&cl);
if (!IS_ERR(as))
break;
if (PTR_ERR(as) == -ENOSPC)
return PTR_ERR(as);
closure_sync(&cl);
}
b = __btree_root_alloc(as, 0);

View File

@ -355,6 +355,11 @@ retry:
multi_lock_write(c, trans);
if (race_fault()) {
ret = -EINTR;
goto unlock;
}
u64s = 0;
trans_for_each_entry(trans, i) {
/* Multiple inserts might go to same leaf: */

View File

@ -101,9 +101,41 @@ static void bch2_fs_stats_verify(struct bch_fs *c)
stats.online_reserved);
}
static void bch2_dev_stats_verify(struct bch_dev *ca)
{
struct bch_dev_usage stats =
__bch2_dev_usage_read(ca);
u64 n = ca->mi.nbuckets - ca->mi.first_bucket;
BUG_ON(stats.buckets[S_META] > n);
BUG_ON(stats.buckets[S_DIRTY] > n);
BUG_ON(stats.buckets_cached > n);
BUG_ON(stats.buckets_alloc > n);
BUG_ON(stats.buckets_unavailable > n);
}
static void bch2_disk_reservations_verify(struct bch_fs *c, int flags)
{
if (!(flags & BCH_DISK_RESERVATION_NOFAIL)) {
u64 used = __bch2_fs_sectors_used(c);
u64 cached = 0;
u64 avail = atomic64_read(&c->sectors_available);
int cpu;
for_each_possible_cpu(cpu)
cached += per_cpu_ptr(c->usage_percpu, cpu)->available_cache;
if (used + avail + cached > c->capacity)
panic("used %llu avail %llu cached %llu capacity %llu\n",
used, avail, cached, c->capacity);
}
}
#else
static void bch2_fs_stats_verify(struct bch_fs *c) {}
static void bch2_dev_stats_verify(struct bch_dev *ca) {}
static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) {}
#endif
@ -171,11 +203,9 @@ struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca)
return bch2_usage_read_raw(ca->usage_percpu);
}
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
{
return bch2_usage_read_cached(ca->fs,
ca->usage_cached,
ca->usage_percpu);
return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu);
}
struct bch_fs_usage
@ -208,6 +238,11 @@ static inline int is_cached_bucket(struct bucket_mark m)
!m.dirty_sectors && !!m.cached_sectors;
}
static inline int is_unavailable_bucket(struct bucket_mark m)
{
return !is_available_bucket(m);
}
static inline enum s_alloc bucket_type(struct bucket_mark m)
{
return is_meta_bucket(m) ? S_META : S_DIRTY;
@ -256,12 +291,15 @@ void bch2_fs_usage_apply(struct bch_fs *c,
memset(stats, 0, sizeof(*stats));
}
static void bch2_dev_usage_update(struct bch_dev *ca,
struct bucket_mark old, struct bucket_mark new)
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
struct bucket *g, struct bucket_mark old,
struct bucket_mark new)
{
struct bch_fs *c = ca->fs;
struct bch_dev_usage *dev_usage;
BUG_ON((g - ca->buckets) < ca->mi.first_bucket ||
(g - ca->buckets) >= ca->mi.nbuckets);
bch2_fs_inconsistent_on(old.data_type && new.data_type &&
old.data_type != new.data_type, c,
"different types of metadata in same bucket: %u, %u",
@ -270,38 +308,44 @@ static void bch2_dev_usage_update(struct bch_dev *ca,
preempt_disable();
dev_usage = this_cpu_ptr(ca->usage_percpu);
dev_usage->sectors_cached +=
(int) new.cached_sectors - (int) old.cached_sectors;
dev_usage->buckets[S_META] +=
is_meta_bucket(new) - is_meta_bucket(old);
dev_usage->buckets[S_DIRTY] +=
is_dirty_bucket(new) - is_dirty_bucket(old);
dev_usage->buckets_cached +=
is_cached_bucket(new) - is_cached_bucket(old);
dev_usage->buckets_alloc +=
(int) new.owned_by_allocator - (int) old.owned_by_allocator;
dev_usage->buckets_unavailable +=
is_unavailable_bucket(new) - is_unavailable_bucket(old);
dev_usage->sectors[bucket_type(old)] -= old.dirty_sectors;
dev_usage->sectors[bucket_type(new)] += new.dirty_sectors;
dev_usage->buckets_alloc +=
(int) new.owned_by_allocator - (int) old.owned_by_allocator;
dev_usage->buckets[S_META] += is_meta_bucket(new) - is_meta_bucket(old);
dev_usage->buckets[S_DIRTY] += is_dirty_bucket(new) - is_dirty_bucket(old);
dev_usage->buckets_cached += is_cached_bucket(new) - is_cached_bucket(old);
dev_usage->sectors_cached +=
(int) new.cached_sectors - (int) old.cached_sectors;
preempt_enable();
if (!is_available_bucket(old) && is_available_bucket(new))
bch2_wake_allocator(ca);
bch2_dev_stats_verify(ca);
}
#define bucket_data_cmpxchg(ca, g, new, expr) \
#define bucket_data_cmpxchg(c, ca, g, new, expr) \
({ \
struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
\
bch2_dev_usage_update(ca, _old, new); \
bch2_dev_usage_update(c, ca, g, _old, new); \
_old; \
})
bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
struct bucket_mark *old)
bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
struct bucket *g, struct bucket_mark *old)
{
struct bucket_mark new;
*old = bucket_data_cmpxchg(ca, g, new, ({
lg_local_lock(&c->usage_lock);
*old = bucket_data_cmpxchg(c, ca, g, new, ({
if (!is_available_bucket(new))
return false;
@ -312,6 +356,7 @@ bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
new.dirty_sectors = 0;
new.gen++;
}));
lg_local_unlock(&c->usage_lock);
if (!old->owned_by_allocator && old->cached_sectors)
trace_invalidate(ca, bucket_to_sector(ca, g - ca->buckets),
@ -319,11 +364,13 @@ bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
return true;
}
bool bch2_mark_alloc_bucket_startup(struct bch_dev *ca, struct bucket *g)
bool bch2_mark_alloc_bucket_startup(struct bch_fs *c, struct bch_dev *ca,
struct bucket *g)
{
struct bucket_mark new, old;
old = bucket_data_cmpxchg(ca, g, new, ({
lg_local_lock(&c->usage_lock);
old = bucket_data_cmpxchg(c, ca, g, new, ({
if (new.touched_this_mount ||
!is_available_bucket(new))
return false;
@ -331,37 +378,32 @@ bool bch2_mark_alloc_bucket_startup(struct bch_dev *ca, struct bucket *g)
new.owned_by_allocator = 1;
new.touched_this_mount = 1;
}));
lg_local_unlock(&c->usage_lock);
return true;
}
void bch2_mark_free_bucket(struct bch_dev *ca, struct bucket *g)
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
struct bucket *g, bool owned_by_allocator,
struct gc_pos pos, unsigned flags)
{
struct bucket_mark old, new;
old = bucket_data_cmpxchg(ca, g, new, ({
new.touched_this_mount = 1;
new.owned_by_allocator = 0;
new.data_type = 0;
new.cached_sectors = 0;
new.dirty_sectors = 0;
}));
BUG_ON(bucket_became_unavailable(ca->fs, old, new));
lg_local_lock(&c->usage_lock);
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
gc_will_visit(c, pos)) {
lg_local_unlock(&c->usage_lock);
return;
}
void bch2_mark_alloc_bucket(struct bch_dev *ca, struct bucket *g,
bool owned_by_allocator)
{
struct bucket_mark old, new;
old = bucket_data_cmpxchg(ca, g, new, ({
old = bucket_data_cmpxchg(c, ca, g, new, ({
new.touched_this_mount = 1;
new.owned_by_allocator = owned_by_allocator;
}));
lg_local_unlock(&c->usage_lock);
BUG_ON(!owned_by_allocator && !old.owned_by_allocator &&
ca->fs->gc_pos.phase == GC_PHASE_DONE);
c->gc_pos.phase == GC_PHASE_DONE);
}
#define saturated_add(ca, dst, src, max) \
@ -377,41 +419,49 @@ do { \
} \
} while (0)
void bch2_mark_metadata_bucket(struct bch_dev *ca, struct bucket *g,
enum bucket_data_type type,
bool may_make_unavailable)
void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
struct bucket *g, enum bucket_data_type type,
struct gc_pos pos, unsigned flags)
{
struct bucket_mark old, new;
BUG_ON(!type);
old = bucket_data_cmpxchg(ca, g, new, ({
lg_local_lock(&c->usage_lock);
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
gc_will_visit(c, pos)) {
lg_local_unlock(&c->usage_lock);
return;
}
old = bucket_data_cmpxchg(c, ca, g, new, ({
saturated_add(ca, new.dirty_sectors, ca->mi.bucket_size,
GC_MAX_SECTORS_USED);
new.data_type = type;
new.touched_this_mount = 1;
}));
lg_local_unlock(&c->usage_lock);
if (old.data_type != type &&
(old.data_type ||
old.cached_sectors ||
old.dirty_sectors))
bch_err(ca->fs, "bucket %zu has multiple types of data (%u, %u)",
bch_err(c, "bucket %zu has multiple types of data (%u, %u)",
g - ca->buckets, old.data_type, new.data_type);
BUG_ON(!may_make_unavailable &&
bucket_became_unavailable(ca->fs, old, new));
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
bucket_became_unavailable(c, old, new));
}
/* Reverting this until the copygc + compression issue is fixed: */
static int __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors)
{
if (!sectors)
return 0;
return max(1U, DIV_ROUND_UP(sectors * crc_compressed_size(NULL, crc),
crc_uncompressed_size(NULL, crc)));
return max(1U, DIV_ROUND_UP(sectors * crc.compressed_size,
crc.uncompressed_size));
}
/*
@ -421,8 +471,8 @@ static int __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
*/
static void bch2_mark_pointer(struct bch_fs *c,
struct bkey_s_c_extent e,
const union bch_extent_crc *crc,
const struct bch_extent_ptr *ptr,
struct bch_extent_crc_unpacked crc,
s64 sectors, enum s_alloc type,
struct bch_fs_usage *stats,
u64 journal_seq, unsigned flags)
@ -435,7 +485,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
? BUCKET_BTREE : BUCKET_DATA;
u64 v;
if (crc_compression_type(crc)) {
if (crc.compression_type) {
unsigned old_sectors, new_sectors;
if (sectors > 0) {
@ -512,13 +562,13 @@ static void bch2_mark_pointer(struct bch_fs *c,
old.counter,
new.counter)) != old.counter);
bch2_dev_usage_update(ca, old, new);
bch2_dev_usage_update(c, ca, g, old, new);
if (old.data_type != data_type &&
(old.data_type ||
old.cached_sectors ||
old.dirty_sectors))
bch_err(ca->fs, "bucket %zu has multiple types of data (%u, %u)",
bch_err(c, "bucket %zu has multiple types of data (%u, %u)",
g - ca->buckets, old.data_type, new.data_type);
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
@ -535,71 +585,12 @@ static void bch2_mark_pointer(struct bch_fs *c,
}
}
static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c_extent e,
s64 sectors, bool metadata,
struct bch_fs_usage *stats,
u64 journal_seq, unsigned flags)
{
const struct bch_extent_ptr *ptr;
const union bch_extent_crc *crc;
enum s_alloc type = metadata ? S_META : S_DIRTY;
unsigned replicas = 0;
BUG_ON(metadata && bkey_extent_is_cached(e.k));
BUG_ON(!sectors);
extent_for_each_ptr_crc(e, ptr, crc) {
bch2_mark_pointer(c, e, crc, ptr, sectors, type,
stats, journal_seq, flags);
replicas += !ptr->cached;
}
BUG_ON(replicas >= BCH_REPLICAS_MAX);
if (replicas)
stats->s[replicas - 1].data[type] += sectors;
}
void __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
s64 sectors, bool metadata,
struct bch_fs_usage *stats,
u64 journal_seq, unsigned flags)
{
switch (k.k->type) {
case BCH_EXTENT:
case BCH_EXTENT_CACHED:
bch2_mark_extent(c, bkey_s_c_to_extent(k), sectors, metadata,
stats, journal_seq, flags);
break;
case BCH_RESERVATION: {
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
if (r.v->nr_replicas)
stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors;
break;
}
}
}
void bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
s64 sectors, bool metadata, unsigned flags)
{
struct bch_fs_usage stats = { 0 };
__bch2_mark_key(c, k, sectors, metadata, &stats, 0,
flags|BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
preempt_disable();
bch2_usage_add(this_cpu_ptr(c->usage_percpu), &stats);
preempt_enable();
}
void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
s64 sectors, bool metadata, struct gc_pos gc_pos,
struct bch_fs_usage *stats, u64 journal_seq)
s64 sectors, bool metadata,
struct gc_pos pos,
struct bch_fs_usage *stats,
u64 journal_seq, unsigned flags)
{
unsigned flags = gc_will_visit(c, gc_pos)
? BCH_BUCKET_MARK_GC_WILL_VISIT : 0;
/*
* synchronization w.r.t. GC:
*
@ -614,50 +605,87 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
* To know whether we should mark a given reference (GC either isn't
* running, or has already marked references at this position) we
* construct a total order for everything GC walks. Then, we can simply
* compare the position of the reference we're marking - @gc_pos - with
* compare the position of the reference we're marking - @pos - with
* GC's current position. If GC is going to mark this reference, GC's
* current position will be less than @gc_pos; if GC's current position
* is greater than @gc_pos GC has either already walked this position,
* or isn't running.
* current position will be less than @pos; if GC's current position is
* greater than @pos GC has either already walked this position, or
* isn't running.
*
* To avoid racing with GC's position changing, we have to deal with
* - GC's position being set to GC_POS_MIN when GC starts:
* usage_lock guards against this
* - GC's position overtaking @gc_pos: we guard against this with
* - GC's position overtaking @pos: we guard against this with
* whatever lock protects the data structure the reference lives in
* (e.g. the btree node lock, or the relevant allocator lock).
*/
lg_local_lock(&c->usage_lock);
__bch2_mark_key(c, k, sectors, metadata, stats, journal_seq, flags);
bch2_fs_stats_verify(c);
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
gc_will_visit(c, pos))
flags |= BCH_BUCKET_MARK_GC_WILL_VISIT;
switch (k.k->type) {
case BCH_EXTENT:
case BCH_EXTENT_CACHED: {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
enum s_alloc type = metadata ? S_META : S_DIRTY;
unsigned replicas = 0;
BUG_ON(metadata && bkey_extent_is_cached(e.k));
BUG_ON(!sectors);
extent_for_each_ptr_crc(e, ptr, crc) {
bch2_mark_pointer(c, e, ptr, crc, sectors, type,
stats, journal_seq, flags);
replicas += !ptr->cached;
}
BUG_ON(replicas >= BCH_REPLICAS_MAX);
if (replicas)
stats->s[replicas - 1].data[type] += sectors;
break;
}
case BCH_RESERVATION: {
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
if (r.v->nr_replicas)
stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors;
break;
}
}
lg_local_unlock(&c->usage_lock);
}
/* Disk reservations: */
static u64 __recalc_sectors_available(struct bch_fs *c)
{
return c->capacity - bch2_fs_sectors_used(c);
u64 avail;
int cpu;
for_each_possible_cpu(cpu)
per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;
avail = c->capacity - bch2_fs_sectors_used(c);
avail <<= RESERVE_FACTOR;
avail /= (1 << RESERVE_FACTOR) + 1;
return avail;
}
/* Used by gc when it's starting: */
void bch2_recalc_sectors_available(struct bch_fs *c)
{
int cpu;
lg_global_lock(&c->usage_lock);
for_each_possible_cpu(cpu)
per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;
atomic64_set(&c->sectors_available,
__recalc_sectors_available(c));
atomic64_set(&c->sectors_available, __recalc_sectors_available(c));
lg_global_unlock(&c->usage_lock);
}
void bch2_disk_reservation_put(struct bch_fs *c,
struct disk_reservation *res)
void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
{
if (res->sectors) {
lg_local_lock(&c->usage_lock);
this_cpu_sub(c->usage_percpu->online_reserved,
res->sectors);
@ -667,16 +695,14 @@ void bch2_disk_reservation_put(struct bch_fs *c,
res->sectors = 0;
}
}
#define SECTORS_CACHE 1024
int bch2_disk_reservation_add(struct bch_fs *c,
struct disk_reservation *res,
int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
unsigned sectors, int flags)
{
struct bch_fs_usage *stats;
u64 old, new, v;
u64 old, v, get;
s64 sectors_available;
int ret;
@ -685,27 +711,29 @@ int bch2_disk_reservation_add(struct bch_fs *c,
lg_local_lock(&c->usage_lock);
stats = this_cpu_ptr(c->usage_percpu);
if (sectors >= stats->available_cache)
if (sectors <= stats->available_cache)
goto out;
v = atomic64_read(&c->sectors_available);
do {
old = v;
if (old < sectors) {
get = min((u64) sectors + SECTORS_CACHE, old);
if (get < sectors) {
lg_local_unlock(&c->usage_lock);
goto recalculate;
}
new = max_t(s64, 0, old - sectors - SECTORS_CACHE);
} while ((v = atomic64_cmpxchg(&c->sectors_available,
old, new)) != old);
old, old - get)) != old);
stats->available_cache += get;
stats->available_cache += old - new;
out:
stats->available_cache -= sectors;
stats->online_reserved += sectors;
res->sectors += sectors;
bch2_disk_reservations_verify(c, flags);
bch2_fs_stats_verify(c);
lg_local_unlock(&c->usage_lock);
return 0;
@ -738,6 +766,8 @@ recalculate:
stats->online_reserved += sectors;
res->sectors += sectors;
ret = 0;
bch2_disk_reservations_verify(c, flags);
} else {
atomic64_set(&c->sectors_available, sectors_available);
ret = -ENOSPC;
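
The reworked bch2_disk_reservation_add() keeps a per-cpu available_cache so small reservations rarely touch the shared sectors_available atomic: on a cache miss it grabs the request plus SECTORS_CACHE sectors in one cmpxchg and banks the surplus locally. A single-threaded sketch of that fast path follows, with plain globals standing in for the per-cpu and atomic counters; with the numbers below, the first 100-sector reservation leaves 1024 sectors cached and the second is served entirely from the cache.

/* Userspace sketch of the per-cpu reservation cache fast path. */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;

#define SECTORS_CACHE   1024

static u64 sectors_available = 10000;   /* stands in for c->sectors_available */
static u64 available_cache;             /* stands in for this cpu's available_cache */

static int reserve(unsigned sectors)
{
        u64 get;

        if (sectors <= available_cache)
                goto out;

        /* refill: take the request plus SECTORS_CACHE from the global pool */
        get = sectors + SECTORS_CACHE;
        if (get > sectors_available)
                get = sectors_available;        /* take whatever is left */
        if (get < sectors)
                return -1;                      /* slow path: recalculate from usage */

        sectors_available -= get;
        available_cache += get;
out:
        available_cache -= sectors;
        return 0;
}

int main(void)
{
        reserve(100);
        printf("global %llu, per-cpu cache %llu\n",
               (unsigned long long) sectors_available,
               (unsigned long long) available_cache);

        reserve(100);
        printf("global %llu, per-cpu cache %llu\n",
               (unsigned long long) sectors_available,
               (unsigned long long) available_cache);
        return 0;
}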

View File

@ -95,24 +95,26 @@ static inline bool bucket_unused(struct bucket_mark mark)
/* Per device stats: */
struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *);
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
static inline u64 __dev_buckets_available(struct bch_dev *ca,
struct bch_dev_usage stats)
{
return max_t(s64, 0,
ca->mi.nbuckets - ca->mi.first_bucket -
stats.buckets[S_META] -
stats.buckets[S_DIRTY] -
stats.buckets_alloc);
u64 total = ca->mi.nbuckets - ca->mi.first_bucket;
if (WARN_ONCE(stats.buckets_unavailable > total,
"buckets_unavailable overflow\n"))
return 0;
return total - stats.buckets_unavailable;
}
/*
* Number of reclaimable buckets - only for use by the allocator thread:
*/
static inline u64 dev_buckets_available(struct bch_dev *ca)
static inline u64 dev_buckets_available(struct bch_fs *c, struct bch_dev *ca)
{
return __dev_buckets_available(ca, bch2_dev_usage_read(ca));
return __dev_buckets_available(ca, bch2_dev_usage_read(c, ca));
}
static inline u64 __dev_buckets_free(struct bch_dev *ca,
@ -123,9 +125,9 @@ static inline u64 __dev_buckets_free(struct bch_dev *ca,
fifo_used(&ca->free_inc);
}
static inline u64 dev_buckets_free(struct bch_dev *ca)
static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
{
return __dev_buckets_free(ca, bch2_dev_usage_read(ca));
return __dev_buckets_free(ca, bch2_dev_usage_read(c, ca));
}
/* Cache set stats: */
@ -155,11 +157,18 @@ static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats)
return sum;
}
#define RESERVE_FACTOR 6
static u64 reserve_factor(u64 r)
{
return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
}
static inline u64 __bch2_fs_sectors_used(struct bch_fs *c)
{
struct fs_usage_sum sum = __fs_usage_sum(__bch2_fs_usage_read(c));
return sum.data + sum.reserved + (sum.reserved >> 7);
return sum.data + reserve_factor(sum.reserved);
}
static inline u64 bch2_fs_sectors_used(struct bch_fs *c)
@ -184,30 +193,35 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
void bch2_bucket_seq_cleanup(struct bch_fs *);
bool bch2_invalidate_bucket(struct bch_dev *, struct bucket *,
struct bucket_mark *);
bool bch2_mark_alloc_bucket_startup(struct bch_dev *, struct bucket *);
void bch2_mark_free_bucket(struct bch_dev *, struct bucket *);
void bch2_mark_alloc_bucket(struct bch_dev *, struct bucket *, bool);
void bch2_mark_metadata_bucket(struct bch_dev *, struct bucket *,
enum bucket_data_type, bool);
bool bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *,
struct bucket *, struct bucket_mark *);
bool bch2_mark_alloc_bucket_startup(struct bch_fs *, struct bch_dev *,
struct bucket *);
void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *,
struct bucket *, bool,
struct gc_pos, unsigned);
void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
struct bucket *, enum bucket_data_type,
struct gc_pos, unsigned);
#define BCH_BUCKET_MARK_NOATOMIC (1 << 0)
#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 1)
#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 2)
#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 1)
#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 2)
#define BCH_BUCKET_MARK_GC_LOCK_HELD (1 << 3)
void __bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool,
void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, struct gc_pos,
struct bch_fs_usage *, u64, unsigned);
void bch2_gc_mark_key(struct bch_fs *, struct bkey_s_c,
s64, bool, unsigned);
void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool,
struct gc_pos, struct bch_fs_usage *, u64);
void bch2_recalc_sectors_available(struct bch_fs *);
void bch2_disk_reservation_put(struct bch_fs *,
struct disk_reservation *);
void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
static inline void bch2_disk_reservation_put(struct bch_fs *c,
struct disk_reservation *res)
{
if (res->sectors)
__bch2_disk_reservation_put(c, res);
}
#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
#define BCH_DISK_RESERVATION_METADATA (1 << 1)

@ -59,6 +59,7 @@ struct bch_dev_usage {
u64 buckets[S_ALLOC_NR];
u64 buckets_cached;
u64 buckets_alloc;
u64 buckets_unavailable;
/* _compressed_ sectors: */
u64 sectors[S_ALLOC_NR];
@ -79,13 +80,6 @@ struct bch_fs_usage {
u64 available_cache;
};
struct bucket_heap_entry {
size_t bucket;
struct bucket_mark mark;
};
typedef HEAP(struct bucket_heap_entry) bucket_heap;
/*
* A reservation for space on disk:
*/
@ -95,4 +89,11 @@ struct disk_reservation {
unsigned nr_replicas;
};
struct copygc_heap_entry {
u64 offset;
struct bucket_mark mark;
};
typedef HEAP(struct copygc_heap_entry) copygc_heap;
#endif /* _BUCKETS_TYPES_H */

@ -141,10 +141,14 @@ static u64 bch2_checksum_init(unsigned type)
switch (type) {
case BCH_CSUM_NONE:
return 0;
case BCH_CSUM_CRC32C:
case BCH_CSUM_CRC32C_NONZERO:
return U32_MAX;
case BCH_CSUM_CRC64:
case BCH_CSUM_CRC64_NONZERO:
return U64_MAX;
case BCH_CSUM_CRC32C:
return 0;
case BCH_CSUM_CRC64:
return 0;
default:
BUG();
}
@ -155,10 +159,14 @@ static u64 bch2_checksum_final(unsigned type, u64 crc)
switch (type) {
case BCH_CSUM_NONE:
return 0;
case BCH_CSUM_CRC32C:
case BCH_CSUM_CRC32C_NONZERO:
return crc ^ U32_MAX;
case BCH_CSUM_CRC64:
case BCH_CSUM_CRC64_NONZERO:
return crc ^ U64_MAX;
case BCH_CSUM_CRC32C:
return crc;
case BCH_CSUM_CRC64:
return crc;
default:
BUG();
}
@ -169,8 +177,10 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t
switch (type) {
case BCH_CSUM_NONE:
return 0;
case BCH_CSUM_CRC32C_NONZERO:
case BCH_CSUM_CRC32C:
return crc32c(crc, data, len);
case BCH_CSUM_CRC64_NONZERO:
case BCH_CSUM_CRC64:
return bch2_crc64_update(crc, data, len);
default:
@ -243,6 +253,8 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
{
switch (type) {
case BCH_CSUM_NONE:
case BCH_CSUM_CRC32C_NONZERO:
case BCH_CSUM_CRC64_NONZERO:
case BCH_CSUM_CRC32C:
case BCH_CSUM_CRC64: {
u64 crc = bch2_checksum_init(type);
@ -250,7 +262,7 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
crc = bch2_checksum_update(type, crc, data, len);
crc = bch2_checksum_final(type, crc);
return (struct bch_csum) { .lo = crc };
return (struct bch_csum) { .lo = cpu_to_le64(crc) };
}
case BCH_CSUM_CHACHA20_POLY1305_80:
@ -281,28 +293,36 @@ void bch2_encrypt(struct bch_fs *c, unsigned type,
do_encrypt(c->chacha20, nonce, data, len);
}
struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
struct nonce nonce, struct bio *bio)
static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
struct nonce nonce, struct bio *bio,
struct bvec_iter *iter)
{
struct bio_vec bv;
struct bvec_iter iter;
switch (type) {
case BCH_CSUM_NONE:
return (struct bch_csum) { 0 };
case BCH_CSUM_CRC32C_NONZERO:
case BCH_CSUM_CRC64_NONZERO:
case BCH_CSUM_CRC32C:
case BCH_CSUM_CRC64: {
u64 crc = bch2_checksum_init(type);
bio_for_each_contig_segment(bv, bio, iter) {
#ifdef CONFIG_HIGHMEM
__bio_for_each_segment(bv, bio, *iter, *iter) {
void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
crc = bch2_checksum_update(type,
crc, p, bv.bv_len);
kunmap_atomic(p);
}
#else
__bio_for_each_contig_segment(bv, bio, *iter, *iter)
crc = bch2_checksum_update(type, crc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
#endif
crc = bch2_checksum_final(type, crc);
return (struct bch_csum) { .lo = crc };
return (struct bch_csum) { .lo = cpu_to_le64(crc) };
}
case BCH_CSUM_CHACHA20_POLY1305_80:
@ -313,13 +333,19 @@ struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
gen_poly_key(c, desc, nonce);
bio_for_each_contig_segment(bv, bio, iter) {
#ifdef CONFIG_HIGHMEM
__bio_for_each_segment(bv, bio, *iter, *iter) {
void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
crypto_shash_update(desc, p, bv.bv_len);
kunmap_atomic(p);
}
#else
__bio_for_each_contig_segment(bv, bio, *iter, *iter)
crypto_shash_update(desc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
#endif
crypto_shash_final(desc, digest);
memcpy(&ret, digest, bch_crc_bytes[type]);
@ -330,6 +356,14 @@ struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
}
}
struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
struct nonce nonce, struct bio *bio)
{
struct bvec_iter iter = bio->bi_iter;
return __bch2_checksum_bio(c, type, nonce, bio, &iter);
}
void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
struct nonce nonce, struct bio *bio)
{
@ -343,12 +377,12 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
sg_init_table(sgl, ARRAY_SIZE(sgl));
bio_for_each_contig_segment(bv, bio, iter) {
bio_for_each_segment(bv, bio, iter) {
if (sg == sgl + ARRAY_SIZE(sgl)) {
sg_mark_end(sg - 1);
do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
le32_add_cpu(nonce.d, bytes / CHACHA20_BLOCK_SIZE);
nonce = nonce_add(nonce, bytes);
bytes = 0;
sg_init_table(sgl, ARRAY_SIZE(sgl));
@ -357,13 +391,115 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset);
bytes += bv.bv_len;
}
sg_mark_end(sg - 1);
do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
}
static inline bool bch2_checksum_mergeable(unsigned type)
{
switch (type) {
case BCH_CSUM_NONE:
case BCH_CSUM_CRC32C:
case BCH_CSUM_CRC64:
return true;
default:
return false;
}
}
static struct bch_csum bch2_checksum_merge(unsigned type,
struct bch_csum a,
struct bch_csum b, size_t b_len)
{
BUG_ON(!bch2_checksum_mergeable(type));
while (b_len) {
unsigned b = min(b_len, PAGE_SIZE);
a.lo = bch2_checksum_update(type, a.lo,
page_address(ZERO_PAGE(0)), b);
b_len -= b;
}
a.lo ^= b.lo;
a.hi ^= b.hi;
return a;
}
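
Why this merge is valid (editorial sketch, not from the patch): a CRC with zero initial state and no final inversion — which is what the plain CRC32C/CRC64 variants above now provide, as opposed to the _NONZERO ones — is linear over GF(2), so crc(A || B) equals crc(A) extended over len(B) zero bytes, XORed with crc(B). The bit-at-a-time CRC32C below is only a stand-in used to demonstrate the identity:

#include <assert.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* plain CRC32C: zero init, no final xor (stand-in for bch2_checksum_update) */
static uint32_t crc32c_update(uint32_t crc, const void *data, size_t len)
{
	const uint8_t *p = data;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82f63b78 & -(crc & 1));
	}
	return crc;
}

int main(void)
{
	const char a[] = "hello ", b[] = "world";
	uint8_t zeros[sizeof(b) - 1] = { 0 };
	char ab[sizeof(a) + sizeof(b) - 2];

	memcpy(ab, a, sizeof(a) - 1);
	memcpy(ab + sizeof(a) - 1, b, sizeof(b) - 1);

	uint32_t crc_a  = crc32c_update(0, a, sizeof(a) - 1);
	uint32_t crc_b  = crc32c_update(0, b, sizeof(b) - 1);
	uint32_t crc_ab = crc32c_update(0, ab, sizeof(ab));

	/* extend a's crc over len(b) zero bytes, then xor in b's crc: */
	uint32_t merged = crc32c_update(crc_a, zeros, sizeof(zeros)) ^ crc_b;

	assert(merged == crc_ab);
	return 0;
}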
int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
struct bversion version,
struct bch_extent_crc_unpacked crc_old,
struct bch_extent_crc_unpacked *crc_a,
struct bch_extent_crc_unpacked *crc_b,
unsigned len_a, unsigned len_b,
unsigned new_csum_type)
{
struct bvec_iter iter = bio->bi_iter;
struct nonce nonce = extent_nonce(version, crc_old);
struct bch_csum merged = { 0 };
struct crc_split {
struct bch_extent_crc_unpacked *crc;
unsigned len;
unsigned csum_type;
struct bch_csum csum;
} splits[3] = {
{ crc_a, len_a, new_csum_type },
{ crc_b, len_b, new_csum_type },
{ NULL, bio_sectors(bio) - len_a - len_b, new_csum_type },
}, *i;
bool mergeable = crc_old.csum_type == new_csum_type &&
bch2_checksum_mergeable(new_csum_type);
unsigned crc_nonce = crc_old.nonce;
BUG_ON(len_a + len_b > bio_sectors(bio));
BUG_ON(crc_old.uncompressed_size != bio_sectors(bio));
BUG_ON(crc_old.compression_type);
BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) !=
bch2_csum_type_is_encryption(new_csum_type));
for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
iter.bi_size = i->len << 9;
if (mergeable || i->crc)
i->csum = __bch2_checksum_bio(c, i->csum_type,
nonce, bio, &iter);
else
bio_advance_iter(bio, &iter, i->len << 9);
nonce = nonce_add(nonce, i->len << 9);
}
if (mergeable)
for (i = splits; i < splits + ARRAY_SIZE(splits); i++)
merged = bch2_checksum_merge(new_csum_type, merged,
i->csum, i->len << 9);
else
merged = bch2_checksum_bio(c, crc_old.csum_type,
extent_nonce(version, crc_old), bio);
if (bch2_crc_cmp(merged, crc_old.csum))
return -EIO;
for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
if (i->crc)
*i->crc = (struct bch_extent_crc_unpacked) {
.csum_type = i->csum_type,
.compressed_size = i->len,
.uncompressed_size = i->len,
.offset = 0,
.live_size = i->len,
.nonce = crc_nonce,
.csum = i->csum,
};
if (bch2_csum_type_is_encryption(new_csum_type))
crc_nonce += i->len;
}
return 0;
}
#ifdef __KERNEL__
int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
{

@ -2,6 +2,7 @@
#define _BCACHEFS_CHECKSUM_H
#include "bcachefs.h"
#include "extents_types.h"
#include "super-io.h"
#include <crypto/chacha20.h>
@ -37,6 +38,13 @@ void bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned,
struct nonce, struct bio *);
int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion,
struct bch_extent_crc_unpacked,
struct bch_extent_crc_unpacked *,
struct bch_extent_crc_unpacked *,
unsigned, unsigned, unsigned);
void bch2_encrypt_bio(struct bch_fs *, unsigned,
struct nonce, struct bio *);
@ -49,15 +57,16 @@ int bch2_enable_encryption(struct bch_fs *, bool);
void bch2_fs_encryption_exit(struct bch_fs *);
int bch2_fs_encryption_init(struct bch_fs *);
static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type)
static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
bool data)
{
switch (type) {
case BCH_CSUM_OPT_NONE:
return BCH_CSUM_NONE;
case BCH_CSUM_OPT_CRC32C:
return BCH_CSUM_CRC32C;
return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO;
case BCH_CSUM_OPT_CRC64:
return BCH_CSUM_CRC64;
return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO;
default:
BUG();
}
@ -70,7 +79,7 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c)
? BCH_CSUM_CHACHA20_POLY1305_128
: BCH_CSUM_CHACHA20_POLY1305_80;
return bch2_csum_opt_to_type(c->opts.data_checksum);
return bch2_csum_opt_to_type(c->opts.data_checksum, true);
}
static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
@ -78,7 +87,7 @@ static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
if (c->sb.encryption_type)
return BCH_CSUM_CHACHA20_POLY1305_128;
return bch2_csum_opt_to_type(c->opts.metadata_checksum);
return bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
}
static inline enum bch_compression_type
@ -134,6 +143,21 @@ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
return nonce;
}
static inline struct nonce extent_nonce(struct bversion version,
struct bch_extent_crc_unpacked crc)
{
unsigned size = crc.compression_type ? crc.uncompressed_size : 0;
struct nonce nonce = (struct nonce) {{
[0] = cpu_to_le32(size << 22),
[1] = cpu_to_le32(version.lo),
[2] = cpu_to_le32(version.lo >> 32),
[3] = cpu_to_le32(version.hi|
(crc.compression_type << 24))^BCH_NONCE_EXTENT,
}};
return nonce_add(nonce, crc.nonce << 9);
}
static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key)
{
return le64_to_cpu(key->magic) != BCH_KEY_MAGIC;

@ -1,4 +1,5 @@
#include "bcachefs.h"
#include "checksum.h"
#include "compress.h"
#include "extents.h"
#include "io.h"
@ -145,11 +146,11 @@ static inline void zlib_set_workspace(z_stream *strm, void *workspace)
}
static int __bio_uncompress(struct bch_fs *c, struct bio *src,
void *dst_data, struct bch_extent_crc128 crc)
void *dst_data, struct bch_extent_crc_unpacked crc)
{
struct bbuf src_data = { NULL };
size_t src_len = src->bi_iter.bi_size;
size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
size_t dst_len = crc.uncompressed_size << 9;
int ret;
src_data = bio_map_or_bounce(c, src, READ);
@ -212,65 +213,58 @@ err:
}
int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
unsigned live_data_sectors,
struct bch_extent_crc128 crc)
struct bch_extent_crc_unpacked *crc)
{
struct bbuf dst_data = { NULL };
size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
int ret = -ENOMEM;
struct bbuf data = { NULL };
size_t dst_len = crc->uncompressed_size << 9;
BUG_ON(DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS) > bio->bi_max_vecs);
/* bio must own its pages: */
BUG_ON(!bio->bi_vcnt);
BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
if (crc_uncompressed_size(NULL, &crc) > c->sb.encoded_extent_max ||
crc_compressed_size(NULL, &crc) > c->sb.encoded_extent_max)
if (crc->uncompressed_size > c->sb.encoded_extent_max ||
crc->compressed_size > c->sb.encoded_extent_max) {
bch_err(c, "error rewriting existing data: extent too big");
return -EIO;
dst_data = __bounce_alloc(c, dst_len, WRITE);
ret = __bio_uncompress(c, bio, dst_data.b, crc);
if (ret)
goto err;
while (bio->bi_vcnt < DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS)) {
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
bv->bv_page = alloc_page(GFP_NOIO);
if (!bv->bv_page)
goto use_mempool;
bv->bv_len = PAGE_SIZE;
bv->bv_offset = 0;
bio->bi_vcnt++;
}
bio->bi_iter.bi_size = live_data_sectors << 9;
copy_data:
memcpy_to_bio(bio, bio->bi_iter, dst_data.b + (crc.offset << 9));
err:
bio_unmap_or_unbounce(c, dst_data);
return ret;
use_mempool:
/*
* We already allocated from mempool, we can't allocate from it again
* without freeing the pages we already allocated or else we could
* deadlock:
*/
data = __bounce_alloc(c, dst_len, WRITE);
bch2_bio_free_pages_pool(c, bio);
bch2_bio_alloc_pages_pool(c, bio, live_data_sectors << 9);
goto copy_data;
if (__bio_uncompress(c, bio, data.b, *crc)) {
bch_err(c, "error rewriting existing data: decompression error");
bio_unmap_or_unbounce(c, data);
return -EIO;
}
/*
* might have to free existing pages and retry allocation from mempool -
* do this _after_ decompressing:
*/
bch2_bio_alloc_more_pages_pool(c, bio, crc->live_size << 9);
memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9));
crc->csum_type = 0;
crc->compression_type = 0;
crc->compressed_size = crc->live_size;
crc->uncompressed_size = crc->live_size;
crc->offset = 0;
crc->csum = (struct bch_csum) { 0, 0 };
bio_unmap_or_unbounce(c, data);
return 0;
}
int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
struct bio *dst, struct bvec_iter dst_iter,
struct bch_extent_crc128 crc)
struct bch_extent_crc_unpacked crc)
{
struct bbuf dst_data = { NULL };
size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
size_t dst_len = crc.uncompressed_size << 9;
int ret = -ENOMEM;
if (crc_uncompressed_size(NULL, &crc) > c->sb.encoded_extent_max ||
crc_compressed_size(NULL, &crc) > c->sb.encoded_extent_max)
if (crc.uncompressed_size > c->sb.encoded_extent_max ||
crc.compressed_size > c->sb.encoded_extent_max)
return -EIO;
dst_data = dst_len == dst_iter.bi_size
@ -288,21 +282,25 @@ err:
return ret;
}
static int __bio_compress(struct bch_fs *c,
static unsigned __bio_compress(struct bch_fs *c,
struct bio *dst, size_t *dst_len,
struct bio *src, size_t *src_len,
unsigned *compression_type)
unsigned compression_type)
{
struct bbuf src_data = { NULL }, dst_data = { NULL };
unsigned pad;
int ret = 0;
/* If it's only one block, don't bother trying to compress: */
if (bio_sectors(src) <= c->opts.block_size)
goto err;
dst_data = bio_map_or_bounce(c, dst, WRITE);
src_data = bio_map_or_bounce(c, src, READ);
switch (*compression_type) {
switch (compression_type) {
case BCH_COMPRESSION_LZ4_OLD:
*compression_type = BCH_COMPRESSION_LZ4;
compression_type = BCH_COMPRESSION_LZ4;
case BCH_COMPRESSION_LZ4: {
void *workspace;
@ -403,19 +401,24 @@ zlib_err:
if (dst_data.type != BB_NONE)
memcpy_to_bio(dst, dst->bi_iter, dst_data.b);
BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
BUG_ON(*dst_len & (block_bytes(c) - 1));
BUG_ON(*src_len & (block_bytes(c) - 1));
out:
bio_unmap_or_unbounce(c, src_data);
bio_unmap_or_unbounce(c, dst_data);
return ret;
return compression_type;
err:
ret = -1;
compression_type = 0;
goto out;
}
void bch2_bio_compress(struct bch_fs *c,
unsigned bch2_bio_compress(struct bch_fs *c,
struct bio *dst, size_t *dst_len,
struct bio *src, size_t *src_len,
unsigned *compression_type)
unsigned compression_type)
{
unsigned orig_dst = dst->bi_iter.bi_size;
unsigned orig_src = src->bi_iter.bi_size;
@ -423,29 +426,15 @@ void bch2_bio_compress(struct bch_fs *c,
/* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size,
c->sb.encoded_extent_max << 9);
/* Don't generate a bigger output than input: */
dst->bi_iter.bi_size =
min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
/* If it's only one block, don't bother trying to compress: */
if (*compression_type != BCH_COMPRESSION_NONE &&
bio_sectors(src) > c->opts.block_size &&
!__bio_compress(c, dst, dst_len, src, src_len, compression_type))
goto out;
compression_type =
__bio_compress(c, dst, dst_len, src, src_len, compression_type);
/* If compressing failed (didn't get smaller), just copy: */
*compression_type = BCH_COMPRESSION_NONE;
*dst_len = *src_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
bio_copy_data(dst, src);
out:
dst->bi_iter.bi_size = orig_dst;
src->bi_iter.bi_size = orig_src;
BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
BUG_ON(*dst_len & (block_bytes(c) - 1));
BUG_ON(*src_len & (block_bytes(c) - 1));
return compression_type;
}
/* doesn't write superblock: */

@ -1,12 +1,14 @@
#ifndef _BCACHEFS_COMPRESS_H
#define _BCACHEFS_COMPRESS_H
#include "extents_types.h"
int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
unsigned, struct bch_extent_crc128);
struct bch_extent_crc_unpacked *);
int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
struct bvec_iter, struct bch_extent_crc128);
void bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
struct bio *, size_t *, unsigned *);
struct bvec_iter, struct bch_extent_crc_unpacked);
unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
struct bio *, size_t *, unsigned);
int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
void bch2_fs_compress_exit(struct bch_fs *);

@ -19,6 +19,7 @@
#include "inode.h"
#include "journal.h"
#include "super-io.h"
#include "util.h"
#include "xattr.h"
#include <trace/events/bcachefs.h>
@ -155,6 +156,44 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
return nr_ptrs;
}
unsigned bch2_extent_is_compressed(struct bkey_s_c k)
{
struct bkey_s_c_extent e;
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
unsigned ret = 0;
switch (k.k->type) {
case BCH_EXTENT:
case BCH_EXTENT_CACHED:
e = bkey_s_c_to_extent(k);
extent_for_each_ptr_crc(e, ptr, crc)
if (!ptr->cached &&
crc.compression_type != BCH_COMPRESSION_NONE &&
crc.compressed_size < crc.live_size)
ret = max_t(unsigned, ret, crc.compressed_size);
}
return ret;
}
bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e,
struct bch_extent_ptr m, u64 offset)
{
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
extent_for_each_ptr_crc(e, ptr, crc)
if (ptr->dev == m.dev &&
ptr->gen == m.gen &&
(s64) ptr->offset + crc.offset - bkey_start_offset(e.k) ==
(s64) m.offset - offset)
return ptr;
return NULL;
}
/* Doesn't clean up redundant crcs */
void __bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
{
@ -186,24 +225,30 @@ found:
bch2_extent_drop_ptr(e, ptr);
}
/* returns true if equal */
static bool crc_cmp(union bch_extent_crc *l, union bch_extent_crc *r)
static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
struct bch_extent_crc_unpacked n)
{
return extent_crc_type(l) == extent_crc_type(r) &&
!memcmp(l, r, extent_entry_bytes(to_entry(l)));
return !u.compression_type &&
u.csum_type &&
u.uncompressed_size > u.live_size &&
bch2_csum_type_is_encryption(u.csum_type) ==
bch2_csum_type_is_encryption(n.csum_type);
}
/* Increment pointers after @crc by crc's offset until the next crc entry: */
void bch2_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_crc *crc)
bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e,
struct bch_extent_crc_unpacked n)
{
union bch_extent_entry *entry;
struct bch_extent_crc_unpacked crc;
const union bch_extent_entry *i;
extent_for_each_entry_from(e, entry, extent_entry_next(to_entry(crc))) {
if (!extent_entry_is_ptr(entry))
return;
if (!n.csum_type)
return false;
entry->ptr.offset += crc_offset(crc);
}
extent_for_each_crc(e, crc, i)
if (can_narrow_crc(crc, n))
return true;
return false;
}
/*
@ -214,96 +259,50 @@ void bch2_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_cr
* not compressed, we can modify them to point to only the data that is
* currently live (so that readers won't have to bounce) while we've got the
* checksum we need:
*
* XXX: to guard against data being corrupted while in memory, instead of
* recomputing the checksum here it would be better to have the read path,
* rather than computing the checksum of the entire extent:
*
* | extent |
*
* compute the checksums of the live and dead data separately
* | dead data || live data || dead data |
*
* and then verify that crc_dead1 + crc_live + crc_dead2 == orig_crc, and then
* use crc_live here (that we verified was correct earlier)
*
* note: doesn't work with encryption
*/
void bch2_extent_narrow_crcs(struct bkey_s_extent e)
bool bch2_extent_narrow_crcs(struct bkey_i_extent *e,
struct bch_extent_crc_unpacked n)
{
union bch_extent_crc *crc;
bool have_wide = false, have_narrow = false;
struct bch_csum csum = { 0 };
unsigned csum_type = 0;
struct bch_extent_crc_unpacked u;
struct bch_extent_ptr *ptr;
union bch_extent_entry *i;
extent_for_each_crc(e, crc) {
if (crc_compression_type(crc) ||
bch2_csum_type_is_encryption(crc_csum_type(crc)))
continue;
if (crc_uncompressed_size(e.k, crc) != e.k->size) {
have_wide = true;
} else {
have_narrow = true;
csum = crc_csum(crc);
csum_type = crc_csum_type(crc);
}
}
if (!have_wide || !have_narrow)
return;
extent_for_each_crc(e, crc) {
if (crc_compression_type(crc))
continue;
if (crc_uncompressed_size(e.k, crc) != e.k->size) {
switch (extent_crc_type(crc)) {
case BCH_EXTENT_CRC_NONE:
BUG();
case BCH_EXTENT_CRC32:
if (bch_crc_bytes[csum_type] > 4)
continue;
bch2_extent_crc_narrow_pointers(e, crc);
crc->crc32._compressed_size = e.k->size - 1;
crc->crc32._uncompressed_size = e.k->size - 1;
crc->crc32.offset = 0;
crc->crc32.csum_type = csum_type;
crc->crc32.csum = csum.lo;
break;
case BCH_EXTENT_CRC64:
if (bch_crc_bytes[csum_type] > 10)
continue;
bch2_extent_crc_narrow_pointers(e, crc);
crc->crc64._compressed_size = e.k->size - 1;
crc->crc64._uncompressed_size = e.k->size - 1;
crc->crc64.offset = 0;
crc->crc64.csum_type = csum_type;
crc->crc64.csum_lo = csum.lo;
crc->crc64.csum_hi = csum.hi;
break;
case BCH_EXTENT_CRC128:
if (bch_crc_bytes[csum_type] > 16)
continue;
bch2_extent_crc_narrow_pointers(e, crc);
crc->crc128._compressed_size = e.k->size - 1;
crc->crc128._uncompressed_size = e.k->size - 1;
crc->crc128.offset = 0;
crc->crc128.csum_type = csum_type;
crc->crc128.csum = csum;
/* Find a checksum entry that covers only live data: */
if (!n.csum_type)
extent_for_each_crc(extent_i_to_s(e), u, i)
if (!u.compression_type &&
u.csum_type &&
u.live_size == u.uncompressed_size) {
n = u;
break;
}
if (!bch2_can_narrow_extent_crcs(extent_i_to_s_c(e), n))
return false;
BUG_ON(n.compression_type);
BUG_ON(n.offset);
BUG_ON(n.live_size != e->k.size);
bch2_extent_crc_append(e, n);
restart_narrow_pointers:
extent_for_each_ptr_crc(extent_i_to_s(e), ptr, u)
if (can_narrow_crc(u, n)) {
ptr->offset += u.offset;
extent_ptr_append(e, *ptr);
__bch2_extent_drop_ptr(extent_i_to_s(e), ptr);
goto restart_narrow_pointers;
}
}
bch2_extent_drop_redundant_crcs(extent_i_to_s(e));
return true;
}
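
Roughly what narrowing does to the unpacked crc of a split extent, as a self-contained sketch (made-up numbers; the struct mirrors bch_extent_crc_unpacked from extents_types.h further down):

#include <assert.h>
#include <stdint.h>

struct crc_unpacked_example {		/* mirrors bch_extent_crc_unpacked */
	uint8_t  csum_type;
	uint8_t  compression_type;
	uint16_t compressed_size;
	uint16_t uncompressed_size;
	uint16_t offset;
	uint16_t live_size;
};

int main(void)
{
	struct crc_unpacked_example wide = {
		.csum_type	   = 1,		/* some non-none checksum type */
		.compressed_size   = 128,
		.uncompressed_size = 128,
		.offset		   = 32,	/* key was split: live data starts 32 sectors in */
		.live_size	   = 64,
	};

	/* can_narrow_crc(): uncompressed, checksummed, wider than the live region */
	assert(!wide.compression_type &&
	       wide.csum_type &&
	       wide.uncompressed_size > wide.live_size);

	/* the appended narrow entry covers exactly the live sectors: */
	struct crc_unpacked_example narrow = {
		.csum_type	   = wide.csum_type,
		.compressed_size   = wide.live_size,
		.uncompressed_size = wide.live_size,
		.offset		   = 0,
		.live_size	   = wide.live_size,
	};
	assert(narrow.uncompressed_size == narrow.live_size);
	return 0;
}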
void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
{
union bch_extent_entry *entry = e.v->start;
union bch_extent_crc *crc, *prev = NULL;
struct bch_extent_crc_unpacked u, prev_u;
while (entry != extent_entry_last(e)) {
union bch_extent_entry *next = extent_entry_next(entry);
@ -313,6 +312,7 @@ void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
goto next;
crc = entry_to_crc(entry);
u = bch2_extent_crc_unpack(e.k, crc);
if (next == extent_entry_last(e)) {
/* crc entry with no pointers after it: */
@ -324,20 +324,28 @@ void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
goto drop;
}
if (prev && crc_cmp(crc, prev)) {
if (prev && !memcmp(&u, &prev_u, sizeof(u))) {
/* identical to previous crc entry: */
goto drop;
}
if (!prev &&
!crc_csum_type(crc) &&
!crc_compression_type(crc)) {
!u.csum_type &&
!u.compression_type) {
/* null crc entry: */
bch2_extent_crc_narrow_pointers(e, crc);
union bch_extent_entry *e2;
extent_for_each_entry_from(e, e2, extent_entry_next(entry)) {
if (!extent_entry_is_ptr(e2))
break;
e2->ptr.offset += u.offset;
}
goto drop;
}
prev = crc;
prev_u = u;
next:
entry = next;
continue;
@ -453,7 +461,7 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
{
char *out = buf, *end = buf + size;
const union bch_extent_entry *entry;
const union bch_extent_crc *crc;
struct bch_extent_crc_unpacked crc;
const struct bch_extent_ptr *ptr;
struct bch_dev *ca;
bool first = true;
@ -468,13 +476,14 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
case BCH_EXTENT_ENTRY_crc128:
crc = entry_to_crc(entry);
crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
p("crc: c_size %u size %u offset %u csum %u compress %u",
crc_compressed_size(e.k, crc),
crc_uncompressed_size(e.k, crc),
crc_offset(crc), crc_csum_type(crc),
crc_compression_type(crc));
p("crc: c_size %u size %u offset %u nonce %u csum %u compress %u",
crc.compressed_size,
crc.uncompressed_size,
crc.offset, crc.nonce,
crc.csum_type,
crc.compression_type);
break;
case BCH_EXTENT_ENTRY_ptr:
ptr = entry_to_ptr(entry);
@ -499,13 +508,24 @@ out:
return out - buf;
}
static inline bool dev_latency_better(struct bch_dev *dev1,
struct bch_dev *dev2)
{
unsigned l1 = atomic_read(&dev1->latency[READ]);
unsigned l2 = atomic_read(&dev2->latency[READ]);
/* Pick at random, biased in favor of the faster device: */
return bch2_rand_range(l1 + l2) > l1;
}
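
The selection above keeps a candidate device with probability roughly proportional to the other device's latency, i.e. reads land on each device with probability inversely proportional to its measured latency. A userspace sketch (rand() standing in for bch2_rand_range(), which is assumed to return a uniform value in [0, n)):

#include <stdio.h>
#include <stdlib.h>

static unsigned rand_range(unsigned n)
{
	return rand() % n;		/* stand-in for bch2_rand_range() */
}

/* true when the candidate (latency l1) should replace the current pick (latency l2) */
static int dev_latency_better_sketch(unsigned l1, unsigned l2)
{
	return rand_range(l1 + l2) > l1;
}

int main(void)
{
	unsigned l1 = 100, l2 = 300, wins = 0, trials = 100000;

	for (unsigned i = 0; i < trials; i++)
		wins += dev_latency_better_sketch(l1, l2);

	/* expect roughly l2 / (l1 + l2) = 75% */
	printf("faster device preferred in %.1f%% of trials\n",
	       100.0 * wins / trials);
	return 0;
}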
static void extent_pick_read_device(struct bch_fs *c,
struct bkey_s_c_extent e,
struct bch_devs_mask *avoid,
struct extent_pick_ptr *pick)
{
const union bch_extent_crc *crc;
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
extent_for_each_ptr_crc(e, ptr, crc) {
struct bch_dev *ca = c->devs[ptr->dev];
@ -516,12 +536,18 @@ static void extent_pick_read_device(struct bch_fs *c,
if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
continue;
if (avoid && test_bit(ca->dev_idx, avoid->d))
if (avoid) {
if (test_bit(ca->dev_idx, avoid->d))
continue;
if (pick->ca && pick->ca->mi.tier < ca->mi.tier)
continue;
if (pick->ca &&
test_bit(pick->ca->dev_idx, avoid->d))
goto use;
}
if (pick->ca && !dev_latency_better(ca, pick->ca))
continue;
use:
if (!percpu_ref_tryget(&ca->io_ref))
continue;
@ -530,11 +556,9 @@ static void extent_pick_read_device(struct bch_fs *c,
*pick = (struct extent_pick_ptr) {
.ptr = *ptr,
.crc = crc,
.ca = ca,
};
if (e.k->size)
pick->crc = crc_to_128(e.k, crc);
}
}
@ -557,14 +581,17 @@ static const char *bch2_btree_ptr_invalid(const struct bch_fs *c,
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const union bch_extent_entry *entry;
const struct bch_extent_ptr *ptr;
const union bch_extent_crc *crc;
const char *reason;
extent_for_each_entry(e, entry)
extent_for_each_entry(e, entry) {
if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
return "invalid extent entry type";
extent_for_each_ptr_crc(e, ptr, crc) {
if (extent_entry_is_crc(entry))
return "has crc field";
}
extent_for_each_ptr(e, ptr) {
reason = extent_ptr_invalid(c, e, ptr,
c->opts.btree_node_size,
true);
@ -572,9 +599,6 @@ static const char *bch2_btree_ptr_invalid(const struct bch_fs *c,
return reason;
}
if (crc)
return "has crc field";
return NULL;
}
@ -699,28 +723,28 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
__set_bkey_deleted(k.k);
else if (bkey_extent_is_data(k.k)) {
struct bkey_s_extent e = bkey_s_to_extent(k);
struct bch_extent_ptr *ptr;
union bch_extent_crc *crc, *prev_crc = NULL;
union bch_extent_entry *entry;
bool seen_crc = false;
extent_for_each_ptr_crc(e, ptr, crc) {
switch (extent_crc_type(crc)) {
case BCH_EXTENT_CRC_NONE:
ptr->offset += e.k->size - len;
extent_for_each_entry(e, entry) {
switch (extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
if (!seen_crc)
entry->ptr.offset += e.k->size - len;
break;
case BCH_EXTENT_CRC32:
if (prev_crc != crc)
crc->crc32.offset += e.k->size - len;
case BCH_EXTENT_ENTRY_crc32:
entry->crc32.offset += e.k->size - len;
break;
case BCH_EXTENT_CRC64:
if (prev_crc != crc)
crc->crc64.offset += e.k->size - len;
case BCH_EXTENT_ENTRY_crc64:
entry->crc64.offset += e.k->size - len;
break;
case BCH_EXTENT_CRC128:
if (prev_crc != crc)
crc->crc128.offset += e.k->size - len;
case BCH_EXTENT_ENTRY_crc128:
entry->crc128.offset += e.k->size - len;
break;
}
prev_crc = crc;
if (extent_entry_is_crc(entry))
seen_crc = true;
}
}
@ -989,7 +1013,7 @@ static void bch2_add_sectors(struct extent_insert_state *s,
return;
bch2_mark_key(c, k, sectors, false, gc_pos_btree_node(b),
&s->stats, s->trans->journal_res.seq);
&s->stats, s->trans->journal_res.seq, 0);
}
static void bch2_subtract_sectors(struct extent_insert_state *s,
@ -1123,7 +1147,7 @@ static void extent_insert_committed(struct extent_insert_state *s)
if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
bkey_cmp(s->committed, insert->k.p) &&
bkey_extent_is_compressed(bkey_i_to_s_c(insert))) {
bch2_extent_is_compressed(bkey_i_to_s_c(insert))) {
/* XXX: possibly need to increase our reservation? */
bch2_cut_subtract_back(s, s->committed,
bkey_i_to_s(&split.k));
@ -1152,46 +1176,24 @@ done:
s->trans->did_work = true;
}
static enum extent_insert_hook_ret
static enum btree_insert_ret
__extent_insert_advance_pos(struct extent_insert_state *s,
struct bpos next_pos,
struct bkey_s_c k)
{
struct extent_insert_hook *hook = s->trans->hook;
enum extent_insert_hook_ret ret;
#if 0
/*
* Currently disabled for encryption - broken with fcollapse. Will have
* to reenable when versions are exposed for send/receive - versions
* will have to be monotonic then:
*/
if (k.k && k.k->size &&
!bversion_zero(s->insert->k->k.version) &&
bversion_cmp(k.k->version, s->insert->k->k.version) > 0) {
ret = BTREE_HOOK_NO_INSERT;
} else
#endif
enum btree_insert_ret ret;
if (hook)
ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k);
else
ret = BTREE_HOOK_DO_INSERT;
ret = BTREE_INSERT_OK;
EBUG_ON(bkey_deleted(&s->insert->k->k) || !s->insert->k->k.size);
switch (ret) {
case BTREE_HOOK_DO_INSERT:
break;
case BTREE_HOOK_NO_INSERT:
extent_insert_committed(s);
bch2_cut_subtract_front(s, next_pos, bkey_i_to_s(s->insert->k));
bch2_btree_iter_set_pos_same_leaf(s->insert->iter, next_pos);
break;
case BTREE_HOOK_RESTART_TRANS:
return ret;
}
if (ret == BTREE_INSERT_OK)
s->committed = next_pos;
return ret;
}
@ -1199,39 +1201,28 @@ __extent_insert_advance_pos(struct extent_insert_state *s,
* Update iter->pos, marking how much of @insert we've processed, and call hook
* fn:
*/
static enum extent_insert_hook_ret
static enum btree_insert_ret
extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k)
{
struct btree *b = s->insert->iter->nodes[0];
struct bpos next_pos = bpos_min(s->insert->k->k.p,
k.k ? k.k->p : b->key.k.p);
enum btree_insert_ret ret;
if (race_fault())
return BTREE_INSERT_NEED_TRAVERSE;
/* hole? */
if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) {
bool have_uncommitted = bkey_cmp(s->committed,
bkey_start_pos(&s->insert->k->k)) > 0;
switch (__extent_insert_advance_pos(s, bkey_start_pos(k.k),
bkey_s_c_null)) {
case BTREE_HOOK_DO_INSERT:
break;
case BTREE_HOOK_NO_INSERT:
/*
* we had to split @insert and insert the committed
* part - need to bail out and recheck journal
* reservation/btree node before we advance pos past @k:
*/
if (have_uncommitted)
return BTREE_HOOK_NO_INSERT;
break;
case BTREE_HOOK_RESTART_TRANS:
return BTREE_HOOK_RESTART_TRANS;
}
ret = __extent_insert_advance_pos(s, bkey_start_pos(k.k),
bkey_s_c_null);
if (ret != BTREE_INSERT_OK)
return ret;
}
/* avoid redundant calls to hook fn: */
if (!bkey_cmp(s->committed, next_pos))
return BTREE_HOOK_DO_INSERT;
return BTREE_INSERT_OK;
return __extent_insert_advance_pos(s, next_pos, k);
}
@ -1245,7 +1236,7 @@ extent_insert_check_split_compressed(struct extent_insert_state *s,
unsigned sectors;
if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
(sectors = bkey_extent_is_compressed(k))) {
(sectors = bch2_extent_is_compressed(k))) {
int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
if (s->trans->flags & BTREE_INSERT_NOFAIL)
@ -1277,6 +1268,7 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
struct btree_iter *iter = s->insert->iter;
struct btree *b = iter->nodes[0];
struct btree_node_iter *node_iter = &iter->node_iters[0];
enum btree_insert_ret ret;
switch (overlap) {
case BCH_EXTENT_OVERLAP_FRONT:
@ -1322,9 +1314,9 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
k.k->p = orig_pos;
extent_save(b, node_iter, _k, k.k);
if (extent_insert_advance_pos(s, k.s_c) ==
BTREE_HOOK_RESTART_TRANS)
return BTREE_INSERT_NEED_TRAVERSE;
ret = extent_insert_advance_pos(s, k.s_c);
if (ret != BTREE_INSERT_OK)
return ret;
extent_insert_committed(s);
/*
@ -1420,15 +1412,9 @@ bch2_delete_fixup_extent(struct extent_insert_state *s)
if (ret != BTREE_INSERT_OK)
goto stop;
switch (extent_insert_advance_pos(s, k.s_c)) {
case BTREE_HOOK_DO_INSERT:
break;
case BTREE_HOOK_NO_INSERT:
continue;
case BTREE_HOOK_RESTART_TRANS:
ret = BTREE_INSERT_NEED_TRAVERSE;
ret = extent_insert_advance_pos(s, k.s_c);
if (ret)
goto stop;
}
s->do_journal = true;
@ -1469,10 +1455,9 @@ next:
bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
}
if (bkey_cmp(s->committed, insert->k.p) < 0 &&
ret == BTREE_INSERT_OK &&
extent_insert_advance_pos(s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS)
ret = BTREE_INSERT_NEED_TRAVERSE;
if (ret == BTREE_INSERT_OK &&
bkey_cmp(s->committed, insert->k.p) < 0)
ret = extent_insert_advance_pos(s, bkey_s_c_null);
stop:
extent_insert_committed(s);
@ -1594,18 +1579,10 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
/*
* Only call advance pos & call hook for nonzero size extents:
* If hook returned BTREE_HOOK_NO_INSERT, @insert->k no longer
* overlaps with @k:
*/
switch (extent_insert_advance_pos(&s, k.s_c)) {
case BTREE_HOOK_DO_INSERT:
break;
case BTREE_HOOK_NO_INSERT:
continue;
case BTREE_HOOK_RESTART_TRANS:
ret = BTREE_INSERT_NEED_TRAVERSE;
ret = extent_insert_advance_pos(&s, k.s_c);
if (ret != BTREE_INSERT_OK)
goto stop;
}
if (k.k->size &&
(k.k->needs_whiteout || bset_written(b, bset(b, t))))
@ -1623,10 +1600,9 @@ squash:
goto stop;
}
if (bkey_cmp(s.committed, insert->k->k.p) < 0 &&
ret == BTREE_INSERT_OK &&
extent_insert_advance_pos(&s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS)
ret = BTREE_INSERT_NEED_TRAVERSE;
if (ret == BTREE_INSERT_OK &&
bkey_cmp(s.committed, insert->k->k.p) < 0)
ret = extent_insert_advance_pos(&s, bkey_s_c_null);
stop:
extent_insert_committed(&s);
/*
@ -1669,29 +1645,37 @@ static const char *bch2_extent_invalid(const struct bch_fs *c,
case BCH_EXTENT_CACHED: {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const union bch_extent_entry *entry;
const union bch_extent_crc *crc;
struct bch_extent_crc_unpacked crc;
const struct bch_extent_ptr *ptr;
unsigned size_ondisk = e.k->size;
const char *reason;
unsigned nonce = UINT_MAX;
extent_for_each_entry(e, entry) {
if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
return "invalid extent entry type";
if (extent_entry_is_crc(entry)) {
crc = entry_to_crc(entry);
crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
if (crc_offset(crc) + e.k->size >
crc_uncompressed_size(e.k, crc))
if (crc.offset + e.k->size >
crc.uncompressed_size)
return "checksum offset + key size > uncompressed size";
size_ondisk = crc_compressed_size(e.k, crc);
size_ondisk = crc.compressed_size;
if (!bch2_checksum_type_valid(c, crc_csum_type(crc)))
if (!bch2_checksum_type_valid(c, crc.csum_type))
return "invalid checksum type";
if (crc_compression_type(crc) >= BCH_COMPRESSION_NR)
if (crc.compression_type >= BCH_COMPRESSION_NR)
return "invalid compression type";
if (bch2_csum_type_is_encryption(crc.csum_type)) {
if (nonce == UINT_MAX)
nonce = crc.offset + crc.nonce;
else if (nonce != crc.offset + crc.nonce)
return "incorrect nonce";
}
} else {
ptr = entry_to_ptr(entry);
@ -1864,102 +1848,75 @@ static unsigned PTR_TIER(struct bch_fs *c,
}
static void bch2_extent_crc_init(union bch_extent_crc *crc,
unsigned compressed_size,
unsigned uncompressed_size,
unsigned compression_type,
unsigned nonce,
struct bch_csum csum, unsigned csum_type)
struct bch_extent_crc_unpacked new)
{
if (bch_crc_bytes[csum_type] <= 4 &&
uncompressed_size <= CRC32_SIZE_MAX &&
nonce <= CRC32_NONCE_MAX) {
#define common_fields(_crc) \
.csum_type = _crc.csum_type, \
.compression_type = _crc.compression_type, \
._compressed_size = _crc.compressed_size - 1, \
._uncompressed_size = _crc.uncompressed_size - 1, \
.offset = _crc.offset
if (bch_crc_bytes[new.csum_type] <= 4 &&
new.uncompressed_size <= CRC32_SIZE_MAX &&
new.nonce <= CRC32_NONCE_MAX) {
crc->crc32 = (struct bch_extent_crc32) {
.type = 1 << BCH_EXTENT_ENTRY_crc32,
._compressed_size = compressed_size - 1,
._uncompressed_size = uncompressed_size - 1,
.offset = 0,
.compression_type = compression_type,
.csum_type = csum_type,
.csum = *((__le32 *) &csum.lo),
common_fields(new),
.csum = *((__le32 *) &new.csum.lo),
};
return;
}
if (bch_crc_bytes[csum_type] <= 10 &&
uncompressed_size <= CRC64_SIZE_MAX &&
nonce <= CRC64_NONCE_MAX) {
if (bch_crc_bytes[new.csum_type] <= 10 &&
new.uncompressed_size <= CRC64_SIZE_MAX &&
new.nonce <= CRC64_NONCE_MAX) {
crc->crc64 = (struct bch_extent_crc64) {
.type = 1 << BCH_EXTENT_ENTRY_crc64,
._compressed_size = compressed_size - 1,
._uncompressed_size = uncompressed_size - 1,
.offset = 0,
.nonce = nonce,
.compression_type = compression_type,
.csum_type = csum_type,
.csum_lo = csum.lo,
.csum_hi = *((__le16 *) &csum.hi),
common_fields(new),
.nonce = new.nonce,
.csum_lo = new.csum.lo,
.csum_hi = *((__le16 *) &new.csum.hi),
};
return;
}
if (bch_crc_bytes[csum_type] <= 16 &&
uncompressed_size <= CRC128_SIZE_MAX &&
nonce <= CRC128_NONCE_MAX) {
if (bch_crc_bytes[new.csum_type] <= 16 &&
new.uncompressed_size <= CRC128_SIZE_MAX &&
new.nonce <= CRC128_NONCE_MAX) {
crc->crc128 = (struct bch_extent_crc128) {
.type = 1 << BCH_EXTENT_ENTRY_crc128,
._compressed_size = compressed_size - 1,
._uncompressed_size = uncompressed_size - 1,
.offset = 0,
.nonce = nonce,
.compression_type = compression_type,
.csum_type = csum_type,
.csum = csum,
common_fields(new),
.nonce = new.nonce,
.csum = new.csum,
};
return;
}
#undef common_fields
BUG();
}
void bch2_extent_crc_append(struct bkey_i_extent *e,
unsigned compressed_size,
unsigned uncompressed_size,
unsigned compression_type,
unsigned nonce,
struct bch_csum csum, unsigned csum_type)
struct bch_extent_crc_unpacked new)
{
union bch_extent_crc *crc;
struct bch_extent_crc_unpacked crc;
const union bch_extent_entry *i;
BUG_ON(compressed_size > uncompressed_size);
BUG_ON(uncompressed_size != e->k.size);
BUG_ON(!compressed_size || !uncompressed_size);
BUG_ON(new.compressed_size > new.uncompressed_size);
BUG_ON(new.live_size != e->k.size);
BUG_ON(!new.compressed_size || !new.uncompressed_size);
/*
* Look up the last crc entry, so we can check if we need to add
* another:
*/
extent_for_each_crc(extent_i_to_s(e), crc)
extent_for_each_crc(extent_i_to_s(e), crc, i)
;
if (!crc && !csum_type && !compression_type)
if (!memcmp(&crc, &new, sizeof(crc)))
return;
if (crc &&
crc_compressed_size(&e->k, crc) == compressed_size &&
crc_uncompressed_size(&e->k, crc) == uncompressed_size &&
crc_offset(crc) == 0 &&
crc_nonce(crc) == nonce &&
crc_csum_type(crc) == csum_type &&
crc_compression_type(crc) == compression_type &&
crc_csum(crc).lo == csum.lo &&
crc_csum(crc).hi == csum.hi)
return;
bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)),
compressed_size,
uncompressed_size,
compression_type,
nonce, csum, csum_type);
bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new);
__extent_entry_push(e);
}
@ -2011,16 +1968,22 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
}
void bch2_extent_mark_replicas_cached(struct bch_fs *c,
struct bkey_s_extent e,
unsigned nr_cached)
struct bkey_s_extent e)
{
struct bch_extent_ptr *ptr;
unsigned tier = 0, nr_cached = 0, nr_good = 0;
bool have_higher_tier;
unsigned tier = 0;
if (!nr_cached)
extent_for_each_ptr(e, ptr)
if (!ptr->cached &&
c->devs[ptr->dev]->mi.state != BCH_MEMBER_STATE_FAILED)
nr_good++;
if (nr_good <= c->opts.data_replicas)
return;
nr_cached = nr_good - c->opts.data_replicas;
do {
have_higher_tier = false;

@ -3,7 +3,7 @@
#include "bcachefs.h"
#include "bkey.h"
#include "io_types.h"
#include "extents_types.h"
struct bch_fs;
struct journal_res;
@ -38,11 +38,17 @@ bch2_insert_fixup_extent(struct btree_insert *,
struct btree_insert_entry *);
bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
void bch2_extent_mark_replicas_cached(struct bch_fs *,
struct bkey_s_extent, unsigned);
void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent);
const struct bch_extent_ptr *
bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
unsigned bch2_extent_is_compressed(struct bkey_s_c);
bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
struct bch_extent_ptr, u64);
static inline bool bkey_extent_is_data(const struct bkey *k)
{
@ -67,6 +73,12 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k)
}
}
static inline bool bch2_extent_is_fully_allocated(struct bkey_s_c k)
{
return bkey_extent_is_allocation(k.k) &&
!bch2_extent_is_compressed(k);
}
static inline bool bkey_extent_is_cached(const struct bkey *k)
{
return k->type == BCH_EXTENT_CACHED;
@ -170,6 +182,8 @@ union bch_extent_crc {
(struct bch_extent_ptr *) (_entry)); \
})
/* checksum entries: */
enum bch_extent_crc_type {
BCH_EXTENT_CRC_NONE,
BCH_EXTENT_CRC32,
@ -208,6 +222,50 @@ __extent_crc_type(const union bch_extent_crc *crc)
: __extent_crc_type((union bch_extent_crc *) _crc); \
})
static inline struct bch_extent_crc_unpacked
bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
{
#define common_fields(_crc) \
.csum_type = _crc.csum_type, \
.compression_type = _crc.compression_type, \
.compressed_size = _crc._compressed_size + 1, \
.uncompressed_size = _crc._uncompressed_size + 1, \
.offset = _crc.offset, \
.live_size = k->size
switch (extent_crc_type(crc)) {
case BCH_EXTENT_CRC_NONE:
return (struct bch_extent_crc_unpacked) {
.compressed_size = k->size,
.uncompressed_size = k->size,
.live_size = k->size,
};
case BCH_EXTENT_CRC32:
return (struct bch_extent_crc_unpacked) {
common_fields(crc->crc32),
.csum.lo = crc->crc32.csum,
};
case BCH_EXTENT_CRC64:
return (struct bch_extent_crc_unpacked) {
common_fields(crc->crc64),
.nonce = crc->crc64.nonce,
.csum.lo = crc->crc64.csum_lo,
.csum.hi = crc->crc64.csum_hi,
};
case BCH_EXTENT_CRC128:
return (struct bch_extent_crc_unpacked) {
common_fields(crc->crc128),
.nonce = crc->crc128.nonce,
.csum = crc->crc128.csum,
};
default:
BUG();
}
#undef common_fields
}
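
Note the size - 1 encoding: the packed fields store one less than the size, so an n-bit field covers 1..2^n sectors (a zero-size extent can't exist), and unpacking adds the 1 back. Minimal sketch with an illustrative 7-bit field (the real widths live in bcachefs_format.h):

#include <assert.h>

struct crc32_packed_example {
	unsigned _compressed_size:7;	/* assumed width, for illustration only */
};

int main(void)
{
	struct crc32_packed_example p = { ._compressed_size = 128 - 1 };

	/* unpack adds the 1 back, mirroring bch2_extent_crc_unpack(): */
	unsigned compressed_size = p._compressed_size + 1;

	assert(compressed_size == 128);
	return 0;
}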
/* Extent entry iteration: */
#define extent_entry_next(_entry) \
((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
@ -226,7 +284,7 @@ __extent_crc_type(const union bch_extent_crc *crc)
/* Iterate over crcs only: */
#define extent_crc_next(_e, _p) \
#define __extent_crc_next(_e, _p) \
({ \
typeof(&(_e).v->start[0]) _entry = _p; \
\
@ -237,24 +295,40 @@ __extent_crc_type(const union bch_extent_crc *crc)
entry_to_crc(_entry < extent_entry_last(_e) ? _entry : NULL); \
})
#define extent_for_each_crc(_e, _crc) \
for ((_crc) = extent_crc_next(_e, (_e).v->start); \
#define __extent_for_each_crc(_e, _crc) \
for ((_crc) = __extent_crc_next(_e, (_e).v->start); \
(_crc); \
(_crc) = extent_crc_next(_e, extent_entry_next(to_entry(_crc))))
(_crc) = __extent_crc_next(_e, extent_entry_next(to_entry(_crc))))
#define extent_crc_next(_e, _crc, _iter) \
({ \
extent_for_each_entry_from(_e, _iter, _iter) \
if (extent_entry_is_crc(_iter)) { \
(_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_iter));\
break; \
} \
\
(_iter) < extent_entry_last(_e); \
})
#define extent_for_each_crc(_e, _crc, _iter) \
for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \
(_iter) = (_e).v->start; \
extent_crc_next(_e, _crc, _iter); \
(_iter) = extent_entry_next(_iter))
/* Iterate over pointers, with crcs: */
#define extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter) \
#define extent_ptr_crc_next(_e, _ptr, _crc) \
({ \
__label__ out; \
typeof(&(_e).v->start[0]) _entry; \
\
extent_for_each_entry_from(_e, _entry, to_entry(_ptr)) \
if (extent_entry_is_crc(_entry)) { \
(_crc) = entry_to_crc(_entry); \
(_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_entry));\
} else { \
_ptr = entry_to_ptr(_entry); \
if (_filter) \
goto out; \
} \
\
@ -263,34 +337,25 @@ out: \
_ptr; \
})
#define extent_for_each_ptr_crc_filter(_e, _ptr, _crc, _filter) \
for ((_crc) = NULL, \
(_ptr) = &(_e).v->start->ptr; \
((_ptr) = extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter));\
(_ptr)++)
#define extent_for_each_ptr_crc(_e, _ptr, _crc) \
extent_for_each_ptr_crc_filter(_e, _ptr, _crc, true)
for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \
(_ptr) = &(_e).v->start->ptr; \
((_ptr) = extent_ptr_crc_next(_e, _ptr, _crc)); \
(_ptr)++)
/* Iterate over pointers only, and from a given position: */
#define extent_ptr_next_filter(_e, _ptr, _filter) \
#define extent_ptr_next(_e, _ptr) \
({ \
typeof(__entry_to_crc(&(_e).v->start[0])) _crc; \
struct bch_extent_crc_unpacked _crc; \
\
extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter); \
extent_ptr_crc_next(_e, _ptr, _crc); \
})
#define extent_ptr_next(_e, _ptr) \
extent_ptr_next_filter(_e, _ptr, true)
#define extent_for_each_ptr_filter(_e, _ptr, _filter) \
for ((_ptr) = &(_e).v->start->ptr; \
((_ptr) = extent_ptr_next_filter(_e, _ptr, _filter)); \
(_ptr)++)
#define extent_for_each_ptr(_e, _ptr) \
extent_for_each_ptr_filter(_e, _ptr, true)
for ((_ptr) = &(_e).v->start->ptr; \
((_ptr) = extent_ptr_next(_e, _ptr)); \
(_ptr)++)
#define extent_ptr_prev(_e, _ptr) \
({ \
@ -315,8 +380,8 @@ out: \
(_ptr); \
(_ptr) = extent_ptr_prev(_e, _ptr))
void bch2_extent_crc_append(struct bkey_i_extent *, unsigned, unsigned,
unsigned, unsigned, struct bch_csum, unsigned);
void bch2_extent_crc_append(struct bkey_i_extent *,
struct bch_extent_crc_unpacked);
static inline void __extent_entry_push(struct bkey_i_extent *e)
{
@ -336,226 +401,26 @@ static inline void extent_ptr_append(struct bkey_i_extent *e,
__extent_entry_push(e);
}
static inline struct bch_extent_crc128 crc_to_128(const struct bkey *k,
const union bch_extent_crc *crc)
static inline struct bch_devs_list bch2_extent_devs(struct bkey_s_c_extent e)
{
EBUG_ON(!k->size);
switch (extent_crc_type(crc)) {
case BCH_EXTENT_CRC_NONE:
return (struct bch_extent_crc128) {
._compressed_size = k->size - 1,
._uncompressed_size = k->size - 1,
};
case BCH_EXTENT_CRC32:
return (struct bch_extent_crc128) {
.type = 1 << BCH_EXTENT_ENTRY_crc128,
._compressed_size = crc->crc32._compressed_size,
._uncompressed_size = crc->crc32._uncompressed_size,
.offset = crc->crc32.offset,
.csum_type = crc->crc32.csum_type,
.compression_type = crc->crc32.compression_type,
.csum.lo = crc->crc32.csum,
};
case BCH_EXTENT_CRC64:
return (struct bch_extent_crc128) {
.type = 1 << BCH_EXTENT_ENTRY_crc128,
._compressed_size = crc->crc64._compressed_size,
._uncompressed_size = crc->crc64._uncompressed_size,
.offset = crc->crc64.offset,
.nonce = crc->crc64.nonce,
.csum_type = crc->crc64.csum_type,
.compression_type = crc->crc64.compression_type,
.csum.lo = crc->crc64.csum_lo,
.csum.hi = crc->crc64.csum_hi,
};
case BCH_EXTENT_CRC128:
return crc->crc128;
default:
BUG();
}
}
#define crc_compressed_size(_k, _crc) \
({ \
unsigned _size = 0; \
\
switch (extent_crc_type(_crc)) { \
case BCH_EXTENT_CRC_NONE: \
_size = ((const struct bkey *) (_k))->size; \
break; \
case BCH_EXTENT_CRC32: \
_size = ((struct bch_extent_crc32 *) _crc) \
->_compressed_size + 1; \
break; \
case BCH_EXTENT_CRC64: \
_size = ((struct bch_extent_crc64 *) _crc) \
->_compressed_size + 1; \
break; \
case BCH_EXTENT_CRC128: \
_size = ((struct bch_extent_crc128 *) _crc) \
->_compressed_size + 1; \
break; \
} \
_size; \
})
#define crc_uncompressed_size(_k, _crc) \
({ \
unsigned _size = 0; \
\
switch (extent_crc_type(_crc)) { \
case BCH_EXTENT_CRC_NONE: \
_size = ((const struct bkey *) (_k))->size; \
break; \
case BCH_EXTENT_CRC32: \
_size = ((struct bch_extent_crc32 *) _crc) \
->_uncompressed_size + 1; \
break; \
case BCH_EXTENT_CRC64: \
_size = ((struct bch_extent_crc64 *) _crc) \
->_uncompressed_size + 1; \
break; \
case BCH_EXTENT_CRC128: \
_size = ((struct bch_extent_crc128 *) _crc) \
->_uncompressed_size + 1; \
break; \
} \
_size; \
})
static inline unsigned crc_offset(const union bch_extent_crc *crc)
{
switch (extent_crc_type(crc)) {
case BCH_EXTENT_CRC_NONE:
return 0;
case BCH_EXTENT_CRC32:
return crc->crc32.offset;
case BCH_EXTENT_CRC64:
return crc->crc64.offset;
case BCH_EXTENT_CRC128:
return crc->crc128.offset;
default:
BUG();
}
}
static inline unsigned crc_nonce(const union bch_extent_crc *crc)
{
switch (extent_crc_type(crc)) {
case BCH_EXTENT_CRC_NONE:
case BCH_EXTENT_CRC32:
return 0;
case BCH_EXTENT_CRC64:
return crc->crc64.nonce;
case BCH_EXTENT_CRC128:
return crc->crc128.nonce;
default:
BUG();
}
}
static inline unsigned crc_csum_type(const union bch_extent_crc *crc)
{
switch (extent_crc_type(crc)) {
case BCH_EXTENT_CRC_NONE:
return 0;
case BCH_EXTENT_CRC32:
return crc->crc32.csum_type;
case BCH_EXTENT_CRC64:
return crc->crc64.csum_type;
case BCH_EXTENT_CRC128:
return crc->crc128.csum_type;
default:
BUG();
}
}
static inline unsigned crc_compression_type(const union bch_extent_crc *crc)
{
switch (extent_crc_type(crc)) {
case BCH_EXTENT_CRC_NONE:
return 0;
case BCH_EXTENT_CRC32:
return crc->crc32.compression_type;
case BCH_EXTENT_CRC64:
return crc->crc64.compression_type;
case BCH_EXTENT_CRC128:
return crc->crc128.compression_type;
default:
BUG();
}
}
static inline struct bch_csum crc_csum(const union bch_extent_crc *crc)
{
switch (extent_crc_type(crc)) {
case BCH_EXTENT_CRC_NONE:
return (struct bch_csum) { 0 };
case BCH_EXTENT_CRC32:
return (struct bch_csum) { .lo = crc->crc32.csum };
case BCH_EXTENT_CRC64:
return (struct bch_csum) {
.lo = crc->crc64.csum_lo,
.hi = crc->crc64.csum_hi,
};
case BCH_EXTENT_CRC128:
return crc->crc128.csum;
default:
BUG();
}
}
static inline unsigned bkey_extent_is_compressed(struct bkey_s_c k)
{
struct bkey_s_c_extent e;
struct bch_devs_list ret = (struct bch_devs_list) { 0 };
const struct bch_extent_ptr *ptr;
const union bch_extent_crc *crc;
unsigned ret = 0;
switch (k.k->type) {
case BCH_EXTENT:
case BCH_EXTENT_CACHED:
e = bkey_s_c_to_extent(k);
extent_for_each_ptr_crc(e, ptr, crc)
if (!ptr->cached &&
crc_compression_type(crc) != BCH_COMPRESSION_NONE &&
crc_compressed_size(e.k, crc) < k.k->size)
ret = max_t(unsigned, ret,
crc_compressed_size(e.k, crc));
}
extent_for_each_ptr(e, ptr)
ret.devs[ret.nr++] = ptr->dev;
return ret;
}
static inline unsigned extent_current_nonce(struct bkey_s_c_extent e)
{
const union bch_extent_crc *crc;
extent_for_each_crc(e, crc)
if (bch2_csum_type_is_encryption(crc_csum_type(crc)))
return crc_offset(crc) + crc_nonce(crc);
return 0;
}
void bch2_extent_narrow_crcs(struct bkey_s_extent);
bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent,
struct bch_extent_crc_unpacked);
bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked);
void bch2_extent_drop_redundant_crcs(struct bkey_s_extent);
void __bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);
void bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);
void bch2_extent_drop_ptr_idx(struct bkey_s_extent, unsigned);
const struct bch_extent_ptr *
bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
struct bch_extent_ptr *
bch2_extent_find_ptr(struct bch_fs *, struct bkey_s_extent,
struct bch_extent_ptr);
struct bch_extent_ptr *
bch2_extent_find_matching_ptr(struct bch_fs *, struct bkey_s_extent,
struct bkey_s_c_extent);
bool bch2_cut_front(struct bpos, struct bkey_i *);
bool bch2_cut_back(struct bpos, struct bkey *);
void bch2_key_resize(struct bkey *, unsigned);

@ -0,0 +1,27 @@
#ifndef _BCACHEFS_EXTENTS_TYPES_H
#define _BCACHEFS_EXTENTS_TYPES_H
#include "bcachefs_format.h"
struct bch_extent_crc_unpacked {
u8 csum_type;
u8 compression_type;
u16 compressed_size;
u16 uncompressed_size;
u16 offset;
u16 live_size;
u16 nonce;
struct bch_csum csum;
};
struct extent_pick_ptr {
struct bch_extent_ptr ptr;
struct bch_extent_crc_unpacked crc;
struct bch_dev *ca;
};
#endif /* _BCACHEFS_EXTENTS_TYPES_H */

@ -80,7 +80,7 @@ static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
EBUG_ON(i >= size);
if (eytzinger1_left_child(i) < size) {
i = eytzinger1_left_child(i);
i = eytzinger1_left_child(i) + 1;
i <<= __fls(size) - __fls(i);
i -= 1;
@ -163,38 +163,6 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
(_i) != 0; \
(_i) = eytzinger1_next((_i), (_size)))
#if 0
void eytzinger0_test(void)
{
unsigned i, j, size;
for (size = 2;
size < 65536000;
size++) {
if (!(size % 4096))
printk(KERN_INFO "tree size %u\n", size);
assert(eytzinger1_prev(0, size) == eytzinger1_last(size));
assert(eytzinger1_next(0, size) == eytzinger1_first(size));
assert(eytzinger1_prev(eytzinger1_first(size), size) == 0);
assert(eytzinger1_next(eytzinger1_last(size), size) == 0);
eytzinger1_for_each(j, size) {
assert(from_inorder(i, size) == j);
assert(to_inorder(j, size) == i);
if (j != eytzinger1_last(size)) {
unsigned next = eytzinger1_next(j, size);
assert(eytzinger1_prev(next, size) == j);
}
}
}
}
#endif
/* Zero based indexing version: */
static inline unsigned eytzinger0_child(unsigned i, unsigned child)
@ -214,27 +182,29 @@ static inline unsigned eytzinger0_right_child(unsigned i)
return eytzinger0_child(i, 1);
}
#if 0
static inline unsigned eytzinger0_first(unsigned size)
{
return eytzinger1_first(size + 1) - 1;
}
static inline unsigned eytzinger0_last(unsigned size)
{
return eytzinger1_last(size + 1) - 1;
}
static inline unsigned eytzinger0_next(unsigned i, unsigned size)
{
return eytzinger1_next(i + 1, size + 1) - 1;
}
static inline unsigned eytzinger0_prev(unsigned i, unsigned size)
{
return eytzinger1_prev(i + 1, size + 1) - 1;
}
#endif
static inline unsigned eytzinger0_extra(unsigned size)
{
return (size + 1 - rounddown_pow_of_two(size)) << 1;
return eytzinger1_extra(size + 1);
}
static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size,
@ -259,10 +229,41 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size));
}
#define eytzinger0_for_each(_i, _size) \
for ((_i) = eytzinger0_first((_size)); \
(_i) != -1; \
(_i) = eytzinger0_next((_i), (_size)))
typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
/* return greatest node <= @search, or -1 if not found */
static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
eytzinger_cmp_fn cmp, const void *search)
{
unsigned i, n = 0;
if (!nr)
return -1;
do {
i = n;
n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
} while (n < nr);
if (n & 1) {
/* @i was greater than @search, return previous node: */
if (i == eytzinger0_first(nr))
return -1;
return eytzinger0_prev(i, nr);
} else {
return i;
}
}
static inline size_t eytzinger0_find(void *base, size_t nr, size_t size,
eytzinger_cmp_fn cmp, void *search)
eytzinger_cmp_fn cmp, const void *search)
{
size_t i = 0;
int res;
@ -271,17 +272,6 @@ static inline size_t eytzinger0_find(void *base, size_t nr, size_t size,
(res = cmp(search, base + i * size, size)))
i = eytzinger0_child(i, res > 0);
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
bool found1 = i < nr, found2 = false;
size_t j;
for (j = 0; j < nr; j++)
if (!cmp(base + j * size, search, size))
found2 = true;
BUG_ON(found1 != found2);
}
return i;
}

@ -26,9 +26,67 @@
#include <trace/events/bcachefs.h>
#include <trace/events/writeback.h>
struct bio_set *bch2_writepage_bioset;
struct bio_set *bch2_dio_read_bioset;
struct bio_set *bch2_dio_write_bioset;
struct i_sectors_hook {
struct extent_insert_hook hook;
s64 sectors;
struct bch_inode_info *inode;
};
struct bchfs_write_op {
struct bch_inode_info *inode;
s64 sectors_added;
bool is_dio;
bool unalloc;
u64 new_i_size;
/* must be last: */
struct bch_write_op op;
};
static inline void bch2_fswrite_op_init(struct bchfs_write_op *op,
struct bch_inode_info *inode,
bool is_dio)
{
op->inode = inode;
op->sectors_added = 0;
op->is_dio = is_dio;
op->unalloc = false;
op->new_i_size = U64_MAX;
}
struct bch_writepage_io {
struct closure cl;
/* must be last: */
struct bchfs_write_op op;
};
struct dio_write {
struct closure cl;
struct kiocb *req;
struct bch_fs *c;
long written;
long error;
loff_t offset;
struct disk_reservation res;
struct iovec *iovec;
struct iovec inline_vecs[UIO_FASTIOV];
struct iov_iter iter;
struct task_struct *task;
/* must be last: */
struct bchfs_write_op iop;
};
struct dio_read {
struct closure cl;
struct kiocb *req;
long ret;
struct bch_read_bio rbio;
};
/* pagecache_block must be held */
static int write_invalidate_inode_pages_range(struct address_space *mapping,
@ -101,7 +159,7 @@ static inline void i_size_dirty_get(struct bch_inode_info *inode)
/* i_sectors accounting: */
static enum extent_insert_hook_ret
static enum btree_insert_ret
i_sectors_hook_fn(struct extent_insert_hook *hook,
struct bpos committed_pos,
struct bpos next_pos,
@ -119,7 +177,7 @@ i_sectors_hook_fn(struct extent_insert_hook *hook,
h->sectors += sectors * sign;
return BTREE_HOOK_DO_INSERT;
return BTREE_INSERT_OK;
}
static int inode_set_i_sectors_dirty(struct bch_inode_info *inode,
@ -208,7 +266,7 @@ struct bchfs_extent_trans_hook {
bool need_inode_update;
};
static enum extent_insert_hook_ret
static enum btree_insert_ret
bchfs_extent_update_hook(struct extent_insert_hook *hook,
struct bpos committed_pos,
struct bpos next_pos,
@ -224,6 +282,10 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
u64 offset = min(next_pos.offset << 9, h->op->new_i_size);
bool do_pack = false;
if (h->op->unalloc &&
!bch2_extent_is_fully_allocated(k))
return BTREE_INSERT_ENOSPC;
BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE));
/* XXX: inode->i_size locking */
@ -232,7 +294,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
if (!h->need_inode_update) {
h->need_inode_update = true;
return BTREE_HOOK_RESTART_TRANS;
return BTREE_INSERT_NEED_TRAVERSE;
}
h->inode_u.bi_size = offset;
@ -247,7 +309,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
if (sectors) {
if (!h->need_inode_update) {
h->need_inode_update = true;
return BTREE_HOOK_RESTART_TRANS;
return BTREE_INSERT_NEED_TRAVERSE;
}
h->inode_u.bi_sectors += sectors;
@ -267,7 +329,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
if (do_pack)
bch2_inode_pack(&h->inode_p, &h->inode_u);
return BTREE_HOOK_DO_INSERT;
return BTREE_INSERT_OK;
}
static int bchfs_write_index_update(struct bch_write_op *wop)
@ -352,12 +414,16 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC,
BTREE_INSERT_ENTRY(&extent_iter, k));
}
BUG_ON(bkey_cmp(extent_iter.pos, bkey_start_pos(&k->k)));
BUG_ON(!ret != !k->k.size);
err:
if (ret == -EINTR)
continue;
if (ret)
break;
BUG_ON(bkey_cmp(extent_iter.pos, k->k.p) < 0);
bch2_keylist_pop_front(keys);
} while (!bch2_keylist_empty(keys));
@ -748,8 +814,7 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
if (bkey_extent_is_allocation(k.k))
bch2_add_page_sectors(bio, k);
if (!bkey_extent_is_allocation(k.k) ||
bkey_extent_is_compressed(k))
if (!bch2_extent_is_fully_allocated(k))
bch2_mark_pages_unalloc(bio);
if (pick.ca) {
@ -759,7 +824,8 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
trace_read_split(&rbio->bio);
}
bch2_read_extent(c, rbio, k, &pick, flags);
bch2_read_extent(c, rbio, bkey_s_c_to_extent(k),
&pick, flags);
} else {
zero_fill_bio(bio);
@ -963,22 +1029,20 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
alloc_io:
w->io = container_of(bio_alloc_bioset(GFP_NOFS,
BIO_MAX_PAGES,
bch2_writepage_bioset),
&c->writepage_bioset),
struct bch_writepage_io, op.op.wbio.bio);
closure_init(&w->io->cl, NULL);
w->io->op.inode = inode;
w->io->op.sectors_added = 0;
w->io->op.is_dio = false;
bch2_fswrite_op_init(&w->io->op, inode, false);
bch2_write_op_init(&w->io->op.op, c,
(struct disk_reservation) {
.nr_replicas = c->opts.data_replicas,
},
c->fastest_devs,
inode->ei_last_dirtied,
writepoint_hashed(inode->ei_last_dirtied),
POS(inum, 0),
&inode->ei_journal_seq,
BCH_WRITE_THROTTLE);
0);
w->io->op.op.index_update_fn = bchfs_write_index_update;
}
@ -1409,7 +1473,7 @@ static int bch2_direct_IO_read(struct bch_fs *c, struct kiocb *req,
bio = bio_alloc_bioset(GFP_KERNEL,
iov_iter_npages(iter, BIO_MAX_PAGES),
bch2_dio_read_bioset);
&c->dio_read_bioset);
bio->bi_end_io = bch2_direct_IO_read_endio;
@ -1541,20 +1605,19 @@ static void bch2_do_direct_IO_write(struct dio_write *dio)
return;
}
dio->iop.inode = inode;
dio->iop.sectors_added = 0;
dio->iop.is_dio = true;
dio->iop.new_i_size = U64_MAX;
bch2_write_op_init(&dio->iop.op, dio->c, dio->res,
dio->c->fastest_devs,
(unsigned long) dio->task,
writepoint_hashed((unsigned long) dio->task),
POS(inode->v.i_ino, (dio->offset + dio->written) >> 9),
&inode->ei_journal_seq,
flags|BCH_WRITE_THROTTLE);
flags);
dio->iop.op.index_update_fn = bchfs_write_index_update;
if (!dio->iop.unalloc) {
dio->res.sectors -= bio_sectors(bio);
dio->iop.op.res.sectors = bio_sectors(bio);
}
task_io_account_write(bio->bi_iter.bi_size);
@ -1589,6 +1652,31 @@ static void bch2_dio_write_loop_async(struct closure *cl)
}
}
static int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos,
u64 size)
{
struct btree_iter iter;
struct bpos end = pos;
struct bkey_s_c k;
int ret = 0;
end.offset += size;
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos,
BTREE_ITER_WITH_HOLES, k) {
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
break;
if (!bch2_extent_is_fully_allocated(k)) {
ret = -ENOSPC;
break;
}
}
bch2_btree_iter_unlock(&iter);
return ret;
}
static int bch2_direct_IO_write(struct bch_fs *c,
struct kiocb *req, struct file *file,
struct bch_inode_info *inode,
@ -1610,8 +1698,9 @@ static int bch2_direct_IO_write(struct bch_fs *c,
bio = bio_alloc_bioset(GFP_KERNEL,
iov_iter_npages(iter, BIO_MAX_PAGES),
bch2_dio_write_bioset);
&c->dio_write_bioset);
dio = container_of(bio, struct dio_write, iop.op.wbio.bio);
closure_init(&dio->cl, NULL);
dio->req = req;
dio->c = c;
dio->written = 0;
@ -1620,7 +1709,7 @@ static int bch2_direct_IO_write(struct bch_fs *c,
dio->iovec = NULL;
dio->iter = *iter;
dio->task = current;
closure_init(&dio->cl, NULL);
bch2_fswrite_op_init(&dio->iop, inode, true);
if (offset + iter->count > inode->v.i_size)
sync = true;
@ -1635,11 +1724,17 @@ static int bch2_direct_IO_write(struct bch_fs *c,
*/
ret = bch2_disk_reservation_get(c, &dio->res, iter->count >> 9, 0);
if (unlikely(ret)) {
if (bch2_check_range_allocated(c, POS(inode->v.i_ino,
offset >> 9),
iter->count >> 9)) {
closure_debug_destroy(&dio->cl);
bio_put(bio);
return ret;
}
dio->iop.unalloc = true;
}
inode_dio_begin(&inode->v);
__pagecache_block_get(&mapping->add_lock);
@ -2318,7 +2413,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
reservation.v.nr_replicas = bch2_extent_nr_dirty_ptrs(k);
if (reservation.v.nr_replicas < replicas ||
bkey_extent_is_compressed(k)) {
bch2_extent_is_compressed(k)) {
ret = bch2_disk_reservation_get(c, &disk_res,
sectors, 0);
if (ret)
@ -2564,4 +2659,24 @@ loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
return -EINVAL;
}
void bch2_fs_fsio_exit(struct bch_fs *c)
{
bioset_exit(&c->dio_write_bioset);
bioset_exit(&c->dio_read_bioset);
bioset_exit(&c->writepage_bioset);
}
int bch2_fs_fsio_init(struct bch_fs *c)
{
if (bioset_init(&c->writepage_bioset,
4, offsetof(struct bch_writepage_io, op.op.wbio.bio)) ||
bioset_init(&c->dio_read_bioset,
4, offsetof(struct dio_read, rbio.bio)) ||
bioset_init(&c->dio_write_bioset,
4, offsetof(struct dio_write, iop.op.wbio.bio)))
return -ENOMEM;
return 0;
}
#endif /* NO_BCACHEFS_FS */

View File

@ -1,7 +1,11 @@
#ifndef _BCACHEFS_FS_IO_H
#define _BCACHEFS_FS_IO_H
#ifndef NO_BCACHEFS_FS
#include "buckets.h"
#include "io_types.h"
#include <linux/uio.h>
int bch2_set_page_dirty(struct page *);
@ -35,60 +39,11 @@ int bch2_releasepage(struct page *, gfp_t);
int bch2_migrate_page(struct address_space *, struct page *,
struct page *, enum migrate_mode);
struct i_sectors_hook {
struct extent_insert_hook hook;
s64 sectors;
struct bch_inode_info *inode;
};
struct bchfs_write_op {
struct bch_inode_info *inode;
s64 sectors_added;
bool is_dio;
u64 new_i_size;
/* must be last: */
struct bch_write_op op;
};
struct bch_writepage_io {
struct closure cl;
/* must be last: */
struct bchfs_write_op op;
};
extern struct bio_set *bch2_writepage_bioset;
struct dio_write {
struct closure cl;
struct kiocb *req;
struct bch_fs *c;
long written;
long error;
loff_t offset;
struct disk_reservation res;
struct iovec *iovec;
struct iovec inline_vecs[UIO_FASTIOV];
struct iov_iter iter;
struct task_struct *task;
/* must be last: */
struct bchfs_write_op iop;
};
extern struct bio_set *bch2_dio_write_bioset;
struct dio_read {
struct closure cl;
struct kiocb *req;
long ret;
struct bch_read_bio rbio;
};
extern struct bio_set *bch2_dio_read_bioset;
void bch2_fs_fsio_exit(struct bch_fs *);
int bch2_fs_fsio_init(struct bch_fs *);
#else
static inline void bch2_fs_fsio_exit(struct bch_fs *c) {}
static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; }
#endif
#endif /* _BCACHEFS_FS_IO_H */

View File

@ -654,17 +654,17 @@ static int bch2_fill_extent(struct fiemap_extent_info *info,
if (bkey_extent_is_data(&k->k)) {
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
const struct bch_extent_ptr *ptr;
const union bch_extent_crc *crc;
struct bch_extent_crc_unpacked crc;
int ret;
extent_for_each_ptr_crc(e, ptr, crc) {
int flags2 = 0;
u64 offset = ptr->offset;
if (crc_compression_type(crc))
if (crc.compression_type)
flags2 |= FIEMAP_EXTENT_ENCODED;
else
offset += crc_offset(crc);
offset += crc.offset;
if ((offset & (PAGE_SECTORS - 1)) ||
(e.k->size & (PAGE_SECTORS - 1)))
@ -1336,12 +1336,6 @@ MODULE_ALIAS_FS("bcachefs");
void bch2_vfs_exit(void)
{
unregister_filesystem(&bcache_fs_type);
if (bch2_dio_write_bioset)
bioset_free(bch2_dio_write_bioset);
if (bch2_dio_read_bioset)
bioset_free(bch2_dio_read_bioset);
if (bch2_writepage_bioset)
bioset_free(bch2_writepage_bioset);
if (bch2_inode_cache)
kmem_cache_destroy(bch2_inode_cache);
}
@ -1354,20 +1348,6 @@ int __init bch2_vfs_init(void)
if (!bch2_inode_cache)
goto err;
bch2_writepage_bioset =
bioset_create(4, offsetof(struct bch_writepage_io, op.op.wbio.bio));
if (!bch2_writepage_bioset)
goto err;
bch2_dio_read_bioset = bioset_create(4, offsetof(struct dio_read, rbio.bio));
if (!bch2_dio_read_bioset)
goto err;
bch2_dio_write_bioset =
bioset_create(4, offsetof(struct dio_write, iop.op.wbio.bio));
if (!bch2_dio_write_bioset)
goto err;
ret = register_filesystem(&bcache_fs_type);
if (ret)
goto err;

File diff suppressed because it is too large

View File

@ -2,6 +2,8 @@
#define _BCACHEFS_IO_H
#include <linux/hash.h>
#include "alloc.h"
#include "checksum.h"
#include "io_types.h"
#define to_wbio(_bio) \
@ -12,6 +14,9 @@
void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
void bch2_bio_alloc_more_pages_pool(struct bch_fs *, struct bio *, size_t);
void bch2_latency_acct(struct bch_dev *, unsigned, int);
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *);
@ -20,14 +25,15 @@ enum bch_write_flags {
BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
BCH_WRITE_CACHED = (1 << 1),
BCH_WRITE_FLUSH = (1 << 2),
BCH_WRITE_DATA_COMPRESSED = (1 << 3),
BCH_WRITE_THROTTLE = (1 << 4),
BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 5),
BCH_WRITE_DATA_ENCODED = (1 << 3),
BCH_WRITE_PAGES_STABLE = (1 << 4),
BCH_WRITE_PAGES_OWNED = (1 << 5),
BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6),
/* Internal: */
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 6),
BCH_WRITE_DONE = (1 << 7),
BCH_WRITE_LOOPED = (1 << 8),
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 7),
BCH_WRITE_DONE = (1 << 8),
BCH_WRITE_LOOPED = (1 << 9),
};
static inline u64 *op_journal_seq(struct bch_write_op *op)
@ -36,11 +42,60 @@ static inline u64 *op_journal_seq(struct bch_write_op *op)
? op->journal_seq_p : &op->journal_seq;
}
void bch2_write_op_init(struct bch_write_op *, struct bch_fs *,
struct disk_reservation,
struct bch_devs_mask *,
unsigned long,
struct bpos, u64 *, unsigned);
static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
{
return op->alloc_reserve == RESERVE_MOVINGGC
? op->c->copygc_wq
: op->c->wq;
}
int bch2_write_index_default(struct bch_write_op *);
static inline void __bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c)
{
op->c = c;
op->io_wq = index_update_wq(op);
op->flags = 0;
op->written = 0;
op->error = 0;
op->csum_type = bch2_data_checksum_type(c);
op->compression_type =
bch2_compression_opt_to_type(c->opts.compression);
op->nr_replicas = 0;
op->nr_replicas_required = c->opts.data_replicas_required;
op->alloc_reserve = RESERVE_NONE;
op->open_buckets_nr = 0;
op->devs_have.nr = 0;
op->pos = POS_MAX;
op->version = ZERO_VERSION;
op->devs = NULL;
op->write_point = (struct write_point_specifier) { 0 };
op->res = (struct disk_reservation) { 0 };
op->journal_seq = 0;
op->index_update_fn = bch2_write_index_default;
}
static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
struct disk_reservation res,
struct bch_devs_mask *devs,
struct write_point_specifier write_point,
struct bpos pos,
u64 *journal_seq, unsigned flags)
{
__bch2_write_op_init(op, c);
op->flags = flags;
op->nr_replicas = res.nr_replicas;
op->pos = pos;
op->res = res;
op->devs = devs;
op->write_point = write_point;
if (journal_seq) {
op->journal_seq_p = journal_seq;
op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
}
}
void bch2_write(struct closure *);
static inline struct bch_write_bio *wbio_init(struct bio *bio)
@ -51,14 +106,13 @@ static inline struct bch_write_bio *wbio_init(struct bio *bio)
return wbio;
}
void bch2_wake_delayed_writes(unsigned long data);
struct bch_devs_mask;
struct cache_promote_op;
struct extent_pick_ptr;
int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
struct bkey_s_c k, struct extent_pick_ptr *, unsigned);
struct bkey_s_c_extent e, struct extent_pick_ptr *,
unsigned);
void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
u64, struct bch_devs_mask *, unsigned);
@ -66,21 +120,22 @@ enum bch_read_flags {
BCH_READ_RETRY_IF_STALE = 1 << 0,
BCH_READ_MAY_PROMOTE = 1 << 1,
BCH_READ_USER_MAPPED = 1 << 2,
BCH_READ_NODECODE = 1 << 3,
/* internal: */
BCH_READ_MUST_BOUNCE = 1 << 3,
BCH_READ_MUST_CLONE = 1 << 4,
BCH_READ_IN_RETRY = 1 << 5,
BCH_READ_MUST_BOUNCE = 1 << 4,
BCH_READ_MUST_CLONE = 1 << 5,
BCH_READ_IN_RETRY = 1 << 6,
};
static inline void bch2_read_extent(struct bch_fs *c,
struct bch_read_bio *rbio,
struct bkey_s_c k,
struct bkey_s_c_extent e,
struct extent_pick_ptr *pick,
unsigned flags)
{
rbio->_state = 0;
__bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, pick, flags);
__bch2_read_extent(c, rbio, rbio->bio.bi_iter, e, pick, flags);
}
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,

View File

@ -1,20 +1,16 @@
#ifndef _BCACHEFS_IO_TYPES_H
#define _BCACHEFS_IO_TYPES_H
#include "alloc_types.h"
#include "btree_types.h"
#include "buckets_types.h"
#include "extents_types.h"
#include "keylist_types.h"
#include "super_types.h"
#include <linux/llist.h>
#include <linux/workqueue.h>
struct extent_pick_ptr {
struct bch_extent_crc128 crc;
struct bch_extent_ptr ptr;
struct bch_dev *ca;
};
struct bch_read_bio {
struct bch_fs *c;
@ -44,26 +40,22 @@ struct bch_read_bio {
struct {
u8 bounce:1,
split:1,
process_context:1,
retry:2;
narrow_crcs:1,
retry:2,
context:2;
};
u8 _state;
};
struct bch_devs_list devs_have;
struct extent_pick_ptr pick;
/* start pos of data we read (may not be pos of data we want) */
struct bpos pos;
struct bversion version;
struct promote_op *promote;
/*
* If we have to retry the read (IO error, checksum failure, read stale
* data (raced with allocator)), we retry the portion of the parent bio
* that failed (i.e. this bio's portion, bvec_iter).
*
* But we need to stash the inode somewhere:
*/
u64 inode;
struct work_struct work;
struct bio bio;
@ -98,36 +90,33 @@ struct bch_write_op {
struct bch_fs *c;
struct workqueue_struct *io_wq;
unsigned written; /* sectors */
short error;
u16 flags;
u16 written; /* sectors */
s8 error;
unsigned csum_type:4;
unsigned compression_type:4;
unsigned nr_replicas:4;
unsigned nr_replicas_required:4;
unsigned alloc_reserve:4;
unsigned nonce:14;
u8 open_buckets_nr;
struct bch_devs_list devs_have;
u16 target;
u16 nonce;
struct bpos pos;
struct bversion version;
/* For BCH_WRITE_DATA_COMPRESSED: */
struct bch_extent_crc128 crc;
unsigned size;
/* For BCH_WRITE_DATA_ENCODED: */
struct bch_extent_crc_unpacked crc;
struct bch_devs_mask *devs;
unsigned long write_point;
struct write_point_specifier write_point;
struct disk_reservation res;
union {
u8 open_buckets[16];
struct {
struct bch_write_op *next;
unsigned long expires;
};
};
/*
* If caller wants to flush but hasn't passed us a journal_seq ptr, we

View File

@ -464,7 +464,8 @@ static int journal_validate_key(struct bch_fs *c, struct jset *j,
if (invalid) {
bch2_bkey_val_to_text(c, key_type, buf, sizeof(buf),
bkey_i_to_s_c(k));
mustfix_fsck_err(c, "invalid %s in journal: %s", type, buf);
mustfix_fsck_err(c, "invalid %s in journal: %s\n%s",
type, invalid, buf);
le16_add_cpu(&entry->u64s, -k->k.u64s);
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
@ -1568,35 +1569,31 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
swap(new_buckets, ja->buckets);
swap(new_bucket_seq, ja->bucket_seq);
spin_unlock(&j->lock);
while (ja->nr < nr) {
/* must happen under journal lock, to avoid racing with gc: */
long b = bch2_bucket_alloc(c, ca, RESERVE_ALLOC);
if (b < 0) {
if (!closure_wait(&c->freelist_wait, &cl)) {
spin_unlock(&j->lock);
struct open_bucket *ob;
size_t bucket;
int ob_idx;
ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, &cl);
if (ob_idx < 0) {
if (!closure_wait(&c->freelist_wait, &cl))
closure_sync(&cl);
spin_lock(&j->lock);
}
continue;
}
bch2_mark_metadata_bucket(ca, &ca->buckets[b],
BUCKET_JOURNAL, false);
bch2_mark_alloc_bucket(ca, &ca->buckets[b], false);
ob = c->open_buckets + ob_idx;
bucket = sector_to_bucket(ca, ob->ptr.offset);
memmove(ja->buckets + ja->last_idx + 1,
ja->buckets + ja->last_idx,
(ja->nr - ja->last_idx) * sizeof(u64));
memmove(ja->bucket_seq + ja->last_idx + 1,
ja->bucket_seq + ja->last_idx,
(ja->nr - ja->last_idx) * sizeof(u64));
memmove(journal_buckets->buckets + ja->last_idx + 1,
journal_buckets->buckets + ja->last_idx,
(ja->nr - ja->last_idx) * sizeof(u64));
spin_lock(&j->lock);
__array_insert_item(ja->buckets, ja->nr, ja->last_idx);
__array_insert_item(ja->bucket_seq, ja->nr, ja->last_idx);
__array_insert_item(journal_buckets->buckets, ja->nr, ja->last_idx);
ja->buckets[ja->last_idx] = b;
journal_buckets->buckets[ja->last_idx] = cpu_to_le64(b);
ja->buckets[ja->last_idx] = bucket;
ja->bucket_seq[ja->last_idx] = 0;
journal_buckets->buckets[ja->last_idx] = cpu_to_le64(bucket);
if (ja->last_idx < ja->nr) {
if (ja->cur_idx >= ja->last_idx)
@ -1604,10 +1601,15 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
ja->last_idx++;
}
ja->nr++;
}
spin_unlock(&j->lock);
bch2_mark_metadata_bucket(c, ca, &ca->buckets[bucket],
BUCKET_JOURNAL,
gc_phase(GC_PHASE_SB), 0);
bch2_open_bucket_put(c, ob);
}
BUG_ON(bch2_sb_validate_journal(ca->disk_sb.sb, ca->mi));
bch2_write_super(c);
@ -1623,6 +1625,8 @@ err:
if (!ret)
bch2_dev_allocator_add(c, ca);
closure_sync(&cl);
return ret;
}

View File

@ -7,8 +7,7 @@ int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t);
void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *);
void bch2_keylist_pop_front(struct keylist *);
static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys,
size_t nr_inline_u64s)
static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys)
{
l->top_p = l->keys_p = inline_keys;
}
@ -17,7 +16,7 @@ static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys)
{
if (l->keys_p != inline_keys)
kfree(l->keys_p);
memset(l, 0, sizeof(*l));
bch2_keylist_init(l, inline_keys);
}
static inline void bch2_keylist_push(struct keylist *l)

View File

@ -13,31 +13,16 @@
#include "move.h"
#include "super-io.h"
static int issue_migration_move(struct bch_dev *ca,
struct moving_context *ctxt,
struct bch_devs_mask *devs,
struct bkey_s_c k)
static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
{
struct bch_fs *c = ca->fs;
struct disk_reservation res;
struct bch_dev *ca = arg;
const struct bch_extent_ptr *ptr;
int ret;
if (bch2_disk_reservation_get(c, &res, k.k->size, 0))
return -ENOSPC;
extent_for_each_ptr(bkey_s_c_to_extent(k), ptr)
extent_for_each_ptr(e, ptr)
if (ptr->dev == ca->dev_idx)
goto found;
return true;
BUG();
found:
/* XXX: we need to be doing something with the disk reservation */
ret = bch2_data_move(c, ctxt, devs, k, ptr);
if (ret)
bch2_disk_reservation_put(c, &res);
return ret;
return false;
}
#define MAX_DATA_OFF_ITER 10
@ -58,10 +43,11 @@ found:
int bch2_move_data_off_device(struct bch_dev *ca)
{
struct moving_context ctxt;
struct bch_fs *c = ca->fs;
struct btree_iter iter;
struct bkey_s_c k;
u64 keys_moved, sectors_moved;
unsigned pass = 0;
u64 seen_key_count;
int ret = 0;
BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
@ -69,12 +55,6 @@ int bch2_move_data_off_device(struct bch_dev *ca)
if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_USER)))
return 0;
mutex_lock(&c->replicas_gc_lock);
bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
bch2_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE);
__set_bit(ca->dev_idx, ctxt.avoid.d);
/*
* In theory, only one pass should be necessary as we've
* quiesced all writes before calling this.
@ -91,69 +71,43 @@ int bch2_move_data_off_device(struct bch_dev *ca)
* Thus this scans the tree one more time than strictly necessary,
* but that can be viewed as a verification pass.
*/
do {
struct btree_iter iter;
struct bkey_s_c k;
seen_key_count = 0;
atomic_set(&ctxt.error_count, 0);
atomic_set(&ctxt.error_flags, 0);
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
BTREE_ITER_PREFETCH);
while (!bch2_move_ctxt_wait(&ctxt) &&
(k = bch2_btree_iter_peek(&iter)).k &&
!(ret = btree_iter_err(k))) {
if (!bkey_extent_is_data(k.k) ||
!bch2_extent_has_device(bkey_s_c_to_extent(k),
ca->dev_idx))
goto next;
ret = issue_migration_move(ca, &ctxt, NULL, k);
if (ret == -ENOMEM) {
bch2_btree_iter_unlock(&iter);
/*
* memory allocation failure, wait for some IO
* to finish
*/
bch2_move_ctxt_wait_for_io(&ctxt);
continue;
ret = bch2_move_data(c, NULL,
SECTORS_IN_FLIGHT_PER_DEVICE,
NULL,
writepoint_hashed((unsigned long) current),
0,
ca->dev_idx,
migrate_pred, ca,
&keys_moved,
&sectors_moved);
if (ret) {
bch_err(c, "error migrating data: %i", ret);
return ret;
}
if (ret == -ENOSPC)
break;
BUG_ON(ret);
} while (keys_moved && pass++ < MAX_DATA_OFF_ITER);
seen_key_count++;
if (keys_moved) {
bch_err(c, "unable to migrate all data in %d iterations",
MAX_DATA_OFF_ITER);
return -1;
}
mutex_lock(&c->replicas_gc_lock);
bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH, k) {
if (!bkey_extent_is_data(k.k))
continue;
next:
if (bkey_extent_is_data(k.k)) {
ret = bch2_check_mark_super(c, bkey_s_c_to_extent(k),
BCH_DATA_USER);
if (ret)
if (ret) {
bch_err(c, "error migrating data %i from check_mark_super()", ret);
break;
}
bch2_btree_iter_advance_pos(&iter);
bch2_btree_iter_cond_resched(&iter);
}
bch2_btree_iter_unlock(&iter);
bch2_move_ctxt_exit(&ctxt);
if (ret)
goto err;
} while (seen_key_count && pass++ < MAX_DATA_OFF_ITER);
if (seen_key_count) {
pr_err("Unable to migrate all data in %d iterations.",
MAX_DATA_OFF_ITER);
ret = -1;
goto err;
}
err:
bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
return ret;
@ -167,14 +121,11 @@ static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
enum btree_id id)
{
struct btree_iter iter;
struct closure cl;
struct btree *b;
int ret;
BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
closure_init_stack(&cl);
for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);

View File

@ -9,41 +9,38 @@
#include "keylist.h"
#include <linux/ioprio.h>
#include <linux/kthread.h>
#include <trace/events/bcachefs.h>
static struct bch_extent_ptr *bkey_find_ptr(struct bch_fs *c,
struct bkey_s_extent e,
struct bch_extent_ptr ptr)
{
struct bch_extent_ptr *ptr2;
struct bch_dev *ca = c->devs[ptr.dev];
struct moving_io {
struct list_head list;
struct closure cl;
bool read_completed;
unsigned sectors;
extent_for_each_ptr(e, ptr2)
if (ptr2->dev == ptr.dev &&
ptr2->gen == ptr.gen &&
PTR_BUCKET_NR(ca, ptr2) ==
PTR_BUCKET_NR(ca, &ptr))
return ptr2;
struct bch_read_bio rbio;
return NULL;
}
struct migrate_write write;
/* Must be last since it is variable size */
struct bio_vec bi_inline_vecs[0];
};
static struct bch_extent_ptr *bch2_migrate_matching_ptr(struct migrate_write *m,
struct bkey_s_extent e)
{
const struct bch_extent_ptr *ptr;
struct bch_extent_ptr *ret;
struct moving_context {
/* Closure for waiting on all reads and writes to complete */
struct closure cl;
if (m->move)
ret = bkey_find_ptr(m->op.c, e, m->move_ptr);
else
extent_for_each_ptr(bkey_i_to_s_c_extent(&m->key), ptr)
if ((ret = bkey_find_ptr(m->op.c, e, *ptr)))
break;
/* Key and sector moves issued, updated from submission context */
u64 keys_moved;
u64 sectors_moved;
atomic64_t sectors_raced;
return ret;
}
struct list_head reads;
atomic_t sectors_in_flight;
wait_queue_head_t wait;
};
static int bch2_migrate_index_update(struct bch_write_op *op)
{
@ -59,71 +56,78 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
BTREE_ITER_INTENT);
while (1) {
struct bkey_s_extent insert =
bkey_i_to_s_extent(bch2_keylist_front(keys));
struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter);
struct bkey_i_extent *insert, *new =
bkey_i_to_extent(bch2_keylist_front(keys));
BKEY_PADDED(k) _new, _insert;
struct bch_extent_ptr *ptr;
struct bkey_s_extent e;
BKEY_PADDED(k) new;
struct bch_extent_crc_unpacked crc;
bool did_work = false;
if (!k.k) {
if (btree_iter_err(k)) {
ret = bch2_btree_iter_unlock(&iter);
break;
}
if (!bkey_extent_is_data(k.k))
if (bversion_cmp(k.k->version, new->k.version) ||
!bkey_extent_is_data(k.k) ||
!bch2_extent_matches_ptr(c, bkey_s_c_to_extent(k),
m->ptr, m->offset))
goto nomatch;
bkey_reassemble(&new.k, k);
bch2_cut_front(iter.pos, &new.k);
bch2_cut_back(insert.k->p, &new.k.k);
e = bkey_i_to_s_extent(&new.k);
bkey_reassemble(&_insert.k, k);
insert = bkey_i_to_extent(&_insert.k);
/* hack - promotes can race: */
if (m->promote)
extent_for_each_ptr(insert, ptr)
if (bch2_extent_has_device(e.c, ptr->dev))
goto nomatch;
bkey_copy(&_new.k, bch2_keylist_front(keys));
new = bkey_i_to_extent(&_new.k);
ptr = bch2_migrate_matching_ptr(m, e);
if (ptr) {
int nr_new_dirty = bch2_extent_nr_dirty_ptrs(insert.s_c);
unsigned insert_flags =
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL;
bch2_cut_front(iter.pos, &insert->k_i);
bch2_cut_back(new->k.p, &insert->k);
bch2_cut_back(insert->k.p, &new->k);
/* copygc uses btree node reserve: */
if (m->move)
insert_flags |= BTREE_INSERT_USE_RESERVE;
if (m->move_dev >= 0 &&
(ptr = (struct bch_extent_ptr *)
bch2_extent_has_device(extent_i_to_s_c(insert),
m->move_dev)))
bch2_extent_drop_ptr(extent_i_to_s(insert), ptr);
if (m->move) {
nr_new_dirty -= !ptr->cached;
__bch2_extent_drop_ptr(e, ptr);
extent_for_each_ptr_crc(extent_i_to_s(new), ptr, crc) {
if (bch2_extent_has_device(extent_i_to_s_c(insert), ptr->dev)) {
/*
* raced with another move op? extent already
* has a pointer to the device we just wrote
* data to
*/
continue;
}
BUG_ON(nr_new_dirty < 0);
bch2_extent_crc_append(insert, crc);
extent_ptr_append(insert, *ptr);
did_work = true;
}
memcpy_u64s(extent_entry_last(e),
insert.v,
bkey_val_u64s(insert.k));
e.k->u64s += bkey_val_u64s(insert.k);
if (!did_work)
goto nomatch;
bch2_extent_narrow_crcs(e);
bch2_extent_drop_redundant_crcs(e);
bch2_extent_normalize(c, e.s);
bch2_extent_mark_replicas_cached(c, e, nr_new_dirty);
bch2_extent_narrow_crcs(insert,
(struct bch_extent_crc_unpacked) { 0 });
bch2_extent_normalize(c, extent_i_to_s(insert).s);
bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert));
ret = bch2_btree_insert_at(c, &op->res,
NULL, op_journal_seq(op),
insert_flags,
BTREE_INSERT_ENTRY(&iter, &new.k));
if (ret && ret != -EINTR)
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL|
m->btree_insert_flags,
BTREE_INSERT_ENTRY(&iter, &insert->k_i));
if (!ret)
atomic_long_inc(&c->extent_migrate_done);
if (ret == -EINTR)
ret = 0;
if (ret)
break;
} else {
nomatch:
bch2_btree_iter_advance_pos(&iter);
}
next:
while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) {
bch2_keylist_pop_front(keys);
if (bch2_keylist_empty(keys))
@ -131,96 +135,83 @@ nomatch:
}
bch2_cut_front(iter.pos, bch2_keylist_front(keys));
continue;
nomatch:
if (m->ctxt)
atomic64_add(k.k->p.offset - iter.pos.offset,
&m->ctxt->sectors_raced);
atomic_long_inc(&c->extent_migrate_raced);
trace_move_race(&new->k);
bch2_btree_iter_advance_pos(&iter);
goto next;
}
out:
bch2_btree_iter_unlock(&iter);
return ret;
}
void bch2_migrate_write_init(struct bch_fs *c,
struct migrate_write *m,
struct bch_devs_mask *devs,
struct bkey_s_c k,
const struct bch_extent_ptr *move_ptr,
unsigned flags)
void bch2_migrate_write_init(struct migrate_write *m,
struct bch_read_bio *rbio)
{
bkey_reassemble(&m->key, k);
/* write bio must own pages: */
BUG_ON(!m->op.wbio.bio.bi_vcnt);
m->promote = false;
m->move = move_ptr != NULL;
if (move_ptr)
m->move_ptr = *move_ptr;
m->ptr = rbio->pick.ptr;
m->offset = rbio->pos.offset - rbio->pick.crc.offset;
m->op.devs_have = rbio->devs_have;
m->op.pos = rbio->pos;
m->op.version = rbio->version;
m->op.crc = rbio->pick.crc;
if (bkey_extent_is_cached(k.k) ||
(move_ptr && move_ptr->cached))
flags |= BCH_WRITE_CACHED;
if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) {
m->op.nonce = m->op.crc.nonce + m->op.crc.offset;
m->op.csum_type = m->op.crc.csum_type;
}
bch2_write_op_init(&m->op, c, (struct disk_reservation) { 0 },
devs, (unsigned long) current,
bkey_start_pos(k.k), NULL,
flags|BCH_WRITE_ONLY_SPECIFIED_DEVS);
if (m->move_dev >= 0)
bch2_dev_list_drop_dev(&m->op.devs_have, m->move_dev);
if (m->move)
if (m->btree_insert_flags & BTREE_INSERT_USE_RESERVE)
m->op.alloc_reserve = RESERVE_MOVINGGC;
m->op.nonce = extent_current_nonce(bkey_s_c_to_extent(k));
m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS|
BCH_WRITE_PAGES_STABLE|
BCH_WRITE_PAGES_OWNED|
BCH_WRITE_DATA_ENCODED;
m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
m->op.nr_replicas = 1;
m->op.nr_replicas_required = 1;
m->op.index_update_fn = bch2_migrate_index_update;
}
static void migrate_bio_init(struct moving_io *io, struct bio *bio,
unsigned sectors)
static void move_free(struct closure *cl)
{
bio_init(bio, io->bi_inline_vecs,
DIV_ROUND_UP(sectors, PAGE_SECTORS));
bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
bio->bi_iter.bi_size = sectors << 9;
bio->bi_private = &io->cl;
bch2_bio_map(bio, NULL);
}
static void moving_io_free(struct moving_io *io)
{
struct moving_context *ctxt = io->ctxt;
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_context *ctxt = io->write.ctxt;
struct bio_vec *bv;
int i;
atomic_sub(io->write.key.k.size, &ctxt->sectors_in_flight);
wake_up(&ctxt->wait);
bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
if (bv->bv_page)
__free_page(bv->bv_page);
atomic_sub(io->sectors, &ctxt->sectors_in_flight);
wake_up(&ctxt->wait);
kfree(io);
}
static void moving_error(struct moving_context *ctxt, unsigned flag)
{
atomic_inc(&ctxt->error_count);
//atomic_or(flag, &ctxt->error_flags);
}
static void moving_write_done(struct closure *cl)
static void move_write(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
if (io->write.op.error)
moving_error(io->ctxt, MOVING_FLAG_WRITE);
//if (io->replace.failures)
// trace_copy_collision(q, &io->key.k);
moving_io_free(io);
if (likely(!io->rbio.bio.bi_error)) {
bch2_migrate_write_init(&io->write, &io->rbio);
closure_call(&io->write.op.cl, bch2_write, NULL, cl);
}
static void write_moving(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct bch_write_op *op = &io->write.op;
closure_call(&op->cl, bch2_write, NULL, &io->cl);
closure_return_with_destructor(&io->cl, moving_write_done);
closure_return_with_destructor(cl, move_free);
}
static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
@ -231,16 +222,10 @@ static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
return io && io->read_completed ? io : NULL;
}
static void read_moving_endio(struct bio *bio)
static void move_read_endio(struct bio *bio)
{
struct closure *cl = bio->bi_private;
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_context *ctxt = io->ctxt;
trace_move_read_done(&io->write.key.k);
if (bio->bi_error)
moving_error(io->ctxt, MOVING_FLAG_READ);
struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
struct moving_context *ctxt = io->write.ctxt;
io->read_completed = true;
if (next_pending_write(ctxt))
@ -249,58 +234,81 @@ static void read_moving_endio(struct bio *bio)
closure_put(&ctxt->cl);
}
int bch2_data_move(struct bch_fs *c,
static int bch2_move_extent(struct bch_fs *c,
struct moving_context *ctxt,
struct bch_devs_mask *devs,
struct bkey_s_c k,
const struct bch_extent_ptr *move_ptr)
struct write_point_specifier wp,
int btree_insert_flags,
int move_device,
struct bkey_s_c k)
{
struct extent_pick_ptr pick;
struct moving_io *io;
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
unsigned sectors = k.k->size, pages;
bch2_extent_pick_ptr(c, k, &ctxt->avoid, &pick);
bch2_extent_pick_ptr(c, k, NULL, &pick);
if (IS_ERR_OR_NULL(pick.ca))
return pick.ca ? PTR_ERR(pick.ca) : 0;
io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) *
DIV_ROUND_UP(k.k->size, PAGE_SECTORS), GFP_KERNEL);
/* write path might have to decompress data: */
extent_for_each_ptr_crc(bkey_s_c_to_extent(k), ptr, crc)
sectors = max_t(unsigned, sectors, crc.uncompressed_size);
pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
io = kzalloc(sizeof(struct moving_io) +
sizeof(struct bio_vec) * pages, GFP_KERNEL);
if (!io)
return -ENOMEM;
goto err;
io->ctxt = ctxt;
io->write.ctxt = ctxt;
io->sectors = k.k->size;
migrate_bio_init(io, &io->rbio.bio, k.k->size);
bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages);
bio_set_prio(&io->write.op.wbio.bio,
IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
io->write.op.wbio.bio.bi_iter.bi_size = sectors << 9;
bch2_bio_map(&io->write.op.wbio.bio, NULL);
if (bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL)) {
kfree(io);
goto err;
}
bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
io->rbio.bio.bi_iter.bi_size = sectors << 9;
bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0);
io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
io->rbio.bio.bi_end_io = read_moving_endio;
io->rbio.bio.bi_end_io = move_read_endio;
if (bio_alloc_pages(&io->rbio.bio, GFP_KERNEL)) {
kfree(io);
return -ENOMEM;
}
migrate_bio_init(io, &io->write.op.wbio.bio, k.k->size);
bch2_migrate_write_init(c, &io->write, devs, k, move_ptr, 0);
trace_move_read(&io->write.key.k);
__bch2_write_op_init(&io->write.op, c);
io->write.btree_insert_flags = btree_insert_flags;
io->write.move_dev = move_device;
io->write.op.devs = devs;
io->write.op.write_point = wp;
ctxt->keys_moved++;
ctxt->sectors_moved += k.k->size;
if (ctxt->rate)
bch2_ratelimit_increment(ctxt->rate, k.k->size);
atomic_add(k.k->size, &ctxt->sectors_in_flight);
trace_move_extent(k.k);
atomic_add(io->sectors, &ctxt->sectors_in_flight);
list_add_tail(&io->list, &ctxt->reads);
/*
* dropped by read_moving_endio() - guards against use after free of
* dropped by move_read_endio() - guards against use after free of
* ctxt when doing wakeup
*/
closure_get(&io->ctxt->cl);
bch2_read_extent(c, &io->rbio, k, &pick, 0);
closure_get(&ctxt->cl);
bch2_read_extent(c, &io->rbio, bkey_s_c_to_extent(k),
&pick, BCH_READ_NODECODE);
return 0;
err:
trace_move_alloc_fail(k.k);
return -ENOMEM;
}
static void do_pending_writes(struct moving_context *ctxt)
@ -309,14 +317,7 @@ static void do_pending_writes(struct moving_context *ctxt)
while ((io = next_pending_write(ctxt))) {
list_del(&io->list);
if (io->rbio.bio.bi_error) {
moving_io_free(io);
continue;
}
trace_move_write(&io->write.key.k);
closure_call(&io->cl, write_moving, NULL, &ctxt->cl);
closure_call(&io->cl, move_write, NULL, &ctxt->cl);
}
}
@ -330,18 +331,7 @@ do { \
next_pending_write(_ctxt) || (_cond)); \
} while (1)
int bch2_move_ctxt_wait(struct moving_context *ctxt)
{
move_ctxt_wait_event(ctxt,
atomic_read(&ctxt->sectors_in_flight) <
ctxt->max_sectors_in_flight);
return ctxt->rate
? bch2_ratelimit_wait_freezable_stoppable(ctxt->rate)
: 0;
}
void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
{
unsigned sectors_pending = atomic_read(&ctxt->sectors_in_flight);
@ -350,7 +340,7 @@ void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
atomic_read(&ctxt->sectors_in_flight) != sectors_pending);
}
void bch2_move_ctxt_exit(struct moving_context *ctxt)
static void bch2_move_ctxt_exit(struct moving_context *ctxt)
{
move_ctxt_wait_event(ctxt, !atomic_read(&ctxt->sectors_in_flight));
closure_sync(&ctxt->cl);
@ -359,16 +349,92 @@ void bch2_move_ctxt_exit(struct moving_context *ctxt)
EBUG_ON(atomic_read(&ctxt->sectors_in_flight));
}
void bch2_move_ctxt_init(struct moving_context *ctxt,
struct bch_ratelimit *rate,
unsigned max_sectors_in_flight)
static void bch2_move_ctxt_init(struct moving_context *ctxt)
{
memset(ctxt, 0, sizeof(*ctxt));
closure_init_stack(&ctxt->cl);
ctxt->rate = rate;
ctxt->max_sectors_in_flight = max_sectors_in_flight;
INIT_LIST_HEAD(&ctxt->reads);
init_waitqueue_head(&ctxt->wait);
}
int bch2_move_data(struct bch_fs *c,
struct bch_ratelimit *rate,
unsigned sectors_in_flight,
struct bch_devs_mask *devs,
struct write_point_specifier wp,
int btree_insert_flags,
int move_device,
move_pred_fn pred, void *arg,
u64 *keys_moved,
u64 *sectors_moved)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct moving_context ctxt;
struct btree_iter iter;
BKEY_PADDED(k) tmp;
struct bkey_s_c k;
int ret = 0;
bch2_move_ctxt_init(&ctxt);
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
BTREE_ITER_PREFETCH);
if (rate)
bch2_ratelimit_reset(rate);
while (!kthread || !(ret = kthread_should_stop())) {
if (atomic_read(&ctxt.sectors_in_flight) >= sectors_in_flight) {
bch2_btree_iter_unlock(&iter);
move_ctxt_wait_event(&ctxt,
atomic_read(&ctxt.sectors_in_flight) <
sectors_in_flight);
}
if (rate &&
bch2_ratelimit_delay(rate) &&
(bch2_btree_iter_unlock(&iter),
(ret = bch2_ratelimit_wait_freezable_stoppable(rate))))
break;
k = bch2_btree_iter_peek(&iter);
if (!k.k)
break;
ret = btree_iter_err(k);
if (ret)
break;
if (!bkey_extent_is_data(k.k) ||
!pred(arg, bkey_s_c_to_extent(k)))
goto next;
/* unlock before doing IO: */
bkey_reassemble(&tmp.k, k);
k = bkey_i_to_s_c(&tmp.k);
bch2_btree_iter_unlock(&iter);
if (bch2_move_extent(c, &ctxt, devs, wp,
btree_insert_flags,
move_device, k)) {
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(&ctxt);
continue;
}
if (rate)
bch2_ratelimit_increment(rate, k.k->size);
next:
bch2_btree_iter_advance_pos(&iter);
bch2_btree_iter_cond_resched(&iter);
}
bch2_btree_iter_unlock(&iter);
bch2_move_ctxt_exit(&ctxt);
trace_move_data(c, ctxt.sectors_moved, ctxt.keys_moved);
*keys_moved = ctxt.keys_moved;
*sectors_moved = ctxt.sectors_moved;
return ret;
}

View File

@ -4,77 +4,31 @@
#include "buckets.h"
#include "io_types.h"
enum moving_flag_bitnos {
MOVING_FLAG_BITNO_READ = 0,
MOVING_FLAG_BITNO_WRITE,
};
#define MOVING_FLAG_READ (1U << MOVING_FLAG_BITNO_READ)
#define MOVING_FLAG_WRITE (1U << MOVING_FLAG_BITNO_WRITE)
struct bch_read_bio;
struct moving_context;
struct migrate_write {
BKEY_PADDED(key);
bool promote;
bool move;
struct bch_extent_ptr move_ptr;
struct moving_context *ctxt;
/* what we read: */
struct bch_extent_ptr ptr;
u64 offset;
int move_dev;
int btree_insert_flags;
struct bch_write_op op;
};
void bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
struct bch_devs_mask *, struct bkey_s_c,
const struct bch_extent_ptr *, unsigned);
void bch2_migrate_write_init(struct migrate_write *, struct bch_read_bio *);
#define SECTORS_IN_FLIGHT_PER_DEVICE 2048
struct moving_context {
/* Closure for waiting on all reads and writes to complete */
struct closure cl;
typedef bool (*move_pred_fn)(void *, struct bkey_s_c_extent);
/* Number and types of errors reported */
atomic_t error_count;
atomic_t error_flags;
/* Key and sector moves issued, updated from submission context */
u64 keys_moved;
u64 sectors_moved;
/* Rate-limiter counting submitted reads */
struct bch_ratelimit *rate;
/* Try to avoid reading the following device */
struct bch_devs_mask avoid;
struct list_head reads;
/* Configuration */
unsigned max_sectors_in_flight;
atomic_t sectors_in_flight;
wait_queue_head_t wait;
};
struct moving_io {
struct list_head list;
struct rb_node node;
struct closure cl;
struct moving_context *ctxt;
struct migrate_write write;
bool read_completed;
struct bch_read_bio rbio;
/* Must be last since it is variable size */
struct bio_vec bi_inline_vecs[0];
};
int bch2_data_move(struct bch_fs *, struct moving_context *,
struct bch_devs_mask *, struct bkey_s_c,
const struct bch_extent_ptr *);
int bch2_move_ctxt_wait(struct moving_context *);
void bch2_move_ctxt_wait_for_io(struct moving_context *);
void bch2_move_ctxt_exit(struct moving_context *);
void bch2_move_ctxt_init(struct moving_context *, struct bch_ratelimit *,
unsigned);
int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
unsigned, struct bch_devs_mask *,
struct write_point_specifier,
int, int, move_pred_fn, void *,
u64 *, u64 *);
#endif /* _BCACHEFS_MOVE_H */

View File

@ -6,6 +6,7 @@
#include "bcachefs.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "buckets.h"
#include "clock.h"
#include "extents.h"
@ -23,137 +24,63 @@
#include <linux/sort.h>
#include <linux/wait.h>
/* Moving GC - IO loop */
/*
* We can't use the entire copygc reserve in one iteration of copygc: we may
* need the buckets we're freeing up to go back into the copygc reserve to make
* forward progress, but if the copygc reserve is full they'll be available for
* any allocation - and it's possible that in a given iteration, we free up most
* of the buckets we're going to free before we allocate most of the buckets
* we're going to allocate.
*
* If we only use half of the reserve per iteration, then in steady state we'll
* always have room in the reserve for the buckets we're going to need in the
* next iteration:
*/
#define COPYGC_BUCKETS_PER_ITER(ca) \
((ca)->free[RESERVE_MOVINGGC].size / 2)
static int bucket_idx_cmp(const void *_l, const void *_r, size_t size)
/*
* Max sectors to move per iteration: Have to take into account internal
* fragmentation from the multiple write points for each generation:
*/
#define COPYGC_SECTORS_PER_ITER(ca) \
((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca))
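To put rough numbers on this (purely illustrative, not taken from this commit): with a bucket size of 1024 sectors (512 KiB) and a RESERVE_MOVINGGC freelist sized at 128 buckets, COPYGC_BUCKETS_PER_ITER comes to 128 / 2 = 64 buckets and COPYGC_SECTORS_PER_ITER to 64 * 1024 = 65536 sectors, i.e. at most about 32 MiB is moved per copygc iteration, leaving the other half of the reserve as headroom for the next iteration.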
static inline int sectors_used_cmp(copygc_heap *heap,
struct copygc_heap_entry l,
struct copygc_heap_entry r)
{
const struct bucket_heap_entry *l = _l;
const struct bucket_heap_entry *r = _r;
if (l->bucket < r->bucket)
return -1;
if (l->bucket > r->bucket)
return 1;
return 0;
return bucket_sectors_used(l.mark) - bucket_sectors_used(r.mark);
}
static const struct bch_extent_ptr *moving_pred(struct bch_dev *ca,
struct bkey_s_c k)
static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
{
bucket_heap *h = &ca->copygc_heap;
const struct bch_extent_ptr *ptr;
const struct copygc_heap_entry *l = _l;
const struct copygc_heap_entry *r = _r;
if (bkey_extent_is_data(k.k) &&
(ptr = bch2_extent_has_device(bkey_s_c_to_extent(k),
ca->dev_idx))) {
struct bucket_heap_entry search = {
.bucket = PTR_BUCKET_NR(ca, ptr)
};
return (l->offset > r->offset) - (l->offset < r->offset);
}
size_t i = eytzinger0_find(h->data, h->used,
static bool copygc_pred(void *arg, struct bkey_s_c_extent e)
{
struct bch_dev *ca = arg;
copygc_heap *h = &ca->copygc_heap;
const struct bch_extent_ptr *ptr =
bch2_extent_has_device(e, ca->dev_idx);
if (ptr) {
struct copygc_heap_entry search = { .offset = ptr->offset };
size_t i = eytzinger0_find_le(h->data, h->used,
sizeof(h->data[0]),
bucket_idx_cmp, &search);
bucket_offset_cmp, &search);
if (i < h->used)
return ptr;
return (i >= 0 &&
ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
ptr->gen == h->data[i].mark.gen);
}
return NULL;
}
static int issue_moving_gc_move(struct bch_dev *ca,
struct moving_context *ctxt,
struct bkey_s_c k)
{
struct bch_fs *c = ca->fs;
const struct bch_extent_ptr *ptr;
int ret;
ptr = moving_pred(ca, k);
if (!ptr) /* We raced - bucket's been reused */
return 0;
ret = bch2_data_move(c, ctxt, &ca->self, k, ptr);
if (!ret)
trace_gc_copy(k.k);
else
trace_moving_gc_alloc_fail(c, k.k->size);
return ret;
}
static void read_moving(struct bch_dev *ca, size_t buckets_to_move,
u64 sectors_to_move)
{
struct bch_fs *c = ca->fs;
bucket_heap *h = &ca->copygc_heap;
struct moving_context ctxt;
struct btree_iter iter;
struct bkey_s_c k;
u64 sectors_not_moved = 0;
size_t buckets_not_moved = 0;
struct bucket_heap_entry *i;
bch2_ratelimit_reset(&ca->moving_gc_pd.rate);
bch2_move_ctxt_init(&ctxt, &ca->moving_gc_pd.rate,
SECTORS_IN_FLIGHT_PER_DEVICE);
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
BTREE_ITER_PREFETCH);
while (1) {
if (kthread_should_stop())
goto out;
if (bch2_move_ctxt_wait(&ctxt))
goto out;
k = bch2_btree_iter_peek(&iter);
if (!k.k)
break;
if (btree_iter_err(k))
goto out;
if (!moving_pred(ca, k))
goto next;
if (issue_moving_gc_move(ca, &ctxt, k)) {
bch2_btree_iter_unlock(&iter);
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(&ctxt);
continue;
}
next:
bch2_btree_iter_advance_pos(&iter);
//bch2_btree_iter_cond_resched(&iter);
/* unlock before calling moving_context_wait() */
bch2_btree_iter_unlock(&iter);
cond_resched();
}
bch2_btree_iter_unlock(&iter);
bch2_move_ctxt_exit(&ctxt);
trace_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved,
buckets_to_move);
/* don't check this if we bailed out early: */
for (i = h->data; i < h->data + h->used; i++) {
struct bucket_mark m = READ_ONCE(ca->buckets[i->bucket].mark);
if (i->mark.gen == m.gen && bucket_sectors_used(m)) {
sectors_not_moved += bucket_sectors_used(m);
buckets_not_moved++;
}
}
if (sectors_not_moved)
bch_warn(c, "copygc finished but %llu/%llu sectors, %zu/%zu buckets not moved",
sectors_not_moved, sectors_to_move,
buckets_not_moved, buckets_to_move);
return;
out:
bch2_btree_iter_unlock(&iter);
bch2_move_ctxt_exit(&ctxt);
trace_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved,
buckets_to_move);
return false;
}
static bool have_copygc_reserve(struct bch_dev *ca)
@ -168,38 +95,17 @@ static bool have_copygc_reserve(struct bch_dev *ca)
return ret;
}
static inline int sectors_used_cmp(bucket_heap *heap,
struct bucket_heap_entry l,
struct bucket_heap_entry r)
static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
{
return bucket_sectors_used(l.mark) - bucket_sectors_used(r.mark);
}
static void bch2_moving_gc(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
copygc_heap *h = &ca->copygc_heap;
struct copygc_heap_entry e, *i;
struct bucket *g;
u64 sectors_to_move = 0;
size_t buckets_to_move, buckets_unused = 0;
struct bucket_heap_entry e, *i;
int reserve_sectors;
u64 keys_moved, sectors_moved;
u64 sectors_to_move = 0, sectors_not_moved = 0;
u64 buckets_to_move, buckets_not_moved = 0;
int ret;
if (!have_copygc_reserve(ca)) {
struct closure cl;
closure_init_stack(&cl);
while (1) {
closure_wait(&c->freelist_wait, &cl);
if (have_copygc_reserve(ca))
break;
closure_sync(&cl);
}
closure_wake_up(&c->freelist_wait);
}
reserve_sectors = COPYGC_SECTORS_PER_ITER(ca);
trace_moving_gc_start(ca);
closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
/*
* Find buckets with lowest sector counts, skipping completely
@ -213,48 +119,73 @@ static void bch2_moving_gc(struct bch_dev *ca)
* them:
*/
down_read(&c->gc_lock);
ca->copygc_heap.used = 0;
h->used = 0;
for_each_bucket(g, ca) {
struct bucket_mark m = READ_ONCE(g->mark);
struct bucket_heap_entry e = { g - ca->buckets, m };
if (bucket_unused(m)) {
buckets_unused++;
continue;
}
struct copygc_heap_entry e;
if (m.owned_by_allocator ||
m.data_type != BUCKET_DATA)
m.data_type != BUCKET_DATA ||
!bucket_sectors_used(m) ||
bucket_sectors_used(m) >= ca->mi.bucket_size)
continue;
if (bucket_sectors_used(m) >= ca->mi.bucket_size)
continue;
heap_add_or_replace(&ca->copygc_heap, e, -sectors_used_cmp);
e = (struct copygc_heap_entry) {
.offset = bucket_to_sector(ca, g - ca->buckets),
.mark = m
};
heap_add_or_replace(h, e, -sectors_used_cmp);
}
up_read(&c->gc_lock);
for (i = ca->copygc_heap.data;
i < ca->copygc_heap.data + ca->copygc_heap.used;
i++)
for (i = h->data; i < h->data + h->used; i++)
sectors_to_move += bucket_sectors_used(i->mark);
while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
BUG_ON(!heap_pop(&ca->copygc_heap, e, -sectors_used_cmp));
BUG_ON(!heap_pop(h, e, -sectors_used_cmp));
sectors_to_move -= bucket_sectors_used(e.mark);
}
buckets_to_move = ca->copygc_heap.used;
buckets_to_move = h->used;
eytzinger0_sort(ca->copygc_heap.data,
ca->copygc_heap.used,
sizeof(ca->copygc_heap.data[0]),
bucket_idx_cmp, NULL);
if (!buckets_to_move)
return;
read_moving(ca, buckets_to_move, sectors_to_move);
eytzinger0_sort(h->data, h->used,
sizeof(h->data[0]),
bucket_offset_cmp, NULL);
ret = bch2_move_data(c, &ca->copygc_pd.rate,
SECTORS_IN_FLIGHT_PER_DEVICE,
&ca->self,
writepoint_ptr(&ca->copygc_write_point),
BTREE_INSERT_USE_RESERVE,
ca->dev_idx,
copygc_pred, ca,
&keys_moved,
&sectors_moved);
for (i = h->data; i < h->data + h->used; i++) {
size_t bucket = sector_to_bucket(ca, i->offset);
struct bucket_mark m = READ_ONCE(ca->buckets[bucket].mark);
if (i->mark.gen == m.gen && bucket_sectors_used(m)) {
sectors_not_moved += bucket_sectors_used(m);
buckets_not_moved++;
}
}
static int bch2_moving_gc_thread(void *arg)
if (sectors_not_moved && !ret)
bch_warn(c, "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved",
sectors_not_moved, sectors_to_move,
buckets_not_moved, buckets_to_move);
trace_copygc(ca,
sectors_moved, sectors_not_moved,
buckets_to_move, buckets_not_moved);
}
static int bch2_copygc_thread(void *arg)
{
struct bch_dev *ca = arg;
struct bch_fs *c = ca->fs;
@ -273,7 +204,7 @@ static int bch2_moving_gc_thread(void *arg)
* don't start copygc until less than half the gc reserve is
* available:
*/
available = dev_buckets_available(ca);
available = dev_buckets_available(c, ca);
want = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) *
c->opts.gc_reserve_percent, 200);
if (available > want) {
@ -283,46 +214,46 @@ static int bch2_moving_gc_thread(void *arg)
continue;
}
bch2_moving_gc(ca);
bch2_copygc(c, ca);
}
return 0;
}
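For a sense of scale (hypothetical numbers, not from this commit): with ca->mi.nbuckets - ca->mi.first_bucket = 100000 and gc_reserve_percent = 8, want = 100000 * 8 / 200 = 4000 buckets, so the copygc thread keeps sleeping while more than 4000 buckets are still available and only starts moving data once fewer than half of the 8% GC reserve (8000 buckets) remains free.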
void bch2_moving_gc_stop(struct bch_dev *ca)
void bch2_copygc_stop(struct bch_dev *ca)
{
ca->moving_gc_pd.rate.rate = UINT_MAX;
bch2_ratelimit_reset(&ca->moving_gc_pd.rate);
ca->copygc_pd.rate.rate = UINT_MAX;
bch2_ratelimit_reset(&ca->copygc_pd.rate);
if (ca->moving_gc_read)
kthread_stop(ca->moving_gc_read);
ca->moving_gc_read = NULL;
if (ca->copygc_thread)
kthread_stop(ca->copygc_thread);
ca->copygc_thread = NULL;
}
int bch2_moving_gc_start(struct bch_dev *ca)
int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca)
{
struct task_struct *t;
BUG_ON(ca->moving_gc_read);
BUG_ON(ca->copygc_thread);
if (ca->fs->opts.nochanges)
if (c->opts.nochanges)
return 0;
if (bch2_fs_init_fault("moving_gc_start"))
if (bch2_fs_init_fault("copygc_start"))
return -ENOMEM;
t = kthread_create(bch2_moving_gc_thread, ca, "bch_copygc_read");
t = kthread_create(bch2_copygc_thread, ca, "bch_copygc");
if (IS_ERR(t))
return PTR_ERR(t);
ca->moving_gc_read = t;
wake_up_process(ca->moving_gc_read);
ca->copygc_thread = t;
wake_up_process(ca->copygc_thread);
return 0;
}
void bch2_dev_moving_gc_init(struct bch_dev *ca)
void bch2_dev_copygc_init(struct bch_dev *ca)
{
bch2_pd_controller_init(&ca->moving_gc_pd);
ca->moving_gc_pd.d_term = 0;
bch2_pd_controller_init(&ca->copygc_pd);
ca->copygc_pd.d_term = 0;
}

View File

@ -1,30 +1,8 @@
#ifndef _BCACHEFS_MOVINGGC_H
#define _BCACHEFS_MOVINGGC_H
/*
* We can't use the entire copygc reserve in one iteration of copygc: we may
* need the buckets we're freeing up to go back into the copygc reserve to make
* forward progress, but if the copygc reserve is full they'll be available for
* any allocation - and it's possible that in a given iteration, we free up most
* of the buckets we're going to free before we allocate most of the buckets
* we're going to allocate.
*
* If we only use half of the reserve per iteration, then in steady state we'll
* always have room in the reserve for the buckets we're going to need in the
* next iteration:
*/
#define COPYGC_BUCKETS_PER_ITER(ca) \
((ca)->free[RESERVE_MOVINGGC].size / 2)
/*
* Max sectors to move per iteration: Have to take into account internal
* fragmentation from the multiple write points for each generation:
*/
#define COPYGC_SECTORS_PER_ITER(ca) \
((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca))
void bch2_moving_gc_stop(struct bch_dev *);
int bch2_moving_gc_start(struct bch_dev *);
void bch2_dev_moving_gc_init(struct bch_dev *);
void bch2_copygc_stop(struct bch_dev *);
int bch2_copygc_start(struct bch_fs *, struct bch_dev *);
void bch2_dev_copygc_init(struct bch_dev *);
#endif /* _BCACHEFS_MOVINGGC_H */

View File

@ -425,6 +425,11 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
if (err)
return err;
if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_NONCE_V1 &&
bch2_sb_get_crypt(sb) &&
BCH_SB_INITIALIZED(sb))
return "Incompatible extent nonces";
sb->version = cpu_to_le64(BCH_SB_VERSION_MAX);
return NULL;

View File

@ -20,6 +20,7 @@
#include "debug.h"
#include "error.h"
#include "fs.h"
#include "fs-io.h"
#include "fsck.h"
#include "inode.h"
#include "io.h"
@ -209,7 +210,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch2_tiering_stop(c);
for_each_member_device(ca, c, i)
bch2_moving_gc_stop(ca);
bch2_copygc_stop(ca);
bch2_gc_thread_stop(c);
@ -258,12 +259,8 @@ void bch2_fs_read_only(struct bch_fs *c)
*/
percpu_ref_kill(&c->writes);
del_timer(&c->foreground_write_wakeup);
cancel_delayed_work(&c->pd_controllers_update);
c->foreground_write_pd.rate.rate = UINT_MAX;
bch2_wake_delayed_writes((unsigned long) c);
/*
* If we're not doing an emergency shutdown, we want to wait on
* outstanding writes to complete so they don't see spurious errors due
@ -348,9 +345,9 @@ const char *bch2_fs_read_write(struct bch_fs *c)
if (bch2_gc_thread_start(c))
goto err;
err = "error starting moving GC thread";
err = "error starting copygc thread";
for_each_rw_member(ca, c, i)
if (bch2_moving_gc_start(ca)) {
if (bch2_copygc_start(c, ca)) {
percpu_ref_put(&ca->io_ref);
goto err;
}
@ -375,6 +372,7 @@ err:
static void bch2_fs_free(struct bch_fs *c)
{
bch2_fs_fsio_exit(c);
bch2_fs_encryption_exit(c);
bch2_fs_btree_cache_exit(c);
bch2_fs_journal_exit(&c->journal);
@ -411,7 +409,6 @@ static void bch2_fs_exit(struct bch_fs *c)
{
unsigned i;
del_timer_sync(&c->foreground_write_wakeup);
cancel_delayed_work_sync(&c->pd_controllers_update);
cancel_work_sync(&c->read_only_work);
@ -535,8 +532,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->tiering_enabled = 1;
c->tiering_percent = 10;
c->foreground_target_percent = 20;
c->journal.write_time = &c->journal_write_time;
c->journal.delay_time = &c->journal_delay_time;
c->journal.blocked_time = &c->journal_blocked_time;
@ -600,7 +595,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_btree_cache_init(c) ||
bch2_fs_encryption_init(c) ||
bch2_fs_compress_init(c) ||
bch2_check_set_has_compressed_data(c, c->opts.compression))
bch2_check_set_has_compressed_data(c, c->opts.compression) ||
bch2_fs_fsio_init(c))
goto err;
c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
@ -1105,8 +1101,10 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
ca->dev_idx = dev_idx;
__set_bit(ca->dev_idx, ca->self.d);
writepoint_init(&ca->copygc_write_point, BCH_DATA_USER);
spin_lock_init(&ca->freelist_lock);
bch2_dev_moving_gc_init(ca);
bch2_dev_copygc_init(ca);
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
@ -1224,10 +1222,7 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
if (bch2_dev_sysfs_online(ca))
pr_warn("error creating sysfs objects");
lg_local_lock(&c->usage_lock);
if (!gc_will_visit(c, gc_phase(GC_PHASE_SB_METADATA)))
bch2_mark_dev_metadata(c, ca);
lg_local_unlock(&c->usage_lock);
bch2_mark_dev_superblock(c, ca, 0);
if (ca->mi.state == BCH_MEMBER_STATE_RW)
bch2_dev_allocator_add(c, ca);
@ -1324,7 +1319,7 @@ static bool bch2_fs_may_start(struct bch_fs *c)
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
{
bch2_moving_gc_stop(ca);
bch2_copygc_stop(ca);
/*
* This stops new data writes (e.g. to existing open data
@ -1347,8 +1342,8 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
if (bch2_dev_allocator_start(ca))
return "error starting allocator thread";
if (bch2_moving_gc_start(ca))
return "error starting moving GC thread";
if (bch2_copygc_start(c, ca))
return "error starting copygc thread";
if (bch2_tiering_start(c))
return "error starting tiering thread";

View File

@ -35,6 +35,30 @@ static inline unsigned dev_mask_nr(struct bch_devs_mask *devs)
return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
}
static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
unsigned dev)
{
unsigned i;
for (i = 0; i < devs.nr; i++)
if (devs.devs[i] == dev)
return true;
return false;
}
static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
unsigned dev)
{
unsigned i;
for (i = 0; i < devs->nr; i++)
if (devs->devs[i] == dev) {
array_remove_item(devs->devs, devs->nr, i);
return;
}
}
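(Purely as an illustration of the calling convention of the two helpers above, and not part of the patch; they operate on the small fixed-size bch_devs_list added in super_types.h further down, a u8 count plus an inline array of device indices. The device numbers here are made up.)

	struct bch_devs_list devs = { .nr = 2, .devs = { 3, 7 } };

	if (bch2_dev_list_has_dev(devs, 3))
		bch2_dev_list_drop_dev(&devs, 3);	/* devs now holds only device 7 */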
static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
struct bch_devs_mask *mask)
{

View File

@ -13,4 +13,33 @@ struct bch_devs_mask {
unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
};
struct bch_devs_list {
u8 nr;
u8 devs[BCH_REPLICAS_MAX];
};
struct bch_member_cpu {
u64 nbuckets; /* device size */
u16 first_bucket; /* index of first bucket used */
u16 bucket_size; /* sectors */
u8 state;
u8 tier;
u8 replacement;
u8 discard;
u8 data_allowed;
u8 valid;
};
struct bch_replicas_cpu_entry {
u8 data_type;
u8 devs[BCH_SB_MEMBERS_MAX / 8];
};
struct bch_replicas_cpu {
struct rcu_head rcu;
unsigned nr;
unsigned entry_size;
struct bch_replicas_cpu_entry entries[];
};
#endif /* _BCACHEFS_SUPER_TYPES_H */
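(bch_replicas_cpu_entry packs one bit per member device into devs[], eight devices per byte, and bch_replicas_cpu's entry_size lets the flexible array be sized for however many devices the filesystem actually has. A hedged sketch of the addressing scheme follows; the helper names are hypothetical and not part of the patch.)

	static inline bool replicas_entry_test_dev(struct bch_replicas_cpu_entry *e,
						   unsigned dev)
	{
		return e->devs[dev >> 3] & (1U << (dev & 7));
	}

	static inline void replicas_entry_set_dev(struct bch_replicas_cpu_entry *e,
						  unsigned dev)
	{
		e->devs[dev >> 3] |= 1U << (dev & 7);
	}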

View File

@ -161,8 +161,11 @@ read_attribute(meta_buckets);
read_attribute(alloc_buckets);
read_attribute(has_data);
read_attribute(alloc_debug);
write_attribute(wake_allocator);
read_attribute(read_realloc_races);
read_attribute(extent_migrate_done);
read_attribute(extent_migrate_raced);
rw_attribute(journal_write_delay_ms);
rw_attribute(journal_reclaim_delay_ms);
@ -170,7 +173,6 @@ rw_attribute(journal_reclaim_delay_ms);
rw_attribute(discard);
rw_attribute(cache_replacement_policy);
rw_attribute(foreground_write_ratelimit_enabled);
rw_attribute(copy_gc_enabled);
sysfs_pd_controller_attribute(copy_gc);
@ -179,12 +181,9 @@ rw_attribute(tiering_enabled);
rw_attribute(tiering_percent);
sysfs_pd_controller_attribute(tiering);
sysfs_pd_controller_attribute(foreground_write);
rw_attribute(pd_controllers_update_seconds);
rw_attribute(foreground_target_percent);
read_attribute(meta_replicas_have);
read_attribute(data_replicas_have);
@ -272,18 +271,18 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
if (k.k->type == BCH_EXTENT) {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const struct bch_extent_ptr *ptr;
const union bch_extent_crc *crc;
struct bch_extent_crc_unpacked crc;
extent_for_each_ptr_crc(e, ptr, crc) {
if (crc_compression_type(crc) == BCH_COMPRESSION_NONE) {
if (crc.compression_type == BCH_COMPRESSION_NONE) {
nr_uncompressed_extents++;
uncompressed_sectors += e.k->size;
} else {
nr_compressed_extents++;
compressed_sectors_compressed +=
crc_compressed_size(e.k, crc);
crc.compressed_size;
compressed_sectors_uncompressed +=
crc_uncompressed_size(e.k, crc);
crc.uncompressed_size;
}
/* only looking at the first ptr */
@ -323,17 +322,17 @@ SHOW(bch2_fs)
sysfs_print(read_realloc_races,
atomic_long_read(&c->read_realloc_races));
sysfs_print(extent_migrate_done,
atomic_long_read(&c->extent_migrate_done));
sysfs_print(extent_migrate_raced,
atomic_long_read(&c->extent_migrate_raced));
sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
sysfs_printf(foreground_write_ratelimit_enabled, "%i",
c->foreground_write_ratelimit_enabled);
sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
sysfs_pd_controller_show(foreground_write, &c->foreground_write_pd);
sysfs_print(pd_controllers_update_seconds,
c->pd_controllers_update_seconds);
sysfs_print(foreground_target_percent, c->foreground_target_percent);
sysfs_printf(tiering_enabled, "%i", c->tiering_enabled);
sysfs_print(tiering_percent, c->tiering_percent);
@ -371,9 +370,6 @@ STORE(__bch2_fs)
sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms);
sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
sysfs_strtoul(foreground_write_ratelimit_enabled,
c->foreground_write_ratelimit_enabled);
if (attr == &sysfs_btree_gc_periodic) {
ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic)
?: (ssize_t) size;
@ -389,8 +385,8 @@ STORE(__bch2_fs)
?: (ssize_t) size;
for_each_member_device(ca, c, i)
if (ca->moving_gc_read)
wake_up_process(ca->moving_gc_read);
if (ca->copygc_thread)
wake_up_process(ca->copygc_thread);
return ret;
}
@ -402,11 +398,8 @@ STORE(__bch2_fs)
return ret;
}
sysfs_pd_controller_store(foreground_write, &c->foreground_write_pd);
sysfs_strtoul(pd_controllers_update_seconds,
c->pd_controllers_update_seconds);
sysfs_strtoul(foreground_target_percent, c->foreground_target_percent);
sysfs_strtoul(tiering_percent, c->tiering_percent);
sysfs_pd_controller_store(tiering, &c->tiers[1].pd); /* XXX */
@ -466,7 +459,6 @@ struct attribute *bch2_fs_files[] = {
&sysfs_journal_write_delay_ms,
&sysfs_journal_reclaim_delay_ms,
&sysfs_foreground_target_percent,
&sysfs_tiering_percent,
&sysfs_compression_stats,
@ -494,17 +486,17 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_journal_pins,
&sysfs_read_realloc_races,
&sysfs_extent_migrate_done,
&sysfs_extent_migrate_raced,
&sysfs_trigger_journal_flush,
&sysfs_trigger_btree_coalesce,
&sysfs_trigger_gc,
&sysfs_prune_cache,
&sysfs_foreground_write_ratelimit_enabled,
&sysfs_copy_gc_enabled,
&sysfs_tiering_enabled,
sysfs_pd_controller_files(tiering),
sysfs_pd_controller_files(foreground_write),
&sysfs_internal_uuid,
#define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
@ -710,17 +702,23 @@ static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf)
static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
{
struct bch_fs *c = ca->fs;
struct bch_dev_usage stats = bch2_dev_usage_read(ca);
struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
return scnprintf(buf, PAGE_SIZE,
"free_inc: %zu/%zu\n"
"free[RESERVE_BTREE]: %zu/%zu\n"
"free[RESERVE_MOVINGGC]: %zu/%zu\n"
"free[RESERVE_NONE]: %zu/%zu\n"
"alloc: %llu/%llu\n"
"meta: %llu/%llu\n"
"dirty: %llu/%llu\n"
"available: %llu/%llu\n"
"buckets:\n"
" capacity: %llu\n"
" alloc: %llu\n"
" meta: %llu\n"
" dirty: %llu\n"
" available: %llu\n"
"sectors:\n"
" meta: %llu\n"
" dirty: %llu\n"
" cached: %llu\n"
"freelist_wait: %s\n"
"open buckets: %u/%u (reserved %u)\n"
"open_buckets_wait: %s\n",
@ -728,10 +726,14 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size,
fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
stats.buckets_alloc, ca->mi.nbuckets - ca->mi.first_bucket,
stats.buckets[S_META], ca->mi.nbuckets - ca->mi.first_bucket,
stats.buckets[S_DIRTY], ca->mi.nbuckets - ca->mi.first_bucket,
__dev_buckets_available(ca, stats), ca->mi.nbuckets - ca->mi.first_bucket,
ca->mi.nbuckets - ca->mi.first_bucket,
stats.buckets_alloc,
stats.buckets[S_META],
stats.buckets[S_DIRTY],
__dev_buckets_available(ca, stats),
stats.sectors[S_META],
stats.sectors[S_DIRTY],
stats.sectors_cached,
c->freelist_wait.list.first ? "waiting" : "empty",
c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE,
c->open_buckets_wait.list.first ? "waiting" : "empty");
@ -769,7 +771,7 @@ SHOW(bch2_dev)
{
struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
struct bch_fs *c = ca->fs;
struct bch_dev_usage stats = bch2_dev_usage_read(ca);
struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
char *out = buf, *end = buf + PAGE_SIZE;
sysfs_printf(uuid, "%pU\n", ca->uuid.b);
@ -788,8 +790,8 @@ SHOW(bch2_dev)
sysfs_print(cached_buckets, stats.buckets_cached);
sysfs_print(meta_buckets, stats.buckets[S_META]);
sysfs_print(alloc_buckets, stats.buckets_alloc);
sysfs_print(available_buckets, dev_buckets_available(ca));
sysfs_print(free_buckets, dev_buckets_free(ca));
sysfs_print(available_buckets, __dev_buckets_available(ca, stats));
sysfs_print(free_buckets, __dev_buckets_free(ca, stats));
if (attr == &sysfs_has_data) {
out += bch2_scnprint_flag_list(out, end - out,
@ -799,7 +801,7 @@ SHOW(bch2_dev)
return out - buf;
}
sysfs_pd_controller_show(copy_gc, &ca->moving_gc_pd);
sysfs_pd_controller_show(copy_gc, &ca->copygc_pd);
if (attr == &sysfs_cache_replacement_policy) {
out += bch2_scnprint_string_list(out, end - out,
@ -843,7 +845,7 @@ STORE(bch2_dev)
struct bch_fs *c = ca->fs;
struct bch_member *mi;
sysfs_pd_controller_store(copy_gc, &ca->moving_gc_pd);
sysfs_pd_controller_store(copy_gc, &ca->copygc_pd);
if (attr == &sysfs_discard) {
bool v = strtoul_or_return(buf);
@ -899,6 +901,9 @@ STORE(bch2_dev)
bch2_tiering_start(c);
}
if (attr == &sysfs_wake_allocator)
bch2_wake_allocator(ca);
return size;
}
SYSFS_OPS(bch2_dev);
@ -942,6 +947,7 @@ struct attribute *bch2_dev_files[] = {
/* debug: */
&sysfs_alloc_debug,
&sysfs_wake_allocator,
sysfs_pd_controller_files(copy_gc),
NULL

View File

@ -15,20 +15,10 @@
#include <linux/kthread.h>
#include <trace/events/bcachefs.h>
struct tiering_state {
struct bch_tier *tier;
unsigned sectors;
unsigned stripe_size;
unsigned dev_idx;
struct bch_dev *ca;
};
static bool tiering_pred(struct bch_fs *c,
struct bch_tier *tier,
struct bkey_s_c k)
static bool tiering_pred(void *arg, struct bkey_s_c_extent e)
{
if (bkey_extent_is_data(k.k)) {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
struct bch_tier *tier = arg;
struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]);
const struct bch_extent_ptr *ptr;
unsigned replicas = 0;
@ -44,93 +34,21 @@ static bool tiering_pred(struct bch_fs *c,
return replicas < c->opts.data_replicas;
}
return false;
}
static int issue_tiering_move(struct bch_fs *c,
struct bch_tier *tier,
struct moving_context *ctxt,
struct bkey_s_c k)
{
int ret;
ret = bch2_data_move(c, ctxt, &tier->devs, k, NULL);
if (!ret)
trace_tiering_copy(k.k);
else
trace_tiering_alloc_fail(c, k.k->size);
return ret;
}
/**
* tiering_next_cache - issue a move to write an extent to the next cache
* device in round robin order
*/
static s64 read_tiering(struct bch_fs *c, struct bch_tier *tier)
{
struct moving_context ctxt;
struct btree_iter iter;
struct bkey_s_c k;
unsigned nr_devices = dev_mask_nr(&tier->devs);
int ret;
if (!nr_devices)
return 0;
trace_tiering_start(c);
bch2_move_ctxt_init(&ctxt, &tier->pd.rate,
nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE);
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
BTREE_ITER_PREFETCH);
while (!kthread_should_stop() &&
!bch2_move_ctxt_wait(&ctxt) &&
(k = bch2_btree_iter_peek(&iter)).k &&
!btree_iter_err(k)) {
if (!tiering_pred(c, tier, k))
goto next;
ret = issue_tiering_move(c, tier, &ctxt, k);
if (ret) {
bch2_btree_iter_unlock(&iter);
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(&ctxt);
continue;
}
next:
bch2_btree_iter_advance_pos(&iter);
//bch2_btree_iter_cond_resched(&iter);
/* unlock before calling moving_context_wait() */
bch2_btree_iter_unlock(&iter);
cond_resched();
}
bch2_btree_iter_unlock(&iter);
bch2_move_ctxt_exit(&ctxt);
trace_tiering_end(c, ctxt.sectors_moved, ctxt.keys_moved);
return ctxt.sectors_moved;
}
static int bch2_tiering_thread(void *arg)
{
struct bch_tier *tier = arg;
struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]);
struct io_clock *clock = &c->io_clock[WRITE];
struct bch_dev *ca;
u64 tier_capacity, available_sectors;
u64 tier_capacity, available_sectors, keys_moved, sectors_moved;
unsigned long last;
unsigned i;
unsigned i, nr_devices;
set_freezable();
while (!kthread_should_stop()) {
if (kthread_wait_freezable(c->tiering_enabled &&
dev_mask_nr(&tier->devs)))
(nr_devices = dev_mask_nr(&tier->devs))))
break;
while (1) {
@ -151,7 +69,7 @@ static int bch2_tiering_thread(void *arg)
ca->mi.first_bucket);
available_sectors +=
bucket_to_sector(ca,
dev_buckets_available(ca));
dev_buckets_available(c, ca));
}
rcu_read_unlock();
}
@ -167,7 +85,15 @@ static int bch2_tiering_thread(void *arg)
return 0;
}
read_tiering(c, tier);
bch2_move_data(c, &tier->pd.rate,
SECTORS_IN_FLIGHT_PER_DEVICE * nr_devices,
&tier->devs,
writepoint_ptr(&tier->wp),
0,
-1,
tiering_pred, tier,
&keys_moved,
&sectors_moved);
}
return 0;
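(The rework above drops tier.c's open-coded btree walk, read_tiering() and issue_tiering_move(), in favour of the generic bch2_move_data() call; judging from that call, the generic path now owns the iteration, rate limiting and in-flight accounting, and tiering only supplies a write point plus a predicate. The minimal shape of such a predicate, matching the tiering_pred() signature shown above, is sketched below for illustration only.)

	/* illustrative only: a predicate that accepts every extent */
	static bool move_everything_pred(void *arg, struct bkey_s_c_extent e)
	{
		return true;
	}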

View File

@ -291,13 +291,15 @@ void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done)
int bch2_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *d)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
while (1) {
u64 delay = bch2_ratelimit_delay(d);
if (delay)
set_current_state(TASK_INTERRUPTIBLE);
if (kthread_should_stop())
if (kthread && kthread_should_stop())
return 1;
if (!delay)
@ -434,8 +436,11 @@ size_t bch2_rand_range(size_t max)
{
size_t rand;
if (!max)
return 0;
do {
get_random_bytes(&rand, sizeof(rand));
rand = get_random_long();
rand &= roundup_pow_of_two(max) - 1;
} while (rand >= max);
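(bch2_rand_range() now guards against max == 0 and draws from get_random_long() with rejection sampling: the value is masked down to the next power of two and re-drawn until it falls below max, which avoids the modulo bias of taking the raw value modulo max. A userspace sketch of the same idea, illustrative only, with random() standing in for get_random_long():)

	#include <stdlib.h>
	#include <stddef.h>

	static size_t rand_range_sketch(size_t max)
	{
		size_t mask, r;

		if (!max)
			return 0;

		/* smallest (power of two - 1) mask covering max - 1 */
		for (mask = 1; mask < max; mask <<= 1)
			;
		mask -= 1;

		do {
			r = (size_t) random() & mask;	/* only 31 bits; fine for a sketch */
		} while (r >= max);

		return r;
	}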
@ -642,3 +647,129 @@ void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
return vpmalloc(size, gfp_mask);
}
#if 0
void eytzinger1_test(void)
{
unsigned inorder, eytz, size;
pr_info("1 based eytzinger test:");
for (size = 2;
size < 65536;
size++) {
unsigned extra = eytzinger1_extra(size);
if (!(size % 4096))
pr_info("tree size %u", size);
BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size));
BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size));
BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0);
BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0);
inorder = 1;
eytzinger1_for_each(eytz, size) {
BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz);
BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder);
BUG_ON(eytz != eytzinger1_last(size) &&
eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz);
inorder++;
}
}
}
void eytzinger0_test(void)
{
unsigned inorder, eytz, size;
pr_info("0 based eytzinger test:");
for (size = 1;
size < 65536;
size++) {
unsigned extra = eytzinger0_extra(size);
if (!(size % 4096))
pr_info("tree size %u", size);
BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size));
BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size));
BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1);
BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1);
inorder = 0;
eytzinger0_for_each(eytz, size) {
BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz);
BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder);
BUG_ON(eytz != eytzinger0_last(size) &&
eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz);
inorder++;
}
}
}
static inline int cmp_u16(const void *_l, const void *_r, size_t size)
{
const u16 *l = _l, *r = _r;
return (*l > *r) - (*l < *r);
}
static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search)
{
int i, c1 = -1, c2 = -1;
ssize_t r;
r = eytzinger0_find_le(test_array, nr,
sizeof(test_array[0]),
cmp_u16, &search);
if (r >= 0)
c1 = test_array[r];
for (i = 0; i < nr; i++)
if (test_array[i] <= search && test_array[i] > c2)
c2 = test_array[i];
if (c1 != c2) {
eytzinger0_for_each(i, nr)
pr_info("[%3u] = %12u", i, test_array[i]);
pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i",
search, r, c1, c2);
}
}
void eytzinger0_find_test(void)
{
unsigned i, nr, allocated = 1 << 12;
u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL);
for (nr = 1; nr < allocated; nr++) {
pr_info("testing %u elems", nr);
get_random_bytes(test_array, nr * sizeof(test_array[0]));
eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL);
/* verify array is sorted correctly: */
eytzinger0_for_each(i, nr)
BUG_ON(i != eytzinger0_last(nr) &&
test_array[i] > test_array[eytzinger0_next(i, nr)]);
for (i = 0; i < U16_MAX; i += 1 << 12)
eytzinger0_find_test_val(test_array, nr, i);
for (i = 0; i < nr; i++) {
eytzinger0_find_test_val(test_array, nr, test_array[i] - 1);
eytzinger0_find_test_val(test_array, nr, test_array[i]);
eytzinger0_find_test_val(test_array, nr, test_array[i] + 1);
}
}
kfree(test_array);
}
#endif
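(The tests above are compiled out with #if 0; they exercise the eytzinger layout, where a sorted array is stored in BFS order so a binary search walks i to 2i or 2i + 1 and scans memory front to back rather than jumping around. The index arithmetic being verified, written out as a hedged sketch with illustrative helper names:)

	/* 1-based eytzinger layout: node i's children and parent */
	static inline unsigned eytzinger1_child_sketch(unsigned i, unsigned child)
	{
		return (i << 1) + child;	/* child is 0 (left) or 1 (right) */
	}

	static inline unsigned eytzinger1_parent_sketch(unsigned i)
	{
		return i >> 1;
	}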

View File

@ -789,4 +789,28 @@ void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));
/* just the memmove, doesn't update @_nr */
#define __array_insert_item(_array, _nr, _pos) \
memmove(&(_array)[(_pos) + 1], \
&(_array)[(_pos)], \
sizeof((_array)[0]) * ((_nr) - (_pos)))
#define array_insert_item(_array, _nr, _pos, _new_item) \
do { \
__array_insert_item(_array, _nr, _pos); \
(_nr)++; \
(_array)[(_pos)] = (_new_item); \
} while (0)
#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \
do { \
(_nr) -= (_nr_to_remove); \
memmove(&(_array)[(_pos)], \
&(_array)[(_pos) + (_nr_to_remove)], \
sizeof((_array)[0]) * ((_nr) - (_pos))); \
} while (0)
#define array_remove_item(_array, _nr, _pos) \
array_remove_items(_array, _nr, _pos, 1)
#endif /* _BCACHEFS_UTIL_H */
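(These are plain memmove-based helpers for inserting into and removing from a fixed-capacity array tracked by a separate element count; bch2_dev_list_drop_dev() above is one user. An illustrative use, assuming the macros are in scope via util.h and with made-up values:)

	static void array_item_example(void)
	{
		u8 devs[8] = { 1, 3, 7 };
		unsigned nr = 3;

		array_insert_item(devs, nr, 1, 2);	/* { 1, 2, 3, 7 }, nr == 4 */
		array_remove_item(devs, nr, 2);		/* { 1, 2, 7 },    nr == 3 */
	}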