Update bcachefs sources to 313b24b652 bcachefs: Fix an assertion

Kent Overstreet 2021-01-18 23:38:05 -05:00
parent 4a4a7e01d7
commit 9fce394ca6
30 changed files with 665 additions and 521 deletions

View File

@ -1 +1 @@
7d57e9b703cf8bda52c3894b5a18e74329914823
313b24b652d521c6ba4a965f7033c73575923a91

View File

@ -54,10 +54,10 @@ static void pd_controllers_update(struct work_struct *work)
* reclaimed by copy GC
*/
fragmented += max_t(s64, 0, (bucket_to_sector(ca,
stats.buckets[BCH_DATA_user] +
stats.buckets[BCH_DATA_cached]) -
(stats.sectors[BCH_DATA_user] +
stats.sectors[BCH_DATA_cached])) << 9);
stats.d[BCH_DATA_user].buckets +
stats.d[BCH_DATA_cached].buckets) -
(stats.d[BCH_DATA_user].sectors +
stats.d[BCH_DATA_cached].sectors)) << 9);
}
bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1);
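The hunk above switches the copygc fragmentation estimate from the old flat buckets[]/sectors[] arrays to the new per-data-type d[] entries in struct bch_dev_usage. A minimal standalone sketch of the same arithmetic, using simplified illustrative types rather than the real bcachefs structs:

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the real bcachefs types (illustrative only). */
enum data_type { DATA_user, DATA_cached, DATA_NR };

struct dev_usage {
	struct {
		uint64_t buckets;
		uint64_t sectors;	/* compressed sectors actually live */
	} d[DATA_NR];
};

/* Bytes copygc could reclaim: space covered by user/cached buckets minus
 * the sectors actually live in them, clamped at zero. */
static int64_t fragmented_bytes(const struct dev_usage *u,
				uint64_t bucket_size_sectors)
{
	int64_t bucket_sectors =
		(u->d[DATA_user].buckets + u->d[DATA_cached].buckets) *
		bucket_size_sectors;
	int64_t used_sectors =
		u->d[DATA_user].sectors + u->d[DATA_cached].sectors;
	int64_t frag = bucket_sectors - used_sectors;

	return frag > 0 ? frag << 9 : 0;	/* sectors -> bytes */
}

int main(void)
{
	struct dev_usage u = {
		.d[DATA_user]   = { .buckets = 100, .sectors = 40000 },
		.d[DATA_cached] = { .buckets =  10, .sectors =  1000 },
	};

	printf("fragmented: %lld bytes\n",
	       (long long) fragmented_bytes(&u, 512));
	return 0;
}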
@ -217,7 +217,7 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
return 0;
ca = bch_dev_bkey_exists(c, k.k->p.inode);
g = __bucket(ca, k.k->p.offset, 0);
g = bucket(ca, k.k->p.offset);
u = bch2_alloc_unpack(k);
g->_mark.gen = u.gen;
@ -278,7 +278,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct bkey_s_c k;
struct bch_dev *ca;
struct bucket_array *ba;
struct bucket *g;
struct bucket_mark m;
struct bkey_alloc_unpacked old_u, new_u;
@ -302,9 +301,7 @@ retry:
percpu_down_read(&c->mark_lock);
ca = bch_dev_bkey_exists(c, iter->pos.inode);
ba = bucket_array(ca);
g = &ba->b[iter->pos.offset];
g = bucket(ca, iter->pos.offset);
m = READ_ONCE(g->mark);
new_u = alloc_mem_to_key(g, m);
percpu_up_read(&c->mark_lock);
@ -326,54 +323,36 @@ err:
return ret;
}
int bch2_dev_alloc_write(struct bch_fs *c, struct bch_dev *ca, unsigned flags)
int bch2_alloc_write(struct bch_fs *c, unsigned flags)
{
struct btree_trans trans;
struct btree_iter *iter;
u64 first_bucket, nbuckets;
int ret = 0;
percpu_down_read(&c->mark_lock);
first_bucket = bucket_array(ca)->first_bucket;
nbuckets = bucket_array(ca)->nbuckets;
percpu_up_read(&c->mark_lock);
BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC,
POS(ca->dev_idx, first_bucket),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
while (iter->pos.offset < nbuckets) {
bch2_trans_cond_resched(&trans);
ret = bch2_alloc_write_key(&trans, iter, flags);
if (ret)
break;
bch2_btree_iter_next_slot(iter);
}
bch2_trans_exit(&trans);
return ret;
}
int bch2_alloc_write(struct bch_fs *c, unsigned flags)
{
struct bch_dev *ca;
unsigned i;
int ret = 0;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
for_each_member_device(ca, c, i) {
bch2_dev_alloc_write(c, ca, flags);
if (ret) {
percpu_ref_put(&ca->io_ref);
break;
bch2_btree_iter_set_pos(iter,
POS(ca->dev_idx, ca->mi.first_bucket));
while (iter->pos.offset < ca->mi.nbuckets) {
bch2_trans_cond_resched(&trans);
ret = bch2_alloc_write_key(&trans, iter, flags);
if (ret) {
percpu_ref_put(&ca->io_ref);
goto err;
}
bch2_btree_iter_next_slot(iter);
}
}
err:
bch2_trans_exit(&trans);
return ret;
}
@ -552,7 +531,8 @@ out:
static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
{
unsigned long gc_count = c->gc_count;
u64 available;
s64 available;
unsigned i;
int ret = 0;
ca->allocator_state = ALLOCATOR_BLOCKED;
@ -568,8 +548,15 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
if (gc_count != c->gc_count)
ca->inc_gen_really_needs_gc = 0;
available = max_t(s64, 0, dev_buckets_available(ca) -
ca->inc_gen_really_needs_gc);
available = dev_buckets_available(ca);
available -= ca->inc_gen_really_needs_gc;
spin_lock(&c->freelist_lock);
for (i = 0; i < RESERVE_NR; i++)
available -= fifo_used(&ca->free[i]);
spin_unlock(&c->freelist_lock);
available = max(available, 0LL);
if (available > fifo_free(&ca->free_inc) ||
(available &&
@ -598,6 +585,9 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
if (!is_available_bucket(mark))
return false;
if (mark.owned_by_allocator)
return false;
if (ca->buckets_nouse &&
test_bit(bucket, ca->buckets_nouse))
return false;
@ -894,33 +884,32 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
/* first, put on free_inc and mark as owned by allocator: */
percpu_down_read(&c->mark_lock);
spin_lock(&c->freelist_lock);
verify_not_on_freelist(c, ca, b);
BUG_ON(!fifo_push(&ca->free_inc, b));
g = bucket(ca, b);
m = READ_ONCE(g->mark);
invalidating_cached_data = m.cached_sectors != 0;
BUG_ON(m.dirty_sectors);
bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);
spin_lock(&c->freelist_lock);
verify_not_on_freelist(c, ca, b);
BUG_ON(!fifo_push(&ca->free_inc, b));
spin_unlock(&c->freelist_lock);
/*
* If we're not invalidating cached data, we only increment the bucket
* gen in memory here, the incremented gen will be updated in the btree
* by bch2_trans_mark_pointer():
*/
if (!invalidating_cached_data)
bch2_invalidate_bucket(c, ca, b, &m);
else
bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);
spin_unlock(&c->freelist_lock);
percpu_up_read(&c->mark_lock);
if (!invalidating_cached_data)
if (!m.cached_sectors &&
!bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) {
BUG_ON(m.data_type);
bucket_cmpxchg(g, m, m.gen++);
percpu_up_read(&c->mark_lock);
goto out;
}
percpu_up_read(&c->mark_lock);
/*
* If the read-only path is trying to shut down, we can't be generating

View File

@ -98,7 +98,6 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);
int bch2_dev_alloc_write(struct bch_fs *, struct bch_dev *, unsigned);
int bch2_alloc_write(struct bch_fs *, unsigned);
void bch2_fs_allocator_background_init(struct bch_fs *);

View File

@ -192,8 +192,9 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
rcu_read_lock();
buckets = bucket_array(ca);
for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++)
if (is_available_bucket(buckets->b[b].mark))
for (b = buckets->first_bucket; b < buckets->nbuckets; b++)
if (is_available_bucket(buckets->b[b].mark) &&
!buckets->b[b].mark.owned_by_allocator)
goto success;
b = -1;
success:
@ -224,9 +225,8 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
bool may_alloc_partial,
struct closure *cl)
{
struct bucket_array *buckets;
struct open_bucket *ob;
long bucket = 0;
long b = 0;
spin_lock(&c->freelist_lock);
@ -260,13 +260,13 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
return ERR_PTR(-OPEN_BUCKETS_EMPTY);
}
if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket)))
if (likely(fifo_pop(&ca->free[RESERVE_NONE], b)))
goto out;
switch (reserve) {
case RESERVE_BTREE_MOVINGGC:
case RESERVE_MOVINGGC:
if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket))
if (fifo_pop(&ca->free[RESERVE_MOVINGGC], b))
goto out;
break;
default:
@ -284,20 +284,19 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
trace_bucket_alloc_fail(ca, reserve);
return ERR_PTR(-FREELIST_EMPTY);
out:
verify_not_on_freelist(c, ca, bucket);
verify_not_on_freelist(c, ca, b);
ob = bch2_open_bucket_alloc(c);
spin_lock(&ob->lock);
buckets = bucket_array(ca);
ob->valid = true;
ob->sectors_free = ca->mi.bucket_size;
ob->alloc_reserve = reserve;
ob->ptr = (struct bch_extent_ptr) {
.type = 1 << BCH_EXTENT_ENTRY_ptr,
.gen = buckets->b[bucket].mark.gen,
.offset = bucket_to_sector(ca, bucket),
.gen = bucket(ca, b)->mark.gen,
.offset = bucket_to_sector(ca, b),
.dev = ca->dev_idx,
};
@ -489,16 +488,20 @@ bucket_alloc_from_stripe(struct bch_fs *c,
devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
for (i = 0; i < devs_sorted.nr; i++)
open_bucket_for_each(c, &h->s->blocks, ob, ec_idx)
for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
if (!h->s->blocks[ec_idx])
continue;
ob = c->open_buckets + h->s->blocks[ec_idx];
if (ob->ptr.dev == devs_sorted.devs[i] &&
!test_and_set_bit(h->s->data_block_idx[ec_idx],
h->s->blocks_allocated))
!test_and_set_bit(ec_idx, h->s->blocks_allocated))
goto got_bucket;
}
goto out_put_head;
got_bucket:
ca = bch_dev_bkey_exists(c, ob->ptr.dev);
ob->ec_idx = h->s->data_block_idx[ec_idx];
ob->ec_idx = ec_idx;
ob->ec = h->s;
add_new_bucket(c, ptrs, devs_may_alloc,
@ -636,10 +639,13 @@ void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca,
if (!drop && ob->ec) {
mutex_lock(&ob->ec->lock);
open_bucket_for_each(c, &ob->ec->blocks, ob2, j)
drop |= ob2->ptr.dev == ca->dev_idx;
open_bucket_for_each(c, &ob->ec->parity, ob2, j)
for (j = 0; j < ob->ec->new_stripe.key.v.nr_blocks; j++) {
if (!ob->ec->blocks[j])
continue;
ob2 = c->open_buckets + ob->ec->blocks[j];
drop |= ob2->ptr.dev == ca->dev_idx;
}
mutex_unlock(&ob->ec->lock);
}
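Both EC hunks above reflect the new representation in struct ec_stripe_new: instead of separate open_buckets lists for data and parity, the stripe keeps one flat array of open-bucket indices, where index 0 evidently stands for "no bucket allocated for this slot". A small standalone sketch of walking such an array; names are illustrative, not the real bcachefs API:

#include <stddef.h>
#include <stdio.h>

#define NR_BLOCKS 8

struct open_bucket { int dev; };

/* Global table of open buckets; index 0 is reserved as "none". */
static struct open_bucket open_buckets[64];

/* Per-stripe slot -> open_bucket index, 0 meaning the slot is empty. */
static unsigned stripe_blocks[NR_BLOCKS];

static int stripe_uses_dev(int dev)
{
	for (size_t i = 0; i < NR_BLOCKS; i++) {
		if (!stripe_blocks[i])		/* slot not allocated yet */
			continue;
		if (open_buckets[stripe_blocks[i]].dev == dev)
			return 1;
	}
	return 0;
}

int main(void)
{
	open_buckets[3].dev = 2;
	stripe_blocks[1] = 3;	/* slot 1 -> open bucket 3 on dev 2 */

	printf("uses dev 2: %d\n", stripe_uses_dev(2));
	printf("uses dev 5: %d\n", stripe_uses_dev(5));
	return 0;
}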

View File

@ -170,6 +170,11 @@ static inline struct bpos bpos_min(struct bpos l, struct bpos r)
return bkey_cmp(l, r) < 0 ? l : r;
}
static inline struct bpos bpos_max(struct bpos l, struct bpos r)
{
return bkey_cmp(l, r) > 0 ? l : r;
}
void bch2_bpos_swab(struct bpos *);
void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
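bpos_max() mirrors the existing bpos_min() and is used later in this commit (ec_stripe_bkey_insert) to keep the stripe slot search from starting below POS(0, 1). A tiny usage sketch with a simplified position type and comparator, purely illustrative:

#include <stdio.h>
#include <stdint.h>

/* Simplified two-field key position and comparator (not the real bkey_cmp). */
struct pos { uint64_t inode, offset; };

static int pos_cmp(struct pos l, struct pos r)
{
	if (l.inode != r.inode)
		return l.inode < r.inode ? -1 : 1;
	if (l.offset != r.offset)
		return l.offset < r.offset ? -1 : 1;
	return 0;
}

static struct pos pos_max(struct pos l, struct pos r)
{
	return pos_cmp(l, r) > 0 ? l : r;
}

int main(void)
{
	struct pos min_pos   = { 0, 1 };	/* stripe keys start at offset 1 */
	struct pos hint      = { 0, 0 };	/* an unset hint would otherwise be 0:0 */
	struct pos start_pos = pos_max(min_pos, hint);

	printf("search starts at %llu:%llu\n",
	       (unsigned long long) start_pos.inode,
	       (unsigned long long) start_pos.offset);
	return 0;
}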

View File

@ -205,13 +205,12 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
}
static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
bool initial, bool metadata_only)
bool initial)
{
struct btree_trans trans;
struct btree_iter *iter;
struct btree *b;
unsigned depth = metadata_only ? 1
: bch2_expensive_debug_checks ? 0
unsigned depth = bch2_expensive_debug_checks ? 0
: !btree_node_type_needs_gc(btree_id) ? 1
: 0;
u8 max_stale = 0;
@ -326,13 +325,11 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
static int bch2_gc_btree_init(struct bch_fs *c,
struct journal_keys *journal_keys,
enum btree_id btree_id,
bool metadata_only)
enum btree_id btree_id)
{
struct btree *b;
unsigned target_depth = metadata_only ? 1
: bch2_expensive_debug_checks ? 0
: !btree_node_type_needs_gc(btree_id) ? 1
unsigned target_depth = bch2_expensive_debug_checks ? 0
: !btree_node_type_needs_gc(btree_id) ? 1
: 0;
u8 max_stale = 0;
int ret = 0;
@ -377,7 +374,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
}
static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
bool initial, bool metadata_only)
bool initial)
{
enum btree_id ids[BTREE_ID_NR];
unsigned i;
@ -390,8 +387,8 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
enum btree_id id = ids[i];
int ret = initial
? bch2_gc_btree_init(c, journal_keys,
id, metadata_only)
: bch2_gc_btree(c, id, initial, metadata_only);
id)
: bch2_gc_btree(c, id, initial);
if (ret)
return ret;
}
@ -558,12 +555,11 @@ static void bch2_gc_free(struct bch_fs *c)
}
static int bch2_gc_done(struct bch_fs *c,
bool initial, bool metadata_only)
bool initial)
{
struct bch_dev *ca;
bool verify = !metadata_only &&
(!initial ||
(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)));
bool verify = (!initial ||
(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)));
unsigned i;
int ret = 0;
@ -580,10 +576,9 @@ static int bch2_gc_done(struct bch_fs *c,
if (verify) \
fsck_err(c, "stripe %zu has wrong "_msg \
": got %u, should be %u", \
dst_iter.pos, ##__VA_ARGS__, \
iter.pos, ##__VA_ARGS__, \
dst->_f, src->_f); \
dst->_f = src->_f; \
dst->dirty = true; \
set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
}
#define copy_bucket_field(_f) \
@ -602,29 +597,32 @@ static int bch2_gc_done(struct bch_fs *c,
#define copy_fs_field(_f, _msg, ...) \
copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
if (!metadata_only) {
struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
{
struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0);
struct stripe *dst, *src;
while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) &&
(src = genradix_iter_peek(&src_iter, &c->stripes[1]))) {
BUG_ON(src_iter.pos != dst_iter.pos);
while ((src = genradix_iter_peek(&iter, &c->stripes[1]))) {
dst = genradix_ptr_alloc(&c->stripes[0], iter.pos, GFP_KERNEL);
copy_stripe_field(alive, "alive");
copy_stripe_field(sectors, "sectors");
copy_stripe_field(algorithm, "algorithm");
copy_stripe_field(nr_blocks, "nr_blocks");
copy_stripe_field(nr_redundant, "nr_redundant");
copy_stripe_field(blocks_nonempty,
"blocks_nonempty");
if (dst->alive != src->alive ||
dst->sectors != src->sectors ||
dst->algorithm != src->algorithm ||
dst->nr_blocks != src->nr_blocks ||
dst->nr_redundant != src->nr_redundant) {
bch_err(c, "unexpected stripe inconsistency at bch2_gc_done, confused");
ret = -EINVAL;
goto fsck_err;
}
for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++)
copy_stripe_field(block_sectors[i],
"block_sectors[%u]", i);
genradix_iter_advance(&dst_iter, &c->stripes[0]);
genradix_iter_advance(&src_iter, &c->stripes[1]);
dst->blocks_nonempty = 0;
for (i = 0; i < dst->nr_blocks; i++)
dst->blocks_nonempty += dst->block_sectors[i] != 0;
genradix_iter_advance(&iter, &c->stripes[1]);
}
}
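With the dirty flag gone from struct stripe, gc_done now recomputes blocks_nonempty directly from the per-block sector counts after copying them, instead of treating it as another field to compare. A minimal sketch of that recomputation, using an illustrative struct rather than the real one:

#include <stdio.h>
#include <stdint.h>

#define MAX_BLOCKS 16

struct stripe_counts {
	unsigned nr_blocks;
	uint16_t block_sectors[MAX_BLOCKS];
	uint8_t  blocks_nonempty;
};

/* Derive blocks_nonempty from block_sectors; the counts themselves come
 * from whatever GC (or the in-memory copy) computed. */
static void recompute_nonempty(struct stripe_counts *s)
{
	s->blocks_nonempty = 0;
	for (unsigned i = 0; i < s->nr_blocks; i++)
		s->blocks_nonempty += s->block_sectors[i] != 0;
}

int main(void)
{
	struct stripe_counts s = {
		.nr_blocks     = 4,
		.block_sectors = { 128, 0, 64, 0 },
	};

	recompute_nonempty(&s);
	printf("blocks_nonempty = %u\n", (unsigned) s.blocks_nonempty);	/* 2 */
	return 0;
}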
@ -658,28 +656,20 @@ static int bch2_gc_done(struct bch_fs *c,
copy_fs_field(hidden, "hidden");
copy_fs_field(btree, "btree");
copy_fs_field(data, "data");
copy_fs_field(cached, "cached");
copy_fs_field(reserved, "reserved");
copy_fs_field(nr_inodes,"nr_inodes");
if (!metadata_only) {
copy_fs_field(data, "data");
copy_fs_field(cached, "cached");
copy_fs_field(reserved, "reserved");
copy_fs_field(nr_inodes,"nr_inodes");
for (i = 0; i < BCH_REPLICAS_MAX; i++)
copy_fs_field(persistent_reserved[i],
"persistent_reserved[%i]", i);
}
for (i = 0; i < BCH_REPLICAS_MAX; i++)
copy_fs_field(persistent_reserved[i],
"persistent_reserved[%i]", i);
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
char buf[80];
if (metadata_only &&
(e->data_type == BCH_DATA_user ||
e->data_type == BCH_DATA_cached))
continue;
bch2_replicas_entry_to_text(&PBUF(buf), e);
copy_fs_field(replicas[i], "%s", buf);
@ -695,8 +685,7 @@ fsck_err:
return ret;
}
static int bch2_gc_start(struct bch_fs *c,
bool metadata_only)
static int bch2_gc_start(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
@ -760,13 +749,6 @@ static int bch2_gc_start(struct bch_fs *c,
d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen;
d->gen_valid = s->gen_valid;
if (metadata_only &&
(s->mark.data_type == BCH_DATA_user ||
s->mark.data_type == BCH_DATA_cached)) {
d->_mark = s->mark;
d->_mark.owned_by_allocator = 0;
}
}
};
@ -794,7 +776,7 @@ static int bch2_gc_start(struct bch_fs *c,
* uses, GC could skip past them
*/
int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys,
bool initial, bool metadata_only)
bool initial)
{
struct bch_dev *ca;
u64 start_time = local_clock();
@ -810,13 +792,13 @@ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys,
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
again:
ret = bch2_gc_start(c, metadata_only);
ret = bch2_gc_start(c);
if (ret)
goto out;
bch2_mark_superblocks(c);
ret = bch2_gc_btrees(c, journal_keys, initial, metadata_only);
ret = bch2_gc_btrees(c, journal_keys, initial);
if (ret)
goto out;
@ -855,7 +837,7 @@ out:
bch2_journal_block(&c->journal);
percpu_down_write(&c->mark_lock);
ret = bch2_gc_done(c, initial, metadata_only);
ret = bch2_gc_done(c, initial);
bch2_journal_unblock(&c->journal);
} else {

View File

@ -7,7 +7,7 @@
void bch2_coalesce(struct bch_fs *);
struct journal_keys;
int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool);
int bch2_gc(struct bch_fs *, struct journal_keys *, bool);
int bch2_gc_gens(struct bch_fs *);
void bch2_gc_thread_stop(struct bch_fs *);
int bch2_gc_thread_start(struct bch_fs *);

View File

@ -1828,23 +1828,6 @@ void bch2_btree_flush_all_writes(struct bch_fs *c)
__bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
}
void bch2_btree_verify_flushed(struct bch_fs *c)
{
struct bucket_table *tbl;
struct rhash_head *pos;
struct btree *b;
unsigned i;
rcu_read_lock();
for_each_cached_btree(b, c, tbl, i, pos) {
unsigned long flags = READ_ONCE(b->flags);
BUG_ON((flags & (1 << BTREE_NODE_dirty)) ||
(flags & (1 << BTREE_NODE_write_in_flight)));
}
rcu_read_unlock();
}
void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c)
{
struct bucket_table *tbl;

View File

@ -185,7 +185,6 @@ do { \
void bch2_btree_flush_all_reads(struct bch_fs *);
void bch2_btree_flush_all_writes(struct bch_fs *);
void bch2_btree_verify_flushed(struct bch_fs *);
void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *);
static inline void compat_bformat(unsigned level, enum btree_id btree_id,

View File

@ -836,7 +836,7 @@ int __bch2_trans_commit(struct btree_trans *trans)
int ret = 0;
if (!trans->nr_updates)
goto out_noupdates;
goto out_reset;
if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
lockdep_assert_held(&trans->c->gc_lock);
@ -850,7 +850,7 @@ int __bch2_trans_commit(struct btree_trans *trans)
unlikely(!percpu_ref_tryget(&trans->c->writes))) {
ret = bch2_trans_commit_get_rw_cold(trans);
if (ret)
return ret;
goto out_reset;
}
#ifdef CONFIG_BCACHEFS_DEBUG
@ -962,7 +962,7 @@ out:
if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
percpu_ref_put(&trans->c->writes);
out_noupdates:
out_reset:
bch2_trans_reset(trans, !ret ? TRANS_RESET_NOTRAVERSE : 0);
return ret;
@ -981,10 +981,22 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
.trigger_flags = flags, .iter = iter, .k = k
};
EBUG_ON(bkey_cmp(iter->pos,
(iter->flags & BTREE_ITER_IS_EXTENTS)
? bkey_start_pos(&k->k)
: k->k.p));
#ifdef CONFIG_BCACHEFS_DEBUG
BUG_ON(bkey_cmp(iter->pos,
(iter->flags & BTREE_ITER_IS_EXTENTS)
? bkey_start_pos(&k->k)
: k->k.p));
trans_for_each_update(trans, i) {
BUG_ON(bkey_cmp(i->iter->pos,
(i->iter->flags & BTREE_ITER_IS_EXTENTS)
? bkey_start_pos(&i->k->k)
: i->k->k.p));
BUG_ON(i != trans->updates &&
btree_iter_pos_cmp(i[-1].iter, i[0].iter) >= 0);
}
#endif
iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
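The new CONFIG_BCACHEFS_DEBUG block asserts two invariants on the transaction's update list: each update's iterator points at the key being inserted, and the list stays ordered by iterator position. A standalone sketch of checking the ordering invariant over an array; names are illustrative only:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct update { uint64_t pos; };

/* Returns 1 if updates[] is strictly ordered by position, mirroring the
 * "i[-1] compares below i[0]" style of check in the hunk above. */
static int updates_sorted(const struct update *updates, unsigned nr)
{
	for (unsigned i = 1; i < nr; i++)
		if (updates[i - 1].pos >= updates[i].pos)
			return 0;
	return 1;
}

int main(void)
{
	struct update u[] = { { 1 }, { 4 }, { 9 } };

	assert(updates_sorted(u, 3));
	printf("sorted\n");
	return 0;
}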

View File

@ -376,15 +376,12 @@ static inline int is_unavailable_bucket(struct bucket_mark m)
return !is_available_bucket(m);
}
static inline int is_fragmented_bucket(struct bucket_mark m,
struct bch_dev *ca)
static inline int bucket_sectors_fragmented(struct bch_dev *ca,
struct bucket_mark m)
{
if (!m.owned_by_allocator &&
m.data_type == BCH_DATA_user &&
bucket_sectors_used(m))
return max_t(int, 0, (int) ca->mi.bucket_size -
bucket_sectors_used(m));
return 0;
return bucket_sectors_used(m)
? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m))
: 0;
}
static inline int is_stripe_data_bucket(struct bucket_mark m)
@ -392,11 +389,6 @@ static inline int is_stripe_data_bucket(struct bucket_mark m)
return m.stripe && m.data_type != BCH_DATA_parity;
}
static inline int bucket_stripe_sectors(struct bucket_mark m)
{
return is_stripe_data_bucket(m) ? m.dirty_sectors : 0;
}
static inline enum bch_data_type bucket_type(struct bucket_mark m)
{
return m.cached_sectors && !m.dirty_sectors
@ -456,7 +448,7 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage,
if (type == BCH_DATA_sb || type == BCH_DATA_journal)
fs_usage->hidden += size;
dev_usage->buckets[type] += nr;
dev_usage->d[type].buckets += nr;
}
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
@ -481,19 +473,18 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
u->buckets_alloc +=
(int) new.owned_by_allocator - (int) old.owned_by_allocator;
u->buckets_ec += (int) new.stripe - (int) old.stripe;
u->buckets_unavailable +=
is_unavailable_bucket(new) - is_unavailable_bucket(old);
u->buckets_ec += (int) new.stripe - (int) old.stripe;
u->sectors_ec += bucket_stripe_sectors(new) -
bucket_stripe_sectors(old);
u->sectors[old.data_type] -= old.dirty_sectors;
u->sectors[new.data_type] += new.dirty_sectors;
u->sectors[BCH_DATA_cached] +=
u->d[old.data_type].sectors -= old.dirty_sectors;
u->d[new.data_type].sectors += new.dirty_sectors;
u->d[BCH_DATA_cached].sectors +=
(int) new.cached_sectors - (int) old.cached_sectors;
u->sectors_fragmented +=
is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca);
u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old);
u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);
preempt_enable();
if (!is_available_bucket(old) && is_available_bucket(new))
@ -650,46 +641,6 @@ unwind:
ret; \
})
static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark *ret,
bool gc)
{
struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc);
struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark old, new;
old = bucket_cmpxchg(g, new, ({
BUG_ON(!is_available_bucket(new));
new.owned_by_allocator = true;
new.data_type = 0;
new.cached_sectors = 0;
new.dirty_sectors = 0;
new.gen++;
}));
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
if (old.cached_sectors)
update_cached_sectors(c, fs_usage, ca->dev_idx,
-((s64) old.cached_sectors));
if (!gc)
*ret = old;
return 0;
}
void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark *old)
{
do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0,
ca, b, old);
if (!old->owned_by_allocator && old->cached_sectors)
trace_invalidate(ca, bucket_to_sector(ca, b),
old->cached_sectors);
}
static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator,
bool gc)
@ -1269,9 +1220,15 @@ static int bch2_mark_stripe(struct bch_fs *c,
m->blocks_nonempty = 0;
for (i = 0; i < new_s->nr_blocks; i++) {
m->block_sectors[i] =
stripe_blockcount_get(new_s, i);
m->blocks_nonempty += !!m->block_sectors[i];
unsigned s = stripe_blockcount_get(new_s, i);
/*
* gc recalculates this field from stripe ptr
* references:
*/
if (!gc)
m->block_sectors[i] = s;
m->blocks_nonempty += !!s;
}
if (gc && old_s)
@ -2100,6 +2057,168 @@ int bch2_trans_mark_update(struct btree_trans *trans,
return ret;
}
static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
struct bch_dev *ca, size_t b,
enum bch_data_type type,
unsigned sectors)
{
struct bch_fs *c = trans->c;
struct btree_iter *iter;
struct bkey_alloc_unpacked u;
struct bkey_i_alloc *a;
struct bch_extent_ptr ptr = {
.dev = ca->dev_idx,
.offset = bucket_to_sector(ca, b),
};
int ret = 0;
a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
ret = PTR_ERR_OR_ZERO(a);
if (ret)
return ret;
ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
if (ret)
return ret;
if (u.data_type && u.data_type != type) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
iter->pos.inode, iter->pos.offset, u.gen,
bch2_data_types[u.data_type],
bch2_data_types[type],
bch2_data_types[type]);
ret = -EIO;
goto out;
}
if ((unsigned) (u.dirty_sectors + sectors) > ca->mi.bucket_size) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %llu:%llu gen %u data type %s sector count overflow: %u + %u > %u\n"
"while marking %s",
iter->pos.inode, iter->pos.offset, u.gen,
bch2_data_types[u.data_type ?: type],
u.dirty_sectors, sectors, ca->mi.bucket_size,
bch2_data_types[type]);
ret = -EIO;
goto out;
}
if (u.data_type == type &&
u.dirty_sectors == sectors)
goto out;
u.data_type = type;
u.dirty_sectors = sectors;
bkey_alloc_init(&a->k_i);
a->k.p = iter->pos;
bch2_alloc_pack(a, u);
bch2_trans_update(trans, iter, &a->k_i, 0);
out:
bch2_trans_iter_put(trans, iter);
return ret;
}
int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
struct disk_reservation *res,
struct bch_dev *ca, size_t b,
enum bch_data_type type,
unsigned sectors)
{
return __bch2_trans_do(trans, res, NULL, 0,
__bch2_trans_mark_metadata_bucket(trans, ca, b, BCH_DATA_journal,
ca->mi.bucket_size));
}
static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
struct disk_reservation *res,
struct bch_dev *ca,
u64 start, u64 end,
enum bch_data_type type,
u64 *bucket, unsigned *bucket_sectors)
{
int ret;
do {
u64 b = sector_to_bucket(ca, start);
unsigned sectors =
min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
if (b != *bucket) {
if (*bucket_sectors) {
ret = bch2_trans_mark_metadata_bucket(trans, res, ca,
*bucket, type, *bucket_sectors);
if (ret)
return ret;
}
*bucket = b;
*bucket_sectors = 0;
}
*bucket_sectors += sectors;
start += sectors;
} while (!ret && start < end);
return 0;
}
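bch2_trans_mark_metadata_sectors() walks an arbitrary sector range and accumulates it bucket by bucket, flushing an update whenever the range crosses a bucket boundary. The splitting arithmetic on its own, as a standalone sketch with a fixed bucket size and illustrative helper names:

#include <stdio.h>
#include <stdint.h>

#define BUCKET_SECTORS 128ULL	/* assumed bucket size for the sketch */

static uint64_t sector_to_bucket(uint64_t s) { return s / BUCKET_SECTORS; }
static uint64_t bucket_to_sector(uint64_t b) { return b * BUCKET_SECTORS; }

/* Print how many sectors of [start, end) land in each bucket. */
static void split_range(uint64_t start, uint64_t end)
{
	while (start < end) {
		uint64_t b = sector_to_bucket(start);
		uint64_t bucket_end = bucket_to_sector(b + 1);
		uint64_t sectors = (bucket_end < end ? bucket_end : end) - start;

		printf("bucket %llu: %llu sectors\n",
		       (unsigned long long) b, (unsigned long long) sectors);
		start += sectors;
	}
}

int main(void)
{
	/* e.g. a superblock range that starts mid-bucket and spans two buckets */
	split_range(100, 300);
	return 0;
}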
static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
struct disk_reservation *res,
struct bch_dev *ca)
{
struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
u64 bucket = 0;
unsigned i, bucket_sectors = 0;
int ret;
for (i = 0; i < layout->nr_superblocks; i++) {
u64 offset = le64_to_cpu(layout->sb_offset[i]);
if (offset == BCH_SB_SECTOR) {
ret = bch2_trans_mark_metadata_sectors(trans, res, ca,
0, BCH_SB_SECTOR,
BCH_DATA_sb, &bucket, &bucket_sectors);
if (ret)
return ret;
}
ret = bch2_trans_mark_metadata_sectors(trans, res, ca, offset,
offset + (1 << layout->sb_max_size_bits),
BCH_DATA_sb, &bucket, &bucket_sectors);
if (ret)
return ret;
}
if (bucket_sectors) {
ret = bch2_trans_mark_metadata_bucket(trans, res, ca,
bucket, BCH_DATA_sb, bucket_sectors);
if (ret)
return ret;
}
for (i = 0; i < ca->journal.nr; i++) {
ret = bch2_trans_mark_metadata_bucket(trans, res, ca,
ca->journal.buckets[i],
BCH_DATA_journal, ca->mi.bucket_size);
if (ret)
return ret;
}
return 0;
}
int bch2_trans_mark_dev_sb(struct bch_fs *c,
struct disk_reservation *res,
struct bch_dev *ca)
{
return bch2_trans_do(c, res, NULL, 0,
__bch2_trans_mark_dev_sb(&trans, res, ca));
}
/* Disk reservations: */
void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
@ -2115,7 +2234,7 @@ void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
#define SECTORS_CACHE 1024
int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
unsigned sectors, int flags)
u64 sectors, int flags)
{
struct bch_fs_pcpu *pcpu;
u64 old, v, get;

View File

@ -153,18 +153,9 @@ static inline unsigned bucket_sectors_used(struct bucket_mark mark)
return mark.dirty_sectors + mark.cached_sectors;
}
static inline bool bucket_unused(struct bucket_mark mark)
{
return !mark.owned_by_allocator &&
!mark.data_type &&
!bucket_sectors_used(mark);
}
static inline bool is_available_bucket(struct bucket_mark mark)
{
return (!mark.owned_by_allocator &&
!mark.dirty_sectors &&
!mark.stripe);
return !mark.dirty_sectors && !mark.stripe;
}
static inline bool bucket_needs_journal_commit(struct bucket_mark m,
@ -245,8 +236,6 @@ bch2_fs_usage_read_short(struct bch_fs *);
void bch2_bucket_seq_cleanup(struct bch_fs *);
void bch2_fs_usage_initialize(struct bch_fs *);
void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *,
size_t, struct bucket_mark *);
void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *,
size_t, bool, struct gc_pos, unsigned);
void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
@ -270,6 +259,12 @@ int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter,
struct bkey_i *insert, unsigned);
void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *);
int bch2_trans_mark_metadata_bucket(struct btree_trans *,
struct disk_reservation *, struct bch_dev *,
size_t, enum bch_data_type, unsigned);
int bch2_trans_mark_dev_sb(struct bch_fs *, struct disk_reservation *,
struct bch_dev *);
/* disk reservations: */
void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
@ -284,8 +279,8 @@ static inline void bch2_disk_reservation_put(struct bch_fs *c,
#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
int bch2_disk_reservation_add(struct bch_fs *,
struct disk_reservation *,
unsigned, int);
struct disk_reservation *,
u64, int);
static inline struct disk_reservation
bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
@ -302,8 +297,7 @@ bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
static inline int bch2_disk_reservation_get(struct bch_fs *c,
struct disk_reservation *res,
unsigned sectors,
unsigned nr_replicas,
u64 sectors, unsigned nr_replicas,
int flags)
{
*res = bch2_disk_reservation_init(c, nr_replicas);

View File

@ -52,16 +52,15 @@ struct bucket_array {
};
struct bch_dev_usage {
u64 buckets[BCH_DATA_NR];
u64 buckets_alloc;
u64 buckets_ec;
u64 buckets_unavailable;
/* _compressed_ sectors: */
u64 sectors[BCH_DATA_NR];
u64 sectors_fragmented;
u64 buckets_ec;
u64 sectors_ec;
struct {
u64 buckets;
u64 sectors; /* _compressed_ sectors: */
u64 fragmented;
} d[BCH_DATA_NR];
};
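The reworked struct bch_dev_usage groups buckets, (compressed) sectors and fragmented sectors per data type instead of keeping parallel top-level arrays; most of the call-site churn in this commit is mechanical renaming to the d[] entries. A minimal mirror of the layout plus one aggregate helper, as an illustration rather than the real header:

#include <stdio.h>
#include <stdint.h>

enum data_type { DT_none, DT_sb, DT_journal, DT_btree, DT_user, DT_cached, DT_NR };

struct dev_usage {
	uint64_t buckets_alloc;
	uint64_t buckets_unavailable;
	uint64_t buckets_ec;

	struct {
		uint64_t buckets;
		uint64_t sectors;	/* compressed sectors */
		uint64_t fragmented;
	} d[DT_NR];
};

static uint64_t total_buckets_used(const struct dev_usage *u)
{
	uint64_t total = 0;

	for (int i = 0; i < DT_NR; i++)
		total += u->d[i].buckets;
	return total;
}

int main(void)
{
	struct dev_usage u = { 0 };

	u.d[DT_btree].buckets = 12;
	u.d[DT_user].buckets  = 340;

	printf("buckets in use: %llu\n",
	       (unsigned long long) total_buckets_used(&u));
	return 0;
}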
struct bch_fs_usage {

View File

@ -477,11 +477,11 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
arg.available_buckets = arg.nr_buckets - src.buckets_unavailable;
arg.ec_buckets = src.buckets_ec;
arg.ec_sectors = src.sectors_ec;
arg.ec_sectors = 0;
for (i = 0; i < BCH_DATA_NR; i++) {
arg.buckets[i] = src.buckets[i];
arg.sectors[i] = src.sectors[i];
arg.buckets[i] = src.d[i].buckets;
arg.sectors[i] = src.d[i].sectors;
}
percpu_ref_put(&ca->ref);

View File

@ -684,13 +684,14 @@ static void ec_stripe_delete_work(struct work_struct *work)
/* stripe creation: */
static int ec_stripe_bkey_insert(struct bch_fs *c,
struct ec_stripe_new *s,
struct bkey_i_stripe *stripe)
struct bkey_i_stripe *stripe,
struct disk_reservation *res)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
struct bpos start_pos = POS(0, c->ec_stripe_hint);
struct bpos min_pos = POS(0, 1);
struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
int ret;
bch2_trans_init(&trans, c, 0, 0);
@ -701,7 +702,7 @@ retry:
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) {
if (start_pos.offset) {
start_pos = POS_MIN;
start_pos = min_pos;
bch2_btree_iter_set_pos(iter, start_pos);
continue;
}
@ -726,7 +727,7 @@ found_slot:
bch2_trans_update(&trans, iter, &stripe->k_i, 0);
ret = bch2_trans_commit(&trans, &s->res, NULL,
ret = bch2_trans_commit(&trans, res, NULL,
BTREE_INSERT_NOFAIL);
err:
bch2_trans_iter_put(&trans, iter);
@ -740,6 +741,47 @@ err:
return ret;
}
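ec_stripe_bkey_insert() searches the stripes btree for a free slot starting at the per-fs hint and wraps around when it reaches the end of the range; the new min_pos of 0:1 (via bpos_max) keeps it from ever handing out slot 0. The shape of that search, reduced to an array with a hint, as an illustrative sketch:

#include <stdio.h>

#define NR_SLOTS 8
#define MIN_SLOT 1		/* slot 0 is reserved, like POS(0, 0) */

/* Find a free slot >= hint, wrapping around to MIN_SLOT once; -1 if full. */
static int find_free_slot(const int used[NR_SLOTS], int hint)
{
	if (hint < MIN_SLOT)
		hint = MIN_SLOT;

	for (int pass = 0; pass < 2; pass++) {
		for (int i = hint; i < NR_SLOTS; i++)
			if (!used[i])
				return i;
		hint = MIN_SLOT;	/* wrap and rescan from the bottom */
	}
	return -1;
}

int main(void)
{
	int used[NR_SLOTS] = { 1, 1, 1, 1, 1, 0, 1, 0 };

	printf("slot from hint 6: %d\n", find_free_slot(used, 6));	/* 7 */
	printf("slot from hint 0: %d\n", find_free_slot(used, 0));	/* 5 */
	return 0;
}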
static int ec_stripe_bkey_update(struct btree_trans *trans,
struct bkey_i_stripe *new)
{
struct bch_fs *c = trans->c;
struct btree_iter *iter;
struct bkey_s_c k;
const struct bch_stripe *existing;
unsigned i;
int ret;
iter = bch2_trans_get_iter(trans, BTREE_ID_EC,
new->k.p, BTREE_ITER_INTENT);
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
goto err;
if (!k.k || k.k->type != KEY_TYPE_stripe) {
bch_err(c, "error updating stripe: not found");
ret = -ENOENT;
goto err;
}
existing = bkey_s_c_to_stripe(k).v;
if (existing->nr_blocks != new->v.nr_blocks) {
bch_err(c, "error updating stripe: nr_blocks does not match");
ret = -EINVAL;
goto err;
}
for (i = 0; i < new->v.nr_blocks; i++)
stripe_blockcount_set(&new->v, i,
stripe_blockcount_get(existing, i));
bch2_trans_update(trans, iter, &new->k_i, 0);
err:
bch2_trans_iter_put(trans, iter);
return ret;
}
static void extent_stripe_ptr_add(struct bkey_s_extent e,
struct ec_stripe_buf *s,
struct bch_extent_ptr *ptr,
@ -866,9 +908,6 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (!percpu_ref_tryget(&c->writes))
goto err;
BUG_ON(bitmap_weight(s->blocks_allocated,
s->blocks.nr) != s->blocks.nr);
ec_generate_ec(&s->new_stripe);
ec_generate_checksums(&s->new_stripe);
@ -884,9 +923,9 @@ static void ec_stripe_create(struct ec_stripe_new *s)
}
ret = s->have_existing_stripe
? bch2_btree_insert(c, BTREE_ID_EC, &s->new_stripe.key.k_i,
&s->res, NULL, BTREE_INSERT_NOFAIL)
: ec_stripe_bkey_insert(c, s, &s->new_stripe.key);
? bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL,
ec_stripe_bkey_update(&trans, &s->new_stripe.key))
: ec_stripe_bkey_insert(c, &s->new_stripe.key, &s->res);
if (ret) {
bch_err(c, "error creating stripe: error creating stripe key");
goto err_put_writes;
@ -902,11 +941,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
spin_lock(&c->ec_stripes_heap_lock);
m = genradix_ptr(&c->stripes[0], s->new_stripe.key.k.p.offset);
#if 0
pr_info("created a %s stripe %llu",
s->have_existing_stripe ? "existing" : "new",
s->stripe.key.k.p.offset);
#endif
BUG_ON(m->on_heap);
bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset);
spin_unlock(&c->ec_stripes_heap_lock);
@ -915,12 +950,17 @@ err_put_writes:
err:
bch2_disk_reservation_put(c, &s->res);
open_bucket_for_each(c, &s->blocks, ob, i) {
ob->ec = NULL;
__bch2_open_bucket_put(c, ob);
}
for (i = 0; i < v->nr_blocks; i++)
if (s->blocks[i]) {
ob = c->open_buckets + s->blocks[i];
bch2_open_buckets_put(c, &s->parity);
if (i < nr_data) {
ob->ec = NULL;
__bch2_open_bucket_put(c, ob);
} else {
bch2_open_bucket_put(c, ob);
}
}
bch2_keylist_free(&s->keys, s->inline_keys);
@ -1179,7 +1219,7 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
if (h->s &&
h->s->allocated &&
bitmap_weight(h->s->blocks_allocated,
h->s->blocks.nr) == h->s->blocks.nr)
h->s->nr_data) == h->s->nr_data)
ec_stripe_set_pending(c, h);
mutex_unlock(&h->lock);
@ -1216,64 +1256,82 @@ static enum bucket_alloc_ret
new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
struct closure *cl)
{
struct bch_devs_mask devs;
struct bch_devs_mask devs = h->devs;
struct open_bucket *ob;
unsigned i, nr_have, nr_data =
min_t(unsigned, h->nr_active_devs,
BCH_BKEY_PTRS_MAX) - h->redundancy;
struct open_buckets buckets;
unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
bool have_cache = true;
enum bucket_alloc_ret ret = ALLOC_SUCCESS;
devs = h->devs;
for_each_set_bit(i, h->s->blocks_allocated, BCH_BKEY_PTRS_MAX) {
__clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d);
--nr_data;
for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) {
if (test_bit(i, h->s->blocks_gotten)) {
__clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d);
if (i < h->s->nr_data)
nr_have_data++;
else
nr_have_parity++;
}
}
BUG_ON(h->s->blocks.nr > nr_data);
BUG_ON(h->s->parity.nr > h->redundancy);
open_bucket_for_each(c, &h->s->parity, ob, i)
__clear_bit(ob->ptr.dev, devs.d);
open_bucket_for_each(c, &h->s->blocks, ob, i)
__clear_bit(ob->ptr.dev, devs.d);
BUG_ON(nr_have_data > h->s->nr_data);
BUG_ON(nr_have_parity > h->s->nr_parity);
percpu_down_read(&c->mark_lock);
rcu_read_lock();
if (h->s->parity.nr < h->redundancy) {
nr_have = h->s->parity.nr;
ret = bch2_bucket_alloc_set(c, &h->s->parity,
buckets.nr = 0;
if (nr_have_parity < h->s->nr_parity) {
ret = bch2_bucket_alloc_set(c, &buckets,
&h->parity_stripe,
&devs,
h->redundancy,
&nr_have,
h->s->nr_parity,
&nr_have_parity,
&have_cache,
h->copygc
? RESERVE_MOVINGGC
: RESERVE_NONE,
0,
cl);
open_bucket_for_each(c, &buckets, ob, i) {
j = find_next_zero_bit(h->s->blocks_gotten,
h->s->nr_data + h->s->nr_parity,
h->s->nr_data);
BUG_ON(j >= h->s->nr_data + h->s->nr_parity);
h->s->blocks[j] = buckets.v[i];
h->s->new_stripe.key.v.ptrs[j] = ob->ptr;
__set_bit(j, h->s->blocks_gotten);
}
if (ret)
goto err;
}
if (h->s->blocks.nr < nr_data) {
nr_have = h->s->blocks.nr;
ret = bch2_bucket_alloc_set(c, &h->s->blocks,
buckets.nr = 0;
if (nr_have_data < h->s->nr_data) {
ret = bch2_bucket_alloc_set(c, &buckets,
&h->block_stripe,
&devs,
nr_data,
&nr_have,
h->s->nr_data,
&nr_have_data,
&have_cache,
h->copygc
? RESERVE_MOVINGGC
: RESERVE_NONE,
0,
cl);
open_bucket_for_each(c, &buckets, ob, i) {
j = find_next_zero_bit(h->s->blocks_gotten,
h->s->nr_data, 0);
BUG_ON(j >= h->s->nr_data);
h->s->blocks[j] = buckets.v[i];
h->s->new_stripe.key.v.ptrs[j] = ob->ptr;
__set_bit(j, h->s->blocks_gotten);
}
if (ret)
goto err;
}
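new_stripe_alloc_buckets() now allocates into a scratch open_buckets list and then places each bucket into the first stripe slot whose blocks_gotten bit is still clear, with parity slots located after the data slots. A standalone sketch of that placement step, using a plain bool array in place of the kernel bitmap helpers (find_next_zero_bit and friends):

#include <stdbool.h>
#include <stdio.h>

#define NR_DATA   4
#define NR_PARITY 2
#define NR_BLOCKS (NR_DATA + NR_PARITY)

/* Find the first clear slot in [from, to), or -1 if none. */
static int next_free_slot(const bool gotten[NR_BLOCKS], int from, int to)
{
	for (int i = from; i < to; i++)
		if (!gotten[i])
			return i;
	return -1;
}

int main(void)
{
	bool gotten[NR_BLOCKS] = { true, false, true, false, false, false };
	int  blocks[NR_BLOCKS] = { 0 };
	int  newly_allocated[] = { 11, 12 };	/* pretend bucket ids */

	/* Place freshly allocated data buckets into free data slots [0, NR_DATA). */
	for (unsigned i = 0; i < 2; i++) {
		int j = next_free_slot(gotten, 0, NR_DATA);

		if (j < 0)
			break;
		blocks[j] = newly_allocated[i];
		gotten[j] = true;
		printf("data slot %d -> bucket %d\n", j, blocks[j]);
	}

	/* Parity buckets would go into slots [NR_DATA, NR_BLOCKS) the same way. */
	return 0;
}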
@ -1325,8 +1383,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
struct closure *cl)
{
struct ec_stripe_head *h;
struct open_bucket *ob;
unsigned i, data_idx = 0;
unsigned i;
s64 idx;
int ret;
@ -1361,9 +1418,14 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
BUG();
}
BUG_ON(h->s->existing_stripe.size != h->blocksize);
BUG_ON(h->s->existing_stripe.size != h->s->existing_stripe.key.v.sectors);
for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) {
if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i))
if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) {
__set_bit(i, h->s->blocks_gotten);
__set_bit(i, h->s->blocks_allocated);
}
ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
}
@ -1401,20 +1463,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
goto out;
}
open_bucket_for_each(c, &h->s->blocks, ob, i) {
data_idx = find_next_zero_bit(h->s->blocks_allocated,
h->s->nr_data, data_idx);
BUG_ON(data_idx >= h->s->nr_data);
h->s->new_stripe.key.v.ptrs[data_idx] = ob->ptr;
h->s->data_block_idx[i] = data_idx;
data_idx++;
}
open_bucket_for_each(c, &h->s->parity, ob, i)
h->s->new_stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr;
//pr_info("new stripe, blocks_allocated %lx", h->s->blocks_allocated[0]);
h->s->allocated = true;
}
out:
@ -1434,12 +1482,14 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
if (!h->s)
goto unlock;
open_bucket_for_each(c, &h->s->blocks, ob, i)
if (ob->ptr.dev == ca->dev_idx)
goto found;
open_bucket_for_each(c, &h->s->parity, ob, i)
for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) {
if (!h->s->blocks[i])
continue;
ob = c->open_buckets + h->s->blocks[i];
if (ob->ptr.dev == ca->dev_idx)
goto found;
}
goto unlock;
found:
h->s->err = -EROFS;
@ -1466,7 +1516,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans,
size_t idx,
struct bkey_i_stripe *new_key)
{
struct bch_fs *c = trans->c;
const struct bch_stripe *v;
struct bkey_s_c k;
unsigned i;
int ret;
@ -1481,16 +1531,17 @@ static int __bch2_stripe_write_key(struct btree_trans *trans,
if (k.k->type != KEY_TYPE_stripe)
return -EIO;
v = bkey_s_c_to_stripe(k).v;
for (i = 0; i < v->nr_blocks; i++)
if (m->block_sectors[i] != stripe_blockcount_get(v, i))
goto write;
return 0;
write:
bkey_reassemble(&new_key->k_i, k);
spin_lock(&c->ec_stripes_heap_lock);
for (i = 0; i < new_key->v.nr_blocks; i++)
stripe_blockcount_set(&new_key->v, i,
m->block_sectors[i]);
m->dirty = false;
spin_unlock(&c->ec_stripes_heap_lock);
bch2_trans_update(trans, iter, &new_key->k_i, 0);
return 0;
@ -1514,7 +1565,7 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags)
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
genradix_for_each(&c->stripes[0], giter, m) {
if (!m->dirty)
if (!m->alive)
continue;
ret = __bch2_trans_do(&trans, NULL, NULL,
@ -1624,19 +1675,17 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
h->target, h->algo, h->redundancy);
if (h->s)
pr_buf(out, "\tpending: blocks %u allocated %u\n",
h->s->blocks.nr,
pr_buf(out, "\tpending: blocks %u+%u allocated %u\n",
h->s->nr_data, h->s->nr_parity,
bitmap_weight(h->s->blocks_allocated,
h->s->blocks.nr));
h->s->nr_data));
}
mutex_unlock(&c->ec_stripe_head_lock);
mutex_lock(&c->ec_stripe_new_lock);
list_for_each_entry(s, &c->ec_stripe_new_list, list) {
pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n",
s->blocks.nr,
bitmap_weight(s->blocks_allocated,
s->blocks.nr),
pr_buf(out, "\tin flight: blocks %u+%u pin %u\n",
s->nr_data, s->nr_parity,
atomic_read(&s->pin));
}
mutex_unlock(&c->ec_stripe_new_lock);

View File

@ -143,11 +143,9 @@ struct ec_stripe_new {
bool pending;
bool have_existing_stripe;
unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
struct open_buckets blocks;
u8 data_block_idx[BCH_BKEY_PTRS_MAX];
struct open_buckets parity;
open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX];
struct disk_reservation res;
struct keylist keys;

View File

@ -18,8 +18,7 @@ struct stripe {
u8 nr_blocks;
u8 nr_redundant;
unsigned alive:1;
unsigned dirty:1;
unsigned alive:1; /* does a corresponding key exist in stripes btree? */
unsigned on_heap:1;
u8 blocks_nonempty;
u16 block_sectors[BCH_BKEY_PTRS_MAX];

View File

@ -192,18 +192,13 @@ bch2_extent_can_insert(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *insert)
{
struct btree_iter_level *l = &iter->l[0];
struct btree_node_iter node_iter = l->iter;
struct bkey_packed *_k;
struct bkey_s_c k;
struct bkey unpacked;
int sectors;
int ret, sectors;
_k = bch2_btree_node_iter_peek(&node_iter, l->b);
if (!_k)
return BTREE_INSERT_OK;
k = bkey_disassemble(l->b, _k, &unpacked);
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
return ret;
/* Check if we're splitting a compressed extent: */

View File

@ -84,6 +84,7 @@ struct dio_read {
struct closure cl;
struct kiocb *req;
long ret;
bool should_dirty;
struct bch_read_bio rbio;
};
@ -1619,12 +1620,22 @@ again:
/* O_DIRECT reads */
static void bio_check_or_release(struct bio *bio, bool check_dirty)
{
if (check_dirty) {
bio_check_pages_dirty(bio);
} else {
bio_release_pages(bio, false);
bio_put(bio);
}
}
static void bch2_dio_read_complete(struct closure *cl)
{
struct dio_read *dio = container_of(cl, struct dio_read, cl);
dio->req->ki_complete(dio->req, dio->ret, 0);
bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */
bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
}
static void bch2_direct_IO_read_endio(struct bio *bio)
@ -1639,8 +1650,11 @@ static void bch2_direct_IO_read_endio(struct bio *bio)
static void bch2_direct_IO_read_split_endio(struct bio *bio)
{
struct dio_read *dio = bio->bi_private;
bool should_dirty = dio->should_dirty;
bch2_direct_IO_read_endio(bio);
bio_check_pages_dirty(bio); /* transfers ownership */
bio_check_or_release(bio, should_dirty);
}
static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
@ -1694,6 +1708,12 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
dio->req = req;
dio->ret = ret;
/*
* This is one of the sketchier things I've encountered: we have to skip
* the dirtying of requests that are internal from the kernel (i.e. from
* loopback), because we'll deadlock on page_lock.
*/
dio->should_dirty = iter_is_iovec(iter);
goto start;
while (iter->count) {
@ -1715,7 +1735,9 @@ start:
}
offset += bio->bi_iter.bi_size;
bio_set_pages_dirty(bio);
if (dio->should_dirty)
bio_set_pages_dirty(bio);
if (iter->count)
closure_get(&dio->cl);
@ -1729,7 +1751,7 @@ start:
closure_sync(&dio->cl);
closure_debug_destroy(&dio->cl);
ret = dio->ret;
bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */
bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
return ret;
} else {
return -EIOCBQUEUED;

View File

@ -499,9 +499,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
n->submit_time = local_clock();
n->bio.bi_iter.bi_sector = ptr->offset;
if (!journal_flushes_device(ca))
n->bio.bi_opf |= REQ_FUA;
if (likely(n->have_ioref)) {
this_cpu_add(ca->io_done->sectors[WRITE][type],
bio_sectors(&n->bio));

View File

@ -9,6 +9,7 @@
#include "alloc_foreground.h"
#include "bkey_methods.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
#include "journal.h"
#include "journal_io.h"
@ -82,6 +83,7 @@ static void bch2_journal_buf_init(struct journal *j)
bkey_extent_init(&buf->key);
buf->noflush = false;
buf->must_flush = false;
buf->separate_flush = false;
memset(buf->has_inode, 0, sizeof(buf->has_inode));
@ -823,18 +825,28 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
if (pos <= ja->cur_idx)
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal,
ca->mi.bucket_size,
gc_phase(GC_PHASE_SB),
0);
if (!c || new_fs)
bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal,
ca->mi.bucket_size,
gc_phase(GC_PHASE_SB),
0);
if (c) {
spin_unlock(&c->journal.lock);
percpu_up_read(&c->mark_lock);
}
if (c && !new_fs)
ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
bch2_trans_mark_metadata_bucket(&trans, NULL, ca,
bucket, BCH_DATA_journal,
ca->mi.bucket_size));
if (!new_fs)
bch2_open_bucket_put(c, ob);
if (ret)
goto err;
}
err:
bch2_sb_resize_journal(&ca->disk_sb,
@ -953,6 +965,7 @@ void bch2_fs_journal_stop(struct journal *j)
journal_quiesce(j);
BUG_ON(!bch2_journal_error(j) &&
test_bit(JOURNAL_REPLAY_DONE, &j->flags) &&
(journal_entry_is_open(j) ||
j->last_empty_seq + 1 != journal_cur_seq(j)));

View File

@ -494,11 +494,6 @@ static inline int bch2_journal_error(struct journal *j)
struct bch_dev;
static inline bool journal_flushes_device(struct bch_dev *ca)
{
return true;
}
static inline void bch2_journal_set_replay_done(struct journal *j)
{
BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));

View File

@ -1189,6 +1189,53 @@ static void journal_write_endio(struct bio *bio)
percpu_ref_put(&ca->io_ref);
}
static void do_journal_write(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_extent_ptr *ptr;
struct bio *bio;
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
ca = bch_dev_bkey_exists(c, ptr->dev);
if (!percpu_ref_tryget(&ca->io_ref)) {
/* XXX: fix this */
bch_err(c, "missing device for journal write\n");
continue;
}
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
sectors);
bio = ca->journal.bio;
bio_reset(bio);
bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_iter.bi_sector = ptr->offset;
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META;
if (!JSET_NO_FLUSH(w->data))
bio->bi_opf |= REQ_FUA;
if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
bio->bi_opf |= REQ_PREFLUSH;
bch2_bio_map(bio, w->data, sectors << 9);
trace_journal_write(bio);
closure_bio_submit(bio, cl);
ca->journal.bucket_seq[ca->journal.cur_idx] =
le64_to_cpu(w->data->seq);
}
continue_at(cl, journal_write_done, system_highpri_wq);
return;
}
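The journal write path is now split: when there is more than one rw member, an explicit flush bio is sent to every rw device first and the per-device journal writes then carry only FUA; with a single device the write keeps the combined PREFLUSH|FUA. A simplified decision table as a sketch; the flag names here are plain booleans standing in for the block-layer request flags:

#include <stdbool.h>
#include <stdio.h>

struct write_flags {
	bool preflush;	/* flush volatile caches before this write */
	bool fua;	/* force unit access for this write */
};

static struct write_flags journal_write_flags(bool no_flush, bool separate_flush)
{
	struct write_flags f = { false, false };

	if (no_flush)			/* JSET_NO_FLUSH: plain write */
		return f;

	f.fua = true;
	f.preflush = !separate_flush;	/* flush already issued as its own bio */
	return f;
}

int main(void)
{
	bool separate_flush = true;	/* e.g. nr_rw_members > 1 */
	struct write_flags f = journal_write_flags(false, separate_flush);

	printf("preflush=%d fua=%d\n", f.preflush, f.fua);
	return 0;
}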
void bch2_journal_write(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
@ -1198,9 +1245,8 @@ void bch2_journal_write(struct closure *cl)
struct jset_entry *start, *end;
struct jset *jset;
struct bio *bio;
struct bch_extent_ptr *ptr;
bool validate_before_checksum = false;
unsigned i, sectors, bytes, u64s;
unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
int ret;
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
@ -1330,49 +1376,30 @@ retry_alloc:
if (c->opts.nochanges)
goto no_io;
extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
ca = bch_dev_bkey_exists(c, ptr->dev);
if (!percpu_ref_tryget(&ca->io_ref)) {
/* XXX: fix this */
bch_err(c, "missing device for journal write\n");
continue;
for_each_rw_member(ca, c, i)
nr_rw_members++;
if (nr_rw_members > 1)
w->separate_flush = true;
if (!JSET_NO_FLUSH(jset) && w->separate_flush) {
for_each_rw_member(ca, c, i) {
percpu_ref_get(&ca->io_ref);
bio = ca->journal.bio;
bio_reset(bio);
bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_opf = REQ_OP_FLUSH;
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
closure_bio_submit(bio, cl);
}
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
sectors);
bio = ca->journal.bio;
bio_reset(bio);
bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_iter.bi_sector = ptr->offset;
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META;
if (!JSET_NO_FLUSH(jset))
bio->bi_opf |= REQ_PREFLUSH|REQ_FUA;
bch2_bio_map(bio, jset, sectors << 9);
trace_journal_write(bio);
closure_bio_submit(bio, cl);
ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
}
if (!JSET_NO_FLUSH(jset)) {
for_each_rw_member(ca, c, i)
if (journal_flushes_device(ca) &&
!bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
percpu_ref_get(&ca->io_ref);
bch2_bucket_seq_cleanup(c);
bio = ca->journal.bio;
bio_reset(bio);
bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_opf = REQ_OP_FLUSH;
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
closure_bio_submit(bio, cl);
}
}
continue_at(cl, do_journal_write, system_highpri_wq);
return;
no_io:
bch2_bucket_seq_cleanup(c);

View File

@ -31,6 +31,7 @@ struct journal_buf {
unsigned u64s_reserved;
bool noflush; /* write has already been kicked off, and was noflush */
bool must_flush; /* something wants a flush */
bool separate_flush;
/* bloom filter: */
unsigned long has_inode[1024 / sizeof(unsigned long)];
};

View File

@ -154,7 +154,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
if (ret)
goto err;
if (disk_sectors_delta > (s64) &op->res.sectors) {
if (disk_sectors_delta > (s64) op->res.sectors) {
ret = bch2_disk_reservation_add(c, &op->res,
disk_sectors_delta - op->res.sectors,
!should_check_enospc

View File

@ -291,7 +291,7 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
fragmented_allowed += ((__dev_buckets_available(ca, usage) *
ca->mi.bucket_size) >> 1);
fragmented += usage.sectors_fragmented;
fragmented += usage.d[BCH_DATA_user].fragmented;
}
return max_t(s64, 0, fragmented_allowed - fragmented);

View File

@ -1099,27 +1099,13 @@ use_clean:
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
if ((c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) &&
!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA))) {
/*
* interior btree node updates aren't consistent with the
* journal; after an unclean shutdown we have to walk all
* pointers to metadata:
*/
bch_info(c, "starting metadata mark and sweep");
err = "error in mark and sweep";
ret = bch2_gc(c, &c->journal_keys, true, true);
if (ret)
goto err;
bch_verbose(c, "mark and sweep done");
}
if (c->opts.fsck ||
!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) ||
!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA)) ||
test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
bch_info(c, "starting mark and sweep");
err = "error in mark and sweep";
ret = bch2_gc(c, &c->journal_keys, true, false);
ret = bch2_gc(c, &c->journal_keys, true);
if (ret)
goto err;
bch_verbose(c, "mark and sweep done");

View File

@ -159,7 +159,7 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old,
BUG_ON(!new_entry->data_type);
verify_replicas_entry(new_entry);
new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO);
new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
if (!new.entries)
return new;
@ -282,13 +282,13 @@ static int replicas_table_update(struct bch_fs *c,
for (i = 0; i < ARRAY_SIZE(new_usage); i++)
if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
sizeof(u64), GFP_NOIO)))
sizeof(u64), GFP_KERNEL)))
goto err;
if (!(new_base = kzalloc(bytes, GFP_NOIO)) ||
!(new_scratch = kmalloc(bytes, GFP_NOIO)) ||
if (!(new_base = kzalloc(bytes, GFP_KERNEL)) ||
!(new_scratch = kmalloc(bytes, GFP_KERNEL)) ||
(c->usage_gc &&
!(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO))))
!(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL))))
goto err;
for (i = 0; i < ARRAY_SIZE(new_usage); i++)
@ -548,7 +548,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
c->replicas_gc.entry_size,
GFP_NOIO);
GFP_KERNEL);
if (!c->replicas_gc.entries) {
mutex_unlock(&c->sb_lock);
bch_err(c, "error allocating c->replicas_gc");
@ -671,7 +671,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
nr++;
}
cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO);
cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
if (!cpu_r->entries)
return -ENOMEM;
@ -703,7 +703,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
entry_size += sizeof(struct bch_replicas_entry) -
sizeof(struct bch_replicas_entry_v0);
cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO);
cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
if (!cpu_r->entries)
return -ENOMEM;

View File

@ -235,10 +235,7 @@ nowrote_alloc:
* the journal kicks off btree writes via reclaim - wait for in flight
* writes after stopping journal:
*/
if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
bch2_btree_flush_all_writes(c);
else
bch2_btree_verify_flushed(c);
bch2_btree_flush_all_writes(c);
/*
* After stopping journal:
@ -1222,13 +1219,6 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
if (ret)
return ret;
if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) &&
!percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_sb])) {
mutex_lock(&c->sb_lock);
bch2_mark_dev_superblock(ca->fs, ca, 0);
mutex_unlock(&c->sb_lock);
}
bch2_dev_sysfs_online(c, ca);
if (c->sb.nr_devices == 1)
@ -1602,7 +1592,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
* allocate the journal, reset all the marks, then remark after we
* attach...
*/
bch2_mark_dev_superblock(ca->fs, ca, 0);
bch2_mark_dev_superblock(NULL, ca, 0);
err = "journal alloc failed";
ret = bch2_dev_journal_alloc(ca);
@ -1661,15 +1651,13 @@ have_slot:
ca->disk_sb.sb->dev_idx = dev_idx;
bch2_dev_attach(c, ca, dev_idx);
bch2_mark_dev_superblock(c, ca, 0);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
err = "alloc write failed";
ret = bch2_dev_alloc_write(c, ca, 0);
err = "error marking superblock";
ret = bch2_trans_mark_dev_sb(c, NULL, ca);
if (ret)
goto err;
goto err_late;
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = __bch2_dev_read_write(c, ca);
@ -1690,6 +1678,7 @@ err:
bch_err(c, "Unable to add device: %s", err);
return ret;
err_late:
up_write(&c->state_lock);
bch_err(c, "Error going rw after adding device: %s", err);
return -EINVAL;
}
@ -1724,6 +1713,11 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
goto err;
}
if (bch2_trans_mark_dev_sb(c, NULL, ca)) {
err = "bch2_trans_mark_dev_sb() error";
goto err;
}
ca = bch_dev_locked(c, dev_idx);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = __bch2_dev_read_write(c, ca);

View File

@ -797,61 +797,42 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
nr[c->open_buckets[i].type]++;
pr_buf(out,
"free_inc: %zu/%zu\n"
"free[RESERVE_MOVINGGC]: %zu/%zu\n"
"free[RESERVE_NONE]: %zu/%zu\n"
"buckets:\n"
" capacity: %llu\n"
" alloc: %llu\n"
" sb: %llu\n"
" journal: %llu\n"
" meta: %llu\n"
" user: %llu\n"
" cached: %llu\n"
" erasure coded: %llu\n"
" available: %lli\n"
"sectors:\n"
" sb: %llu\n"
" journal: %llu\n"
" meta: %llu\n"
" user: %llu\n"
" cached: %llu\n"
" erasure coded: %llu\n"
" fragmented: %llu\n"
" copygc threshold: %llu\n"
"freelist_wait: %s\n"
"open buckets: %u/%u (reserved %u)\n"
"open_buckets_wait: %s\n"
"open_buckets_btree: %u\n"
"open_buckets_user: %u\n"
"btree reserve cache: %u\n",
fifo_used(&ca->free_inc), ca->free_inc.size,
fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
ca->mi.nbuckets - ca->mi.first_bucket,
stats.buckets_alloc,
stats.buckets[BCH_DATA_sb],
stats.buckets[BCH_DATA_journal],
stats.buckets[BCH_DATA_btree],
stats.buckets[BCH_DATA_user],
stats.buckets[BCH_DATA_cached],
stats.buckets_ec,
__dev_buckets_available(ca, stats),
stats.sectors[BCH_DATA_sb],
stats.sectors[BCH_DATA_journal],
stats.sectors[BCH_DATA_btree],
stats.sectors[BCH_DATA_user],
stats.sectors[BCH_DATA_cached],
stats.sectors_ec,
stats.sectors_fragmented,
c->copygc_threshold,
c->freelist_wait.list.first ? "waiting" : "empty",
c->open_buckets_nr_free, OPEN_BUCKETS_COUNT,
BTREE_NODE_OPEN_BUCKET_RESERVE,
c->open_buckets_wait.list.first ? "waiting" : "empty",
nr[BCH_DATA_btree],
nr[BCH_DATA_user],
c->btree_reserve_cache_nr);
"\t\t buckets\t sectors fragmented\n"
"capacity%16llu\n",
ca->mi.nbuckets - ca->mi.first_bucket);
for (i = 1; i < BCH_DATA_NR; i++)
pr_buf(out, "%-8s%16llu%16llu%16llu\n",
bch2_data_types[i], stats.d[i].buckets,
stats.d[i].sectors, stats.d[i].fragmented);
pr_buf(out,
"ec\t%16llu\n"
"available%15llu\n"
"alloc\t%16llu\n"
"\n"
"free_inc\t\t%zu/%zu\n"
"free[RESERVE_MOVINGGC]\t%zu/%zu\n"
"free[RESERVE_NONE]\t%zu/%zu\n"
"freelist_wait\t\t%s\n"
"open buckets\t\t%u/%u (reserved %u)\n"
"open_buckets_wait\t%s\n"
"open_buckets_btree\t%u\n"
"open_buckets_user\t%u\n"
"btree reserve cache\t%u\n",
stats.buckets_ec,
__dev_buckets_available(ca, stats),
stats.buckets_alloc,
fifo_used(&ca->free_inc), ca->free_inc.size,
fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
c->freelist_wait.list.first ? "waiting" : "empty",
c->open_buckets_nr_free, OPEN_BUCKETS_COUNT,
BTREE_NODE_OPEN_BUCKET_RESERVE,
c->open_buckets_wait.list.first ? "waiting" : "empty",
nr[BCH_DATA_btree],
nr[BCH_DATA_user],
c->btree_reserve_cache_nr);
}
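dev_alloc_debug_to_text() now prints one row per data type from the new d[] usage entries instead of the long hand-written field list. A standalone sketch of the table formatting; the type names follow the diff, the numbers are made up:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	static const char *types[] = { "sb", "journal", "btree", "user", "cached" };
	struct { uint64_t buckets, sectors, fragmented; } d[] = {
		{ 6, 6144, 0 }, { 8, 8192, 0 }, { 40, 30000, 5000 },
		{ 900, 700000, 20000 }, { 50, 20000, 0 },
	};

	printf("\t\t buckets\t sectors fragmented\n");
	for (unsigned i = 0; i < sizeof(d) / sizeof(d[0]); i++)
		printf("%-8s%16llu%16llu%16llu\n", types[i],
		       (unsigned long long) d[i].buckets,
		       (unsigned long long) d[i].sectors,
		       (unsigned long long) d[i].fragmented);
	return 0;
}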
static const char * const bch2_rw[] = {