Update bcachefs sources to d83b992f65 bcachefs: Rewrite journal_seq_blacklist machinery

Kent Overstreet 2019-04-04 22:15:36 -04:00
parent be02db130b
commit d13bbb2955
43 changed files with 976 additions and 887 deletions

View File

@ -1 +1 @@
1712318522fdaa533f8622f4c7da05e44a4828b0
d83b992f653d9f742f3f8567dbcfd1f4f72e858f

View File

@ -8,8 +8,8 @@
#include <linux/types.h>
#include <linux/crypto.h>
#define CHACHA20_IV_SIZE 16
#define CHACHA20_KEY_SIZE 32
#define CHACHA20_BLOCK_SIZE 64
#define CHACHA_IV_SIZE 16
#define CHACHA_KEY_SIZE 32
#define CHACHA_BLOCK_SIZE 64
#endif

View File

@ -147,12 +147,9 @@ static inline u64 ktime_get_real_seconds(void)
return ts.tv_sec;
}
static inline struct timespec current_kernel_time(void)
static inline void ktime_get_real_ts64(struct timespec64 *ts)
{
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return ts;
clock_gettime(CLOCK_MONOTONIC, ts);
}
#define current_kernel_time64() current_kernel_time()

View File

@ -619,6 +619,11 @@ static void bch2_sb_print_clean(struct bch_sb *sb, struct bch_sb_field *f,
{
}
static void bch2_sb_print_journal_seq_blacklist(struct bch_sb *sb, struct bch_sb_field *f,
enum units units)
{
}
typedef void (*sb_field_print_fn)(struct bch_sb *, struct bch_sb_field *, enum units);
struct bch_sb_field_toolops {

View File

@ -290,8 +290,9 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
bch2_alloc_read_key(c, bkey_i_to_s_c(k));
}
for_each_member_device(ca, c, i)
bch2_dev_usage_from_buckets(c, ca);
percpu_down_write(&c->mark_lock);
bch2_dev_usage_from_buckets(c);
percpu_up_write(&c->mark_lock);
mutex_lock(&c->bucket_clock[READ].lock);
for_each_member_device(ca, c, i) {

View File

@ -183,6 +183,7 @@
#include <linux/closure.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/math64.h>
#include <linux/mutex.h>
#include <linux/percpu-refcount.h>
#include <linux/percpu-rwsem.h>
@ -220,6 +221,8 @@
printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_warn(c, fmt, ...) \
printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_warn_ratelimited(c, fmt, ...) \
printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_err(c, fmt, ...) \
printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_err_ratelimited(c, fmt, ...) \
@ -481,6 +484,7 @@ enum {
BCH_FS_RW,
/* shutdown: */
BCH_FS_STOPPING,
BCH_FS_EMERGENCY_RO,
BCH_FS_WRITE_DISABLE_COMPLETE,
@ -506,6 +510,15 @@ struct bch_fs_pcpu {
u64 sectors_available;
};
struct journal_seq_blacklist_table {
size_t nr;
struct journal_seq_blacklist_table_entry {
u64 start;
u64 end;
bool dirty;
} entries[0];
};
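
The in-memory table above ends in a zero-length array — the older kernel idiom for a flexible trailing array — so the whole table can be allocated in one shot once the entry count is known. A standalone sketch of that allocation pattern (hypothetical names, C99 flexible array member instead of [0]):

#include <stdint.h>
#include <stdlib.h>

struct seq_range_table {
	size_t nr;
	struct {
		uint64_t start;	/* inclusive */
		uint64_t end;	/* exclusive */
	} entries[];		/* trailing storage, sized at alloc time */
};

static struct seq_range_table *table_alloc(size_t nr)
{
	struct seq_range_table *t =
		calloc(1, sizeof(*t) + nr * sizeof(t->entries[0]));

	if (t)
		t->nr = nr;
	return t;
}

int main(void)
{
	struct seq_range_table *t = table_alloc(4);

	free(t);
	return 0;
}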
struct bch_fs {
struct closure cl;
@ -641,6 +654,11 @@ struct bch_fs {
struct io_clock io_clock[2];
/* JOURNAL SEQ BLACKLIST */
struct journal_seq_blacklist_table *
journal_seq_blacklist_table;
struct work_struct journal_seq_blacklist_gc_work;
/* ALLOCATOR */
spinlock_t freelist_lock;
struct closure_waitlist freelist_wait;
@ -794,4 +812,27 @@ static inline unsigned block_bytes(const struct bch_fs *c)
return c->opts.block_size << 9;
}
static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time)
{
return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo);
}
static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts)
{
s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo;
if (c->sb.time_precision == 1)
return ns;
return div_s64(ns, c->sb.time_precision);
}
static inline s64 bch2_current_time(struct bch_fs *c)
{
struct timespec64 now;
ktime_get_real_ts64(&now);
return timespec_to_bch2_time(c, now);
}
#endif /* _BCACHEFS_H */
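
The helpers above convert between nanosecond timespecs and the on-disk time encoding: subtract the superblock's time base, then divide by the time precision (with a fast path when precision is 1 ns). A worked standalone version — values hypothetical, plain host arithmetic in place of div_s64():

#include <stdint.h>
#include <stdio.h>

static const int64_t time_precision = 1000;	/* 1 unit = 1000 ns */
static const int64_t time_base_lo   = 0;

static int64_t ns_to_fs_time(int64_t ns)
{
	return (ns - time_base_lo) / time_precision;
}

static int64_t fs_time_to_ns(int64_t t)
{
	return t * time_precision + time_base_lo;
}

int main(void)
{
	int64_t ns = 1234567890;		/* ~1.23 seconds */
	int64_t t  = ns_to_fs_time(ns);		/* 1234567 units */

	/* The round trip loses the sub-precision remainder (890 ns here): */
	printf("%lld ns -> %lld units -> %lld ns\n",
	       (long long) ns, (long long) t, (long long) fs_time_to_ns(t));
	return 0;
}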

View File

@ -904,7 +904,8 @@ struct bch_sb_field {
x(quota, 4) \
x(disk_groups, 5) \
x(clean, 6) \
x(replicas, 7)
x(replicas, 7) \
x(journal_seq_blacklist, 8)
enum bch_sb_field_type {
#define x(f, nr) BCH_SB_FIELD_##f = nr,
@ -1119,6 +1120,20 @@ struct bch_sb_field_clean {
};
};
struct journal_seq_blacklist_entry {
__le64 start;
__le64 end;
};
struct bch_sb_field_journal_seq_blacklist {
struct bch_sb_field field;
union {
struct journal_seq_blacklist_entry start[0];
__u64 _data[0];
};
};
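
The new field type is wired in through the x() list shown earlier in this file: each x(name, nr) entry expands into an enum constant (and, elsewhere, into per-field ops). A trimmed, illustrative expansion of the technique — not the full list from the source:

#define BCH_SB_FIELDS_EXAMPLE()			\
	x(replicas,		  7)		\
	x(journal_seq_blacklist, 8)

enum bch_sb_field_type_example {
#define x(f, nr)	BCH_SB_FIELD_EX_##f = nr,
	BCH_SB_FIELDS_EXAMPLE()
#undef x
	BCH_SB_FIELD_EX_NR	/* == 9 here */
};

int main(void)
{
	return BCH_SB_FIELD_EX_journal_seq_blacklist == 8 ? 0 : 1;
}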
/* Superblock: */
/*
@ -1274,6 +1289,7 @@ enum bch_sb_features {
BCH_FEATURE_ZSTD = 2,
BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */
BCH_FEATURE_EC = 4,
BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5,
BCH_FEATURE_NR,
};

View File

@ -114,7 +114,6 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const struct bch_extent_ptr *ptr;
struct gc_pos pos = { 0 };
unsigned flags =
BCH_BUCKET_MARK_GC|
(initial ? BCH_BUCKET_MARK_NOATOMIC : 0);
@ -171,7 +170,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
*max_stale = max(*max_stale, ptr_stale(ca, ptr));
}
bch2_mark_key(c, k, true, k.k->size, pos, NULL, 0, flags);
bch2_mark_key(c, k, true, k.k->size, NULL, 0, flags);
fsck_err:
return ret;
}
@ -202,7 +201,7 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b,
}
static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
bool initial)
bool initial, bool metadata_only)
{
struct btree_trans trans;
struct btree_iter *iter;
@ -222,7 +221,9 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
* and on startup, we have to read every btree node (XXX: only if it was
* an unclean shutdown)
*/
if (initial || expensive_debug_checks(c))
if (metadata_only)
depth = 1;
else if (initial || expensive_debug_checks(c))
depth = 0;
btree_node_range_checks_init(&r, depth);
@ -278,7 +279,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
}
static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal,
bool initial)
bool initial, bool metadata_only)
{
enum btree_id ids[BTREE_ID_NR];
u8 max_stale;
@ -292,11 +293,12 @@ static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal,
enum btree_id id = ids[i];
enum btree_node_type type = __btree_node_type(0, id);
int ret = bch2_gc_btree(c, id, initial);
int ret = bch2_gc_btree(c, id, initial, metadata_only);
if (ret)
return ret;
if (journal && btree_node_type_needs_gc(type)) {
if (journal && !metadata_only &&
btree_node_type_needs_gc(type)) {
struct bkey_i *k, *n;
struct jset_entry *j;
struct journal_replay *r;
@ -397,7 +399,6 @@ static void bch2_mark_superblocks(struct bch_fs *c)
/* Also see bch2_pending_btree_node_free_insert_done() */
static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
{
struct gc_pos pos = { 0 };
struct btree_update *as;
struct pending_btree_node_free *d;
@ -407,8 +408,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
for_each_pending_btree_node_free(c, as, d)
if (d->index_update_done)
bch2_mark_key(c, bkey_i_to_s_c(&d->key),
true, 0,
pos, NULL, 0,
true, 0, NULL, 0,
BCH_BUCKET_MARK_GC);
mutex_unlock(&c->btree_interior_update_lock);
@ -481,25 +481,28 @@ static void bch2_gc_free(struct bch_fs *c)
c->usage[1] = NULL;
}
static void bch2_gc_done(struct bch_fs *c, bool initial)
static int bch2_gc_done(struct bch_fs *c,
bool initial, bool metadata_only)
{
struct bch_dev *ca;
bool verify = !initial ||
(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO));
bool verify = !metadata_only &&
(!initial ||
(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)));
unsigned i;
int ret = 0;
#define copy_field(_f, _msg, ...) \
if (dst->_f != src->_f) { \
if (verify) \
bch_err(c, _msg ": got %llu, should be %llu, fixing"\
fsck_err(c, _msg ": got %llu, should be %llu" \
, ##__VA_ARGS__, dst->_f, src->_f); \
dst->_f = src->_f; \
}
#define copy_stripe_field(_f, _msg, ...) \
if (dst->_f != src->_f) { \
if (verify) \
bch_err_ratelimited(c, "stripe %zu has wrong "_msg\
": got %u, should be %u, fixing", \
fsck_err(c, "stripe %zu has wrong "_msg \
": got %u, should be %u", \
dst_iter.pos, ##__VA_ARGS__, \
dst->_f, src->_f); \
dst->_f = src->_f; \
@ -508,8 +511,8 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
#define copy_bucket_field(_f) \
if (dst->b[b].mark._f != src->b[b].mark._f) { \
if (verify) \
bch_err_ratelimited(c, "dev %u bucket %zu has wrong " #_f\
": got %u, should be %u, fixing", i, b, \
fsck_err(c, "dev %u bucket %zu has wrong " #_f \
": got %u, should be %u", i, b, \
dst->b[b].mark._f, src->b[b].mark._f); \
dst->b[b]._mark._f = src->b[b].mark._f; \
dst->b[b]._mark.dirty = true; \
@ -519,7 +522,7 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
#define copy_fs_field(_f, _msg, ...) \
copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
{
if (!metadata_only) {
struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
struct stripe *dst, *src;
@ -571,26 +574,7 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
}
};
for_each_member_device(ca, c, i) {
unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64);
struct bch_dev_usage *dst = (void *)
bch2_acc_percpu_u64s((void *) ca->usage[0], nr);
struct bch_dev_usage *src = (void *)
bch2_acc_percpu_u64s((void *) ca->usage[1], nr);
unsigned b;
for (b = 0; b < BCH_DATA_NR; b++)
copy_dev_field(buckets[b], "buckets[%s]",
bch2_data_types[b]);
copy_dev_field(buckets_alloc, "buckets_alloc");
copy_dev_field(buckets_ec, "buckets_ec");
copy_dev_field(buckets_unavailable, "buckets_unavailable");
for (b = 0; b < BCH_DATA_NR; b++)
copy_dev_field(sectors[b], "sectors[%s]",
bch2_data_types[b]);
copy_dev_field(sectors_fragmented, "sectors_fragmented");
}
bch2_dev_usage_from_buckets(c);
{
unsigned nr = fs_usage_u64s(c);
@ -600,20 +584,29 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
bch2_acc_percpu_u64s((void *) c->usage[1], nr);
copy_fs_field(hidden, "hidden");
copy_fs_field(data, "data");
copy_fs_field(cached, "cached");
copy_fs_field(reserved, "reserved");
copy_fs_field(nr_inodes, "nr_inodes");
copy_fs_field(btree, "btree");
for (i = 0; i < BCH_REPLICAS_MAX; i++)
copy_fs_field(persistent_reserved[i],
"persistent_reserved[%i]", i);
if (!metadata_only) {
copy_fs_field(data, "data");
copy_fs_field(cached, "cached");
copy_fs_field(reserved, "reserved");
copy_fs_field(nr_inodes,"nr_inodes");
for (i = 0; i < BCH_REPLICAS_MAX; i++)
copy_fs_field(persistent_reserved[i],
"persistent_reserved[%i]", i);
}
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
char buf[80];
if (metadata_only &&
(e->data_type == BCH_DATA_USER ||
e->data_type == BCH_DATA_CACHED))
continue;
bch2_replicas_entry_to_text(&PBUF(buf), e);
copy_fs_field(replicas[i], "%s", buf);
@ -625,9 +618,12 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
#undef copy_bucket_field
#undef copy_stripe_field
#undef copy_field
fsck_err:
return ret;
}
static int bch2_gc_start(struct bch_fs *c)
static int bch2_gc_start(struct bch_fs *c,
bool metadata_only)
{
struct bch_dev *ca;
unsigned i;
@ -673,10 +669,18 @@ static int bch2_gc_start(struct bch_fs *c)
dst->nbuckets = src->nbuckets;
for (b = 0; b < src->nbuckets; b++) {
dst->b[b]._mark.gen =
dst->b[b].oldest_gen =
src->b[b].mark.gen;
dst->b[b].gen_valid = src->b[b].gen_valid;
struct bucket *d = &dst->b[b];
struct bucket *s = &src->b[b];
d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen;
d->gen_valid = s->gen_valid;
if (metadata_only &&
(s->mark.data_type == BCH_DATA_USER ||
s->mark.data_type == BCH_DATA_CACHED)) {
d->_mark = s->mark;
d->_mark.owned_by_allocator = 0;
}
}
};
@ -701,7 +705,8 @@ static int bch2_gc_start(struct bch_fs *c)
* move around - if references move backwards in the ordering GC
* uses, GC could skip past them
*/
int bch2_gc(struct bch_fs *c, struct list_head *journal, bool initial)
int bch2_gc(struct bch_fs *c, struct list_head *journal,
bool initial, bool metadata_only)
{
struct bch_dev *ca;
u64 start_time = local_clock();
@ -713,7 +718,7 @@ int bch2_gc(struct bch_fs *c, struct list_head *journal, bool initial)
down_write(&c->gc_lock);
again:
percpu_down_write(&c->mark_lock);
ret = bch2_gc_start(c);
ret = bch2_gc_start(c, metadata_only);
percpu_up_write(&c->mark_lock);
if (ret)
@ -721,7 +726,7 @@ again:
bch2_mark_superblocks(c);
ret = bch2_gc_btrees(c, journal, initial);
ret = bch2_gc_btrees(c, journal, initial, metadata_only);
if (ret)
goto out;
@ -755,7 +760,7 @@ out:
percpu_down_write(&c->mark_lock);
if (!ret)
bch2_gc_done(c, initial);
ret = bch2_gc_done(c, initial, metadata_only);
/* Indicates that gc is no longer in progress: */
__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
@ -1157,7 +1162,7 @@ static int bch2_gc_thread(void *arg)
last = atomic_long_read(&clock->now);
last_kick = atomic_read(&c->kick_gc);
ret = bch2_gc(c, NULL, false);
ret = bch2_gc(c, NULL, false, false);
if (ret)
bch_err(c, "btree gc failed: %i", ret);

View File

@ -4,7 +4,7 @@
#include "btree_types.h"
void bch2_coalesce(struct bch_fs *);
int bch2_gc(struct bch_fs *, struct list_head *, bool);
int bch2_gc(struct bch_fs *, struct list_head *, bool, bool);
void bch2_gc_thread_stop(struct bch_fs *);
int bch2_gc_thread_start(struct bch_fs *);
void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned);

View File

@ -509,7 +509,7 @@ static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
bytes);
nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE));
nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
}
bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
@ -770,7 +770,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
struct btree_node *sorted;
struct bkey_packed *k;
struct bset *i;
bool used_mempool;
bool used_mempool, blacklisted;
unsigned u64s;
int ret, retry_read = 0, write = READ;
@ -844,20 +844,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
b->written += sectors;
ret = bch2_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b);
if (ret < 0) {
btree_err(BTREE_ERR_FATAL, c, b, i,
"insufficient memory");
goto err;
}
blacklisted = bch2_journal_seq_is_blacklisted(c,
le64_to_cpu(i->journal_seq),
true);
if (ret) {
btree_err_on(first,
BTREE_ERR_FIXABLE, c, b, i,
"first btree node bset has blacklisted journal seq");
if (!first)
continue;
}
btree_err_on(blacklisted && first,
BTREE_ERR_FIXABLE, c, b, i,
"first btree node bset has blacklisted journal seq");
if (blacklisted && !first)
continue;
bch2_btree_node_iter_large_push(iter, b,
i->start,
@ -930,7 +925,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
out:
mempool_free(iter, &c->fill_iter);
return retry_read;
err:
fsck_err:
if (ret == BTREE_RETRY_READ) {
retry_read = 1;

View File

@ -818,14 +818,6 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
struct btree_iter *linked;
unsigned level = b->level;
/* caller now responsible for unlocking @b */
BUG_ON(iter->l[level].b != b);
BUG_ON(!btree_node_intent_locked(iter, level));
iter->l[level].b = BTREE_ITER_NOT_END;
mark_btree_node_unlocked(iter, level);
trans_for_each_iter(iter->trans, linked)
if (linked->l[level].b == b) {
__btree_node_unlock(linked, level);
@ -990,6 +982,7 @@ retry_all:
}
if (unlikely(ret == -EIO)) {
trans->error = true;
iter->flags |= BTREE_ITER_ERROR;
iter->l[iter->level].b = BTREE_ITER_NOT_END;
goto out;
@ -1162,6 +1155,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
if (!btree_iter_node(iter, iter->level))
return NULL;
bch2_trans_cond_resched(iter->trans);
btree_iter_up(iter);
if (!bch2_btree_node_relock(iter, iter->level))
@ -1712,7 +1707,7 @@ void bch2_trans_preload_iters(struct btree_trans *trans)
static int btree_trans_iter_alloc(struct btree_trans *trans)
{
unsigned idx = ffz(trans->iters_linked);
unsigned idx = __ffs64(~trans->iters_linked);
if (idx < trans->nr_iters)
goto got_slot;
@ -1877,17 +1872,17 @@ void *bch2_trans_kmalloc(struct btree_trans *trans,
int bch2_trans_unlock(struct btree_trans *trans)
{
unsigned iters = trans->iters_linked;
u64 iters = trans->iters_linked;
int ret = 0;
while (iters) {
unsigned idx = __ffs(iters);
unsigned idx = __ffs64(iters);
struct btree_iter *iter = &trans->iters[idx];
ret = ret ?: btree_iter_err(iter);
__bch2_btree_iter_unlock(iter);
iters ^= 1 << idx;
iters ^= 1ULL << idx;
}
return ret;
@ -1949,7 +1944,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c)
int bch2_trans_exit(struct btree_trans *trans)
{
int ret = bch2_trans_unlock(trans);
bch2_trans_unlock(trans);
kfree(trans->mem);
if (trans->used_mempool)
@ -1958,5 +1953,6 @@ int bch2_trans_exit(struct btree_trans *trans)
kfree(trans->iters);
trans->mem = (void *) 0x1;
trans->iters = (void *) 0x1;
return ret;
return trans->error ? -EIO : 0;
}
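
Several changes above widen the iters_linked handling from unsigned to u64: ffz() becomes __ffs64(~mask), __ffs() becomes __ffs64(), and 1 << idx becomes 1ULL << idx, since a 32-bit shift is undefined once more than 32 iterators exist. A minimal sketch of the corrected walk, using a GCC/Clang builtin in place of the kernel's __ffs64():

#include <stdint.h>
#include <stdio.h>

static unsigned ffs64_ex(uint64_t v)
{
	return (unsigned) __builtin_ctzll(v);	/* undefined for v == 0 */
}

int main(void)
{
	uint64_t linked = (1ULL << 3) | (1ULL << 40);	/* slot 40 needs 1ULL */

	while (linked) {
		unsigned idx = ffs64_ex(linked);

		printf("visiting iter slot %u\n", idx);
		linked ^= 1ULL << idx;	/* 1 << idx would be UB for idx >= 32 */
	}
	return 0;
}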

View File

@ -279,6 +279,7 @@ struct btree_trans {
u8 nr_updates;
u8 size;
unsigned used_mempool:1;
unsigned error:1;
unsigned mem_top;
unsigned mem_bytes;

View File

@ -161,7 +161,6 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
{
struct bch_fs *c = as->c;
struct pending_btree_node_free *d;
struct gc_pos pos = { 0 };
for (d = as->pending; d < as->pending + as->nr_pending; d++)
if (!bkey_cmp(k.k->p, d->key.k.p) &&
@ -189,18 +188,12 @@ found:
* to cancel out one of mark and sweep's markings if necessary:
*/
/*
* bch2_mark_key() compares the current gc pos to the pos we're
* moving this reference from, hence one comparison here:
*/
if (gc_pos_cmp(c->gc_pos, b
? gc_pos_btree_node(b)
: gc_pos_btree_root(as->btree_id)) >= 0 &&
gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0)
bch2_mark_key_locked(c,
bkey_i_to_s_c(&d->key),
false, 0, pos,
NULL, 0, BCH_BUCKET_MARK_GC);
bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key),
false, 0, NULL, 0, BCH_BUCKET_MARK_GC);
}
static void __btree_node_free(struct bch_fs *c, struct btree *b)
@ -272,8 +265,11 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c,
bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
false, 0,
gc_phase(GC_PHASE_PENDING_DELETE),
NULL, 0, 0);
if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE)))
bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
false, 0, NULL, 0, BCH_BUCKET_MARK_GC);
}
static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
@ -1078,9 +1074,11 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
fs_usage = bch2_fs_usage_scratch_get(c);
bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
true, 0,
gc_pos_btree_root(b->btree_id),
fs_usage, 0, 0);
true, 0, fs_usage, 0, 0);
if (gc_visited(c, gc_pos_btree_root(b->btree_id)))
bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
true, 0, NULL, 0,
BCH_BUCKET_MARK_GC);
if (old && !btree_node_fake(old))
bch2_btree_node_free_index(as, NULL,
@ -1172,8 +1170,11 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
fs_usage = bch2_fs_usage_scratch_get(c);
bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
true, 0,
gc_pos_btree_node(b), fs_usage, 0, 0);
true, 0, fs_usage, 0, 0);
if (gc_visited(c, gc_pos_btree_node(b)))
bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
true, 0, NULL, 0, BCH_BUCKET_MARK_GC);
while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
bkey_iter_pos_cmp(b, &insert->k.p, k) > 0)
@ -1428,6 +1429,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
/* Successful split, update the iterator to point to the new nodes: */
six_lock_increment(&b->lock, SIX_LOCK_intent);
bch2_btree_iter_node_drop(iter, b);
if (n3)
bch2_btree_iter_node_replace(iter, n3);
@ -1739,7 +1741,10 @@ retry:
bch2_open_buckets_put(c, &n->ob);
six_lock_increment(&b->lock, SIX_LOCK_intent);
bch2_btree_iter_node_drop(iter, b);
bch2_btree_iter_node_drop(iter, m);
bch2_btree_iter_node_replace(iter, n);
bch2_btree_iter_verify(iter, n);
@ -1837,6 +1842,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
bch2_open_buckets_put(c, &n->ob);
six_lock_increment(&b->lock, SIX_LOCK_intent);
bch2_btree_iter_node_drop(iter, b);
bch2_btree_iter_node_replace(iter, n);
bch2_btree_node_free_inmem(c, b, iter);
@ -1988,9 +1994,12 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
fs_usage = bch2_fs_usage_scratch_get(c);
bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i),
true, 0,
gc_pos_btree_root(b->btree_id),
fs_usage, 0, 0);
true, 0, fs_usage, 0, 0);
if (gc_visited(c, gc_pos_btree_root(b->btree_id)))
bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i),
true, 0, NULL, 0,
BCH_BUCKET_MARK_GC);
bch2_btree_node_free_index(as, NULL,
bkey_i_to_s_c(&b->key),
fs_usage);

View File

@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
@ -601,10 +602,17 @@ static inline int do_btree_insert_at(struct btree_trans *trans,
}
trans_for_each_update_iter(trans, i)
bch2_mark_update(trans, i, fs_usage);
bch2_mark_update(trans, i, fs_usage, 0);
if (fs_usage)
bch2_trans_fs_usage_apply(trans, fs_usage);
if (unlikely(c->gc_pos.phase)) {
trans_for_each_update_iter(trans, i)
if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
bch2_mark_update(trans, i, NULL,
BCH_BUCKET_MARK_GC);
}
trans_for_each_update(trans, i)
do_btree_insert_one(trans, i);
out:
@ -852,12 +860,15 @@ out_noupdates:
BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR);
bch2_trans_unlink_iters(trans, trans->iters_unlink_on_commit);
if (!ret) {
bch2_trans_unlink_iters(trans, ~trans->iters_touched);
bch2_trans_unlink_iters(trans, ~trans->iters_touched|
trans->iters_unlink_on_commit);
trans->iters_touched = 0;
} else {
bch2_trans_unlink_iters(trans, trans->iters_unlink_on_commit);
}
trans->nr_updates = 0;
trans->nr_updates = 0;
trans->mem_top = 0;
return ret;
err:

View File

@ -131,6 +131,8 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
switch (e->data_type) {
case BCH_DATA_BTREE:
usage->btree += usage->replicas[i];
break;
case BCH_DATA_USER:
usage->data += usage->replicas[i];
break;
@ -225,6 +227,7 @@ static u64 avail_factor(u64 r)
u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage)
{
return min(fs_usage->hidden +
fs_usage->btree +
fs_usage->data +
reserve_factor(fs_usage->reserved +
fs_usage->online_reserved),
@ -240,7 +243,8 @@ __bch2_fs_usage_read_short(struct bch_fs *c)
ret.capacity = c->capacity -
percpu_u64_get(&c->usage[0]->hidden);
data = percpu_u64_get(&c->usage[0]->data);
data = percpu_u64_get(&c->usage[0]->data) +
percpu_u64_get(&c->usage[0]->btree);
reserved = percpu_u64_get(&c->usage[0]->reserved) +
percpu_u64_get(&c->usage[0]->online_reserved);
@ -383,21 +387,32 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
bch2_wake_allocator(ca);
}
void bch2_dev_usage_from_buckets(struct bch_fs *c, struct bch_dev *ca)
void bch2_dev_usage_from_buckets(struct bch_fs *c)
{
struct bch_dev *ca;
struct bucket_mark old = { .v.counter = 0 };
struct bch_fs_usage *fs_usage;
struct bucket_array *buckets;
struct bucket *g;
unsigned i;
int cpu;
percpu_down_read_preempt_disable(&c->mark_lock);
fs_usage = this_cpu_ptr(c->usage[0]);
buckets = bucket_array(ca);
percpu_u64_set(&c->usage[0]->hidden, 0);
for_each_bucket(g, buckets)
if (g->mark.data_type)
bch2_dev_usage_update(c, ca, fs_usage, old, g->mark, false);
percpu_up_read_preempt_enable(&c->mark_lock);
for_each_member_device(ca, c, i) {
for_each_possible_cpu(cpu)
memset(per_cpu_ptr(ca->usage[0], cpu), 0,
sizeof(*ca->usage[0]));
preempt_disable();
fs_usage = this_cpu_ptr(c->usage[0]);
buckets = bucket_array(ca);
for_each_bucket(g, buckets)
bch2_dev_usage_update(c, ca, fs_usage,
old, g->mark, false);
preempt_enable();
}
}
#define bucket_data_cmpxchg(c, ca, fs_usage, g, new, expr) \
@ -418,10 +433,17 @@ static inline void update_replicas(struct bch_fs *c,
BUG_ON(idx < 0);
BUG_ON(!sectors);
if (r->data_type == BCH_DATA_CACHED)
fs_usage->cached += sectors;
else
switch (r->data_type) {
case BCH_DATA_BTREE:
fs_usage->btree += sectors;
break;
case BCH_DATA_USER:
fs_usage->data += sectors;
break;
case BCH_DATA_CACHED:
fs_usage->cached += sectors;
break;
}
fs_usage->replicas[idx] += sectors;
}
@ -924,12 +946,13 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
return 0;
}
static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
bool inserting, s64 sectors,
struct bch_fs_usage *fs_usage,
unsigned journal_seq, unsigned flags,
bool gc)
int bch2_mark_key_locked(struct bch_fs *c,
struct bkey_s_c k,
bool inserting, s64 sectors,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags)
{
bool gc = flags & BCH_BUCKET_MARK_GC;
int ret = 0;
preempt_disable();
@ -981,21 +1004,8 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
return ret;
}
int bch2_mark_key_locked(struct bch_fs *c,
struct bkey_s_c k,
bool inserting, s64 sectors,
struct gc_pos pos,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags)
{
return do_mark_fn(__bch2_mark_key, c, pos, flags,
k, inserting, sectors, fs_usage,
journal_seq, flags);
}
int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
bool inserting, s64 sectors,
struct gc_pos pos,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags)
{
@ -1003,7 +1013,7 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
percpu_down_read_preempt_disable(&c->mark_lock);
ret = bch2_mark_key_locked(c, k, inserting, sectors,
pos, fs_usage, journal_seq, flags);
fs_usage, journal_seq, flags);
percpu_up_read_preempt_enable(&c->mark_lock);
return ret;
@ -1011,13 +1021,13 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
void bch2_mark_update(struct btree_trans *trans,
struct btree_insert_entry *insert,
struct bch_fs_usage *fs_usage)
struct bch_fs_usage *fs_usage,
unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_iter *iter = insert->iter;
struct btree *b = iter->l[0].b;
struct btree_node_iter node_iter = iter->l[0].iter;
struct gc_pos pos = gc_pos_btree_node(b);
struct bkey_packed *_k;
if (!btree_node_type_needs_gc(iter->btree_id))
@ -1027,7 +1037,7 @@ void bch2_mark_update(struct btree_trans *trans,
bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
bpos_min(insert->k->k.p, b->key.k.p).offset -
bkey_start_offset(&insert->k->k),
pos, fs_usage, trans->journal_res.seq, 0);
fs_usage, trans->journal_res.seq, flags);
while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
KEY_TYPE_discard))) {
@ -1060,7 +1070,8 @@ void bch2_mark_update(struct btree_trans *trans,
BUG_ON(sectors <= 0);
bch2_mark_key_locked(c, k, true, sectors,
pos, fs_usage, trans->journal_res.seq, 0);
fs_usage, trans->journal_res.seq,
flags);
sectors = bkey_start_offset(&insert->k->k) -
k.k->p.offset;
@ -1071,7 +1082,7 @@ void bch2_mark_update(struct btree_trans *trans,
}
bch2_mark_key_locked(c, k, false, sectors,
pos, fs_usage, trans->journal_res.seq, 0);
fs_usage, trans->journal_res.seq, flags);
bch2_btree_node_iter_advance(&node_iter, b);
}

View File

@ -173,7 +173,7 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
void bch2_dev_usage_from_buckets(struct bch_fs *, struct bch_dev *);
void bch2_dev_usage_from_buckets(struct bch_fs *);
static inline u64 __dev_buckets_available(struct bch_dev *ca,
struct bch_dev_usage stats)
@ -245,16 +245,16 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
#define BCH_BUCKET_MARK_NOATOMIC (1 << 1)
int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c,
bool, s64, struct gc_pos,
struct bch_fs_usage *, u64, unsigned);
bool, s64, struct bch_fs_usage *,
u64, unsigned);
int bch2_mark_key(struct bch_fs *, struct bkey_s_c,
bool, s64, struct gc_pos,
struct bch_fs_usage *, u64, unsigned);
bool, s64, struct bch_fs_usage *,
u64, unsigned);
int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
struct disk_reservation *);
void bch2_mark_update(struct btree_trans *, struct btree_insert_entry *,
struct bch_fs_usage *);
struct bch_fs_usage *, unsigned);
void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *);
/* disk reservations: */

View File

@ -69,6 +69,7 @@ struct bch_fs_usage {
u64 gc_start[0];
u64 hidden;
u64 btree;
u64 data;
u64 cached;
u64 reserved;

View File

@ -9,7 +9,7 @@
#include <linux/random.h>
#include <linux/scatterlist.h>
#include <crypto/algapi.h>
#include <crypto/chacha20.h>
#include <crypto/chacha.h>
#include <crypto/hash.h>
#include <crypto/poly1305.h>
#include <keys/user-type.h>

View File

@ -6,7 +6,7 @@
#include "super-io.h"
#include <linux/crc64.h>
#include <crypto/chacha20.h>
#include <crypto/chacha.h>
static inline u64 bch2_crc64_update(u64 crc, const void *p, size_t len)
{
@ -126,9 +126,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
/* for skipping ahead and encrypting/decrypting at an offset: */
static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
{
EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1));
EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE);
le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
return nonce;
}
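
nonce_add() advances the first 32-bit word of the nonce — ChaCha's block counter — by the number of 64-byte blocks covered by offset, which is why offset must be block aligned (the EBUG_ON above). A host-order sketch of the arithmetic, omitting the little-endian handling the real le32_add_cpu() does:

#include <stdint.h>
#include <stdio.h>

#define CHACHA_BLOCK_SIZE 64

struct nonce_ex { uint32_t d[4]; };

static struct nonce_ex nonce_add_ex(struct nonce_ex nonce, unsigned offset)
{
	/* callers guarantee offset is a multiple of CHACHA_BLOCK_SIZE */
	nonce.d[0] += offset / CHACHA_BLOCK_SIZE;
	return nonce;
}

int main(void)
{
	struct nonce_ex n = {{ 0, 1, 2, 3 }};

	n = nonce_add_ex(n, 4096);		/* skip 4096 / 64 = 64 blocks */
	printf("block counter: %u\n", n.d[0]);	/* prints 64 */
	return 0;
}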

View File

@ -328,17 +328,18 @@ out:
return inum;
}
int bch2_empty_dir(struct bch_fs *c, u64 dir_inum)
int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
int ret = 0;
bch2_trans_init(&trans, c);
iter = bch2_trans_get_iter(trans, BTREE_ID_DIRENTS,
POS(dir_inum, 0), 0);
if (IS_ERR(iter))
return PTR_ERR(iter);
for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS,
POS(dir_inum, 0), 0, k) {
for_each_btree_key_continue(iter, 0, k) {
if (k.k->p.inode > dir_inum)
break;
@ -347,11 +348,17 @@ int bch2_empty_dir(struct bch_fs *c, u64 dir_inum)
break;
}
}
bch2_trans_exit(&trans);
bch2_trans_iter_put(trans, iter);
return ret;
}
int bch2_empty_dir(struct bch_fs *c, u64 dir_inum)
{
return bch2_trans_do(c, NULL, 0,
bch2_empty_dir_trans(&trans, dir_inum));
}
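
bch2_empty_dir() is now a thin wrapper: bch2_trans_do() evidently declares a trans in scope (the call site passes &trans), evaluates the expression, and handles transaction setup and teardown. The macro body is not part of this diff; a rough function-pointer analogue of the pattern, purely illustrative:

/* Hypothetical analogue of the bch2_trans_do() shape; not the real macro. */
static int trans_do_ex(struct bch_fs *c,
		       int (*fn)(struct btree_trans *, uint64_t),
		       uint64_t arg)
{
	struct btree_trans trans;
	int ret;

	bch2_trans_init(&trans, c);
	do {
		ret = fn(&trans, arg);	/* e.g. bch2_empty_dir_trans */
	} while (ret == -EINTR);	/* assumed: restart on lock restarts */

	return bch2_trans_exit(&trans) ?: ret;
}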
int bch2_readdir(struct bch_fs *c, struct file *file,
struct dir_context *ctx)
{

View File

@ -54,6 +54,7 @@ int bch2_dirent_rename(struct btree_trans *,
u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *,
const struct qstr *);
int bch2_empty_dir_trans(struct btree_trans *, u64);
int bch2_empty_dir(struct bch_fs *, u64);
int bch2_readdir(struct bch_fs *, struct file *, struct dir_context *);

View File

@ -1231,10 +1231,7 @@ int bch2_stripes_write(struct bch_fs *c, bool *wrote)
static void bch2_stripe_read_key(struct bch_fs *c, struct bkey_s_c k)
{
struct gc_pos pos = { 0 };
bch2_mark_key(c, k, true, 0, pos, NULL, 0, 0);
bch2_mark_key(c, k, true, 0, NULL, 0, 0);
}
int bch2_stripes_read(struct bch_fs *c, struct list_head *journal_replay_list)

View File

@ -757,7 +757,7 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
EBUG_ON(!PageLocked(page));
EBUG_ON(!PageLocked(newpage));
ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
ret = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
if (ret != MIGRATEPAGE_SUCCESS)
return ret;

View File

@ -265,7 +265,7 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return -EPERM;
down_write(&sb->s_umount);
sb->s_flags |= MS_RDONLY;
sb->s_flags |= SB_RDONLY;
bch2_fs_emergency_read_only(c);
up_write(&sb->s_umount);
return 0;

View File

@ -1582,7 +1582,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data)
struct bch_opts opts = bch2_opts_empty();
int ret;
opt_set(opts, read_only, (*flags & MS_RDONLY) != 0);
opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
ret = bch2_parse_mount_opts(&opts, data);
if (ret)
@ -1594,7 +1594,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data)
if (opts.read_only) {
bch2_fs_read_only(c);
sb->s_flags |= MS_RDONLY;
sb->s_flags |= SB_RDONLY;
} else {
ret = bch2_fs_read_write(c);
if (ret) {
@ -1603,7 +1603,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data)
return -EINVAL;
}
sb->s_flags &= ~MS_RDONLY;
sb->s_flags &= ~SB_RDONLY;
}
c->opts.read_only = opts.read_only;
@ -1681,7 +1681,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
unsigned i;
int ret;
opt_set(opts, read_only, (flags & MS_RDONLY) != 0);
opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
ret = bch2_parse_mount_opts(&opts, data);
if (ret)
@ -1691,7 +1691,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
if (IS_ERR(c))
return ERR_CAST(c);
sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|MS_NOSEC, c);
sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|SB_NOSEC, c);
if (IS_ERR(sb)) {
closure_put(&c->cl);
return ERR_CAST(sb);
@ -1702,7 +1702,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
if (sb->s_root) {
closure_put(&c->cl);
if ((flags ^ sb->s_flags) & MS_RDONLY) {
if ((flags ^ sb->s_flags) & SB_RDONLY) {
ret = -EBUSY;
goto err_put_super;
}
@ -1745,7 +1745,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
if (c->opts.acl)
sb->s_flags |= MS_POSIXACL;
sb->s_flags |= SB_POSIXACL;
#endif
vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO);
@ -1760,7 +1760,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
goto err_put_super;
}
sb->s_flags |= MS_ACTIVE;
sb->s_flags |= SB_ACTIVE;
out:
return dget(sb->s_root);

View File

@ -69,11 +69,6 @@ static inline unsigned nlink_bias(umode_t mode)
return S_ISDIR(mode) ? 2 : 1;
}
static inline u64 bch2_current_time(struct bch_fs *c)
{
return timespec_to_bch2_time(c, current_kernel_time64());
}
static inline bool inode_attr_changing(struct bch_inode_info *dir,
struct bch_inode_info *inode,
enum inode_opt_id id)

View File

@ -127,18 +127,21 @@ static struct inode_walker inode_walker_init(void)
};
}
static int walk_inode(struct bch_fs *c, struct inode_walker *w, u64 inum)
static int walk_inode(struct btree_trans *trans,
struct inode_walker *w, u64 inum)
{
w->first_this_inode = inum != w->cur_inum;
w->cur_inum = inum;
if (w->first_this_inode) {
int ret = bch2_inode_find_by_inum(c, inum, &w->inode);
if (inum != w->cur_inum) {
int ret = bch2_inode_find_by_inum_trans(trans, inum,
&w->inode);
if (ret && ret != -ENOENT)
return ret;
w->have_inode = !ret;
w->have_inode = !ret;
w->cur_inum = inum;
w->first_this_inode = true;
} else {
w->first_this_inode = false;
}
return 0;
@ -444,12 +447,15 @@ static int check_extents(struct bch_fs *c)
int ret = 0;
bch2_trans_init(&trans, c);
bch2_trans_preload_iters(&trans);
bch_verbose(c, "checking extents");
for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
POS(BCACHEFS_ROOT_INO, 0), 0, k) {
ret = walk_inode(c, &w, k.k->p.inode);
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
POS(BCACHEFS_ROOT_INO, 0), 0);
retry:
for_each_btree_key_continue(iter, 0, k) {
ret = walk_inode(&trans, &w, k.k->p.inode);
if (ret)
break;
@ -514,6 +520,8 @@ static int check_extents(struct bch_fs *c)
}
err:
fsck_err:
if (ret == -EINTR)
goto retry;
return bch2_trans_exit(&trans) ?: ret;
}
@ -536,21 +544,20 @@ static int check_dirents(struct bch_fs *c)
bch_verbose(c, "checking dirents");
bch2_trans_init(&trans, c);
bch2_trans_preload_iters(&trans);
iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS,
POS(BCACHEFS_ROOT_INO, 0), 0);
hash_check_init(&h);
iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS,
POS(BCACHEFS_ROOT_INO, 0), 0);
retry:
for_each_btree_key_continue(iter, 0, k) {
struct bkey_s_c_dirent d;
struct bch_inode_unpacked target;
bool have_target;
u64 d_inum;
ret = walk_inode(c, &w, k.k->p.inode);
ret = walk_inode(&trans, &w, k.k->p.inode);
if (ret)
break;
@ -619,7 +626,7 @@ static int check_dirents(struct bch_fs *c)
continue;
}
ret = bch2_inode_find_by_inum(c, d_inum, &target);
ret = bch2_inode_find_by_inum_trans(&trans, d_inum, &target);
if (ret && ret != -ENOENT)
break;
@ -670,6 +677,9 @@ static int check_dirents(struct bch_fs *c)
hash_stop_chain(&trans, &h);
err:
fsck_err:
if (ret == -EINTR)
goto retry;
return bch2_trans_exit(&trans) ?: ret;
}
@ -688,17 +698,16 @@ static int check_xattrs(struct bch_fs *c)
bch_verbose(c, "checking xattrs");
bch2_trans_init(&trans, c);
hash_check_init(&h);
bch2_trans_init(&trans, c);
bch2_trans_preload_iters(&trans);
iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS,
POS(BCACHEFS_ROOT_INO, 0), 0);
hash_check_init(&h);
retry:
for_each_btree_key_continue(iter, 0, k) {
ret = walk_inode(c, &w, k.k->p.inode);
ret = walk_inode(&trans, &w, k.k->p.inode);
if (ret)
break;
@ -721,6 +730,8 @@ static int check_xattrs(struct bch_fs *c)
}
err:
fsck_err:
if (ret == -EINTR)
goto retry;
return bch2_trans_exit(&trans) ?: ret;
}
@ -904,6 +915,7 @@ static int check_directory_structure(struct bch_fs *c,
int ret = 0;
bch2_trans_init(&trans, c);
bch2_trans_preload_iters(&trans);
bch_verbose(c, "checking directory structure");
@ -918,9 +930,8 @@ restart_dfs:
}
ret = path_down(&path, BCACHEFS_ROOT_INO);
if (ret) {
return ret;
}
if (ret)
goto err;
while (path.nr) {
next:
@ -982,14 +993,19 @@ up:
path.nr--;
}
for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k) {
iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS_MIN, 0);
retry:
for_each_btree_key_continue(iter, 0, k) {
if (k.k->type != KEY_TYPE_inode)
continue;
if (!S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode)))
continue;
if (!bch2_empty_dir(c, k.k->p.inode))
ret = bch2_empty_dir_trans(&trans, k.k->p.inode);
if (ret == -EINTR)
goto retry;
if (!ret)
continue;
if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.inode), c,
@ -1017,15 +1033,12 @@ up:
memset(&path, 0, sizeof(path));
goto restart_dfs;
}
out:
kfree(dirs_done.bits);
kfree(path.entries);
return ret;
err:
fsck_err:
ret = bch2_trans_exit(&trans) ?: ret;
goto out;
kfree(dirs_done.bits);
kfree(path.entries);
return ret;
}
struct nlink {
@ -1069,6 +1082,7 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links,
int ret;
bch2_trans_init(&trans, c);
bch2_trans_preload_iters(&trans);
inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false);
@ -1225,12 +1239,10 @@ static int check_inode(struct btree_trans *trans,
return ret;
}
if (u.bi_flags & BCH_INODE_UNLINKED) {
fsck_err_on(c->sb.clean, c,
"filesystem marked clean, "
"but inode %llu unlinked",
u.bi_inum);
if (u.bi_flags & BCH_INODE_UNLINKED &&
(!c->sb.clean ||
fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
u.bi_inum))) {
bch_verbose(c, "deleting inode %llu", u.bi_inum);
ret = bch2_inode_rm(c, u.bi_inum);
@ -1240,12 +1252,10 @@ static int check_inode(struct btree_trans *trans,
return ret;
}
if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY) {
fsck_err_on(c->sb.clean, c,
"filesystem marked clean, "
"but inode %llu has i_size dirty",
u.bi_inum);
if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY &&
(!c->sb.clean ||
fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty",
u.bi_inum))) {
bch_verbose(c, "truncating inode %llu", u.bi_inum);
/*
@ -1270,14 +1280,12 @@ static int check_inode(struct btree_trans *trans,
do_update = true;
}
if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY) {
if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY &&
(!c->sb.clean ||
fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty",
u.bi_inum))) {
s64 sectors;
fsck_err_on(c->sb.clean, c,
"filesystem marked clean, "
"but inode %llu has i_sectors dirty",
u.bi_inum);
bch_verbose(c, "recounting sectors for inode %llu",
u.bi_inum);
@ -1326,6 +1334,7 @@ static int bch2_gc_walk_inodes(struct bch_fs *c,
u64 nlinks_pos;
bch2_trans_init(&trans, c);
bch2_trans_preload_iters(&trans);
iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES,
POS(range_start, 0), 0);
@ -1425,6 +1434,7 @@ static int check_inodes_fast(struct bch_fs *c)
int ret = 0, ret2;
bch2_trans_init(&trans, c);
bch2_trans_preload_iters(&trans);
iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES,
POS_MIN, 0);

View File

@ -251,9 +251,7 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
struct bch_inode_unpacked *parent)
{
s64 now = timespec_to_bch2_time(c,
timespec64_trunc(current_kernel_time64(),
c->sb.time_precision));
s64 now = bch2_current_time(c);
memset(inode_u, 0, sizeof(*inode_u));
@ -445,31 +443,32 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
return ret;
}
int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
struct bch_inode_unpacked *inode)
int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
struct bch_inode_unpacked *inode)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
int ret = -ENOENT;
bch2_trans_init(&trans, c);
iter = bch2_trans_get_iter(trans, BTREE_ID_INODES,
POS(inode_nr, 0), BTREE_ITER_SLOTS);
if (IS_ERR(iter))
return PTR_ERR(iter);
for_each_btree_key(&trans, iter, BTREE_ID_INODES,
POS(inode_nr, 0), BTREE_ITER_SLOTS, k) {
switch (k.k->type) {
case KEY_TYPE_inode:
ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
break;
default:
/* hole, not found */
break;
}
k = bch2_btree_iter_peek_slot(iter);
if (k.k->type == KEY_TYPE_inode)
ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
break;
}
bch2_trans_iter_put(trans, iter);
return bch2_trans_exit(&trans) ?: ret;
return ret;
}
int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
struct bch_inode_unpacked *inode)
{
return bch2_trans_do(c, NULL, 0,
bch2_inode_find_by_inum_trans(&trans, inode_nr, inode));
}
#ifdef CONFIG_BCACHEFS_DEBUG

View File

@ -3,8 +3,6 @@
#include "opts.h"
#include <linux/math64.h>
extern const char * const bch2_inode_opts[];
const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c);
@ -59,23 +57,9 @@ int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *,
int bch2_inode_rm(struct bch_fs *, u64);
int bch2_inode_find_by_inum(struct bch_fs *, u64,
struct bch_inode_unpacked *);
static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time)
{
return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo);
}
static inline u64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts)
{
s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo;
if (c->sb.time_precision == 1)
return ns;
return div_s64(ns, c->sb.time_precision);
}
int bch2_inode_find_by_inum_trans(struct btree_trans *, u64,
struct bch_inode_unpacked *);
int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *);
static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode)
{

View File

@ -992,27 +992,57 @@ void bch2_fs_journal_stop(struct journal *j)
cancel_delayed_work_sync(&j->reclaim_work);
}
void bch2_fs_journal_start(struct journal *j)
int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
struct list_head *journal_entries)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_seq_blacklist *bl;
u64 blacklist = 0;
struct journal_entry_pin_list *p;
struct journal_replay *i;
u64 last_seq = cur_seq, nr, seq;
list_for_each_entry(bl, &j->seq_blacklist, list)
blacklist = max(blacklist, bl->end);
if (!list_empty(journal_entries))
last_seq = le64_to_cpu(list_last_entry(journal_entries,
struct journal_replay,
list)->j.last_seq);
nr = cur_seq - last_seq;
if (nr + 1 > j->pin.size) {
free_fifo(&j->pin);
init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL);
if (!j->pin.data) {
bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
return -ENOMEM;
}
}
j->last_seq_ondisk = last_seq;
j->pin.front = last_seq;
j->pin.back = cur_seq;
atomic64_set(&j->seq, cur_seq - 1);
fifo_for_each_entry_ptr(p, &j->pin, seq) {
INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, 0);
p->devs.nr = 0;
}
list_for_each_entry(i, journal_entries, list) {
seq = le64_to_cpu(i->j.seq);
BUG_ON(seq < last_seq || seq >= cur_seq);
p = journal_seq_pin(j, seq);
atomic_set(&p->count, 1);
p->devs = i->devs;
}
spin_lock(&j->lock);
set_bit(JOURNAL_STARTED, &j->flags);
while (journal_cur_seq(j) < blacklist)
journal_pin_new_entry(j, 0);
/*
* __journal_entry_close() only inits the next journal entry when it
* closes an open journal entry - the very first journal entry gets
* initialized here:
*/
journal_pin_new_entry(j, 1);
bch2_journal_buf_init(j);
@ -1021,12 +1051,7 @@ void bch2_fs_journal_start(struct journal *j)
bch2_journal_space_available(j);
spin_unlock(&j->lock);
/*
* Adding entries to the next journal entry before allocating space on
* disk for the next journal entry - this is ok, because these entries
* only have to go down with the next journal entry we write:
*/
bch2_journal_seq_blacklist_write(j);
return 0;
}
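
The new startup path sizes the pin FIFO from the replay window: nr = cur_seq - last_seq sequence numbers are still open, the FIFO must hold nr + 1 entries, and the allocation is rounded up to a power of two. Worked numbers, standalone (values invented):

#include <stdint.h>
#include <stdio.h>

static uint64_t roundup_pow_of_two_ex(uint64_t n)
{
	uint64_t r = 1;

	while (r < n)
		r <<= 1;
	return r;
}

int main(void)
{
	uint64_t last_seq = 90, cur_seq = 101;	/* from the newest entry's last_seq */
	uint64_t nr = cur_seq - last_seq;	/* 11 sequence numbers to pin */

	printf("fifo size: %llu\n",	/* 16: first power of two >= nr + 1 */
	       (unsigned long long) roundup_pow_of_two_ex(nr + 1));
	printf("pin window: [%llu, %llu)\n",	/* front = last_seq, back = cur_seq */
	       (unsigned long long) last_seq,
	       (unsigned long long) cur_seq);
	return 0;
}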
/* init/exit: */
@ -1091,8 +1116,6 @@ int bch2_fs_journal_init(struct journal *j)
INIT_DELAYED_WORK(&j->write_work, journal_write_work);
INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work);
init_waitqueue_head(&j->pin_flush_wait);
mutex_init(&j->blacklist_lock);
INIT_LIST_HEAD(&j->seq_blacklist);
mutex_init(&j->reclaim_lock);
mutex_init(&j->discard_lock);

View File

@ -469,8 +469,10 @@ int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
int bch2_dev_journal_alloc(struct bch_dev *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
void bch2_fs_journal_stop(struct journal *);
void bch2_fs_journal_start(struct journal *);
int bch2_fs_journal_start(struct journal *, u64, struct list_head *);
void bch2_dev_journal_exit(struct bch_dev *);
int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
void bch2_fs_journal_exit(struct journal *);

View File

@ -9,7 +9,6 @@
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"
#include <trace/events/bcachefs.h>
@ -655,45 +654,11 @@ void bch2_journal_entries_free(struct list_head *list)
}
}
int bch2_journal_set_seq(struct bch_fs *c, u64 last_seq, u64 end_seq)
{
struct journal *j = &c->journal;
struct journal_entry_pin_list *p;
u64 seq, nr = end_seq - last_seq + 1;
if (nr > j->pin.size) {
free_fifo(&j->pin);
init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
if (!j->pin.data) {
bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
return -ENOMEM;
}
}
atomic64_set(&j->seq, end_seq);
j->last_seq_ondisk = last_seq;
j->pin.front = last_seq;
j->pin.back = end_seq + 1;
fifo_for_each_entry_ptr(p, &j->pin, seq) {
INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, 0);
p->devs.nr = 0;
}
return 0;
}
int bch2_journal_read(struct bch_fs *c, struct list_head *list)
{
struct journal *j = &c->journal;
struct journal_list jlist;
struct journal_replay *i;
struct journal_entry_pin_list *p;
struct bch_dev *ca;
u64 cur_seq, end_seq;
unsigned iter;
size_t keys = 0, entries = 0;
bool degraded = false;
@ -725,17 +690,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
if (jlist.ret)
return jlist.ret;
if (list_empty(list)) {
bch_err(c, "no journal entries found");
return BCH_FSCK_REPAIR_IMPOSSIBLE;
}
list_for_each_entry(i, list, list) {
struct jset_entry *entry;
struct bkey_i *k, *_n;
struct bch_replicas_padded replicas;
char buf[80];
bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs);
ret = jset_validate_entries(c, &i->j, READ);
if (ret)
goto fsck_err;
@ -745,6 +705,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
* the devices - this is wrong:
*/
bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs);
if (!degraded &&
(test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c,
@ -755,68 +717,18 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
if (ret)
return ret;
}
}
i = list_last_entry(list, struct journal_replay, list);
ret = bch2_journal_set_seq(c,
le64_to_cpu(i->j.last_seq),
le64_to_cpu(i->j.seq));
if (ret)
return ret;
mutex_lock(&j->blacklist_lock);
list_for_each_entry(i, list, list) {
p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
atomic_set(&p->count, 1);
p->devs = i->devs;
if (bch2_journal_seq_blacklist_read(j, i)) {
mutex_unlock(&j->blacklist_lock);
return -ENOMEM;
}
}
mutex_unlock(&j->blacklist_lock);
cur_seq = journal_last_seq(j);
end_seq = le64_to_cpu(list_last_entry(list,
struct journal_replay, list)->j.seq);
list_for_each_entry(i, list, list) {
struct jset_entry *entry;
struct bkey_i *k, *_n;
bool blacklisted;
mutex_lock(&j->blacklist_lock);
while (cur_seq < le64_to_cpu(i->j.seq) &&
bch2_journal_seq_blacklist_find(j, cur_seq))
cur_seq++;
blacklisted = bch2_journal_seq_blacklist_find(j,
le64_to_cpu(i->j.seq));
mutex_unlock(&j->blacklist_lock);
fsck_err_on(blacklisted, c,
"found blacklisted journal entry %llu",
le64_to_cpu(i->j.seq));
fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c,
"journal entries %llu-%llu missing! (replaying %llu-%llu)",
cur_seq, le64_to_cpu(i->j.seq) - 1,
journal_last_seq(j), end_seq);
cur_seq = le64_to_cpu(i->j.seq) + 1;
for_each_jset_key(k, _n, entry, &i->j)
keys++;
entries++;
}
bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
keys, entries, journal_cur_seq(j));
if (!list_empty(list)) {
i = list_last_entry(list, struct journal_replay, list);
bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
keys, entries, le64_to_cpu(i->j.seq));
}
fsck_err:
return ret;
}
@ -876,8 +788,9 @@ static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k)
* but - there are other correctness issues if btree gc were to run
* before journal replay finishes
*/
BUG_ON(c->gc_pos.phase);
bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size),
gc_pos_btree_node(iter->l[0].b),
NULL, 0, 0);
bch2_trans_exit(&trans);

View File

@ -34,7 +34,6 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \
vstruct_for_each_safe(entry, k, _n)
int bch2_journal_set_seq(struct bch_fs *c, u64, u64);
int bch2_journal_read(struct bch_fs *, struct list_head *);
void bch2_journal_entries_free(struct list_head *);
int bch2_journal_replay(struct bch_fs *, struct list_head *);

View File

@ -1,12 +1,9 @@
#include "bcachefs.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "btree_iter.h"
#include "eytzinger.h"
#include "journal_seq_blacklist.h"
#include "super-io.h"
/*
* journal_seq_blacklist machinery:
@ -36,327 +33,285 @@
* record that it was blacklisted so that a) on recovery we don't think we have
* missing journal entries and b) so that the btree code continues to ignore
* that bset, until that btree node is rewritten.
*
* Blacklisted journal sequence numbers are themselves recorded as entries in
* the journal.
*/
/*
* Called when journal needs to evict a blacklist entry to reclaim space: find
* any btree nodes that refer to the blacklist journal sequence numbers, and
* rewrite them:
*/
static void journal_seq_blacklist_flush(struct journal *j,
struct journal_entry_pin *pin, u64 seq)
static unsigned
blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl)
{
struct bch_fs *c =
container_of(j, struct bch_fs, journal);
struct journal_seq_blacklist *bl =
container_of(pin, struct journal_seq_blacklist, pin);
struct blacklisted_node n;
struct closure cl;
unsigned i;
int ret;
closure_init_stack(&cl);
for (i = 0;; i++) {
struct btree_trans trans;
struct btree_iter *iter;
struct btree *b;
bch2_trans_init(&trans, c);
mutex_lock(&j->blacklist_lock);
if (i >= bl->nr_entries) {
mutex_unlock(&j->blacklist_lock);
break;
}
n = bl->entries[i];
mutex_unlock(&j->blacklist_lock);
iter = bch2_trans_get_node_iter(&trans, n.btree_id, n.pos,
0, 0, 0);
b = bch2_btree_iter_peek_node(iter);
/* The node might have already been rewritten: */
if (b->data->keys.seq == n.seq) {
ret = bch2_btree_node_rewrite(c, iter, n.seq, 0);
if (ret) {
bch2_trans_exit(&trans);
bch2_fs_fatal_error(c,
"error %i rewriting btree node with blacklisted journal seq",
ret);
bch2_journal_halt(j);
return;
}
}
bch2_trans_exit(&trans);
}
for (i = 0;; i++) {
struct btree_update *as;
struct pending_btree_node_free *d;
mutex_lock(&j->blacklist_lock);
if (i >= bl->nr_entries) {
mutex_unlock(&j->blacklist_lock);
break;
}
n = bl->entries[i];
mutex_unlock(&j->blacklist_lock);
redo_wait:
mutex_lock(&c->btree_interior_update_lock);
/*
* Is the node on the list of pending interior node updates -
* being freed? If so, wait for that to finish:
*/
for_each_pending_btree_node_free(c, as, d)
if (n.seq == d->seq &&
n.btree_id == d->btree_id &&
!d->level &&
!bkey_cmp(n.pos, d->key.k.p)) {
closure_wait(&as->wait, &cl);
mutex_unlock(&c->btree_interior_update_lock);
closure_sync(&cl);
goto redo_wait;
}
mutex_unlock(&c->btree_interior_update_lock);
}
mutex_lock(&j->blacklist_lock);
bch2_journal_pin_drop(j, &bl->pin);
list_del(&bl->list);
kfree(bl->entries);
kfree(bl);
mutex_unlock(&j->blacklist_lock);
return bl
? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) /
sizeof(struct journal_seq_blacklist_entry))
: 0;
}
/*
* Determine if a particular sequence number is blacklisted - if so, return
* blacklist entry:
*/
struct journal_seq_blacklist *
bch2_journal_seq_blacklist_find(struct journal *j, u64 seq)
static unsigned sb_blacklist_u64s(unsigned nr)
{
struct journal_seq_blacklist *bl;
struct bch_sb_field_journal_seq_blacklist *bl;
lockdep_assert_held(&j->blacklist_lock);
return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64);
}
list_for_each_entry(bl, &j->seq_blacklist, list)
if (seq >= bl->start && seq <= bl->end)
return bl;
static struct bch_sb_field_journal_seq_blacklist *
blacklist_entry_try_merge(struct bch_fs *c,
struct bch_sb_field_journal_seq_blacklist *bl,
unsigned i)
{
unsigned nr = blacklist_nr_entries(bl);
if (le64_to_cpu(bl->start[i].end) >=
le64_to_cpu(bl->start[i + 1].start)) {
bl->start[i].end = bl->start[i + 1].end;
--nr;
memmove(&bl->start[i],
&bl->start[i + 1],
sizeof(bl->start[0]) * (nr - i));
bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb,
sb_blacklist_u64s(nr));
BUG_ON(!bl);
}
return bl;
}
int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
{
struct bch_sb_field_journal_seq_blacklist *bl;
unsigned i, nr;
int ret = 0;
mutex_lock(&c->sb_lock);
bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
nr = blacklist_nr_entries(bl);
if (bl) {
for (i = 0; i < nr; i++) {
struct journal_seq_blacklist_entry *e =
bl->start + i;
if (start == le64_to_cpu(e->start) &&
end == le64_to_cpu(e->end))
goto out;
if (start <= le64_to_cpu(e->start) &&
end >= le64_to_cpu(e->end)) {
e->start = cpu_to_le64(start);
e->end = cpu_to_le64(end);
if (i + 1 < nr)
bl = blacklist_entry_try_merge(c,
bl, i);
if (i)
bl = blacklist_entry_try_merge(c,
bl, i - 1);
goto out_write_sb;
}
}
}
bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb,
sb_blacklist_u64s(nr + 1));
if (!bl) {
ret = -ENOMEM;
goto out;
}
bl->start[nr].start = cpu_to_le64(start);
bl->start[nr].end = cpu_to_le64(end);
out_write_sb:
c->disk_sb.sb->features[0] |=
1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3;
ret = bch2_write_super(c);
out:
mutex_unlock(&c->sb_lock);
return ret;
}
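
bch2_journal_seq_blacklist_add() keeps the superblock list sorted and non-overlapping: an exact duplicate is a no-op, a range covering an existing entry widens it in place, and widened entries are coalesced with their neighbours when they touch. A compact standalone sketch of that coalescing rule (array-based, hypothetical names; the real code also resizes the superblock field):

#include <stdint.h>
#include <string.h>

struct bl_range { uint64_t start, end; };

/*
 * If ranges i and i + 1 touch or overlap, fold i + 1 into i and
 * close the gap; returns the new entry count.
 */
static unsigned bl_try_merge(struct bl_range *r, unsigned nr, unsigned i)
{
	if (i + 1 < nr && r[i].end >= r[i + 1].start) {
		r[i].end = r[i + 1].end > r[i].end ? r[i + 1].end : r[i].end;
		memmove(&r[i + 1], &r[i + 2],
			sizeof(r[0]) * (nr - i - 2));
		nr--;
	}
	return nr;
}

int main(void)
{
	struct bl_range r[] = { { 10, 20 }, { 20, 30 }, { 40, 50 } };
	unsigned nr = bl_try_merge(r, 3, 0);	/* -> {10,30}, {40,50} */

	return nr == 2 ? 0 : 1;
}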
static int journal_seq_blacklist_table_cmp(const void *_l,
const void *_r, size_t size)
{
const struct journal_seq_blacklist_table_entry *l = _l;
const struct journal_seq_blacklist_table_entry *r = _r;
return (l->start > r->start) - (l->start < r->start);
}
bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq,
bool dirty)
{
struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
struct journal_seq_blacklist_table_entry search = { .start = seq };
int idx;
if (!t)
return false;
idx = eytzinger0_find_le(t->entries, t->nr,
sizeof(t->entries[0]),
journal_seq_blacklist_table_cmp,
&search);
if (idx < 0)
return false;
BUG_ON(t->entries[idx].start > seq);
if (seq >= t->entries[idx].end)
return false;
if (dirty)
t->entries[idx].dirty = true;
return true;
}
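
The lookup above uses eytzinger0_find_le() to find the last entry whose start is <= seq, then checks whether seq falls before that entry's end (ends are exclusive: seq >= end means not blacklisted). The same semantics with an ordinary sorted array and binary search, as a sketch:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct bl_entry { uint64_t start, end; };	/* end exclusive */

/* Index of the last entry with start <= seq, or -1 if none. */
static int find_le(const struct bl_entry *e, size_t nr, uint64_t seq)
{
	int l = 0, r = (int) nr - 1, ret = -1;

	while (l <= r) {
		int m = l + (r - l) / 2;

		if (e[m].start <= seq) {
			ret = m;
			l = m + 1;
		} else {
			r = m - 1;
		}
	}
	return ret;
}

static bool seq_is_blacklisted(const struct bl_entry *e, size_t nr,
			       uint64_t seq)
{
	int idx = find_le(e, nr, seq);

	return idx >= 0 && seq < e[idx].end;
}

int main(void)
{
	const struct bl_entry bl[] = { { 5, 10 }, { 20, 25 } };

	/* seq 7 falls in [5,10): blacklisted; seq 10 does not. */
	return seq_is_blacklisted(bl, 2, 7) &&
	       !seq_is_blacklisted(bl, 2, 10) ? 0 : 1;
}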
int bch2_blacklist_table_initialize(struct bch_fs *c)
{
struct bch_sb_field_journal_seq_blacklist *bl =
bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
struct journal_seq_blacklist_table *t;
unsigned i, nr = blacklist_nr_entries(bl);
BUG_ON(c->journal_seq_blacklist_table);
if (!bl)
return 0;
t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr,
GFP_KERNEL);
if (!t)
return -ENOMEM;
t->nr = nr;
for (i = 0; i < nr; i++) {
t->entries[i].start = le64_to_cpu(bl->start[i].start);
t->entries[i].end = le64_to_cpu(bl->start[i].end);
}
eytzinger0_sort(t->entries,
t->nr,
sizeof(t->entries[0]),
journal_seq_blacklist_table_cmp,
NULL);
c->journal_seq_blacklist_table = t;
return 0;
}
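
eytzinger0_sort()/eytzinger0_find_le() operate on an array laid out in BFS order of an implicit balanced tree — element i's children live at 2i+1 and 2i+2 — which keeps the binary search cache-friendly. A sketch of producing that layout from sorted input via an in-order walk of the implicit tree (illustrative; the kernel helpers may build it differently):

#include <stdio.h>

/* In-order walk of the implicit tree assigns sorted values to BFS slots. */
static void eytzinger_fill(const int *sorted, int *out, int n,
			   int *next, int i)
{
	if (i >= n)
		return;
	eytzinger_fill(sorted, out, n, next, 2 * i + 1);	/* left */
	out[i] = sorted[(*next)++];				/* node */
	eytzinger_fill(sorted, out, n, next, 2 * i + 2);	/* right */
}

int main(void)
{
	int sorted[] = { 1, 2, 3, 4, 5, 6, 7 }, out[7], next = 0, i;

	eytzinger_fill(sorted, out, 7, &next, 0);
	for (i = 0; i < 7; i++)
		printf("%d ", out[i]);	/* 4 2 6 1 3 5 7 */
	printf("\n");
	return 0;
}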
static const char *
bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb,
struct bch_sb_field *f)
{
struct bch_sb_field_journal_seq_blacklist *bl =
field_to_type(f, journal_seq_blacklist);
struct journal_seq_blacklist_entry *i;
unsigned nr = blacklist_nr_entries(bl);
for (i = bl->start; i < bl->start + nr; i++) {
if (le64_to_cpu(i->start) >=
le64_to_cpu(i->end))
return "entry start >= end";
if (i + 1 < bl->start + nr &&
le64_to_cpu(i[0].end) >
le64_to_cpu(i[1].start))
return "entries out of order";
}
return NULL;
}
/*
 * Allocate a new, in memory blacklist entry:
 */
static struct journal_seq_blacklist *
bch2_journal_seq_blacklisted_new(struct journal *j, u64 start, u64 end)
{
	struct journal_seq_blacklist *bl;

	lockdep_assert_held(&j->blacklist_lock);

	/*
	 * When we start the journal, bch2_journal_start() will skip over @seq:
	 */
	bl = kzalloc(sizeof(*bl), GFP_KERNEL);
	if (!bl)
		return NULL;

	bl->start	= start;
	bl->end		= end;

	list_add_tail(&bl->list, &j->seq_blacklist);
	return bl;
}
static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out,
						  struct bch_sb *sb,
						  struct bch_sb_field *f)
{
	struct bch_sb_field_journal_seq_blacklist *bl =
		field_to_type(f, journal_seq_blacklist);
	struct journal_seq_blacklist_entry *i;
	unsigned nr = blacklist_nr_entries(bl);

	for (i = bl->start; i < bl->start + nr; i++) {
		if (i != bl->start)
			pr_buf(out, " ");

		pr_buf(out, "%llu-%llu",
		       le64_to_cpu(i->start),
		       le64_to_cpu(i->end));
	}
}
/*
 * Returns true if @seq is newer than the most recent journal entry that got
 * written, and data corresponding to @seq should be ignored - also marks @seq
 * as blacklisted so that on future restarts the corresponding data will still
 * be ignored:
 */
int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b)
{
	struct journal *j = &c->journal;
	struct journal_seq_blacklist *bl = NULL;
	struct blacklisted_node *n;
	u64 journal_seq;
	int ret = 0;

	if (!seq)
		return 0;

	spin_lock(&j->lock);
	journal_seq = journal_cur_seq(j);
	spin_unlock(&j->lock);

	/* Interior updates aren't journalled: */
	BUG_ON(b->level);
	BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags));

	/*
	 * Decrease this back to j->seq + 2 when we next rev the on disk format:
	 * increasing it temporarily to work around bug in old kernels
	 */
	fsck_err_on(seq > journal_seq + 4, c,
		    "bset journal seq too far in the future: %llu > %llu",
		    seq, journal_seq);

	if (seq <= journal_seq &&
	    list_empty_careful(&j->seq_blacklist))
		return 0;

	mutex_lock(&j->blacklist_lock);

	if (seq <= journal_seq) {
		bl = bch2_journal_seq_blacklist_find(j, seq);
		if (!bl)
			goto out;
	} else {
		bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting",
			    b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq);

		if (!j->new_blacklist) {
			j->new_blacklist = bch2_journal_seq_blacklisted_new(j,
						journal_seq + 1,
						journal_seq + 1);
			if (!j->new_blacklist) {
				ret = -ENOMEM;
				goto out;
			}
		}

		bl = j->new_blacklist;
		bl->end = max(bl->end, seq);
	}

	for (n = bl->entries; n < bl->entries + bl->nr_entries; n++)
		if (b->data->keys.seq	== n->seq &&
		    b->btree_id		== n->btree_id &&
		    !bkey_cmp(b->key.k.p, n->pos))
			goto found_entry;

	if (!bl->nr_entries ||
	    is_power_of_2(bl->nr_entries)) {
		n = krealloc(bl->entries,
			     max_t(size_t, bl->nr_entries * 2, 8) * sizeof(*n),
			     GFP_KERNEL);
		if (!n) {
			ret = -ENOMEM;
			goto out;
		}
		bl->entries = n;
	}

	bl->entries[bl->nr_entries++] = (struct blacklisted_node) {
		.seq		= b->data->keys.seq,
		.btree_id	= b->btree_id,
		.pos		= b->key.k.p,
	};
found_entry:
	ret = 1;
out:
fsck_err:
	mutex_unlock(&j->blacklist_lock);
	return ret;
}
static int __bch2_journal_seq_blacklist_read(struct journal *j,
					     struct journal_replay *i,
					     u64 start, u64 end)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_seq_blacklist *bl;

	bch_verbose(c, "blacklisting existing journal seq %llu-%llu",
		    start, end);

	bl = bch2_journal_seq_blacklisted_new(j, start, end);
	if (!bl)
		return -ENOMEM;

	bch2_journal_pin_add(j, le64_to_cpu(i->j.seq), &bl->pin,
			     journal_seq_blacklist_flush);
	return 0;
}

/*
 * After reading the journal, find existing journal seq blacklist entries and
 * read them into memory:
 */
int bch2_journal_seq_blacklist_read(struct journal *j,
				    struct journal_replay *i)
{
	struct jset_entry *entry;
	int ret = 0;

	vstruct_for_each(&i->j, entry) {
		switch (entry->type) {
		case BCH_JSET_ENTRY_blacklist: {
			struct jset_entry_blacklist *bl_entry =
				container_of(entry, struct jset_entry_blacklist, entry);

			ret = __bch2_journal_seq_blacklist_read(j, i,
					le64_to_cpu(bl_entry->seq),
					le64_to_cpu(bl_entry->seq));
			break;
		}
		case BCH_JSET_ENTRY_blacklist_v2: {
			struct jset_entry_blacklist_v2 *bl_entry =
				container_of(entry, struct jset_entry_blacklist_v2, entry);

			ret = __bch2_journal_seq_blacklist_read(j, i,
					le64_to_cpu(bl_entry->start),
					le64_to_cpu(bl_entry->end));
			break;
		}
		}

		if (ret)
			break;
	}

	return ret;
}
/*
 * After reading the journal and walking the btree, we might have new journal
 * sequence numbers to blacklist - add entries to the next journal entry to be
 * written:
 */
void bch2_journal_seq_blacklist_write(struct journal *j)
{
	struct journal_seq_blacklist *bl = j->new_blacklist;
	struct jset_entry_blacklist_v2 *bl_entry;
	struct jset_entry *entry;

	if (!bl)
		return;

	entry = bch2_journal_add_entry_noreservation(journal_cur_buf(j),
			(sizeof(*bl_entry) - sizeof(*entry)) / sizeof(u64));

	bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
	bl_entry->entry.type	= BCH_JSET_ENTRY_blacklist_v2;
	bl_entry->start		= cpu_to_le64(bl->start);
	bl_entry->end		= cpu_to_le64(bl->end);

	bch2_journal_pin_add(j,
			     journal_cur_seq(j),
			     &bl->pin,
			     journal_seq_blacklist_flush);

	j->new_blacklist = NULL;
}
const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = {
	.validate	= bch2_sb_journal_seq_blacklist_validate,
	.to_text	= bch2_sb_journal_seq_blacklist_to_text
};

void bch2_blacklist_entries_gc(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs,
					journal_seq_blacklist_gc_work);
	struct journal_seq_blacklist_table *t;
	struct bch_sb_field_journal_seq_blacklist *bl;
	struct journal_seq_blacklist_entry *src, *dst;
	struct btree_trans trans;
	unsigned i, nr, new_nr;
	int ret;

	bch2_trans_init(&trans, c);

	for (i = 0; i < BTREE_ID_NR; i++) {
		struct btree_iter *iter;
		struct btree *b;

		for_each_btree_node(&trans, iter, i, POS_MIN,
				    BTREE_ITER_PREFETCH, b)
			if (test_bit(BCH_FS_STOPPING, &c->flags)) {
				bch2_trans_exit(&trans);
				return;
			}
		bch2_trans_iter_free(&trans, iter);
	}

	ret = bch2_trans_exit(&trans);
	if (ret)
		return;

	mutex_lock(&c->sb_lock);
	bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
	if (!bl)
		goto out;

	nr = blacklist_nr_entries(bl);
	dst = bl->start;

	t = c->journal_seq_blacklist_table;
	BUG_ON(nr != t->nr);

	for (src = bl->start, i = eytzinger0_first(t->nr);
	     src < bl->start + nr;
	     src++, i = eytzinger0_next(i, nr)) {
		BUG_ON(t->entries[i].start	!= le64_to_cpu(src->start));
		BUG_ON(t->entries[i].end	!= le64_to_cpu(src->end));

		if (t->entries[i].dirty)
			*dst++ = *src;
	}

	new_nr = dst - bl->start;

	bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr);

	if (new_nr != nr) {
		bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb,
				new_nr ? sb_blacklist_u64s(new_nr) : 0);
		BUG_ON(new_nr && !bl);

		if (!new_nr)
			c->disk_sb.sb->features[0] &=
				~(1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3);

		bch2_write_super(c);
	}
out:
	mutex_unlock(&c->sb_lock);
}
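The heart of the GC pass is the compaction loop: bch2_journal_seq_is_blacklisted() sets the dirty flag whenever a bset in some btree node still falls in a blacklisted range, so after walking every btree, any entry never marked dirty can be dropped from the superblock. A standalone sketch of that compaction on plain arrays (not the bcachefs structures):

#include <stdbool.h>
#include <stdio.h>

struct bl_entry {
	unsigned long long start, end;
	bool dirty;
};

/* keep only entries still referenced by a btree node; returns new_nr */
static unsigned compact_blacklist(struct bl_entry *e, unsigned nr)
{
	struct bl_entry *dst = e, *src;

	for (src = e; src < e + nr; src++)
		if (src->dirty)
			*dst++ = *src;

	return dst - e;
}

int main(void)
{
	struct bl_entry e[] = {
		{ 1, 2, false }, { 10, 15, true }, { 30, 31, false },
	};
	unsigned new_nr = compact_blacklist(e, 3);

	printf("nr blacklist entries was 3, now %u (%llu-%llu)\n",
	       new_nr, e[0].start, e[0].end);
	return 0;
}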

View File

@@ -1,13 +1,12 @@
#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
struct journal_replay;
bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool);
int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64);
int bch2_blacklist_table_initialize(struct bch_fs *);
struct journal_seq_blacklist *
bch2_journal_seq_blacklist_find(struct journal *, u64);
int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *);
int bch2_journal_seq_blacklist_read(struct journal *,
struct journal_replay *);
void bch2_journal_seq_blacklist_write(struct journal *);
extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist;
void bch2_blacklist_entries_gc(struct work_struct *);
#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */

View File

@@ -53,24 +53,6 @@ struct journal_entry_pin {
u64 seq;
};
/* corresponds to a btree node with a blacklisted bset: */
struct blacklisted_node {
__le64 seq;
enum btree_id btree_id;
struct bpos pos;
};
struct journal_seq_blacklist {
struct list_head list;
u64 start;
u64 end;
struct journal_entry_pin pin;
struct blacklisted_node *entries;
size_t nr_entries;
};
struct journal_res {
bool ref;
u8 idx;
@@ -221,10 +203,6 @@ struct journal {
u64 replay_journal_seq;
struct mutex blacklist_lock;
struct list_head seq_blacklist;
struct journal_seq_blacklist *new_blacklist;
struct write_point wp;
spinlock_t err_lock;

View File

@@ -208,7 +208,8 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
up_read(&ca->bucket_lock);
if (sectors_not_moved && !ret)
bch_warn(c, "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved",
bch_warn_ratelimited(c,
"copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved",
sectors_not_moved, sectors_to_move,
buckets_not_moved, buckets_to_move);

View File

@@ -457,7 +457,7 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
{
struct bch_fs *c = sb->s_fs_info;
if (sb->s_flags & MS_RDONLY)
if (sb->s_flags & SB_RDONLY)
return -EROFS;
/* Accounting must be enabled at mount time: */
@@ -494,7 +494,7 @@ static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
{
struct bch_fs *c = sb->s_fs_info;
if (sb->s_flags & MS_RDONLY)
if (sb->s_flags & SB_RDONLY)
return -EROFS;
mutex_lock(&c->sb_lock);
@@ -518,7 +518,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
struct bch_fs *c = sb->s_fs_info;
int ret;
if (sb->s_flags & MS_RDONLY)
if (sb->s_flags & SB_RDONLY)
return -EROFS;
if (uflags & FS_USER_QUOTA) {
@@ -600,7 +600,7 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
struct bch_sb_field_quota *sb_quota;
struct bch_memquota_type *q;
if (sb->s_flags & MS_RDONLY)
if (sb->s_flags & SB_RDONLY)
return -EROFS;
if (type >= QTYP_NR)
@@ -719,7 +719,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid,
struct bkey_i_quota new_quota;
int ret;
if (sb->s_flags & MS_RDONLY)
if (sb->s_flags & SB_RDONLY)
return -EROFS;
bkey_quota_init(&new_quota.k_i);

View File

@@ -11,6 +11,7 @@
#include "error.h"
#include "fsck.h"
#include "journal_io.h"
#include "journal_seq_blacklist.h"
#include "quota.h"
#include "recovery.h"
#include "replicas.h"
@@ -51,6 +52,118 @@ found:
return k;
}
static int verify_superblock_clean(struct bch_fs *c,
				   struct bch_sb_field_clean **cleanp,
				   struct jset *j)
{
	unsigned i;
	struct bch_sb_field_clean *clean = *cleanp;
	int ret = 0;

	if (!clean || !j)
		return 0;

	if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
			"superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
			le64_to_cpu(clean->journal_seq),
			le64_to_cpu(j->seq))) {
		kfree(clean);
		*cleanp = NULL;
		return 0;
	}

	mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
			"superblock read clock doesn't match journal after clean shutdown");
	mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
			"superblock read clock doesn't match journal after clean shutdown");

	for (i = 0; i < BTREE_ID_NR; i++) {
		struct bkey_i *k1, *k2;
		unsigned l1 = 0, l2 = 0;

		k1 = btree_root_find(c, clean, NULL, i, &l1);
		k2 = btree_root_find(c, NULL, j, i, &l2);

		if (!k1 && !k2)
			continue;

		mustfix_fsck_err_on(!k1 || !k2 ||
				    IS_ERR(k1) ||
				    IS_ERR(k2) ||
				    k1->k.u64s != k2->k.u64s ||
				    memcmp(k1, k2, bkey_bytes(k1)) ||
				    l1 != l2, c,
			"superblock btree root doesn't match journal after clean shutdown");
	}
fsck_err:
	return ret;
}
static int
verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c,
						  struct list_head *journal)
{
	struct journal_replay *i =
		list_last_entry(journal, struct journal_replay, list);
	u64 start_seq	= le64_to_cpu(i->j.last_seq);
	u64 end_seq	= le64_to_cpu(i->j.seq);
	u64 seq		= start_seq;
	int ret = 0;

	list_for_each_entry(i, journal, list) {
		fsck_err_on(seq != le64_to_cpu(i->j.seq), c,
			"journal entries %llu-%llu missing! (replaying %llu-%llu)",
			seq, le64_to_cpu(i->j.seq) - 1,
			start_seq, end_seq);

		seq = le64_to_cpu(i->j.seq);

		fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c,
			    "found blacklisted journal entry %llu", seq);

		do {
			seq++;
		} while (bch2_journal_seq_is_blacklisted(c, seq, false));
	}
fsck_err:
	return ret;
}
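The walk above depends on every hole in the sequence numbers being covered by a blacklist range: after each entry it advances seq past any blacklisted numbers, so an unexplained gap surfaces as a missing-entries error. A self-contained sketch of the same check, with hypothetical sequence numbers and a hypothetical blacklist:

#include <stdbool.h>
#include <stdio.h>

static bool blacklisted(unsigned long long seq)
{
	return seq >= 4 && seq < 6;	/* pretend seqs 4-5 are blacklisted */
}

int main(void)
{
	/* journal entries actually read, in order; 4 and 5 are absent */
	const unsigned long long entries[] = { 2, 3, 6, 7 };
	unsigned long long expect = entries[0];
	unsigned i;

	for (i = 0; i < 4; i++) {
		if (entries[i] != expect)
			printf("journal entries %llu-%llu missing!\n",
			       expect, entries[i] - 1);
		if (blacklisted(entries[i]))
			printf("found blacklisted journal entry %llu\n",
			       entries[i]);

		expect = entries[i];
		do {
			expect++;
		} while (blacklisted(expect));
	}
	return 0;	/* prints nothing: the 4-5 gap is blacklisted */
}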
static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c)
{
	struct bch_sb_field_clean *clean, *sb_clean;
	int ret;

	mutex_lock(&c->sb_lock);
	sb_clean = bch2_sb_get_clean(c->disk_sb.sb);

	if (fsck_err_on(!sb_clean, c,
			"superblock marked clean but clean section not present")) {
		SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
		c->sb.clean = false;
		mutex_unlock(&c->sb_lock);
		return NULL;
	}

	clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
			GFP_KERNEL);
	if (!clean) {
		mutex_unlock(&c->sb_lock);
		return ERR_PTR(-ENOMEM);
	}

	if (le16_to_cpu(c->disk_sb.sb->version) <
	    bcachefs_metadata_version_bkey_renumber)
		bch2_sb_clean_renumber(clean, READ);

	mutex_unlock(&c->sb_lock);

	return clean;
fsck_err:
	mutex_unlock(&c->sb_lock);
	return ERR_PTR(ret);
}
static int journal_replay_entry_early(struct bch_fs *c,
struct jset_entry *entry)
{
@@ -100,54 +213,108 @@ static int journal_replay_entry_early(struct bch_fs *c,
			      le64_to_cpu(u->v));
		break;
	}
	case BCH_JSET_ENTRY_blacklist: {
		struct jset_entry_blacklist *bl_entry =
			container_of(entry, struct jset_entry_blacklist, entry);

		ret = bch2_journal_seq_blacklist_add(c,
				le64_to_cpu(bl_entry->seq),
				le64_to_cpu(bl_entry->seq) + 1);
		break;
	}
	case BCH_JSET_ENTRY_blacklist_v2: {
		struct jset_entry_blacklist_v2 *bl_entry =
			container_of(entry, struct jset_entry_blacklist_v2, entry);

		ret = bch2_journal_seq_blacklist_add(c,
				le64_to_cpu(bl_entry->start),
				le64_to_cpu(bl_entry->end) + 1);
		break;
	}
	}

	return ret;
}
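Note the "+ 1" in both cases: the on-disk journal entries are inclusive (v1 stores a single seq, v2 an inclusive start..end), while bch2_journal_seq_blacklist_add() takes a half-open [start, end) range. A tiny sketch of the conversion:

#include <assert.h>

struct range { unsigned long long start, end; };	/* half-open [start, end) */

static struct range from_v1(unsigned long long seq)
{
	return (struct range){ seq, seq + 1 };		/* just @seq */
}

static struct range from_v2(unsigned long long start, unsigned long long end)
{
	return (struct range){ start, end + 1 };	/* inclusive -> half-open */
}

int main(void)
{
	assert(from_v1(7).end == 8);	/* covers exactly seq 7 */
	assert(from_v2(4, 5).end == 6);	/* covers seqs 4 and 5 */
	return 0;
}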
static int verify_superblock_clean(struct bch_fs *c,
				   struct bch_sb_field_clean **cleanp,
				   struct jset *j)
{
	unsigned i;
	struct bch_sb_field_clean *clean = *cleanp;
	int ret = 0;

	if (!clean || !j)
		return 0;

	if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
			"superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
			le64_to_cpu(clean->journal_seq),
			le64_to_cpu(j->seq))) {
		kfree(clean);
		*cleanp = NULL;
		return 0;
	}

	mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
			"superblock read clock doesn't match journal after clean shutdown");
	mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
			"superblock read clock doesn't match journal after clean shutdown");

	for (i = 0; i < BTREE_ID_NR; i++) {
		struct bkey_i *k1, *k2;
		unsigned l1 = 0, l2 = 0;

		k1 = btree_root_find(c, clean, NULL, i, &l1);
		k2 = btree_root_find(c, NULL, j, i, &l2);

		if (!k1 && !k2)
			continue;

		mustfix_fsck_err_on(!k1 || !k2 ||
				    IS_ERR(k1) ||
				    IS_ERR(k2) ||
				    k1->k.u64s != k2->k.u64s ||
				    memcmp(k1, k2, bkey_bytes(k1)) ||
				    l1 != l2, c,
			"superblock btree root doesn't match journal after clean shutdown");
	}
fsck_err:
	return ret;
}
static int journal_replay_early(struct bch_fs *c,
				struct bch_sb_field_clean *clean,
				struct list_head *journal)
{
	struct jset_entry *entry;
	int ret;

	if (clean) {
		c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
		c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);

		for (entry = clean->start;
		     entry != vstruct_end(&clean->field);
		     entry = vstruct_next(entry)) {
			ret = journal_replay_entry_early(c, entry);
			if (ret)
				return ret;
		}
	} else {
		struct journal_replay *i =
			list_last_entry(journal, struct journal_replay, list);

		c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
		c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);

		list_for_each_entry(i, journal, list)
			vstruct_for_each(&i->j, entry) {
				ret = journal_replay_entry_early(c, entry);
				if (ret)
					return ret;
			}
	}

	bch2_fs_usage_initialize(c);

	return 0;
}

static int read_btree_roots(struct bch_fs *c)
{
	unsigned i;
	int ret = 0;

	for (i = 0; i < BTREE_ID_NR; i++) {
		struct btree_root *r = &c->btree_roots[i];

		if (!r->alive)
			continue;

		if (i == BTREE_ID_ALLOC &&
		    test_reconstruct_alloc(c)) {
			c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
			continue;
		}

		if (r->error) {
			__fsck_err(c, i == BTREE_ID_ALLOC
				   ? FSCK_CAN_IGNORE : 0,
				   "invalid btree root %s",
				   bch2_btree_ids[i]);
			if (i == BTREE_ID_ALLOC)
				c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
		}

		ret = bch2_btree_root_read(c, i, &r->key, r->level);
		if (ret) {
			__fsck_err(c, i == BTREE_ID_ALLOC
				   ? FSCK_CAN_IGNORE : 0,
				   "error reading btree root %s",
				   bch2_btree_ids[i]);
			if (i == BTREE_ID_ALLOC)
				c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
		}
	}

	for (i = 0; i < BTREE_ID_NR; i++)
		if (!c->btree_roots[i].b)
			bch2_btree_root_alloc(c, i);
fsck_err:
	return ret;
}
@@ -185,119 +352,82 @@ static bool journal_empty(struct list_head *journal)
int bch2_fs_recovery(struct bch_fs *c)
{
const char *err = "cannot allocate memory";
	struct bch_sb_field_clean *clean = NULL, *sb_clean = NULL;
	struct jset_entry *entry;
	LIST_HEAD(journal);
	struct jset *j = NULL;
	unsigned i;
	bool run_gc = c->opts.fsck ||
		!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO));
	int ret;

	mutex_lock(&c->sb_lock);

	if (!c->replicas.entries) {
		bch_info(c, "building replicas info");
		set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
	}

	if (c->sb.clean)
		sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
	if (sb_clean) {
		clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
				GFP_KERNEL);
		if (!clean) {
			ret = -ENOMEM;
			mutex_unlock(&c->sb_lock);
			goto err;
		}

		if (le16_to_cpu(c->disk_sb.sb->version) <
		    bcachefs_metadata_version_bkey_renumber)
			bch2_sb_clean_renumber(clean, READ);
	}
	mutex_unlock(&c->sb_lock);

	if (clean)
		bch_info(c, "recovering from clean shutdown, journal seq %llu",
			 le64_to_cpu(clean->journal_seq));

	if (!clean || c->opts.fsck) {
		ret = bch2_journal_read(c, &journal);
		if (ret)
			goto err;

		j = &list_entry(journal.prev, struct journal_replay, list)->j;
	} else {
		ret = bch2_journal_set_seq(c,
					   le64_to_cpu(clean->journal_seq),
					   le64_to_cpu(clean->journal_seq));
		BUG_ON(ret);
	}

	ret = verify_superblock_clean(c, &clean, j);
	if (ret)
		goto err;

	fsck_err_on(clean && !journal_empty(&journal), c,
		    "filesystem marked clean but journal not empty");

	err = "insufficient memory";
	if (clean) {
		c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
		c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);

		for (entry = clean->start;
		     entry != vstruct_end(&clean->field);
		     entry = vstruct_next(entry)) {
			ret = journal_replay_entry_early(c, entry);
			if (ret)
				goto err;
		}
	} else {
		struct journal_replay *i;

		c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock);
		c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock);

		list_for_each_entry(i, &journal, list)
			vstruct_for_each(&i->j, entry) {
				ret = journal_replay_entry_early(c, entry);
				if (ret)
					goto err;
			}
	}

	bch2_fs_usage_initialize(c);

	for (i = 0; i < BTREE_ID_NR; i++) {
		struct btree_root *r = &c->btree_roots[i];

		if (!r->alive)
			continue;

		err = "invalid btree root pointer";
		ret = -1;
		if (r->error)
			goto err;

		if (i == BTREE_ID_ALLOC &&
		    test_reconstruct_alloc(c))
			continue;

		err = "error reading btree root";
		ret = bch2_btree_root_read(c, i, &r->key, r->level);
		if (ret) {
			if (i != BTREE_ID_ALLOC)
				goto err;

			mustfix_fsck_err(c, "error reading btree root");
			run_gc = true;
		}
	}

	for (i = 0; i < BTREE_ID_NR; i++)
		if (!c->btree_roots[i].b)
			bch2_btree_root_alloc(c, i);
	struct bch_sb_field_clean *clean = NULL;
	u64 journal_seq;
	LIST_HEAD(journal);
	int ret;

	if (c->sb.clean)
		clean = read_superblock_clean(c);
	ret = PTR_ERR_OR_ZERO(clean);
	if (ret)
		goto err;

	if (c->sb.clean)
		bch_info(c, "recovering from clean shutdown, journal seq %llu",
			 le64_to_cpu(clean->journal_seq));

	if (!c->replicas.entries) {
		bch_info(c, "building replicas info");
		set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
	}

	if (!c->sb.clean || c->opts.fsck) {
		struct jset *j;

		ret = bch2_journal_read(c, &journal);
		if (ret)
			goto err;

		fsck_err_on(c->sb.clean && !journal_empty(&journal), c,
			    "filesystem marked clean but journal not empty");

		if (!c->sb.clean && list_empty(&journal)) {
			bch_err(c, "no journal entries found");
			ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
			goto err;
		}

		j = &list_last_entry(&journal, struct journal_replay, list)->j;

		ret = verify_superblock_clean(c, &clean, j);
		if (ret)
			goto err;

		journal_seq = le64_to_cpu(j->seq) + 1;
	} else {
		journal_seq = le64_to_cpu(clean->journal_seq) + 1;
	}

	ret = journal_replay_early(c, clean, &journal);
	if (ret)
		goto err;

	if (!c->sb.clean) {
		ret = bch2_journal_seq_blacklist_add(c,
						     journal_seq,
						     journal_seq + 4);
		if (ret) {
			bch_err(c, "error creating new journal seq blacklist entry");
			goto err;
		}

		journal_seq += 4;
	}

	ret = bch2_blacklist_table_initialize(c);

	ret = verify_journal_entries_not_blacklisted_or_missing(c, &journal);
	if (ret)
		goto err;

	ret = bch2_fs_journal_start(&c->journal, journal_seq, &journal);
	if (ret)
		goto err;

	ret = read_btree_roots(c);
	if (ret)
		goto err;
err = "error reading allocation information";
ret = bch2_alloc_read(c, &journal);
@@ -312,10 +442,12 @@ int bch2_fs_recovery(struct bch_fs *c)
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
if (run_gc) {
if (c->opts.fsck ||
!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) ||
test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
bch_verbose(c, "starting mark and sweep:");
err = "error in recovery";
ret = bch2_gc(c, &journal, true);
ret = bch2_gc(c, &journal, true, false);
if (ret)
goto err;
bch_verbose(c, "mark and sweep done");
@@ -334,13 +466,6 @@ int bch2_fs_recovery(struct bch_fs *c)
if (c->opts.noreplay)
goto out;
/*
* bch2_fs_journal_start() can't happen sooner, or btree_gc_finish()
* will give spurious errors about oldest_gen > bucket_gen -
* this is a hack but oh well.
*/
bch2_fs_journal_start(&c->journal);
bch_verbose(c, "starting journal replay:");
err = "journal replay failed";
ret = bch2_journal_replay(c, &journal);
@@ -356,6 +481,14 @@ int bch2_fs_recovery(struct bch_fs *c)
if (ret)
goto err;
if (enabled_qtypes(c)) {
bch_verbose(c, "reading quotas:");
ret = bch2_fs_quota_read(c);
if (ret)
goto err;
bch_verbose(c, "quotas done");
}
mutex_lock(&c->sb_lock);
if (c->opts.version_upgrade) {
if (c->sb.version < bcachefs_metadata_version_new_versioning)
@@ -371,14 +504,9 @@ int bch2_fs_recovery(struct bch_fs *c)
}
mutex_unlock(&c->sb_lock);
if (enabled_qtypes(c)) {
bch_verbose(c, "reading quotas:");
ret = bch2_fs_quota_read(c);
if (ret)
goto err;
bch_verbose(c, "quotas done");
}
if (c->journal_seq_blacklist_table &&
c->journal_seq_blacklist_table->nr > 128)
queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work);
out:
bch2_journal_entries_free(&journal);
kfree(clean);
@@ -427,7 +555,7 @@ int bch2_fs_initialize(struct bch_fs *c)
* journal_res_get() will crash if called before this has
* set up the journal.pin FIFO and journal.cur pointer:
*/
bch2_fs_journal_start(&c->journal);
bch2_fs_journal_start(&c->journal, 1, &journal);
bch2_journal_set_replay_done(&c->journal);
err = "error going read write";

View File

@@ -6,6 +6,7 @@
#include "error.h"
#include "io.h"
#include "journal.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"
#include "quota.h"
#include "super-io.h"

View File

@@ -29,6 +29,7 @@
#include "io.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "move.h"
#include "migrate.h"
#include "movinggc.h"
@@ -499,6 +500,7 @@ static void bch2_fs_free(struct bch_fs *c)
kfree(c->replicas.entries);
kfree(c->replicas_gc.entries);
kfree(rcu_dereference_protected(c->disk_groups, 1));
kfree(c->journal_seq_blacklist_table);
if (c->journal_reclaim_wq)
destroy_workqueue(c->journal_reclaim_wq);
@@ -527,6 +529,10 @@ void bch2_fs_stop(struct bch_fs *c)
bch_verbose(c, "shutting down");
set_bit(BCH_FS_STOPPING, &c->flags);
cancel_work_sync(&c->journal_seq_blacklist_gc_work);
for_each_member_device(ca, c, i)
if (ca->kobj.state_in_sysfs &&
ca->disk_sb.bdev)
@@ -663,6 +669,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
spin_lock_init(&c->btree_write_error_lock);
INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work);
INIT_WORK(&c->journal_seq_blacklist_gc_work,
bch2_blacklist_entries_gc);
INIT_LIST_HEAD(&c->fsck_errors);
mutex_init(&c->fsck_error_lock);

View File

@@ -496,7 +496,7 @@ STORE(__bch2_fs)
bch2_coalesce(c);
if (attr == &sysfs_trigger_gc)
bch2_gc(c, NULL, false);
bch2_gc(c, NULL, false, false);
if (attr == &sysfs_trigger_alloc_write) {
bool wrote;

View File

@@ -17,7 +17,7 @@
#include <linux/crypto.h>
#include <crypto/algapi.h>
#include <crypto/chacha20.h>
#include <crypto/chacha.h>
#include <crypto/skcipher.h>
#include <sodium/crypto_stream_chacha20.h>
@@ -36,7 +36,7 @@ static int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
container_of(tfm, struct chacha20_tfm, tfm);
int i;
if (keysize != CHACHA20_KEY_SIZE)
if (keysize != CHACHA_KEY_SIZE)
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(ctx->key); i++)
@@ -72,8 +72,8 @@ static int crypto_chacha20_crypt(struct skcipher_request *req)
if (sg_is_last(sg))
break;
BUG_ON(sg->length % CHACHA20_BLOCK_SIZE);
iv[0] += sg->length / CHACHA20_BLOCK_SIZE;
BUG_ON(sg->length % CHACHA_BLOCK_SIZE);
iv[0] += sg->length / CHACHA_BLOCK_SIZE;
sg = sg_next(sg);
};
@@ -93,8 +93,8 @@ static void *crypto_chacha20_alloc_tfm(void)
tfm->tfm.setkey = crypto_chacha20_setkey;
tfm->tfm.encrypt = crypto_chacha20_crypt;
tfm->tfm.decrypt = crypto_chacha20_crypt;
tfm->tfm.ivsize = CHACHA20_IV_SIZE;
tfm->tfm.keysize = CHACHA20_KEY_SIZE;
tfm->tfm.ivsize = CHACHA_IV_SIZE;
tfm->tfm.keysize = CHACHA_KEY_SIZE;
return tfm;
}
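For reference, the block-counter bookkeeping the renamed constants are used for above: ChaCha consumes one 64-byte block per counter increment, and this driver keeps the counter in iv[0], so each scatterlist segment advances it by length / CHACHA_BLOCK_SIZE. A standalone sketch of that arithmetic:

#include <assert.h>
#include <stdint.h>

#define CHACHA_BLOCK_SIZE 64

int main(void)
{
	uint64_t iv0 = 0;	/* block counter, as in iv[0] above */
	unsigned seg_lengths[] = { 128, 192, 64 };
	unsigned i;

	for (i = 0; i < 3; i++) {
		assert(seg_lengths[i] % CHACHA_BLOCK_SIZE == 0);
		iv0 += seg_lengths[i] / CHACHA_BLOCK_SIZE;
	}

	assert(iv0 == 6);	/* (128 + 192 + 64) / 64 blocks consumed */
	return 0;
}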