From abfdc593a532abaa40ac6ca87c1e0c86459f8c87 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 16 Mar 2024 19:29:22 -0400
Subject: [PATCH] Update bcachefs sources to 83338f5b2cb8 bcachefs: fix for
 building in userspace

---
 .bcachefs_revision                            |   2 +-
 Makefile                                      |  14 -
 c_src/cmd_format.c                            |   2 +-
 c_src/cmd_fs.c                                |   2 +-
 c_src/cmd_list_journal.c                      |   2 +-
 c_src/tools-util.h                            |   2 +-
 include/linux/darray_types.h                  |  22 -
 include/linux/generic-radix-tree.h            |  29 +-
 libbcachefs/alloc_background.c                |  62 ++-
 libbcachefs/backpointers.c                    |   3 +-
 libbcachefs/bcachefs.h                        |  12 +-
 libbcachefs/bkey.h                            | 207 +-------
 libbcachefs/bkey_types.h                      | 213 +++++++++
 libbcachefs/bset.c                            |   2 +-
 libbcachefs/btree_cache.c                     |   2 +-
 libbcachefs/btree_gc.c                        | 132 +++--
 libbcachefs/btree_io.c                        |  19 +-
 libbcachefs/btree_iter.c                      |  12 +-
 libbcachefs/btree_journal_iter.c              |   2 +-
 libbcachefs/btree_key_cache.c                 |   8 +-
 libbcachefs/btree_locking.h                   |   2 +-
 libbcachefs/btree_types.h                     |   2 +-
 libbcachefs/btree_update.c                    |   4 +-
 libbcachefs/btree_update_interior.c           |  86 +++-
 libbcachefs/btree_update_interior.h           |   2 +
 libbcachefs/btree_write_buffer_types.h        |   2 +-
 libbcachefs/chardev.c                         |  20 +-
 {linux => libbcachefs}/darray.c               |   7 +-
 {include/linux => libbcachefs}/darray.h       |  59 +--
 libbcachefs/errcode.h                         |   7 +-
 libbcachefs/error.c                           |   4 +-
 libbcachefs/extents.h                         |  11 +-
 {include/linux => libbcachefs}/eytzinger.h    |  58 +--
 libbcachefs/fs-io-buffered.c                  | 151 ++++--
 libbcachefs/fs-io-pagecache.h                 |   9 +-
 libbcachefs/fs.c                              |   4 +-
 libbcachefs/fsck.c                            |  11 +-
 libbcachefs/inode.c                           |  15 +-
 libbcachefs/io_read.c                         |   6 +-
 libbcachefs/io_write.c                        |   4 +-
 libbcachefs/journal.c                         |  22 +-
 libbcachefs/journal_io.c                      |  42 +-
 libbcachefs/journal_io.h                      |  10 +-
 libbcachefs/journal_sb.c                      |   2 +-
 libbcachefs/journal_seq_blacklist.c           |  73 +--
 libbcachefs/journal_types.h                   |   6 +-
 libbcachefs/lru.c                             |   3 +-
 {linux => libbcachefs}/mean_and_variance.c    |   3 +-
 .../linux => libbcachefs}/mean_and_variance.h |   0
 libbcachefs/nocow_locking.c                   |   2 +-
 libbcachefs/opts.c                            |   8 +-
 libbcachefs/opts.h                            |   5 +
 libbcachefs/recovery.c                        |  71 ++-
 libbcachefs/replicas.c                        |  19 +-
 libbcachefs/replicas.h                        |   3 +-
 libbcachefs/sb-downgrade.c                    |   5 +-
 libbcachefs/sb-errors_types.h                 |   5 +-
 libbcachefs/sb-members.h                      |   2 +-
 libbcachefs/subvolume.h                       |   1 +
 libbcachefs/subvolume_types.h                 |   2 +-
 libbcachefs/super-io.c                        |   8 +
 libbcachefs/super-io.h                        |   2 +-
 libbcachefs/super.c                           |  77 +--
 libbcachefs/thread_with_file.c                | 450 ++++++++++++++++++
 libbcachefs/thread_with_file.h                |  76 +++
 libbcachefs/thread_with_file_types.h          |  23 +
 libbcachefs/time_stats.c                      | 165 +++++++
 {include/linux => libbcachefs}/time_stats.h   |  62 ++-
 libbcachefs/util.c                            | 157 +++++-
 libbcachefs/util.h                            |  61 ++-
 libbcachefs/xattr.c                           |   5 +-
 linux/generic-radix-tree.c                    |  35 +-
 linux/sort.c                                  |  89 ----
 linux/time_stats.c                            | 373 ---------------
 74 files changed, 1807 insertions(+), 1273 deletions(-)
 delete mode 100644 include/linux/darray_types.h
 create mode 100644 libbcachefs/bkey_types.h
 rename {linux => libbcachefs}/darray.c (68%)
 rename {include/linux => libbcachefs}/darray.h (66%)
 rename {include/linux => libbcachefs}/eytzinger.h (77%)
 rename {linux => libbcachefs}/mean_and_variance.c (99%)
 rename {include/linux => libbcachefs}/mean_and_variance.h (100%)
 create mode 100644 libbcachefs/thread_with_file.c
 create mode 100644 libbcachefs/thread_with_file.h
 create mode 100644 libbcachefs/thread_with_file_types.h
 create mode 100644 libbcachefs/time_stats.c
 rename {include/linux => libbcachefs}/time_stats.h (63%)
 delete mode 100644 linux/time_stats.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index f1553698..c9361961 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-26494335d114f7813a7fc499bbacb4a74d613b6f
+83338f5b2cb8406cda8bf7be3f566ab97c696917
diff --git a/Makefile b/Makefile
index 37101aff..26cbcb91 100644
--- a/Makefile
+++ b/Makefile
@@ -274,20 +274,6 @@ update-bcachefs-sources:
 	git add include/linux/kmemleak.h
 	cp $(LINUX_DIR)/lib/math/int_sqrt.c linux/
 	git add linux/int_sqrt.c
-	cp $(LINUX_DIR)/lib/math/mean_and_variance.c linux/
-	git add linux/mean_and_variance.c
-	cp $(LINUX_DIR)/include/linux/mean_and_variance.h include/linux/
-	git add include/linux/mean_and_variance.h
-	cp $(LINUX_DIR)/lib/time_stats.c linux/
-	git add linux/time_stats.c
-	cp $(LINUX_DIR)/include/linux/time_stats.h include/linux/
-	git add include/linux/time_stats.h
-	cp $(LINUX_DIR)/include/linux/darray.h include/linux/
-	git add include/linux/darray.h
-	cp $(LINUX_DIR)/include/linux/darray_types.h include/linux/
-	git add include/linux/darray_types.h
-	cp $(LINUX_DIR)/include/linux/eytzinger.h include/linux/
-	git add include/linux/eytzinger.h
 	cp $(LINUX_DIR)/scripts/Makefile.compiler ./
 	git add Makefile.compiler
 	$(RM) libbcachefs/*.mod.c
diff --git a/c_src/cmd_format.c b/c_src/cmd_format.c
index 3d29f413..4d6c2dc1 100644
--- a/c_src/cmd_format.c
+++ b/c_src/cmd_format.c
@@ -28,7 +28,7 @@
 #include "libbcachefs/super-io.h"
 #include "libbcachefs/util.h"
 
-#include "linux/darray.h"
+#include "libbcachefs/darray.h"
 
 #define OPTS						\
 x(0,	replicas,		required_argument)	\
diff --git a/c_src/cmd_fs.c b/c_src/cmd_fs.c
index d8542d61..e08054e8 100644
--- a/c_src/cmd_fs.c
+++ b/c_src/cmd_fs.c
@@ -15,7 +15,7 @@
 #include "cmds.h"
 #include "libbcachefs.h"
 
-#include "linux/darray.h"
+#include "libbcachefs/darray.h"
 
 static void __dev_usage_type_to_text(struct printbuf *out,
 				     enum bch_data_type type,
diff --git a/c_src/cmd_list_journal.c b/c_src/cmd_list_journal.c
index d104f50a..38ec8a28 100644
--- a/c_src/cmd_list_journal.c
+++ b/c_src/cmd_list_journal.c
@@ -128,7 +128,7 @@ static void journal_entries_print(struct bch_fs *c, unsigned nr_entries,
 		if (le64_to_cpu(p->j.seq) + nr_entries < atomic64_read(&c->journal.seq))
 			continue;
 
-		bool blacklisted = p->ignore ||
+		bool blacklisted = p->ignore_blacklisted ||
 			bch2_journal_seq_is_blacklisted(c,
 					le64_to_cpu(p->j.seq), false);
 
diff --git a/c_src/tools-util.h b/c_src/tools-util.h
index 4682406e..bff3bc65 100644
--- a/c_src/tools-util.h
+++ b/c_src/tools-util.h
@@ -20,7 +20,7 @@
 #include <linux/uuid.h>
 #include "libbcachefs/bcachefs.h"
 #include "libbcachefs/bbpos.h"
-#include "linux/darray.h"
+#include "libbcachefs/darray.h"
 
 #define noreturn __attribute__((noreturn))
 
diff --git a/include/linux/darray_types.h b/include/linux/darray_types.h
deleted file mode 100644
index a400a0c3..00000000
--- a/include/linux/darray_types.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev>
- */
-#ifndef _LINUX_DARRAY_TYpES_H
-#define _LINUX_DARRAY_TYpES_H
-
-#include <linux/types.h>
-
-#define DARRAY_PREALLOCATED(_type, _nr)					\
-struct {								\
-	size_t nr, size;						\
-	_type *data;							\
-	_type preallocated[_nr];					\
-}
-
-#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
-
-typedef DARRAY(char)	darray_char;
-typedef DARRAY(char *)	darray_str;
-
-#endif /* _LINUX_DARRAY_TYpES_H */
diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
index 84741316..f3512fdd 100644
--- a/include/linux/generic-radix-tree.h
+++ b/include/linux/generic-radix-tree.h
@@ -5,7 +5,7 @@
  * DOC: Generic radix trees/sparse arrays
  *
  * Very simple and minimalistic, supporting arbitrary size entries up to
- * PAGE_SIZE.
+ * GENRADIX_NODE_SIZE.
  *
  * A genradix is defined with the type it will store, like so:
  *
@@ -45,12 +45,15 @@
 
 struct genradix_root;
 
+#define GENRADIX_NODE_SHIFT	9
+#define GENRADIX_NODE_SIZE	(1U << GENRADIX_NODE_SHIFT)
+
 struct __genradix {
 	struct genradix_root		*root;
 };
 
 /*
- * NOTE: currently, sizeof(_type) must not be larger than PAGE_SIZE:
+ * NOTE: currently, sizeof(_type) must not be larger than GENRADIX_NODE_SIZE:
  */
 
 #define __GENRADIX_INITIALIZER					\
@@ -101,14 +104,14 @@ void __genradix_free(struct __genradix *);
 static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
 {
 	if (__builtin_constant_p(obj_size))
-		BUILD_BUG_ON(obj_size > PAGE_SIZE);
+		BUILD_BUG_ON(obj_size > GENRADIX_NODE_SIZE);
 	else
-		BUG_ON(obj_size > PAGE_SIZE);
+		BUG_ON(obj_size > GENRADIX_NODE_SIZE);
 
 	if (!is_power_of_2(obj_size)) {
-		size_t objs_per_page = PAGE_SIZE / obj_size;
+		size_t objs_per_page = GENRADIX_NODE_SIZE / obj_size;
 
-		return (idx / objs_per_page) * PAGE_SIZE +
+		return (idx / objs_per_page) * GENRADIX_NODE_SIZE +
 			(idx % objs_per_page) * obj_size;
 	} else {
 		return idx * obj_size;
@@ -118,9 +121,9 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
 #define __genradix_cast(_radix)		(typeof((_radix)->type[0]) *)
 #define __genradix_obj_size(_radix)	sizeof((_radix)->type[0])
 #define __genradix_objs_per_page(_radix)			\
-	(PAGE_SIZE / sizeof((_radix)->type[0]))
+	(GENRADIX_NODE_SIZE / sizeof((_radix)->type[0]))
 #define __genradix_page_remainder(_radix)			\
-	(PAGE_SIZE % sizeof((_radix)->type[0]))
+	(GENRADIX_NODE_SIZE % sizeof((_radix)->type[0]))
 
 #define __genradix_idx_to_offset(_radix, _idx)			\
 	__idx_to_offset(_idx, __genradix_obj_size(_radix))
@@ -217,8 +220,8 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter,
 	iter->offset += obj_size;
 
 	if (!is_power_of_2(obj_size) &&
-	    (iter->offset & (PAGE_SIZE - 1)) + obj_size > PAGE_SIZE)
-		iter->offset = round_up(iter->offset, PAGE_SIZE);
+	    (iter->offset & (GENRADIX_NODE_SIZE - 1)) + obj_size > GENRADIX_NODE_SIZE)
+		iter->offset = round_up(iter->offset, GENRADIX_NODE_SIZE);
 
 	iter->pos++;
 }
@@ -235,8 +238,8 @@ static inline void __genradix_iter_rewind(struct genradix_iter *iter,
 		return;
 	}
 
-	if ((iter->offset & (PAGE_SIZE - 1)) == 0)
-		iter->offset -= PAGE_SIZE % obj_size;
+	if ((iter->offset & (GENRADIX_NODE_SIZE - 1)) == 0)
+		iter->offset -= GENRADIX_NODE_SIZE % obj_size;
 
 	iter->offset -= obj_size;
 	iter->pos--;
@@ -263,7 +266,7 @@ static inline void __genradix_iter_rewind(struct genradix_iter *iter,
 	genradix_for_each_from(_radix, _iter, _p, 0)
 
 #define genradix_last_pos(_radix)				\
-	(SIZE_MAX / PAGE_SIZE * __genradix_objs_per_page(_radix) - 1)
+	(SIZE_MAX / GENRADIX_NODE_SIZE * __genradix_objs_per_page(_radix) - 1)
 
 /**
  * genradix_for_each_reverse - iterate over entry in a genradix, reverse order
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index ccd6cbfd..c47f72f2 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -1052,14 +1052,13 @@ int bch2_check_alloc_key(struct btree_trans *trans,
 	if (ret)
 		goto err;
 
-	if (k.k->type != discard_key_type &&
-	    (c->opts.reconstruct_alloc ||
-	     fsck_err(c, need_discard_key_wrong,
-		      "incorrect key in need_discard btree (got %s should be %s)\n"
-		      "  %s",
-		      bch2_bkey_types[k.k->type],
-		      bch2_bkey_types[discard_key_type],
-		      (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+	if (fsck_err_on(k.k->type != discard_key_type,
+			c, need_discard_key_wrong,
+			"incorrect key in need_discard btree (got %s should be %s)\n"
+			"  %s",
+			bch2_bkey_types[k.k->type],
+			bch2_bkey_types[discard_key_type],
+			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
 		struct bkey_i *update =
 			bch2_trans_kmalloc(trans, sizeof(*update));
 
@@ -1083,15 +1082,14 @@ int bch2_check_alloc_key(struct btree_trans *trans,
 	if (ret)
 		goto err;
 
-	if (k.k->type != freespace_key_type &&
-	    (c->opts.reconstruct_alloc ||
-	     fsck_err(c, freespace_key_wrong,
-		      "incorrect key in freespace btree (got %s should be %s)\n"
-		      "  %s",
-		      bch2_bkey_types[k.k->type],
-		      bch2_bkey_types[freespace_key_type],
-		      (printbuf_reset(&buf),
-		       bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+	if (fsck_err_on(k.k->type != freespace_key_type,
+			c, freespace_key_wrong,
+			"incorrect key in freespace btree (got %s should be %s)\n"
+			"  %s",
+			bch2_bkey_types[k.k->type],
+			bch2_bkey_types[freespace_key_type],
+			(printbuf_reset(&buf),
+			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
 		struct bkey_i *update =
 			bch2_trans_kmalloc(trans, sizeof(*update));
 
@@ -1115,14 +1113,13 @@ int bch2_check_alloc_key(struct btree_trans *trans,
 	if (ret)
 		goto err;
 
-	if (a->gen != alloc_gen(k, gens_offset) &&
-	    (c->opts.reconstruct_alloc ||
-	     fsck_err(c, bucket_gens_key_wrong,
-		      "incorrect gen in bucket_gens btree (got %u should be %u)\n"
-		      "  %s",
-		      alloc_gen(k, gens_offset), a->gen,
-		      (printbuf_reset(&buf),
-		       bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+	if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
+			c, bucket_gens_key_wrong,
+			"incorrect gen in bucket_gens btree (got %u should be %u)\n"
+			"  %s",
+			alloc_gen(k, gens_offset), a->gen,
+			(printbuf_reset(&buf),
+			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
 		struct bkey_i_bucket_gens *g =
 			bch2_trans_kmalloc(trans, sizeof(*g));
 
@@ -1174,14 +1171,13 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
 
 	*end = bkey_min(k.k->p, *end);
 
-	if (k.k->type != KEY_TYPE_set &&
-	    (c->opts.reconstruct_alloc ||
-	     fsck_err(c, freespace_hole_missing,
-		      "hole in alloc btree missing in freespace btree\n"
-		      "  device %llu buckets %llu-%llu",
-		      freespace_iter->pos.inode,
-		      freespace_iter->pos.offset,
-		      end->offset))) {
+	if (fsck_err_on(k.k->type != KEY_TYPE_set,
+			c, freespace_hole_missing,
+			"hole in alloc btree missing in freespace btree\n"
+			"  device %llu buckets %llu-%llu",
+			freespace_iter->pos.inode,
+			freespace_iter->pos.offset,
+			end->offset)) {
 		struct bkey_i *update =
 			bch2_trans_kmalloc(trans, sizeof(*update));
 
diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c
index f2b33fe4..8cb35ea5 100644
--- a/libbcachefs/backpointers.c
+++ b/libbcachefs/backpointers.c
@@ -477,8 +477,7 @@ missing:
 	prt_printf(&buf, "\nbp pos ");
 	bch2_bpos_to_text(&buf, bp_iter.pos);
 
-	if (c->opts.reconstruct_alloc ||
-	    fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
+	if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
 		ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
 
 	goto out;
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 0bee9dab..339dc3e1 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -200,8 +200,6 @@
 #include <linux/seqlock.h>
 #include <linux/shrinker.h>
 #include <linux/srcu.h>
-#include <linux/thread_with_file_types.h>
-#include <linux/time_stats.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
 #include <linux/zstd.h>
@@ -214,6 +212,7 @@
 #include "recovery_types.h"
 #include "sb-errors_types.h"
 #include "seqmutex.h"
+#include "time_stats.h"
 #include "util.h"
 
 #ifdef CONFIG_BCACHEFS_DEBUG
@@ -470,6 +469,7 @@ enum bch_time_stats {
 #include "replicas_types.h"
 #include "subvolume_types.h"
 #include "super_types.h"
+#include "thread_with_file_types.h"
 
 /* Number of nodes btree coalesce will try to coalesce at once */
 #define GC_MERGE_NODES		4U
@@ -598,7 +598,7 @@ struct bch_dev {
 
 	/* The rest of this all shows up in sysfs */
 	atomic64_t		cur_latency[2];
-	struct time_stats_quantiles	io_latency[2];
+	struct bch2_time_stats_quantiles io_latency[2];
 
 #define CONGESTED_MAX		1024
 	atomic_t		congested;
@@ -645,8 +645,8 @@ struct btree_debug {
 #define BCH_TRANSACTIONS_NR 128
 
 struct btree_transaction_stats {
-	struct time_stats	duration;
-	struct time_stats	lock_hold_times;
+	struct bch2_time_stats	duration;
+	struct bch2_time_stats	lock_hold_times;
 	struct mutex		lock;
 	unsigned		nr_max_paths;
 	unsigned		journal_entries_size;
@@ -1111,7 +1111,7 @@ struct bch_fs {
 	unsigned		copy_gc_enabled:1;
 	bool			promote_whole_extents;
 
-	struct time_stats	times[BCH_TIME_STAT_NR];
+	struct bch2_time_stats	times[BCH_TIME_STAT_NR];
 
 	struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
 
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index 831be018..cf23ff47 100644
--- a/libbcachefs/bkey.h
+++ b/libbcachefs/bkey.h
@@ -4,7 +4,7 @@
 
 #include <linux/bug.h>
 #include "bcachefs_format.h"
-
+#include "bkey_types.h"
 #include "btree_types.h"
 #include "util.h"
 #include "vstructs.h"
@@ -31,57 +31,6 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *,
 				     const struct bkey_format *,
 				     const struct bkey_packed *);
 
-/* bkey with split value, const */
-struct bkey_s_c {
-	const struct bkey	*k;
-	const struct bch_val	*v;
-};
-
-/* bkey with split value */
-struct bkey_s {
-	union {
-	struct {
-		struct bkey	*k;
-		struct bch_val	*v;
-	};
-	struct bkey_s_c		s_c;
-	};
-};
-
-#define bkey_p_next(_k)		vstruct_next(_k)
-
-static inline struct bkey_i *bkey_next(struct bkey_i *k)
-{
-	return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
-}
-
-#define bkey_val_u64s(_k)	((_k)->u64s - BKEY_U64s)
-
-static inline size_t bkey_val_bytes(const struct bkey *k)
-{
-	return bkey_val_u64s(k) * sizeof(u64);
-}
-
-static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
-{
-	unsigned u64s = BKEY_U64s + val_u64s;
-
-	BUG_ON(u64s > U8_MAX);
-	k->u64s = u64s;
-}
-
-static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
-{
-	set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
-}
-
-#define bkey_val_end(_k)	((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
-
-#define bkey_deleted(_k)	((_k)->type == KEY_TYPE_deleted)
-
-#define bkey_whiteout(_k)				\
-	((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
-
 enum bkey_lr_packed {
 	BKEY_PACKED_BOTH,
 	BKEY_PACKED_RIGHT,
@@ -362,10 +311,7 @@ static inline struct bpos bkey_start_pos(const struct bkey *k)
 static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
 				      const struct bkey_packed *k)
 {
-	unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s;
-
-	EBUG_ON(k->u64s < ret);
-	return ret;
+	return bkey_packed(k) ? format->key_u64s : BKEY_U64s;
 }
 
 static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
@@ -553,155 +499,6 @@ static inline void bkey_reassemble(struct bkey_i *dst,
 	memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k));
 }
 
-#define bkey_s_null		((struct bkey_s)   { .k = NULL })
-#define bkey_s_c_null		((struct bkey_s_c) { .k = NULL })
-
-#define bkey_s_err(err)		((struct bkey_s)   { .k = ERR_PTR(err) })
-#define bkey_s_c_err(err)	((struct bkey_s_c) { .k = ERR_PTR(err) })
-
-static inline struct bkey_s bkey_to_s(struct bkey *k)
-{
-	return (struct bkey_s) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
-{
-	return (struct bkey_s_c) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
-{
-	return (struct bkey_s) { .k = &k->k, .v = &k->v };
-}
-
-static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
-{
-	return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
-}
-
-/*
- * For a given type of value (e.g. struct bch_extent), generates the types for
- * bkey + bch_extent - inline, split, split const - and also all the conversion
- * functions, which also check that the value is of the correct type.
- *
- * We use anonymous unions for upcasting - e.g. converting from e.g. a
- * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
- * functions.
- */
-#define x(name, ...)					\
-struct bkey_i_##name {							\
-	union {								\
-		struct bkey		k;				\
-		struct bkey_i		k_i;				\
-	};								\
-	struct bch_##name		v;				\
-};									\
-									\
-struct bkey_s_c_##name {						\
-	union {								\
-	struct {							\
-		const struct bkey	*k;				\
-		const struct bch_##name	*v;				\
-	};								\
-	struct bkey_s_c			s_c;				\
-	};								\
-};									\
-									\
-struct bkey_s_##name {							\
-	union {								\
-	struct {							\
-		struct bkey		*k;				\
-		struct bch_##name	*v;				\
-	};								\
-	struct bkey_s_c_##name		c;				\
-	struct bkey_s			s;				\
-	struct bkey_s_c			s_c;				\
-	};								\
-};									\
-									\
-static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k)	\
-{									\
-	EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);	\
-	return container_of(&k->k, struct bkey_i_##name, k);		\
-}									\
-									\
-static inline const struct bkey_i_##name *				\
-bkey_i_to_##name##_c(const struct bkey_i *k)				\
-{									\
-	EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);	\
-	return container_of(&k->k, struct bkey_i_##name, k);		\
-}									\
-									\
-static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k)	\
-{									\
-	EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name);	\
-	return (struct bkey_s_##name) {					\
-		.k = k.k,						\
-		.v = container_of(k.v, struct bch_##name, v),		\
-	};								\
-}									\
-									\
-static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
-{									\
-	EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name);	\
-	return (struct bkey_s_c_##name) {				\
-		.k = k.k,						\
-		.v = container_of(k.v, struct bch_##name, v),		\
-	};								\
-}									\
-									\
-static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
-{									\
-	return (struct bkey_s_##name) {					\
-		.k = &k->k,						\
-		.v = &k->v,						\
-	};								\
-}									\
-									\
-static inline struct bkey_s_c_##name					\
-name##_i_to_s_c(const struct bkey_i_##name *k)				\
-{									\
-	return (struct bkey_s_c_##name) {				\
-		.k = &k->k,						\
-		.v = &k->v,						\
-	};								\
-}									\
-									\
-static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k)	\
-{									\
-	EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);	\
-	return (struct bkey_s_##name) {					\
-		.k = &k->k,						\
-		.v = container_of(&k->v, struct bch_##name, v),		\
-	};								\
-}									\
-									\
-static inline struct bkey_s_c_##name					\
-bkey_i_to_s_c_##name(const struct bkey_i *k)				\
-{									\
-	EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);	\
-	return (struct bkey_s_c_##name) {				\
-		.k = &k->k,						\
-		.v = container_of(&k->v, struct bch_##name, v),		\
-	};								\
-}									\
-									\
-static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
-{									\
-	struct bkey_i_##name *k =					\
-		container_of(&_k->k, struct bkey_i_##name, k);		\
-									\
-	bkey_init(&k->k);						\
-	memset(&k->v, 0, sizeof(k->v));					\
-	k->k.type = KEY_TYPE_##name;					\
-	set_bkey_val_bytes(&k->k, sizeof(k->v));			\
-									\
-	return k;							\
-}
-
-BCH_BKEY_TYPES();
-#undef x
-
 /* byte order helpers */
 
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
diff --git a/libbcachefs/bkey_types.h b/libbcachefs/bkey_types.h
new file mode 100644
index 00000000..c9ae9e42
--- /dev/null
+++ b/libbcachefs/bkey_types.h
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_TYPES_H
+#define _BCACHEFS_BKEY_TYPES_H
+
+#include "bcachefs_format.h"
+
+/*
+ * bkey_i	- bkey with inline value
+ * bkey_s	- bkey with split value
+ * bkey_s_c	- bkey with split value, const
+ */
+
+#define bkey_p_next(_k)		vstruct_next(_k)
+
+static inline struct bkey_i *bkey_next(struct bkey_i *k)
+{
+	return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
+}
+
+#define bkey_val_u64s(_k)	((_k)->u64s - BKEY_U64s)
+
+static inline size_t bkey_val_bytes(const struct bkey *k)
+{
+	return bkey_val_u64s(k) * sizeof(u64);
+}
+
+static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
+{
+	unsigned u64s = BKEY_U64s + val_u64s;
+
+	BUG_ON(u64s > U8_MAX);
+	k->u64s = u64s;
+}
+
+static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
+{
+	set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
+}
+
+#define bkey_val_end(_k)	((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
+
+#define bkey_deleted(_k)	((_k)->type == KEY_TYPE_deleted)
+
+#define bkey_whiteout(_k)				\
+	((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
+
+/* bkey with split value, const */
+struct bkey_s_c {
+	const struct bkey	*k;
+	const struct bch_val	*v;
+};
+
+/* bkey with split value */
+struct bkey_s {
+	union {
+	struct {
+		struct bkey	*k;
+		struct bch_val	*v;
+	};
+	struct bkey_s_c		s_c;
+	};
+};
+
+#define bkey_s_null		((struct bkey_s)   { .k = NULL })
+#define bkey_s_c_null		((struct bkey_s_c) { .k = NULL })
+
+#define bkey_s_err(err)		((struct bkey_s)   { .k = ERR_PTR(err) })
+#define bkey_s_c_err(err)	((struct bkey_s_c) { .k = ERR_PTR(err) })
+
+static inline struct bkey_s bkey_to_s(struct bkey *k)
+{
+	return (struct bkey_s) { .k = k, .v = NULL };
+}
+
+static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
+{
+	return (struct bkey_s_c) { .k = k, .v = NULL };
+}
+
+static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
+{
+	return (struct bkey_s) { .k = &k->k, .v = &k->v };
+}
+
+static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
+{
+	return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
+}
+
+/*
+ * For a given type of value (e.g. struct bch_extent), generates the types for
+ * bkey + bch_extent - inline, split, split const - and also all the conversion
+ * functions, which also check that the value is of the correct type.
+ *
+ * We use anonymous unions for upcasting - e.g. converting from e.g. a
+ * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
+ * functions.
+ */
+#define x(name, ...)					\
+struct bkey_i_##name {							\
+	union {								\
+		struct bkey		k;				\
+		struct bkey_i		k_i;				\
+	};								\
+	struct bch_##name		v;				\
+};									\
+									\
+struct bkey_s_c_##name {						\
+	union {								\
+	struct {							\
+		const struct bkey	*k;				\
+		const struct bch_##name	*v;				\
+	};								\
+	struct bkey_s_c			s_c;				\
+	};								\
+};									\
+									\
+struct bkey_s_##name {							\
+	union {								\
+	struct {							\
+		struct bkey		*k;				\
+		struct bch_##name	*v;				\
+	};								\
+	struct bkey_s_c_##name		c;				\
+	struct bkey_s			s;				\
+	struct bkey_s_c			s_c;				\
+	};								\
+};									\
+									\
+static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k)	\
+{									\
+	EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);	\
+	return container_of(&k->k, struct bkey_i_##name, k);		\
+}									\
+									\
+static inline const struct bkey_i_##name *				\
+bkey_i_to_##name##_c(const struct bkey_i *k)				\
+{									\
+	EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);	\
+	return container_of(&k->k, struct bkey_i_##name, k);		\
+}									\
+									\
+static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k)	\
+{									\
+	EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name);	\
+	return (struct bkey_s_##name) {					\
+		.k = k.k,						\
+		.v = container_of(k.v, struct bch_##name, v),		\
+	};								\
+}									\
+									\
+static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
+{									\
+	EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name);	\
+	return (struct bkey_s_c_##name) {				\
+		.k = k.k,						\
+		.v = container_of(k.v, struct bch_##name, v),		\
+	};								\
+}									\
+									\
+static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
+{									\
+	return (struct bkey_s_##name) {					\
+		.k = &k->k,						\
+		.v = &k->v,						\
+	};								\
+}									\
+									\
+static inline struct bkey_s_c_##name					\
+name##_i_to_s_c(const struct bkey_i_##name *k)				\
+{									\
+	return (struct bkey_s_c_##name) {				\
+		.k = &k->k,						\
+		.v = &k->v,						\
+	};								\
+}									\
+									\
+static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k)	\
+{									\
+	EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);	\
+	return (struct bkey_s_##name) {					\
+		.k = &k->k,						\
+		.v = container_of(&k->v, struct bch_##name, v),		\
+	};								\
+}									\
+									\
+static inline struct bkey_s_c_##name					\
+bkey_i_to_s_c_##name(const struct bkey_i *k)				\
+{									\
+	EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);	\
+	return (struct bkey_s_c_##name) {				\
+		.k = &k->k,						\
+		.v = container_of(&k->v, struct bch_##name, v),		\
+	};								\
+}									\
+									\
+static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
+{									\
+	struct bkey_i_##name *k =					\
+		container_of(&_k->k, struct bkey_i_##name, k);		\
+									\
+	bkey_init(&k->k);						\
+	memset(&k->v, 0, sizeof(k->v));					\
+	k->k.type = KEY_TYPE_##name;					\
+	set_bkey_val_bytes(&k->k, sizeof(k->v));			\
+									\
+	return k;							\
+}
+
+BCH_BKEY_TYPES();
+#undef x
+
+#endif /* _BCACHEFS_BKEY_TYPES_H */
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index 1d77aa55..3fd1085b 100644
--- a/libbcachefs/bset.c
+++ b/libbcachefs/bset.c
@@ -9,12 +9,12 @@
 #include "bcachefs.h"
 #include "btree_cache.h"
 #include "bset.h"
+#include "eytzinger.h"
 #include "trace.h"
 #include "util.h"
 
 #include <asm/unaligned.h>
 #include <linux/console.h>
-#include <linux/eytzinger.h>
 #include <linux/random.h>
 #include <linux/prefetch.h>
 
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 79975046..562561a9 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -661,7 +661,7 @@ out:
 	bch2_btree_keys_init(b);
 	set_btree_node_accessed(b);
 
-	time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
+	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
 			       start_time);
 
 	memalloc_nofs_restore(flags);
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 6c52f116..584aee70 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -593,16 +593,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
 		struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
 		enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr);
 
-		if (!g->gen_valid &&
-		    (c->opts.reconstruct_alloc ||
-		     fsck_err(c, ptr_to_missing_alloc_key,
-			      "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
-			      "while marking %s",
-			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
-			      bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
-			      p.ptr.gen,
-			      (printbuf_reset(&buf),
-			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
+		if (fsck_err_on(!g->gen_valid,
+				c, ptr_to_missing_alloc_key,
+				"bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
+				"while marking %s",
+				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+				bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
+				p.ptr.gen,
+				(printbuf_reset(&buf),
+				 bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
 			if (!p.ptr.cached) {
 				g->gen_valid		= true;
 				g->gen			= p.ptr.gen;
@@ -611,16 +610,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
 			}
 		}
 
-		if (gen_cmp(p.ptr.gen, g->gen) > 0 &&
-		    (c->opts.reconstruct_alloc ||
-		     fsck_err(c, ptr_gen_newer_than_bucket_gen,
-			      "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
-			      "while marking %s",
-			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
-			      bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
-			      p.ptr.gen, g->gen,
-			      (printbuf_reset(&buf),
-			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
+		if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
+				c, ptr_gen_newer_than_bucket_gen,
+				"bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
+				"while marking %s",
+				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+				bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
+				p.ptr.gen, g->gen,
+				(printbuf_reset(&buf),
+				 bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
 			if (!p.ptr.cached) {
 				g->gen_valid		= true;
 				g->gen			= p.ptr.gen;
@@ -633,28 +631,26 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
 			}
 		}
 
-		if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX &&
-		    (c->opts.reconstruct_alloc ||
-		     fsck_err(c, ptr_gen_newer_than_bucket_gen,
-			      "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
-			      "while marking %s",
-			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
-			      bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
-			      p.ptr.gen,
-			      (printbuf_reset(&buf),
-			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
+		if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
+				c, ptr_gen_newer_than_bucket_gen,
+				"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+				"while marking %s",
+				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
+				bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
+				p.ptr.gen,
+				(printbuf_reset(&buf),
+				 bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
 			do_update = true;
 
-		if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 &&
-		    (c->opts.reconstruct_alloc ||
-		     fsck_err(c, stale_dirty_ptr,
-			      "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
-			      "while marking %s",
-			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
-			      bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
-			      p.ptr.gen, g->gen,
-			      (printbuf_reset(&buf),
-			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
+		if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
+				c, stale_dirty_ptr,
+				"bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
+				"while marking %s",
+				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+				bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
+				p.ptr.gen, g->gen,
+				(printbuf_reset(&buf),
+				 bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
 			do_update = true;
 
 		if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
@@ -1366,11 +1362,10 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
 {
 	struct bch_fs *c = trans->c;
 	struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
-	struct bucket gc, *b;
+	struct bucket old_gc, gc, *b;
 	struct bkey_i_alloc_v4 *a;
 	struct bch_alloc_v4 old_convert, new;
 	const struct bch_alloc_v4 *old;
-	enum bch_data_type type;
 	int ret;
 
 	old = bch2_alloc_to_v4(k, &old_convert);
@@ -1378,30 +1373,31 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
 
 	percpu_down_read(&c->mark_lock);
 	b = gc_bucket(ca, iter->pos.offset);
+	old_gc = *b;
+
+	if ((old->data_type == BCH_DATA_sb ||
+	     old->data_type == BCH_DATA_journal) &&
+	    !bch2_dev_is_online(ca)) {
+		b->data_type = old->data_type;
+		b->dirty_sectors = old->dirty_sectors;
+	}
 
 	/*
 	 * b->data_type doesn't yet include need_discard & need_gc_gen states -
 	 * fix that here:
 	 */
-	type = __alloc_data_type(b->dirty_sectors,
-				 b->cached_sectors,
-				 b->stripe,
-				 *old,
-				 b->data_type);
-	if (b->data_type != type) {
-		struct bch_dev_usage *u;
-
-		preempt_disable();
-		u = this_cpu_ptr(ca->usage_gc);
-		u->d[b->data_type].buckets--;
-		b->data_type = type;
-		u->d[b->data_type].buckets++;
-		preempt_enable();
-	}
-
+	b->data_type = __alloc_data_type(b->dirty_sectors,
+					 b->cached_sectors,
+					 b->stripe,
+					 *old,
+					 b->data_type);
 	gc = *b;
 	percpu_up_read(&c->mark_lock);
 
+	if (gc.data_type != old_gc.data_type ||
+	    gc.dirty_sectors != old_gc.dirty_sectors)
+		bch2_dev_usage_update_m(c, ca, &old_gc, &gc);
+
 	if (metadata_only &&
 	    gc.data_type != BCH_DATA_sb &&
 	    gc.data_type != BCH_DATA_journal &&
@@ -1411,8 +1407,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
 	if (gen_after(old->gen, gc.gen))
 		return 0;
 
-	if (c->opts.reconstruct_alloc ||
-	    fsck_err_on(new.data_type != gc.data_type, c,
+	if (fsck_err_on(new.data_type != gc.data_type, c,
 			alloc_key_data_type_wrong,
 			"bucket %llu:%llu gen %u has wrong data_type"
 			": got %s, should be %s",
@@ -1423,8 +1418,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
 		new.data_type = gc.data_type;
 
 #define copy_bucket_field(_errtype, _f)					\
-	if (c->opts.reconstruct_alloc ||				\
-	    fsck_err_on(new._f != gc._f, c, _errtype,			\
+	if (fsck_err_on(new._f != gc._f, c, _errtype,			\
 			"bucket %llu:%llu gen %u data type %s has wrong " #_f	\
 			": got %u, should be %u",			\
 			iter->pos.inode, iter->pos.offset,		\
@@ -1586,8 +1580,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
 			"  should be %u",
 			(bch2_bkey_val_to_text(&buf, c, k), buf.buf),
 			r->refcount)) {
-		struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0);
-
+		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
 		ret = PTR_ERR_OR_ZERO(new);
 		if (ret)
 			return ret;
@@ -1596,6 +1589,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
 			new->k.type = KEY_TYPE_deleted;
 		else
 			*bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
+		ret = bch2_trans_update(trans, iter, new, 0);
 	}
 fsck_err:
 	printbuf_exit(&buf);
@@ -1818,10 +1812,10 @@ out:
 	if (!ret) {
 		bch2_journal_block(&c->journal);
 
-		ret   = bch2_gc_stripes_done(c, metadata_only) ?:
-			bch2_gc_reflink_done(c, metadata_only) ?:
-			bch2_gc_alloc_done(c, metadata_only) ?:
-			bch2_gc_done(c, initial, metadata_only);
+		ret   = bch2_gc_alloc_done(c, metadata_only) ?:
+			bch2_gc_done(c, initial, metadata_only) ?:
+			bch2_gc_stripes_done(c, metadata_only) ?:
+			bch2_gc_reflink_done(c, metadata_only);
 
 		bch2_journal_unblock(&c->journal);
 	}
@@ -1971,7 +1965,7 @@ int bch2_gc_gens(struct bch_fs *c)
 
 	c->gc_count++;
 
-	time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
+	bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
 	trace_and_count(c, gc_gens_end, c);
 err:
 	for_each_member_device(c, ca) {
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 86415701..624c8287 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -327,7 +327,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
 	BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes);
 
 	if (sorting_entire_node)
-		time_stats_update(&c->times[BCH_TIME_btree_node_sort],
+		bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
 				       start_time);
 
 	/* Make sure we preserve bset journal_seq: */
@@ -397,7 +397,7 @@ void bch2_btree_sort_into(struct bch_fs *c,
 			&dst->format,
 			true);
 
-	time_stats_update(&c->times[BCH_TIME_btree_node_sort],
+	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
 			       start_time);
 
 	set_btree_bset_end(dst, dst->set);
@@ -839,6 +839,9 @@ static bool __bkey_valid(struct bch_fs *c, struct btree *b,
 	if (k->format > KEY_FORMAT_CURRENT)
 		return false;
 
+	if (k->u64s < bkeyp_key_u64s(&b->format, k))
+		return false;
+
 	struct printbuf buf = PRINTBUF;
 	struct bkey tmp;
 	struct bkey_s u = __bkey_disassemble(b, k, &tmp);
@@ -880,7 +883,13 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
 				 "invalid bkey format %u", k->format))
 			goto drop_this_key;
 
-		/* XXX: validate k->u64s */
+		if (btree_err_on(k->u64s < bkeyp_key_u64s(&b->format, k),
+				 -BCH_ERR_btree_node_read_err_fixable,
+				 c, NULL, b, i,
+				 btree_node_bkey_bad_u64s,
+				 "k->u64s too small (%u < %u)", k->u64s, bkeyp_key_u64s(&b->format, k)))
+			goto drop_this_key;
+
 		if (!write)
 			bch2_bkey_compat(b->c.level, b->c.btree_id, version,
 				    BSET_BIG_ENDIAN(i), write,
@@ -1250,7 +1259,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 out:
 	mempool_free(iter, &c->fill_iter);
 	printbuf_exit(&buf);
-	time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
+	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
 	return retry_read;
 fsck_err:
 	if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
@@ -1322,7 +1331,7 @@ start:
 		}
 	}
 
-	time_stats_update(&c->times[BCH_TIME_btree_node_read],
+	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
 			       rb->start_time);
 	bio_put(&rb->bio);
 
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 2357af3e..51bcdc6c 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -1729,7 +1729,9 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
 	if (ret)
 		return ret;
 
-	btree_path_set_should_be_locked(trans->paths + iter->path);
+	struct btree_path *path = btree_iter_path(trans, iter);
+	if (btree_path_node(path, path->level))
+		btree_path_set_should_be_locked(path);
 	return 0;
 }
 
@@ -2905,7 +2907,7 @@ u32 bch2_trans_begin(struct btree_trans *trans)
 
 	if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) &&
 	    time_after64(now, trans->last_begin_time + 10))
-		__time_stats_update(&btree_trans_stats(trans)->duration,
+		__bch2_time_stats_update(&btree_trans_stats(trans)->duration,
 					 trans->last_begin_time, now);
 
 	if (!trans->restarted &&
@@ -3230,7 +3232,7 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
 	     s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
 	     s++) {
 		kfree(s->max_paths_text);
-		time_stats_exit(&s->lock_hold_times);
+		bch2_time_stats_exit(&s->lock_hold_times);
 	}
 
 	if (c->btree_trans_barrier_initialized)
@@ -3246,8 +3248,8 @@ void bch2_fs_btree_iter_init_early(struct bch_fs *c)
 	for (s = c->btree_transaction_stats;
 	     s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
 	     s++) {
-		time_stats_init(&s->duration);
-		time_stats_init(&s->lock_hold_times);
+		bch2_time_stats_init(&s->duration);
+		bch2_time_stats_init(&s->lock_hold_times);
 		mutex_init(&s->lock);
 	}
 
diff --git a/libbcachefs/btree_journal_iter.c b/libbcachefs/btree_journal_iter.c
index 207dd32e..50e04356 100644
--- a/libbcachefs/btree_journal_iter.c
+++ b/libbcachefs/btree_journal_iter.c
@@ -512,7 +512,7 @@ int bch2_journal_keys_sort(struct bch_fs *c)
 	genradix_for_each(&c->journal_entries, iter, _i) {
 		i = *_i;
 
-		if (!i || i->ignore)
+		if (journal_replay_ignore(i))
 			continue;
 
 		cond_resched();
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 74e52fd2..8a71d434 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -380,9 +380,11 @@ static int btree_key_cache_fill(struct btree_trans *trans,
 	struct bkey_i *new_k = NULL;
 	int ret;
 
-	k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos,
-			       BTREE_ITER_KEY_CACHE_FILL|
-			       BTREE_ITER_CACHED_NOFILL);
+	bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos,
+			     BTREE_ITER_KEY_CACHE_FILL|
+			     BTREE_ITER_CACHED_NOFILL);
+	iter.flags &= ~BTREE_ITER_WITH_JOURNAL;
+	k = bch2_btree_iter_peek_slot(&iter);
 	ret = bkey_err(k);
 	if (ret)
 		goto err;
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index f2e2c588..4bd72c85 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -122,7 +122,7 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
 					      struct btree_path *path, unsigned level)
 {
 #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
-	__time_stats_update(&btree_trans_stats(trans)->lock_hold_times,
+	__bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times,
 				 path->l[level].lock_taken_time,
 				 local_clock());
 #endif
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index b2ebf143..9404d96c 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -2,13 +2,13 @@
 #ifndef _BCACHEFS_BTREE_TYPES_H
 #define _BCACHEFS_BTREE_TYPES_H
 
-#include <linux/darray_types.h>
 #include <linux/list.h>
 #include <linux/rhashtable.h>
 
 #include "bbpos_types.h"
 #include "btree_key_cache_types.h"
 #include "buckets_types.h"
+#include "darray.h"
 #include "errcode.h"
 #include "journal_types.h"
 #include "replicas_types.h"
diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c
index cbb7cf21..a4b40c16 100644
--- a/libbcachefs/btree_update.c
+++ b/libbcachefs/btree_update.c
@@ -14,8 +14,6 @@
 #include "snapshot.h"
 #include "trace.h"
 
-#include <linux/darray.h>
-
 static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
 					 const struct btree_insert_entry *r)
 {
@@ -454,7 +452,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
 	 * the key cache - but the key has to exist in the btree for that to
 	 * work:
 	 */
-	if (path->cached && bkey_deleted(&i->old_k))
+	if (path->cached && !i->old_btree_u64s)
 		return flush_new_cached_update(trans, i, flags, ip);
 
 	return 0;
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 70da4fa2..642213ef 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -25,8 +25,7 @@
 #include <linux/random.h>
 
 static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
-				  btree_path_idx_t, struct btree *,
-				  struct keylist *, unsigned);
+				  btree_path_idx_t, struct btree *, struct keylist *);
 static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
 
 static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans,
@@ -517,7 +516,7 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *
 	bch2_disk_reservation_put(c, &as->disk_res);
 	bch2_btree_reserve_put(as, trans);
 
-	time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
+	bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
 			       as->start_time);
 
 	mutex_lock(&c->btree_interior_update_lock);
@@ -1039,7 +1038,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
 	continue_at(&as->cl, btree_update_set_nodes_written,
 		    as->c->btree_interior_update_worker);
 
-	time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
+	bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
 			       start_time);
 }
 
@@ -1208,10 +1207,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
 	mutex_unlock(&c->btree_cache.lock);
 
 	mutex_lock(&c->btree_root_lock);
-	BUG_ON(btree_node_root(c, b) &&
-	       (b->c.level < btree_node_root(c, b)->c.level ||
-		!btree_node_dying(btree_node_root(c, b))));
-
 	bch2_btree_id_root(c, b->c.btree_id)->b = b;
 	mutex_unlock(&c->btree_root_lock);
 
@@ -1477,7 +1472,7 @@ static void btree_split_insert_keys(struct btree_update *as,
 
 static int btree_split(struct btree_update *as, struct btree_trans *trans,
 		       btree_path_idx_t path, struct btree *b,
-		       struct keylist *keys, unsigned flags)
+		       struct keylist *keys)
 {
 	struct bch_fs *c = as->c;
 	struct btree *parent = btree_node_parent(trans->paths + path, b);
@@ -1578,7 +1573,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
 
 	if (parent) {
 		/* Split a non root node */
-		ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+		ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
 		if (ret)
 			goto err;
 	} else if (n3) {
@@ -1630,7 +1625,7 @@ out:
 
 	bch2_trans_verify_locks(trans);
 
-	time_stats_update(&c->times[n2
+	bch2_time_stats_update(&c->times[n2
 			       ? BCH_TIME_btree_node_split
 			       : BCH_TIME_btree_node_compact],
 			       start_time);
@@ -1673,7 +1668,6 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
  * @path_idx:		path that points to current node
  * @b:			node to insert keys into
  * @keys:		list of keys to insert
- * @flags:		transaction commit flags
  *
  * Returns: 0 on success, typically transaction restart error on failure
  *
@@ -1683,7 +1677,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
  */
 static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
 				  btree_path_idx_t path_idx, struct btree *b,
-				  struct keylist *keys, unsigned flags)
+				  struct keylist *keys)
 {
 	struct bch_fs *c = as->c;
 	struct btree_path *path = trans->paths + path_idx;
@@ -1739,7 +1733,7 @@ split:
 		return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
 	}
 
-	return btree_split(as, trans, path_idx, b, keys, flags);
+	return btree_split(as, trans, path_idx, b, keys);
 }
 
 int bch2_btree_split_leaf(struct btree_trans *trans,
@@ -1747,7 +1741,6 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
 			  unsigned flags)
 {
 	/* btree_split & merge may both cause paths array to be reallocated */
-
 	struct btree *b = path_l(trans->paths + path)->b;
 	struct btree_update *as;
 	unsigned l;
@@ -1759,7 +1752,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
 	if (IS_ERR(as))
 		return PTR_ERR(as);
 
-	ret = btree_split(as, trans, path, b, NULL, flags);
+	ret = btree_split(as, trans, path, b, NULL);
 	if (ret) {
 		bch2_btree_update_free(as, trans);
 		return ret;
@@ -1775,6 +1768,60 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
 	return ret;
 }
 
+static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans,
+				   btree_path_idx_t path_idx)
+{
+	struct bch_fs *c = as->c;
+	struct btree_path *path = trans->paths + path_idx;
+	struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b;
+
+	BUG_ON(!btree_node_locked(path, b->c.level));
+
+	n = __btree_root_alloc(as, trans, b->c.level + 1);
+
+	bch2_btree_update_add_new_node(as, n);
+	six_unlock_write(&n->c.lock);
+
+	path->locks_want++;
+	BUG_ON(btree_node_locked(path, n->c.level));
+	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
+	mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED);
+	bch2_btree_path_level_init(trans, path, n);
+
+	n->sib_u64s[0] = U16_MAX;
+	n->sib_u64s[1] = U16_MAX;
+
+	bch2_keylist_add(&as->parent_keys, &b->key);
+	btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);
+
+	bch2_btree_set_root(as, trans, path, n);
+	bch2_btree_update_get_open_buckets(as, n);
+	bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
+	bch2_trans_node_add(trans, path, n);
+	six_unlock_intent(&n->c.lock);
+
+	mutex_lock(&c->btree_cache.lock);
+	list_add_tail(&b->list, &c->btree_cache.live);
+	mutex_unlock(&c->btree_cache.lock);
+
+	bch2_trans_verify_locks(trans);
+}
+
+int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags)
+{
+	struct bch_fs *c = trans->c;
+	struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b;
+	struct btree_update *as =
+		bch2_btree_update_start(trans, trans->paths + path,
+					b->c.level, true, flags);
+	if (IS_ERR(as))
+		return PTR_ERR(as);
+
+	__btree_increase_depth(as, trans, path);
+	bch2_btree_update_done(as, trans);
+	return 0;
+}
+
 int __bch2_foreground_maybe_merge(struct btree_trans *trans,
 				  btree_path_idx_t path,
 				  unsigned level,
@@ -1915,7 +1962,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
 
 	bch2_trans_verify_paths(trans);
 
-	ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+	ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
 	if (ret)
 		goto err_free_update;
 
@@ -1935,7 +1982,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
 
 	bch2_btree_update_done(as, trans);
 
-	time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
+	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
 out:
 err:
 	if (new_path)
@@ -1986,8 +2033,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
 
 	if (parent) {
 		bch2_keylist_add(&as->parent_keys, &n->key);
-		ret = bch2_btree_insert_node(as, trans, iter->path,
-					     parent, &as->parent_keys, flags);
+		ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys);
 		if (ret)
 			goto err;
 	} else {
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index c593c925..3439b037 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -119,6 +119,8 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
 
 int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned);
 
+int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned);
+
 int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t,
 				  unsigned, unsigned, enum btree_node_sibling);
 
diff --git a/libbcachefs/btree_write_buffer_types.h b/libbcachefs/btree_write_buffer_types.h
index 5f248873..9b9433de 100644
--- a/libbcachefs/btree_write_buffer_types.h
+++ b/libbcachefs/btree_write_buffer_types.h
@@ -2,7 +2,7 @@
 #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
 #define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
 
-#include <linux/darray_types.h>
+#include "darray.h"
 #include "journal_types.h"
 
 #define BTREE_WRITE_BUFERED_VAL_U64s_MAX	4
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index 99293915..38defa19 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -11,6 +11,7 @@
 #include "replicas.h"
 #include "super.h"
 #include "super-io.h"
+#include "thread_with_file.h"
 
 #include <linux/cdev.h>
 #include <linux/device.h>
@@ -19,15 +20,8 @@
 #include <linux/major.h>
 #include <linux/sched/task.h>
 #include <linux/slab.h>
-#include <linux/thread_with_file.h>
 #include <linux/uaccess.h>
 
-__must_check
-static int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
-{
-	return copy_to_user(to, from, n) ? -EFAULT : 0;
-}
-
 /* returns with ref on ca->ref */
 static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
 					  unsigned flags)
@@ -172,9 +166,9 @@ static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
 	bch2_fs_stop(c);
 
 	if (ret & 1)
-		stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
+		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
 	if (ret & 4)
-		stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
+		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
 
 	return ret;
 }
@@ -236,7 +230,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
 
 	opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
 
-	ret = run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops);
+	ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops);
 err:
 	if (ret < 0) {
 		if (thr)
@@ -439,7 +433,7 @@ static int bch2_data_job_release(struct inode *inode, struct file *file)
 {
 	struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
 
-	thread_with_file_exit(&ctx->thr);
+	bch2_thread_with_file_exit(&ctx->thr);
 	kfree(ctx);
 	return 0;
 }
@@ -489,7 +483,7 @@ static long bch2_ioctl_data(struct bch_fs *c,
 	ctx->c = c;
 	ctx->arg = arg;
 
-	ret = run_thread_with_file(&ctx->thr,
+	ret = bch2_run_thread_with_file(&ctx->thr,
 			&bcachefs_data_ops,
 			bch2_data_thread);
 	if (ret < 0)
@@ -857,7 +851,7 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c,
 			goto err;
 	}
 
-	ret = run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
+	ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
 err:
 	if (ret < 0) {
 		bch_err_fn(c, ret);
diff --git a/linux/darray.c b/libbcachefs/darray.c
similarity index 68%
rename from linux/darray.c
rename to libbcachefs/darray.c
index 80e77959..ac35b8b7 100644
--- a/linux/darray.c
+++ b/libbcachefs/darray.c
@@ -1,13 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0
-/*
- * (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev>
- */
 
-#include <linux/darray.h>
 #include <linux/log2.h>
 #include <linux/slab.h>
+#include "darray.h"
 
-int __darray_resize_slowpath(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
+int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
 {
 	if (new_size > d->size) {
 		new_size = roundup_pow_of_two(new_size);
diff --git a/include/linux/darray.h b/libbcachefs/darray.h
similarity index 66%
rename from include/linux/darray.h
rename to libbcachefs/darray.h
index ff167eb7..4b340d13 100644
--- a/include/linux/darray.h
+++ b/libbcachefs/darray.h
@@ -1,26 +1,34 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*
- * (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev>
- */
-#ifndef _LINUX_DARRAY_H
-#define _LINUX_DARRAY_H
+#ifndef _BCACHEFS_DARRAY_H
+#define _BCACHEFS_DARRAY_H
 
 /*
- * Dynamic arrays
+ * Dynamic arrays:
  *
  * Inspired by CCAN's darray
  */
 
-#include <linux/darray_types.h>
 #include <linux/slab.h>
 
-int __darray_resize_slowpath(darray_char *, size_t, size_t, gfp_t);
+#define DARRAY_PREALLOCATED(_type, _nr)					\
+struct {								\
+	size_t nr, size;						\
+	_type *data;							\
+	_type preallocated[_nr];					\
+}
+
+#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
+
+typedef DARRAY(char)	darray_char;
+typedef DARRAY(char *) darray_str;
+
+int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t);
 
 static inline int __darray_resize(darray_char *d, size_t element_size,
 				  size_t new_size, gfp_t gfp)
 {
 	return unlikely(new_size > d->size)
-		? __darray_resize_slowpath(d, element_size, new_size, gfp)
+		? __bch2_darray_resize(d, element_size, new_size, gfp)
 		: 0;
 }
 
@@ -61,28 +69,6 @@ static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more,
 #define darray_first(_d)	((_d).data[0])
 #define darray_last(_d)		((_d).data[(_d).nr - 1])
 
-/* Insert/remove items into the middle of a darray: */
-
-#define array_insert_item(_array, _nr, _pos, _new_item)			\
-do {									\
-	memmove(&(_array)[(_pos) + 1],					\
-		&(_array)[(_pos)],					\
-		sizeof((_array)[0]) * ((_nr) - (_pos)));		\
-	(_nr)++;							\
-	(_array)[(_pos)] = (_new_item);					\
-} while (0)
-
-#define array_remove_items(_array, _nr, _pos, _nr_to_remove)		\
-do {									\
-	(_nr) -= (_nr_to_remove);					\
-	memmove(&(_array)[(_pos)],					\
-		&(_array)[(_pos) + (_nr_to_remove)],			\
-		sizeof((_array)[0]) * ((_nr) - (_pos)));		\
-} while (0)
-
-#define array_remove_item(_array, _nr, _pos)				\
-	array_remove_items(_array, _nr, _pos, 1)
-
 #define darray_insert_item(_d, pos, _item)				\
 ({									\
 	size_t _pos = (pos);						\
@@ -93,15 +79,10 @@ do {									\
 	_ret;								\
 })
 
-#define darray_remove_items(_d, _pos, _nr_to_remove)			\
-	array_remove_items((_d)->data, (_d)->nr, (_pos) - (_d)->data, _nr_to_remove)
-
 #define darray_remove_item(_d, _pos)					\
-	darray_remove_items(_d, _pos, 1)
+	array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
 
-/* Iteration: */
-
-#define __darray_for_each(_d, _i)					\
+#define __darray_for_each(_d, _i)						\
 	for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++)
 
 #define darray_for_each(_d, _i)						\
@@ -125,4 +106,4 @@ do {									\
 	darray_init(_d);						\
 } while (0)
 
-#endif /* _LINUX_DARRAY_H */
+#endif /* _BCACHEFS_DARRAY_H */
diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h
index e960a6ea..af25d8ec 100644
--- a/libbcachefs/errcode.h
+++ b/libbcachefs/errcode.h
@@ -5,6 +5,10 @@
 #define BCH_ERRCODES()								\
 	x(ERANGE,			ERANGE_option_too_small)		\
 	x(ERANGE,			ERANGE_option_too_big)			\
+	x(EINVAL,			mount_option)				\
+	x(BCH_ERR_mount_option,		option_name)				\
+	x(BCH_ERR_mount_option,		option_value)				\
+	x(BCH_ERR_mount_option,         option_not_bool)                        \
 	x(ENOMEM,			ENOMEM_stripe_buf)			\
 	x(ENOMEM,			ENOMEM_replicas_table)			\
 	x(ENOMEM,			ENOMEM_cpu_replicas)			\
@@ -247,7 +251,8 @@
 	x(BCH_ERR_nopromote,		nopromote_congested)			\
 	x(BCH_ERR_nopromote,		nopromote_in_flight)			\
 	x(BCH_ERR_nopromote,		nopromote_no_writes)			\
-	x(BCH_ERR_nopromote,		nopromote_enomem)
+	x(BCH_ERR_nopromote,		nopromote_enomem)			\
+	x(0,				need_inode_lock)
 
 enum bch_errcode {
 	BCH_ERR_START		= 2048,
diff --git a/libbcachefs/error.c b/libbcachefs/error.c
index 8ae95b21..04343120 100644
--- a/libbcachefs/error.c
+++ b/libbcachefs/error.c
@@ -3,7 +3,7 @@
 #include "error.h"
 #include "recovery.h"
 #include "super.h"
-#include <linux/thread_with_file.h>
+#include "thread_with_file.h"
 
 #define FSCK_ERR_RATELIMIT_NR	10
 
@@ -111,7 +111,7 @@ static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c)
 	do {
 		bch2_print(c, " (y,n, or Y,N for all errors of this type) ");
 
-		int r = stdio_redirect_readline(stdio, buf, sizeof(buf) - 1);
+		int r = bch2_stdio_redirect_readline(stdio, buf, sizeof(buf) - 1);
 		if (r < 0)
 			return YN_NO;
 		buf[r] = '\0';
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 6bf839d6..6219f2c0 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -43,6 +43,11 @@ enum bkey_invalid_flags;
 #define extent_entry_next(_entry)					\
 	((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
 
+#define extent_entry_next_safe(_entry, _end)				\
+	(likely(__extent_entry_type(_entry) < BCH_EXTENT_ENTRY_MAX)	\
+	 ? extent_entry_next(_entry)					\
+	 : _end)
+
 static inline unsigned
 __extent_entry_type(const union bch_extent_entry *e)
 {
@@ -280,7 +285,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
 #define __bkey_extent_entry_for_each_from(_start, _end, _entry)		\
 	for ((_entry) = (_start);					\
 	     (_entry) < (_end);						\
-	     (_entry) = extent_entry_next(_entry))
+	     (_entry) = extent_entry_next_safe(_entry, _end))
 
 #define __bkey_ptr_next(_ptr, _end)					\
 ({									\
@@ -318,7 +323,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
 	(_ptr).has_ec	= false;					\
 									\
 	__bkey_extent_entry_for_each_from(_entry, _end, _entry)		\
-		switch (extent_entry_type(_entry)) {			\
+		switch (__extent_entry_type(_entry)) {			\
 		case BCH_EXTENT_ENTRY_ptr:				\
 			(_ptr).ptr		= _entry->ptr;		\
 			goto out;					\
@@ -344,7 +349,7 @@ out:									\
 	for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL),		\
 	     (_entry) = _start;						\
 	     __bkey_ptr_next_decode(_k, _end, _ptr, _entry);		\
-	     (_entry) = extent_entry_next(_entry))
+	     (_entry) = extent_entry_next_safe(_entry, _end))
 
 #define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry)			\
 	__bkey_for_each_ptr_decode(_k, (_p).start, (_p).end,		\
diff --git a/include/linux/eytzinger.h b/libbcachefs/eytzinger.h
similarity index 77%
rename from include/linux/eytzinger.h
rename to libbcachefs/eytzinger.h
index 10315010..b04750db 100644
--- a/include/linux/eytzinger.h
+++ b/libbcachefs/eytzinger.h
@@ -1,37 +1,27 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_EYTZINGER_H
-#define _LINUX_EYTZINGER_H
+#ifndef _EYTZINGER_H
+#define _EYTZINGER_H
 
 #include <linux/bitops.h>
 #include <linux/log2.h>
 
-#ifdef EYTZINGER_DEBUG
-#define EYTZINGER_BUG_ON(cond)		BUG_ON(cond)
-#else
-#define EYTZINGER_BUG_ON(cond)
-#endif
+#include "util.h"
 
 /*
  * Traversal for trees in eytzinger layout - a full binary tree layed out in an
- * array.
+ * array
+ */
+
+/*
+ * One based indexing version:
  *
- * Consider using an eytzinger tree any time you would otherwise be doing binary
- * search over an array. Binary search is a worst case scenario for branch
- * prediction and prefetching, but in an eytzinger tree every node's children
- * are adjacent in memory, thus we can prefetch children before knowing the
- * result of the comparison, assuming multiple nodes fit on a cacheline.
- *
- * Two variants are provided, for one based indexing and zero based indexing.
- *
- * Zero based indexing is more convenient, but one based indexing has better
- * alignment and thus better performance because each new level of the tree
- * starts at a power of two, and thus if element 0 was cacheline aligned, each
- * new level will be as well.
+ * With one based indexing each level of the tree starts at a power of two -
+ * good for cacheline alignment:
  */
 
 static inline unsigned eytzinger1_child(unsigned i, unsigned child)
 {
-	EYTZINGER_BUG_ON(child > 1);
+	EBUG_ON(child > 1);
 
 	return (i << 1) + child;
 }
@@ -68,7 +58,7 @@ static inline unsigned eytzinger1_last(unsigned size)
 
 static inline unsigned eytzinger1_next(unsigned i, unsigned size)
 {
-	EYTZINGER_BUG_ON(i > size);
+	EBUG_ON(i > size);
 
 	if (eytzinger1_right_child(i) <= size) {
 		i = eytzinger1_right_child(i);
@@ -84,7 +74,7 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size)
 
 static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
 {
-	EYTZINGER_BUG_ON(i > size);
+	EBUG_ON(i > size);
 
 	if (eytzinger1_left_child(i) <= size) {
 		i = eytzinger1_left_child(i) + 1;
@@ -111,7 +101,7 @@ static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
 	unsigned shift = __fls(size) - b;
 	int s;
 
-	EYTZINGER_BUG_ON(!i || i > size);
+	EBUG_ON(!i || i > size);
 
 	i  ^= 1U << b;
 	i <<= 1;
@@ -136,7 +126,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
 	unsigned shift;
 	int s;
 
-	EYTZINGER_BUG_ON(!i || i > size);
+	EBUG_ON(!i || i > size);
 
 	/*
 	 * sign bit trick:
@@ -174,7 +164,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
 
 static inline unsigned eytzinger0_child(unsigned i, unsigned child)
 {
-	EYTZINGER_BUG_ON(child > 1);
+	EBUG_ON(child > 1);
 
 	return (i << 1) + 1 + child;
 }
@@ -241,9 +231,11 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
 	     (_i) != -1;				\
 	     (_i) = eytzinger0_next((_i), (_size)))
 
+typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
+
 /* return greatest node <= @search, or -1 if not found */
 static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
-					 cmp_func_t cmp, const void *search)
+					 eytzinger_cmp_fn cmp, const void *search)
 {
 	unsigned i, n = 0;
 
@@ -252,7 +244,7 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
 
 	do {
 		i = n;
-		n = eytzinger0_child(i, cmp(search, base + i * size) >= 0);
+		n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
 	} while (n < nr);
 
 	if (n & 1) {
@@ -277,13 +269,13 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
 	int _res;							\
 									\
 	while (_i < _nr &&						\
-	       (_res = _cmp(_search, _base + _i * _size)))		\
+	       (_res = _cmp(_search, _base + _i * _size, _size)))	\
 		_i = eytzinger0_child(_i, _res > 0);			\
 	_i;								\
 })
 
-void eytzinger0_sort_r(void *, size_t, size_t,
-		       cmp_r_func_t, swap_r_func_t, const void *);
-void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t);
+void eytzinger0_sort(void *, size_t, size_t,
+		    int (*cmp_func)(const void *, const void *, size_t),
+		    void (*swap_func)(void *, void *, size_t));
 
-#endif /* _LINUX_EYTZINGER_H */
+#endif /* _EYTZINGER_H */
diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c
index 27710cdd..39292e7e 100644
--- a/libbcachefs/fs-io-buffered.c
+++ b/libbcachefs/fs-io-buffered.c
@@ -810,7 +810,8 @@ static noinline void folios_trunc(folios *fs, struct folio **fi)
 static int __bch2_buffered_write(struct bch_inode_info *inode,
 				 struct address_space *mapping,
 				 struct iov_iter *iter,
-				 loff_t pos, unsigned len)
+				 loff_t pos, unsigned len,
+				 bool inode_locked)
 {
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
 	struct bch2_folio_reservation res;
@@ -835,6 +836,15 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
 
 	BUG_ON(!fs.nr);
 
+	/*
+	 * If we're not using the inode lock, we need to lock all the folios for
+	 * atomicity of writes vs. other writes:
+	 */
+	if (!inode_locked && folio_end_pos(darray_last(fs)) < end) {
+		ret = -BCH_ERR_need_inode_lock;
+		goto out;
+	}
+
 	f = darray_first(fs);
 	if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
 		ret = bch2_read_single_folio(f, mapping);
@@ -929,8 +939,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
 	end = pos + copied;
 
 	spin_lock(&inode->v.i_lock);
-	if (end > inode->v.i_size)
+	if (end > inode->v.i_size) {
+		BUG_ON(!inode_locked);
 		i_size_write(&inode->v, end);
+	}
 	spin_unlock(&inode->v.i_lock);
 
 	f_pos = pos;
@@ -974,12 +986,68 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	struct bch_inode_info *inode = file_bch_inode(file);
-	loff_t pos = iocb->ki_pos;
-	ssize_t written = 0;
-	int ret = 0;
+	loff_t pos;
+	bool inode_locked = false;
+	ssize_t written = 0, written2 = 0, ret = 0;
+
+	/*
+	 * We don't take the inode lock unless i_size will be changing. Folio
+	 * locks provide exclusion with other writes, and the pagecache add lock
+	 * provides exclusion with truncate and hole punching.
+	 *
+	 * There is one nasty corner case where atomicity would be broken
+	 * without great care: when copying data from userspace to the page
+	 * cache, we do that with faults disabled - a page fault would recurse
+	 * back into the filesystem, taking filesystem locks again, and
+	 * deadlock; so it's done with faults disabled, and we fault in the user
+	 * buffer when we aren't holding locks.
+	 *
+	 * If we do part of the write, but we then race and pages in the
+	 * userspace buffer have been evicted and are no longer resident, then we have to
+	 * drop our folio locks to re-fault them in, breaking write atomicity.
+	 *
+	 * To fix this, we restart the write from the start, if we weren't
+	 * holding the inode lock.
+	 *
+	 * There is another wrinkle after that; if we restart the write from the
+	 * start, and then get an unrecoverable error, we _cannot_ claim to
+	 * userspace that we did not write data we actually did - so we must
+	 * track (written2) the most we ever wrote.
+	 */
+
+	if ((iocb->ki_flags & IOCB_APPEND) ||
+	    (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) {
+		inode_lock(&inode->v);
+		inode_locked = true;
+	}
+
+	ret = generic_write_checks(iocb, iter);
+	if (ret <= 0)
+		goto unlock;
+
+	ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0);
+	if (ret) {
+		if (!inode_locked) {
+			inode_lock(&inode->v);
+			inode_locked = true;
+			ret = file_remove_privs_flags(file, 0);
+		}
+		if (ret)
+			goto unlock;
+	}
+
+	ret = file_update_time(file);
+	if (ret)
+		goto unlock;
+
+	pos = iocb->ki_pos;
 
 	bch2_pagecache_add_get(inode);
 
+	if (!inode_locked &&
+	    (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v)))
+		goto get_inode_lock;
+
 	do {
 		unsigned offset = pos & (PAGE_SIZE - 1);
 		unsigned bytes = iov_iter_count(iter);
@@ -1004,12 +1072,17 @@ again:
 			}
 		}
 
+		if (unlikely(bytes != iov_iter_count(iter) && !inode_locked))
+			goto get_inode_lock;
+
 		if (unlikely(fatal_signal_pending(current))) {
 			ret = -EINTR;
 			break;
 		}
 
-		ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
+		ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked);
+		if (ret == -BCH_ERR_need_inode_lock)
+			goto get_inode_lock;
 		if (unlikely(ret < 0))
 			break;
 
@@ -1030,50 +1103,46 @@ again:
 		}
 		pos += ret;
 		written += ret;
+		written2 = max(written, written2);
+
+		if (ret != bytes && !inode_locked)
+			goto get_inode_lock;
 		ret = 0;
 
 		balance_dirty_pages_ratelimited(mapping);
+
+		if (0) {
+get_inode_lock:
+			bch2_pagecache_add_put(inode);
+			inode_lock(&inode->v);
+			inode_locked = true;
+			bch2_pagecache_add_get(inode);
+
+			iov_iter_revert(iter, written);
+			pos -= written;
+			written = 0;
+			ret = 0;
+		}
 	} while (iov_iter_count(iter));
-
 	bch2_pagecache_add_put(inode);
-
-	return written ? written : ret;
-}
-
-ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
-	struct file *file = iocb->ki_filp;
-	struct bch_inode_info *inode = file_bch_inode(file);
-	ssize_t ret;
-
-	if (iocb->ki_flags & IOCB_DIRECT) {
-		ret = bch2_direct_write(iocb, from);
-		goto out;
-	}
-
-	inode_lock(&inode->v);
-
-	ret = generic_write_checks(iocb, from);
-	if (ret <= 0)
-		goto unlock;
-
-	ret = file_remove_privs(file);
-	if (ret)
-		goto unlock;
-
-	ret = file_update_time(file);
-	if (ret)
-		goto unlock;
-
-	ret = bch2_buffered_write(iocb, from);
-	if (likely(ret > 0))
-		iocb->ki_pos += ret;
 unlock:
-	inode_unlock(&inode->v);
+	if (inode_locked)
+		inode_unlock(&inode->v);
 
+	iocb->ki_pos += written;
+
+	ret = max(written, written2) ?: ret;
 	if (ret > 0)
 		ret = generic_write_sync(iocb, ret);
-out:
+	return ret;
+}
+
+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+	ssize_t ret = iocb->ki_flags & IOCB_DIRECT
+		? bch2_direct_write(iocb, iter)
+		: bch2_buffered_write(iocb, iter);
+
 	return bch2_err_class(ret);
 }
 
diff --git a/libbcachefs/fs-io-pagecache.h b/libbcachefs/fs-io-pagecache.h
index 8cbaba65..828c3d7c 100644
--- a/libbcachefs/fs-io-pagecache.h
+++ b/libbcachefs/fs-io-pagecache.h
@@ -51,13 +51,10 @@ enum bch_folio_sector_state {
 
 struct bch_folio_sector {
 	/* Uncompressed, fully allocated replicas (or on disk reservation): */
-	unsigned		nr_replicas:4;
-
+	u8			nr_replicas:4,
 	/* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */
-	unsigned		replicas_reserved:4;
-
-	/* i_sectors: */
-	enum bch_folio_sector_state state:8;
+				replicas_reserved:4;
+	u8			state;
 };
 
 struct bch_folio {
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 093f5404..3f073845 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -1870,8 +1870,10 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
 	opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
 
 	ret = bch2_parse_mount_opts(NULL, &opts, data);
-	if (ret)
+	if (ret) {
+		ret = bch2_err_class(ret);
 		return ERR_PTR(ret);
+	}
 
 	if (!dev_name || strlen(dev_name) == 0)
 		return ERR_PTR(-EINVAL);
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 144f074b..f48033be 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -5,6 +5,7 @@
 #include "btree_cache.h"
 #include "btree_update.h"
 #include "buckets.h"
+#include "darray.h"
 #include "dirent.h"
 #include "error.h"
 #include "fs-common.h"
@@ -17,7 +18,6 @@
 #include "xattr.h"
 
 #include <linux/bsearch.h>
-#include <linux/darray.h>
 #include <linux/dcache.h> /* struct qstr */
 
 /*
@@ -849,12 +849,9 @@ static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
 {
 	struct btree_iter iter;
 	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
-	int ret = bkey_err(k);
-	if (ret)
-		return ret;
-
+	int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
 	bch2_trans_iter_exit(trans, &iter);
-	return k.k->type == KEY_TYPE_set;
+	return ret;
 }
 
 static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k,
@@ -970,7 +967,7 @@ static int check_inode(struct btree_trans *trans,
 		if (ret < 0)
 			return ret;
 
-		fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list,
+		fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list,
 			    "inode %llu:%u unlinked, but not on deleted list",
 			    u.bi_inum, k.k->p.snapshot);
 		ret = 0;
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index a3139bb6..2b5e0677 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -1181,6 +1181,15 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
 	bool need_another_pass;
 	int ret;
 again:
+	/*
+	 * if we ran check_inodes() unlinked inodes will have already been
+	 * cleaned up but the write buffer will be out of sync; therefore we
+	 * always need a write buffer flush
+	 */
+	ret = bch2_btree_write_buffer_flush_sync(trans);
+	if (ret)
+		goto err;
+
 	need_another_pass = false;
 
 	/*
@@ -1213,12 +1222,8 @@ again:
 		ret;
 	}));
 
-	if (!ret && need_another_pass) {
-		ret = bch2_btree_write_buffer_flush_sync(trans);
-		if (ret)
-			goto err;
+	if (!ret && need_another_pass)
 		goto again;
-	}
 err:
 	bch2_trans_put(trans);
 	return ret;
diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c
index dce136cd..8a556e6d 100644
--- a/libbcachefs/io_read.c
+++ b/libbcachefs/io_read.c
@@ -134,7 +134,7 @@ static void promote_done(struct bch_write_op *wop)
 		container_of(wop, struct promote_op, write.op);
 	struct bch_fs *c = op->write.op.c;
 
-	time_stats_update(&c->times[BCH_TIME_data_promote],
+	bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
 			       op->start_time);
 	promote_free(c, op);
 }
@@ -174,7 +174,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
 	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
 		return ERR_PTR(-BCH_ERR_nopromote_no_writes);
 
-	op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_KERNEL);
+	op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL);
 	if (!op) {
 		ret = -BCH_ERR_nopromote_enomem;
 		goto err;
@@ -356,7 +356,7 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
 static void bch2_rbio_done(struct bch_read_bio *rbio)
 {
 	if (rbio->start_time)
-		time_stats_update(&rbio->c->times[BCH_TIME_data_read],
+		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
 				       rbio->start_time);
 	bio_endio(&rbio->bio);
 }
diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c
index f7c4a428..f137252b 100644
--- a/libbcachefs/io_write.c
+++ b/libbcachefs/io_write.c
@@ -88,7 +88,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
 
 	bch2_congested_acct(ca, io_latency, now, rw);
 
-	__time_stats_update(&ca->io_latency[rw].stats, submit_time, now);
+	__bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now);
 }
 
 #endif
@@ -457,7 +457,7 @@ static void bch2_write_done(struct closure *cl)
 
 	EBUG_ON(op->open_buckets.nr);
 
-	time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
+	bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
 	bch2_disk_reservation_put(c, &op->res);
 
 	if (!(op->flags & BCH_WRITE_MOVE))
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 46dc25ad..f314b2e7 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -84,12 +84,8 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6
 		prt_str(out, "separate_flush ");
 	if (buf->need_flush_to_write_buffer)
 		prt_str(out, "need_flush_to_write_buffer ");
-	if (buf->need_flush_to_write_buffer)
-		prt_str(out, "need_flush_to_write_buffer ");
-	if (buf->write_done)
-		prt_str(out, "write done ");
 	if (buf->write_started)
-		prt_str(out, "write started ");
+		prt_str(out, "write_started ");
 	if (buf->write_allocated)
 		prt_str(out, "write allocated ");
 	if (buf->write_done)
@@ -715,7 +711,7 @@ recheck_need_open:
 			return ret;
 
 		seq = res.seq;
-		buf = j->buf + (seq & JOURNAL_BUF_MASK);
+		buf = journal_seq_to_buf(j, seq);
 		buf->must_flush = true;
 
 		if (!buf->flush_time) {
@@ -733,8 +729,8 @@ recheck_need_open:
 	}
 
 	/*
-	 * if write was kicked off without a flush, flush the next sequence
-	 * number instead
+	 * if write was kicked off without a flush, or if we promised it
+	 * wouldn't be a flush, flush the next sequence number instead
 	 */
 	buf = journal_seq_to_buf(j, seq);
 	if (buf->noflush) {
@@ -768,7 +764,7 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
 	ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
 
 	if (!ret)
-		time_stats_update(j->flush_seq_time, start_time);
+		bch2_time_stats_update(j->flush_seq_time, start_time);
 
 	return ret ?: ret2 < 0 ? ret2 : 0;
 }
@@ -812,8 +808,8 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq)
 	     unwritten_seq++) {
 		struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
 
-		/* journal write is already in flight, and was a flush write: */
-		if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush)
+		/* journal flush already in flight, or flush requested */
+		if (buf->must_flush)
 			goto out;
 
 		buf->noflush = true;
@@ -1203,7 +1199,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
 	genradix_for_each_reverse(&c->journal_entries, iter, _i) {
 		i = *_i;
 
-		if (!i || i->ignore)
+		if (journal_replay_ignore(i))
 			continue;
 
 		last_seq = le64_to_cpu(i->j.last_seq);
@@ -1236,7 +1232,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
 	genradix_for_each(&c->journal_entries, iter, _i) {
 		i = *_i;
 
-		if (!i || i->ignore)
+		if (journal_replay_ignore(i))
 			continue;
 
 		seq = le64_to_cpu(i->j.seq);
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index b37b75cc..d76c3c0c 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -86,9 +86,12 @@ static void __journal_replay_free(struct bch_fs *c,
 	kvfree(i);
 }
 
-static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
+static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted)
 {
-	i->ignore = true;
+	if (blacklisted)
+		i->ignore_blacklisted = true;
+	else
+		i->ignore_not_dirty = true;
 
 	if (!c->opts.read_entire_journal)
 		__journal_replay_free(c, i);
@@ -138,12 +141,13 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
 				       journal_entry_radix_idx(c, jlist->last_seq)) {
 			i = *_i;
 
-			if (!i || i->ignore)
+			if (journal_replay_ignore(i))
 				continue;
 
 			if (le64_to_cpu(i->j.seq) >= last_seq)
 				break;
-			journal_replay_free(c, i);
+
+			journal_replay_free(c, i, false);
 		}
 	}
 
@@ -199,8 +203,9 @@ replace:
 		return -BCH_ERR_ENOMEM_journal_entry_add;
 
 	darray_init(&i->ptrs);
-	i->csum_good	= entry_ptr.csum_good;
-	i->ignore	= false;
+	i->csum_good		= entry_ptr.csum_good;
+	i->ignore_blacklisted	= false;
+	i->ignore_not_dirty	= false;
 	unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
 
 	if (dup) {
@@ -1255,20 +1260,20 @@ int bch2_journal_read(struct bch_fs *c,
 
 		i = *_i;
 
-		if (!i || i->ignore)
+		if (journal_replay_ignore(i))
 			continue;
 
 		if (!*start_seq)
 			*blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;
 
 		if (JSET_NO_FLUSH(&i->j)) {
-			i->ignore = true;
+			i->ignore_blacklisted = true;
 			continue;
 		}
 
 		if (!last_write_torn && !i->csum_good) {
 			last_write_torn = true;
-			i->ignore = true;
+			i->ignore_blacklisted = true;
 			continue;
 		}
 
@@ -1307,12 +1312,12 @@ int bch2_journal_read(struct bch_fs *c,
 	genradix_for_each(&c->journal_entries, radix_iter, _i) {
 		i = *_i;
 
-		if (!i || i->ignore)
+		if (journal_replay_ignore(i))
 			continue;
 
 		seq = le64_to_cpu(i->j.seq);
 		if (seq < *last_seq) {
-			journal_replay_free(c, i);
+			journal_replay_free(c, i, false);
 			continue;
 		}
 
@@ -1320,7 +1325,7 @@ int bch2_journal_read(struct bch_fs *c,
 			fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
 				    jset_seq_blacklisted,
 				    "found blacklisted journal entry %llu", seq);
-			i->ignore = true;
+			i->ignore_blacklisted = true;
 		}
 	}
 
@@ -1329,7 +1334,7 @@ int bch2_journal_read(struct bch_fs *c,
 	genradix_for_each(&c->journal_entries, radix_iter, _i) {
 		i = *_i;
 
-		if (!i || i->ignore)
+		if (journal_replay_ignore(i))
 			continue;
 
 		BUG_ON(seq > le64_to_cpu(i->j.seq));
@@ -1382,7 +1387,7 @@ int bch2_journal_read(struct bch_fs *c,
 		};
 
 		i = *_i;
-		if (!i || i->ignore)
+		if (journal_replay_ignore(i))
 			continue;
 
 		darray_for_each(i->ptrs, ptr) {
@@ -1602,9 +1607,9 @@ static CLOSURE_CALLBACK(journal_write_done)
 	u64 v, seq = le64_to_cpu(w->data->seq);
 	int err = 0;
 
-	time_stats_update(!JSET_NO_FLUSH(w->data)
-			  ? j->flush_write_time
-			  : j->noflush_write_time, j->write_start_time);
+	bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
+			       ? j->flush_write_time
+			       : j->noflush_write_time, j->write_start_time);
 
 	if (!w->devs_written.nr) {
 		bch_err(c, "unable to write journal to sufficient devices");
@@ -1667,6 +1672,7 @@ static CLOSURE_CALLBACK(journal_write_done)
 			new.unwritten_idx++;
 		} while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v);
 
+		closure_wake_up(&w->wait);
 		completed = true;
 	}
 
@@ -1676,7 +1682,6 @@ static CLOSURE_CALLBACK(journal_write_done)
 
 		track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);
 
-		closure_wake_up(&w->wait);
 		journal_wake(j);
 	}
 
@@ -1930,6 +1935,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
 
 		j->nr_noflush_writes++;
 	} else {
+		w->must_flush = true;
 		j->last_flush_write = jiffies;
 		j->nr_flush_writes++;
 		clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h
index 59790456..4f1e763a 100644
--- a/libbcachefs/journal_io.h
+++ b/libbcachefs/journal_io.h
@@ -2,7 +2,7 @@
 #ifndef _BCACHEFS_JOURNAL_IO_H
 #define _BCACHEFS_JOURNAL_IO_H
 
-#include <linux/darray_types.h>
+#include "darray.h"
 
 struct journal_ptr {
 	bool		csum_good;
@@ -20,11 +20,17 @@ struct journal_replay {
 	DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs;
 
 	bool			csum_good;
-	bool			ignore;
+	bool			ignore_blacklisted;
+	bool			ignore_not_dirty;
 	/* must be last: */
 	struct jset		j;
 };
 
+static inline bool journal_replay_ignore(struct journal_replay *i)
+{
+	return !i || i->ignore_blacklisted || i->ignore_not_dirty;
+}
+
 static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
 					struct jset_entry *entry, unsigned type)
 {
diff --git a/libbcachefs/journal_sb.c b/libbcachefs/journal_sb.c
index 156691c2..ae4fb8c3 100644
--- a/libbcachefs/journal_sb.c
+++ b/libbcachefs/journal_sb.c
@@ -2,8 +2,8 @@
 
 #include "bcachefs.h"
 #include "journal_sb.h"
+#include "darray.h"
 
-#include <linux/darray.h>
 #include <linux/sort.h>
 
 /* BCH_SB_FIELD_journal: */
diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c
index 024c9b1b..b5303874 100644
--- a/libbcachefs/journal_seq_blacklist.c
+++ b/libbcachefs/journal_seq_blacklist.c
@@ -2,11 +2,10 @@
 
 #include "bcachefs.h"
 #include "btree_iter.h"
+#include "eytzinger.h"
 #include "journal_seq_blacklist.h"
 #include "super-io.h"
 
-#include <linux/eytzinger.h>
-
 /*
  * journal_seq_blacklist machinery:
  *
@@ -44,61 +43,36 @@ static unsigned sb_blacklist_u64s(unsigned nr)
 	return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64);
 }
 
-static struct bch_sb_field_journal_seq_blacklist *
-blacklist_entry_try_merge(struct bch_fs *c,
-			  struct bch_sb_field_journal_seq_blacklist *bl,
-			  unsigned i)
-{
-	unsigned nr = blacklist_nr_entries(bl);
-
-	if (le64_to_cpu(bl->start[i].end) >=
-	    le64_to_cpu(bl->start[i + 1].start)) {
-		bl->start[i].end = bl->start[i + 1].end;
-		--nr;
-		memmove(&bl->start[i],
-			&bl->start[i + 1],
-			sizeof(bl->start[0]) * (nr - i));
-
-		bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
-					  sb_blacklist_u64s(nr));
-		BUG_ON(!bl);
-	}
-
-	return bl;
-}
-
-static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e,
-					u64 start, u64 end)
-{
-	return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start);
-}
-
 int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
 {
 	struct bch_sb_field_journal_seq_blacklist *bl;
-	unsigned i, nr;
+	unsigned i = 0, nr;
 	int ret = 0;
 
 	mutex_lock(&c->sb_lock);
 	bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
 	nr = blacklist_nr_entries(bl);
 
-	for (i = 0; i < nr; i++) {
+	while (i < nr) {
 		struct journal_seq_blacklist_entry *e =
 			bl->start + i;
 
-		if (bl_entry_contig_or_overlaps(e, start, end)) {
-			e->start = cpu_to_le64(min(start, le64_to_cpu(e->start)));
-			e->end	= cpu_to_le64(max(end, le64_to_cpu(e->end)));
+		if (end < le64_to_cpu(e->start))
+			break;
 
-			if (i + 1 < nr)
-				bl = blacklist_entry_try_merge(c,
-							bl, i);
-			if (i)
-				bl = blacklist_entry_try_merge(c,
-							bl, i - 1);
-			goto out_write_sb;
+		if (start > le64_to_cpu(e->end)) {
+			i++;
+			continue;
 		}
+
+		/*
+		 * Entry is contiguous or overlapping with new entry: merge it
+		 * with new entry, and delete:
+		 */
+
+		start	= min(start,	le64_to_cpu(e->start));
+		end	= max(end,	le64_to_cpu(e->end));
+		array_remove_item(bl->start, nr, i);
 	}
 
 	bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
@@ -108,9 +82,10 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
 		goto out;
 	}
 
-	bl->start[nr].start	= cpu_to_le64(start);
-	bl->start[nr].end	= cpu_to_le64(end);
-out_write_sb:
+	array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) {
+		.start	= cpu_to_le64(start),
+		.end	= cpu_to_le64(end),
+	}));
 	c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3);
 
 	ret = bch2_write_super(c);
@@ -120,7 +95,8 @@ out:
 	return ret ?: bch2_blacklist_table_initialize(c);
 }
 
-static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r)
+static int journal_seq_blacklist_table_cmp(const void *_l,
+					   const void *_r, size_t size)
 {
 	const struct journal_seq_blacklist_table_entry *l = _l;
 	const struct journal_seq_blacklist_table_entry *r = _r;
@@ -165,8 +141,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
 	if (!bl)
 		return 0;
 
-	t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr,
-		    GFP_KERNEL);
+	t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL);
 	if (!t)
 		return -BCH_ERR_ENOMEM_blacklist_table_init;
 
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index 011f7a0d..8c053cb6 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -287,9 +287,9 @@ struct journal {
 	u64			nr_noflush_writes;
 	u64			entry_bytes_written;
 
-	struct time_stats	*flush_write_time;
-	struct time_stats	*noflush_write_time;
-	struct time_stats	*flush_seq_time;
+	struct bch2_time_stats	*flush_write_time;
+	struct bch2_time_stats	*noflush_write_time;
+	struct bch2_time_stats	*flush_seq_time;
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map	res_map;
diff --git a/libbcachefs/lru.c b/libbcachefs/lru.c
index ed7577cd..26569043 100644
--- a/libbcachefs/lru.c
+++ b/libbcachefs/lru.c
@@ -125,8 +125,7 @@ static int bch2_check_lru_key(struct btree_trans *trans,
 			goto out;
 		}
 
-		if (c->opts.reconstruct_alloc ||
-		    fsck_err(c, lru_entry_bad,
+		if (fsck_err(c, lru_entry_bad,
 			     "incorrect lru entry: lru %s time %llu\n"
 			     "  %s\n"
 			     "  for %s",
diff --git a/linux/mean_and_variance.c b/libbcachefs/mean_and_variance.c
similarity index 99%
rename from linux/mean_and_variance.c
rename to libbcachefs/mean_and_variance.c
index 21ec6afc..0ea9f308 100644
--- a/linux/mean_and_variance.c
+++ b/libbcachefs/mean_and_variance.c
@@ -40,9 +40,10 @@
 #include <linux/limits.h>
 #include <linux/math.h>
 #include <linux/math64.h>
-#include <linux/mean_and_variance.h>
 #include <linux/module.h>
 
+#include "mean_and_variance.h"
+
 u128_u u128_div(u128_u n, u64 d)
 {
 	u128_u r;
diff --git a/include/linux/mean_and_variance.h b/libbcachefs/mean_and_variance.h
similarity index 100%
rename from include/linux/mean_and_variance.h
rename to libbcachefs/mean_and_variance.h
diff --git a/libbcachefs/nocow_locking.c b/libbcachefs/nocow_locking.c
index 181efa4a..3c21981a 100644
--- a/libbcachefs/nocow_locking.c
+++ b/libbcachefs/nocow_locking.c
@@ -85,7 +85,7 @@ void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
 		u64 start_time = local_clock();
 
 		__closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags));
-		time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
+		bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
 	}
 }
 
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index b1ed0b9a..08ea0cfc 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -314,7 +314,7 @@ int bch2_opt_parse(struct bch_fs *c,
 		if (ret < 0 || (*res != 0 && *res != 1)) {
 			if (err)
 				prt_printf(err, "%s: must be bool", opt->attr.name);
-			return ret;
+			return ret < 0 ? ret : -BCH_ERR_option_not_bool;
 		}
 		break;
 	case BCH_OPT_UINT:
@@ -456,7 +456,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
 
 	copied_opts = kstrdup(options, GFP_KERNEL);
 	if (!copied_opts)
-		return -1;
+		return -ENOMEM;
 	copied_opts_start = copied_opts;
 
 	while ((opt = strsep(&copied_opts, ",")) != NULL) {
@@ -501,11 +501,11 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
 
 bad_opt:
 	pr_err("Bad mount option %s", name);
-	ret = -1;
+	ret = -BCH_ERR_option_name;
 	goto out;
 bad_val:
 	pr_err("Invalid mount option %s", err.buf);
-	ret = -1;
+	ret = -BCH_ERR_option_value;
 	goto out;
 out:
 	kfree(copied_opts_start);
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index f8c2341e..136083c1 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -290,6 +290,11 @@ enum fsck_err_opts {
 	  OPT_BOOL(),							\
 	  BCH2_NO_SB_OPT,		false,				\
 	  NULL,		"Allow mounting in when data will be missing")	\
+	x(no_splitbrain_check,		u8,				\
+	  OPT_FS|OPT_MOUNT,						\
+	  OPT_BOOL(),							\
+	  BCH2_NO_SB_OPT,		false,				\
+	  NULL,		"Don't kick drives out when splitbrain detected")\
 	x(discard,			u8,				\
 	  OPT_FS|OPT_MOUNT|OPT_DEVICE,					\
 	  OPT_BOOL(),							\
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 96e7a1ec..2af219ae 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -52,14 +52,47 @@ static bool btree_id_is_alloc(enum btree_id id)
 }
 
 /* for -o reconstruct_alloc: */
-static void drop_alloc_keys(struct journal_keys *keys)
+static void do_reconstruct_alloc(struct bch_fs *c)
 {
+	bch2_journal_log_msg(c, "dropping alloc info");
+	bch_info(c, "dropping and reconstructing all alloc info");
+
+	mutex_lock(&c->sb_lock);
+	struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
+	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required);
+	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required);
+	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required);
+	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required);
+
+	__set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
+	c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+
+	bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+
+	struct journal_keys *keys = &c->journal_keys;
 	size_t src, dst;
 
 	for (src = 0, dst = 0; src < keys->nr; src++)
 		if (!btree_id_is_alloc(keys->data[src].btree_id))
 			keys->data[dst++] = keys->data[src];
-
 	keys->nr = dst;
 }
 
@@ -122,6 +155,17 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
 	if (ret)
 		goto out;
 
+	struct btree_path *path = btree_iter_path(trans, &iter);
+	if (unlikely(!btree_path_node(path, k->level))) {
+		bch2_trans_iter_exit(trans, &iter);
+		bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+					  BTREE_MAX_DEPTH, 0, iter_flags);
+		ret =   bch2_btree_iter_traverse(&iter) ?:
+			bch2_btree_increase_depth(trans, iter.path, 0) ?:
+			-BCH_ERR_transaction_restart_nested;
+		goto out;
+	}
+
 	/* Must be checked with btree locked: */
 	if (k->overwritten)
 		goto out;
@@ -355,7 +399,7 @@ static int journal_replay_early(struct bch_fs *c,
 		genradix_for_each(&c->journal_entries, iter, _i) {
 			i = *_i;
 
-			if (!i || i->ignore)
+			if (journal_replay_ignore(i))
 				continue;
 
 			vstruct_for_each(&i->j, entry) {
@@ -384,11 +428,8 @@ static int read_btree_roots(struct bch_fs *c)
 		if (!r->alive)
 			continue;
 
-		if (btree_id_is_alloc(i) &&
-		    c->opts.reconstruct_alloc) {
-			c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+		if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc)
 			continue;
-		}
 
 		if (r->error) {
 			__fsck_err(c,
@@ -857,7 +898,7 @@ int bch2_fs_recovery(struct bch_fs *c)
 			goto out;
 
 		genradix_for_each_reverse(&c->journal_entries, iter, i)
-			if (*i && !(*i)->ignore) {
+			if (!journal_replay_ignore(*i)) {
 				last_journal_entry = &(*i)->j;
 				break;
 			}
@@ -882,7 +923,8 @@ int bch2_fs_recovery(struct bch_fs *c)
 			genradix_for_each_reverse(&c->journal_entries, iter, i)
 				if (*i) {
 					last_journal_entry = &(*i)->j;
-					(*i)->ignore = false;
+					(*i)->ignore_blacklisted = false;
+					(*i)->ignore_not_dirty= false;
 					/*
 					 * This was probably a NO_FLUSH entry,
 					 * so last_seq was garbage - but we know
@@ -918,10 +960,8 @@ use_clean:
 	c->journal_replay_seq_start	= last_seq;
 	c->journal_replay_seq_end	= blacklist_seq - 1;
 
-	if (c->opts.reconstruct_alloc) {
-		c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
-		drop_alloc_keys(&c->journal_keys);
-	}
+	if (c->opts.reconstruct_alloc)
+		do_reconstruct_alloc(c);
 
 	zero_out_btree_mem_ptr(&c->journal_keys);
 
@@ -945,7 +985,7 @@ use_clean:
 			bch2_journal_seq_blacklist_add(c,
 					blacklist_seq, journal_seq);
 		if (ret) {
-			bch_err(c, "error creating new journal seq blacklist entry");
+			bch_err_msg(c, ret, "error creating new journal seq blacklist entry");
 			goto err;
 		}
 	}
@@ -956,9 +996,6 @@ use_clean:
 	if (ret)
 		goto err;
 
-	if (c->opts.reconstruct_alloc)
-		bch2_journal_log_msg(c, "dropping alloc info");
-
 	/*
 	 * Skip past versions that might have possibly been used (as nonces),
 	 * but hadn't had their pointers written:
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
index 678b9c20..cc2672c1 100644
--- a/libbcachefs/replicas.c
+++ b/libbcachefs/replicas.c
@@ -6,15 +6,12 @@
 #include "replicas.h"
 #include "super-io.h"
 
-#include <linux/sort.h>
-
 static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
 					    struct bch_replicas_cpu *);
 
 /* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
-static int bch2_memcmp(const void *l, const void *r,  const void *priv)
+static int bch2_memcmp(const void *l, const void *r, size_t size)
 {
-	size_t size = (size_t) priv;
 	return memcmp(l, r, size);
 }
 
@@ -42,8 +39,7 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
 
 static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
 {
-	eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
-			  bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
+	eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL);
 }
 
 static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
@@ -232,7 +228,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
 
 	verify_replicas_entry(search);
 
-#define entry_cmp(_l, _r)	memcmp(_l, _r, entry_size)
+#define entry_cmp(_l, _r, size)	memcmp(_l, _r, entry_size)
 	idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
 			      entry_cmp, search);
 #undef entry_cmp
@@ -828,11 +824,10 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
 {
 	unsigned i;
 
-	sort_r(cpu_r->entries,
-	       cpu_r->nr,
-	       cpu_r->entry_size,
-	       bch2_memcmp, NULL,
-	       (void *)(size_t)cpu_r->entry_size);
+	sort_cmp_size(cpu_r->entries,
+		      cpu_r->nr,
+		      cpu_r->entry_size,
+		      bch2_memcmp, NULL);
 
 	for (i = 0; i < cpu_r->nr; i++) {
 		struct bch_replicas_entry_v1 *e =
diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h
index 983cce78..654a4b26 100644
--- a/libbcachefs/replicas.h
+++ b/libbcachefs/replicas.h
@@ -3,10 +3,9 @@
 #define _BCACHEFS_REPLICAS_H
 
 #include "bkey.h"
+#include "eytzinger.h"
 #include "replicas_types.h"
 
-#include <linux/eytzinger.h>
-
 void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
 void bch2_replicas_entry_to_text(struct printbuf *,
 				 struct bch_replicas_entry_v1 *);
diff --git a/libbcachefs/sb-downgrade.c b/libbcachefs/sb-downgrade.c
index 3337419f..e4396cb0 100644
--- a/libbcachefs/sb-downgrade.c
+++ b/libbcachefs/sb-downgrade.c
@@ -6,13 +6,12 @@
  */
 
 #include "bcachefs.h"
+#include "darray.h"
 #include "recovery.h"
 #include "sb-downgrade.h"
 #include "sb-errors.h"
 #include "super-io.h"
 
-#include <linux/darray.h>
-
 #define RECOVERY_PASS_ALL_FSCK		BIT_ULL(63)
 
 /*
@@ -260,7 +259,7 @@ void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_mi
 				if (e < BCH_SB_ERR_MAX)
 					__set_bit(e, c->sb.errors_silent);
 				if (e < sizeof(ext->errors_silent) * 8)
-					ext->errors_silent[e / 64] |= cpu_to_le64(BIT_ULL(e % 64));
+					__set_bit_le64(e, ext->errors_silent);
 			}
 		}
 	}
diff --git a/libbcachefs/sb-errors_types.h b/libbcachefs/sb-errors_types.h
index 0df4b0e7..5178bf57 100644
--- a/libbcachefs/sb-errors_types.h
+++ b/libbcachefs/sb-errors_types.h
@@ -2,7 +2,7 @@
 #ifndef _BCACHEFS_SB_ERRORS_TYPES_H
 #define _BCACHEFS_SB_ERRORS_TYPES_H
 
-#include <linux/darray_types.h>
+#include "darray.h"
 
 #define BCH_SB_ERRS()							\
 	x(clean_but_journal_not_empty,				0)	\
@@ -264,7 +264,8 @@
 	x(subvol_children_not_set,				256)	\
 	x(subvol_children_bad,					257)	\
 	x(subvol_loop,						258)	\
-	x(subvol_unreachable,					259)
+	x(subvol_unreachable,					259)	\
+	x(btree_node_bkey_bad_u64s,				260)
 
 enum bch_sb_error_id {
 #define x(t, n) BCH_FSCK_ERR_##t = n,
diff --git a/libbcachefs/sb-members.h b/libbcachefs/sb-members.h
index e4d4d842..be0a9418 100644
--- a/libbcachefs/sb-members.h
+++ b/libbcachefs/sb-members.h
@@ -2,7 +2,7 @@
 #ifndef _BCACHEFS_SB_MEMBERS_H
 #define _BCACHEFS_SB_MEMBERS_H
 
-#include <linux/darray.h>
+#include "darray.h"
 
 extern char * const bch2_member_error_strs[];
 
diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h
index 4045a180..903c0516 100644
--- a/libbcachefs/subvolume.h
+++ b/libbcachefs/subvolume.h
@@ -2,6 +2,7 @@
 #ifndef _BCACHEFS_SUBVOLUME_H
 #define _BCACHEFS_SUBVOLUME_H
 
+#include "darray.h"
 #include "subvolume_types.h"
 
 enum bkey_invalid_flags;
diff --git a/libbcachefs/subvolume_types.h b/libbcachefs/subvolume_types.h
index 40f16e3a..ae644adf 100644
--- a/libbcachefs/subvolume_types.h
+++ b/libbcachefs/subvolume_types.h
@@ -2,7 +2,7 @@
 #ifndef _BCACHEFS_SUBVOLUME_TYPES_H
 #define _BCACHEFS_SUBVOLUME_TYPES_H
 
-#include <linux/darray_types.h>
+#include "darray.h"
 
 typedef DARRAY(u32) snapshot_id_list;
 
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index 38a50732..010daebf 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -470,6 +470,14 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
 			return ret;
 	}
 
+	if (rw == WRITE &&
+	    bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) {
+		prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu",
+			   le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq),
+			   le64_to_cpu(sb->seq));
+		return -BCH_ERR_invalid_sb_members_missing;
+	}
+
 	return 0;
 }
 
diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h
index f3762091..95e80e06 100644
--- a/libbcachefs/super-io.h
+++ b/libbcachefs/super-io.h
@@ -3,12 +3,12 @@
 #define _BCACHEFS_SUPER_IO_H
 
 #include "extents.h"
+#include "eytzinger.h"
 #include "super_types.h"
 #include "super.h"
 #include "sb-members.h"
 
 #include <asm/byteorder.h>
-#include <linux/eytzinger.h>
 
 static inline bool bch2_version_compatible(u16 version)
 {
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index a7f9de22..1cabdd47 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -56,6 +56,7 @@
 #include "super.h"
 #include "super-io.h"
 #include "sysfs.h"
+#include "thread_with_file.h"
 #include "trace.h"
 
 #include <linux/backing-dev.h>
@@ -67,7 +68,6 @@
 #include <linux/percpu.h>
 #include <linux/random.h>
 #include <linux/sysfs.h>
-#include <linux/thread_with_file.h>
 #include <crypto/hash.h>
 
 MODULE_LICENSE("GPL");
@@ -87,20 +87,27 @@ const char * const bch2_fs_flag_strs[] = {
 	NULL
 };
 
+static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args)
+{
+#ifdef __KERNEL__
+	if (unlikely(stdio)) {
+		if (fmt[0] == KERN_SOH[0])
+			fmt += 2;
+
+		bch2_stdio_redirect_vprintf(stdio, true, fmt, args);
+		return;
+	}
+#endif
+	vprintk(fmt, args);
+}
+
 void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...)
 {
 	struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio;
 
 	va_list args;
 	va_start(args, fmt);
-	if (likely(!stdio)) {
-		vprintk(fmt, args);
-	} else {
-		if (fmt[0] == KERN_SOH[0])
-			fmt += 2;
-
-		stdio_redirect_vprintf(stdio, true, fmt, args);
-	}
+	bch2_print_maybe_redirect(stdio, fmt, args);
 	va_end(args);
 }
 
@@ -110,14 +117,7 @@ void __bch2_print(struct bch_fs *c, const char *fmt, ...)
 
 	va_list args;
 	va_start(args, fmt);
-	if (likely(!stdio)) {
-		vprintk(fmt, args);
-	} else {
-		if (fmt[0] == KERN_SOH[0])
-			fmt += 2;
-
-		stdio_redirect_vprintf(stdio, true, fmt, args);
-	}
+	bch2_print_maybe_redirect(stdio, fmt, args);
 	va_end(args);
 }
 
@@ -532,7 +532,7 @@ static void __bch2_fs_free(struct bch_fs *c)
 	unsigned i;
 
 	for (i = 0; i < BCH_TIME_STAT_NR; i++)
-		time_stats_exit(&c->times[i]);
+		bch2_time_stats_exit(&c->times[i]);
 
 	bch2_free_pending_node_rewrites(c);
 	bch2_fs_sb_errors_exit(c);
@@ -765,7 +765,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	c->journal_keys.initial_ref_held = true;
 
 	for (i = 0; i < BCH_TIME_STAT_NR; i++)
-		time_stats_init(&c->times[i]);
+		bch2_time_stats_init(&c->times[i]);
 
 	bch2_fs_copygc_init(c);
 	bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
@@ -830,13 +830,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 		goto err;
 
 	pr_uuid(&name, c->sb.user_uuid.b);
-	strscpy(c->name, name.buf, sizeof(c->name));
-	printbuf_exit(&name);
-
 	ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
 	if (ret)
 		goto err;
 
+	strscpy(c->name, name.buf, sizeof(c->name));
+	printbuf_exit(&name);
+
 	/* Compat: */
 	if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
 	    !BCH_SB_JOURNAL_FLUSH_DELAY(sb))
@@ -1073,7 +1073,8 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
 }
 
 static int bch2_dev_in_fs(struct bch_sb_handle *fs,
-			  struct bch_sb_handle *sb)
+			  struct bch_sb_handle *sb,
+			  struct bch_opts *opts)
 {
 	if (fs == sb)
 		return 0;
@@ -1114,11 +1115,14 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
 		bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));;
 		prt_newline(&buf);
 
-		prt_printf(&buf, "Not using older sb");
+		if (!opts->no_splitbrain_check)
+			prt_printf(&buf, "Not using older sb");
 
 		pr_err("%s", buf.buf);
 		printbuf_exit(&buf);
-		return -BCH_ERR_device_splitbrain;
+
+		if (!opts->no_splitbrain_check)
+			return -BCH_ERR_device_splitbrain;
 	}
 
 	struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);
@@ -1141,12 +1145,17 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
 		prt_printf(&buf, " to be %llu, but ", seq_from_fs);
 		prt_bdevname(&buf, sb->bdev);
 		prt_printf(&buf, " has %llu\n", seq_from_member);
-		prt_str(&buf, "Not using ");
-		prt_bdevname(&buf, sb->bdev);
+
+		if (!opts->no_splitbrain_check) {
+			prt_str(&buf, "Not using ");
+			prt_bdevname(&buf, sb->bdev);
+		}
 
 		pr_err("%s", buf.buf);
 		printbuf_exit(&buf);
-		return -BCH_ERR_device_splitbrain;
+
+		if (!opts->no_splitbrain_check)
+			return -BCH_ERR_device_splitbrain;
 	}
 
 	return 0;
@@ -1180,8 +1189,8 @@ static void bch2_dev_free(struct bch_dev *ca)
 	bch2_dev_buckets_free(ca);
 	free_page((unsigned long) ca->sb_read_scratch);
 
-	time_stats_quantiles_exit(&ca->io_latency[WRITE]);
-	time_stats_quantiles_exit(&ca->io_latency[READ]);
+	bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
+	bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);
 
 	percpu_ref_exit(&ca->io_ref);
 	percpu_ref_exit(&ca->ref);
@@ -1272,8 +1281,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
 
 	INIT_WORK(&ca->io_error_work, bch2_io_error_work);
 
-	time_stats_quantiles_init(&ca->io_latency[READ]);
-	time_stats_quantiles_init(&ca->io_latency[WRITE]);
+	bch2_time_stats_quantiles_init(&ca->io_latency[READ]);
+	bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]);
 
 	ca->mi = bch2_mi_to_cpu(member);
 
@@ -1847,7 +1856,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
 
 	dev_idx = sb.sb->dev_idx;
 
-	ret = bch2_dev_in_fs(&c->disk_sb, &sb);
+	ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts);
 	bch_err_msg(c, ret, "bringing %s online", path);
 	if (ret)
 		goto err;
@@ -2035,7 +2044,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
 			best = sb;
 
 	darray_for_each_reverse(sbs, sb) {
-		ret = bch2_dev_in_fs(best, sb);
+		ret = bch2_dev_in_fs(best, sb, &opts);
 
 		if (ret == -BCH_ERR_device_has_been_removed ||
 		    ret == -BCH_ERR_device_splitbrain) {
diff --git a/libbcachefs/thread_with_file.c b/libbcachefs/thread_with_file.c
new file mode 100644
index 00000000..940db15d
--- /dev/null
+++ b/libbcachefs/thread_with_file.c
@@ -0,0 +1,450 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "thread_with_file.h"
+
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+#include <linux/poll.h>
+#include <linux/sched/sysctl.h>
+
+void bch2_thread_with_file_exit(struct thread_with_file *thr)
+{
+	if (thr->task) {
+		kthread_stop(thr->task);
+		put_task_struct(thr->task);
+	}
+}
+
+int bch2_run_thread_with_file(struct thread_with_file *thr,
+			      const struct file_operations *fops,
+			      int (*fn)(void *))
+{
+	struct file *file = NULL;
+	int ret, fd = -1;
+	unsigned fd_flags = O_CLOEXEC;
+
+	if (fops->read && fops->write)
+		fd_flags |= O_RDWR;
+	else if (fops->read)
+		fd_flags |= O_RDONLY;
+	else if (fops->write)
+		fd_flags |= O_WRONLY;
+
+	char name[TASK_COMM_LEN];
+	get_task_comm(name, current);
+
+	thr->ret = 0;
+	thr->task = kthread_create(fn, thr, "%s", name);
+	ret = PTR_ERR_OR_ZERO(thr->task);
+	if (ret)
+		return ret;
+
+	ret = get_unused_fd_flags(fd_flags);
+	if (ret < 0)
+		goto err;
+	fd = ret;
+
+	file = anon_inode_getfile(name, fops, thr, fd_flags);
+	ret = PTR_ERR_OR_ZERO(file);
+	if (ret)
+		goto err;
+
+	get_task_struct(thr->task);
+	wake_up_process(thr->task);
+	fd_install(fd, file);
+	return fd;
+err:
+	if (fd >= 0)
+		put_unused_fd(fd);
+	if (thr->task)
+		kthread_stop(thr->task);
+	return ret;
+}
+
+/* stdio_redirect */
+
+static bool stdio_redirect_has_input(struct stdio_redirect *stdio)
+{
+	return stdio->input.buf.nr || stdio->done;
+}
+
+static bool stdio_redirect_has_output(struct stdio_redirect *stdio)
+{
+	return stdio->output.buf.nr || stdio->done;
+}
+
+#define STDIO_REDIRECT_BUFSIZE		4096
+
+static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio)
+{
+	return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
+}
+
+static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio)
+{
+	return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
+}
+
+static void stdio_buf_init(struct stdio_buf *buf)
+{
+	spin_lock_init(&buf->lock);
+	init_waitqueue_head(&buf->wait);
+	darray_init(&buf->buf);
+}
+
+/* thread_with_stdio */
+
+static void thread_with_stdio_done(struct thread_with_stdio *thr)
+{
+	thr->thr.done = true;
+	thr->stdio.done = true;
+	wake_up(&thr->stdio.input.wait);
+	wake_up(&thr->stdio.output.wait);
+}
+
+static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf,
+				      size_t len, loff_t *ppos)
+{
+	struct thread_with_stdio *thr =
+		container_of(file->private_data, struct thread_with_stdio, thr);
+	struct stdio_buf *buf = &thr->stdio.output;
+	size_t copied = 0, b;
+	int ret = 0;
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		ret = wait_event_interruptible(buf->wait, stdio_redirect_has_output(&thr->stdio));
+		if (ret)
+			return ret;
+	} else if (!stdio_redirect_has_output(&thr->stdio))
+		return -EAGAIN;
+
+	while (len && buf->buf.nr) {
+		if (fault_in_writeable(ubuf, len) == len) {
+			ret = -EFAULT;
+			break;
+		}
+
+		spin_lock_irq(&buf->lock);
+		b = min_t(size_t, len, buf->buf.nr);
+
+		if (b && !copy_to_user_nofault(ubuf, buf->buf.data, b)) {
+			ubuf	+= b;
+			len	-= b;
+			copied	+= b;
+			buf->buf.nr -= b;
+			memmove(buf->buf.data,
+				buf->buf.data + b,
+				buf->buf.nr);
+		}
+		spin_unlock_irq(&buf->lock);
+	}
+
+	return copied ?: ret;
+}
+
+static int thread_with_stdio_release(struct inode *inode, struct file *file)
+{
+	struct thread_with_stdio *thr =
+		container_of(file->private_data, struct thread_with_stdio, thr);
+
+	thread_with_stdio_done(thr);
+	bch2_thread_with_file_exit(&thr->thr);
+	darray_exit(&thr->stdio.input.buf);
+	darray_exit(&thr->stdio.output.buf);
+	thr->ops->exit(thr);
+	return 0;
+}
+
+static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf,
+				       size_t len, loff_t *ppos)
+{
+	struct thread_with_stdio *thr =
+		container_of(file->private_data, struct thread_with_stdio, thr);
+	struct stdio_buf *buf = &thr->stdio.input;
+	size_t copied = 0;
+	ssize_t ret = 0;
+
+	while (len) {
+		if (thr->thr.done) {
+			ret = -EPIPE;
+			break;
+		}
+
+		size_t b = len - fault_in_readable(ubuf, len);
+		if (!b) {
+			ret = -EFAULT;
+			break;
+		}
+
+		spin_lock(&buf->lock);
+		if (buf->buf.nr < STDIO_REDIRECT_BUFSIZE)
+			darray_make_room_gfp(&buf->buf,
+				min(b, STDIO_REDIRECT_BUFSIZE - buf->buf.nr), GFP_NOWAIT);
+		b = min(len, darray_room(buf->buf));
+
+		if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) {
+			buf->buf.nr += b;
+			ubuf	+= b;
+			len	-= b;
+			copied	+= b;
+		}
+		spin_unlock(&buf->lock);
+
+		if (b) {
+			wake_up(&buf->wait);
+		} else {
+			if ((file->f_flags & O_NONBLOCK)) {
+				ret = -EAGAIN;
+				break;
+			}
+
+			ret = wait_event_interruptible(buf->wait,
+					stdio_redirect_has_input_space(&thr->stdio));
+			if (ret)
+				break;
+		}
+	}
+
+	return copied ?: ret;
+}
+
+static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_struct *wait)
+{
+	struct thread_with_stdio *thr =
+		container_of(file->private_data, struct thread_with_stdio, thr);
+
+	poll_wait(file, &thr->stdio.output.wait, wait);
+	poll_wait(file, &thr->stdio.input.wait, wait);
+
+	__poll_t mask = 0;
+
+	if (stdio_redirect_has_output(&thr->stdio))
+		mask |= EPOLLIN;
+	if (stdio_redirect_has_input_space(&thr->stdio))
+		mask |= EPOLLOUT;
+	if (thr->thr.done)
+		mask |= EPOLLHUP|EPOLLERR;
+	return mask;
+}
+
+static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_struct *wait)
+{
+	struct thread_with_stdio *thr =
+		container_of(file->private_data, struct thread_with_stdio, thr);
+
+	poll_wait(file, &thr->stdio.output.wait, wait);
+
+	__poll_t mask = 0;
+
+	if (stdio_redirect_has_output(&thr->stdio))
+		mask |= EPOLLIN;
+	if (thr->thr.done)
+		mask |= EPOLLHUP|EPOLLERR;
+	return mask;
+}
+
+static int thread_with_stdio_flush(struct file *file, fl_owner_t id)
+{
+	struct thread_with_stdio *thr =
+		container_of(file->private_data, struct thread_with_stdio, thr);
+
+	return thr->thr.ret;
+}
+
+static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p)
+{
+	struct thread_with_stdio *thr =
+		container_of(file->private_data, struct thread_with_stdio, thr);
+
+	if (thr->ops->unlocked_ioctl)
+		return thr->ops->unlocked_ioctl(thr, cmd, p);
+	return -ENOTTY;
+}
+
+static const struct file_operations thread_with_stdio_fops = {
+	.llseek		= no_llseek,
+	.read		= thread_with_stdio_read,
+	.write		= thread_with_stdio_write,
+	.poll		= thread_with_stdio_poll,
+	.flush		= thread_with_stdio_flush,
+	.release	= thread_with_stdio_release,
+	.unlocked_ioctl	= thread_with_stdio_ioctl,
+};
+
+static const struct file_operations thread_with_stdout_fops = {
+	.llseek		= no_llseek,
+	.read		= thread_with_stdio_read,
+	.poll		= thread_with_stdout_poll,
+	.flush		= thread_with_stdio_flush,
+	.release	= thread_with_stdio_release,
+	.unlocked_ioctl	= thread_with_stdio_ioctl,
+};
+
+static int thread_with_stdio_fn(void *arg)
+{
+	struct thread_with_stdio *thr = arg;
+
+	thr->thr.ret = thr->ops->fn(thr);
+
+	thread_with_stdio_done(thr);
+	return 0;
+}
+
+int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
+			       const struct thread_with_stdio_ops *ops)
+{
+	stdio_buf_init(&thr->stdio.input);
+	stdio_buf_init(&thr->stdio.output);
+	thr->ops = ops;
+
+	return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn);
+}
+
+int bch2_run_thread_with_stdout(struct thread_with_stdio *thr,
+				const struct thread_with_stdio_ops *ops)
+{
+	stdio_buf_init(&thr->stdio.input);
+	stdio_buf_init(&thr->stdio.output);
+	thr->ops = ops;
+
+	return bch2_run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn);
+}
+EXPORT_SYMBOL_GPL(bch2_run_thread_with_stdout);
+
+int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len)
+{
+	struct stdio_buf *buf = &stdio->input;
+
+	/*
+	 * we're waiting on user input (or for the file descriptor to be
+	 * closed), don't want a hung task warning:
+	 */
+	do {
+		wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio),
+				   sysctl_hung_task_timeout_secs * HZ / 2);
+	} while (!stdio_redirect_has_input(stdio));
+
+	if (stdio->done)
+		return -1;
+
+	spin_lock(&buf->lock);
+	int ret = min(len, buf->buf.nr);
+	buf->buf.nr -= ret;
+	memcpy(ubuf, buf->buf.data, ret);
+	memmove(buf->buf.data,
+		buf->buf.data + ret,
+		buf->buf.nr);
+	spin_unlock(&buf->lock);
+
+	wake_up(&buf->wait);
+	return ret;
+}
+
+int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *ubuf, size_t len)
+{
+	struct stdio_buf *buf = &stdio->input;
+	size_t copied = 0;
+	ssize_t ret = 0;
+again:
+	do {
+		wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio),
+				   sysctl_hung_task_timeout_secs * HZ / 2);
+	} while (!stdio_redirect_has_input(stdio));
+
+	if (stdio->done) {
+		ret = -1;
+		goto out;
+	}
+
+	spin_lock(&buf->lock);
+	size_t b = min(len, buf->buf.nr);
+	char *n = memchr(buf->buf.data, '\n', b);
+	if (n)
+		b = min_t(size_t, b, n + 1 - buf->buf.data);
+	buf->buf.nr -= b;
+	memcpy(ubuf, buf->buf.data, b);
+	memmove(buf->buf.data,
+		buf->buf.data + b,
+		buf->buf.nr);
+	ubuf += b;
+	len -= b;
+	copied += b;
+	spin_unlock(&buf->lock);
+
+	wake_up(&buf->wait);
+
+	if (!n && len)
+		goto again;
+out:
+	return copied ?: ret;
+}
+
+__printf(3, 0)
+static ssize_t bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args)
+{
+	ssize_t ret;
+
+	do {
+		va_list args2;
+		size_t len;
+
+		va_copy(args2, args);
+		len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2);
+		va_end(args2);
+
+		if (len + 1 <= darray_room(*out)) {
+			out->nr += len;
+			return len;
+		}
+
+		ret = darray_make_room_gfp(out, len + 1, gfp);
+	} while (ret == 0);
+
+	return ret;
+}
+
+ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking,
+				    const char *fmt, va_list args)
+{
+	struct stdio_buf *buf = &stdio->output;
+	unsigned long flags;
+	ssize_t ret;
+
+again:
+	spin_lock_irqsave(&buf->lock, flags);
+	ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args);
+	spin_unlock_irqrestore(&buf->lock, flags);
+
+	if (ret < 0) {
+		if (nonblocking)
+			return -EAGAIN;
+
+		ret = wait_event_interruptible(buf->wait,
+				stdio_redirect_has_output_space(stdio));
+		if (ret)
+			return ret;
+		goto again;
+	}
+
+	wake_up(&buf->wait);
+	return ret;
+}
+
+ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking,
+				const char *fmt, ...)
+{
+	va_list args;
+	ssize_t ret;
+
+	va_start(args, fmt);
+	ret = bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args);
+	va_end(args);
+
+	return ret;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/libbcachefs/thread_with_file.h b/libbcachefs/thread_with_file.h
new file mode 100644
index 00000000..af54ea8f
--- /dev/null
+++ b/libbcachefs/thread_with_file.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_THREAD_WITH_FILE_H
+#define _BCACHEFS_THREAD_WITH_FILE_H
+
+#include "thread_with_file_types.h"
+
+/*
+ * Thread with file: Run a kthread and connect it to a file descriptor, so that
+ * it can be interacted with via fd read/write methods and closing the file
+ * descriptor stops the kthread.
+ *
+ * We have two different APIs:
+ *
+ * thread_with_file, the low level version.
+ * You get to define the full file_operations, including your release function,
+ * which means that you must call bch2_thread_with_file_exit() from your
+ * .release method
+ *
+ * thread_with_stdio, the higher level version
+ * This implements full piping of input and output, including .poll.
+ *
+ * Notes on behaviour:
+ *  - kthread shutdown behaves like writing or reading from a pipe that has been
+ *    closed
+ *  - Input and output buffers are 4096 bytes, although buffers may in some
+ *    situations slightly exceed that limit so as to avoid chopping off a
+ *    message in the middle in nonblocking mode.
+ *  - Input/output buffers are lazily allocated, with GFP_NOWAIT allocations -
+ *    should be fine but might change in future revisions.
+ *  - Output buffer may grow past 4096 bytes to deal with messages that are
+ *    bigger than 4096 bytes
+ *  - Writing may be done blocking or nonblocking; in nonblocking mode, we only
+ *    drop entire messages.
+ *
+ * To write, use stdio_redirect_printf()
+ * To read, use stdio_redirect_read() or stdio_redirect_readline()
+ */
+
+struct task_struct;
+
+struct thread_with_file {
+	struct task_struct	*task;
+	int			ret;
+	bool			done;
+};
+
+void bch2_thread_with_file_exit(struct thread_with_file *);
+int bch2_run_thread_with_file(struct thread_with_file *,
+			      const struct file_operations *,
+			      int (*fn)(void *));
+
+struct thread_with_stdio;
+
+struct thread_with_stdio_ops {
+	void (*exit)(struct thread_with_stdio *);
+	int (*fn)(struct thread_with_stdio *);
+	long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long);
+};
+
+struct thread_with_stdio {
+	struct thread_with_file	thr;
+	struct stdio_redirect	stdio;
+	const struct thread_with_stdio_ops	*ops;
+};
+
+int bch2_run_thread_with_stdio(struct thread_with_stdio *,
+			       const struct thread_with_stdio_ops *);
+int bch2_run_thread_with_stdout(struct thread_with_stdio *,
+				const struct thread_with_stdio_ops *);
+int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t);
+int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t);
+
+__printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list);
+__printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...);
+
+#endif /* _BCACHEFS_THREAD_WITH_FILE_H */
diff --git a/libbcachefs/thread_with_file_types.h b/libbcachefs/thread_with_file_types.h
new file mode 100644
index 00000000..e0daf4ee
--- /dev/null
+++ b/libbcachefs/thread_with_file_types.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H
+#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H
+
+#include "darray.h"
+
+struct stdio_buf {
+	spinlock_t		lock;
+	wait_queue_head_t	wait;
+	darray_char		buf;
+};
+
+struct stdio_redirect {
+	struct stdio_buf	input;
+	struct stdio_buf	output;
+
+	spinlock_t		input_lock;
+	wait_queue_head_t	input_wait;
+	darray_char		input_buf;
+	bool			done;
+};
+
+#endif /* _BCACHEFS_THREAD_WITH_FILE_TYPES_H */
diff --git a/libbcachefs/time_stats.c b/libbcachefs/time_stats.c
new file mode 100644
index 00000000..4508e9dc
--- /dev/null
+++ b/libbcachefs/time_stats.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+
+#include "eytzinger.h"
+#include "time_stats.h"
+
+static const struct time_unit time_units[] = {
+	{ "ns",		1		 },
+	{ "us",		NSEC_PER_USEC	 },
+	{ "ms",		NSEC_PER_MSEC	 },
+	{ "s",		NSEC_PER_SEC	 },
+	{ "m",          (u64) NSEC_PER_SEC * 60},
+	{ "h",          (u64) NSEC_PER_SEC * 3600},
+	{ "d",          (u64) NSEC_PER_SEC * 3600 * 24},
+	{ "w",          (u64) NSEC_PER_SEC * 3600 * 24 * 7},
+	{ "y",          (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */
+	{ "eon",        U64_MAX          },
+};
+
+const struct time_unit *bch2_pick_time_units(u64 ns)
+{
+	const struct time_unit *u;
+
+	for (u = time_units;
+	     u + 1 < time_units + ARRAY_SIZE(time_units) &&
+	     ns >= u[1].nsecs << 1;
+	     u++)
+		;
+
+	return u;
+}
+
+static void quantiles_update(struct quantiles *q, u64 v)
+{
+	unsigned i = 0;
+
+	while (i < ARRAY_SIZE(q->entries)) {
+		struct quantile_entry *e = q->entries + i;
+
+		if (unlikely(!e->step)) {
+			e->m = v;
+			e->step = max_t(unsigned, v / 2, 1024);
+		} else if (e->m > v) {
+			e->m = e->m >= e->step
+				? e->m - e->step
+				: 0;
+		} else if (e->m < v) {
+			e->m = e->m + e->step > e->m
+				? e->m + e->step
+				: U32_MAX;
+		}
+
+		if ((e->m > v ? e->m - v : v - e->m) < e->step)
+			e->step = max_t(unsigned, e->step / 2, 1);
+
+		if (v >= e->m)
+			break;
+
+		i = eytzinger0_child(i, v > e->m);
+	}
+}
+
+static inline void time_stats_update_one(struct bch2_time_stats *stats,
+					      u64 start, u64 end)
+{
+	u64 duration, freq;
+	bool initted = stats->last_event != 0;
+
+	if (time_after64(end, start)) {
+		struct quantiles *quantiles = time_stats_to_quantiles(stats);
+
+		duration = end - start;
+		mean_and_variance_update(&stats->duration_stats, duration);
+		mean_and_variance_weighted_update(&stats->duration_stats_weighted,
+				duration, initted, TIME_STATS_MV_WEIGHT);
+		stats->max_duration = max(stats->max_duration, duration);
+		stats->min_duration = min(stats->min_duration, duration);
+		stats->total_duration += duration;
+
+		if (quantiles)
+			quantiles_update(quantiles, duration);
+	}
+
+	if (stats->last_event && time_after64(end, stats->last_event)) {
+		freq = end - stats->last_event;
+		mean_and_variance_update(&stats->freq_stats, freq);
+		mean_and_variance_weighted_update(&stats->freq_stats_weighted,
+				freq, initted, TIME_STATS_MV_WEIGHT);
+		stats->max_freq = max(stats->max_freq, freq);
+		stats->min_freq = min(stats->min_freq, freq);
+	}
+
+	stats->last_event = end;
+}
+
+void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
+				    struct time_stat_buffer *b)
+{
+	for (struct time_stat_buffer_entry *i = b->entries;
+	     i < b->entries + ARRAY_SIZE(b->entries);
+	     i++)
+		time_stats_update_one(stats, i->start, i->end);
+	b->nr = 0;
+}
+
+static noinline void time_stats_clear_buffer(struct bch2_time_stats *stats,
+					     struct time_stat_buffer *b)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&stats->lock, flags);
+	__bch2_time_stats_clear_buffer(stats, b);
+	spin_unlock_irqrestore(&stats->lock, flags);
+}
+
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
+{
+	unsigned long flags;
+
+	if (!stats->buffer) {
+		spin_lock_irqsave(&stats->lock, flags);
+		time_stats_update_one(stats, start, end);
+
+		if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
+		    stats->duration_stats.n > 1024)
+			stats->buffer =
+				alloc_percpu_gfp(struct time_stat_buffer,
+						 GFP_ATOMIC);
+		spin_unlock_irqrestore(&stats->lock, flags);
+	} else {
+		struct time_stat_buffer *b;
+
+		preempt_disable();
+		b = this_cpu_ptr(stats->buffer);
+
+		BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
+		b->entries[b->nr++] = (struct time_stat_buffer_entry) {
+			.start = start,
+			.end = end
+		};
+
+		if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
+			time_stats_clear_buffer(stats, b);
+		preempt_enable();
+	}
+}
+
+void bch2_time_stats_exit(struct bch2_time_stats *stats)
+{
+	free_percpu(stats->buffer);
+}
+
+void bch2_time_stats_init(struct bch2_time_stats *stats)
+{
+	memset(stats, 0, sizeof(*stats));
+	stats->min_duration = U64_MAX;
+	stats->min_freq = U64_MAX;
+	spin_lock_init(&stats->lock);
+}
diff --git a/include/linux/time_stats.h b/libbcachefs/time_stats.h
similarity index 63%
rename from include/linux/time_stats.h
rename to libbcachefs/time_stats.h
index 6df2b34a..5df61403 100644
--- a/include/linux/time_stats.h
+++ b/libbcachefs/time_stats.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
- * time_stats - collect statistics on events that have a duration, with nicely
+ * bch2_time_stats - collect statistics on events that have a duration, with nicely
  * formatted textual output on demand
  *
  * - percpu buffering of event collection: cheap enough to shotgun
@@ -21,14 +21,15 @@
  *
  * Particularly useful for tracking down latency issues.
  */
-#ifndef _LINUX_TIME_STATS_H
-#define _LINUX_TIME_STATS_H
+#ifndef _BCACHEFS_TIME_STATS_H
+#define _BCACHEFS_TIME_STATS_H
 
-#include <linux/mean_and_variance.h>
 #include <linux/sched/clock.h>
 #include <linux/spinlock_types.h>
 #include <linux/string.h>
 
+#include "mean_and_variance.h"
+
 struct time_unit {
 	const char	*name;
 	u64		nsecs;
@@ -37,12 +38,12 @@ struct time_unit {
 /*
  * given a nanosecond value, pick the preferred time units for printing:
  */
-const struct time_unit *pick_time_units(u64 ns);
+const struct time_unit *bch2_pick_time_units(u64 ns);
 
 /*
  * quantiles - do not use:
  *
- * Only enabled if time_stats->quantiles_enabled has been manually set - don't
+ * Only enabled if bch2_time_stats->quantiles_enabled has been manually set - don't
  * use in new code.
  */
 
@@ -66,7 +67,7 @@ struct time_stat_buffer {
 	}		entries[31];
 };
 
-struct time_stats {
+struct bch2_time_stats {
 	spinlock_t	lock;
 	bool		have_quantiles;
 	/* all fields are in nanoseconds */
@@ -87,52 +88,50 @@ struct time_stats {
 	struct mean_and_variance_weighted duration_stats_weighted;
 	struct mean_and_variance_weighted freq_stats_weighted;
 	struct time_stat_buffer __percpu *buffer;
-
-	u64		start_time;
 };
 
-struct time_stats_quantiles {
-	struct time_stats	stats;
+struct bch2_time_stats_quantiles {
+	struct bch2_time_stats	stats;
 	struct quantiles	quantiles;
 };
 
-static inline struct quantiles *time_stats_to_quantiles(struct time_stats *stats)
+static inline struct quantiles *time_stats_to_quantiles(struct bch2_time_stats *stats)
 {
 	return stats->have_quantiles
-		? &container_of(stats, struct time_stats_quantiles, stats)->quantiles
+		? &container_of(stats, struct bch2_time_stats_quantiles, stats)->quantiles
 		: NULL;
 }
 
-void __time_stats_clear_buffer(struct time_stats *, struct time_stat_buffer *);
-void __time_stats_update(struct time_stats *stats, u64, u64);
+void __bch2_time_stats_clear_buffer(struct bch2_time_stats *, struct time_stat_buffer *);
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
 
 /**
  * time_stats_update - collect a new event being tracked
  *
- * @stats	- time_stats to update
+ * @stats	- bch2_time_stats to update
  * @start	- start time of event, recorded with local_clock()
  *
  * The end duration of the event will be the current time
  */
-static inline void time_stats_update(struct time_stats *stats, u64 start)
+static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
 {
-	__time_stats_update(stats, start, local_clock());
+	__bch2_time_stats_update(stats, start, local_clock());
 }
 
 /**
  * track_event_change - track state change events
  *
- * @stats	- time_stats to update
+ * @stats	- bch2_time_stats to update
  * @v		- new state, true or false
  *
  * Use this when tracking time stats for state changes, i.e. resource X becoming
  * blocked/unblocked.
  */
-static inline bool track_event_change(struct time_stats *stats, bool v)
+static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
 {
 	if (v != !!stats->last_event_start) {
 		if (!v) {
-			time_stats_update(stats, stats->last_event_start);
+			bch2_time_stats_update(stats, stats->last_event_start);
 			stats->last_event_start = 0;
 		} else {
 			stats->last_event_start = local_clock() ?: 1;
@@ -143,25 +142,18 @@ static inline bool track_event_change(struct time_stats *stats, bool v)
 	return false;
 }
 
-#define TIME_STATS_PRINT_NO_ZEROES	(1U << 0)	/* print nothing if zero count */
-struct seq_buf;
-void time_stats_to_seq_buf(struct seq_buf *, struct time_stats *,
-		const char *epoch_name, unsigned int flags);
-void time_stats_to_json(struct seq_buf *, struct time_stats *,
-		const char *epoch_name, unsigned int flags);
+void bch2_time_stats_exit(struct bch2_time_stats *);
+void bch2_time_stats_init(struct bch2_time_stats *);
 
-void time_stats_exit(struct time_stats *);
-void time_stats_init(struct time_stats *);
-
-static inline void time_stats_quantiles_exit(struct time_stats_quantiles *statq)
+static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq)
 {
-	time_stats_exit(&statq->stats);
+	bch2_time_stats_exit(&statq->stats);
 }
-static inline void time_stats_quantiles_init(struct time_stats_quantiles *statq)
+static inline void bch2_time_stats_quantiles_init(struct bch2_time_stats_quantiles *statq)
 {
-	time_stats_init(&statq->stats);
+	bch2_time_stats_init(&statq->stats);
 	statq->stats.have_quantiles = true;
 	memset(&statq->quantiles, 0, sizeof(statq->quantiles));
 }
 
-#endif /* _LINUX_TIME_STATS_H */
+#endif /* _BCACHEFS_TIME_STATS_H */
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 098ebbe4..216fadf1 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -11,7 +11,6 @@
 #include <linux/console.h>
 #include <linux/ctype.h>
 #include <linux/debugfs.h>
-#include <linux/eytzinger.h>
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/log2.h>
@@ -23,8 +22,9 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/sched/clock.h>
-#include <linux/mean_and_variance.h>
 
+#include "eytzinger.h"
+#include "mean_and_variance.h"
 #include "util.h"
 
 static const char si_units[] = "?kMGTPEZY";
@@ -339,14 +339,14 @@ void bch2_prt_datetime(struct printbuf *out, time64_t sec)
 
 void bch2_pr_time_units(struct printbuf *out, u64 ns)
 {
-	const struct time_unit *u = pick_time_units(ns);
+	const struct time_unit *u = bch2_pick_time_units(ns);
 
 	prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
 }
 
 static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
 {
-	const struct time_unit *u = pick_time_units(ns);
+	const struct time_unit *u = bch2_pick_time_units(ns);
 
 	prt_printf(out, "%llu ", div64_u64(ns, u->nsecs));
 	prt_tab_rjust(out);
@@ -363,7 +363,7 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64
 
 #define TABSTOP_SIZE 12
 
-void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats)
+void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
 {
 	struct quantiles *quantiles = time_stats_to_quantiles(stats);
 	s64 f_mean = 0, d_mean = 0;
@@ -374,7 +374,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats)
 
 		spin_lock_irq(&stats->lock);
 		for_each_possible_cpu(cpu)
-			__time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
+			__bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
 		spin_unlock_irq(&stats->lock);
 	}
 
@@ -469,7 +469,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats)
 	if (quantiles) {
 		int i = eytzinger0_first(NR_QUANTILES);
 		const struct time_unit *u =
-			pick_time_units(quantiles->entries[i].m);
+			bch2_pick_time_units(quantiles->entries[i].m);
 		u64 last_q = 0;
 
 		prt_printf(out, "quantiles (%s):\t", u->name);
@@ -707,6 +707,149 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
 	}
 }
 
+static int alignment_ok(const void *base, size_t align)
+{
+	return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
+		((unsigned long)base & (align - 1)) == 0;
+}
+
+static void u32_swap(void *a, void *b, size_t size)
+{
+	u32 t = *(u32 *)a;
+	*(u32 *)a = *(u32 *)b;
+	*(u32 *)b = t;
+}
+
+static void u64_swap(void *a, void *b, size_t size)
+{
+	u64 t = *(u64 *)a;
+	*(u64 *)a = *(u64 *)b;
+	*(u64 *)b = t;
+}
+
+static void generic_swap(void *a, void *b, size_t size)
+{
+	char t;
+
+	do {
+		t = *(char *)a;
+		*(char *)a++ = *(char *)b;
+		*(char *)b++ = t;
+	} while (--size > 0);
+}
+
+static inline int do_cmp(void *base, size_t n, size_t size,
+			 int (*cmp_func)(const void *, const void *, size_t),
+			 size_t l, size_t r)
+{
+	return cmp_func(base + inorder_to_eytzinger0(l, n) * size,
+			base + inorder_to_eytzinger0(r, n) * size,
+			size);
+}
+
+static inline void do_swap(void *base, size_t n, size_t size,
+			   void (*swap_func)(void *, void *, size_t),
+			   size_t l, size_t r)
+{
+	swap_func(base + inorder_to_eytzinger0(l, n) * size,
+		  base + inorder_to_eytzinger0(r, n) * size,
+		  size);
+}
+
+void eytzinger0_sort(void *base, size_t n, size_t size,
+		     int (*cmp_func)(const void *, const void *, size_t),
+		     void (*swap_func)(void *, void *, size_t))
+{
+	int i, c, r;
+
+	if (!swap_func) {
+		if (size == 4 && alignment_ok(base, 4))
+			swap_func = u32_swap;
+		else if (size == 8 && alignment_ok(base, 8))
+			swap_func = u64_swap;
+		else
+			swap_func = generic_swap;
+	}
+
+	/* heapify */
+	for (i = n / 2 - 1; i >= 0; --i) {
+		for (r = i; r * 2 + 1 < n; r = c) {
+			c = r * 2 + 1;
+
+			if (c + 1 < n &&
+			    do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
+				c++;
+
+			if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
+				break;
+
+			do_swap(base, n, size, swap_func, r, c);
+		}
+	}
+
+	/* sort */
+	for (i = n - 1; i > 0; --i) {
+		do_swap(base, n, size, swap_func, 0, i);
+
+		for (r = 0; r * 2 + 1 < i; r = c) {
+			c = r * 2 + 1;
+
+			if (c + 1 < i &&
+			    do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
+				c++;
+
+			if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
+				break;
+
+			do_swap(base, n, size, swap_func, r, c);
+		}
+	}
+}
+
+void sort_cmp_size(void *base, size_t num, size_t size,
+	  int (*cmp_func)(const void *, const void *, size_t),
+	  void (*swap_func)(void *, void *, size_t size))
+{
+	/* pre-scale counters for performance */
+	int i = (num/2 - 1) * size, n = num * size, c, r;
+
+	if (!swap_func) {
+		if (size == 4 && alignment_ok(base, 4))
+			swap_func = u32_swap;
+		else if (size == 8 && alignment_ok(base, 8))
+			swap_func = u64_swap;
+		else
+			swap_func = generic_swap;
+	}
+
+	/* heapify */
+	for ( ; i >= 0; i -= size) {
+		for (r = i; r * 2 + size < n; r  = c) {
+			c = r * 2 + size;
+			if (c < n - size &&
+			    cmp_func(base + c, base + c + size, size) < 0)
+				c += size;
+			if (cmp_func(base + r, base + c, size) >= 0)
+				break;
+			swap_func(base + r, base + c, size);
+		}
+	}
+
+	/* sort */
+	for (i = n - size; i > 0; i -= size) {
+		swap_func(base, base + i, size);
+		for (r = 0; r * 2 + size < i; r = c) {
+			c = r * 2 + size;
+			if (c < i - size &&
+			    cmp_func(base + c, base + c + size, size) < 0)
+				c += size;
+			if (cmp_func(base + r, base + c, size) >= 0)
+				break;
+			swap_func(base + r, base + c, size);
+		}
+	}
+}
+
 #if 0
 void eytzinger1_test(void)
 {
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index c5fec87d..7ffbddb8 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -5,21 +5,23 @@
 #include <linux/bio.h>
 #include <linux/blkdev.h>
 #include <linux/closure.h>
-#include <linux/darray.h>
 #include <linux/errno.h>
 #include <linux/freezer.h>
 #include <linux/kernel.h>
+#include <linux/sched/clock.h>
 #include <linux/llist.h>
 #include <linux/log2.h>
 #include <linux/percpu.h>
 #include <linux/preempt.h>
 #include <linux/ratelimit.h>
-#include <linux/sched/clock.h>
 #include <linux/slab.h>
-#include <linux/time_stats.h>
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
-#include <linux/mean_and_variance.h>
+
+#include "mean_and_variance.h"
+
+#include "darray.h"
+#include "time_stats.h"
 
 struct closure;
 
@@ -328,7 +330,7 @@ static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
 #endif
 }
 
-void bch2_time_stats_to_text(struct printbuf *, struct time_stats *);
+void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
 
 #define ewma_add(ewma, val, weight)					\
 ({									\
@@ -629,6 +631,34 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
 	memset(s + bytes, c, rem);
 }
 
+void sort_cmp_size(void *base, size_t num, size_t size,
+	  int (*cmp_func)(const void *, const void *, size_t),
+	  void (*swap_func)(void *, void *, size_t));
+
+/* just the memmove, doesn't update @_nr */
+#define __array_insert_item(_array, _nr, _pos)				\
+	memmove(&(_array)[(_pos) + 1],					\
+		&(_array)[(_pos)],					\
+		sizeof((_array)[0]) * ((_nr) - (_pos)))
+
+#define array_insert_item(_array, _nr, _pos, _new_item)			\
+do {									\
+	__array_insert_item(_array, _nr, _pos);				\
+	(_nr)++;							\
+	(_array)[(_pos)] = (_new_item);					\
+} while (0)
+
+#define array_remove_items(_array, _nr, _pos, _nr_to_remove)		\
+do {									\
+	(_nr) -= (_nr_to_remove);					\
+	memmove(&(_array)[(_pos)],					\
+		&(_array)[(_pos) + (_nr_to_remove)],			\
+		sizeof((_array)[0]) * ((_nr) - (_pos)));		\
+} while (0)
+
+#define array_remove_item(_array, _nr, _pos)				\
+	array_remove_items(_array, _nr, _pos, 1)
+
 static inline void __move_gap(void *array, size_t element_size,
 			      size_t nr, size_t size,
 			      size_t old_gap, size_t new_gap)
@@ -743,4 +773,25 @@ static inline bool qstr_eq(const struct qstr l, const struct qstr r)
 void bch2_darray_str_exit(darray_str *);
 int bch2_split_devs(const char *, darray_str *);
 
+#ifdef __KERNEL__
+
+__must_check
+static inline int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
+{
+	return copy_to_user(to, from, n) ? -EFAULT : 0;
+}
+
+__must_check
+static inline int copy_from_user_errcode(void *to, const void __user *from, unsigned long n)
+{
+	return copy_from_user(to, from, n) ? -EFAULT : 0;
+}
+
+#endif
+
+static inline void __set_bit_le64(size_t bit, __le64 *addr)
+{
+	addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64));
+}
+
 #endif /* _BCACHEFS_UTIL_H */
diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c
index 9c0d2316..754f17bb 100644
--- a/libbcachefs/xattr.c
+++ b/libbcachefs/xattr.c
@@ -544,11 +544,11 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
 		kfree(buf);
 
 		if (ret < 0)
-			return ret;
+			goto err_class_exit;
 
 		ret = bch2_opt_check_may_set(c, opt_id, v);
 		if (ret < 0)
-			return ret;
+			goto err_class_exit;
 
 		s.v = v + 1;
 		s.defined = true;
@@ -595,6 +595,7 @@ err:
 	     (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression))))
 		bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
 
+err_class_exit:
 	return bch2_err_class(ret);
 }
 
diff --git a/linux/generic-radix-tree.c b/linux/generic-radix-tree.c
index 41f1bcdc..aaefb9b6 100644
--- a/linux/generic-radix-tree.c
+++ b/linux/generic-radix-tree.c
@@ -5,7 +5,7 @@
 #include <linux/gfp.h>
 #include <linux/kmemleak.h>
 
-#define GENRADIX_ARY		(PAGE_SIZE / sizeof(struct genradix_node *))
+#define GENRADIX_ARY		(GENRADIX_NODE_SIZE / sizeof(struct genradix_node *))
 #define GENRADIX_ARY_SHIFT	ilog2(GENRADIX_ARY)
 
 struct genradix_node {
@@ -14,13 +14,13 @@ struct genradix_node {
 		struct genradix_node	*children[GENRADIX_ARY];
 
 		/* Leaf: */
-		u8			data[PAGE_SIZE];
+		u8			data[GENRADIX_NODE_SIZE];
 	};
 };
 
 static inline int genradix_depth_shift(unsigned depth)
 {
-	return PAGE_SHIFT + GENRADIX_ARY_SHIFT * depth;
+	return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth;
 }
 
 /*
@@ -33,7 +33,7 @@ static inline size_t genradix_depth_size(unsigned depth)
 
 /* depth that's needed for a genradix that can address up to ULONG_MAX: */
 #define GENRADIX_MAX_DEPTH	\
-	DIV_ROUND_UP(BITS_PER_LONG - PAGE_SHIFT, GENRADIX_ARY_SHIFT)
+	DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT)
 
 #define GENRADIX_DEPTH_MASK				\
 	((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
@@ -79,23 +79,12 @@ EXPORT_SYMBOL(__genradix_ptr);
 
 static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask)
 {
-	struct genradix_node *node;
-
-	node = (struct genradix_node *)__get_free_page(gfp_mask|__GFP_ZERO);
-
-	/*
-	 * We're using pages (not slab allocations) directly for kernel data
-	 * structures, so we need to explicitly inform kmemleak of them in order
-	 * to avoid false positive memory leak reports.
-	 */
-	kmemleak_alloc(node, PAGE_SIZE, 1, gfp_mask);
-	return node;
+	return kzalloc(GENRADIX_NODE_SIZE, gfp_mask);
 }
 
 static inline void genradix_free_node(struct genradix_node *node)
 {
-	kmemleak_free(node);
-	free_page((unsigned long)node);
+	kfree(node);
 }
 
 /*
@@ -200,7 +189,7 @@ restart:
 			i++;
 			iter->offset = round_down(iter->offset + objs_per_ptr,
 						  objs_per_ptr);
-			iter->pos = (iter->offset >> PAGE_SHIFT) *
+			iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) *
 				objs_per_page;
 			if (i == GENRADIX_ARY)
 				goto restart;
@@ -209,7 +198,7 @@ restart:
 		n = n->children[i];
 	}
 
-	return &n->data[iter->offset & (PAGE_SIZE - 1)];
+	return &n->data[iter->offset & (GENRADIX_NODE_SIZE - 1)];
 }
 EXPORT_SYMBOL(__genradix_iter_peek);
 
@@ -235,7 +224,7 @@ restart:
 
 	if (ilog2(iter->offset) >= genradix_depth_shift(level)) {
 		iter->offset = genradix_depth_size(level);
-		iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page;
+		iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page;
 
 		iter->offset -= obj_size_plus_page_remainder;
 		iter->pos--;
@@ -251,7 +240,7 @@ restart:
 			size_t objs_per_ptr = genradix_depth_size(level);
 
 			iter->offset = round_down(iter->offset, objs_per_ptr);
-			iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page;
+			iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page;
 
 			if (!iter->offset)
 				return NULL;
@@ -267,7 +256,7 @@ restart:
 		n = n->children[i];
 	}
 
-	return &n->data[iter->offset & (PAGE_SIZE - 1)];
+	return &n->data[iter->offset & (GENRADIX_NODE_SIZE - 1)];
 }
 EXPORT_SYMBOL(__genradix_iter_peek_prev);
 
@@ -289,7 +278,7 @@ int __genradix_prealloc(struct __genradix *radix, size_t size,
 {
 	size_t offset;
 
-	for (offset = 0; offset < size; offset += PAGE_SIZE)
+	for (offset = 0; offset < size; offset += GENRADIX_NODE_SIZE)
 		if (!__genradix_ptr_alloc(radix, offset, gfp_mask))
 			return -ENOMEM;
 
diff --git a/linux/sort.c b/linux/sort.c
index ffa4817b..ecce71c5 100644
--- a/linux/sort.c
+++ b/linux/sort.c
@@ -277,92 +277,3 @@ void sort_r(void *base, size_t num, size_t size,
 	}
 }
 EXPORT_SYMBOL(sort_r);
-
-#include <linux/eytzinger.h>
-
-static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size,
-			 cmp_r_func_t cmp_func, const void *priv,
-			 size_t l, size_t r)
-{
-	return do_cmp(base + inorder_to_eytzinger0(l, n) * size,
-		      base + inorder_to_eytzinger0(r, n) * size,
-		      cmp_func, priv);
-}
-
-static inline void eytzinger0_do_swap(void *base, size_t n, size_t size,
-			   swap_r_func_t swap_func, const void *priv,
-			   size_t l, size_t r)
-{
-	do_swap(base + inorder_to_eytzinger0(l, n) * size,
-		base + inorder_to_eytzinger0(r, n) * size,
-		size, swap_func, priv);
-}
-
-void eytzinger0_sort_r(void *base, size_t n, size_t size,
-		       cmp_r_func_t cmp_func,
-		       swap_r_func_t swap_func,
-		       const void *priv)
-{
-	int i, c, r;
-
-	/* called from 'sort' without swap function, let's pick the default */
-	if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap)
-		swap_func = NULL;
-
-	if (!swap_func) {
-		if (is_aligned(base, size, 8))
-			swap_func = SWAP_WORDS_64;
-		else if (is_aligned(base, size, 4))
-			swap_func = SWAP_WORDS_32;
-		else
-			swap_func = SWAP_BYTES;
-	}
-
-	/* heapify */
-	for (i = n / 2 - 1; i >= 0; --i) {
-		for (r = i; r * 2 + 1 < n; r = c) {
-			c = r * 2 + 1;
-
-			if (c + 1 < n &&
-			    eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
-				c++;
-
-			if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
-				break;
-
-			eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
-		}
-	}
-
-	/* sort */
-	for (i = n - 1; i > 0; --i) {
-		eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);
-
-		for (r = 0; r * 2 + 1 < i; r = c) {
-			c = r * 2 + 1;
-
-			if (c + 1 < i &&
-			    eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
-				c++;
-
-			if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
-				break;
-
-			eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
-		}
-	}
-}
-EXPORT_SYMBOL_GPL(eytzinger0_sort_r);
-
-void eytzinger0_sort(void *base, size_t n, size_t size,
-		     cmp_func_t cmp_func,
-		     swap_func_t swap_func)
-{
-	struct wrapper w = {
-		.cmp  = cmp_func,
-		.swap = swap_func,
-	};
-
-	return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
-}
-EXPORT_SYMBOL_GPL(eytzinger0_sort);
diff --git a/linux/time_stats.c b/linux/time_stats.c
deleted file mode 100644
index d7dd64ba..00000000
--- a/linux/time_stats.c
+++ /dev/null
@@ -1,373 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/eytzinger.h>
-#include <linux/jiffies.h>
-#include <linux/module.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/time.h>
-#include <linux/time_stats.h>
-#include <linux/spinlock.h>
-
-static const struct time_unit time_units[] = {
-	{ "ns",		1		 },
-	{ "us",		NSEC_PER_USEC	 },
-	{ "ms",		NSEC_PER_MSEC	 },
-	{ "s",		NSEC_PER_SEC	 },
-	{ "m",          (u64) NSEC_PER_SEC * 60},
-	{ "h",          (u64) NSEC_PER_SEC * 3600},
-	{ "d",          (u64) NSEC_PER_SEC * 3600 * 24},
-	{ "w",          (u64) NSEC_PER_SEC * 3600 * 24 * 7},
-	{ "y",          (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */
-	{ "eon",        U64_MAX          },
-};
-
-const struct time_unit *pick_time_units(u64 ns)
-{
-	const struct time_unit *u;
-
-	for (u = time_units;
-	     u + 1 < time_units + ARRAY_SIZE(time_units) &&
-	     ns >= u[1].nsecs << 1;
-	     u++)
-		;
-
-	return u;
-}
-EXPORT_SYMBOL_GPL(pick_time_units);
-
-static void quantiles_update(struct quantiles *q, u64 v)
-{
-	unsigned i = 0;
-
-	while (i < ARRAY_SIZE(q->entries)) {
-		struct quantile_entry *e = q->entries + i;
-
-		if (unlikely(!e->step)) {
-			e->m = v;
-			e->step = max_t(unsigned, v / 2, 1024);
-		} else if (e->m > v) {
-			e->m = e->m >= e->step
-				? e->m - e->step
-				: 0;
-		} else if (e->m < v) {
-			e->m = e->m + e->step > e->m
-				? e->m + e->step
-				: U32_MAX;
-		}
-
-		if ((e->m > v ? e->m - v : v - e->m) < e->step)
-			e->step = max_t(unsigned, e->step / 2, 1);
-
-		if (v >= e->m)
-			break;
-
-		i = eytzinger0_child(i, v > e->m);
-	}
-}
-
-static inline void time_stats_update_one(struct time_stats *stats,
-					      u64 start, u64 end)
-{
-	u64 duration, freq;
-	bool initted = stats->last_event != 0;
-
-	if (time_after64(end, start)) {
-		struct quantiles *quantiles = time_stats_to_quantiles(stats);
-
-		duration = end - start;
-		mean_and_variance_update(&stats->duration_stats, duration);
-		mean_and_variance_weighted_update(&stats->duration_stats_weighted,
-				duration, initted, TIME_STATS_MV_WEIGHT);
-		stats->max_duration = max(stats->max_duration, duration);
-		stats->min_duration = min(stats->min_duration, duration);
-		stats->total_duration += duration;
-
-		if (quantiles)
-			quantiles_update(quantiles, duration);
-	}
-
-	if (stats->last_event && time_after64(end, stats->last_event)) {
-		freq = end - stats->last_event;
-		mean_and_variance_update(&stats->freq_stats, freq);
-		mean_and_variance_weighted_update(&stats->freq_stats_weighted,
-				freq, initted, TIME_STATS_MV_WEIGHT);
-		stats->max_freq = max(stats->max_freq, freq);
-		stats->min_freq = min(stats->min_freq, freq);
-	}
-
-	stats->last_event = end;
-}
-
-void __time_stats_clear_buffer(struct time_stats *stats,
-			       struct time_stat_buffer *b)
-{
-	for (struct time_stat_buffer_entry *i = b->entries;
-	     i < b->entries + ARRAY_SIZE(b->entries);
-	     i++)
-		time_stats_update_one(stats, i->start, i->end);
-	b->nr = 0;
-}
-EXPORT_SYMBOL_GPL(__time_stats_clear_buffer);
-
-static noinline void time_stats_clear_buffer(struct time_stats *stats,
-					     struct time_stat_buffer *b)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&stats->lock, flags);
-	__time_stats_clear_buffer(stats, b);
-	spin_unlock_irqrestore(&stats->lock, flags);
-}
-
-void __time_stats_update(struct time_stats *stats, u64 start, u64 end)
-{
-	unsigned long flags;
-
-	if (!stats->buffer) {
-		spin_lock_irqsave(&stats->lock, flags);
-		time_stats_update_one(stats, start, end);
-
-		if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
-		    stats->duration_stats.n > 1024)
-			stats->buffer =
-				alloc_percpu_gfp(struct time_stat_buffer,
-						 GFP_ATOMIC);
-		spin_unlock_irqrestore(&stats->lock, flags);
-	} else {
-		struct time_stat_buffer *b;
-
-		preempt_disable();
-		b = this_cpu_ptr(stats->buffer);
-
-		BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
-		b->entries[b->nr++] = (struct time_stat_buffer_entry) {
-			.start = start,
-			.end = end
-		};
-
-		if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
-			time_stats_clear_buffer(stats, b);
-		preempt_enable();
-	}
-}
-EXPORT_SYMBOL_GPL(__time_stats_update);
-
-#include <linux/seq_buf.h>
-
-static void seq_buf_time_units_aligned(struct seq_buf *out, u64 ns)
-{
-	const struct time_unit *u = pick_time_units(ns);
-
-	seq_buf_printf(out, "%8llu %s", div64_u64(ns, u->nsecs), u->name);
-}
-
-static inline u64 time_stats_lifetime(const struct time_stats *stats)
-{
-	return local_clock() - stats->start_time;
-}
-
-void time_stats_to_seq_buf(struct seq_buf *out, struct time_stats *stats,
-		const char *epoch_name, unsigned int flags)
-{
-	struct quantiles *quantiles = time_stats_to_quantiles(stats);
-	s64 f_mean = 0, d_mean = 0;
-	u64 f_stddev = 0, d_stddev = 0;
-	u64 lifetime = time_stats_lifetime(stats);
-
-	if (stats->buffer) {
-		int cpu;
-
-		spin_lock_irq(&stats->lock);
-		for_each_possible_cpu(cpu)
-			__time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
-		spin_unlock_irq(&stats->lock);
-	}
-
-	if (stats->freq_stats.n) {
-		/* avoid divide by zero */
-		f_mean = mean_and_variance_get_mean(stats->freq_stats);
-		f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
-		d_mean = mean_and_variance_get_mean(stats->duration_stats);
-		d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
-	} else if (flags & TIME_STATS_PRINT_NO_ZEROES) {
-		/* unless we didn't want zeroes anyway */
-		return;
-	}
-
-	seq_buf_printf(out, "count: %llu\n", stats->duration_stats.n);
-	seq_buf_printf(out, "lifetime: ");
-	seq_buf_time_units_aligned(out, lifetime);
-	seq_buf_printf(out, "\n");
-
-	seq_buf_printf(out, "                       since %-12s recent\n", epoch_name);
-
-	seq_buf_printf(out, "duration of events\n");
-
-	seq_buf_printf(out, "  min:                     ");
-	seq_buf_time_units_aligned(out, stats->min_duration);
-	seq_buf_printf(out, "\n");
-
-	seq_buf_printf(out, "  max:                     ");
-	seq_buf_time_units_aligned(out, stats->max_duration);
-	seq_buf_printf(out, "\n");
-
-	seq_buf_printf(out, "  total:                   ");
-	seq_buf_time_units_aligned(out, stats->total_duration);
-	seq_buf_printf(out, "\n");
-
-	seq_buf_printf(out, "  mean:                    ");
-	seq_buf_time_units_aligned(out, d_mean);
-	seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
-	seq_buf_printf(out, "\n");
-
-	seq_buf_printf(out, "  stddev:                  ");
-	seq_buf_time_units_aligned(out, d_stddev);
-	seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
-	seq_buf_printf(out, "\n");
-
-	seq_buf_printf(out, "time between events\n");
-
-	seq_buf_printf(out, "  min:                     ");
-	seq_buf_time_units_aligned(out, stats->min_freq);
-	seq_buf_printf(out, "\n");
-
-	seq_buf_printf(out, "  max:                     ");
-	seq_buf_time_units_aligned(out, stats->max_freq);
-	seq_buf_printf(out, "\n");
-
-	seq_buf_printf(out, "  mean:                    ");
-	seq_buf_time_units_aligned(out, f_mean);
-	seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
-	seq_buf_printf(out, "\n");
-
-	seq_buf_printf(out, "  stddev:                  ");
-	seq_buf_time_units_aligned(out, f_stddev);
-	seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
-	seq_buf_printf(out, "\n");
-
-	if (quantiles) {
-		int i = eytzinger0_first(NR_QUANTILES);
-		const struct time_unit *u =
-			pick_time_units(quantiles->entries[i].m);
-		u64 last_q = 0;
-
-		seq_buf_printf(out, "quantiles (%s):\t", u->name);
-		eytzinger0_for_each(i, NR_QUANTILES) {
-			bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
-
-			u64 q = max(quantiles->entries[i].m, last_q);
-			seq_buf_printf(out, "%llu ", div_u64(q, u->nsecs));
-			if (is_last)
-				seq_buf_printf(out, "\n");
-			last_q = q;
-		}
-	}
-}
-EXPORT_SYMBOL_GPL(time_stats_to_seq_buf);
-
-void time_stats_to_json(struct seq_buf *out, struct time_stats *stats,
-		const char *epoch_name, unsigned int flags)
-{
-	struct quantiles *quantiles = time_stats_to_quantiles(stats);
-	s64 f_mean = 0, d_mean = 0;
-	u64 f_stddev = 0, d_stddev = 0;
-
-	if (stats->buffer) {
-		int cpu;
-
-		spin_lock_irq(&stats->lock);
-		for_each_possible_cpu(cpu)
-			__time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
-		spin_unlock_irq(&stats->lock);
-	}
-
-	if (stats->freq_stats.n) {
-		/* avoid divide by zero */
-		f_mean = mean_and_variance_get_mean(stats->freq_stats);
-		f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
-		d_mean = mean_and_variance_get_mean(stats->duration_stats);
-		d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
-	} else if (flags & TIME_STATS_PRINT_NO_ZEROES) {
-		/* unless we didn't want zeroes anyway */
-		return;
-	}
-
-	seq_buf_printf(out, "{\n");
-	seq_buf_printf(out, "  \"epoch\":       \"%s\",\n", epoch_name);
-	seq_buf_printf(out, "  \"count\":       %llu,\n", stats->duration_stats.n);
-
-	seq_buf_printf(out, "  \"duration_ns\": {\n");
-	seq_buf_printf(out, "    \"min\":       %llu,\n", stats->min_duration);
-	seq_buf_printf(out, "    \"max\":       %llu,\n", stats->max_duration);
-	seq_buf_printf(out, "    \"total\":     %llu,\n", stats->total_duration);
-	seq_buf_printf(out, "    \"mean\":      %llu,\n", d_mean);
-	seq_buf_printf(out, "    \"stddev\":    %llu\n", d_stddev);
-	seq_buf_printf(out, "  },\n");
-
-	d_mean = mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT);
-	d_stddev = mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT);
-
-	seq_buf_printf(out, "  \"duration_ewma_ns\": {\n");
-	seq_buf_printf(out, "    \"mean\":      %llu,\n", d_mean);
-	seq_buf_printf(out, "    \"stddev\":    %llu\n", d_stddev);
-	seq_buf_printf(out, "  },\n");
-
-	seq_buf_printf(out, "  \"between_ns\": {\n");
-	seq_buf_printf(out, "    \"min\":       %llu,\n", stats->min_freq);
-	seq_buf_printf(out, "    \"max\":       %llu,\n", stats->max_freq);
-	seq_buf_printf(out, "    \"mean\":      %llu,\n", f_mean);
-	seq_buf_printf(out, "    \"stddev\":    %llu\n", f_stddev);
-	seq_buf_printf(out, "  },\n");
-
-	f_mean = mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT);
-	f_stddev = mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT);
-
-	seq_buf_printf(out, "  \"between_ewma_ns\": {\n");
-	seq_buf_printf(out, "    \"mean\":      %llu,\n", f_mean);
-	seq_buf_printf(out, "    \"stddev\":    %llu\n", f_stddev);
-
-	if (quantiles) {
-		u64 last_q = 0;
-
-		/* close between_ewma_ns but signal more items */
-		seq_buf_printf(out, "  },\n");
-
-		seq_buf_printf(out, "  \"quantiles_ns\": [\n");
-		eytzinger0_for_each(i, NR_QUANTILES) {
-			bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
-
-			u64 q = max(quantiles->entries[i].m, last_q);
-			seq_buf_printf(out, "    %llu", q);
-			if (!is_last)
-				seq_buf_printf(out, ", ");
-			last_q = q;
-		}
-		seq_buf_printf(out, "  ]\n");
-	} else {
-		/* close between_ewma_ns without dumping further */
-		seq_buf_printf(out, "  }\n");
-	}
-
-	seq_buf_printf(out, "}\n");
-}
-EXPORT_SYMBOL_GPL(time_stats_to_json);
-
-void time_stats_exit(struct time_stats *stats)
-{
-	free_percpu(stats->buffer);
-}
-EXPORT_SYMBOL_GPL(time_stats_exit);
-
-void time_stats_init(struct time_stats *stats)
-{
-	memset(stats, 0, sizeof(*stats));
-	stats->min_duration = U64_MAX;
-	stats->min_freq = U64_MAX;
-	stats->start_time = local_clock();
-	spin_lock_init(&stats->lock);
-}
-EXPORT_SYMBOL_GPL(time_stats_init);
-
-MODULE_AUTHOR("Kent Overstreet");
-MODULE_LICENSE("GPL");