diff --git a/.bcachefs_revision b/.bcachefs_revision
index 797e1336..d3c500bc 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-481b5f34324809f47a58ed798d038fb17e5b7b0a
+50847e296b34efabe199e408ec4d72f10a866c39
diff --git a/Makefile b/Makefile
index 5cdb437c..876efbb7 100644
--- a/Makefile
+++ b/Makefile
@@ -273,11 +273,20 @@ update-bcachefs-sources:
 	git add include/linux/kmemleak.h
 	cp $(LINUX_DIR)/lib/math/int_sqrt.c linux/
 	git add linux/int_sqrt.c
-	git rm -f libbcachefs/mean_and_variance_test.c
-#	cp $(LINUX_DIR)/lib/math/mean_and_variance.c linux/
-#	git add linux/mean_and_variance.c
-#	cp $(LINUX_DIR)/include/linux/mean_and_variance.h include/linux/
-#	git add include/linux/mean_and_variance.h
+	cp $(LINUX_DIR)/lib/math/mean_and_variance.c linux/
+	git add linux/mean_and_variance.c
+	cp $(LINUX_DIR)/include/linux/mean_and_variance.h include/linux/
+	git add include/linux/mean_and_variance.h
+	cp $(LINUX_DIR)/lib/time_stats.c linux/
+	git add linux/time_stats.c
+	cp $(LINUX_DIR)/include/linux/time_stats.h include/linux/
+	git add include/linux/time_stats.h
+	cp $(LINUX_DIR)/include/linux/darray.h include/linux/
+	git add include/linux/darray.h
+	cp $(LINUX_DIR)/include/linux/darray_types.h include/linux/
+	git add include/linux/darray_types.h
+	cp $(LINUX_DIR)/include/linux/eytzinger.h include/linux/
+	git add include/linux/eytzinger.h
 	cp $(LINUX_DIR)/scripts/Makefile.compiler ./
 	git add Makefile.compiler
 	$(RM) libbcachefs/*.mod.c
diff --git a/c_src/cmd_format.c b/c_src/cmd_format.c
index 6b77763e..3d29f413 100644
--- a/c_src/cmd_format.c
+++ b/c_src/cmd_format.c
@@ -23,12 +23,13 @@
 #include "cmds.h"
 #include "libbcachefs.h"
 #include "crypto.h"
-#include "libbcachefs/darray.h"
 #include "libbcachefs/errcode.h"
 #include "libbcachefs/opts.h"
 #include "libbcachefs/super-io.h"
 #include "libbcachefs/util.h"
 
+#include "linux/darray.h"
+
 #define OPTS						\
 x(0,	replicas,		required_argument)	\
 x(0,	encrypted,		no_argument)		\
diff --git a/c_src/cmd_fs.c b/c_src/cmd_fs.c
index 1a5d144b..d8542d61 100644
--- a/c_src/cmd_fs.c
+++ b/c_src/cmd_fs.c
@@ -9,13 +9,14 @@
 
 #include "libbcachefs/bcachefs_ioctl.h"
 #include "libbcachefs/buckets.h"
-#include "libbcachefs/darray.h"
 #include "libbcachefs/opts.h"
 #include "libbcachefs/super-io.h"
 
 #include "cmds.h"
 #include "libbcachefs.h"
 
+#include "linux/darray.h"
+
 static void __dev_usage_type_to_text(struct printbuf *out,
 				     enum bch_data_type type,
 				     unsigned bucket_size,
diff --git a/c_src/tools-util.h b/c_src/tools-util.h
index bff3bc65..4682406e 100644
--- a/c_src/tools-util.h
+++ b/c_src/tools-util.h
@@ -20,7 +20,7 @@
 #include <linux/uuid.h>
 #include "libbcachefs/bcachefs.h"
 #include "libbcachefs/bbpos.h"
-#include "libbcachefs/darray.h"
+#include "linux/darray.h"
 
 #define noreturn __attribute__((noreturn))
 
diff --git a/libbcachefs/darray.h b/include/linux/darray.h
similarity index 66%
rename from libbcachefs/darray.h
rename to include/linux/darray.h
index 4b340d13..ff167eb7 100644
--- a/libbcachefs/darray.h
+++ b/include/linux/darray.h
@@ -1,34 +1,26 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DARRAY_H
-#define _BCACHEFS_DARRAY_H
+/*
+ * (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev>
+ */
+#ifndef _LINUX_DARRAY_H
+#define _LINUX_DARRAY_H
 
 /*
- * Dynamic arrays:
+ * Dynamic arrays
  *
  * Inspired by CCAN's darray
  */
 
+#include <linux/darray_types.h>
 #include <linux/slab.h>
 
-#define DARRAY_PREALLOCATED(_type, _nr)					\
-struct {								\
-	size_t nr, size;						\
-	_type *data;							\
-	_type preallocated[_nr];					\
-}
-
-#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
-
-typedef DARRAY(char)	darray_char;
-typedef DARRAY(char *) darray_str;
-
-int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t);
+int __darray_resize_slowpath(darray_char *, size_t, size_t, gfp_t);
 
 static inline int __darray_resize(darray_char *d, size_t element_size,
 				  size_t new_size, gfp_t gfp)
 {
 	return unlikely(new_size > d->size)
-		? __bch2_darray_resize(d, element_size, new_size, gfp)
+		? __darray_resize_slowpath(d, element_size, new_size, gfp)
 		: 0;
 }
 
@@ -69,6 +61,28 @@ static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more,
 #define darray_first(_d)	((_d).data[0])
 #define darray_last(_d)		((_d).data[(_d).nr - 1])
 
+/* Insert/remove items into the middle of a darray: */
+
+#define array_insert_item(_array, _nr, _pos, _new_item)			\
+do {									\
+	memmove(&(_array)[(_pos) + 1],					\
+		&(_array)[(_pos)],					\
+		sizeof((_array)[0]) * ((_nr) - (_pos)));		\
+	(_nr)++;							\
+	(_array)[(_pos)] = (_new_item);					\
+} while (0)
+
+#define array_remove_items(_array, _nr, _pos, _nr_to_remove)		\
+do {									\
+	(_nr) -= (_nr_to_remove);					\
+	memmove(&(_array)[(_pos)],					\
+		&(_array)[(_pos) + (_nr_to_remove)],			\
+		sizeof((_array)[0]) * ((_nr) - (_pos)));		\
+} while (0)
+
+#define array_remove_item(_array, _nr, _pos)				\
+	array_remove_items(_array, _nr, _pos, 1)
+
 #define darray_insert_item(_d, pos, _item)				\
 ({									\
 	size_t _pos = (pos);						\
@@ -79,10 +93,15 @@ static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more,
 	_ret;								\
 })
 
-#define darray_remove_item(_d, _pos)					\
-	array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
+#define darray_remove_items(_d, _pos, _nr_to_remove)			\
+	array_remove_items((_d)->data, (_d)->nr, (_pos) - (_d)->data, _nr_to_remove)
 
-#define __darray_for_each(_d, _i)						\
+#define darray_remove_item(_d, _pos)					\
+	darray_remove_items(_d, _pos, 1)
+
+/* Iteration: */
+
+#define __darray_for_each(_d, _i)					\
 	for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++)
 
 #define darray_for_each(_d, _i)						\
@@ -106,4 +125,4 @@ do {									\
 	darray_init(_d);						\
 } while (0)
 
-#endif /* _BCACHEFS_DARRAY_H */
+#endif /* _LINUX_DARRAY_H */
diff --git a/include/linux/darray_types.h b/include/linux/darray_types.h
new file mode 100644
index 00000000..a400a0c3
--- /dev/null
+++ b/include/linux/darray_types.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev>
+ */
+#ifndef _LINUX_DARRAY_TYpES_H
+#define _LINUX_DARRAY_TYpES_H
+
+#include <linux/types.h>
+
+#define DARRAY_PREALLOCATED(_type, _nr)					\
+struct {								\
+	size_t nr, size;						\
+	_type *data;							\
+	_type preallocated[_nr];					\
+}
+
+#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
+
+typedef DARRAY(char)	darray_char;
+typedef DARRAY(char *)	darray_str;
+
+#endif /* _LINUX_DARRAY_TYpES_H */
diff --git a/libbcachefs/eytzinger.h b/include/linux/eytzinger.h
similarity index 78%
rename from libbcachefs/eytzinger.h
rename to include/linux/eytzinger.h
index b04750db..9565a5c2 100644
--- a/libbcachefs/eytzinger.h
+++ b/include/linux/eytzinger.h
@@ -1,27 +1,37 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _EYTZINGER_H
-#define _EYTZINGER_H
+#ifndef _LINUX_EYTZINGER_H
+#define _LINUX_EYTZINGER_H
 
 #include <linux/bitops.h>
 #include <linux/log2.h>
 
-#include "util.h"
+#ifdef EYTZINGER_DEBUG
+#define EYTZINGER_BUG_ON(cond)		BUG_ON(cond)
+#else
+#define EYTZINGER_BUG_ON(cond)
+#endif
 
 /*
  * Traversal for trees in eytzinger layout - a full binary tree layed out in an
- * array
- */
-
-/*
- * One based indexing version:
+ * array.
  *
- * With one based indexing each level of the tree starts at a power of two -
- * good for cacheline alignment:
+ * Consider using an eytzinger tree any time you would otherwise be doing binary
+ * search over an array. Binary search is a worst case scenario for branch
+ * prediction and prefetching, but in an eytzinger tree every node's children
+ * are adjacent in memory, thus we can prefetch children before knowing the
+ * result of the comparison, assuming multiple nodes fit on a cacheline.
+ *
+ * Two variants are provided, for one based indexing and zero based indexing.
+ *
+ * Zero based indexing is more convenient, but one based indexing has better
+ * alignment and thus better performance because each new level of the tree
+ * starts at a power of two, and thus if element 0 was cacheline aligned, each
+ * new level will be as well.
  */
 
 static inline unsigned eytzinger1_child(unsigned i, unsigned child)
 {
-	EBUG_ON(child > 1);
+	EYTZINGER_BUG_ON(child > 1);
 
 	return (i << 1) + child;
 }
@@ -58,7 +68,7 @@ static inline unsigned eytzinger1_last(unsigned size)
 
 static inline unsigned eytzinger1_next(unsigned i, unsigned size)
 {
-	EBUG_ON(i > size);
+	EYTZINGER_BUG_ON(i > size);
 
 	if (eytzinger1_right_child(i) <= size) {
 		i = eytzinger1_right_child(i);
@@ -74,7 +84,7 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size)
 
 static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
 {
-	EBUG_ON(i > size);
+	EYTZINGER_BUG_ON(i > size);
 
 	if (eytzinger1_left_child(i) <= size) {
 		i = eytzinger1_left_child(i) + 1;
@@ -101,7 +111,7 @@ static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
 	unsigned shift = __fls(size) - b;
 	int s;
 
-	EBUG_ON(!i || i > size);
+	EYTZINGER_BUG_ON(!i || i > size);
 
 	i  ^= 1U << b;
 	i <<= 1;
@@ -126,7 +136,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
 	unsigned shift;
 	int s;
 
-	EBUG_ON(!i || i > size);
+	EYTZINGER_BUG_ON(!i || i > size);
 
 	/*
 	 * sign bit trick:
@@ -164,7 +174,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
 
 static inline unsigned eytzinger0_child(unsigned i, unsigned child)
 {
-	EBUG_ON(child > 1);
+	EYTZINGER_BUG_ON(child > 1);
 
 	return (i << 1) + 1 + child;
 }
@@ -231,11 +241,9 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
 	     (_i) != -1;				\
 	     (_i) = eytzinger0_next((_i), (_size)))
 
-typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
-
 /* return greatest node <= @search, or -1 if not found */
 static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
-					 eytzinger_cmp_fn cmp, const void *search)
+					 cmp_func_t cmp, const void *search)
 {
 	unsigned i, n = 0;
 
@@ -244,7 +252,7 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
 
 	do {
 		i = n;
-		n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
+		n = eytzinger0_child(i, cmp(search, base + i * size) >= 0);
 	} while (n < nr);
 
 	if (n & 1) {
@@ -274,8 +282,8 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
 	_i;								\
 })
 
-void eytzinger0_sort(void *, size_t, size_t,
-		    int (*cmp_func)(const void *, const void *, size_t),
-		    void (*swap_func)(void *, void *, size_t));
+void eytzinger0_sort_r(void *, size_t, size_t,
+		       cmp_r_func_t, swap_r_func_t, const void *);
+void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t);
 
-#endif /* _EYTZINGER_H */
+#endif /* _LINUX_EYTZINGER_H */
diff --git a/libbcachefs/mean_and_variance.h b/include/linux/mean_and_variance.h
similarity index 94%
rename from libbcachefs/mean_and_variance.h
rename to include/linux/mean_and_variance.h
index b2be565b..4fcf062d 100644
--- a/libbcachefs/mean_and_variance.h
+++ b/include/linux/mean_and_variance.h
@@ -17,7 +17,7 @@
  * Rust and rustc has issues with u128.
  */
 
-#if defined(__SIZEOF_INT128__) && defined(__KERNEL__)
+#if defined(__SIZEOF_INT128__) && defined(__KERNEL__) && !defined(CONFIG_PARISC)
 
 typedef struct {
 	unsigned __int128 v;
@@ -154,8 +154,6 @@ struct mean_and_variance {
 
 /* expontentially weighted variant */
 struct mean_and_variance_weighted {
-	bool	init;
-	u8	weight;	/* base 2 logarithim */
 	s64	mean;
 	u64	variance;
 };
@@ -192,10 +190,14 @@ s64 mean_and_variance_get_mean(struct mean_and_variance s);
 u64 mean_and_variance_get_variance(struct mean_and_variance s1);
 u32 mean_and_variance_get_stddev(struct mean_and_variance s);
 
-void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v);
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
+		s64 v, bool initted, u8 weight);
 
-s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s);
-u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s);
-u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s);
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
+		u8 weight);
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
+		u8 weight);
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
+		u8 weight);
 
 #endif // MEAN_AND_VAIRANCE_H_
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 506da24d..37325170 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -90,6 +90,19 @@ static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size)
 			      (void *) size);
 }
 
+void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data);
+void mempool_kvfree(void *element, void *pool_data);
+
+static inline int mempool_init_kvmalloc_pool(mempool_t *pool, int min_nr, size_t size)
+{
+	return mempool_init(pool, min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size);
+}
+
+static inline mempool_t *mempool_create_kvmalloc_pool(int min_nr, size_t size)
+{
+	return mempool_create(min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size);
+}
+
 /*
  * A mempool_alloc_t and mempool_free_t for a simple page allocator that
  * allocates pages of the order specified by pool_data
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 6c4a623c..28ce667b 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -1,65 +1 @@
-#ifndef __TOOLS_LINUX_SPINLOCK_H
-#define __TOOLS_LINUX_SPINLOCK_H
-
-#include <linux/atomic.h>
-#include <pthread.h>
-
-typedef struct {
-	pthread_mutex_t lock;
-} raw_spinlock_t;
-
-#define __RAW_SPIN_LOCK_UNLOCKED(name)	(raw_spinlock_t) { .lock = PTHREAD_MUTEX_INITIALIZER  }
-
-static inline void raw_spin_lock_init(raw_spinlock_t *lock)
-{
-	pthread_mutex_init(&lock->lock, NULL);
-}
-
-static inline bool raw_spin_trylock(raw_spinlock_t *lock)
-{
-	return !pthread_mutex_trylock(&lock->lock);
-}
-
-static inline void raw_spin_lock(raw_spinlock_t *lock)
-{
-	pthread_mutex_lock(&lock->lock);
-}
-
-static inline void raw_spin_unlock(raw_spinlock_t *lock)
-{
-	pthread_mutex_unlock(&lock->lock);
-}
-
-#define raw_spin_lock_irq(lock)		raw_spin_lock(lock)
-#define raw_spin_unlock_irq(lock)	raw_spin_unlock(lock)
-
-#define raw_spin_lock_irqsave(lock, flags)		\
-do {							\
-	flags = 0;					\
-	raw_spin_lock(lock);				\
-} while (0)
-
-#define raw_spin_unlock_irqrestore(lock, flags) raw_spin_unlock(lock)
-
-typedef raw_spinlock_t spinlock_t;
-
-#define __SPIN_LOCK_UNLOCKED(name)	__RAW_SPIN_LOCK_UNLOCKED(name)
-
-#define DEFINE_SPINLOCK(x)	spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
-
-#define spin_lock_init(lock)		raw_spin_lock_init(lock)
-#define spin_lock(lock)			raw_spin_lock(lock)
-#define spin_unlock(lock)		raw_spin_unlock(lock)
-
-#define spin_lock_nested(lock, n)	spin_lock(lock)
-
-#define spin_lock_bh(lock)		raw_spin_lock(lock)
-#define spin_unlock_bh(lock)		raw_spin_unlock(lock)
-
-#define spin_lock_irq(lock)		raw_spin_lock(lock)
-#define spin_unlock_irq(lock)		raw_spin_unlock(lock)
-
-#define spin_lock_irqsave(lock, flags)	raw_spin_lock_irqsave(lock, flags)
-#define spin_unlock_irqrestore(lock, flags) raw_spin_unlock_irqrestore(lock, flags)
-
-#endif /* __TOOLS_LINUX_SPINLOCK_H */
+#include "linux/spinlock_types.h"
diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
new file mode 100644
index 00000000..6c4a623c
--- /dev/null
+++ b/include/linux/spinlock_types.h
@@ -0,0 +1,65 @@
+#ifndef __TOOLS_LINUX_SPINLOCK_H
+#define __TOOLS_LINUX_SPINLOCK_H
+
+#include <linux/atomic.h>
+#include <pthread.h>
+
+typedef struct {
+	pthread_mutex_t lock;
+} raw_spinlock_t;
+
+#define __RAW_SPIN_LOCK_UNLOCKED(name)	(raw_spinlock_t) { .lock = PTHREAD_MUTEX_INITIALIZER  }
+
+static inline void raw_spin_lock_init(raw_spinlock_t *lock)
+{
+	pthread_mutex_init(&lock->lock, NULL);
+}
+
+static inline bool raw_spin_trylock(raw_spinlock_t *lock)
+{
+	return !pthread_mutex_trylock(&lock->lock);
+}
+
+static inline void raw_spin_lock(raw_spinlock_t *lock)
+{
+	pthread_mutex_lock(&lock->lock);
+}
+
+static inline void raw_spin_unlock(raw_spinlock_t *lock)
+{
+	pthread_mutex_unlock(&lock->lock);
+}
+
+#define raw_spin_lock_irq(lock)		raw_spin_lock(lock)
+#define raw_spin_unlock_irq(lock)	raw_spin_unlock(lock)
+
+#define raw_spin_lock_irqsave(lock, flags)		\
+do {							\
+	flags = 0;					\
+	raw_spin_lock(lock);				\
+} while (0)
+
+#define raw_spin_unlock_irqrestore(lock, flags) raw_spin_unlock(lock)
+
+typedef raw_spinlock_t spinlock_t;
+
+#define __SPIN_LOCK_UNLOCKED(name)	__RAW_SPIN_LOCK_UNLOCKED(name)
+
+#define DEFINE_SPINLOCK(x)	spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
+
+#define spin_lock_init(lock)		raw_spin_lock_init(lock)
+#define spin_lock(lock)			raw_spin_lock(lock)
+#define spin_unlock(lock)		raw_spin_unlock(lock)
+
+#define spin_lock_nested(lock, n)	spin_lock(lock)
+
+#define spin_lock_bh(lock)		raw_spin_lock(lock)
+#define spin_unlock_bh(lock)		raw_spin_unlock(lock)
+
+#define spin_lock_irq(lock)		raw_spin_lock(lock)
+#define spin_unlock_irq(lock)		raw_spin_unlock(lock)
+
+#define spin_lock_irqsave(lock, flags)	raw_spin_lock_irqsave(lock, flags)
+#define spin_unlock_irqrestore(lock, flags) raw_spin_unlock_irqrestore(lock, flags)
+
+#endif /* __TOOLS_LINUX_SPINLOCK_H */
diff --git a/include/linux/thread_with_file.h b/include/linux/thread_with_file.h
new file mode 100644
index 00000000..2a66e762
--- /dev/null
+++ b/include/linux/thread_with_file.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev>
+ */
+#ifndef _LINUX_THREAD_WITH_FILE_H
+#define _LINUX_THREAD_WITH_FILE_H
+
+struct stdio_redirect;
+
+__printf(3, 0)
+static inline void stdio_redirect_vprintf(struct stdio_redirect *s, bool nonblocking, const char *msg, va_list args) {}
+__printf(3, 4)
+static inline void stdio_redirect_printf(struct stdio_redirect *s, bool nonblocking, const char *msg, ...) {}
+
+#endif /* _LINUX_THREAD_WITH_FILE_H */
diff --git a/include/linux/thread_with_file_types.h b/include/linux/thread_with_file_types.h
new file mode 100644
index 00000000..e69de29b
diff --git a/include/linux/time.h b/include/linux/time.h
new file mode 100644
index 00000000..e69de29b
diff --git a/include/linux/time_stats.h b/include/linux/time_stats.h
new file mode 100644
index 00000000..6df2b34a
--- /dev/null
+++ b/include/linux/time_stats.h
@@ -0,0 +1,167 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * time_stats - collect statistics on events that have a duration, with nicely
+ * formatted textual output on demand
+ *
+ * - percpu buffering of event collection: cheap enough to shotgun
+ *   everywhere without worrying about overhead
+ *
+ * tracks:
+ *  - number of events
+ *  - maximum event duration ever seen
+ *  - sum of all event durations
+ *  - average event duration, standard and weighted
+ *  - standard deviation of event durations, standard and weighted
+ * and analagous statistics for the frequency of events
+ *
+ * We provide both mean and weighted mean (exponentially weighted), and standard
+ * deviation and weighted standard deviation, to give an efficient-to-compute
+ * view of current behaviour versus. average behaviour - "did this event source
+ * just become wonky, or is this typical?".
+ *
+ * Particularly useful for tracking down latency issues.
+ */
+#ifndef _LINUX_TIME_STATS_H
+#define _LINUX_TIME_STATS_H
+
+#include <linux/mean_and_variance.h>
+#include <linux/sched/clock.h>
+#include <linux/spinlock_types.h>
+#include <linux/string.h>
+
+struct time_unit {
+	const char	*name;
+	u64		nsecs;
+};
+
+/*
+ * given a nanosecond value, pick the preferred time units for printing:
+ */
+const struct time_unit *pick_time_units(u64 ns);
+
+/*
+ * quantiles - do not use:
+ *
+ * Only enabled if time_stats->quantiles_enabled has been manually set - don't
+ * use in new code.
+ */
+
+#define NR_QUANTILES	15
+#define QUANTILE_IDX(i)	inorder_to_eytzinger0(i, NR_QUANTILES)
+#define QUANTILE_FIRST	eytzinger0_first(NR_QUANTILES)
+#define QUANTILE_LAST	eytzinger0_last(NR_QUANTILES)
+
+struct quantiles {
+	struct quantile_entry {
+		u64	m;
+		u64	step;
+	}		entries[NR_QUANTILES];
+};
+
+struct time_stat_buffer {
+	unsigned	nr;
+	struct time_stat_buffer_entry {
+		u64	start;
+		u64	end;
+	}		entries[31];
+};
+
+struct time_stats {
+	spinlock_t	lock;
+	bool		have_quantiles;
+	/* all fields are in nanoseconds */
+	u64             min_duration;
+	u64		max_duration;
+	u64		total_duration;
+	u64             max_freq;
+	u64             min_freq;
+	u64		last_event;
+	u64		last_event_start;
+
+	struct mean_and_variance	  duration_stats;
+	struct mean_and_variance	  freq_stats;
+
+/* default weight for weighted mean and variance calculations */
+#define TIME_STATS_MV_WEIGHT	8
+
+	struct mean_and_variance_weighted duration_stats_weighted;
+	struct mean_and_variance_weighted freq_stats_weighted;
+	struct time_stat_buffer __percpu *buffer;
+
+	u64		start_time;
+};
+
+struct time_stats_quantiles {
+	struct time_stats	stats;
+	struct quantiles	quantiles;
+};
+
+static inline struct quantiles *time_stats_to_quantiles(struct time_stats *stats)
+{
+	return stats->have_quantiles
+		? &container_of(stats, struct time_stats_quantiles, stats)->quantiles
+		: NULL;
+}
+
+void __time_stats_clear_buffer(struct time_stats *, struct time_stat_buffer *);
+void __time_stats_update(struct time_stats *stats, u64, u64);
+
+/**
+ * time_stats_update - collect a new event being tracked
+ *
+ * @stats	- time_stats to update
+ * @start	- start time of event, recorded with local_clock()
+ *
+ * The end duration of the event will be the current time
+ */
+static inline void time_stats_update(struct time_stats *stats, u64 start)
+{
+	__time_stats_update(stats, start, local_clock());
+}
+
+/**
+ * track_event_change - track state change events
+ *
+ * @stats	- time_stats to update
+ * @v		- new state, true or false
+ *
+ * Use this when tracking time stats for state changes, i.e. resource X becoming
+ * blocked/unblocked.
+ */
+static inline bool track_event_change(struct time_stats *stats, bool v)
+{
+	if (v != !!stats->last_event_start) {
+		if (!v) {
+			time_stats_update(stats, stats->last_event_start);
+			stats->last_event_start = 0;
+		} else {
+			stats->last_event_start = local_clock() ?: 1;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+#define TIME_STATS_PRINT_NO_ZEROES	(1U << 0)	/* print nothing if zero count */
+struct seq_buf;
+void time_stats_to_seq_buf(struct seq_buf *, struct time_stats *,
+		const char *epoch_name, unsigned int flags);
+void time_stats_to_json(struct seq_buf *, struct time_stats *,
+		const char *epoch_name, unsigned int flags);
+
+void time_stats_exit(struct time_stats *);
+void time_stats_init(struct time_stats *);
+
+static inline void time_stats_quantiles_exit(struct time_stats_quantiles *statq)
+{
+	time_stats_exit(&statq->stats);
+}
+static inline void time_stats_quantiles_init(struct time_stats_quantiles *statq)
+{
+	time_stats_init(&statq->stats);
+	statq->stats.have_quantiles = true;
+	memset(&statq->quantiles, 0, sizeof(statq->quantiles));
+}
+
+#endif /* _LINUX_TIME_STATS_H */
diff --git a/include/linux/types.h b/include/linux/types.h
index ce454e26..6ae97c42 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -8,6 +8,7 @@
 #include <fcntl.h>
 #include <sys/stat.h>
 #include <sys/types.h>
+#include <linux/posix_types.h>
 
 #define __SANE_USERSPACE_TYPES__	/* For PPC64, to get LL64 types */
 #include <asm/types.h>
@@ -77,6 +78,10 @@ typedef __u64 __bitwise __be64;
 
 typedef u64 sector_t;
 
+typedef void (*swap_r_func_t)(void *a, void *b, int size, const void *priv);
+typedef void (*swap_func_t)(void *a, void *b, int size);
+
+typedef int (*cmp_r_func_t)(const void *a, const void *b, const void *priv);
 typedef int (*cmp_func_t)(const void *a, const void *b);
 
 typedef unsigned int __bitwise slab_flags_t;
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index 633d3223..ca58193d 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -236,8 +236,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
 		if (cl)
 			closure_wait(&c->open_buckets_wait, cl);
 
-		track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
-				   &c->blocked_allocate_open_bucket, true);
+		track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
 		spin_unlock(&c->freelist_lock);
 		return ERR_PTR(-BCH_ERR_open_buckets_empty);
 	}
@@ -263,11 +262,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
 	ca->nr_open_buckets++;
 	bch2_open_bucket_hash_add(c, ob);
 
-	track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
-			   &c->blocked_allocate_open_bucket, false);
-
-	track_event_change(&c->times[BCH_TIME_blocked_allocate],
-			   &c->blocked_allocate, false);
+	track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
+	track_event_change(&c->times[BCH_TIME_blocked_allocate], false);
 
 	spin_unlock(&c->freelist_lock);
 	return ob;
@@ -555,8 +551,7 @@ again:
 			goto again;
 		}
 
-		track_event_change(&c->times[BCH_TIME_blocked_allocate],
-				   &c->blocked_allocate, true);
+		track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
 
 		ob = ERR_PTR(-BCH_ERR_freelist_empty);
 		goto err;
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index b80c6c9e..70369495 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -200,6 +200,8 @@
 #include <linux/seqlock.h>
 #include <linux/shrinker.h>
 #include <linux/srcu.h>
+#include <linux/thread_with_file_types.h>
+#include <linux/time_stats.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
 #include <linux/zstd.h>
@@ -465,7 +467,6 @@ enum bch_time_stats {
 #include "replicas_types.h"
 #include "subvolume_types.h"
 #include "super_types.h"
-#include "thread_with_file_types.h"
 
 /* Number of nodes btree coalesce will try to coalesce at once */
 #define GC_MERGE_NODES		4U
@@ -593,7 +594,7 @@ struct bch_dev {
 
 	/* The rest of this all shows up in sysfs */
 	atomic64_t		cur_latency[2];
-	struct bch2_time_stats	io_latency[2];
+	struct time_stats_quantiles	io_latency[2];
 
 #define CONGESTED_MAX		1024
 	atomic_t		congested;
@@ -640,8 +641,8 @@ struct btree_debug {
 #define BCH_TRANSACTIONS_NR 128
 
 struct btree_transaction_stats {
-	struct bch2_time_stats	duration;
-	struct bch2_time_stats	lock_hold_times;
+	struct time_stats	duration;
+	struct time_stats	lock_hold_times;
 	struct mutex		lock;
 	unsigned		nr_max_paths;
 	unsigned		journal_entries_size;
@@ -919,8 +920,6 @@ struct bch_fs {
 	/* ALLOCATOR */
 	spinlock_t		freelist_lock;
 	struct closure_waitlist	freelist_wait;
-	u64			blocked_allocate;
-	u64			blocked_allocate_open_bucket;
 
 	open_bucket_idx_t	open_buckets_freelist;
 	open_bucket_idx_t	open_buckets_nr_free;
@@ -1104,7 +1103,7 @@ struct bch_fs {
 	unsigned		copy_gc_enabled:1;
 	bool			promote_whole_extents;
 
-	struct bch2_time_stats	times[BCH_TIME_STAT_NR];
+	struct time_stats	times[BCH_TIME_STAT_NR];
 
 	struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
 
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 0668b682..14f61361 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -1275,7 +1275,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
 	x(dev_usage,		8)		\
 	x(log,			9)		\
 	x(overwrite,		10)		\
-	x(write_buffer_keys,	11)
+	x(write_buffer_keys,	11)		\
+	x(datetime,		12)
 
 enum {
 #define x(f, nr)	BCH_JSET_ENTRY_##f	= nr,
@@ -1376,6 +1377,11 @@ struct jset_entry_log {
 	u8			d[];
 } __packed __aligned(8);
 
+struct jset_entry_datetime {
+	struct jset_entry	entry;
+	__le64			seconds;
+} __packed __aligned(8);
+
 /*
  * On disk format for a journal entry:
  * seq is monotonically increasing; every journal entry has its own unique
diff --git a/libbcachefs/bcachefs_ioctl.h b/libbcachefs/bcachefs_ioctl.h
index cbad7a9e..4b8fba75 100644
--- a/libbcachefs/bcachefs_ioctl.h
+++ b/libbcachefs/bcachefs_ioctl.h
@@ -379,7 +379,7 @@ struct bch_ioctl_disk_resize_journal {
 
 struct bch_ioctl_subvolume {
 	__u32			flags;
-	__s32			dirfd;
+	__u32			dirfd;
 	__u16			mode;
 	__u16			pad[3];
 	__u64			dst_ptr;
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index 3fd1085b..1d77aa55 100644
--- a/libbcachefs/bset.c
+++ b/libbcachefs/bset.c
@@ -9,12 +9,12 @@
 #include "bcachefs.h"
 #include "btree_cache.h"
 #include "bset.h"
-#include "eytzinger.h"
 #include "trace.h"
 #include "util.h"
 
 #include <asm/unaligned.h>
 #include <linux/console.h>
+#include <linux/eytzinger.h>
 #include <linux/random.h>
 #include <linux/prefetch.h>
 
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index d7c81bea..9b7ea122 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -60,7 +60,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
 
 	clear_btree_node_just_written(b);
 
-	kvpfree(b->data, btree_buf_bytes(b));
+	kvfree(b->data);
 	b->data = NULL;
 #ifdef __KERNEL__
 	kvfree(b->aux_data);
@@ -94,7 +94,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
 {
 	BUG_ON(b->data || b->aux_data);
 
-	b->data = kvpmalloc(btree_buf_bytes(b), gfp);
+	b->data = kvmalloc(btree_buf_bytes(b), gfp);
 	if (!b->data)
 		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
 #ifdef __KERNEL__
@@ -107,7 +107,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
 		b->aux_data = NULL;
 #endif
 	if (!b->aux_data) {
-		kvpfree(b->data, btree_buf_bytes(b));
+		kvfree(b->data);
 		b->data = NULL;
 		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
 	}
@@ -408,7 +408,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
 	if (c->verify_data)
 		list_move(&c->verify_data->list, &bc->live);
 
-	kvpfree(c->verify_ondisk, c->opts.btree_node_size);
+	kvfree(c->verify_ondisk);
 
 	for (i = 0; i < btree_id_nr_alive(c); i++) {
 		struct btree_root *r = bch2_btree_id_root(c, i);
@@ -648,7 +648,7 @@ out:
 	bch2_btree_keys_init(b);
 	set_btree_node_accessed(b);
 
-	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
+	time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
 			       start_time);
 
 	memalloc_nofs_restore(flags);
@@ -711,6 +711,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
 	b = bch2_btree_node_mem_alloc(trans, level != 0);
 
 	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
+		if (!path)
+			return b;
+
 		trans->memory_allocation_failure = true;
 		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
 		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
@@ -760,8 +763,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
 	}
 
 	if (!six_relock_type(&b->c.lock, lock_type, seq)) {
-		if (path)
-			trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
+		BUG_ON(!path);
+
+		trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
 		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
 	}
 
@@ -1096,7 +1100,7 @@ int bch2_btree_node_prefetch(struct btree_trans *trans,
 	struct btree_cache *bc = &c->btree_cache;
 	struct btree *b;
 
-	BUG_ON(trans && !btree_node_locked(path, level + 1));
+	BUG_ON(path && !btree_node_locked(path, level + 1));
 	BUG_ON(level >= BTREE_MAX_DEPTH);
 
 	b = btree_cache_find(bc, k);
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 11029956..eb92526b 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -389,7 +389,8 @@ again:
 	have_child = dropped_children = false;
 	bch2_bkey_buf_init(&prev_k);
 	bch2_bkey_buf_init(&cur_k);
-	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+	iter.prefetch = true;
 
 	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
 		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
@@ -478,7 +479,8 @@ again:
 		goto err;
 
 	bch2_btree_and_journal_iter_exit(&iter);
-	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+	iter.prefetch = true;
 
 	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
 		bch2_bkey_buf_reassemble(&cur_k, c, k);
@@ -931,7 +933,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
 	struct printbuf buf = PRINTBUF;
 	int ret = 0;
 
-	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
 	bch2_bkey_buf_init(&prev);
 	bch2_bkey_buf_init(&cur);
 	bkey_init(&prev.k->k);
@@ -963,7 +965,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
 
 	if (b->c.level > target_depth) {
 		bch2_btree_and_journal_iter_exit(&iter);
-		bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+		bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+		iter.prefetch = true;
 
 		while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
 			struct btree *child;
@@ -1190,9 +1193,7 @@ static void bch2_gc_free(struct bch_fs *c)
 	genradix_free(&c->gc_stripes);
 
 	for_each_member_device(c, ca) {
-		kvpfree(rcu_dereference_protected(ca->buckets_gc, 1),
-			sizeof(struct bucket_array) +
-			ca->mi.nbuckets * sizeof(struct bucket));
+		kvfree(rcu_dereference_protected(ca->buckets_gc, 1));
 		ca->buckets_gc = NULL;
 
 		free_percpu(ca->usage_gc);
@@ -1491,7 +1492,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
 static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
 {
 	for_each_member_device(c, ca) {
-		struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
+		struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) +
 				ca->mi.nbuckets * sizeof(struct bucket),
 				GFP_KERNEL|__GFP_ZERO);
 		if (!buckets) {
@@ -1970,7 +1971,7 @@ int bch2_gc_gens(struct bch_fs *c)
 
 	c->gc_count++;
 
-	bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
+	time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
 	trace_and_count(c, gc_gens_end, c);
 err:
 	for_each_member_device(c, ca) {
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index aa9b6cbe..61b60938 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -103,7 +103,7 @@ static void btree_bounce_free(struct bch_fs *c, size_t size,
 	if (used_mempool)
 		mempool_free(p, &c->btree_bounce_pool);
 	else
-		vpfree(p, size);
+		kvfree(p);
 }
 
 static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
@@ -115,7 +115,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
 	BUG_ON(size > c->opts.btree_node_size);
 
 	*used_mempool = false;
-	p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
+	p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
 	if (!p) {
 		*used_mempool = true;
 		p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
@@ -327,7 +327,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
 	BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes);
 
 	if (sorting_entire_node)
-		bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
+		time_stats_update(&c->times[BCH_TIME_btree_node_sort],
 				       start_time);
 
 	/* Make sure we preserve bset journal_seq: */
@@ -397,7 +397,7 @@ void bch2_btree_sort_into(struct bch_fs *c,
 			&dst->format,
 			true);
 
-	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
+	time_stats_update(&c->times[BCH_TIME_btree_node_sort],
 			       start_time);
 
 	set_btree_bset_end(dst, dst->set);
@@ -1251,7 +1251,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 out:
 	mempool_free(iter, &c->fill_iter);
 	printbuf_exit(&buf);
-	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
+	time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
 	return retry_read;
 fsck_err:
 	if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
@@ -1323,7 +1323,7 @@ start:
 		}
 	}
 
-	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
+	time_stats_update(&c->times[BCH_TIME_btree_node_read],
 			       rb->start_time);
 	bio_put(&rb->bio);
 
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 5467a863..3aac6ed5 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -891,7 +891,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
 	struct bkey_s_c k;
 	int ret = 0;
 
-	__bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
+	__bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);
 
 	k = bch2_btree_and_journal_iter_peek(&jiter);
 
@@ -1146,7 +1146,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
 	path = &trans->paths[path_idx];
 
 	if (unlikely(path->level >= BTREE_MAX_DEPTH))
-		goto out;
+		goto out_uptodate;
 
 	path->level = btree_path_up_until_good_node(trans, path, 0);
 
@@ -1179,7 +1179,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
 			goto out;
 		}
 	}
-
+out_uptodate:
 	path->uptodate = BTREE_ITER_UPTODATE;
 out:
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
@@ -2899,7 +2899,7 @@ u32 bch2_trans_begin(struct btree_trans *trans)
 
 	if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) &&
 	    time_after64(now, trans->last_begin_time + 10))
-		__bch2_time_stats_update(&btree_trans_stats(trans)->duration,
+		__time_stats_update(&btree_trans_stats(trans)->duration,
 					 trans->last_begin_time, now);
 
 	if (!trans->restarted &&
@@ -3224,7 +3224,7 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
 	     s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
 	     s++) {
 		kfree(s->max_paths_text);
-		bch2_time_stats_exit(&s->lock_hold_times);
+		time_stats_exit(&s->lock_hold_times);
 	}
 
 	if (c->btree_trans_barrier_initialized)
@@ -3240,8 +3240,8 @@ void bch2_fs_btree_iter_init_early(struct bch_fs *c)
 	for (s = c->btree_transaction_stats;
 	     s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
 	     s++) {
-		bch2_time_stats_init(&s->duration);
-		bch2_time_stats_init(&s->lock_hold_times);
+		time_stats_init(&s->duration);
+		time_stats_init(&s->lock_hold_times);
 		mutex_init(&s->lock);
 	}
 
diff --git a/libbcachefs/btree_journal_iter.c b/libbcachefs/btree_journal_iter.c
index 719a94a8..3da65562 100644
--- a/libbcachefs/btree_journal_iter.c
+++ b/libbcachefs/btree_journal_iter.c
@@ -1,7 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "bkey_buf.h"
 #include "bset.h"
+#include "btree_cache.h"
 #include "btree_journal_iter.h"
 #include "journal_io.h"
 
@@ -334,9 +336,38 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
 		iter->pos = bpos_successor(iter->pos);
 }
 
+static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter)
+{
+	struct btree_and_journal_iter iter = *_iter;
+	struct bch_fs *c = iter.trans->c;
+	unsigned level = iter.journal.level;
+	struct bkey_buf tmp;
+	unsigned nr = test_bit(BCH_FS_started, &c->flags)
+		? (level > 1 ? 0 :  2)
+		: (level > 1 ? 1 : 16);
+
+	iter.prefetch = false;
+	bch2_bkey_buf_init(&tmp);
+
+	while (nr--) {
+		bch2_btree_and_journal_iter_advance(&iter);
+		struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
+		if (!k.k)
+			break;
+
+		bch2_bkey_buf_reassemble(&tmp, c, k);
+		bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1);
+	}
+
+	bch2_bkey_buf_exit(&tmp, c);
+}
+
 struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
 {
 	struct bkey_s_c btree_k, journal_k, ret;
+
+	if (iter->prefetch && iter->journal.level)
+		btree_and_journal_iter_prefetch(iter);
 again:
 	if (iter->at_end)
 		return bkey_s_c_null;
@@ -376,17 +407,18 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
 	bch2_journal_iter_exit(&iter->journal);
 }
 
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
-						  struct bch_fs *c,
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
+						  struct btree_and_journal_iter *iter,
 						  struct btree *b,
 						  struct btree_node_iter node_iter,
 						  struct bpos pos)
 {
 	memset(iter, 0, sizeof(*iter));
 
+	iter->trans = trans;
 	iter->b = b;
 	iter->node_iter = node_iter;
-	bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
+	bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
 	INIT_LIST_HEAD(&iter->journal.list);
 	iter->pos = b->data->min_key;
 	iter->at_end = false;
@@ -396,15 +428,15 @@ void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter
  * this version is used by btree_gc before filesystem has gone RW and
  * multithreaded, so uses the journal_iters list:
  */
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
-						struct bch_fs *c,
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
+						struct btree_and_journal_iter *iter,
 						struct btree *b)
 {
 	struct btree_node_iter node_iter;
 
 	bch2_btree_node_iter_init_from_start(&node_iter, b);
-	__bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
-	list_add(&iter->journal.list, &c->journal_iters);
+	__bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
+	list_add(&iter->journal.list, &trans->c->journal_iters);
 }
 
 /* sort and dedup all keys in the journal: */
@@ -415,9 +447,7 @@ void bch2_journal_entries_free(struct bch_fs *c)
 	struct genradix_iter iter;
 
 	genradix_for_each(&c->journal_entries, iter, i)
-		if (*i)
-			kvpfree(*i, offsetof(struct journal_replay, j) +
-				vstruct_bytes(&(*i)->j));
+		kvfree(*i);
 	genradix_free(&c->journal_entries);
 }
 
diff --git a/libbcachefs/btree_journal_iter.h b/libbcachefs/btree_journal_iter.h
index 8ca4c100..c9d19da3 100644
--- a/libbcachefs/btree_journal_iter.h
+++ b/libbcachefs/btree_journal_iter.h
@@ -15,6 +15,7 @@ struct journal_iter {
  */
 
 struct btree_and_journal_iter {
+	struct btree_trans	*trans;
 	struct btree		*b;
 	struct btree_node_iter	node_iter;
 	struct bkey		unpacked;
@@ -22,6 +23,7 @@ struct btree_and_journal_iter {
 	struct journal_iter	journal;
 	struct bpos		pos;
 	bool			at_end;
+	bool			prefetch;
 };
 
 struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
@@ -29,6 +31,9 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
 struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
 					   unsigned, struct bpos);
 
+int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *,
+					 struct btree_and_journal_iter *);
+
 int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
 				 unsigned, struct bkey_i *);
 int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
@@ -42,12 +47,11 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
 struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
 
 void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
-				struct bch_fs *, struct btree *,
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
+				struct btree_and_journal_iter *, struct btree *,
 				struct btree_node_iter, struct bpos);
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
-						struct bch_fs *,
-						struct btree *);
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
+				struct btree_and_journal_iter *, struct btree *);
 
 void bch2_journal_keys_put(struct bch_fs *);
 
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index 4bd72c85..f2e2c588 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -122,7 +122,7 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
 					      struct btree_path *path, unsigned level)
 {
 #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
-	__bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times,
+	__time_stats_update(&btree_trans_stats(trans)->lock_hold_times,
 				 path->l[level].lock_taken_time,
 				 local_clock());
 #endif
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 4a5a6449..0d5eecbd 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -2,12 +2,12 @@
 #ifndef _BCACHEFS_BTREE_TYPES_H
 #define _BCACHEFS_BTREE_TYPES_H
 
+#include <linux/darray_types.h>
 #include <linux/list.h>
 #include <linux/rhashtable.h>
 
 #include "btree_key_cache_types.h"
 #include "buckets_types.h"
-#include "darray.h"
 #include "errcode.h"
 #include "journal_types.h"
 #include "replicas_types.h"
diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c
index c3ff365a..e5193116 100644
--- a/libbcachefs/btree_update.c
+++ b/libbcachefs/btree_update.c
@@ -14,6 +14,8 @@
 #include "snapshot.h"
 #include "trace.h"
 
+#include <linux/darray.h>
+
 static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
 					 const struct btree_insert_entry *r)
 {
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 17a5938a..030291cc 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -516,7 +516,7 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *
 	bch2_disk_reservation_put(c, &as->disk_res);
 	bch2_btree_reserve_put(as, trans);
 
-	bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
+	time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
 			       as->start_time);
 
 	mutex_lock(&c->btree_interior_update_lock);
@@ -1038,7 +1038,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
 	continue_at(&as->cl, btree_update_set_nodes_written,
 		    as->c->btree_interior_update_worker);
 
-	bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
+	time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
 			       start_time);
 }
 
@@ -1629,7 +1629,7 @@ out:
 
 	bch2_trans_verify_locks(trans);
 
-	bch2_time_stats_update(&c->times[n2
+	time_stats_update(&c->times[n2
 			       ? BCH_TIME_btree_node_split
 			       : BCH_TIME_btree_node_compact],
 			       start_time);
@@ -1935,7 +1935,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
 
 	bch2_btree_update_done(as, trans);
 
-	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
+	time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
 out:
 err:
 	if (new_path)
@@ -2484,7 +2484,7 @@ void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
 int bch2_fs_btree_interior_update_init(struct bch_fs *c)
 {
 	c->btree_interior_update_worker =
-		alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
+		alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
 	if (!c->btree_interior_update_worker)
 		return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
 
diff --git a/libbcachefs/btree_write_buffer_types.h b/libbcachefs/btree_write_buffer_types.h
index 9b9433de..5f248873 100644
--- a/libbcachefs/btree_write_buffer_types.h
+++ b/libbcachefs/btree_write_buffer_types.h
@@ -2,7 +2,7 @@
 #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
 #define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
 
-#include "darray.h"
+#include <linux/darray_types.h>
 #include "journal_types.h"
 
 #define BTREE_WRITE_BUFERED_VAL_U64s_MAX	4
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 54f7826a..7dca10ba 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -1335,7 +1335,7 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu)
 	struct bucket_gens *buckets =
 		container_of(rcu, struct bucket_gens, rcu);
 
-	kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets);
+	kvfree(buckets);
 }
 
 int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
@@ -1345,16 +1345,16 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 	bool resize = ca->bucket_gens != NULL;
 	int ret;
 
-	if (!(bucket_gens	= kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
-					    GFP_KERNEL|__GFP_ZERO))) {
+	if (!(bucket_gens	= kvmalloc(sizeof(struct bucket_gens) + nbuckets,
+					   GFP_KERNEL|__GFP_ZERO))) {
 		ret = -BCH_ERR_ENOMEM_bucket_gens;
 		goto err;
 	}
 
 	if ((c->opts.buckets_nouse &&
-	     !(buckets_nouse	= kvpmalloc(BITS_TO_LONGS(nbuckets) *
-					    sizeof(unsigned long),
-					    GFP_KERNEL|__GFP_ZERO)))) {
+	     !(buckets_nouse	= kvmalloc(BITS_TO_LONGS(nbuckets) *
+					   sizeof(unsigned long),
+					   GFP_KERNEL|__GFP_ZERO)))) {
 		ret = -BCH_ERR_ENOMEM_buckets_nouse;
 		goto err;
 	}
@@ -1397,8 +1397,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 
 	ret = 0;
 err:
-	kvpfree(buckets_nouse,
-		BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
+	kvfree(buckets_nouse);
 	if (bucket_gens)
 		call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
 
@@ -1407,27 +1406,21 @@ err:
 
 void bch2_dev_buckets_free(struct bch_dev *ca)
 {
-	unsigned i;
+	kvfree(ca->buckets_nouse);
+	kvfree(rcu_dereference_protected(ca->bucket_gens, 1));
 
-	kvpfree(ca->buckets_nouse,
-		BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
-	kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
-		sizeof(struct bucket_gens) + ca->mi.nbuckets);
-
-	for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+	for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++)
 		free_percpu(ca->usage[i]);
 	kfree(ca->usage_base);
 }
 
 int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
 {
-	unsigned i;
-
 	ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
 	if (!ca->usage_base)
 		return -BCH_ERR_ENOMEM_usage_init;
 
-	for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
+	for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) {
 		ca->usage[i] = alloc_percpu(struct bch_dev_usage);
 		if (!ca->usage[i])
 			return -BCH_ERR_ENOMEM_usage_init;
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index 226b39c1..4cbda66b 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -11,7 +11,6 @@
 #include "replicas.h"
 #include "super.h"
 #include "super-io.h"
-#include "thread_with_file.h"
 
 #include <linux/cdev.h>
 #include <linux/device.h>
@@ -20,6 +19,7 @@
 #include <linux/major.h>
 #include <linux/sched/task.h>
 #include <linux/slab.h>
+#include <linux/thread_with_file.h>
 #include <linux/uaccess.h>
 
 __must_check
@@ -155,17 +155,14 @@ static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
 	kfree(thr);
 }
 
-static int bch2_fsck_offline_thread_fn(void *arg)
+static void bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
 {
-	struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr);
+	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
 	struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts);
 
 	thr->thr.thr.ret = PTR_ERR_OR_ZERO(c);
 	if (!thr->thr.thr.ret)
 		bch2_fs_stop(c);
-
-	thread_with_stdio_done(&thr->thr);
-	return 0;
 }
 
 static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
@@ -220,7 +217,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
 
 	opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
 
-	ret = bch2_run_thread_with_stdio(&thr->thr,
+	ret = run_thread_with_stdio(&thr->thr,
 			bch2_fsck_thread_exit,
 			bch2_fsck_offline_thread_fn);
 err:
@@ -425,7 +422,7 @@ static int bch2_data_job_release(struct inode *inode, struct file *file)
 {
 	struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
 
-	bch2_thread_with_file_exit(&ctx->thr);
+	thread_with_file_exit(&ctx->thr);
 	kfree(ctx);
 	return 0;
 }
@@ -475,7 +472,7 @@ static long bch2_ioctl_data(struct bch_fs *c,
 	ctx->c = c;
 	ctx->arg = arg;
 
-	ret = bch2_run_thread_with_file(&ctx->thr,
+	ret = run_thread_with_file(&ctx->thr,
 			&bcachefs_data_ops,
 			bch2_data_thread);
 	if (ret < 0)
@@ -763,9 +760,9 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
 	return ret;
 }
 
-static int bch2_fsck_online_thread_fn(void *arg)
+static void bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
 {
-	struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr);
+	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
 	struct bch_fs *c = thr->c;
 
 	c->stdio_filter = current;
@@ -793,11 +790,8 @@ static int bch2_fsck_online_thread_fn(void *arg)
 	c->stdio_filter = NULL;
 	c->opts.fix_errors = old_fix_errors;
 
-	thread_with_stdio_done(&thr->thr);
-
 	up(&c->online_fsck_mutex);
 	bch2_ro_ref_put(c);
-	return 0;
 }
 
 static long bch2_ioctl_fsck_online(struct bch_fs *c,
@@ -840,7 +834,7 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c,
 			goto err;
 	}
 
-	ret = bch2_run_thread_with_stdio(&thr->thr,
+	ret = run_thread_with_stdio(&thr->thr,
 			bch2_fsck_thread_exit,
 			bch2_fsck_online_thread_fn);
 err:
diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c
index 33df8cf8..1410365a 100644
--- a/libbcachefs/compress.c
+++ b/libbcachefs/compress.c
@@ -601,13 +601,13 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
 		return 0;
 
 	if (!mempool_initialized(&c->compression_bounce[READ]) &&
-	    mempool_init_kvpmalloc_pool(&c->compression_bounce[READ],
-					1, c->opts.encoded_extent_max))
+	    mempool_init_kvmalloc_pool(&c->compression_bounce[READ],
+				       1, c->opts.encoded_extent_max))
 		return -BCH_ERR_ENOMEM_compression_bounce_read_init;
 
 	if (!mempool_initialized(&c->compression_bounce[WRITE]) &&
-	    mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE],
-					1, c->opts.encoded_extent_max))
+	    mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE],
+				       1, c->opts.encoded_extent_max))
 		return -BCH_ERR_ENOMEM_compression_bounce_write_init;
 
 	for (i = compression_types;
@@ -622,15 +622,15 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
 		if (mempool_initialized(&c->compress_workspace[i->type]))
 			continue;
 
-		if (mempool_init_kvpmalloc_pool(
+		if (mempool_init_kvmalloc_pool(
 				&c->compress_workspace[i->type],
 				1, i->compress_workspace))
 			return -BCH_ERR_ENOMEM_compression_workspace_init;
 	}
 
 	if (!mempool_initialized(&c->decompress_workspace) &&
-	    mempool_init_kvpmalloc_pool(&c->decompress_workspace,
-					1, decompress_workspace_size))
+	    mempool_init_kvmalloc_pool(&c->decompress_workspace,
+				       1, decompress_workspace_size))
 		return -BCH_ERR_ENOMEM_decompression_workspace_init;
 
 	return 0;
diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c
index 7bdba850..b1f147e6 100644
--- a/libbcachefs/debug.c
+++ b/libbcachefs/debug.c
@@ -137,7 +137,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
 	mutex_lock(&c->verify_lock);
 
 	if (!c->verify_ondisk) {
-		c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
+		c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
 		if (!c->verify_ondisk)
 			goto out;
 	}
@@ -199,7 +199,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
 		return;
 	}
 
-	n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
+	n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
 	if (!n_ondisk) {
 		prt_printf(out, "memory allocation failure\n");
 		goto out;
@@ -293,7 +293,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
 out:
 	if (bio)
 		bio_put(bio);
-	kvpfree(n_ondisk, btree_buf_bytes(b));
+	kvfree(n_ondisk);
 	percpu_ref_put(&ca->io_ref);
 }
 
diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c
index ae29ad0c..97773cff 100644
--- a/libbcachefs/dirent.c
+++ b/libbcachefs/dirent.c
@@ -219,10 +219,10 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
 	dirent->k.p.inode	= dir;
 	dirent->k.p.snapshot	= snapshot;
 
-	ret = bch2_hash_set_snapshot(trans, bch2_dirent_hash_desc, hash_info,
-				     zero_inum, snapshot,
-				     &dirent->k_i, str_hash_flags,
-				     BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+	ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
+					zero_inum, snapshot,
+					&dirent->k_i, str_hash_flags,
+					BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
 	*dir_offset = dirent->k.p.offset;
 
 	return ret;
@@ -293,12 +293,10 @@ int bch2_dirent_rename(struct btree_trans *trans,
 	struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
 	struct bpos dst_pos =
 		POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
-	unsigned src_type = 0, dst_type = 0, src_update_flags = 0;
+	unsigned src_update_flags = 0;
+	bool delete_src, delete_dst;
 	int ret = 0;
 
-	if (src_dir.subvol != dst_dir.subvol)
-		return -EXDEV;
-
 	memset(src_inum, 0, sizeof(*src_inum));
 	memset(dst_inum, 0, sizeof(*dst_inum));
 
@@ -319,12 +317,6 @@ int bch2_dirent_rename(struct btree_trans *trans,
 	if (ret)
 		goto out;
 
-	src_type = bkey_s_c_to_dirent(old_src).v->d_type;
-
-	if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE)
-		return -EOPNOTSUPP;
-
-
 	/* Lookup dst: */
 	if (mode == BCH_RENAME) {
 		/*
@@ -352,11 +344,6 @@ int bch2_dirent_rename(struct btree_trans *trans,
 				bkey_s_c_to_dirent(old_dst), dst_inum);
 		if (ret)
 			goto out;
-
-		dst_type = bkey_s_c_to_dirent(old_dst).v->d_type;
-
-		if (dst_type == DT_SUBVOL)
-			return -EOPNOTSUPP;
 	}
 
 	if (mode != BCH_RENAME_EXCHANGE)
@@ -426,28 +413,55 @@ int bch2_dirent_rename(struct btree_trans *trans,
 		}
 	}
 
+	if (new_dst->v.d_type == DT_SUBVOL)
+		new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol);
+
+	if ((mode == BCH_RENAME_EXCHANGE) &&
+	    new_src->v.d_type == DT_SUBVOL)
+		new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol);
+
 	ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
 	if (ret)
 		goto out;
 out_set_src:
-
 	/*
-	 * If we're deleting a subvolume, we need to really delete the dirent,
-	 * not just emit a whiteout in the current snapshot:
+	 * If we're deleting a subvolume we need to really delete the dirent,
+	 * not just emit a whiteout in the current snapshot - there can only be
+	 * single dirent that points to a given subvolume.
+	 *
+	 * IOW, we don't maintain multiple versions in different snapshots of
+	 * dirents that point to subvolumes - dirents that point to subvolumes
+	 * are only visible in one particular subvolume so it's not necessary,
+	 * and it would be particularly confusing for fsck to have to deal with.
 	 */
-	if (src_type == DT_SUBVOL) {
-		bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
-		ret = bch2_btree_iter_traverse(&src_iter);
+	delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL &&
+		new_src->k.p.snapshot != old_src.k->p.snapshot;
+
+	delete_dst = old_dst.k &&
+		bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL &&
+		new_dst->k.p.snapshot != old_dst.k->p.snapshot;
+
+	if (!delete_src || !bkey_deleted(&new_src->k)) {
+		ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
 		if (ret)
 			goto out;
-
-		new_src->k.p = src_iter.pos;
-		src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE;
 	}
 
-	ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
-	if (ret)
-		goto out;
+	if (delete_src) {
+		bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
+		ret =   bch2_btree_iter_traverse(&src_iter) ?:
+			bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+		if (ret)
+			goto out;
+	}
+
+	if (delete_dst) {
+		bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot);
+		ret =   bch2_btree_iter_traverse(&dst_iter) ?:
+			bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+		if (ret)
+			goto out;
+	}
 
 	if (mode == BCH_RENAME_EXCHANGE)
 		*src_offset = new_src->k.p.offset;
@@ -458,41 +472,29 @@ out:
 	return ret;
 }
 
-int __bch2_dirent_lookup_trans(struct btree_trans *trans,
-			       struct btree_iter *iter,
-			       subvol_inum dir,
-			       const struct bch_hash_info *hash_info,
-			       const struct qstr *name, subvol_inum *inum,
-			       unsigned flags)
+int bch2_dirent_lookup_trans(struct btree_trans *trans,
+			     struct btree_iter *iter,
+			     subvol_inum dir,
+			     const struct bch_hash_info *hash_info,
+			     const struct qstr *name, subvol_inum *inum,
+			     unsigned flags)
 {
-	struct bkey_s_c k;
-	struct bkey_s_c_dirent d;
-	u32 snapshot;
-	int ret;
-
-	ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
+	int ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
+				   hash_info, dir, name, flags);
 	if (ret)
 		return ret;
 
-	ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
-			       hash_info, dir, name, flags);
-	if (ret)
-		return ret;
-
-	k = bch2_btree_iter_peek_slot(iter);
+	struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
 	ret = bkey_err(k);
 	if (ret)
 		goto err;
 
-	d = bkey_s_c_to_dirent(k);
-
-	ret = bch2_dirent_read_target(trans, dir, d, inum);
+	ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum);
 	if (ret > 0)
 		ret = -ENOENT;
 err:
 	if (ret)
 		bch2_trans_iter_exit(trans, iter);
-
 	return ret;
 }
 
@@ -504,7 +506,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
 	struct btree_iter iter = { NULL };
 
 	int ret = lockrestart_do(trans,
-		__bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
+		bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
 	bch2_trans_iter_exit(trans, &iter);
 	bch2_trans_put(trans);
 	return ret;
diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h
index 21ffeb78..f1dd7208 100644
--- a/libbcachefs/dirent.h
+++ b/libbcachefs/dirent.h
@@ -62,7 +62,7 @@ int bch2_dirent_rename(struct btree_trans *,
 		       const struct qstr *, subvol_inum *, u64 *,
 		       enum bch_rename_mode);
 
-int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
+int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
 			       subvol_inum, const struct bch_hash_info *,
 			       const struct qstr *, subvol_inum *, unsigned);
 u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index d503af27..b98e2c2b 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -504,7 +504,7 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
 		unsigned i;
 
 		for (i = 0; i < s->v.nr_blocks; i++) {
-			kvpfree(buf->data[i], buf->size << 9);
+			kvfree(buf->data[i]);
 			buf->data[i] = NULL;
 		}
 	}
@@ -531,7 +531,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
 	memset(buf->valid, 0xFF, sizeof(buf->valid));
 
 	for (i = 0; i < v->nr_blocks; i++) {
-		buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL);
+		buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL);
 		if (!buf->data[i])
 			goto err;
 	}
diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h
index 8c40c206..3fd33b30 100644
--- a/libbcachefs/errcode.h
+++ b/libbcachefs/errcode.h
@@ -176,6 +176,8 @@
 	x(EINVAL,			invalid)				\
 	x(EINVAL,			internal_fsck_err)			\
 	x(EINVAL,			opt_parse_error)			\
+	x(EINVAL,			remove_with_metadata_missing_unimplemented)\
+	x(EINVAL,			remove_would_lose_data)			\
 	x(EROFS,			erofs_trans_commit)			\
 	x(EROFS,			erofs_no_writes)			\
 	x(EROFS,			erofs_journal_err)			\
diff --git a/libbcachefs/error.c b/libbcachefs/error.c
index d32c8beb..70a12539 100644
--- a/libbcachefs/error.c
+++ b/libbcachefs/error.c
@@ -2,7 +2,7 @@
 #include "bcachefs.h"
 #include "error.h"
 #include "super.h"
-#include "thread_with_file.h"
+#include <linux/thread_with_file.h>
 
 #define FSCK_ERR_RATELIMIT_NR	10
 
@@ -105,7 +105,7 @@ static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c)
 	do {
 		bch2_print(c, " (y,n, or Y,N for all errors of this type) ");
 
-		int r = bch2_stdio_redirect_readline(stdio, buf, sizeof(buf) - 1);
+		int r = stdio_redirect_readline(stdio, buf, sizeof(buf) - 1);
 		if (r < 0)
 			return YN_NO;
 		buf[r] = '\0';
diff --git a/libbcachefs/fifo.h b/libbcachefs/fifo.h
index 66b945be..d8153fe2 100644
--- a/libbcachefs/fifo.h
+++ b/libbcachefs/fifo.h
@@ -24,12 +24,12 @@ struct {								\
 	(fifo)->mask	= (fifo)->size					\
 		? roundup_pow_of_two((fifo)->size) - 1			\
 		: 0;							\
-	(fifo)->data	= kvpmalloc(fifo_buf_size(fifo), (_gfp));	\
+	(fifo)->data	= kvmalloc(fifo_buf_size(fifo), (_gfp));	\
 })
 
 #define free_fifo(fifo)							\
 do {									\
-	kvpfree((fifo)->data, fifo_buf_size(fifo));			\
+	kvfree((fifo)->data);						\
 	(fifo)->data = NULL;						\
 } while (0)
 
diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c
index 1c1ea0f0..523507e3 100644
--- a/libbcachefs/fs-common.c
+++ b/libbcachefs/fs-common.c
@@ -260,8 +260,8 @@ int bch2_unlink_trans(struct btree_trans *trans,
 
 	dir_hash = bch2_hash_info_init(c, dir_u);
 
-	ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
-					 name, &inum, BTREE_ITER_INTENT);
+	ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
+				       name, &inum, BTREE_ITER_INTENT);
 	if (ret)
 		goto err;
 
@@ -410,6 +410,21 @@ int bch2_rename_trans(struct btree_trans *trans,
 			goto err;
 	}
 
+	/* Can't move across subvolumes, unless it's a subvolume root: */
+	if (src_dir.subvol != dst_dir.subvol &&
+	    (!src_inode_u->bi_subvol ||
+	     (dst_inum.inum && !dst_inode_u->bi_subvol))) {
+		ret = -EXDEV;
+		goto err;
+	}
+
+	if (src_inode_u->bi_parent_subvol)
+		src_inode_u->bi_parent_subvol = dst_dir.subvol;
+
+	if ((mode == BCH_RENAME_EXCHANGE) &&
+	    dst_inode_u->bi_parent_subvol)
+		dst_inode_u->bi_parent_subvol = src_dir.subvol;
+
 	src_inode_u->bi_dir		= dst_dir_u->bi_inum;
 	src_inode_u->bi_dir_offset	= dst_offset;
 
diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c
index 3a4c24c2..3dc8630f 100644
--- a/libbcachefs/fs-ioctl.c
+++ b/libbcachefs/fs-ioctl.c
@@ -455,6 +455,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
 	if (IS_ERR(victim))
 		return PTR_ERR(victim);
 
+	dir = d_inode(path.dentry);
 	if (victim->d_sb->s_fs_info != c) {
 		ret = -EXDEV;
 		goto err;
@@ -463,14 +464,13 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
 		ret = -ENOENT;
 		goto err;
 	}
-	dir = d_inode(path.dentry);
 	ret = __bch2_unlink(dir, victim, true);
 	if (!ret) {
 		fsnotify_rmdir(dir, victim);
 		d_delete(victim);
 	}
-	inode_unlock(dir);
 err:
+	inode_unlock(dir);
 	dput(victim);
 	path_put(&path);
 	return ret;
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index ec419b8e..77ea6109 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -176,45 +176,88 @@ static unsigned bch2_inode_hash(subvol_inum inum)
 	return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
 }
 
-struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
+static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
 {
-	struct bch_inode_unpacked inode_u;
-	struct bch_inode_info *inode;
-	struct btree_trans *trans;
-	struct bch_subvolume subvol;
-	int ret;
+	subvol_inum inum = inode_inum(inode);
+	struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v,
+				      bch2_inode_hash(inum),
+				      bch2_iget5_test,
+				      bch2_iget5_set,
+				      &inum));
+	BUG_ON(!old);
 
-	inode = to_bch_ei(iget5_locked(c->vfs_sb,
-				       bch2_inode_hash(inum),
-				       bch2_iget5_test,
-				       bch2_iget5_set,
-				       &inum));
-	if (unlikely(!inode))
-		return ERR_PTR(-ENOMEM);
-	if (!(inode->v.i_state & I_NEW))
-		return &inode->v;
-
-	trans = bch2_trans_get(c);
-	ret = lockrestart_do(trans,
-		bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
-		bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
-
-	if (!ret)
-		bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
-	bch2_trans_put(trans);
-
-	if (ret) {
-		iget_failed(&inode->v);
-		return ERR_PTR(bch2_err_class(ret));
+	if (unlikely(old != inode)) {
+		discard_new_inode(&inode->v);
+		inode = old;
+	} else {
+		mutex_lock(&c->vfs_inodes_lock);
+		list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
+		mutex_unlock(&c->vfs_inodes_lock);
+		/*
+		 * we really don't want insert_inode_locked2() to be setting
+		 * I_NEW...
+		 */
+		unlock_new_inode(&inode->v);
 	}
 
-	mutex_lock(&c->vfs_inodes_lock);
-	list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
-	mutex_unlock(&c->vfs_inodes_lock);
+	return inode;
+}
 
-	unlock_new_inode(&inode->v);
+#define memalloc_flags_do(_flags, _do)						\
+({										\
+	unsigned _saved_flags = memalloc_flags_save(_flags);			\
+	typeof(_do) _ret = _do;							\
+	memalloc_noreclaim_restore(_saved_flags);				\
+	_ret;									\
+})
 
-	return &inode->v;
+/*
+ * Allocate a new inode, dropping/retaking btree locks if necessary:
+ */
+static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
+{
+	struct bch_fs *c = trans->c;
+
+	struct bch_inode_info *inode =
+		memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
+				  to_bch_ei(new_inode(c->vfs_sb)));
+
+	if (unlikely(!inode)) {
+		int ret = drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 0 : -ENOMEM);
+		if (ret && inode)
+			discard_new_inode(&inode->v);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+
+	return inode;
+}
+
+struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
+{
+	struct bch_inode_info *inode =
+		to_bch_ei(ilookup5_nowait(c->vfs_sb,
+					  bch2_inode_hash(inum),
+					  bch2_iget5_test,
+					  &inum));
+	if (inode)
+		return &inode->v;
+
+	struct btree_trans *trans = bch2_trans_get(c);
+
+	struct bch_inode_unpacked inode_u;
+	struct bch_subvolume subvol;
+	int ret = lockrestart_do(trans,
+		bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
+		bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
+		PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
+	if (!ret) {
+		bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
+		inode = bch2_inode_insert(c, inode);
+	}
+	bch2_trans_put(trans);
+
+	return ret ? ERR_PTR(ret) : &inode->v;
 }
 
 struct bch_inode_info *
@@ -226,7 +269,7 @@ __bch2_create(struct mnt_idmap *idmap,
 	struct bch_fs *c = dir->v.i_sb->s_fs_info;
 	struct btree_trans *trans;
 	struct bch_inode_unpacked dir_u;
-	struct bch_inode_info *inode, *old;
+	struct bch_inode_info *inode;
 	struct bch_inode_unpacked inode_u;
 	struct posix_acl *default_acl = NULL, *acl = NULL;
 	subvol_inum inum;
@@ -293,7 +336,6 @@ err_before_quota:
 		mutex_unlock(&dir->ei_update_lock);
 	}
 
-	bch2_iget5_set(&inode->v, &inum);
 	bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
 
 	set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
@@ -304,36 +346,7 @@ err_before_quota:
 	 * bch2_trans_exit() and dropping locks, else we could race with another
 	 * thread pulling the inode in and modifying it:
 	 */
-
-	inode->v.i_state |= I_CREATING;
-
-	old = to_bch_ei(inode_insert5(&inode->v,
-				      bch2_inode_hash(inum),
-				      bch2_iget5_test,
-				      bch2_iget5_set,
-				      &inum));
-	BUG_ON(!old);
-
-	if (unlikely(old != inode)) {
-		/*
-		 * We raced, another process pulled the new inode into cache
-		 * before us:
-		 */
-		make_bad_inode(&inode->v);
-		iput(&inode->v);
-
-		inode = old;
-	} else {
-		mutex_lock(&c->vfs_inodes_lock);
-		list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
-		mutex_unlock(&c->vfs_inodes_lock);
-		/*
-		 * we really don't want insert_inode_locked2() to be setting
-		 * I_NEW...
-		 */
-		unlock_new_inode(&inode->v);
-	}
-
+	inode = bch2_inode_insert(c, inode);
 	bch2_trans_put(trans);
 err:
 	posix_acl_release(default_acl);
@@ -352,23 +365,78 @@ err_trans:
 
 /* methods */
 
+static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
+			subvol_inum dir, struct bch_hash_info *dir_hash_info,
+			const struct qstr *name)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter dirent_iter = {};
+	subvol_inum inum = {};
+
+	int ret = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
+				   dir_hash_info, dir, name, 0);
+	if (ret)
+		return ERR_PTR(ret);
+
+	struct bkey_s_c k = bch2_btree_iter_peek_slot(&dirent_iter);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
+	if (ret > 0)
+		ret = -ENOENT;
+	if (ret)
+		goto err;
+
+	struct bch_inode_info *inode =
+		to_bch_ei(ilookup5_nowait(c->vfs_sb,
+					  bch2_inode_hash(inum),
+					  bch2_iget5_test,
+					  &inum));
+	if (inode)
+		goto out;
+
+	struct bch_subvolume subvol;
+	struct bch_inode_unpacked inode_u;
+	ret =   bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
+		bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
+		PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
+	if (bch2_err_matches(ret, ENOENT)) {
+		struct printbuf buf = PRINTBUF;
+
+		bch2_bkey_val_to_text(&buf, c, k);
+		bch_err(c, "%s points to missing inode", buf.buf);
+		printbuf_exit(&buf);
+	}
+	if (ret)
+		goto err;
+
+	bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
+	inode = bch2_inode_insert(c, inode);
+out:
+	bch2_trans_iter_exit(trans, &dirent_iter);
+	return inode;
+err:
+	inode = ERR_PTR(ret);
+	goto out;
+}
+
 static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
 				  unsigned int flags)
 {
 	struct bch_fs *c = vdir->i_sb->s_fs_info;
 	struct bch_inode_info *dir = to_bch_ei(vdir);
 	struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
-	struct inode *vinode = NULL;
-	subvol_inum inum = { .subvol = 1 };
-	int ret;
 
-	ret = bch2_dirent_lookup(c, inode_inum(dir), &hash,
-				 &dentry->d_name, &inum);
+	struct bch_inode_info *inode;
+	bch2_trans_do(c, NULL, NULL, 0,
+		PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
+							  &hash, &dentry->d_name)));
+	if (IS_ERR(inode))
+		inode = NULL;
 
-	if (!ret)
-		vinode = bch2_vfs_inode_get(c, inum);
-
-	return d_splice_alias(vinode, dentry);
+	return d_splice_alias(&inode->v, dentry);
 }
 
 static int bch2_mknod(struct mnt_idmap *idmap,
@@ -1371,6 +1439,7 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
 				struct bch_inode_unpacked *bi,
 				struct bch_subvolume *subvol)
 {
+	bch2_iget5_set(&inode->v, &inum);
 	bch2_inode_update_after_write(trans, inode, bi, ~0);
 
 	if (BCH_SUBVOLUME_SNAP(subvol))
@@ -1571,7 +1640,6 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
 	 * number:
 	 */
 	u64 avail_inodes = ((usage.capacity - usage.used) << 3);
-	u64 fsid;
 
 	buf->f_type	= BCACHEFS_STATFS_MAGIC;
 	buf->f_bsize	= sb->s_blocksize;
@@ -1582,10 +1650,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_files	= usage.nr_inodes + avail_inodes;
 	buf->f_ffree	= avail_inodes;
 
-	fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
-	       le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
-	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
-	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+	buf->f_fsid	= uuid_to_fsid(c->sb.user_uuid.b);
 	buf->f_namelen	= BCH_NAME_MAX;
 
 	return 0;
@@ -1881,6 +1946,7 @@ got_sb:
 	sb->s_time_gran		= c->sb.nsec_per_time_unit;
 	sb->s_time_min		= div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
 	sb->s_time_max		= div_s64(S64_MAX, c->sb.time_units_per_sec);
+	sb->s_uuid		= c->sb.user_uuid;
 	c->vfs_sb		= sb;
 	strscpy(sb->s_id, c->name, sizeof(sb->s_id));
 
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 7e82c7bc..e4a8a14c 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -5,7 +5,6 @@
 #include "btree_cache.h"
 #include "btree_update.h"
 #include "buckets.h"
-#include "darray.h"
 #include "dirent.h"
 #include "error.h"
 #include "fs-common.h"
@@ -18,6 +17,7 @@
 #include "xattr.h"
 
 #include <linux/bsearch.h>
+#include <linux/darray.h>
 #include <linux/dcache.h> /* struct qstr */
 
 /*
@@ -100,8 +100,8 @@ err:
 }
 
 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
-			  struct bch_inode_unpacked *inode,
-			  u32 *snapshot)
+			struct bch_inode_unpacked *inode,
+			u32 *snapshot)
 {
 	struct btree_iter iter;
 	struct bkey_s_c k;
@@ -123,17 +123,15 @@ err:
 	return ret;
 }
 
-static int __lookup_dirent(struct btree_trans *trans,
+static int lookup_dirent_in_snapshot(struct btree_trans *trans,
 			   struct bch_hash_info hash_info,
 			   subvol_inum dir, struct qstr *name,
-			   u64 *target, unsigned *type)
+			   u64 *target, unsigned *type, u32 snapshot)
 {
 	struct btree_iter iter;
 	struct bkey_s_c_dirent d;
-	int ret;
-
-	ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
-			       &hash_info, dir, name, 0);
+	int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
+			       &hash_info, dir, name, 0, snapshot);
 	if (ret)
 		return ret;
 
@@ -144,34 +142,6 @@ static int __lookup_dirent(struct btree_trans *trans,
 	return 0;
 }
 
-static int __write_inode(struct btree_trans *trans,
-			 struct bch_inode_unpacked *inode,
-			 u32 snapshot)
-{
-	struct bkey_inode_buf *inode_p =
-		bch2_trans_kmalloc(trans, sizeof(*inode_p));
-
-	if (IS_ERR(inode_p))
-		return PTR_ERR(inode_p);
-
-	bch2_inode_pack(inode_p, inode);
-	inode_p->inode.k.p.snapshot = snapshot;
-
-	return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
-				&inode_p->inode.k_i,
-				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
-}
-
-static int fsck_write_inode(struct btree_trans *trans,
-			    struct bch_inode_unpacked *inode,
-			    u32 snapshot)
-{
-	int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			    __write_inode(trans, inode, snapshot));
-	bch_err_fn(trans->c, ret);
-	return ret;
-}
-
 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
 {
 	struct bch_fs *c = trans->c;
@@ -224,15 +194,16 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
 
 	struct bch_inode_unpacked root_inode;
 	struct bch_hash_info root_hash_info;
-	ret = lookup_inode(trans, root_inum.inum, &root_inode, &snapshot);
+	u32 root_inode_snapshot = snapshot;
+	ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
 	bch_err_msg(c, ret, "looking up root inode");
 	if (ret)
 		return ret;
 
 	root_hash_info = bch2_hash_info_init(c, &root_inode);
 
-	ret = __lookup_dirent(trans, root_hash_info, root_inum,
-			      &lostfound_str, &inum, &d_type);
+	ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
+			      &lostfound_str, &inum, &d_type, snapshot);
 	if (bch2_err_matches(ret, ENOENT))
 		goto create_lostfound;
 
@@ -250,7 +221,8 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
 	 * shouldn't exist here:
 	 */
 	ret = lookup_inode(trans, inum, lostfound, &snapshot);
-	bch_err_msg(c, ret, "looking up lost+found");
+	bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
+		    inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
 	return ret;
 
 create_lostfound:
@@ -312,7 +284,7 @@ static int reattach_inode(struct btree_trans *trans,
 	if (S_ISDIR(inode->bi_mode)) {
 		lostfound.bi_nlink++;
 
-		ret = __write_inode(trans, &lostfound, U32_MAX);
+		ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX);
 		if (ret)
 			return ret;
 	}
@@ -334,7 +306,7 @@ static int reattach_inode(struct btree_trans *trans,
 	inode->bi_dir		= lostfound.bi_inum;
 	inode->bi_dir_offset	= dir_offset;
 
-	return __write_inode(trans, inode, inode_snapshot);
+	return __bch2_fsck_write_inode(trans, inode, inode_snapshot);
 }
 
 static int remove_backpointer(struct btree_trans *trans,
@@ -722,7 +694,7 @@ static int hash_redo_key(struct btree_trans *trans,
 	delete->k.p = k_iter->pos;
 	return  bch2_btree_iter_traverse(k_iter) ?:
 		bch2_trans_update(trans, k_iter, delete, 0) ?:
-		bch2_hash_set_snapshot(trans, desc, hash_info,
+		bch2_hash_set_in_snapshot(trans, desc, hash_info,
 				       (subvol_inum) { 0, k.k->p.inode },
 				       k.k->p.snapshot, tmp,
 				       BCH_HASH_SET_MUST_CREATE,
@@ -861,7 +833,8 @@ static int check_inode(struct btree_trans *trans,
 
 		u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
 
-		ret = __write_inode(trans, &u, iter->pos.snapshot);
+		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
+
 		bch_err_msg(c, ret, "in fsck updating inode");
 		if (ret)
 			return ret;
@@ -950,8 +923,33 @@ static int check_inode(struct btree_trans *trans,
 		do_update = true;
 	}
 
+	if (u.bi_subvol) {
+		struct bch_subvolume s;
+
+		ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
+		if (ret && !bch2_err_matches(ret, ENOENT))
+			goto err;
+
+		if (fsck_err_on(ret,
+				c, inode_bi_subvol_missing,
+				"inode %llu:%u bi_subvol points to missing subvolume %u",
+				u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
+		    fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
+				!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
+							   k.k->p.snapshot),
+				c, inode_bi_subvol_wrong,
+				"inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
+				u.bi_inum, k.k->p.snapshot, u.bi_subvol,
+				le64_to_cpu(s.inode),
+				le32_to_cpu(s.snapshot))) {
+			u.bi_subvol = 0;
+			u.bi_parent_subvol = 0;
+			do_update = true;
+		}
+	}
+
 	if (do_update) {
-		ret = __write_inode(trans, &u, iter->pos.snapshot);
+		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
 		bch_err_msg(c, ret, "in fsck updating inode");
 		if (ret)
 			return ret;
@@ -1032,7 +1030,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
 				w->last_pos.inode, i->snapshot,
 				i->inode.bi_sectors, i->count)) {
 			i->inode.bi_sectors = i->count;
-			ret = fsck_write_inode(trans, &i->inode, i->snapshot);
+			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
 			if (ret)
 				break;
 		}
@@ -1481,7 +1479,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
 				"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
 				w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
 			i->inode.bi_nlink = i->count;
-			ret = fsck_write_inode(trans, &i->inode, i->snapshot);
+			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
 			if (ret)
 				break;
 		}
@@ -1491,16 +1489,15 @@ fsck_err:
 	return ret ?: trans_was_restarted(trans, restart_count);
 }
 
-static int check_dirent_target(struct btree_trans *trans,
-			       struct btree_iter *iter,
-			       struct bkey_s_c_dirent d,
-			       struct bch_inode_unpacked *target,
-			       u32 target_snapshot)
+static int check_inode_backpointer(struct btree_trans *trans,
+				   struct btree_iter *iter,
+				   struct bkey_s_c_dirent d,
+				   struct bch_inode_unpacked *target,
+				   u32 target_snapshot)
 {
 	struct bch_fs *c = trans->c;
-	struct bkey_i_dirent *n;
-	struct printbuf buf = PRINTBUF;
 	struct btree_iter bp_iter = { NULL };
+	struct printbuf buf = PRINTBUF;
 	int ret = 0;
 
 	if (!target->bi_dir &&
@@ -1508,7 +1505,7 @@ static int check_dirent_target(struct btree_trans *trans,
 		target->bi_dir		= d.k->p.inode;
 		target->bi_dir_offset	= d.k->p.offset;
 
-		ret = __write_inode(trans, target, target_snapshot);
+		ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
 		if (ret)
 			goto err;
 	}
@@ -1548,7 +1545,7 @@ static int check_dirent_target(struct btree_trans *trans,
 			target->bi_nlink++;
 			target->bi_flags &= ~BCH_INODE_unlinked;
 
-			ret = __write_inode(trans, target, target_snapshot);
+			ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
 			if (ret)
 				goto err;
 		}
@@ -1566,11 +1563,34 @@ static int check_dirent_target(struct btree_trans *trans,
 			target->bi_dir		= d.k->p.inode;
 			target->bi_dir_offset	= d.k->p.offset;
 
-			ret = __write_inode(trans, target, target_snapshot);
+			ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
 			if (ret)
 				goto err;
 		}
 	}
+out:
+err:
+fsck_err:
+	bch2_trans_iter_exit(trans, &bp_iter);
+	printbuf_exit(&buf);
+	bch_err_fn(c, ret);
+	return ret;
+}
+
+static int check_dirent_target(struct btree_trans *trans,
+			       struct btree_iter *iter,
+			       struct bkey_s_c_dirent d,
+			       struct bch_inode_unpacked *target,
+			       u32 target_snapshot)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_i_dirent *n;
+	struct printbuf buf = PRINTBUF;
+	int ret = 0;
+
+	ret = check_inode_backpointer(trans, iter, d, target, target_snapshot);
+	if (ret)
+		goto err;
 
 	if (fsck_err_on(d.v->d_type != inode_d_type(target),
 			c, dirent_d_type_wrong,
@@ -1614,15 +1634,65 @@ static int check_dirent_target(struct btree_trans *trans,
 
 		d = dirent_i_to_s_c(n);
 	}
-out:
 err:
 fsck_err:
-	bch2_trans_iter_exit(trans, &bp_iter);
 	printbuf_exit(&buf);
 	bch_err_fn(c, ret);
 	return ret;
 }
 
+static int check_subvol_dirent(struct btree_trans *trans, struct btree_iter *iter,
+			       struct bkey_s_c_dirent d)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_inode_unpacked subvol_root;
+	u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
+	u32 target_snapshot;
+	u64 target_inum;
+	int ret = 0;
+
+	ret = subvol_lookup(trans, target_subvol,
+			      &target_snapshot, &target_inum);
+	if (ret && !bch2_err_matches(ret, ENOENT))
+		return ret;
+
+	if (fsck_err_on(ret, c, dirent_to_missing_subvol,
+			"dirent points to missing subvolume %u",
+			le32_to_cpu(d.v->d_child_subvol)))
+		return __remove_dirent(trans, d.k->p);
+
+	ret = lookup_inode(trans, target_inum,
+			   &subvol_root, &target_snapshot);
+	if (ret && !bch2_err_matches(ret, ENOENT))
+		return ret;
+
+	if (fsck_err_on(ret, c, subvol_to_missing_root,
+			"subvolume %u points to missing subvolume root %llu",
+			target_subvol,
+			target_inum)) {
+		bch_err(c, "repair not implemented yet");
+		return -EINVAL;
+	}
+
+	if (fsck_err_on(subvol_root.bi_subvol != target_subvol,
+			c, subvol_root_wrong_bi_subvol,
+			"subvol root %llu has wrong bi_subvol field: got %u, should be %u",
+			target_inum,
+			subvol_root.bi_subvol, target_subvol)) {
+		subvol_root.bi_subvol = target_subvol;
+		ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
+		if (ret)
+			return ret;
+	}
+
+	ret = check_dirent_target(trans, iter, d, &subvol_root,
+				  target_snapshot);
+	if (ret)
+		return ret;
+fsck_err:
+	return ret;
+}
+
 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
 			struct bkey_s_c k,
 			struct bch_hash_info *hash_info,
@@ -1707,50 +1777,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
 	d = bkey_s_c_to_dirent(k);
 
 	if (d.v->d_type == DT_SUBVOL) {
-		struct bch_inode_unpacked subvol_root;
-		u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
-		u32 target_snapshot;
-		u64 target_inum;
-
-		ret = subvol_lookup(trans, target_subvol,
-				      &target_snapshot, &target_inum);
-		if (ret && !bch2_err_matches(ret, ENOENT))
-			goto err;
-
-		if (fsck_err_on(ret, c, dirent_to_missing_subvol,
-				"dirent points to missing subvolume %u",
-				le32_to_cpu(d.v->d_child_subvol))) {
-			ret = __remove_dirent(trans, d.k->p);
-			goto err;
-		}
-
-		ret = lookup_inode(trans, target_inum,
-				   &subvol_root, &target_snapshot);
-		if (ret && !bch2_err_matches(ret, ENOENT))
-			goto err;
-
-		if (fsck_err_on(ret, c, subvol_to_missing_root,
-				"subvolume %u points to missing subvolume root %llu",
-				target_subvol,
-				target_inum)) {
-			bch_err(c, "repair not implemented yet");
-			ret = -EINVAL;
-			goto err;
-		}
-
-		if (fsck_err_on(subvol_root.bi_subvol != target_subvol,
-				c, subvol_root_wrong_bi_subvol,
-				"subvol root %llu has wrong bi_subvol field: got %u, should be %u",
-				target_inum,
-				subvol_root.bi_subvol, target_subvol)) {
-			subvol_root.bi_subvol = target_subvol;
-			ret = __write_inode(trans, &subvol_root, target_snapshot);
-			if (ret)
-				goto err;
-		}
-
-		ret = check_dirent_target(trans, iter, d, &subvol_root,
-					  target_snapshot);
+		ret = check_subvol_dirent(trans, iter, d);
 		if (ret)
 			goto err;
 	} else {
@@ -1776,12 +1803,11 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
 			if (ret)
 				goto err;
 		}
+
+		if (d.v->d_type == DT_DIR)
+			for_each_visible_inode(c, s, dir, equiv.snapshot, i)
+				i->count++;
 	}
-
-	if (d.v->d_type == DT_DIR)
-		for_each_visible_inode(c, s, dir, equiv.snapshot, i)
-			i->count++;
-
 out:
 err:
 fsck_err:
@@ -1919,7 +1945,7 @@ static int check_root_trans(struct btree_trans *trans)
 				0, NULL);
 		root_inode.bi_inum = inum;
 
-		ret = __write_inode(trans, &root_inode, snapshot);
+		ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot);
 		bch_err_msg(c, ret, "writing root inode");
 	}
 err:
@@ -2291,7 +2317,7 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite
 			u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
 			bch2_inode_nlink_get(&u), link->count)) {
 		bch2_inode_nlink_set(&u, link->count);
-		ret = __write_inode(trans, &u, k.k->p.snapshot);
+		ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot);
 	}
 fsck_err:
 	return ret;
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 086f0090..dbe37ccc 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -324,7 +324,7 @@ int bch2_inode_unpack(struct bkey_s_c k,
 	return bch2_inode_unpack_slowpath(k, unpacked);
 }
 
-static int bch2_inode_peek_nowarn(struct btree_trans *trans,
+int bch2_inode_peek_nowarn(struct btree_trans *trans,
 		    struct btree_iter *iter,
 		    struct bch_inode_unpacked *inode,
 		    subvol_inum inum, unsigned flags)
@@ -384,6 +384,34 @@ int bch2_inode_write_flags(struct btree_trans *trans,
 	return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
 }
 
+int __bch2_fsck_write_inode(struct btree_trans *trans,
+			 struct bch_inode_unpacked *inode,
+			 u32 snapshot)
+{
+	struct bkey_inode_buf *inode_p =
+		bch2_trans_kmalloc(trans, sizeof(*inode_p));
+
+	if (IS_ERR(inode_p))
+		return PTR_ERR(inode_p);
+
+	bch2_inode_pack(inode_p, inode);
+	inode_p->inode.k.p.snapshot = snapshot;
+
+	return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
+				&inode_p->inode.k_i,
+				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+}
+
+int bch2_fsck_write_inode(struct btree_trans *trans,
+			    struct bch_inode_unpacked *inode,
+			    u32 snapshot)
+{
+	int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+			    __bch2_fsck_write_inode(trans, inode, snapshot));
+	bch_err_fn(trans->c, ret);
+	return ret;
+}
+
 struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
 {
 	struct bch_inode_unpacked u;
diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
index b63f3125..9a9353c0 100644
--- a/libbcachefs/inode.h
+++ b/libbcachefs/inode.h
@@ -95,6 +95,8 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *);
 
 void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
 
+int bch2_inode_peek_nowarn(struct btree_trans *, struct btree_iter *,
+		    struct bch_inode_unpacked *, subvol_inum, unsigned);
 int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
 		    struct bch_inode_unpacked *, subvol_inum, unsigned);
 
@@ -108,6 +110,9 @@ static inline int bch2_inode_write(struct btree_trans *trans,
 	return bch2_inode_write_flags(trans, iter, inode, 0);
 }
 
+int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32);
+int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32);
+
 void bch2_inode_init_early(struct bch_fs *,
 			   struct bch_inode_unpacked *);
 void bch2_inode_init_late(struct bch_inode_unpacked *, u64,
diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c
index 3c574d88..dce136cd 100644
--- a/libbcachefs/io_read.c
+++ b/libbcachefs/io_read.c
@@ -134,7 +134,7 @@ static void promote_done(struct bch_write_op *wop)
 		container_of(wop, struct promote_op, write.op);
 	struct bch_fs *c = op->write.op.c;
 
-	bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
+	time_stats_update(&c->times[BCH_TIME_data_promote],
 			       op->start_time);
 	promote_free(c, op);
 }
@@ -356,7 +356,7 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
 static void bch2_rbio_done(struct bch_read_bio *rbio)
 {
 	if (rbio->start_time)
-		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
+		time_stats_update(&rbio->c->times[BCH_TIME_data_read],
 				       rbio->start_time);
 	bio_endio(&rbio->bio);
 }
diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c
index ef3a53f9..13b3514d 100644
--- a/libbcachefs/io_write.c
+++ b/libbcachefs/io_write.c
@@ -88,7 +88,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
 
 	bch2_congested_acct(ca, io_latency, now, rw);
 
-	__bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
+	__time_stats_update(&ca->io_latency[rw].stats, submit_time, now);
 }
 
 #endif
@@ -457,7 +457,7 @@ static void bch2_write_done(struct closure *cl)
 
 	EBUG_ON(op->open_buckets.nr);
 
-	bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
+	time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
 	bch2_disk_reservation_put(c, &op->res);
 
 	if (!(op->flags & BCH_WRITE_MOVE))
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index bc890776..214c8030 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -27,6 +27,26 @@ static const char * const bch2_journal_errors[] = {
 	NULL
 };
 
+static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
+{
+	return seq > j->seq_ondisk;
+}
+
+static bool __journal_entry_is_open(union journal_res_state state)
+{
+	return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
+}
+
+static inline unsigned nr_unwritten_journal_entries(struct journal *j)
+{
+	return atomic64_read(&j->seq) - j->seq_ondisk;
+}
+
+static bool journal_entry_is_open(struct journal *j)
+{
+	return __journal_entry_is_open(j->reservations);
+}
+
 static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq)
 {
 	union journal_res_state s = READ_ONCE(j->reservations);
@@ -54,6 +74,13 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6
 	prt_printf(out, "%li jiffies", buf->expires - jiffies);
 	prt_newline(out);
 
+	if (buf->write_done)
+		prt_printf(out, "write done\n");
+	else if (buf->write_allocated)
+		prt_printf(out, "write allocated\n");
+	else if (buf->write_started)
+		prt_printf(out, "write started\n");
+
 	printbuf_indent_sub(out, 2);
 }
 
@@ -66,26 +93,7 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
 	     seq <= journal_cur_seq(j);
 	     seq++)
 		bch2_journal_buf_to_text(out, j, seq);
-}
-
-static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
-{
-	return seq > j->seq_ondisk;
-}
-
-static bool __journal_entry_is_open(union journal_res_state state)
-{
-	return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
-}
-
-static inline unsigned nr_unwritten_journal_entries(struct journal *j)
-{
-	return atomic64_read(&j->seq) - j->seq_ondisk;
-}
-
-static bool journal_entry_is_open(struct journal *j)
-{
-	return __journal_entry_is_open(j->reservations);
+	prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed");
 }
 
 static inline struct journal_buf *
@@ -174,21 +182,40 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
 	return stuck;
 }
 
+void bch2_journal_do_writes(struct journal *j)
+{
+	for (u64 seq = journal_last_unwritten_seq(j);
+	     seq <= journal_cur_seq(j);
+	     seq++) {
+		unsigned idx = seq & JOURNAL_BUF_MASK;
+		struct journal_buf *w = j->buf + idx;
+
+		if (w->write_started && !w->write_allocated)
+			break;
+		if (w->write_started)
+			continue;
+
+		if (!journal_state_count(j->reservations, idx)) {
+			w->write_started = true;
+			closure_call(&w->io, bch2_journal_write, j->wq, NULL);
+		}
+
+		break;
+	}
+}
+
 /*
  * Final processing when the last reference of a journal buffer has been
  * dropped. Drop the pin list reference acquired at journal entry open and write
  * the buffer, if requested.
  */
-void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
+void bch2_journal_buf_put_final(struct journal *j, u64 seq)
 {
-	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
 	lockdep_assert_held(&j->lock);
 
 	if (__bch2_journal_pin_put(j, seq))
 		bch2_journal_reclaim_fast(j);
-	if (write)
-		closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
+	bch2_journal_do_writes(j);
 }
 
 /*
@@ -380,11 +407,14 @@ static int journal_entry_open(struct journal *j)
 	BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);
 
 	bkey_extent_init(&buf->key);
-	buf->noflush	= false;
-	buf->must_flush	= false;
-	buf->separate_flush = false;
-	buf->flush_time	= 0;
+	buf->noflush		= false;
+	buf->must_flush		= false;
+	buf->separate_flush	= false;
+	buf->flush_time		= 0;
 	buf->need_flush_to_write_buffer = true;
+	buf->write_started	= false;
+	buf->write_allocated	= false;
+	buf->write_done		= false;
 
 	memset(buf->data, 0, sizeof(*buf->data));
 	buf->data->seq	= cpu_to_le64(journal_cur_seq(j));
@@ -418,9 +448,10 @@ static int journal_entry_open(struct journal *j)
 	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
 				       old.v, new.v)) != old.v);
 
-	mod_delayed_work(c->io_complete_wq,
-			 &j->write_work,
-			 msecs_to_jiffies(c->opts.journal_flush_delay));
+	if (nr_unwritten_journal_entries(j) == 1)
+		mod_delayed_work(j->wq,
+				 &j->write_work,
+				 msecs_to_jiffies(c->opts.journal_flush_delay));
 	journal_wake(j);
 
 	if (j->early_journal_entries.nr)
@@ -445,20 +476,16 @@ static void journal_quiesce(struct journal *j)
 static void journal_write_work(struct work_struct *work)
 {
 	struct journal *j = container_of(work, struct journal, write_work.work);
-	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	long delta;
 
 	spin_lock(&j->lock);
-	if (!__journal_entry_is_open(j->reservations))
-		goto unlock;
+	if (__journal_entry_is_open(j->reservations)) {
+		long delta = journal_cur_buf(j)->expires - jiffies;
 
-	delta = journal_cur_buf(j)->expires - jiffies;
-
-	if (delta > 0)
-		mod_delayed_work(c->io_complete_wq, &j->write_work, delta);
-	else
-		__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
-unlock:
+		if (delta > 0)
+			mod_delayed_work(j->wq, &j->write_work, delta);
+		else
+			__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
+	}
 	spin_unlock(&j->lock);
 }
 
@@ -473,33 +500,32 @@ retry:
 	if (journal_res_get_fast(j, res, flags))
 		return 0;
 
+	if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
+		ret = JOURNAL_ERR_journal_full;
+		can_discard = j->can_discard;
+		goto out;
+	}
+
+	if (j->blocked)
+		return -BCH_ERR_journal_res_get_blocked;
+
 	if (bch2_journal_error(j))
 		return -BCH_ERR_erofs_journal_err;
 
-	spin_lock(&j->lock);
-
-	/* check once more in case somebody else shut things down... */
-	if (bch2_journal_error(j)) {
-		spin_unlock(&j->lock);
-		return -BCH_ERR_erofs_journal_err;
+	if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) {
+		ret = JOURNAL_ERR_max_in_flight;
+		goto out;
 	}
 
+	spin_lock(&j->lock);
+
 	/*
 	 * Recheck after taking the lock, so we don't race with another thread
 	 * that just did journal_entry_open() and call bch2_journal_entry_close()
 	 * unnecessarily
 	 */
 	if (journal_res_get_fast(j, res, flags)) {
-		spin_unlock(&j->lock);
-		return 0;
-	}
-
-	if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
-		/*
-		 * Don't want to close current journal entry, just need to
-		 * invoke reclaim:
-		 */
-		ret = JOURNAL_ERR_journal_full;
+		ret = 0;
 		goto unlock;
 	}
 
@@ -515,30 +541,30 @@ retry:
 		j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
 
 	__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
-	ret = journal_entry_open(j);
-
-	if (ret == JOURNAL_ERR_max_in_flight) {
-		track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
-				   &j->max_in_flight_start, true);
-		if (trace_journal_entry_full_enabled()) {
-			struct printbuf buf = PRINTBUF;
-			buf.atomic++;
-
-			bch2_journal_bufs_to_text(&buf, j);
-			trace_journal_entry_full(c, buf.buf);
-			printbuf_exit(&buf);
-		}
-		count_event(c, journal_entry_full);
-	}
+	ret = journal_entry_open(j) ?: JOURNAL_ERR_retry;
 unlock:
 	can_discard = j->can_discard;
 	spin_unlock(&j->lock);
-
-	if (!ret)
+out:
+	if (ret == JOURNAL_ERR_retry)
 		goto retry;
+	if (!ret)
+		return 0;
+
 	if (journal_error_check_stuck(j, ret, flags))
 		ret = -BCH_ERR_journal_res_get_blocked;
 
+	if (ret == JOURNAL_ERR_max_in_flight &&
+	    track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) {
+
+		struct printbuf buf = PRINTBUF;
+		prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
+		bch2_journal_bufs_to_text(&buf, j);
+		trace_journal_entry_full(c, buf.buf);
+		printbuf_exit(&buf);
+		count_event(c, journal_entry_full);
+	}
+
 	/*
 	 * Journal is full - can't rely on reclaim from work item due to
 	 * freezing:
@@ -727,7 +753,7 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
 	ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
 
 	if (!ret)
-		bch2_time_stats_update(j->flush_seq_time, start_time);
+		time_stats_update(j->flush_seq_time, start_time);
 
 	return ret ?: ret2 < 0 ? ret2 : 0;
 }
@@ -1157,7 +1183,6 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
 	struct journal_replay *i, **_i;
 	struct genradix_iter iter;
 	bool had_entries = false;
-	unsigned ptr;
 	u64 last_seq = cur_seq, nr, seq;
 
 	genradix_for_each_reverse(&c->journal_entries, iter, _i) {
@@ -1211,8 +1236,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
 		p = journal_seq_pin(j, seq);
 
 		p->devs.nr = 0;
-		for (ptr = 0; ptr < i->nr_ptrs; ptr++)
-			bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
+		darray_for_each(i->ptrs, ptr)
+			bch2_dev_list_add_dev(&p->devs, ptr->dev);
 
 		had_entries = true;
 	}
@@ -1240,13 +1265,17 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
 
 void bch2_dev_journal_exit(struct bch_dev *ca)
 {
-	kfree(ca->journal.bio);
-	kfree(ca->journal.buckets);
-	kfree(ca->journal.bucket_seq);
+	struct journal_device *ja = &ca->journal;
 
-	ca->journal.bio		= NULL;
-	ca->journal.buckets	= NULL;
-	ca->journal.bucket_seq	= NULL;
+	for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
+		kfree(ja->bio[i]);
+		ja->bio[i] = NULL;
+	}
+
+	kfree(ja->buckets);
+	kfree(ja->bucket_seq);
+	ja->buckets	= NULL;
+	ja->bucket_seq	= NULL;
 }
 
 int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
@@ -1256,14 +1285,13 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 		bch2_sb_field_get(sb, journal);
 	struct bch_sb_field_journal_v2 *journal_buckets_v2 =
 		bch2_sb_field_get(sb, journal_v2);
-	unsigned i, nr_bvecs;
 
 	ja->nr = 0;
 
 	if (journal_buckets_v2) {
 		unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
 
-		for (i = 0; i < nr; i++)
+		for (unsigned i = 0; i < nr; i++)
 			ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr);
 	} else if (journal_buckets) {
 		ja->nr = bch2_nr_journal_buckets(journal_buckets);
@@ -1273,13 +1301,18 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 	if (!ja->bucket_seq)
 		return -BCH_ERR_ENOMEM_dev_journal_init;
 
-	nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
+	unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
 
-	ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
-	if (!ca->journal.bio)
-		return -BCH_ERR_ENOMEM_dev_journal_init;
+	for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
+		ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
+				     nr_bvecs), GFP_KERNEL);
+		if (!ja->bio[i])
+			return -BCH_ERR_ENOMEM_dev_journal_init;
 
-	bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0);
+		ja->bio[i]->ca = ca;
+		ja->bio[i]->buf_idx = i;
+		bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0);
+	}
 
 	ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
 	if (!ja->buckets)
@@ -1287,14 +1320,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 
 	if (journal_buckets_v2) {
 		unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
-		unsigned j, dst = 0;
+		unsigned dst = 0;
 
-		for (i = 0; i < nr; i++)
-			for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
+		for (unsigned i = 0; i < nr; i++)
+			for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
 				ja->buckets[dst++] =
 					le64_to_cpu(journal_buckets_v2->d[i].start) + j;
 	} else if (journal_buckets) {
-		for (i = 0; i < ja->nr; i++)
+		for (unsigned i = 0; i < ja->nr; i++)
 			ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
 	}
 
@@ -1303,19 +1336,19 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 
 void bch2_fs_journal_exit(struct journal *j)
 {
-	unsigned i;
+	if (j->wq)
+		destroy_workqueue(j->wq);
 
 	darray_exit(&j->early_journal_entries);
 
-	for (i = 0; i < ARRAY_SIZE(j->buf); i++)
-		kvpfree(j->buf[i].data, j->buf[i].buf_size);
+	for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
+		kvfree(j->buf[i].data);
 	free_fifo(&j->pin);
 }
 
 int bch2_fs_journal_init(struct journal *j)
 {
 	static struct lock_class_key res_key;
-	unsigned i;
 
 	mutex_init(&j->buf_lock);
 	spin_lock_init(&j->lock);
@@ -1336,14 +1369,20 @@ int bch2_fs_journal_init(struct journal *j)
 	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)))
 		return -BCH_ERR_ENOMEM_journal_pin_fifo;
 
-	for (i = 0; i < ARRAY_SIZE(j->buf); i++) {
+	for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) {
 		j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
-		j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL);
+		j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL);
 		if (!j->buf[i].data)
 			return -BCH_ERR_ENOMEM_journal_buf;
+		j->buf[i].idx = i;
 	}
 
 	j->pin.front = j->pin.back = 1;
+
+	j->wq = alloc_workqueue("bcachefs_journal",
+				WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512);
+	if (!j->wq)
+		return -BCH_ERR_ENOMEM_fs_other_alloc;
 	return 0;
 }
 
@@ -1455,7 +1494,6 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
 {
 	struct journal_entry_pin_list *pin_list;
 	struct journal_entry_pin *pin;
-	unsigned i;
 
 	spin_lock(&j->lock);
 	*seq = max(*seq, j->pin.front);
@@ -1473,7 +1511,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
 	prt_newline(out);
 	printbuf_indent_add(out, 2);
 
-	for (i = 0; i < ARRAY_SIZE(pin_list->list); i++)
+	for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++)
 		list_for_each_entry(pin, &pin_list->list[i], list) {
 			prt_printf(out, "\t%px %ps", pin, pin->flush);
 			prt_newline(out);
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index 4544ce24..7c7528f8 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -264,7 +264,8 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u
 }
 
 bool bch2_journal_entry_close(struct journal *);
-void bch2_journal_buf_put_final(struct journal *, u64, bool);
+void bch2_journal_do_writes(struct journal *);
+void bch2_journal_buf_put_final(struct journal *, u64);
 
 static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
 {
@@ -272,7 +273,7 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s
 
 	s = journal_state_buf_put(j, idx);
 	if (!journal_state_count(s, idx))
-		bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+		bch2_journal_buf_put_final(j, seq);
 }
 
 static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
@@ -282,7 +283,7 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq
 	s = journal_state_buf_put(j, idx);
 	if (!journal_state_count(s, idx)) {
 		spin_lock(&j->lock);
-		bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+		bch2_journal_buf_put_final(j, seq);
 		spin_unlock(&j->lock);
 	}
 }
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index bfd6585e..e31e215f 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -17,6 +17,38 @@
 #include "sb-clean.h"
 #include "trace.h"
 
+void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+			       struct journal_replay *j)
+{
+	darray_for_each(j->ptrs, i) {
+		struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev);
+		u64 offset;
+
+		div64_u64_rem(i->sector, ca->mi.bucket_size, &offset);
+
+		if (i != j->ptrs.data)
+			prt_printf(out, " ");
+		prt_printf(out, "%u:%u:%u (sector %llu)",
+			   i->dev, i->bucket, i->bucket_offset, i->sector);
+	}
+}
+
+static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
+					struct journal_replay *j)
+{
+	prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
+
+	bch2_journal_ptrs_to_text(out, c, j);
+
+	struct jset_entry *entry;
+	for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) {
+		struct jset_entry_datetime *datetime =
+			container_of(entry, struct jset_entry_datetime, entry);
+		bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
+		break;
+	}
+}
+
 static struct nonce journal_nonce(const struct jset *jset)
 {
 	return (struct nonce) {{
@@ -52,8 +84,7 @@ static void __journal_replay_free(struct bch_fs *c,
 
 	BUG_ON(*p != i);
 	*p = NULL;
-	kvpfree(i, offsetof(struct journal_replay, j) +
-		vstruct_bytes(&i->j));
+	kvfree(i);
 }
 
 static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
@@ -84,9 +115,9 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
 {
 	struct genradix_iter iter;
 	struct journal_replay **_i, *i, *dup;
-	struct journal_ptr *ptr;
 	size_t bytes = vstruct_bytes(j);
 	u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
+	struct printbuf buf = PRINTBUF;
 	int ret = JOURNAL_ENTRY_ADD_OK;
 
 	/* Is this entry older than the range we need? */
@@ -131,72 +162,61 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
 	 */
 	dup = *_i;
 	if (dup) {
-		if (bytes == vstruct_bytes(&dup->j) &&
-		    !memcmp(j, &dup->j, bytes)) {
-			i = dup;
-			goto found;
-		}
+		bool identical = bytes == vstruct_bytes(&dup->j) &&
+			!memcmp(j, &dup->j, bytes);
+		bool not_identical = !identical &&
+			entry_ptr.csum_good &&
+			dup->csum_good;
 
-		if (!entry_ptr.csum_good) {
-			i = dup;
-			goto found;
-		}
+		bool same_device = false;
+		darray_for_each(dup->ptrs, ptr)
+			if (ptr->dev == ca->dev_idx)
+				same_device = true;
 
-		if (!dup->csum_good)
+		ret = darray_push(&dup->ptrs, entry_ptr);
+		if (ret)
+			goto out;
+
+		bch2_journal_replay_to_text(&buf, c, dup);
+
+		fsck_err_on(same_device,
+			    c, journal_entry_dup_same_device,
+			    "duplicate journal entry on same device\n  %s",
+			    buf.buf);
+
+		fsck_err_on(not_identical,
+			    c, journal_entry_replicas_data_mismatch,
+			    "found duplicate but non identical journal entries\n  %s",
+			    buf.buf);
+
+		if (entry_ptr.csum_good && !identical)
 			goto replace;
 
-		fsck_err(c, journal_entry_replicas_data_mismatch,
-			 "found duplicate but non identical journal entries (seq %llu)",
-			 le64_to_cpu(j->seq));
-		i = dup;
-		goto found;
+		goto out;
 	}
 replace:
-	i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
+	i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
 	if (!i)
 		return -BCH_ERR_ENOMEM_journal_entry_add;
 
-	i->nr_ptrs	= 0;
+	darray_init(&i->ptrs);
 	i->csum_good	= entry_ptr.csum_good;
 	i->ignore	= false;
 	unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
-	i->ptrs[i->nr_ptrs++] = entry_ptr;
 
 	if (dup) {
-		if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) {
-			bch_err(c, "found too many copies of journal entry %llu",
-				le64_to_cpu(i->j.seq));
-			dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1;
-		}
-
 		/* The first ptr should represent the jset we kept: */
-		memcpy(i->ptrs + i->nr_ptrs,
-		       dup->ptrs,
-		       sizeof(dup->ptrs[0]) * dup->nr_ptrs);
-		i->nr_ptrs += dup->nr_ptrs;
+		darray_for_each(dup->ptrs, ptr)
+			darray_push(&i->ptrs, *ptr);
 		__journal_replay_free(c, dup);
+	} else {
+		darray_push(&i->ptrs, entry_ptr);
 	}
 
 	*_i = i;
-	return 0;
-found:
-	for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
-		if (ptr->dev == ca->dev_idx) {
-			bch_err(c, "duplicate journal entry %llu on same device",
-				le64_to_cpu(i->j.seq));
-			goto out;
-		}
-	}
-
-	if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) {
-		bch_err(c, "found too many copies of journal entry %llu",
-			le64_to_cpu(i->j.seq));
-		goto out;
-	}
-
-	i->ptrs[i->nr_ptrs++] = entry_ptr;
 out:
 fsck_err:
+	printbuf_exit(&buf);
 	return ret;
 }
 
@@ -741,6 +761,37 @@ static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct
 	journal_entry_btree_keys_to_text(out, c, entry);
 }
 
+static int journal_entry_datetime_validate(struct bch_fs *c,
+				struct jset *jset,
+				struct jset_entry *entry,
+				unsigned version, int big_endian,
+				enum bkey_invalid_flags flags)
+{
+	unsigned bytes = vstruct_bytes(entry);
+	unsigned expected = 16;
+	int ret = 0;
+
+	if (journal_entry_err_on(vstruct_bytes(entry) < expected,
+				 c, version, jset, entry,
+				 journal_entry_dev_usage_bad_size,
+				 "bad size (%u < %u)",
+				 bytes, expected)) {
+		journal_entry_null_range(entry, vstruct_next(entry));
+		return ret;
+	}
+fsck_err:
+	return ret;
+}
+
+static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c,
+					    struct jset_entry *entry)
+{
+	struct jset_entry_datetime *datetime =
+		container_of(entry, struct jset_entry_datetime, entry);
+
+	bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
+}
+
 struct jset_entry_ops {
 	int (*validate)(struct bch_fs *, struct jset *,
 			struct jset_entry *, unsigned, int,
@@ -913,11 +964,11 @@ static int journal_read_buf_realloc(struct journal_read_buf *b,
 		return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
 
 	new_size = roundup_pow_of_two(new_size);
-	n = kvpmalloc(new_size, GFP_KERNEL);
+	n = kvmalloc(new_size, GFP_KERNEL);
 	if (!n)
 		return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
 
-	kvpfree(b->data, b->size);
+	kvfree(b->data);
 	b->data = n;
 	b->size = new_size;
 	return 0;
@@ -1102,16 +1153,15 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
 		if (!r)
 			continue;
 
-		for (i = 0; i < r->nr_ptrs; i++) {
-			if (r->ptrs[i].dev == ca->dev_idx) {
-				unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
+		darray_for_each(r->ptrs, i)
+			if (i->dev == ca->dev_idx) {
+				unsigned wrote = bucket_remainder(ca, i->sector) +
 					vstruct_sectors(&r->j, c->block_bits);
 
-				ja->cur_idx = r->ptrs[i].bucket;
+				ja->cur_idx = i->bucket;
 				ja->sectors_free = ca->mi.bucket_size - wrote;
 				goto found;
 			}
-		}
 	}
 found:
 	mutex_unlock(&jlist->lock);
@@ -1144,7 +1194,7 @@ found:
 		ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
 out:
 	bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
-	kvpfree(buf.data, buf.size);
+	kvfree(buf.data);
 	percpu_ref_put(&ca->io_ref);
 	closure_return(cl);
 	return;
@@ -1155,27 +1205,6 @@ err:
 	goto out;
 }
 
-void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
-			       struct journal_replay *j)
-{
-	unsigned i;
-
-	for (i = 0; i < j->nr_ptrs; i++) {
-		struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev);
-		u64 offset;
-
-		div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset);
-
-		if (i)
-			prt_printf(out, " ");
-		prt_printf(out, "%u:%u:%u (sector %llu)",
-		       j->ptrs[i].dev,
-		       j->ptrs[i].bucket,
-		       j->ptrs[i].bucket_offset,
-		       j->ptrs[i].sector);
-	}
-}
-
 int bch2_journal_read(struct bch_fs *c,
 		      u64 *last_seq,
 		      u64 *blacklist_seq,
@@ -1353,32 +1382,31 @@ int bch2_journal_read(struct bch_fs *c,
 			.e.data_type = BCH_DATA_journal,
 			.e.nr_required = 1,
 		};
-		unsigned ptr;
 
 		i = *_i;
 		if (!i || i->ignore)
 			continue;
 
-		for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
-			struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
+		darray_for_each(i->ptrs, ptr) {
+			struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
 
-			if (!i->ptrs[ptr].csum_good)
-				bch_err_dev_offset(ca, i->ptrs[ptr].sector,
+			if (!ptr->csum_good)
+				bch_err_dev_offset(ca, ptr->sector,
 						   "invalid journal checksum, seq %llu%s",
 						   le64_to_cpu(i->j.seq),
 						   i->csum_good ? " (had good copy on another device)" : "");
 		}
 
 		ret = jset_validate(c,
-				    bch_dev_bkey_exists(c, i->ptrs[0].dev),
+				    bch_dev_bkey_exists(c, i->ptrs.data[0].dev),
 				    &i->j,
-				    i->ptrs[0].sector,
+				    i->ptrs.data[0].sector,
 				    READ);
 		if (ret)
 			goto err;
 
-		for (ptr = 0; ptr < i->nr_ptrs; ptr++)
-			replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
+		darray_for_each(i->ptrs, ptr)
+			replicas.e.devs[replicas.e.nr_devs++] = ptr->dev;
 
 		bch2_replicas_entry_sort(&replicas.e);
 
@@ -1545,7 +1573,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
 	if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
 		return;
 
-	new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
+	new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
 	if (!new_buf)
 		return;
 
@@ -1556,7 +1584,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
 	swap(buf->buf_size,	new_size);
 	spin_unlock(&j->lock);
 
-	kvpfree(new_buf, new_size);
+	kvfree(new_buf);
 }
 
 static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
@@ -1566,17 +1594,17 @@ static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
 
 static CLOSURE_CALLBACK(journal_write_done)
 {
-	closure_type(j, struct journal, io);
+	closure_type(w, struct journal_buf, io);
+	struct journal *j = container_of(w, struct journal, buf[w->idx]);
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	struct journal_buf *w = journal_last_unwritten_buf(j);
 	struct bch_replicas_padded replicas;
 	union journal_res_state old, new;
-	u64 v, seq;
+	u64 v, seq = le64_to_cpu(w->data->seq);
 	int err = 0;
 
-	bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
-			       ? j->flush_write_time
-			       : j->noflush_write_time, j->write_start_time);
+	time_stats_update(!JSET_NO_FLUSH(w->data)
+			  ? j->flush_write_time
+			  : j->noflush_write_time, j->write_start_time);
 
 	if (!w->devs_written.nr) {
 		bch_err(c, "unable to write journal to sufficient devices");
@@ -1591,63 +1619,68 @@ static CLOSURE_CALLBACK(journal_write_done)
 	if (err)
 		bch2_fatal_error(c);
 
-	spin_lock(&j->lock);
-	seq = le64_to_cpu(w->data->seq);
+	closure_debug_destroy(cl);
 
+	spin_lock(&j->lock);
 	if (seq >= j->pin.front)
 		journal_seq_pin(j, seq)->devs = w->devs_written;
+	if (err && (!j->err_seq || seq < j->err_seq))
+		j->err_seq	= seq;
+	w->write_done = true;
 
-	if (!err) {
-		if (!JSET_NO_FLUSH(w->data)) {
+	bool completed = false;
+
+	for (seq = journal_last_unwritten_seq(j);
+	     seq <= journal_cur_seq(j);
+	     seq++) {
+		w = j->buf + (seq & JOURNAL_BUF_MASK);
+		if (!w->write_done)
+			break;
+
+		if (!j->err_seq && !JSET_NO_FLUSH(w->data)) {
 			j->flushed_seq_ondisk = seq;
 			j->last_seq_ondisk = w->last_seq;
 
 			bch2_do_discards(c);
 			closure_wake_up(&c->freelist_wait);
-
 			bch2_reset_alloc_cursors(c);
 		}
-	} else if (!j->err_seq || seq < j->err_seq)
-		j->err_seq	= seq;
 
-	j->seq_ondisk		= seq;
+		j->seq_ondisk = seq;
 
-	/*
-	 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
-	 * more buckets:
-	 *
-	 * Must come before signaling write completion, for
-	 * bch2_fs_journal_stop():
-	 */
-	if (j->watermark != BCH_WATERMARK_stripe)
-		journal_reclaim_kick(&c->journal);
+		/*
+		 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
+		 * more buckets:
+		 *
+		 * Must come before signaling write completion, for
+		 * bch2_fs_journal_stop():
+		 */
+		if (j->watermark != BCH_WATERMARK_stripe)
+			journal_reclaim_kick(&c->journal);
 
-	/* also must come before signalling write completion: */
-	closure_debug_destroy(cl);
+		v = atomic64_read(&j->reservations.counter);
+		do {
+			old.v = new.v = v;
+			BUG_ON(journal_state_count(new, new.unwritten_idx));
+			BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK));
 
-	v = atomic64_read(&j->reservations.counter);
-	do {
-		old.v = new.v = v;
-		BUG_ON(journal_state_count(new, new.unwritten_idx));
+			new.unwritten_idx++;
+		} while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v);
 
-		new.unwritten_idx++;
-	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
-				       old.v, new.v)) != old.v);
+		completed = true;
+	}
 
-	bch2_journal_reclaim_fast(j);
-	bch2_journal_space_available(j);
+	if (completed) {
+		bch2_journal_reclaim_fast(j);
+		bch2_journal_space_available(j);
 
-	track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
-			   &j->max_in_flight_start, false);
+		track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);
 
-	closure_wake_up(&w->wait);
-	journal_wake(j);
+		closure_wake_up(&w->wait);
+		journal_wake(j);
+	}
 
-	if (!journal_state_count(new, new.unwritten_idx) &&
-	    journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
-		spin_unlock(&j->lock);
-		closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
-	} else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
+	if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
 		   new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
 		struct journal_buf *buf = journal_cur_buf(j);
 		long delta = buf->expires - jiffies;
@@ -1657,46 +1690,46 @@ static CLOSURE_CALLBACK(journal_write_done)
 		 * previous entries still in flight - the current journal entry
 		 * might want to be written now:
 		 */
-
-		spin_unlock(&j->lock);
-		mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
-	} else {
-		spin_unlock(&j->lock);
+		mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
 	}
+
+	spin_unlock(&j->lock);
 }
 
 static void journal_write_endio(struct bio *bio)
 {
-	struct bch_dev *ca = bio->bi_private;
+	struct journal_bio *jbio = container_of(bio, struct journal_bio, bio);
+	struct bch_dev *ca = jbio->ca;
 	struct journal *j = &ca->fs->journal;
-	struct journal_buf *w = journal_last_unwritten_buf(j);
-	unsigned long flags;
+	struct journal_buf *w = j->buf + jbio->buf_idx;
 
 	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
 			       "error writing journal entry %llu: %s",
 			       le64_to_cpu(w->data->seq),
 			       bch2_blk_status_to_str(bio->bi_status)) ||
 	    bch2_meta_write_fault("journal")) {
+		unsigned long flags;
+
 		spin_lock_irqsave(&j->err_lock, flags);
 		bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
 		spin_unlock_irqrestore(&j->err_lock, flags);
 	}
 
-	closure_put(&j->io);
+	closure_put(&w->io);
 	percpu_ref_put(&ca->io_ref);
 }
 
 static CLOSURE_CALLBACK(do_journal_write)
 {
-	closure_type(j, struct journal, io);
+	closure_type(w, struct journal_buf, io);
+	struct journal *j = container_of(w, struct journal, buf[w->idx]);
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	struct bch_dev *ca;
-	struct journal_buf *w = journal_last_unwritten_buf(j);
-	struct bio *bio;
 	unsigned sectors = vstruct_sectors(w->data, c->block_bits);
 
 	extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
-		ca = bch_dev_bkey_exists(c, ptr->dev);
+		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+		struct journal_device *ja = &ca->journal;
+
 		if (!percpu_ref_tryget(&ca->io_ref)) {
 			/* XXX: fix this */
 			bch_err(c, "missing device for journal write\n");
@@ -1706,7 +1739,7 @@ static CLOSURE_CALLBACK(do_journal_write)
 		this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
 			     sectors);
 
-		bio = ca->journal.bio;
+		struct bio *bio = &ja->bio[w->idx]->bio;
 		bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
 		bio->bi_iter.bi_sector	= ptr->offset;
 		bio->bi_end_io		= journal_write_endio;
@@ -1725,11 +1758,10 @@ static CLOSURE_CALLBACK(do_journal_write)
 		trace_and_count(c, journal_write, bio);
 		closure_bio_submit(bio, cl);
 
-		ca->journal.bucket_seq[ca->journal.cur_idx] =
-			le64_to_cpu(w->data->seq);
+		ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
 	}
 
-	continue_at(cl, journal_write_done, c->io_complete_wq);
+	continue_at(cl, journal_write_done, j->wq);
 }
 
 static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
@@ -1802,6 +1834,11 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
 
 	end	= bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
 
+	struct jset_entry_datetime *d =
+		container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry);
+	d->entry.type	= BCH_JSET_ENTRY_datetime;
+	d->seconds	= cpu_to_le64(ktime_get_real_seconds());
+
 	bch2_journal_super_entries_add_common(c, &end, seq);
 	u64s	= (u64 *) end - (u64 *) start;
 	BUG_ON(u64s > j->entry_u64s_reserved);
@@ -1901,16 +1938,16 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
 
 CLOSURE_CALLBACK(bch2_journal_write)
 {
-	closure_type(j, struct journal, io);
+	closure_type(w, struct journal_buf, io);
+	struct journal *j = container_of(w, struct journal, buf[w->idx]);
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	struct journal_buf *w = journal_last_unwritten_buf(j);
 	struct bch_replicas_padded replicas;
-	struct bio *bio;
 	struct printbuf journal_debug_buf = PRINTBUF;
 	unsigned nr_rw_members = 0;
 	int ret;
 
 	BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+	BUG_ON(w->write_allocated);
 
 	j->write_start_time = local_clock();
 
@@ -1954,12 +1991,14 @@ CLOSURE_CALLBACK(bch2_journal_write)
 	 * bch2_journal_space_available():
 	 */
 	w->sectors = 0;
+	w->write_allocated = true;
 
 	/*
 	 * journal entry has been compacted and allocated, recalculate space
 	 * available:
 	 */
 	bch2_journal_space_available(j);
+	bch2_journal_do_writes(j);
 	spin_unlock(&j->lock);
 
 	w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
@@ -1983,25 +2022,29 @@ CLOSURE_CALLBACK(bch2_journal_write)
 	if (ret)
 		goto err;
 
+	if (!JSET_NO_FLUSH(w->data))
+		closure_wait_event(&j->async_wait, j->seq_ondisk + 1 == le64_to_cpu(w->data->seq));
+
 	if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
 		for_each_rw_member(c, ca) {
 			percpu_ref_get(&ca->io_ref);
 
-			bio = ca->journal.bio;
+			struct journal_device *ja = &ca->journal;
+			struct bio *bio = &ja->bio[w->idx]->bio;
 			bio_reset(bio, ca->disk_sb.bdev,
-				  REQ_OP_WRITE|REQ_PREFLUSH);
+				  REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
 			bio->bi_end_io		= journal_write_endio;
 			bio->bi_private		= ca;
 			closure_bio_submit(bio, cl);
 		}
 	}
 
-	continue_at(cl, do_journal_write, c->io_complete_wq);
+	continue_at(cl, do_journal_write, j->wq);
 	return;
 no_io:
-	continue_at(cl, journal_write_done, c->io_complete_wq);
+	continue_at(cl, journal_write_done, j->wq);
 	return;
 err:
 	bch2_fatal_error(c);
-	continue_at(cl, journal_write_done, c->io_complete_wq);
+	continue_at(cl, journal_write_done, j->wq);
 }
diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h
index c035e7c1..f18b9000 100644
--- a/libbcachefs/journal_io.h
+++ b/libbcachefs/journal_io.h
@@ -2,19 +2,22 @@
 #ifndef _BCACHEFS_JOURNAL_IO_H
 #define _BCACHEFS_JOURNAL_IO_H
 
+#include <linux/darray_types.h>
+
+struct journal_ptr {
+	bool		csum_good;
+	u8		dev;
+	u32		bucket;
+	u32		bucket_offset;
+	u64		sector;
+};
+
 /*
  * Only used for holding the journal entries we read in btree_journal_read()
  * during cache_registration
  */
 struct journal_replay {
-	struct journal_ptr {
-		bool		csum_good;
-		u8		dev;
-		u32		bucket;
-		u32		bucket_offset;
-		u64		sector;
-	}			ptrs[BCH_REPLICAS_MAX];
-	unsigned		nr_ptrs;
+	DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs;
 
 	bool			csum_good;
 	bool			ignore;
@@ -62,4 +65,20 @@ int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
 
 CLOSURE_CALLBACK(bch2_journal_write);
 
+static inline struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
+{
+	struct jset_entry *entry = *end;
+	unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
+
+	memset(entry, 0, u64s * sizeof(u64));
+	/*
+	 * The u64s field counts from the start of data, ignoring the shared
+	 * fields.
+	 */
+	entry->u64s = cpu_to_le16(u64s - 1);
+
+	*end = vstruct_next(*end);
+	return entry;
+}
+
 #endif /* _BCACHEFS_JOURNAL_IO_H */
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index 820d25e1..f29fd397 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -62,12 +62,9 @@ void bch2_journal_set_watermark(struct journal *j)
 		? BCH_WATERMARK_reclaim
 		: BCH_WATERMARK_stripe;
 
-	if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space],
-			       &j->low_on_space_start, low_on_space) ||
-	    track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin],
-			       &j->low_on_pin_start, low_on_pin) ||
-	    track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full],
-			       &j->write_buffer_full_start, low_on_wb))
+	if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) ||
+	    track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) ||
+	    track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb))
 		trace_and_count(c, journal_full, c);
 
 	swap(watermark, j->watermark);
@@ -394,8 +391,6 @@ void bch2_journal_pin_copy(struct journal *j,
 			   struct journal_entry_pin *src,
 			   journal_pin_flush_fn flush_fn)
 {
-	bool reclaim;
-
 	spin_lock(&j->lock);
 
 	u64 seq = READ_ONCE(src->seq);
@@ -411,44 +406,44 @@ void bch2_journal_pin_copy(struct journal *j,
 		return;
 	}
 
-	reclaim = __journal_pin_drop(j, dst);
+	bool reclaim = __journal_pin_drop(j, dst);
 
 	bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
 
 	if (reclaim)
 		bch2_journal_reclaim_fast(j);
-	spin_unlock(&j->lock);
 
 	/*
 	 * If the journal is currently full,  we might want to call flush_fn
 	 * immediately:
 	 */
-	journal_wake(j);
+	if (seq == journal_last_seq(j))
+		journal_wake(j);
+	spin_unlock(&j->lock);
 }
 
 void bch2_journal_pin_set(struct journal *j, u64 seq,
 			  struct journal_entry_pin *pin,
 			  journal_pin_flush_fn flush_fn)
 {
-	bool reclaim;
-
 	spin_lock(&j->lock);
 
 	BUG_ON(seq < journal_last_seq(j));
 
-	reclaim = __journal_pin_drop(j, pin);
+	bool reclaim = __journal_pin_drop(j, pin);
 
 	bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
 
 	if (reclaim)
 		bch2_journal_reclaim_fast(j);
-	spin_unlock(&j->lock);
-
 	/*
 	 * If the journal is currently full,  we might want to call flush_fn
 	 * immediately:
 	 */
-	journal_wake(j);
+	if (seq == journal_last_seq(j))
+		journal_wake(j);
+
+	spin_unlock(&j->lock);
 }
 
 /**
diff --git a/libbcachefs/journal_sb.c b/libbcachefs/journal_sb.c
index ae4fb8c3..156691c2 100644
--- a/libbcachefs/journal_sb.c
+++ b/libbcachefs/journal_sb.c
@@ -2,8 +2,8 @@
 
 #include "bcachefs.h"
 #include "journal_sb.h"
-#include "darray.h"
 
+#include <linux/darray.h>
 #include <linux/sort.h>
 
 /* BCH_SB_FIELD_journal: */
diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c
index 0200e299..024c9b1b 100644
--- a/libbcachefs/journal_seq_blacklist.c
+++ b/libbcachefs/journal_seq_blacklist.c
@@ -2,10 +2,11 @@
 
 #include "bcachefs.h"
 #include "btree_iter.h"
-#include "eytzinger.h"
 #include "journal_seq_blacklist.h"
 #include "super-io.h"
 
+#include <linux/eytzinger.h>
+
 /*
  * journal_seq_blacklist machinery:
  *
@@ -119,8 +120,7 @@ out:
 	return ret ?: bch2_blacklist_table_initialize(c);
 }
 
-static int journal_seq_blacklist_table_cmp(const void *_l,
-					   const void *_r, size_t size)
+static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r)
 {
 	const struct journal_seq_blacklist_table_entry *l = _l;
 	const struct journal_seq_blacklist_table_entry *r = _r;
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index 38817c7a..011f7a0d 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -18,6 +18,7 @@
  * the journal that are being staged or in flight.
  */
 struct journal_buf {
+	struct closure		io;
 	struct jset		*data;
 
 	__BKEY_PADDED(key, BCH_REPLICAS_MAX);
@@ -33,10 +34,14 @@ struct journal_buf {
 	unsigned		disk_sectors;	/* maximum size entry could have been, if
 						   buf_size was bigger */
 	unsigned		u64s_reserved;
-	bool			noflush;	/* write has already been kicked off, and was noflush */
-	bool			must_flush;	/* something wants a flush */
-	bool			separate_flush;
-	bool			need_flush_to_write_buffer;
+	bool			noflush:1;	/* write has already been kicked off, and was noflush */
+	bool			must_flush:1;	/* something wants a flush */
+	bool			separate_flush:1;
+	bool			need_flush_to_write_buffer:1;
+	bool			write_started:1;
+	bool			write_allocated:1;
+	bool			write_done:1;
+	u8			idx;
 };
 
 /*
@@ -134,6 +139,7 @@ enum journal_flags {
 /* Reasons we may fail to get a journal reservation: */
 #define JOURNAL_ERRORS()		\
 	x(ok)				\
+	x(retry)			\
 	x(blocked)			\
 	x(max_in_flight)		\
 	x(journal_full)			\
@@ -149,6 +155,13 @@ enum journal_errors {
 
 typedef DARRAY(u64)		darray_u64;
 
+struct journal_bio {
+	struct bch_dev		*ca;
+	unsigned		buf_idx;
+
+	struct bio		bio;
+};
+
 /* Embedded in struct bch_fs */
 struct journal {
 	/* Fastpath stuff up front: */
@@ -203,8 +216,8 @@ struct journal {
 	wait_queue_head_t	wait;
 	struct closure_waitlist	async_wait;
 
-	struct closure		io;
 	struct delayed_work	write_work;
+	struct workqueue_struct *wq;
 
 	/* Sequence number of most recent journal entry (last entry in @pin) */
 	atomic64_t		seq;
@@ -274,14 +287,9 @@ struct journal {
 	u64			nr_noflush_writes;
 	u64			entry_bytes_written;
 
-	u64			low_on_space_start;
-	u64			low_on_pin_start;
-	u64			max_in_flight_start;
-	u64			write_buffer_full_start;
-
-	struct bch2_time_stats	*flush_write_time;
-	struct bch2_time_stats	*noflush_write_time;
-	struct bch2_time_stats	*flush_seq_time;
+	struct time_stats	*flush_write_time;
+	struct time_stats	*noflush_write_time;
+	struct time_stats	*flush_seq_time;
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map	res_map;
@@ -313,7 +321,7 @@ struct journal_device {
 	u64			*buckets;
 
 	/* Bio for journal reads/writes to this device */
-	struct bio		*bio;
+	struct journal_bio	*bio[JOURNAL_BUF_NR];
 
 	/* for bch_journal_read_device */
 	struct closure		read;
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index 5623cee3..69098eeb 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -31,7 +31,7 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
 	nr_good = bch2_bkey_durability(c, k.s_c);
 	if ((!nr_good && !(flags & lost)) ||
 	    (nr_good < replicas && !(flags & degraded)))
-		return -EINVAL;
+		return -BCH_ERR_remove_would_lose_data;
 
 	return 0;
 }
@@ -111,7 +111,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 
 	/* don't handle this yet: */
 	if (flags & BCH_FORCE_IF_METADATA_LOST)
-		return -EINVAL;
+		return -BCH_ERR_remove_with_metadata_missing_unimplemented;
 
 	trans = bch2_trans_get(c);
 	bch2_bkey_buf_init(&k);
@@ -132,10 +132,8 @@ retry:
 
 			ret = drop_dev_ptrs(c, bkey_i_to_s(k.k),
 					    dev_idx, flags, true);
-			if (ret) {
-				bch_err(c, "Cannot drop device without losing data");
+			if (ret)
 				break;
-			}
 
 			ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false);
 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
diff --git a/libbcachefs/nocow_locking.c b/libbcachefs/nocow_locking.c
index 3c21981a..181efa4a 100644
--- a/libbcachefs/nocow_locking.c
+++ b/libbcachefs/nocow_locking.c
@@ -85,7 +85,7 @@ void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
 		u64 start_time = local_clock();
 
 		__closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags));
-		bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
+		time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
 	}
 }
 
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
index cc2672c1..75fdce37 100644
--- a/libbcachefs/replicas.c
+++ b/libbcachefs/replicas.c
@@ -6,12 +6,15 @@
 #include "replicas.h"
 #include "super-io.h"
 
+#include <linux/sort.h>
+
 static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
 					    struct bch_replicas_cpu *);
 
 /* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
-static int bch2_memcmp(const void *l, const void *r, size_t size)
+static int bch2_memcmp(const void *l, const void *r,  const void *priv)
 {
+	size_t size = (size_t) priv;
 	return memcmp(l, r, size);
 }
 
@@ -39,7 +42,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
 
 static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
 {
-	eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL);
+	eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
+			  bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
 }
 
 static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
@@ -824,10 +828,11 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
 {
 	unsigned i;
 
-	sort_cmp_size(cpu_r->entries,
-		      cpu_r->nr,
-		      cpu_r->entry_size,
-		      bch2_memcmp, NULL);
+	sort_r(cpu_r->entries,
+	       cpu_r->nr,
+	       cpu_r->entry_size,
+	       bch2_memcmp, NULL,
+	       (void *)(size_t)cpu_r->entry_size);
 
 	for (i = 0; i < cpu_r->nr; i++) {
 		struct bch_replicas_entry_v1 *e =
diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h
index 654a4b26..983cce78 100644
--- a/libbcachefs/replicas.h
+++ b/libbcachefs/replicas.h
@@ -3,9 +3,10 @@
 #define _BCACHEFS_REPLICAS_H
 
 #include "bkey.h"
-#include "eytzinger.h"
 #include "replicas_types.h"
 
+#include <linux/eytzinger.h>
+
 void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
 void bch2_replicas_entry_to_text(struct printbuf *,
 				 struct bch_replicas_entry_v1 *);
diff --git a/libbcachefs/sb-clean.c b/libbcachefs/sb-clean.c
index b6bf0ebe..5980ba25 100644
--- a/libbcachefs/sb-clean.c
+++ b/libbcachefs/sb-clean.c
@@ -171,22 +171,6 @@ fsck_err:
 	return ERR_PTR(ret);
 }
 
-static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
-{
-	struct jset_entry *entry = *end;
-	unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
-
-	memset(entry, 0, u64s * sizeof(u64));
-	/*
-	 * The u64s field counts from the start of data, ignoring the shared
-	 * fields.
-	 */
-	entry->u64s = cpu_to_le16(u64s - 1);
-
-	*end = vstruct_next(*end);
-	return entry;
-}
-
 void bch2_journal_super_entries_add_common(struct bch_fs *c,
 					   struct jset_entry **end,
 					   u64 journal_seq)
diff --git a/libbcachefs/sb-downgrade.c b/libbcachefs/sb-downgrade.c
index 441dcb1b..626eaaea 100644
--- a/libbcachefs/sb-downgrade.c
+++ b/libbcachefs/sb-downgrade.c
@@ -6,12 +6,13 @@
  */
 
 #include "bcachefs.h"
-#include "darray.h"
 #include "recovery.h"
 #include "sb-downgrade.h"
 #include "sb-errors.h"
 #include "super-io.h"
 
+#include <linux/darray.h>
+
 #define RECOVERY_PASS_ALL_FSCK		BIT_ULL(63)
 
 /*
diff --git a/libbcachefs/sb-errors_types.h b/libbcachefs/sb-errors_types.h
index c08aacdf..63f18c7f 100644
--- a/libbcachefs/sb-errors_types.h
+++ b/libbcachefs/sb-errors_types.h
@@ -2,7 +2,7 @@
 #ifndef _BCACHEFS_SB_ERRORS_TYPES_H
 #define _BCACHEFS_SB_ERRORS_TYPES_H
 
-#include "darray.h"
+#include <linux/darray_types.h>
 
 #define BCH_SB_ERRS()							\
 	x(clean_but_journal_not_empty,				0)	\
@@ -250,7 +250,10 @@
 	x(hash_table_key_duplicate,				242)	\
 	x(hash_table_key_wrong_offset,				243)	\
 	x(unlinked_inode_not_on_deleted_list,			244)	\
-	x(reflink_p_front_pad_bad,				245)
+	x(reflink_p_front_pad_bad,				245)	\
+	x(journal_entry_dup_same_device,			246)	\
+	x(inode_bi_subvol_missing,				247)	\
+	x(inode_bi_subvol_wrong,				248)
 
 enum bch_sb_error_id {
 #define x(t, n) BCH_FSCK_ERR_##t = n,
diff --git a/libbcachefs/sb-members.h b/libbcachefs/sb-members.h
index be0a9418..e4d4d842 100644
--- a/libbcachefs/sb-members.h
+++ b/libbcachefs/sb-members.h
@@ -2,7 +2,7 @@
 #ifndef _BCACHEFS_SB_MEMBERS_H
 #define _BCACHEFS_SB_MEMBERS_H
 
-#include "darray.h"
+#include <linux/darray.h>
 
 extern char * const bch2_member_error_strs[];
 
diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h
index 89fdb7c2..3976f807 100644
--- a/libbcachefs/str_hash.h
+++ b/libbcachefs/str_hash.h
@@ -160,21 +160,16 @@ static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, s
 }
 
 static __always_inline int
-bch2_hash_lookup(struct btree_trans *trans,
+bch2_hash_lookup_in_snapshot(struct btree_trans *trans,
 		 struct btree_iter *iter,
 		 const struct bch_hash_desc desc,
 		 const struct bch_hash_info *info,
 		 subvol_inum inum, const void *key,
-		 unsigned flags)
+		 unsigned flags, u32 snapshot)
 {
 	struct bkey_s_c k;
-	u32 snapshot;
 	int ret;
 
-	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-	if (ret)
-		return ret;
-
 	for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
 			   SPOS(inum.inum, desc.hash_key(info, key), snapshot),
 			   POS(inum.inum, U64_MAX),
@@ -194,6 +189,19 @@ bch2_hash_lookup(struct btree_trans *trans,
 	return ret ?: -BCH_ERR_ENOENT_str_hash_lookup;
 }
 
+static __always_inline int
+bch2_hash_lookup(struct btree_trans *trans,
+		 struct btree_iter *iter,
+		 const struct bch_hash_desc desc,
+		 const struct bch_hash_info *info,
+		 subvol_inum inum, const void *key,
+		 unsigned flags)
+{
+	u32 snapshot;
+	return  bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
+		bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot);
+}
+
 static __always_inline int
 bch2_hash_hole(struct btree_trans *trans,
 	       struct btree_iter *iter,
@@ -251,7 +259,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
 }
 
 static __always_inline
-int bch2_hash_set_snapshot(struct btree_trans *trans,
+int bch2_hash_set_in_snapshot(struct btree_trans *trans,
 			   const struct bch_hash_desc desc,
 			   const struct bch_hash_info *info,
 			   subvol_inum inum, u32 snapshot,
@@ -320,17 +328,12 @@ int bch2_hash_set(struct btree_trans *trans,
 		  struct bkey_i *insert,
 		  bch_str_hash_flags_t str_hash_flags)
 {
-	u32 snapshot;
-	int ret;
-
-	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-	if (ret)
-		return ret;
-
 	insert->k.p.inode = inum.inum;
 
-	return bch2_hash_set_snapshot(trans, desc, info, inum,
-				      snapshot, insert, str_hash_flags, 0);
+	u32 snapshot;
+	return  bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
+		bch2_hash_set_in_snapshot(trans, desc, info, inum,
+					  snapshot, insert, str_hash_flags, 0);
 }
 
 static __always_inline
diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c
index 7c67c28d..e7ee52c3 100644
--- a/libbcachefs/subvolume.c
+++ b/libbcachefs/subvolume.c
@@ -42,6 +42,36 @@ static int check_subvol(struct btree_trans *trans,
 		return ret ?: -BCH_ERR_transaction_restart_nested;
 	}
 
+	struct bch_inode_unpacked inode;
+	struct btree_iter inode_iter = {};
+	ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode,
+				    (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) },
+				    0);
+	bch2_trans_iter_exit(trans, &inode_iter);
+
+	if (ret && !bch2_err_matches(ret, ENOENT))
+		return ret;
+
+	if (fsck_err_on(ret, c, subvol_to_missing_root,
+			"subvolume %llu points to missing subvolume root %llu:%u",
+			k.k->p.offset, le64_to_cpu(subvol.v->inode),
+			le32_to_cpu(subvol.v->snapshot))) {
+		ret = bch2_subvolume_delete(trans, iter->pos.offset);
+		bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
+		return ret ?: -BCH_ERR_transaction_restart_nested;
+	}
+
+	if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset,
+			c, subvol_root_wrong_bi_subvol,
+			"subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu",
+			inode.bi_inum, inode_iter.k.p.snapshot,
+			inode.bi_subvol, subvol.k->p.offset)) {
+		inode.bi_subvol = subvol.k->p.offset;
+		ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot));
+		if (ret)
+			goto err;
+	}
+
 	if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
 		u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
 		u32 snapshot_tree;
@@ -73,6 +103,7 @@ static int check_subvol(struct btree_trans *trans,
 		}
 	}
 
+err:
 fsck_err:
 	return ret;
 }
diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h
index a6f56f66..3ca1d183 100644
--- a/libbcachefs/subvolume.h
+++ b/libbcachefs/subvolume.h
@@ -2,7 +2,6 @@
 #ifndef _BCACHEFS_SUBVOLUME_H
 #define _BCACHEFS_SUBVOLUME_H
 
-#include "darray.h"
 #include "subvolume_types.h"
 
 enum bkey_invalid_flags;
diff --git a/libbcachefs/subvolume_types.h b/libbcachefs/subvolume_types.h
index ae644adf..40f16e3a 100644
--- a/libbcachefs/subvolume_types.h
+++ b/libbcachefs/subvolume_types.h
@@ -2,7 +2,7 @@
 #ifndef _BCACHEFS_SUBVOLUME_TYPES_H
 #define _BCACHEFS_SUBVOLUME_TYPES_H
 
-#include "darray.h"
+#include <linux/darray_types.h>
 
 typedef DARRAY(u32) snapshot_id_list;
 
diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h
index 95e80e06..f3762091 100644
--- a/libbcachefs/super-io.h
+++ b/libbcachefs/super-io.h
@@ -3,12 +3,12 @@
 #define _BCACHEFS_SUPER_IO_H
 
 #include "extents.h"
-#include "eytzinger.h"
 #include "super_types.h"
 #include "super.h"
 #include "sb-members.h"
 
 #include <asm/byteorder.h>
+#include <linux/eytzinger.h>
 
 static inline bool bch2_version_compatible(u16 version)
 {
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index da8697c7..68704a86 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -67,6 +67,7 @@
 #include <linux/percpu.h>
 #include <linux/random.h>
 #include <linux/sysfs.h>
+#include <linux/thread_with_file.h>
 #include <crypto/hash.h>
 
 MODULE_LICENSE("GPL");
@@ -95,16 +96,10 @@ void __bch2_print(struct bch_fs *c, const char *fmt, ...)
 	if (likely(!stdio)) {
 		vprintk(fmt, args);
 	} else {
-		unsigned long flags;
-
 		if (fmt[0] == KERN_SOH[0])
 			fmt += 2;
 
-		spin_lock_irqsave(&stdio->output_lock, flags);
-		prt_vprintf(&stdio->output_buf, fmt, args);
-		spin_unlock_irqrestore(&stdio->output_lock, flags);
-
-		wake_up(&stdio->output_wait);
+		stdio_redirect_vprintf(stdio, true, fmt, args);
 	}
 	va_end(args);
 }
@@ -520,7 +515,7 @@ static void __bch2_fs_free(struct bch_fs *c)
 	unsigned i;
 
 	for (i = 0; i < BCH_TIME_STAT_NR; i++)
-		bch2_time_stats_exit(&c->times[i]);
+		time_stats_exit(&c->times[i]);
 
 	bch2_free_pending_node_rewrites(c);
 	bch2_fs_sb_errors_exit(c);
@@ -576,7 +571,7 @@ static void __bch2_fs_free(struct bch_fs *c)
 		destroy_workqueue(c->btree_update_wq);
 
 	bch2_free_super(&c->disk_sb);
-	kvpfree(c, sizeof(*c));
+	kvfree(c);
 	module_put(THIS_MODULE);
 }
 
@@ -715,7 +710,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	unsigned i, iter_size;
 	int ret = 0;
 
-	c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
+	c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
 	if (!c) {
 		c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc);
 		goto out;
@@ -753,7 +748,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	c->journal_keys.initial_ref_held = true;
 
 	for (i = 0; i < BCH_TIME_STAT_NR; i++)
-		bch2_time_stats_init(&c->times[i]);
+		time_stats_init(&c->times[i]);
 
 	bch2_fs_copygc_init(c);
 	bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
@@ -882,8 +877,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 			BIOSET_NEED_BVECS) ||
 	    !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
 	    !(c->online_reserved = alloc_percpu(u64)) ||
-	    mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
-					c->opts.btree_node_size) ||
+	    mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
+				       c->opts.btree_node_size) ||
 	    mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
 	    !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
 					      sizeof(u64), GFP_KERNEL))) {
@@ -1124,7 +1119,7 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
 		prt_newline(&buf);
 
 		prt_bdevname(&buf, fs->bdev);
-		prt_str(&buf, "believes seq of ");
+		prt_str(&buf, " believes seq of ");
 		prt_bdevname(&buf, sb->bdev);
 		prt_printf(&buf, " to be %llu, but ", seq_from_fs);
 		prt_bdevname(&buf, sb->bdev);
@@ -1168,8 +1163,8 @@ static void bch2_dev_free(struct bch_dev *ca)
 	bch2_dev_buckets_free(ca);
 	free_page((unsigned long) ca->sb_read_scratch);
 
-	bch2_time_stats_exit(&ca->io_latency[WRITE]);
-	bch2_time_stats_exit(&ca->io_latency[READ]);
+	time_stats_quantiles_exit(&ca->io_latency[WRITE]);
+	time_stats_quantiles_exit(&ca->io_latency[READ]);
 
 	percpu_ref_exit(&ca->io_ref);
 	percpu_ref_exit(&ca->ref);
@@ -1260,8 +1255,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
 
 	INIT_WORK(&ca->io_error_work, bch2_io_error_work);
 
-	bch2_time_stats_init(&ca->io_latency[READ]);
-	bch2_time_stats_init(&ca->io_latency[WRITE]);
+	time_stats_quantiles_init(&ca->io_latency[READ]);
+	time_stats_quantiles_init(&ca->io_latency[WRITE]);
 
 	ca->mi = bch2_mi_to_cpu(member);
 
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index cee80c47..c86a93a8 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -930,10 +930,10 @@ SHOW(bch2_dev)
 	sysfs_print(io_latency_write,		atomic64_read(&ca->cur_latency[WRITE]));
 
 	if (attr == &sysfs_io_latency_stats_read)
-		bch2_time_stats_to_text(out, &ca->io_latency[READ]);
+		bch2_time_stats_to_text(out, &ca->io_latency[READ].stats);
 
 	if (attr == &sysfs_io_latency_stats_write)
-		bch2_time_stats_to_text(out, &ca->io_latency[WRITE]);
+		bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats);
 
 	sysfs_printf(congested,			"%u%%",
 		     clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
diff --git a/libbcachefs/thread_with_file.c b/libbcachefs/thread_with_file.c
deleted file mode 100644
index b1c867aa..00000000
--- a/libbcachefs/thread_with_file.c
+++ /dev/null
@@ -1,299 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "printbuf.h"
-#include "thread_with_file.h"
-
-#include <linux/anon_inodes.h>
-#include <linux/file.h>
-#include <linux/kthread.h>
-#include <linux/pagemap.h>
-#include <linux/poll.h>
-
-void bch2_thread_with_file_exit(struct thread_with_file *thr)
-{
-	if (thr->task) {
-		kthread_stop(thr->task);
-		put_task_struct(thr->task);
-	}
-}
-
-int bch2_run_thread_with_file(struct thread_with_file *thr,
-			      const struct file_operations *fops,
-			      int (*fn)(void *))
-{
-	struct file *file = NULL;
-	int ret, fd = -1;
-	unsigned fd_flags = O_CLOEXEC;
-
-	if (fops->read && fops->write)
-		fd_flags |= O_RDWR;
-	else if (fops->read)
-		fd_flags |= O_RDONLY;
-	else if (fops->write)
-		fd_flags |= O_WRONLY;
-
-	char name[TASK_COMM_LEN];
-	get_task_comm(name, current);
-
-	thr->ret = 0;
-	thr->task = kthread_create(fn, thr, "%s", name);
-	ret = PTR_ERR_OR_ZERO(thr->task);
-	if (ret)
-		return ret;
-
-	ret = get_unused_fd_flags(fd_flags);
-	if (ret < 0)
-		goto err;
-	fd = ret;
-
-	file = anon_inode_getfile(name, fops, thr, fd_flags);
-	ret = PTR_ERR_OR_ZERO(file);
-	if (ret)
-		goto err;
-
-	fd_install(fd, file);
-	get_task_struct(thr->task);
-	wake_up_process(thr->task);
-	return fd;
-err:
-	if (fd >= 0)
-		put_unused_fd(fd);
-	if (thr->task)
-		kthread_stop(thr->task);
-	return ret;
-}
-
-static inline bool thread_with_stdio_has_output(struct thread_with_stdio *thr)
-{
-	return thr->stdio.output_buf.pos ||
-		thr->output2.nr ||
-		thr->thr.done;
-}
-
-static ssize_t thread_with_stdio_read(struct file *file, char __user *buf,
-				      size_t len, loff_t *ppos)
-{
-	struct thread_with_stdio *thr =
-		container_of(file->private_data, struct thread_with_stdio, thr);
-	size_t copied = 0, b;
-	int ret = 0;
-
-	if ((file->f_flags & O_NONBLOCK) &&
-	    !thread_with_stdio_has_output(thr))
-		return -EAGAIN;
-
-	ret = wait_event_interruptible(thr->stdio.output_wait,
-		thread_with_stdio_has_output(thr));
-	if (ret)
-		return ret;
-
-	if (thr->thr.done)
-		return 0;
-
-	while (len) {
-		ret = darray_make_room(&thr->output2, thr->stdio.output_buf.pos);
-		if (ret)
-			break;
-
-		spin_lock_irq(&thr->stdio.output_lock);
-		b = min_t(size_t, darray_room(thr->output2), thr->stdio.output_buf.pos);
-
-		memcpy(&darray_top(thr->output2), thr->stdio.output_buf.buf, b);
-		memmove(thr->stdio.output_buf.buf,
-			thr->stdio.output_buf.buf + b,
-			thr->stdio.output_buf.pos - b);
-
-		thr->output2.nr += b;
-		thr->stdio.output_buf.pos -= b;
-		spin_unlock_irq(&thr->stdio.output_lock);
-
-		b = min(len, thr->output2.nr);
-		if (!b)
-			break;
-
-		b -= copy_to_user(buf, thr->output2.data, b);
-		if (!b) {
-			ret = -EFAULT;
-			break;
-		}
-
-		copied	+= b;
-		buf	+= b;
-		len	-= b;
-
-		memmove(thr->output2.data,
-			thr->output2.data + b,
-			thr->output2.nr - b);
-		thr->output2.nr -= b;
-	}
-
-	return copied ?: ret;
-}
-
-static int thread_with_stdio_release(struct inode *inode, struct file *file)
-{
-	struct thread_with_stdio *thr =
-		container_of(file->private_data, struct thread_with_stdio, thr);
-
-	bch2_thread_with_file_exit(&thr->thr);
-	printbuf_exit(&thr->stdio.input_buf);
-	printbuf_exit(&thr->stdio.output_buf);
-	darray_exit(&thr->output2);
-	thr->exit(thr);
-	return 0;
-}
-
-#define WRITE_BUFFER		4096
-
-static inline bool thread_with_stdio_has_input_space(struct thread_with_stdio *thr)
-{
-	return thr->stdio.input_buf.pos < WRITE_BUFFER || thr->thr.done;
-}
-
-static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf,
-				       size_t len, loff_t *ppos)
-{
-	struct thread_with_stdio *thr =
-		container_of(file->private_data, struct thread_with_stdio, thr);
-	struct printbuf *buf = &thr->stdio.input_buf;
-	size_t copied = 0;
-	ssize_t ret = 0;
-
-	while (len) {
-		if (thr->thr.done) {
-			ret = -EPIPE;
-			break;
-		}
-
-		size_t b = len - fault_in_readable(ubuf, len);
-		if (!b) {
-			ret = -EFAULT;
-			break;
-		}
-
-		spin_lock(&thr->stdio.input_lock);
-		if (buf->pos < WRITE_BUFFER)
-			bch2_printbuf_make_room(buf, min(b, WRITE_BUFFER - buf->pos));
-		b = min(len, printbuf_remaining_size(buf));
-
-		if (b && !copy_from_user_nofault(&buf->buf[buf->pos], ubuf, b)) {
-			ubuf += b;
-			len -= b;
-			copied += b;
-			buf->pos += b;
-		}
-		spin_unlock(&thr->stdio.input_lock);
-
-		if (b) {
-			wake_up(&thr->stdio.input_wait);
-		} else {
-			if ((file->f_flags & O_NONBLOCK)) {
-				ret = -EAGAIN;
-				break;
-			}
-
-			ret = wait_event_interruptible(thr->stdio.input_wait,
-					thread_with_stdio_has_input_space(thr));
-			if (ret)
-				break;
-		}
-	}
-
-	return copied ?: ret;
-}
-
-static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_struct *wait)
-{
-	struct thread_with_stdio *thr =
-		container_of(file->private_data, struct thread_with_stdio, thr);
-
-	poll_wait(file, &thr->stdio.output_wait, wait);
-	poll_wait(file, &thr->stdio.input_wait, wait);
-
-	__poll_t mask = 0;
-
-	if (thread_with_stdio_has_output(thr))
-		mask |= EPOLLIN;
-	if (thread_with_stdio_has_input_space(thr))
-		mask |= EPOLLOUT;
-	if (thr->thr.done)
-		mask |= EPOLLHUP|EPOLLERR;
-	return mask;
-}
-
-static const struct file_operations thread_with_stdio_fops = {
-	.release	= thread_with_stdio_release,
-	.read		= thread_with_stdio_read,
-	.write		= thread_with_stdio_write,
-	.poll		= thread_with_stdio_poll,
-	.llseek		= no_llseek,
-};
-
-int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
-			       void (*exit)(struct thread_with_stdio *),
-			       int (*fn)(void *))
-{
-	thr->stdio.input_buf = PRINTBUF;
-	thr->stdio.input_buf.atomic++;
-	spin_lock_init(&thr->stdio.input_lock);
-	init_waitqueue_head(&thr->stdio.input_wait);
-
-	thr->stdio.output_buf = PRINTBUF;
-	thr->stdio.output_buf.atomic++;
-	spin_lock_init(&thr->stdio.output_lock);
-	init_waitqueue_head(&thr->stdio.output_wait);
-
-	darray_init(&thr->output2);
-	thr->exit = exit;
-
-	return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn);
-}
-
-int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *buf, size_t len)
-{
-	wait_event(stdio->input_wait,
-		   stdio->input_buf.pos || stdio->done);
-
-	if (stdio->done)
-		return -1;
-
-	spin_lock(&stdio->input_lock);
-	int ret = min(len, stdio->input_buf.pos);
-	stdio->input_buf.pos -= ret;
-	memcpy(buf, stdio->input_buf.buf, ret);
-	memmove(stdio->input_buf.buf,
-		stdio->input_buf.buf + ret,
-		stdio->input_buf.pos);
-	spin_unlock(&stdio->input_lock);
-
-	wake_up(&stdio->input_wait);
-	return ret;
-}
-
-int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *buf, size_t len)
-{
-	wait_event(stdio->input_wait,
-		   stdio->input_buf.pos || stdio->done);
-
-	if (stdio->done)
-		return -1;
-
-	spin_lock(&stdio->input_lock);
-	int ret = min(len, stdio->input_buf.pos);
-	char *n = memchr(stdio->input_buf.buf, '\n', ret);
-	if (n)
-		ret = min(ret, n + 1 - stdio->input_buf.buf);
-	stdio->input_buf.pos -= ret;
-	memcpy(buf, stdio->input_buf.buf, ret);
-	memmove(stdio->input_buf.buf,
-		stdio->input_buf.buf + ret,
-		stdio->input_buf.pos);
-	spin_unlock(&stdio->input_lock);
-
-	wake_up(&stdio->input_wait);
-	return ret;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/libbcachefs/thread_with_file.h b/libbcachefs/thread_with_file.h
deleted file mode 100644
index 05879c50..00000000
--- a/libbcachefs/thread_with_file.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_THREAD_WITH_FILE_H
-#define _BCACHEFS_THREAD_WITH_FILE_H
-
-#include "thread_with_file_types.h"
-
-struct task_struct;
-
-struct thread_with_file {
-	struct task_struct	*task;
-	int			ret;
-	bool			done;
-};
-
-void bch2_thread_with_file_exit(struct thread_with_file *);
-int bch2_run_thread_with_file(struct thread_with_file *,
-			      const struct file_operations *,
-			      int (*fn)(void *));
-
-struct thread_with_stdio {
-	struct thread_with_file	thr;
-	struct stdio_redirect	stdio;
-	DARRAY(char)		output2;
-	void			(*exit)(struct thread_with_stdio *);
-};
-
-static inline void thread_with_stdio_done(struct thread_with_stdio *thr)
-{
-	thr->thr.done = true;
-	thr->stdio.done = true;
-	wake_up(&thr->stdio.input_wait);
-	wake_up(&thr->stdio.output_wait);
-}
-
-int bch2_run_thread_with_stdio(struct thread_with_stdio *,
-			       void (*exit)(struct thread_with_stdio *),
-			       int (*fn)(void *));
-int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t);
-int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t);
-
-#endif /* _BCACHEFS_THREAD_WITH_FILE_H */
diff --git a/libbcachefs/thread_with_file_types.h b/libbcachefs/thread_with_file_types.h
deleted file mode 100644
index 90b5e645..00000000
--- a/libbcachefs/thread_with_file_types.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H
-#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H
-
-struct stdio_redirect {
-	spinlock_t		output_lock;
-	wait_queue_head_t	output_wait;
-	struct printbuf		output_buf;
-
-	spinlock_t		input_lock;
-	wait_queue_head_t	input_wait;
-	struct printbuf		input_buf;
-	bool			done;
-};
-
-#endif /* _BCACHEFS_THREAD_WITH_FILE_TYPES_H */
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 56b815fd..53973503 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -11,6 +11,7 @@
 #include <linux/console.h>
 #include <linux/ctype.h>
 #include <linux/debugfs.h>
+#include <linux/eytzinger.h>
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/log2.h>
@@ -22,9 +23,8 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/sched/clock.h>
+#include <linux/mean_and_variance.h>
 
-#include "eytzinger.h"
-#include "mean_and_variance.h"
 #include "util.h"
 
 static const char si_units[] = "?kMGTPEZY";
@@ -337,32 +337,6 @@ void bch2_prt_datetime(struct printbuf *out, time64_t sec)
 }
 #endif
 
-static const struct time_unit {
-	const char	*name;
-	u64		nsecs;
-} time_units[] = {
-	{ "ns",		1		 },
-	{ "us",		NSEC_PER_USEC	 },
-	{ "ms",		NSEC_PER_MSEC	 },
-	{ "s",		NSEC_PER_SEC	 },
-	{ "m",          (u64) NSEC_PER_SEC * 60},
-	{ "h",          (u64) NSEC_PER_SEC * 3600},
-	{ "eon",        U64_MAX          },
-};
-
-static const struct time_unit *pick_time_units(u64 ns)
-{
-	const struct time_unit *u;
-
-	for (u = time_units;
-	     u + 1 < time_units + ARRAY_SIZE(time_units) &&
-	     ns >= u[1].nsecs << 1;
-	     u++)
-		;
-
-	return u;
-}
-
 void bch2_pr_time_units(struct printbuf *out, u64 ns)
 {
 	const struct time_unit *u = pick_time_units(ns);
@@ -370,120 +344,6 @@ void bch2_pr_time_units(struct printbuf *out, u64 ns)
 	prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
 }
 
-/* time stats: */
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v)
-{
-	unsigned i = 0;
-
-	while (i < ARRAY_SIZE(q->entries)) {
-		struct bch2_quantile_entry *e = q->entries + i;
-
-		if (unlikely(!e->step)) {
-			e->m = v;
-			e->step = max_t(unsigned, v / 2, 1024);
-		} else if (e->m > v) {
-			e->m = e->m >= e->step
-				? e->m - e->step
-				: 0;
-		} else if (e->m < v) {
-			e->m = e->m + e->step > e->m
-				? e->m + e->step
-				: U32_MAX;
-		}
-
-		if ((e->m > v ? e->m - v : v - e->m) < e->step)
-			e->step = max_t(unsigned, e->step / 2, 1);
-
-		if (v >= e->m)
-			break;
-
-		i = eytzinger0_child(i, v > e->m);
-	}
-}
-
-static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
-					      u64 start, u64 end)
-{
-	u64 duration, freq;
-
-	if (time_after64(end, start)) {
-		duration = end - start;
-		mean_and_variance_update(&stats->duration_stats, duration);
-		mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
-		stats->max_duration = max(stats->max_duration, duration);
-		stats->min_duration = min(stats->min_duration, duration);
-		stats->total_duration += duration;
-		bch2_quantiles_update(&stats->quantiles, duration);
-	}
-
-	if (time_after64(end, stats->last_event)) {
-		freq = end - stats->last_event;
-		mean_and_variance_update(&stats->freq_stats, freq);
-		mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq);
-		stats->max_freq = max(stats->max_freq, freq);
-		stats->min_freq = min(stats->min_freq, freq);
-		stats->last_event = end;
-	}
-}
-
-static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
-					   struct bch2_time_stat_buffer *b)
-{
-	for (struct bch2_time_stat_buffer_entry *i = b->entries;
-	     i < b->entries + ARRAY_SIZE(b->entries);
-	     i++)
-		bch2_time_stats_update_one(stats, i->start, i->end);
-	b->nr = 0;
-}
-
-static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
-						  struct bch2_time_stat_buffer *b)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&stats->lock, flags);
-	__bch2_time_stats_clear_buffer(stats, b);
-	spin_unlock_irqrestore(&stats->lock, flags);
-}
-
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
-{
-	unsigned long flags;
-
-	WARN_ONCE(!stats->duration_stats_weighted.weight ||
-		  !stats->freq_stats_weighted.weight,
-		  "uninitialized time_stats");
-
-	if (!stats->buffer) {
-		spin_lock_irqsave(&stats->lock, flags);
-		bch2_time_stats_update_one(stats, start, end);
-
-		if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 &&
-		    stats->duration_stats.n > 1024)
-			stats->buffer =
-				alloc_percpu_gfp(struct bch2_time_stat_buffer,
-						 GFP_ATOMIC);
-		spin_unlock_irqrestore(&stats->lock, flags);
-	} else {
-		struct bch2_time_stat_buffer *b;
-
-		preempt_disable();
-		b = this_cpu_ptr(stats->buffer);
-
-		BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
-		b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) {
-			.start = start,
-			.end = end
-		};
-
-		if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
-			bch2_time_stats_clear_buffer(stats, b);
-		preempt_enable();
-	}
-}
-
 static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
 {
 	const struct time_unit *u = pick_time_units(ns);
@@ -503,19 +363,18 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64
 
 #define TABSTOP_SIZE 12
 
-void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
+void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats)
 {
-	const struct time_unit *u;
+	struct quantiles *quantiles = time_stats_to_quantiles(stats);
 	s64 f_mean = 0, d_mean = 0;
-	u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
-	int i;
+	u64 f_stddev = 0, d_stddev = 0;
 
 	if (stats->buffer) {
 		int cpu;
 
 		spin_lock_irq(&stats->lock);
 		for_each_possible_cpu(cpu)
-			__bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
+			__time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
 		spin_unlock_irq(&stats->lock);
 	}
 
@@ -570,14 +429,14 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
 	prt_tab(out);
 	bch2_pr_time_units_aligned(out, d_mean);
 	prt_tab(out);
-	bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
+	bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
 	prt_newline(out);
 
 	prt_printf(out, "stddev:");
 	prt_tab(out);
 	bch2_pr_time_units_aligned(out, d_stddev);
 	prt_tab(out);
-	bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
+	bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
 
 	printbuf_indent_sub(out, 2);
 	prt_newline(out);
@@ -593,53 +452,38 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
 	prt_tab(out);
 	bch2_pr_time_units_aligned(out, f_mean);
 	prt_tab(out);
-	bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
+	bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
 	prt_newline(out);
 
 	prt_printf(out, "stddev:");
 	prt_tab(out);
 	bch2_pr_time_units_aligned(out, f_stddev);
 	prt_tab(out);
-	bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
+	bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
 
 	printbuf_indent_sub(out, 2);
 	prt_newline(out);
 
 	printbuf_tabstops_reset(out);
 
-	i = eytzinger0_first(NR_QUANTILES);
-	u = pick_time_units(stats->quantiles.entries[i].m);
+	if (quantiles) {
+		int i = eytzinger0_first(NR_QUANTILES);
+		const struct time_unit *u =
+			pick_time_units(quantiles->entries[i].m);
+		u64 last_q = 0;
 
-	prt_printf(out, "quantiles (%s):\t", u->name);
-	eytzinger0_for_each(i, NR_QUANTILES) {
-		bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+		prt_printf(out, "quantiles (%s):\t", u->name);
+		eytzinger0_for_each(i, NR_QUANTILES) {
+			bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
 
-		q = max(stats->quantiles.entries[i].m, last_q);
-		prt_printf(out, "%llu ",
-		       div_u64(q, u->nsecs));
-		if (is_last)
-			prt_newline(out);
-		last_q = q;
+			u64 q = max(quantiles->entries[i].m, last_q);
+			prt_printf(out, "%llu ", div_u64(q, u->nsecs));
+			if (is_last)
+				prt_newline(out);
+			last_q = q;
+		}
 	}
 }
-#else
-void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {}
-#endif
-
-void bch2_time_stats_exit(struct bch2_time_stats *stats)
-{
-	free_percpu(stats->buffer);
-}
-
-void bch2_time_stats_init(struct bch2_time_stats *stats)
-{
-	memset(stats, 0, sizeof(*stats));
-	stats->duration_stats_weighted.weight = 8;
-	stats->freq_stats_weighted.weight = 8;
-	stats->min_duration = U64_MAX;
-	stats->min_freq = U64_MAX;
-	spin_lock_init(&stats->lock);
-}
 
 /* ratelimit: */
 
@@ -863,171 +707,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
 	}
 }
 
-static int alignment_ok(const void *base, size_t align)
-{
-	return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
-		((unsigned long)base & (align - 1)) == 0;
-}
-
-static void u32_swap(void *a, void *b, size_t size)
-{
-	u32 t = *(u32 *)a;
-	*(u32 *)a = *(u32 *)b;
-	*(u32 *)b = t;
-}
-
-static void u64_swap(void *a, void *b, size_t size)
-{
-	u64 t = *(u64 *)a;
-	*(u64 *)a = *(u64 *)b;
-	*(u64 *)b = t;
-}
-
-static void generic_swap(void *a, void *b, size_t size)
-{
-	char t;
-
-	do {
-		t = *(char *)a;
-		*(char *)a++ = *(char *)b;
-		*(char *)b++ = t;
-	} while (--size > 0);
-}
-
-static inline int do_cmp(void *base, size_t n, size_t size,
-			 int (*cmp_func)(const void *, const void *, size_t),
-			 size_t l, size_t r)
-{
-	return cmp_func(base + inorder_to_eytzinger0(l, n) * size,
-			base + inorder_to_eytzinger0(r, n) * size,
-			size);
-}
-
-static inline void do_swap(void *base, size_t n, size_t size,
-			   void (*swap_func)(void *, void *, size_t),
-			   size_t l, size_t r)
-{
-	swap_func(base + inorder_to_eytzinger0(l, n) * size,
-		  base + inorder_to_eytzinger0(r, n) * size,
-		  size);
-}
-
-void eytzinger0_sort(void *base, size_t n, size_t size,
-		     int (*cmp_func)(const void *, const void *, size_t),
-		     void (*swap_func)(void *, void *, size_t))
-{
-	int i, c, r;
-
-	if (!swap_func) {
-		if (size == 4 && alignment_ok(base, 4))
-			swap_func = u32_swap;
-		else if (size == 8 && alignment_ok(base, 8))
-			swap_func = u64_swap;
-		else
-			swap_func = generic_swap;
-	}
-
-	/* heapify */
-	for (i = n / 2 - 1; i >= 0; --i) {
-		for (r = i; r * 2 + 1 < n; r = c) {
-			c = r * 2 + 1;
-
-			if (c + 1 < n &&
-			    do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
-				c++;
-
-			if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
-				break;
-
-			do_swap(base, n, size, swap_func, r, c);
-		}
-	}
-
-	/* sort */
-	for (i = n - 1; i > 0; --i) {
-		do_swap(base, n, size, swap_func, 0, i);
-
-		for (r = 0; r * 2 + 1 < i; r = c) {
-			c = r * 2 + 1;
-
-			if (c + 1 < i &&
-			    do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
-				c++;
-
-			if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
-				break;
-
-			do_swap(base, n, size, swap_func, r, c);
-		}
-	}
-}
-
-void sort_cmp_size(void *base, size_t num, size_t size,
-	  int (*cmp_func)(const void *, const void *, size_t),
-	  void (*swap_func)(void *, void *, size_t size))
-{
-	/* pre-scale counters for performance */
-	int i = (num/2 - 1) * size, n = num * size, c, r;
-
-	if (!swap_func) {
-		if (size == 4 && alignment_ok(base, 4))
-			swap_func = u32_swap;
-		else if (size == 8 && alignment_ok(base, 8))
-			swap_func = u64_swap;
-		else
-			swap_func = generic_swap;
-	}
-
-	/* heapify */
-	for ( ; i >= 0; i -= size) {
-		for (r = i; r * 2 + size < n; r  = c) {
-			c = r * 2 + size;
-			if (c < n - size &&
-			    cmp_func(base + c, base + c + size, size) < 0)
-				c += size;
-			if (cmp_func(base + r, base + c, size) >= 0)
-				break;
-			swap_func(base + r, base + c, size);
-		}
-	}
-
-	/* sort */
-	for (i = n - size; i > 0; i -= size) {
-		swap_func(base, base + i, size);
-		for (r = 0; r * 2 + size < i; r = c) {
-			c = r * 2 + size;
-			if (c < i - size &&
-			    cmp_func(base + c, base + c + size, size) < 0)
-				c += size;
-			if (cmp_func(base + r, base + c, size) >= 0)
-				break;
-			swap_func(base + r, base + c, size);
-		}
-	}
-}
-
-static void mempool_free_vp(void *element, void *pool_data)
-{
-	size_t size = (size_t) pool_data;
-
-	vpfree(element, size);
-}
-
-static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
-{
-	size_t size = (size_t) pool_data;
-
-	return vpmalloc(size, gfp_mask);
-}
-
-int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
-{
-	return size < PAGE_SIZE
-		? mempool_init_kmalloc_pool(pool, min_nr, size)
-		: mempool_init(pool, min_nr, mempool_alloc_vp,
-			       mempool_free_vp, (void *) size);
-}
-
 #if 0
 void eytzinger1_test(void)
 {
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index b414736d..1b3aced8 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -5,22 +5,21 @@
 #include <linux/bio.h>
 #include <linux/blkdev.h>
 #include <linux/closure.h>
+#include <linux/darray.h>
 #include <linux/errno.h>
 #include <linux/freezer.h>
 #include <linux/kernel.h>
-#include <linux/sched/clock.h>
 #include <linux/llist.h>
 #include <linux/log2.h>
 #include <linux/percpu.h>
 #include <linux/preempt.h>
 #include <linux/ratelimit.h>
+#include <linux/sched/clock.h>
 #include <linux/slab.h>
+#include <linux/time_stats.h>
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
-
-#include "mean_and_variance.h"
-
-#include "darray.h"
+#include <linux/mean_and_variance.h>
 
 struct closure;
 
@@ -53,38 +52,6 @@ static inline size_t buf_pages(void *p, size_t len)
 			    PAGE_SIZE);
 }
 
-static inline void vpfree(void *p, size_t size)
-{
-	if (is_vmalloc_addr(p))
-		vfree(p);
-	else
-		free_pages((unsigned long) p, get_order(size));
-}
-
-static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
-{
-	return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
-					 get_order(size)) ?:
-		__vmalloc(size, gfp_mask);
-}
-
-static inline void kvpfree(void *p, size_t size)
-{
-	if (size < PAGE_SIZE)
-		kfree(p);
-	else
-		vpfree(p, size);
-}
-
-static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
-{
-	return size < PAGE_SIZE
-		? kmalloc(size, gfp_mask)
-		: vpmalloc(size, gfp_mask);
-}
-
-int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
-
 #define HEAP(type)							\
 struct {								\
 	size_t size, used;						\
@@ -97,13 +64,13 @@ struct {								\
 ({									\
 	(heap)->used = 0;						\
 	(heap)->size = (_size);						\
-	(heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\
+	(heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\
 				 (gfp));				\
 })
 
 #define free_heap(heap)							\
 do {									\
-	kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0]));	\
+	kvfree((heap)->data);						\
 	(heap)->data = NULL;						\
 } while (0)
 
@@ -361,83 +328,7 @@ static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
 #endif
 }
 
-#define NR_QUANTILES	15
-#define QUANTILE_IDX(i)	inorder_to_eytzinger0(i, NR_QUANTILES)
-#define QUANTILE_FIRST	eytzinger0_first(NR_QUANTILES)
-#define QUANTILE_LAST	eytzinger0_last(NR_QUANTILES)
-
-struct bch2_quantiles {
-	struct bch2_quantile_entry {
-		u64	m;
-		u64	step;
-	}		entries[NR_QUANTILES];
-};
-
-struct bch2_time_stat_buffer {
-	unsigned	nr;
-	struct bch2_time_stat_buffer_entry {
-		u64	start;
-		u64	end;
-	}		entries[32];
-};
-
-struct bch2_time_stats {
-	spinlock_t	lock;
-	/* all fields are in nanoseconds */
-	u64             min_duration;
-	u64		max_duration;
-	u64		total_duration;
-	u64             max_freq;
-	u64             min_freq;
-	u64		last_event;
-	struct bch2_quantiles quantiles;
-
-	struct mean_and_variance	  duration_stats;
-	struct mean_and_variance_weighted duration_stats_weighted;
-	struct mean_and_variance	  freq_stats;
-	struct mean_and_variance_weighted freq_stats_weighted;
-	struct bch2_time_stat_buffer __percpu *buffer;
-};
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
-
-static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
-{
-	__bch2_time_stats_update(stats, start, local_clock());
-}
-
-static inline bool track_event_change(struct bch2_time_stats *stats,
-				      u64 *start, bool v)
-{
-	if (v != !!*start) {
-		if (!v) {
-			bch2_time_stats_update(stats, *start);
-			*start = 0;
-		} else {
-			*start = local_clock() ?: 1;
-			return true;
-		}
-	}
-
-	return false;
-}
-#else
-static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
-static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {}
-static inline bool track_event_change(struct bch2_time_stats *stats,
-				      u64 *start, bool v)
-{
-	bool ret = v && !*start;
-	*start = v;
-	return ret;
-}
-#endif
-
-void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
-
-void bch2_time_stats_exit(struct bch2_time_stats *);
-void bch2_time_stats_init(struct bch2_time_stats *);
+void bch2_time_stats_to_text(struct printbuf *, struct time_stats *);
 
 #define ewma_add(ewma, val, weight)					\
 ({									\
@@ -738,34 +629,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
 	memset(s + bytes, c, rem);
 }
 
-void sort_cmp_size(void *base, size_t num, size_t size,
-	  int (*cmp_func)(const void *, const void *, size_t),
-	  void (*swap_func)(void *, void *, size_t));
-
-/* just the memmove, doesn't update @_nr */
-#define __array_insert_item(_array, _nr, _pos)				\
-	memmove(&(_array)[(_pos) + 1],					\
-		&(_array)[(_pos)],					\
-		sizeof((_array)[0]) * ((_nr) - (_pos)))
-
-#define array_insert_item(_array, _nr, _pos, _new_item)			\
-do {									\
-	__array_insert_item(_array, _nr, _pos);				\
-	(_nr)++;							\
-	(_array)[(_pos)] = (_new_item);					\
-} while (0)
-
-#define array_remove_items(_array, _nr, _pos, _nr_to_remove)		\
-do {									\
-	(_nr) -= (_nr_to_remove);					\
-	memmove(&(_array)[(_pos)],					\
-		&(_array)[(_pos) + (_nr_to_remove)],			\
-		sizeof((_array)[0]) * ((_nr) - (_pos)));		\
-} while (0)
-
-#define array_remove_item(_array, _nr, _pos)				\
-	array_remove_items(_array, _nr, _pos, 1)
-
 static inline void __move_gap(void *array, size_t element_size,
 			      size_t nr, size_t size,
 			      size_t old_gap, size_t new_gap)
diff --git a/libbcachefs/darray.c b/linux/darray.c
similarity index 68%
rename from libbcachefs/darray.c
rename to linux/darray.c
index ac35b8b7..80e77959 100644
--- a/libbcachefs/darray.c
+++ b/linux/darray.c
@@ -1,10 +1,13 @@
 // SPDX-License-Identifier: GPL-2.0
+/*
+ * (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev>
+ */
 
+#include <linux/darray.h>
 #include <linux/log2.h>
 #include <linux/slab.h>
-#include "darray.h"
 
-int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
+int __darray_resize_slowpath(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
 {
 	if (new_size > d->size) {
 		new_size = roundup_pow_of_two(new_size);
diff --git a/libbcachefs/mean_and_variance.c b/linux/mean_and_variance.c
similarity index 93%
rename from libbcachefs/mean_and_variance.c
rename to linux/mean_and_variance.c
index bf0ef668..b93d150d 100644
--- a/libbcachefs/mean_and_variance.c
+++ b/linux/mean_and_variance.c
@@ -40,10 +40,9 @@
 #include <linux/limits.h>
 #include <linux/math.h>
 #include <linux/math64.h>
+#include <linux/mean_and_variance.h>
 #include <linux/module.h>
 
-#include "mean_and_variance.h"
-
 u128_u u128_div(u128_u n, u64 d)
 {
 	u128_u r;
@@ -107,10 +106,11 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
  * see linked pdf: function derived from equations 140-143 where alpha = 2^w.
  * values are stored bitshifted for performance and added precision.
  */
-void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x)
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
+		s64 x, bool initted, u8 weight)
 {
 	// previous weighted variance.
-	u8 w		= s->weight;
+	u8 w		= weight;
 	u64 var_w0	= s->variance;
 	// new value weighted.
 	s64 x_w		= x << w;
@@ -119,14 +119,13 @@ void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64
 	// new mean weighted.
 	s64 u_w1	= s->mean + diff;
 
-	if (!s->init) {
+	if (!initted) {
 		s->mean = x_w;
 		s->variance = 0;
 	} else {
 		s->mean = u_w1;
 		s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
 	}
-	s->init = true;
 }
 EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
 
@@ -134,9 +133,10 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
  * mean_and_variance_weighted_get_mean() - get mean from @s
  * @s: mean and variance number of samples and their sums
  */
-s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s)
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
+		u8 weight)
 {
-	return fast_divpow2(s.mean, s.weight);
+	return fast_divpow2(s.mean, weight);
 }
 EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
 
@@ -144,10 +144,11 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
  * mean_and_variance_weighted_get_variance() -- get variance from @s
  * @s: mean and variance number of samples and their sums
  */
-u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s)
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
+		u8 weight)
 {
 	// always positive don't need fast divpow2
-	return s.variance >> s.weight;
+	return s.variance >> weight;
 }
 EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
 
@@ -155,9 +156,10 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
  * mean_and_variance_weighted_get_stddev() - get standard deviation from @s
  * @s: mean and variance number of samples and their sums
  */
-u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s)
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
+		u8 weight)
 {
-	return int_sqrt64(mean_and_variance_weighted_get_variance(s));
+	return int_sqrt64(mean_and_variance_weighted_get_variance(s, weight));
 }
 EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev);
 
diff --git a/linux/mempool.c b/linux/mempool.c
index 74d4fbb3..74ed17bf 100644
--- a/linux/mempool.c
+++ b/linux/mempool.c
@@ -522,6 +522,19 @@ void mempool_kfree(void *element, void *pool_data)
 }
 EXPORT_SYMBOL(mempool_kfree);
 
+void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data)
+{
+	size_t size = (size_t)pool_data;
+	return kvmalloc(size, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_kvmalloc);
+
+void mempool_kvfree(void *element, void *pool_data)
+{
+	kvfree(element);
+}
+EXPORT_SYMBOL(mempool_kvfree);
+
 /*
  * A simple mempool-backed page allocator that allocates pages
  * of the order specified by pool_data.
diff --git a/linux/sort.c b/linux/sort.c
new file mode 100644
index 00000000..ffa4817b
--- /dev/null
+++ b/linux/sort.c
@@ -0,0 +1,368 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A fast, small, non-recursive O(n log n) sort for the Linux kernel
+ *
+ * This performs n*log2(n) + 0.37*n + o(n) comparisons on average,
+ * and 1.5*n*log2(n) + O(n) in the (very contrived) worst case.
+ *
+ * Glibc qsort() manages n*log2(n) - 1.26*n for random inputs (1.63*n
+ * better) at the expense of stack usage and much larger code to avoid
+ * quicksort's O(n^2) worst case.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/export.h>
+#include <linux/sort.h>
+
+/**
+ * is_aligned - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required alignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+__attribute_const__ __always_inline
+static bool is_aligned(const void *base, size_t size, unsigned char align)
+{
+	unsigned char lsbits = (unsigned char)size;
+
+	(void)base;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	lsbits |= (unsigned char)(uintptr_t)base;
+#endif
+	return (lsbits & (align - 1)) == 0;
+}
+
+/**
+ * swap_words_32 - swap two elements in 32-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory.  This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is still valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
+static void swap_words_32(void *a, void *b, size_t n)
+{
+	do {
+		u32 t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+	} while (n);
+}
+
+/**
+ * swap_words_64 - swap two elements in 64-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory.  This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible.  If they're not, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not.  If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI).  Are there any cases the kernel needs to worry about?
+ */
+static void swap_words_64(void *a, void *b, size_t n)
+{
+	do {
+#ifdef CONFIG_64BIT
+		u64 t = *(u64 *)(a + (n -= 8));
+		*(u64 *)(a + n) = *(u64 *)(b + n);
+		*(u64 *)(b + n) = t;
+#else
+		/* Use two 32-bit transfers to avoid base+index+4 addressing */
+		u32 t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+
+		t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+#endif
+	} while (n);
+}
+
+/**
+ * swap_bytes - swap two elements a byte at a time
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size
+ *
+ * This is the fallback if alignment doesn't allow using larger chunks.
+ */
+static void swap_bytes(void *a, void *b, size_t n)
+{
+	do {
+		char t = ((char *)a)[--n];
+		((char *)a)[n] = ((char *)b)[n];
+		((char *)b)[n] = t;
+	} while (n);
+}
+
+/*
+ * The values are arbitrary as long as they can't be confused with
+ * a pointer, but small integers make for the smallest compare
+ * instructions.
+ */
+#define SWAP_WORDS_64 (swap_r_func_t)0
+#define SWAP_WORDS_32 (swap_r_func_t)1
+#define SWAP_BYTES    (swap_r_func_t)2
+#define SWAP_WRAPPER  (swap_r_func_t)3
+
+struct wrapper {
+	cmp_func_t cmp;
+	swap_func_t swap;
+};
+
+/*
+ * The function pointer is last to make tail calls most efficient if the
+ * compiler decides not to inline this function.
+ */
+static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
+{
+	if (swap_func == SWAP_WRAPPER) {
+		((const struct wrapper *)priv)->swap(a, b, (int)size);
+		return;
+	}
+
+	if (swap_func == SWAP_WORDS_64)
+		swap_words_64(a, b, size);
+	else if (swap_func == SWAP_WORDS_32)
+		swap_words_32(a, b, size);
+	else if (swap_func == SWAP_BYTES)
+		swap_bytes(a, b, size);
+	else
+		swap_func(a, b, (int)size, priv);
+}
+
+#define _CMP_WRAPPER ((cmp_r_func_t)0L)
+
+static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv)
+{
+	if (cmp == _CMP_WRAPPER)
+		return ((const struct wrapper *)priv)->cmp(a, b);
+	return cmp(a, b, priv);
+}
+
+/**
+ * parent - given the offset of the child, find the offset of the parent.
+ * @i: the offset of the heap element whose parent is sought.  Non-zero.
+ * @lsbit: a precomputed 1-bit mask, equal to "size & -size"
+ * @size: size of each element
+ *
+ * In terms of array indexes, the parent of element j = @i/@size is simply
+ * (j-1)/2.  But when working in byte offsets, we can't use implicit
+ * truncation of integer divides.
+ *
+ * Fortunately, we only need one bit of the quotient, not the full divide.
+ * @size has a least significant bit.  That bit will be clear if @i is
+ * an even multiple of @size, and set if it's an odd multiple.
+ *
+ * Logically, we're doing "if (i & lsbit) i -= size;", but since the
+ * branch is unpredictable, it's done with a bit of clever branch-free
+ * code instead.
+ */
+__attribute_const__ __always_inline
+static size_t parent(size_t i, unsigned int lsbit, size_t size)
+{
+	i -= size;
+	i -= size & -(i & lsbit);
+	return i / 2;
+}
+
+/**
+ * sort_r - sort an array of elements
+ * @base: pointer to data to sort
+ * @num: number of elements
+ * @size: size of each element
+ * @cmp_func: pointer to comparison function
+ * @swap_func: pointer to swap function or NULL
+ * @priv: third argument passed to comparison function
+ *
+ * This function does a heapsort on the given array.  You may provide
+ * a swap_func function if you need to do something more than a memory
+ * copy (e.g. fix up pointers or auxiliary data), but the built-in swap
+ * avoids a slow retpoline and so is significantly faster.
+ *
+ * Sorting time is O(n log n) both on average and worst-case. While
+ * quicksort is slightly faster on average, it suffers from exploitable
+ * O(n*n) worst-case behavior and extra memory requirements that make
+ * it less suitable for kernel use.
+ */
+void sort_r(void *base, size_t num, size_t size,
+	    cmp_r_func_t cmp_func,
+	    swap_r_func_t swap_func,
+	    const void *priv)
+{
+	/* pre-scale counters for performance */
+	size_t n = num * size, a = (num/2) * size;
+	const unsigned int lsbit = size & -size;  /* Used to find parent */
+
+	if (!a)		/* num < 2 || size == 0 */
+		return;
+
+	/* called from 'sort' without swap function, let's pick the default */
+	if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap)
+		swap_func = NULL;
+
+	if (!swap_func) {
+		if (is_aligned(base, size, 8))
+			swap_func = SWAP_WORDS_64;
+		else if (is_aligned(base, size, 4))
+			swap_func = SWAP_WORDS_32;
+		else
+			swap_func = SWAP_BYTES;
+	}
+
+	/*
+	 * Loop invariants:
+	 * 1. elements [a,n) satisfy the heap property (compare greater than
+	 *    all of their children),
+	 * 2. elements [n,num*size) are sorted, and
+	 * 3. a <= b <= c <= d <= n (whenever they are valid).
+	 */
+	for (;;) {
+		size_t b, c, d;
+
+		if (a)			/* Building heap: sift down --a */
+			a -= size;
+		else if (n -= size)	/* Sorting: Extract root to --n */
+			do_swap(base, base + n, size, swap_func, priv);
+		else			/* Sort complete */
+			break;
+
+		/*
+		 * Sift element at "a" down into heap.  This is the
+		 * "bottom-up" variant, which significantly reduces
+		 * calls to cmp_func(): we find the sift-down path all
+		 * the way to the leaves (one compare per level), then
+		 * backtrack to find where to insert the target element.
+		 *
+		 * Because elements tend to sift down close to the leaves,
+		 * this uses fewer compares than doing two per level
+		 * on the way down.  (A bit more than half as many on
+		 * average, 3/4 worst-case.)
+		 */
+		for (b = a; c = 2*b + size, (d = c + size) < n;)
+			b = do_cmp(base + c, base + d, cmp_func, priv) >= 0 ? c : d;
+		if (d == n)	/* Special case last leaf with no sibling */
+			b = c;
+
+		/* Now backtrack from "b" to the correct location for "a" */
+		while (b != a && do_cmp(base + a, base + b, cmp_func, priv) >= 0)
+			b = parent(b, lsbit, size);
+		c = b;			/* Where "a" belongs */
+		while (b != a) {	/* Shift it into place */
+			b = parent(b, lsbit, size);
+			do_swap(base + b, base + c, size, swap_func, priv);
+		}
+	}
+}
+EXPORT_SYMBOL(sort_r);
+
+#include <linux/eytzinger.h>
+
+static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size,
+			 cmp_r_func_t cmp_func, const void *priv,
+			 size_t l, size_t r)
+{
+	return do_cmp(base + inorder_to_eytzinger0(l, n) * size,
+		      base + inorder_to_eytzinger0(r, n) * size,
+		      cmp_func, priv);
+}
+
+static inline void eytzinger0_do_swap(void *base, size_t n, size_t size,
+			   swap_r_func_t swap_func, const void *priv,
+			   size_t l, size_t r)
+{
+	do_swap(base + inorder_to_eytzinger0(l, n) * size,
+		base + inorder_to_eytzinger0(r, n) * size,
+		size, swap_func, priv);
+}
+
+void eytzinger0_sort_r(void *base, size_t n, size_t size,
+		       cmp_r_func_t cmp_func,
+		       swap_r_func_t swap_func,
+		       const void *priv)
+{
+	int i, c, r;
+
+	/* called from 'sort' without swap function, let's pick the default */
+	if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap)
+		swap_func = NULL;
+
+	if (!swap_func) {
+		if (is_aligned(base, size, 8))
+			swap_func = SWAP_WORDS_64;
+		else if (is_aligned(base, size, 4))
+			swap_func = SWAP_WORDS_32;
+		else
+			swap_func = SWAP_BYTES;
+	}
+
+	/* heapify */
+	for (i = n / 2 - 1; i >= 0; --i) {
+		for (r = i; r * 2 + 1 < n; r = c) {
+			c = r * 2 + 1;
+
+			if (c + 1 < n &&
+			    eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
+				c++;
+
+			if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
+				break;
+
+			eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
+		}
+	}
+
+	/* sort */
+	for (i = n - 1; i > 0; --i) {
+		eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);
+
+		for (r = 0; r * 2 + 1 < i; r = c) {
+			c = r * 2 + 1;
+
+			if (c + 1 < i &&
+			    eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
+				c++;
+
+			if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
+				break;
+
+			eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(eytzinger0_sort_r);
+
+void eytzinger0_sort(void *base, size_t n, size_t size,
+		     cmp_func_t cmp_func,
+		     swap_func_t swap_func)
+{
+	struct wrapper w = {
+		.cmp  = cmp_func,
+		.swap = swap_func,
+	};
+
+	return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
+}
+EXPORT_SYMBOL_GPL(eytzinger0_sort);
diff --git a/linux/time_stats.c b/linux/time_stats.c
new file mode 100644
index 00000000..0b90c80c
--- /dev/null
+++ b/linux/time_stats.c
@@ -0,0 +1,373 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/eytzinger.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/time.h>
+#include <linux/time_stats.h>
+#include <linux/spinlock.h>
+
+static const struct time_unit time_units[] = {
+	{ "ns",		1		 },
+	{ "us",		NSEC_PER_USEC	 },
+	{ "ms",		NSEC_PER_MSEC	 },
+	{ "s",		NSEC_PER_SEC	 },
+	{ "m",          (u64) NSEC_PER_SEC * 60},
+	{ "h",          (u64) NSEC_PER_SEC * 3600},
+	{ "d",          (u64) NSEC_PER_SEC * 3600 * 24},
+	{ "w",          (u64) NSEC_PER_SEC * 3600 * 24 * 7},
+	{ "y",          (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */
+	{ "eon",        U64_MAX          },
+};
+
+const struct time_unit *pick_time_units(u64 ns)
+{
+	const struct time_unit *u;
+
+	for (u = time_units;
+	     u + 1 < time_units + ARRAY_SIZE(time_units) &&
+	     ns >= u[1].nsecs << 1;
+	     u++)
+		;
+
+	return u;
+}
+EXPORT_SYMBOL_GPL(pick_time_units);
+
+static void quantiles_update(struct quantiles *q, u64 v)
+{
+	unsigned i = 0;
+
+	while (i < ARRAY_SIZE(q->entries)) {
+		struct quantile_entry *e = q->entries + i;
+
+		if (unlikely(!e->step)) {
+			e->m = v;
+			e->step = max_t(unsigned, v / 2, 1024);
+		} else if (e->m > v) {
+			e->m = e->m >= e->step
+				? e->m - e->step
+				: 0;
+		} else if (e->m < v) {
+			e->m = e->m + e->step > e->m
+				? e->m + e->step
+				: U32_MAX;
+		}
+
+		if ((e->m > v ? e->m - v : v - e->m) < e->step)
+			e->step = max_t(unsigned, e->step / 2, 1);
+
+		if (v >= e->m)
+			break;
+
+		i = eytzinger0_child(i, v > e->m);
+	}
+}
+
+static inline void time_stats_update_one(struct time_stats *stats,
+					      u64 start, u64 end)
+{
+	u64 duration, freq;
+	bool initted = stats->last_event != 0;
+
+	if (time_after64(end, start)) {
+		struct quantiles *quantiles = time_stats_to_quantiles(stats);
+
+		duration = end - start;
+		mean_and_variance_update(&stats->duration_stats, duration);
+		mean_and_variance_weighted_update(&stats->duration_stats_weighted,
+				duration, initted, TIME_STATS_MV_WEIGHT);
+		stats->max_duration = max(stats->max_duration, duration);
+		stats->min_duration = min(stats->min_duration, duration);
+		stats->total_duration += duration;
+
+		if (quantiles)
+			quantiles_update(quantiles, duration);
+	}
+
+	if (stats->last_event && time_after64(end, stats->last_event)) {
+		freq = end - stats->last_event;
+		mean_and_variance_update(&stats->freq_stats, freq);
+		mean_and_variance_weighted_update(&stats->freq_stats_weighted,
+				freq, initted, TIME_STATS_MV_WEIGHT);
+		stats->max_freq = max(stats->max_freq, freq);
+		stats->min_freq = min(stats->min_freq, freq);
+	}
+
+	stats->last_event = end;
+}
+
+void __time_stats_clear_buffer(struct time_stats *stats,
+			       struct time_stat_buffer *b)
+{
+	for (struct time_stat_buffer_entry *i = b->entries;
+	     i < b->entries + ARRAY_SIZE(b->entries);
+	     i++)
+		time_stats_update_one(stats, i->start, i->end);
+	b->nr = 0;
+}
+EXPORT_SYMBOL_GPL(__time_stats_clear_buffer);
+
+static noinline void time_stats_clear_buffer(struct time_stats *stats,
+					     struct time_stat_buffer *b)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&stats->lock, flags);
+	__time_stats_clear_buffer(stats, b);
+	spin_unlock_irqrestore(&stats->lock, flags);
+}
+
+void __time_stats_update(struct time_stats *stats, u64 start, u64 end)
+{
+	unsigned long flags;
+
+	if (!stats->buffer) {
+		spin_lock_irqsave(&stats->lock, flags);
+		time_stats_update_one(stats, start, end);
+
+		if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
+		    stats->duration_stats.n > 1024)
+			stats->buffer =
+				alloc_percpu_gfp(struct time_stat_buffer,
+						 GFP_ATOMIC);
+		spin_unlock_irqrestore(&stats->lock, flags);
+	} else {
+		struct time_stat_buffer *b;
+
+		preempt_disable();
+		b = this_cpu_ptr(stats->buffer);
+
+		BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
+		b->entries[b->nr++] = (struct time_stat_buffer_entry) {
+			.start = start,
+			.end = end
+		};
+
+		if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
+			time_stats_clear_buffer(stats, b);
+		preempt_enable();
+	}
+}
+EXPORT_SYMBOL_GPL(__time_stats_update);
+
+#include <linux/seq_buf.h>
+
+static void seq_buf_time_units_aligned(struct seq_buf *out, u64 ns)
+{
+	const struct time_unit *u = pick_time_units(ns);
+
+	seq_buf_printf(out, "%8llu %s", div64_u64(ns, u->nsecs), u->name);
+}
+
+static inline u64 time_stats_lifetime(const struct time_stats *stats)
+{
+	return local_clock() - stats->start_time;
+}
+
+void time_stats_to_seq_buf(struct seq_buf *out, struct time_stats *stats,
+		const char *epoch_name, unsigned int flags)
+{
+	struct quantiles *quantiles = time_stats_to_quantiles(stats);
+	s64 f_mean = 0, d_mean = 0;
+	u64 f_stddev = 0, d_stddev = 0;
+	u64 lifetime = time_stats_lifetime(stats);
+
+	if (stats->buffer) {
+		int cpu;
+
+		spin_lock_irq(&stats->lock);
+		for_each_possible_cpu(cpu)
+			__time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
+		spin_unlock_irq(&stats->lock);
+	}
+
+	if (stats->freq_stats.n) {
+		/* avoid divide by zero */
+		f_mean = mean_and_variance_get_mean(stats->freq_stats);
+		f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
+		d_mean = mean_and_variance_get_mean(stats->duration_stats);
+		d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
+	} else if (flags & TIME_STATS_PRINT_NO_ZEROES) {
+		/* unless we didn't want zeroes anyway */
+		return;
+	}
+
+	seq_buf_printf(out, "count: %llu\n", stats->duration_stats.n);
+	seq_buf_printf(out, "lifetime: ");
+	seq_buf_time_units_aligned(out, lifetime);
+	seq_buf_printf(out, "\n");
+
+	seq_buf_printf(out, "                       since %-12s recent\n", epoch_name);
+
+	seq_buf_printf(out, "duration of events\n");
+
+	seq_buf_printf(out, "  min:                     ");
+	seq_buf_time_units_aligned(out, stats->min_duration);
+	seq_buf_printf(out, "\n");
+
+	seq_buf_printf(out, "  max:                     ");
+	seq_buf_time_units_aligned(out, stats->max_duration);
+	seq_buf_printf(out, "\n");
+
+	seq_buf_printf(out, "  total:                   ");
+	seq_buf_time_units_aligned(out, stats->total_duration);
+	seq_buf_printf(out, "\n");
+
+	seq_buf_printf(out, "  mean:                    ");
+	seq_buf_time_units_aligned(out, d_mean);
+	seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
+	seq_buf_printf(out, "\n");
+
+	seq_buf_printf(out, "  stddev:                  ");
+	seq_buf_time_units_aligned(out, d_stddev);
+	seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
+	seq_buf_printf(out, "\n");
+
+	seq_buf_printf(out, "time between events\n");
+
+	seq_buf_printf(out, "  min:                     ");
+	seq_buf_time_units_aligned(out, stats->min_freq);
+	seq_buf_printf(out, "\n");
+
+	seq_buf_printf(out, "  max:                     ");
+	seq_buf_time_units_aligned(out, stats->max_freq);
+	seq_buf_printf(out, "\n");
+
+	seq_buf_printf(out, "  mean:                    ");
+	seq_buf_time_units_aligned(out, f_mean);
+	seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
+	seq_buf_printf(out, "\n");
+
+	seq_buf_printf(out, "  stddev:                  ");
+	seq_buf_time_units_aligned(out, f_stddev);
+	seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
+	seq_buf_printf(out, "\n");
+
+	if (quantiles) {
+		int i = eytzinger0_first(NR_QUANTILES);
+		const struct time_unit *u =
+			pick_time_units(quantiles->entries[i].m);
+		u64 last_q = 0;
+
+		seq_buf_printf(out, "quantiles (%s):\t", u->name);
+		eytzinger0_for_each(i, NR_QUANTILES) {
+			bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+
+			u64 q = max(quantiles->entries[i].m, last_q);
+			seq_buf_printf(out, "%llu ", div_u64(q, u->nsecs));
+			if (is_last)
+				seq_buf_printf(out, "\n");
+			last_q = q;
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(time_stats_to_seq_buf);
+
+void time_stats_to_json(struct seq_buf *out, struct time_stats *stats,
+		const char *epoch_name, unsigned int flags)
+{
+	struct quantiles *quantiles = time_stats_to_quantiles(stats);
+	s64 f_mean = 0, d_mean = 0;
+	u64 f_stddev = 0, d_stddev = 0;
+
+	if (stats->buffer) {
+		int cpu;
+
+		spin_lock_irq(&stats->lock);
+		for_each_possible_cpu(cpu)
+			__time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
+		spin_unlock_irq(&stats->lock);
+	}
+
+	if (stats->freq_stats.n) {
+		/* avoid divide by zero */
+		f_mean = mean_and_variance_get_mean(stats->freq_stats);
+		f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
+		d_mean = mean_and_variance_get_mean(stats->duration_stats);
+		d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
+	} else if (flags & TIME_STATS_PRINT_NO_ZEROES) {
+		/* unless we didn't want zeroes anyway */
+		return;
+	}
+
+	seq_buf_printf(out, "{\n");
+	seq_buf_printf(out, "  \"epoch\":       \"%s\",\n", epoch_name);
+	seq_buf_printf(out, "  \"count\":       %llu,\n", stats->duration_stats.n);
+
+	seq_buf_printf(out, "  \"duration_ns\": {\n");
+	seq_buf_printf(out, "    \"min\":       %llu,\n", stats->min_duration);
+	seq_buf_printf(out, "    \"max\":       %llu,\n", stats->max_duration);
+	seq_buf_printf(out, "    \"total\":     %llu,\n", stats->total_duration);
+	seq_buf_printf(out, "    \"mean\":      %llu,\n", d_mean);
+	seq_buf_printf(out, "    \"stddev\":    %llu\n", d_stddev);
+	seq_buf_printf(out, "  },\n");
+
+	d_mean = mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT);
+	d_stddev = mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT);
+
+	seq_buf_printf(out, "  \"duration_ewma_ns\": {\n");
+	seq_buf_printf(out, "    \"mean\":      %llu,\n", d_mean);
+	seq_buf_printf(out, "    \"stddev\":    %llu\n", d_stddev);
+	seq_buf_printf(out, "  },\n");
+
+	seq_buf_printf(out, "  \"frequency_ns\": {\n");
+	seq_buf_printf(out, "    \"min\":       %llu,\n", stats->min_freq);
+	seq_buf_printf(out, "    \"max\":       %llu,\n", stats->max_freq);
+	seq_buf_printf(out, "    \"mean\":      %llu,\n", f_mean);
+	seq_buf_printf(out, "    \"stddev\":    %llu\n", f_stddev);
+	seq_buf_printf(out, "  },\n");
+
+	f_mean = mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT);
+	f_stddev = mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT);
+
+	seq_buf_printf(out, "  \"frequency_ewma_ns\": {\n");
+	seq_buf_printf(out, "    \"mean\":      %llu,\n", f_mean);
+	seq_buf_printf(out, "    \"stddev\":    %llu\n", f_stddev);
+
+	if (quantiles) {
+		u64 last_q = 0;
+
+		/* close frequency_ewma_ns but signal more items */
+		seq_buf_printf(out, "  },\n");
+
+		seq_buf_printf(out, "  \"quantiles_ns\": [\n");
+		eytzinger0_for_each(i, NR_QUANTILES) {
+			bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+
+			u64 q = max(quantiles->entries[i].m, last_q);
+			seq_buf_printf(out, "    %llu", q);
+			if (!is_last)
+				seq_buf_printf(out, ", ");
+			last_q = q;
+		}
+		seq_buf_printf(out, "  ]\n");
+	} else {
+		/* close frequency_ewma_ns without dumping further */
+		seq_buf_printf(out, "  }\n");
+	}
+
+	seq_buf_printf(out, "}\n");
+}
+EXPORT_SYMBOL_GPL(time_stats_to_json);
+
+void time_stats_exit(struct time_stats *stats)
+{
+	free_percpu(stats->buffer);
+}
+EXPORT_SYMBOL_GPL(time_stats_exit);
+
+void time_stats_init(struct time_stats *stats)
+{
+	memset(stats, 0, sizeof(*stats));
+	stats->min_duration = U64_MAX;
+	stats->min_freq = U64_MAX;
+	stats->start_time = local_clock();
+	spin_lock_init(&stats->lock);
+}
+EXPORT_SYMBOL_GPL(time_stats_init);
+
+MODULE_AUTHOR("Kent Overstreet");
+MODULE_LICENSE("GPL");
diff --git a/src/wrappers/handle.rs b/src/wrappers/handle.rs
index 336a029f..48148a8f 100644
--- a/src/wrappers/handle.rs
+++ b/src/wrappers/handle.rs
@@ -61,7 +61,7 @@ impl BcachefsHandle {
     pub fn create_subvolume<P: AsRef<Path>>(&self, dst: P) -> Result<(), Errno> {
         let dst = CString::new(dst.as_ref().as_os_str().as_bytes()).expect("Failed to cast destination path for subvolume in a C-style string");
         self.ioctl(BcachefsIoctl::SubvolumeCreate, &BcachefsIoctlPayload::Subvolume(bch_ioctl_subvolume {
-            dirfd: libc::AT_FDCWD,
+            dirfd: libc::AT_FDCWD as u32,
             mode: 0o777,
             dst_ptr: dst.as_ptr() as u64,
             ..Default::default()
@@ -73,7 +73,7 @@ impl BcachefsHandle {
     pub fn delete_subvolume<P: AsRef<Path>>(&self, dst: P) -> Result<(), Errno> {
         let dst = CString::new(dst.as_ref().as_os_str().as_bytes()).expect("Failed to cast destination path for subvolume in a C-style string");
         self.ioctl(BcachefsIoctl::SubvolumeDestroy, &BcachefsIoctlPayload::Subvolume(bch_ioctl_subvolume {
-            dirfd: libc::AT_FDCWD,
+            dirfd: libc::AT_FDCWD as u32,
             mode: 0o777,
             dst_ptr: dst.as_ptr() as u64,
             ..Default::default()
@@ -88,7 +88,7 @@ impl BcachefsHandle {
 
         let res = self.ioctl(BcachefsIoctl::SubvolumeCreate, &BcachefsIoctlPayload::Subvolume(bch_ioctl_subvolume {
             flags: BCH_SUBVOL_SNAPSHOT_CREATE | extra_flags,
-            dirfd: libc::AT_FDCWD,
+            dirfd: libc::AT_FDCWD as u32,
             mode: 0o777,
             src_ptr: src.as_ref().map_or(0, |x| x.as_ptr() as u64),
             //src_ptr: if let Some(src) = src { src.as_ptr() } else { std::ptr::null() } as u64,