Update bcachefs sources to 0906b1fb49 bcachefs: fixes for 32 bit/big endian machines

Kent Overstreet 2018-05-17 01:38:57 -04:00
parent 800408be11
commit ff86d47221
65 changed files with 1237 additions and 791 deletions

View File

@ -1 +1 @@
ed4aea2ad4fa1b3891684cbd071d1a1ae9094342
0906b1fb492e8e84f563b192fd8f458af1c1d420

View File

@ -36,10 +36,12 @@ static void usage(void)
" fsck Check an existing filesystem for errors\n"
"\n"
"Startup/shutdown, assembly of multi device filesystems:\n"
#if 0
" assemble Assemble an existing multi device filesystem\n"
" incremental Incrementally assemble an existing multi device filesystem\n"
" run Start a partially assembled filesystem\n"
" stop Stop a running filesystem\n"
#endif
"\n"
"Commands for managing a running filesystem:\n"
" fs usage Show disk usage\n"
@ -150,6 +152,7 @@ int main(int argc, char *argv[])
if (!strcmp(cmd, "fsck"))
return cmd_fsck(argc, argv);
#if 0
if (!strcmp(cmd, "assemble"))
return cmd_assemble(argc, argv);
if (!strcmp(cmd, "incremental"))
@ -158,6 +161,7 @@ int main(int argc, char *argv[])
return cmd_run(argc, argv);
if (!strcmp(cmd, "stop"))
return cmd_stop(argc, argv);
#endif
if (!strcmp(cmd, "fs"))
return fs_cmds(argc, argv);

View File

@ -11,6 +11,7 @@
#include "cmds.h"
#include "libbcachefs.h"
#if 0
int cmd_assemble(int argc, char *argv[])
{
unsigned nr_devs = argc - 1;
@ -26,7 +27,7 @@ int cmd_assemble(int argc, char *argv[])
unsigned i;
for (i = 0; i < nr_devs; i++)
assemble->devs[i] = (__u64) argv[i + 1];
assemble->devs[i] = (unsigned long) argv[i + 1];
xioctl(bcachectl_open(), BCH_IOCTL_ASSEMBLE, assemble);
return 0;
@ -38,9 +39,10 @@ int cmd_incremental(int argc, char *argv[])
die("Please supply exactly one device");
struct bch_ioctl_incremental incremental = {
.dev = (__u64) argv[1],
.dev = (unsigned long) argv[1],
};
xioctl(bcachectl_open(), BCH_IOCTL_INCREMENTAL, &incremental);
return 0;
}
#endif

View File

@ -10,6 +10,7 @@
#include "libbcachefs/bcachefs.h"
#include "libbcachefs/alloc.h"
#include "libbcachefs/bset.h"
#include "libbcachefs/btree_cache.h"
#include "libbcachefs/btree_iter.h"
#include "libbcachefs/buckets.h"

View File

@ -15,6 +15,7 @@
#include "cmds.h"
#include "libbcachefs.h"
#if 0
int cmd_run(int argc, char *argv[])
{
return 0;
@ -29,3 +30,4 @@ int cmd_stop(int argc, char *argv[])
xioctl(fs.ioctl_fd, BCH_IOCTL_STOP);
return 0;
}
#endif

cmds.h
View File

@ -12,10 +12,12 @@
int cmd_format(int argc, char *argv[]);
int cmd_show_super(int argc, char *argv[]);
#if 0
int cmd_assemble(int argc, char *argv[]);
int cmd_incremental(int argc, char *argv[]);
int cmd_run(int argc, char *argv[]);
int cmd_stop(int argc, char *argv[]);
#endif
int cmd_fs_usage(int argc, char *argv[]);

View File

@ -6,27 +6,22 @@
struct timer_list {
unsigned long expires;
void (*function)(unsigned long);
unsigned long data;
void (*function)(struct timer_list *timer);
bool pending;
};
static inline void init_timer(struct timer_list *timer)
static inline void timer_setup(struct timer_list *timer,
void (*func)(struct timer_list *),
unsigned int flags)
{
memset(timer, 0, sizeof(*timer));
timer->function = func;
}
#define __init_timer(_timer, _flags) init_timer(_timer)
#define timer_setup_on_stack(timer, callback, flags) \
timer_setup(timer, callback, flags)
#define __setup_timer(_timer, _fn, _data, _flags) \
do { \
__init_timer((_timer), (_flags)); \
(_timer)->function = (_fn); \
(_timer)->data = (_data); \
} while (0)
#define setup_timer(timer, fn, data) \
__setup_timer((timer), (fn), (data), 0)
#define destroy_timer_on_stack(timer) do {} while (0)
static inline int timer_pending(const struct timer_list *timer)
{
@ -36,8 +31,9 @@ static inline int timer_pending(const struct timer_list *timer)
int del_timer(struct timer_list * timer);
int del_timer_sync(struct timer_list *timer);
#define del_singleshot_timer_sync(timer) del_timer_sync(timer)
int mod_timer(struct timer_list *timer, unsigned long expires);
//extern int mod_timer_pending(struct timer_list *timer, unsigned long expires);
static inline void add_timer(struct timer_list *timer)
{

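The shim above mirrors the mainline kernel's newer timer API: callbacks receive the struct timer_list pointer itself rather than an opaque unsigned long, and recover their containing object with container_of(). A minimal sketch of how a caller migrates, assuming the shim above; the my_obj/my_timeout names and the one-second delay are illustrative, not from this tree:

    #include <linux/kernel.h>   /* container_of() */
    #include <linux/timer.h>

    struct my_obj {
        struct timer_list timer;
        int work_pending;
    };

    /* Old style: void my_timeout(unsigned long data), with data = (unsigned long) obj.
     * New style: the timer itself is passed in, and the object is recovered from it. */
    static void my_timeout(struct timer_list *t)
    {
        struct my_obj *obj = container_of(t, struct my_obj, timer);

        obj->work_pending = 1;
    }

    static void my_obj_init(struct my_obj *obj)
    {
        obj->work_pending = 0;
        timer_setup(&obj->timer, my_timeout, 0);
        mod_timer(&obj->timer, jiffies + HZ);   /* fire roughly one second from now */
    }

The mainline kernel also provides from_timer(), a thin wrapper around container_of(); the shim here gets by without it.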
View File

@ -8,7 +8,7 @@ struct task_struct;
struct workqueue_struct;
struct work_struct;
typedef void (*work_func_t)(struct work_struct *work);
void delayed_work_timer_fn(unsigned long __data);
void delayed_work_timer_fn(struct timer_list *);
#define work_data_bits(work) ((unsigned long *)(&(work)->data))
@ -44,9 +44,7 @@ struct delayed_work {
#define INIT_DELAYED_WORK(_work, _func) \
do { \
INIT_WORK(&(_work)->work, (_func)); \
__setup_timer(&(_work)->timer, delayed_work_timer_fn, \
(unsigned long)(_work), \
TIMER_IRQSAFE); \
timer_setup(&(_work)->timer, delayed_work_timer_fn, 0); \
} while (0)
static inline struct delayed_work *to_delayed_work(struct work_struct *work)

View File

@ -1393,13 +1393,11 @@ static void writepoint_drop_ptrs(struct bch_fs *c,
{
int i;
for (i = wp->first_ptr - 1; i >= 0; --i) {
struct bch_dev *ca = bch_dev_bkey_exists(c, wp->ptrs[i]->ptr.dev);
if (dev_in_target(ca, target) == in_target)
for (i = wp->first_ptr - 1; i >= 0; --i)
if (bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev,
target) == in_target)
writepoint_drop_ptr(c, wp, i);
}
}
static void verify_not_stale(struct bch_fs *c, const struct write_point *wp)
{
@ -1555,7 +1553,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
/* does writepoint have ptrs we don't want to use? */
if (target)
writepoint_for_each_ptr(wp, ob, i)
if (!dev_idx_in_target(c, ob->ptr.dev, target)) {
if (!bch2_dev_in_target(c, ob->ptr.dev, target)) {
swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
wp->first_ptr++;
}
@ -1590,7 +1588,8 @@ alloc_done:
* one in the target we want:
*/
if (cache_idx >= 0) {
if (!dev_in_target(ca, target)) {
if (!bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev,
target)) {
writepoint_drop_ptr(c, wp, i);
} else {
writepoint_drop_ptr(c, wp, cache_idx);
@ -1621,7 +1620,7 @@ alloc_done:
if (ca->mi.durability &&
ca->mi.durability <= nr_ptrs_effective - nr_replicas &&
!dev_idx_in_target(c, ob->ptr.dev, target)) {
!bch2_dev_in_target(c, ob->ptr.dev, target)) {
swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
wp->first_ptr++;
nr_ptrs_effective -= ca->mi.durability;
@ -1890,8 +1889,9 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
/* stop allocator thread: */
void bch2_dev_allocator_stop(struct bch_dev *ca)
{
struct task_struct *p = ca->alloc_thread;
struct task_struct *p;
p = rcu_dereference_protected(ca->alloc_thread, 1);
ca->alloc_thread = NULL;
/*
@ -1926,7 +1926,7 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
return PTR_ERR(p);
get_task_struct(p);
ca->alloc_thread = p;
rcu_assign_pointer(ca->alloc_thread, p);
wake_up_process(p);
return 0;
}
@ -2099,7 +2099,7 @@ again:
if (btree_node_dirty(b) && (!b->written || b->level)) {
if (btree_node_may_write(b)) {
rcu_read_unlock();
six_lock_read(&b->lock);
btree_node_lock_type(c, b, SIX_LOCK_read);
bch2_btree_node_write(c, b, SIX_LOCK_read);
six_unlock_read(&b->lock);
goto again;

View File

@ -103,7 +103,8 @@ static inline void bch2_wake_allocator(struct bch_dev *ca)
struct task_struct *p;
rcu_read_lock();
if ((p = READ_ONCE(ca->alloc_thread)))
p = rcu_dereference(ca->alloc_thread);
if (p)
wake_up_process(p);
rcu_read_unlock();
}
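The allocator thread pointer becomes an __rcu pointer: the starter publishes it with rcu_assign_pointer() after taking a task reference, and bch2_wake_allocator() reads it under rcu_read_lock() with rcu_dereference(), so a waker never chases a stale or half-published task. A sketch of the same pattern outside bcachefs; my_dev and the my_worker_* names are illustrative:

    #include <linux/kthread.h>
    #include <linux/rcupdate.h>
    #include <linux/sched.h>
    #include <linux/sched/task.h>

    struct my_dev {
        struct task_struct __rcu *worker;
    };

    static int my_worker_fn(void *arg)
    {
        while (!kthread_should_stop())
            schedule_timeout_interruptible(HZ);   /* placeholder work loop */
        return 0;
    }

    static int my_worker_start(struct my_dev *d)
    {
        struct task_struct *p = kthread_create(my_worker_fn, d, "my_worker");

        if (IS_ERR(p))
            return PTR_ERR(p);

        get_task_struct(p);                 /* hold a reference for our pointer */
        rcu_assign_pointer(d->worker, p);   /* publish only once fully set up */
        wake_up_process(p);
        return 0;
    }

    static void my_worker_wake(struct my_dev *d)
    {
        struct task_struct *p;

        rcu_read_lock();
        p = rcu_dereference(d->worker);     /* may be NULL if not running */
        if (p)
            wake_up_process(p);
        rcu_read_unlock();
    }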

View File

@ -197,7 +197,6 @@
#include <linux/zstd.h>
#include "bcachefs_format.h"
#include "bset.h"
#include "fifo.h"
#include "opts.h"
#include "util.h"
@ -272,25 +271,37 @@ do { \
#endif
#define BCH_TIME_STATS() \
BCH_TIME_STAT(btree_node_mem_alloc) \
BCH_TIME_STAT(btree_gc) \
BCH_TIME_STAT(btree_split) \
BCH_TIME_STAT(btree_sort) \
BCH_TIME_STAT(btree_read) \
BCH_TIME_STAT(data_write) \
BCH_TIME_STAT(data_read) \
BCH_TIME_STAT(data_promote) \
BCH_TIME_STAT(journal_write) \
BCH_TIME_STAT(journal_delay) \
BCH_TIME_STAT(journal_blocked) \
BCH_TIME_STAT(journal_flush_seq)
x(btree_node_mem_alloc) \
x(btree_gc) \
x(btree_split) \
x(btree_sort) \
x(btree_read) \
x(btree_lock_contended_read) \
x(btree_lock_contended_intent) \
x(btree_lock_contended_write) \
x(data_write) \
x(data_read) \
x(data_promote) \
x(journal_write) \
x(journal_delay) \
x(journal_blocked) \
x(journal_flush_seq)
enum bch_time_stats {
#define x(name) BCH_TIME_##name,
BCH_TIME_STATS()
#undef x
BCH_TIME_STAT_NR
};
#include "alloc_types.h"
#include "btree_types.h"
#include "buckets_types.h"
#include "clock_types.h"
#include "journal_types.h"
#include "keylist_types.h"
#include "quota_types.h"
#include "rebalance_types.h"
#include "super_types.h"
/*
@ -372,7 +383,7 @@ struct bch_dev {
struct bch_dev_usage usage_cached;
/* Allocator: */
struct task_struct *alloc_thread;
struct task_struct __rcu *alloc_thread;
/*
* free: Buckets that are ready to be used
@ -447,7 +458,6 @@ enum {
/* shutdown: */
BCH_FS_EMERGENCY_RO,
BCH_FS_WRITE_DISABLE_COMPLETE,
BCH_FS_GC_STOPPING,
/* errors: */
BCH_FS_ERROR,
@ -570,12 +580,6 @@ struct bch_fs {
struct delayed_work pd_controllers_update;
unsigned pd_controllers_update_seconds;
/* REBALANCE */
struct task_struct *rebalance_thread;
struct bch_pd_controller rebalance_pd;
atomic64_t rebalance_work_unknown_dev;
struct bch_devs_mask rw_devs[BCH_DATA_NR];
u64 capacity; /* sectors */
@ -664,6 +668,9 @@ struct bch_fs {
atomic64_t key_version;
/* REBALANCE */
struct bch_fs_rebalance rebalance;
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
struct bio_set dio_write_bioset;
@ -714,18 +721,13 @@ struct bch_fs {
unsigned btree_gc_periodic:1;
unsigned copy_gc_enabled:1;
unsigned rebalance_enabled:1;
unsigned rebalance_percent;
bool promote_whole_extents;
#define BCH_DEBUG_PARAM(name, description) bool name;
BCH_DEBUG_PARAMS_ALL()
#undef BCH_DEBUG_PARAM
#define BCH_TIME_STAT(name) \
struct time_stats name##_time;
BCH_TIME_STATS()
#undef BCH_TIME_STAT
struct time_stats times[BCH_TIME_STAT_NR];
};
static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)

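The per-stat struct fields (btree_gc_time, btree_sort_time, ...) are replaced by a single times[] array indexed by enum bch_time_stats, with the list of stats kept in an x-macro so it only has to be written once. The same list can then be expanded again wherever a parallel table is needed, for example a table of names for sysfs output. A sketch of that pattern, with the list abridged and the bch2_time_stat_names table purely illustrative:

    #define BCH_TIME_STATS()            \
        x(btree_node_mem_alloc)         \
        x(btree_gc)                     \
        x(journal_write)

    /* first expansion: the indices used for struct bch_fs::times[] */
    enum bch_time_stats {
    #define x(name) BCH_TIME_##name,
        BCH_TIME_STATS()
    #undef x
        BCH_TIME_STAT_NR
    };

    /* second expansion: human-readable names, kept in sync automatically */
    static const char * const bch2_time_stat_names[] = {
    #define x(name) #name,
        BCH_TIME_STATS()
    #undef x
    };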
View File

@ -3,6 +3,72 @@
/*
* bcachefs on disk data structures
*
* OVERVIEW:
*
* There are three main types of on disk data structures in bcachefs (this is
* reduced from 5 in bcache)
*
* - superblock
* - journal
* - btree
*
* The btree is the primary structure; most metadata exists as keys in the
* various btrees. There are only a small number of btrees, they're not
* sharded - we have one btree for extents, another for inodes, et cetera.
*
* SUPERBLOCK:
*
* The superblock contains the location of the journal, the list of devices in
* the filesystem, and in general any metadata we need in order to decide
* whether we can start a filesystem or prior to reading the journal/btree
* roots.
*
* The superblock is extensible, and most of the contents of the superblock are
* in variable length, type tagged fields; see struct bch_sb_field.
*
* Backup superblocks do not reside in a fixed location; also, superblocks do
* not have a fixed size. To locate backup superblocks we have struct
* bch_sb_layout; we store a copy of this inside every superblock, and also
* before the first superblock.
*
* JOURNAL:
*
* The journal primarily records btree updates in the order they occurred;
* journal replay consists of just iterating over all the keys in the open
* journal entries and re-inserting them into the btrees.
*
* The journal also contains entry types for the btree roots, and blacklisted
* journal sequence numbers (see journal_seq_blacklist.c).
*
* BTREE:
*
* bcachefs btrees are copy on write b+ trees, where nodes are big (typically
* 128k-256k) and log structured. We use struct btree_node for writing the first
* entry in a given node (offset 0), and struct btree_node_entry for all
* subsequent writes.
*
* After the header, btree node entries contain a list of keys in sorted order.
* Values are stored inline with the keys; since values are variable length (and
* keys effectively are variable length too, due to packing) we can't do random
* access without building up additional in memory tables in the btree node read
* path.
*
* BTREE KEYS (struct bkey):
*
* The various btrees share a common format for the key - so as to avoid
* switching in fastpath lookup/comparison code - but define their own
* structures for the key values.
*
* The size of a key/value pair is stored as a u8 in units of u64s, so the max
* size is just under 2k. The common part also contains a type tag for the
* value, and a format field indicating whether the key is packed or not (and
* also meant to allow adding new key fields in the future, if desired).
*
* bkeys, when stored within a btree node, may also be packed. In that case, the
* bkey_format in that node is used to unpack it. Packed bkeys mean that we can
* be generous with field sizes in the common part of the key format (64 bit
* inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
*/
#include <asm/types.h>
@ -44,12 +110,19 @@ struct bkey_format {
/* Btree keys - all units are in sectors */
struct bpos {
/* Word order matches machine byte order */
#if defined(__LITTLE_ENDIAN)
/*
* Word order matches machine byte order - btree code treats a bpos as a
* single large integer, for search/comparison purposes
*
* Note that wherever a bpos is embedded in another on disk data
* structure, it has to be byte swabbed when reading in metadata that
* wasn't written in native endian order:
*/
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__u32 snapshot;
__u64 offset;
__u64 inode;
#elif defined(__BIG_ENDIAN)
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
__u64 inode;
__u64 offset; /* Points to end of extent - sectors */
__u32 snapshot;
@ -83,10 +156,10 @@ struct bch_val {
};
struct bversion {
#if defined(__LITTLE_ENDIAN)
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__u64 lo;
__u32 hi;
#elif defined(__BIG_ENDIAN)
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
__u32 hi;
__u64 lo;
#endif
@ -110,13 +183,13 @@ struct bkey {
/* Type of the value */
__u8 type;
#if defined(__LITTLE_ENDIAN)
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__u8 pad[1];
struct bversion version;
__u32 size; /* extent size, in sectors */
struct bpos p;
#elif defined(__BIG_ENDIAN)
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
struct bpos p;
__u32 size; /* extent size, in sectors */
struct bversion version;
@ -275,10 +348,10 @@ BKEY_VAL_TYPE(cookie, KEY_TYPE_COOKIE);
*
* If an extent is not checksummed or compressed, when the extent is trimmed we
* don't have to remember the extent we originally allocated and wrote: we can
* merely adjust ptr->offset to point to the start of the start of the data that
* is currently live. The size field in struct bkey records the current (live)
* size of the extent, and is also used to mean "size of region on disk that we
* point to" in this case.
* merely adjust ptr->offset to point to the start of the data that is currently
* live. The size field in struct bkey records the current (live) size of the
* extent, and is also used to mean "size of region on disk that we point to" in
* this case.
*
* Thus an extent that is not checksummed or compressed will consist only of a
* list of bch_extent_ptrs, with none of the fields in
@ -446,11 +519,11 @@ struct bch_extent_crc128 {
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 compression_type:4,
csum_type:4,
nonce:14,
nonce:13,
offset:13,
_uncompressed_size:13,
_compressed_size:13,
type:3;
type:4;
#endif
struct bch_csum csum;
} __attribute__((packed, aligned(8)));
@ -496,7 +569,7 @@ struct bch_extent_reservation {
};
union bch_extent_entry {
#if defined(__LITTLE_ENDIAN) || __BITS_PER_LONG == 64
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
unsigned long type;
#elif __BITS_PER_LONG == 32
struct {
@ -551,10 +624,11 @@ BKEY_VAL_TYPE(reservation, BCH_RESERVATION);
sizeof(struct bch_extent_ptr)) / sizeof(u64))
/* Maximum possible size of an entire extent value: */
/* There's a hack in the keylist code that needs to be fixed.. */
#define BKEY_EXTENT_VAL_U64s_MAX \
(BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
/* * Maximum possible size of an entire extent, key + value: */
#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
@ -1378,33 +1452,4 @@ struct btree_node_entry {
};
} __attribute__((packed, aligned(8)));
/* Obsolete: */
struct prio_set {
struct bch_csum csum;
__le64 magic;
__le32 nonce[3];
__le16 version;
__le16 flags;
__u8 encrypted_start[0];
__le64 next_bucket;
struct bucket_disk {
__le16 prio[2];
__u8 gen;
} __attribute__((packed)) data[];
} __attribute__((packed, aligned(8)));
LE32_BITMASK(PSET_CSUM_TYPE, struct prio_set, flags, 0, 4);
#define PSET_MAGIC __cpu_to_le64(0x6750e15f87337f91ULL)
static inline __u64 __pset_magic(struct bch_sb *sb)
{
return __le64_to_cpu(__bch2_sb_magic(sb) ^ PSET_MAGIC);
}
#endif /* _BCACHEFS_FORMAT_H */
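The endianness tests above switch from #if defined(__LITTLE_ENDIAN) to comparing the compiler-provided __BYTE_ORDER__ macro. The distinction matters because this header is shared with userspace: inside the kernel only one of __LITTLE_ENDIAN/__BIG_ENDIAN is defined (by linux/byteorder/*), but glibc's <endian.h> defines both as numeric constants on every host, so the defined() test silently picks the little-endian layout even on big endian machines. __BYTE_ORDER__ and __ORDER_*_ENDIAN__ are predefined by gcc and clang to reflect the actual target. A small standalone check, for illustration only:

    #include <endian.h>   /* glibc: defines __LITTLE_ENDIAN *and* __BIG_ENDIAN */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t x = 1;
        int runtime_little = *(uint8_t *) &x == 1;

    #if defined(__LITTLE_ENDIAN)
        printf("defined(__LITTLE_ENDIAN) is true (it is on every glibc host)\n");
    #endif
    #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
        printf("__BYTE_ORDER__ says little endian\n");
    #else
        printf("__BYTE_ORDER__ says big endian\n");
    #endif
        printf("runtime check says %s endian\n", runtime_little ? "little" : "big");
        return 0;
    }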

View File

@ -5,6 +5,9 @@
#include <asm/ioctl.h>
#include "bcachefs_format.h"
/*
* Flags common to multiple ioctls:
*/
#define BCH_FORCE_IF_DATA_LOST (1 << 0)
#define BCH_FORCE_IF_METADATA_LOST (1 << 1)
#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2)
@ -14,12 +17,23 @@
(BCH_FORCE_IF_DATA_DEGRADED| \
BCH_FORCE_IF_METADATA_DEGRADED)
/*
 * If cleared, ioctls that refer to a device pass it as a pointer to a pathname
* (e.g. /dev/sda1); if set, the dev field is the device's index within the
* filesystem:
*/
#define BCH_BY_INDEX (1 << 4)
/*
* For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem
* wide superblock:
*/
#define BCH_READ_DEV (1 << 5)
/* global control dev: */
/* These are currently broken, and probably unnecessary: */
#if 0
#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble)
#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental)
@ -35,12 +49,18 @@ struct bch_ioctl_incremental {
__u64 pad;
__u64 dev;
};
#endif
/* filesystem ioctls: */
#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid)
/* These only make sense when we also have incremental assembly */
#if 0
#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start)
#define BCH_IOCTL_STOP _IO(0xbc, 3)
#endif
#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk)
#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk)
#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk)
@ -52,14 +72,70 @@ struct bch_ioctl_incremental {
#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 13, struct bch_ioctl_disk_resize)
/*
* BCH_IOCTL_QUERY_UUID: get filesystem UUID
*
* Returns user visible UUID, not internal UUID (which may not ever be changed);
* the filesystem's sysfs directory may be found under /sys/fs/bcachefs with
* this UUID.
*/
struct bch_ioctl_query_uuid {
uuid_le uuid;
};
#if 0
struct bch_ioctl_start {
__u32 flags;
__u32 pad;
};
#endif
/*
* BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
*
* The specified device must not be open or in use. On success, the new device
* will be an online member of the filesystem just like any other member.
*
* The device must first be prepared by userspace by formatting with a bcachefs
* superblock, which is only used for passing in superblock options/parameters
* for that device (in struct bch_member). The new device's superblock should
* not claim to be a member of any existing filesystem - UUIDs on it will be
* ignored.
*/
/*
* BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
*
* Any data present on @dev will be permanently deleted, and @dev will be
* removed from its slot in the filesystem's list of member devices. The device
 * may be either offline or online.
*
 * Will fail if removing @dev would leave us with insufficient read write devices
 * or degraded/unavailable data, unless the appropriate BCH_FORCE_IF_* flags are
* set.
*/
/*
* BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
* but is not open (e.g. because we started in degraded mode), bring it online
*
* all existing data on @dev will be available once the device is online,
* exactly as if @dev was present when the filesystem was first mounted
*/
/*
* BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
* block device, without removing it from the filesystem (so it can be brought
* back online later)
*
* Data present on @dev will be unavailable while @dev is offline (unless
* replicated), but will still be intact and untouched if @dev is brought back
* online
*
* Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
* leave us with insufficient read write devices or degraded/unavailable data,
 * unless the appropriate BCH_FORCE_IF_* flags are set.
*/
struct bch_ioctl_disk {
__u32 flags;
@ -67,6 +143,16 @@ struct bch_ioctl_disk {
__u64 dev;
};
/*
* BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
*
* @new_state - one of the bch_member_state states (rw, ro, failed,
* spare)
*
* Will refuse to change member state if we would then have insufficient devices
* to write to, or if it would result in degraded data (when @new_state is
* failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
*/
struct bch_ioctl_disk_set_state {
__u32 flags;
__u8 new_state;
@ -81,6 +167,15 @@ enum bch_data_ops {
BCH_DATA_OP_NR = 3,
};
/*
* BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
* scrub, rereplicate, migrate).
*
* This ioctl kicks off a job in the background, and returns a file descriptor.
* Reading from the file descriptor returns a struct bch_ioctl_data_event,
* indicating current progress, and closing the file descriptor will stop the
* job. The file descriptor is O_CLOEXEC.
*/
struct bch_ioctl_data {
__u32 op;
__u32 flags;
@ -93,9 +188,18 @@ struct bch_ioctl_data {
__u32 dev;
__u32 pad;
} migrate;
struct {
__u64 pad[8];
};
};
} __attribute__((packed, aligned(8)));
enum bch_data_event {
BCH_DATA_EVENT_PROGRESS = 0,
/* XXX: add an event for reporting errors */
BCH_DATA_EVENT_NR = 1,
};
struct bch_ioctl_data_progress {
__u8 data_type;
__u8 btree_id;
@ -106,6 +210,15 @@ struct bch_ioctl_data_progress {
__u64 sectors_total;
} __attribute__((packed, aligned(8)));
struct bch_ioctl_data_event {
__u8 type;
__u8 pad[7];
union {
struct bch_ioctl_data_progress p;
__u64 pad2[15];
};
} __attribute__((packed, aligned(8)));
struct bch_ioctl_dev_usage {
__u8 state;
__u8 alive;
@ -127,6 +240,19 @@ struct bch_ioctl_fs_usage {
__u64 sectors[BCH_DATA_NR][BCH_REPLICAS_MAX];
};
/*
* BCH_IOCTL_USAGE: query filesystem disk space usage
*
* Returns disk space usage broken out by data type, number of replicas, and
* by component device
*
* @nr_devices - number of devices userspace allocated space for in @devs
*
* On success, @fs and @devs will be filled out appropriately and devs[i].alive
* will indicate if a device was present in that slot
*
* Returns -ERANGE if @nr_devices was too small
*/
struct bch_ioctl_usage {
__u16 nr_devices;
__u16 pad[3];
@ -135,6 +261,20 @@ struct bch_ioctl_usage {
struct bch_ioctl_dev_usage devs[0];
};
/*
* BCH_IOCTL_READ_SUPER: read filesystem superblock
*
* Equivalent to reading the superblock directly from the block device, except
* avoids racing with the kernel writing the superblock or having to figure out
* which block device to read
*
* @sb - buffer to read into
* @size - size of userspace allocated buffer
* @dev - device to read superblock for, if BCH_READ_DEV flag is
* specified
*
* Returns -ERANGE if buffer provided is too small
*/
struct bch_ioctl_read_super {
__u32 flags;
__u32 pad;
@ -143,10 +283,22 @@ struct bch_ioctl_read_super {
__u64 sb;
};
/*
* BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to
 * determine if disk is an (online) member - if so, returns device's index
*
* Returns -ENOENT if not found
*/
struct bch_ioctl_disk_get_idx {
__u64 dev;
};
/*
* BCH_IOCTL_DISK_RESIZE: resize filesystem on a device
*
* @dev - member to resize
* @nbuckets - new number of buckets
*/
struct bch_ioctl_disk_resize {
__u32 flags;
__u32 pad;

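The comment blocks above describe BCH_IOCTL_DATA as starting a background job and handing back a file descriptor that streams struct bch_ioctl_data_event. A rough userspace sketch of consuming that stream; the header path, the fs_fd argument (a descriptor on which bcachefs filesystem ioctls can be issued) and the BCH_DATA_OP_REREPLICATE spelling are assumptions, and a real caller would also fill in the key range the job should cover:

    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include "bcachefs_ioctl.h"   /* assumed path to the header shown above */

    static int watch_data_job(int fs_fd)
    {
        struct bch_ioctl_data arg;
        struct bch_ioctl_data_event e;
        int job_fd;

        memset(&arg, 0, sizeof(arg));
        arg.op = BCH_DATA_OP_REREPLICATE;   /* assumed name, one of enum bch_data_ops */
        /* a real caller also sets the range of keys the job should walk */

        job_fd = ioctl(fs_fd, BCH_IOCTL_DATA, &arg);   /* returns an O_CLOEXEC fd */
        if (job_fd < 0)
            return -1;

        while (read(job_fd, &e, sizeof(e)) == sizeof(e) &&
               e.type == BCH_DATA_EVENT_PROGRESS)
            printf("%llu/%llu sectors done\n",
                   (unsigned long long) e.p.sectors_done,
                   (unsigned long long) e.p.sectors_total);

        close(job_fd);   /* per the comment above, closing the fd stops the job */
        return 0;
    }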
View File

@ -13,8 +13,6 @@
void bch2_to_binary(char *, const u64 *, unsigned);
#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
/* bkey with split value, const */
struct bkey_s_c {
const struct bkey *k;
@ -590,25 +588,31 @@ BKEY_VAL_ACCESSORS(quota, BCH_QUOTA);
/* byte order helpers */
#if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN)
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
static inline unsigned high_word_offset(const struct bkey_format *f)
{
return f->key_u64s - 1;
}
#define high_bit_offset 0
#define nth_word(p, n) ((p) - (n))
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
static inline unsigned high_word_offset(const struct bkey_format *f)
{
return 0;
}
#define high_bit_offset KEY_PACKED_BITS_START
#define nth_word(p, n) ((p) + (n))
#else
#error edit for your odd byteorder.
#endif
#ifdef __LITTLE_ENDIAN
#define high_bit_offset 0
#define __high_word(u64s, k) ((k)->_data + (u64s) - 1)
#define nth_word(p, n) ((p) - (n))
#else
#define high_bit_offset KEY_PACKED_BITS_START
#define __high_word(u64s, k) ((k)->_data)
#define nth_word(p, n) ((p) + (n))
#endif
#define high_word(format, k) __high_word((format)->key_u64s, k)
#define high_word(f, k) ((k)->_data + high_word_offset(f))
#define next_word(p) nth_word(p, 1)
#define prev_word(p) nth_word(p, -1)
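high_word()/nth_word() hide which end of a packed key holds the most significant word, so code that walks key bits (packing, comparison, the bkey float code in bset.c) can be written once for both byte orders. A sketch that walks a packed key from the most to the least significant word using only the helpers defined above; the dump function itself is illustrative:

    static void dump_packed_key_words(const struct bkey_format *f,
                                      const struct bkey_packed *k)
    {
        const u64 *p = high_word(f, k);
        unsigned i;

        /* most significant word first, regardless of host endianness */
        for (i = 0; i < f->key_u64s; i++, p = next_word(p))
            pr_debug("word %u: %016llx\n", i, (unsigned long long) *p);
    }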

View File

@ -6,6 +6,7 @@
*/
#include "bcachefs.h"
#include "btree_cache.h"
#include "bset.h"
#include "eytzinger.h"
#include "util.h"
@ -438,6 +439,10 @@ void bch2_btree_keys_free(struct btree *b)
b->aux_data = NULL;
}
#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif
int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp)
{
b->page_order = page_order;
@ -672,7 +677,7 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k,
* (and then the bits we want are at the high end, so we shift them
* back down):
*/
#ifdef __LITTLE_ENDIAN
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
v >>= f->exponent & 7;
#else
v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16);
@ -761,7 +766,7 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
* Then we calculate the actual shift value, from the start of the key
* (k->_data), to get the key bits starting at exponent:
*/
#ifdef __LITTLE_ENDIAN
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
EBUG_ON(shift + bits > b->format.key_u64s * 64);
@ -964,10 +969,14 @@ void bch2_bset_init_first(struct btree *b, struct bset *i)
set_btree_bset(b, t, i);
}
void bch2_bset_init_next(struct btree *b, struct bset *i)
void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
struct btree_node_entry *bne)
{
struct bset *i = &bne->keys;
struct bset_tree *t;
BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
BUG_ON(b->nsets >= MAX_BSETS);
memset(i, 0, sizeof(*i));

View File

@ -157,9 +157,6 @@ static inline bool btree_keys_expensive_checks(const struct btree *b)
#endif
}
struct btree_node_iter;
struct btree_node_iter_set;
enum bset_aux_tree_type {
BSET_NO_AUX_TREE,
BSET_RO_AUX_TREE,
@ -342,7 +339,8 @@ int bch2_btree_keys_alloc(struct btree *, unsigned, gfp_t);
void bch2_btree_keys_init(struct btree *, bool *);
void bch2_bset_init_first(struct btree *, struct bset *);
void bch2_bset_init_next(struct btree *, struct bset *);
void bch2_bset_init_next(struct bch_fs *, struct btree *,
struct btree_node_entry *);
void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
void bch2_bset_fix_invalidated_key(struct btree *, struct bset_tree *,
struct bkey_packed *);
@ -420,14 +418,6 @@ static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
/* Btree key iteration */
struct btree_node_iter {
u8 is_extents;
struct btree_node_iter_set {
u16 k, end;
} data[MAX_BSETS];
};
static inline void __bch2_btree_node_iter_init(struct btree_node_iter *iter,
bool is_extents)
{

View File

@ -554,7 +554,8 @@ out:
b->uncompacted_whiteout_u64s = 0;
bch2_btree_keys_init(b, &c->expensive_debug_checks);
bch2_time_stats_update(&c->btree_node_mem_alloc_time, start_time);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
start_time);
return b;
err:

View File

@ -27,6 +27,7 @@
#include <linux/kthread.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
#include <trace/events/bcachefs.h>
struct range_checks {
@ -264,10 +265,11 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
gc_pos_set(c, gc_pos_btree_node(b));
if (max_stale > 32)
if (max_stale > 64)
bch2_btree_node_rewrite(c, &iter,
b->data->keys.seq,
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_NOWAIT|
BTREE_INSERT_GC_LOCK_HELD);
else if (!btree_gc_rewrite_disabled(c) &&
(btree_gc_always_rewrite(c) || max_stale > 16))
@ -557,7 +559,7 @@ void bch2_gc(struct bch_fs *c)
out:
up_write(&c->gc_lock);
trace_gc_end(c);
bch2_time_stats_update(&c->btree_gc_time, start_time);
bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
/*
* Wake up allocator in case it was waiting for buckets
@ -813,6 +815,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
{
struct btree_iter iter;
struct btree *b;
bool kthread = (current->flags & PF_KTHREAD) != 0;
unsigned i;
/* Sliding window of adjacent btree nodes */
@ -859,7 +862,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
lock_seq[0] = merge[0]->lock.state.seq;
if (test_bit(BCH_FS_GC_STOPPING, &c->flags)) {
if (kthread && kthread_should_stop()) {
bch2_btree_iter_unlock(&iter);
return -ESHUTDOWN;
}
@ -958,13 +961,15 @@ static int bch2_gc_thread(void *arg)
void bch2_gc_thread_stop(struct bch_fs *c)
{
set_bit(BCH_FS_GC_STOPPING, &c->flags);
if (c->gc_thread)
kthread_stop(c->gc_thread);
struct task_struct *p;
p = c->gc_thread;
c->gc_thread = NULL;
clear_bit(BCH_FS_GC_STOPPING, &c->flags);
if (p) {
kthread_stop(p);
put_task_struct(p);
}
}
int bch2_gc_thread_start(struct bch_fs *c)
@ -973,12 +978,13 @@ int bch2_gc_thread_start(struct bch_fs *c)
BUG_ON(c->gc_thread);
p = kthread_create(bch2_gc_thread, c, "bcache_gc");
p = kthread_create(bch2_gc_thread, c, "bch_gc");
if (IS_ERR(p))
return PTR_ERR(p);
get_task_struct(p);
c->gc_thread = p;
wake_up_process(c->gc_thread);
wake_up_process(p);
return 0;
}
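The gc thread is now stopped via kthread_should_stop()/kthread_stop() instead of the BCH_FS_GC_STOPPING flag, and the starter takes a task reference with get_task_struct() so the task_struct stays valid across kthread_stop() even if the thread exits on its own first. The thread side of that contract looks roughly like this (a generic sketch, not the actual bch2_gc_thread body):

    #include <linux/kthread.h>
    #include <linux/sched.h>

    static int my_gc_thread(void *arg)
    {
        while (!kthread_should_stop()) {
            /* ... one pass of work ... */

            set_current_state(TASK_INTERRUPTIBLE);
            if (kthread_should_stop()) {
                __set_current_state(TASK_RUNNING);
                break;
            }
            schedule();
        }
        return 0;
    }

Checking the flag again after setting TASK_INTERRUPTIBLE closes the race where kthread_stop() is called between finishing a pass and going to sleep: kthread_stop() sets the flag and wakes the task, so either the check sees it or schedule() returns immediately.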

View File

@ -627,7 +627,8 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order));
if (sorting_entire_node)
bch2_time_stats_update(&c->btree_sort_time, start_time);
bch2_time_stats_update(&c->times[BCH_TIME_btree_sort],
start_time);
/* Make sure we preserve bset journal_seq: */
for (t = b->set + start_idx; t < b->set + end_idx; t++)
@ -801,7 +802,7 @@ void bch2_btree_sort_into(struct bch_fs *c,
&dst->format,
true);
bch2_time_stats_update(&c->btree_sort_time, start_time);
bch2_time_stats_update(&c->times[BCH_TIME_btree_sort], start_time);
set_btree_bset_end(dst, dst->set);
@ -877,7 +878,7 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
bne = want_new_bset(c, b);
if (bne)
bch2_bset_init_next(b, &bne->keys);
bch2_bset_init_next(c, b, bne);
bch2_btree_build_aux_trees(b);
@ -1382,7 +1383,7 @@ start:
}
}
bch2_time_stats_update(&c->btree_read_time, rb->start_time);
bch2_time_stats_update(&c->times[BCH_TIME_btree_read], rb->start_time);
bio_put(&rb->bio);
clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
@ -1742,6 +1743,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
BUG_ON((b->will_make_reachable != 0) != !b->written);
BUG_ON(b->written >= c->opts.btree_node_size);
BUG_ON(b->written & (c->opts.block_size - 1));
BUG_ON(bset_written(b, btree_bset_last(b)));
BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
@ -1972,7 +1974,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
bne = want_new_bset(c, b);
if (bne)
bch2_bset_init_next(b, &bne->keys);
bch2_bset_init_next(c, b, bne);
bch2_btree_build_aux_trees(b);

View File

@ -133,7 +133,7 @@ do { \
\
six_unlock_read(&(_b)->lock); \
btree_node_wait_on_io(_b); \
six_lock_read(&(_b)->lock); \
btree_node_lock_type(c, b, SIX_LOCK_read); \
} \
} while (0)

View File

@ -42,25 +42,17 @@ void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
six_unlock_write(&b->lock);
}
void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
{
struct bch_fs *c = iter->c;
struct btree_iter *linked;
unsigned readers = 0;
EBUG_ON(iter->l[b->level].b != b);
EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq);
if (six_trylock_write(&b->lock))
return;
for_each_linked_btree_iter(iter, linked)
if (linked->l[b->level].b == b &&
btree_node_read_locked(linked, b->level))
readers++;
if (likely(!readers)) {
six_lock_write(&b->lock);
} else {
/*
* Must drop our read locks before calling six_lock_write() -
* six_unlock() won't do wakeups until the reader count
@ -69,11 +61,10 @@ void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
*/
atomic64_sub(__SIX_VAL(read_lock, readers),
&b->lock.state.counter);
six_lock_write(&b->lock);
btree_node_lock_type(c, b, SIX_LOCK_write);
atomic64_add(__SIX_VAL(read_lock, readers),
&b->lock.state.counter);
}
}
bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
{
@ -135,6 +126,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
struct btree_iter *iter,
enum six_lock_type type)
{
struct bch_fs *c = iter->c;
struct btree_iter *linked;
/* Can't have children locked before ancestors: */
@ -206,7 +198,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
}
}
six_lock_type(&b->lock, type);
__btree_node_lock_type(c, b, type);
return true;
}

View File

@ -4,72 +4,6 @@
#include <linux/dynamic_fault.h>
#include "btree_types.h"
#include "bset.h"
#define BTREE_ITER_SLOTS (1 << 0)
#define BTREE_ITER_INTENT (1 << 1)
#define BTREE_ITER_PREFETCH (1 << 2)
/*
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
* @pos or the first key strictly greater than @pos
*/
#define BTREE_ITER_IS_EXTENTS (1 << 3)
/*
* indicates we need to call bch2_btree_iter_traverse() to revalidate iterator:
*/
#define BTREE_ITER_AT_END_OF_LEAF (1 << 4)
#define BTREE_ITER_ERROR (1 << 5)
enum btree_iter_uptodate {
BTREE_ITER_UPTODATE = 0,
BTREE_ITER_NEED_PEEK = 1,
BTREE_ITER_NEED_RELOCK = 2,
BTREE_ITER_NEED_TRAVERSE = 3,
BTREE_ITER_END = 4,
};
/*
* @pos - iterator's current position
* @level - current btree depth
* @locks_want - btree level below which we start taking intent locks
* @nodes_locked - bitmask indicating which nodes in @nodes are locked
* @nodes_intent_locked - bitmask indicating which locks are intent locks
*/
struct btree_iter {
struct bch_fs *c;
struct bpos pos;
u8 flags;
unsigned uptodate:4;
enum btree_id btree_id:4;
unsigned level:4,
locks_want:4,
nodes_locked:4,
nodes_intent_locked:4;
struct btree_iter_level {
struct btree *b;
struct btree_node_iter iter;
} l[BTREE_MAX_DEPTH];
u32 lock_seq[BTREE_MAX_DEPTH];
/*
* Current unpacked key - so that bch2_btree_iter_next()/
* bch2_btree_iter_next_slot() can correctly advance pos.
*/
struct bkey k;
/*
* Circular linked list of linked iterators: linked iterators share
* locks (e.g. two linked iterators may have the same node intent
* locked, or read and write locked, at the same time), and insertions
* through one iterator won't invalidate the other linked iterators.
*/
/* Must come last: */
struct btree_iter *next;
};
static inline void btree_iter_set_dirty(struct btree_iter *iter,
enum btree_iter_uptodate u)

View File

@ -98,6 +98,39 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
mark_btree_node_unlocked(iter, level);
}
static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
{
switch (type) {
case SIX_LOCK_read:
return BCH_TIME_btree_lock_contended_read;
case SIX_LOCK_intent:
return BCH_TIME_btree_lock_contended_intent;
case SIX_LOCK_write:
return BCH_TIME_btree_lock_contended_write;
default:
BUG();
}
}
/*
* wrapper around six locks that just traces lock contended time
*/
static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b,
enum six_lock_type type)
{
u64 start_time = local_clock();
six_lock_type(&b->lock, type);
bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
}
static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b,
enum six_lock_type type)
{
if (!six_trylock_type(&b->lock, type))
__btree_node_lock_type(c, b, type);
}
bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
struct btree_iter *, enum six_lock_type);
@ -125,7 +158,17 @@ static inline bool bch2_btree_node_relock(struct btree_iter *iter,
bool bch2_btree_iter_relock(struct btree_iter *);
void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
void bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
{
EBUG_ON(iter->l[b->level].b != b);
EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq);
if (!six_trylock_write(&b->lock))
__bch2_btree_node_lock_write(b, iter);
}
#endif /* _BCACHEFS_BTREE_LOCKING_H */

View File

@ -176,6 +176,79 @@ struct btree_cache {
struct closure_waitlist alloc_wait;
};
struct btree_node_iter {
u8 is_extents;
struct btree_node_iter_set {
u16 k, end;
} data[MAX_BSETS];
};
#define BTREE_ITER_SLOTS (1 << 0)
#define BTREE_ITER_INTENT (1 << 1)
#define BTREE_ITER_PREFETCH (1 << 2)
/*
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
* @pos or the first key strictly greater than @pos
*/
#define BTREE_ITER_IS_EXTENTS (1 << 3)
/*
* indicates we need to call bch2_btree_iter_traverse() to revalidate iterator:
*/
#define BTREE_ITER_AT_END_OF_LEAF (1 << 4)
#define BTREE_ITER_ERROR (1 << 5)
enum btree_iter_uptodate {
BTREE_ITER_UPTODATE = 0,
BTREE_ITER_NEED_PEEK = 1,
BTREE_ITER_NEED_RELOCK = 2,
BTREE_ITER_NEED_TRAVERSE = 3,
BTREE_ITER_END = 4,
};
/*
* @pos - iterator's current position
* @level - current btree depth
* @locks_want - btree level below which we start taking intent locks
* @nodes_locked - bitmask indicating which nodes in @nodes are locked
* @nodes_intent_locked - bitmask indicating which locks are intent locks
*/
struct btree_iter {
struct bch_fs *c;
struct bpos pos;
u8 flags;
unsigned uptodate:4;
enum btree_id btree_id:4;
unsigned level:4,
locks_want:4,
nodes_locked:4,
nodes_intent_locked:4;
struct btree_iter_level {
struct btree *b;
struct btree_node_iter iter;
} l[BTREE_MAX_DEPTH];
u32 lock_seq[BTREE_MAX_DEPTH];
/*
* Current unpacked key - so that bch2_btree_iter_next()/
* bch2_btree_iter_next_slot() can correctly advance pos.
*/
struct bkey k;
/*
* Circular linked list of linked iterators: linked iterators share
* locks (e.g. two linked iterators may have the same node intent
* locked, or read and write locked, at the same time), and insertions
* through one iterator won't invalidate the other linked iterators.
*/
/* Must come last: */
struct btree_iter *next;
};
#define BTREE_FLAG(flag) \
static inline bool btree_node_ ## flag(struct btree *b) \
{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \

View File

@ -237,7 +237,7 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
clear_btree_node_noevict(b);
six_lock_write(&b->lock);
btree_node_lock_type(c, b, SIX_LOCK_write);
bch2_btree_node_hash_remove(&c->btree_cache, b);
@ -622,7 +622,7 @@ static void btree_update_nodes_reachable(struct closure *cl)
* b->will_make_reachable prevented it from being written, so
* write it now if it needs to be written:
*/
six_lock_read(&b->lock);
btree_node_lock_type(c, b, SIX_LOCK_read);
bch2_btree_node_write_cond(c, b, btree_node_need_write(b));
six_unlock_read(&b->lock);
mutex_lock(&c->btree_interior_update_lock);
@ -647,8 +647,10 @@ static void btree_update_wait_on_journal(struct closure *cl)
ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
if (ret < 0)
goto err;
if (!ret)
if (!ret) {
continue_at(cl, btree_update_wait_on_journal, system_wq);
return;
}
bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
err:
@ -679,7 +681,7 @@ retry:
if (!six_trylock_read(&b->lock)) {
mutex_unlock(&c->btree_interior_update_lock);
six_lock_read(&b->lock);
btree_node_lock_type(c, b, SIX_LOCK_read);
six_unlock_read(&b->lock);
goto retry;
}
@ -720,7 +722,7 @@ retry:
if (!six_trylock_read(&b->lock)) {
mutex_unlock(&c->btree_interior_update_lock);
six_lock_read(&b->lock);
btree_node_lock_type(c, b, SIX_LOCK_read);
six_unlock_read(&b->lock);
goto retry;
}
@ -1456,7 +1458,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
bch2_btree_iter_node_replace(iter, n2);
bch2_btree_iter_node_replace(iter, n1);
bch2_time_stats_update(&c->btree_split_time, start_time);
bch2_time_stats_update(&c->times[BCH_TIME_btree_split], start_time);
}
static void
@ -1795,8 +1797,8 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
bch2_btree_node_write(c, n, SIX_LOCK_intent);
if (parent) {
bch2_btree_insert_node(as, parent, iter,
&keylist_single(&n->key));
bch2_keylist_add(&as->parent_keys, &n->key);
bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
} else {
bch2_btree_set_root(as, n, iter);
}

View File

@ -226,11 +226,30 @@ static inline bool bset_unwritten(struct btree *b, struct bset *i)
return (void *) i > write_block(b);
}
static inline unsigned bset_end_sector(struct bch_fs *c, struct btree *b,
struct bset *i)
static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
struct btree *b,
void *end)
{
return round_up(bset_byte_offset(b, vstruct_end(i)),
block_bytes(c)) >> 9;
ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
b->whiteout_u64s +
b->uncompacted_whiteout_u64s;
ssize_t total = c->opts.btree_node_size << 6;
return total - used;
}
static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
struct btree *b)
{
ssize_t remaining = __bch_btree_u64s_remaining(c, b,
btree_bkey_last(b, bset_tree_last(b)));
BUG_ON(remaining < 0);
if (bset_written(b, btree_bset_last(b)))
return 0;
return remaining;
}
static inline unsigned btree_write_set_buffer(struct btree *b)
@ -246,20 +265,19 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
struct btree *b)
{
struct bset *i = btree_bset_last(b);
unsigned offset = max_t(unsigned, b->written << 9,
bset_byte_offset(b, vstruct_end(i)));
ssize_t remaining_space = (ssize_t) btree_bytes(c) - (ssize_t)
(offset + sizeof(struct btree_node_entry) +
b->whiteout_u64s * sizeof(u64) +
b->uncompacted_whiteout_u64s * sizeof(u64));
struct btree_node_entry *bne = max(write_block(b),
(void *) btree_bkey_last(b, bset_tree_last(b)));
ssize_t remaining_space =
__bch_btree_u64s_remaining(c, b, &bne->keys.start[0]);
EBUG_ON(offset > btree_bytes(c));
if ((unlikely(bset_written(b, i)) &&
remaining_space > block_bytes(c)) ||
(unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
remaining_space > btree_write_set_buffer(b)))
return (void *) b->data + offset;
if (unlikely(bset_written(b, i))) {
if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
return bne;
} else {
if (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
return bne;
}
return NULL;
}
@ -285,23 +303,6 @@ static inline void reserve_whiteout(struct btree *b, struct bset_tree *t,
}
}
static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
struct btree *b)
{
struct bset *i = btree_bset_last(b);
unsigned used = bset_byte_offset(b, vstruct_end(i)) / sizeof(u64) +
b->whiteout_u64s +
b->uncompacted_whiteout_u64s;
unsigned total = c->opts.btree_node_size << 6;
EBUG_ON(used > total);
if (bset_written(b, i))
return 0;
return total - used;
}
/*
* write lock must be held on @b (else the dirty bset that we were going to
* insert into could be written out from under us)

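In __bch_btree_u64s_remaining() the node's capacity is computed as c->opts.btree_node_size << 6. Assuming btree_node_size is expressed in 512-byte sectors, that shift is just the unit conversion from sectors to u64s: sectors * 512 bytes / 8 bytes per u64 = sectors * 64. A worked sketch:

    /* sketch: unit conversion behind "btree_node_size << 6", assuming the option
     * is in 512-byte sectors */
    static size_t btree_node_u64s(unsigned btree_node_size_sectors)
    {
        /* e.g. 512 sectors -> 512 * 512 = 262144 bytes -> 262144 / 8 = 32768 u64s */
        return (size_t) btree_node_size_sectors << 6;   /* * 512 / 8 == * 64 */
    }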
View File

@ -108,7 +108,7 @@ static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
struct btree_write *w = container_of(pin, struct btree_write, journal);
struct btree *b = container_of(w, struct btree, writes[i]);
six_lock_read(&b->lock);
btree_node_lock_type(c, b, SIX_LOCK_read);
bch2_btree_node_write_cond(c, b,
(btree_current_write(b) == w &&
w->journal.pin_list == journal_seq_pin(j, seq)));

View File

@ -555,9 +555,9 @@ static void bch2_mark_pointer(struct bch_fs *c,
return;
}
v = READ_ONCE(g->_mark.counter);
v = atomic64_read(&g->_mark.v);
do {
new.counter = old.counter = v;
new.v.counter = old.v.counter = v;
saturated = 0;
/*
@ -600,9 +600,9 @@ static void bch2_mark_pointer(struct bch_fs *c,
g->_mark = new;
break;
}
} while ((v = cmpxchg(&g->_mark.counter,
old.counter,
new.counter)) != old.counter);
} while ((v = atomic64_cmpxchg(&g->_mark.v,
old.v.counter,
new.v.counter)) != old.v.counter);
bch2_dev_usage_update(c, ca, old, new);
@ -957,7 +957,8 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
kvpfree(ca->buckets_dirty,
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
kvpfree(ca->buckets, sizeof(struct bucket_array) +
kvpfree(rcu_dereference_protected(ca->buckets, 1),
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
free_percpu(ca->usage_percpu);

View File

@ -16,15 +16,15 @@
#define bucket_cmpxchg(g, new, expr) \
({ \
u64 _v = READ_ONCE((g)->_mark.counter); \
u64 _v = atomic64_read(&(g)->_mark.v); \
struct bucket_mark _old; \
\
do { \
(new).counter = _old.counter = _v; \
(new).v.counter = _old.v.counter = _v; \
expr; \
} while ((_v = cmpxchg(&(g)->_mark.counter, \
_old.counter, \
(new).counter)) != _old.counter);\
} while ((_v = atomic64_cmpxchg(&(g)->_mark.v, \
_old.v.counter, \
(new).v.counter)) != _old.v.counter);\
_old; \
})

View File

@ -6,7 +6,7 @@
struct bucket_mark {
union {
struct {
u64 counter;
atomic64_t v;
};
struct {

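bucket_mark's backing word changes from a plain u64 (updated with READ_ONCE()/cmpxchg()) to an atomic64_t updated with atomic64_read()/atomic64_cmpxchg(). On 32-bit machines a bare 64-bit load or cmpxchg is not guaranteed to be atomic, while atomic64_t always is, falling back to a generic spinlock-based implementation where the hardware lacks 64-bit primitives. The same retry-loop shape, written against portable C11 <stdatomic.h> purely as a userspace analogue of the kernel API:

    #include <stdatomic.h>
    #include <stdint.h>

    struct mark {
        _Atomic uint64_t v;   /* several small counters packed into one 64-bit word */
    };

    /* atomically increment the low 16 bits, leaving the other fields untouched */
    static void mark_inc_low16(struct mark *m)
    {
        uint64_t old = atomic_load(&m->v);
        uint64_t new;

        do {
            new = (old & ~0xffffULL) | ((old + 1) & 0xffff);
        } while (!atomic_compare_exchange_weak(&m->v, &old, new));
        /* on failure, atomic_compare_exchange_weak reloads old for the next try */
    }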
View File

@ -54,6 +54,7 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
return ca;
}
#if 0
static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
{
struct bch_ioctl_assemble arg;
@ -127,14 +128,17 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg
return 0;
}
#endif
static long bch2_global_ioctl(unsigned cmd, void __user *arg)
{
switch (cmd) {
#if 0
case BCH_IOCTL_ASSEMBLE:
return bch2_ioctl_assemble(arg);
case BCH_IOCTL_INCREMENTAL:
return bch2_ioctl_incremental(arg);
#endif
default:
return -ENOTTY;
}
@ -148,6 +152,7 @@ static long bch2_ioctl_query_uuid(struct bch_fs *c,
sizeof(c->sb.user_uuid));
}
#if 0
static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg)
{
if (arg.flags || arg.pad)
@ -161,6 +166,7 @@ static long bch2_ioctl_stop(struct bch_fs *c)
bch2_fs_stop(c);
return 0;
}
#endif
static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
{
@ -294,18 +300,19 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
{
struct bch_data_ctx *ctx = file->private_data;
struct bch_fs *c = ctx->c;
struct bch_ioctl_data_progress p = {
.data_type = ctx->stats.data_type,
.btree_id = ctx->stats.iter.btree_id,
.pos = ctx->stats.iter.pos,
.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
.sectors_total = bch2_fs_sectors_used(c, bch2_fs_usage_read(c)),
struct bch_ioctl_data_event e = {
.type = BCH_DATA_EVENT_PROGRESS,
.p.data_type = ctx->stats.data_type,
.p.btree_id = ctx->stats.iter.btree_id,
.p.pos = ctx->stats.iter.pos,
.p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
.p.sectors_total = bch2_fs_sectors_used(c, bch2_fs_usage_read(c)),
};
if (len != sizeof(p))
if (len < sizeof(e))
return -EINVAL;
return copy_to_user(buf, &p, sizeof(p)) ?: sizeof(p);
return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e);
}
static const struct file_operations bcachefs_data_ops = {
@ -419,7 +426,7 @@ static long bch2_ioctl_usage(struct bch_fs *c,
if (ca->dev_idx >= arg.nr_devices) {
percpu_ref_put(&ca->ref);
return -ENOSPC;
return -ERANGE;
}
if (percpu_ref_tryget(&ca->io_ref)) {
@ -539,10 +546,12 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
return -EPERM;
switch (cmd) {
#if 0
case BCH_IOCTL_START:
BCH_IOCTL(start, struct bch_ioctl_start);
case BCH_IOCTL_STOP:
return bch2_ioctl_stop(c);
#endif
case BCH_IOCTL_READ_SUPER:
BCH_IOCTL(read_super, struct bch_ioctl_read_super);
case BCH_IOCTL_DISK_GET_IDX:

View File

@ -421,7 +421,7 @@ static struct bch_csum bch2_checksum_merge(unsigned type,
BUG_ON(!bch2_checksum_mergeable(type));
while (b_len) {
unsigned b = min(b_len, PAGE_SIZE);
unsigned b = min_t(unsigned, b_len, PAGE_SIZE);
a.lo = bch2_checksum_update(type, a.lo,
page_address(ZERO_PAGE(0)), b);

View File

@ -42,7 +42,8 @@ void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
}
struct io_clock_wait {
struct io_timer timer;
struct io_timer io_timer;
struct timer_list cpu_timer;
struct task_struct *task;
int expired;
};
@ -50,7 +51,16 @@ struct io_clock_wait {
static void io_clock_wait_fn(struct io_timer *timer)
{
struct io_clock_wait *wait = container_of(timer,
struct io_clock_wait, timer);
struct io_clock_wait, io_timer);
wait->expired = 1;
wake_up_process(wait->task);
}
static void io_clock_cpu_timeout(struct timer_list *timer)
{
struct io_clock_wait *wait = container_of(timer,
struct io_clock_wait, cpu_timer);
wait->expired = 1;
wake_up_process(wait->task);
@ -61,35 +71,38 @@ void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until)
struct io_clock_wait wait;
/* XXX: calculate sleep time rigorously */
wait.timer.expire = until;
wait.timer.fn = io_clock_wait_fn;
wait.io_timer.expire = until;
wait.io_timer.fn = io_clock_wait_fn;
wait.task = current;
wait.expired = 0;
bch2_io_timer_add(clock, &wait.timer);
bch2_io_timer_add(clock, &wait.io_timer);
schedule();
bch2_io_timer_del(clock, &wait.timer);
bch2_io_timer_del(clock, &wait.io_timer);
}
/*
* _only_ to be used from a kthread
*/
void bch2_kthread_io_clock_wait(struct io_clock *clock,
unsigned long until)
unsigned long io_until,
unsigned long cpu_timeout)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct io_clock_wait wait;
/* XXX: calculate sleep time rigorously */
wait.timer.expire = until;
wait.timer.fn = io_clock_wait_fn;
wait.io_timer.expire = io_until;
wait.io_timer.fn = io_clock_wait_fn;
wait.task = current;
wait.expired = 0;
bch2_io_timer_add(clock, &wait.timer);
bch2_io_timer_add(clock, &wait.io_timer);
timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0);
if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
if (kthread_should_stop())
if (kthread && kthread_should_stop())
break;
if (wait.expired)
@ -100,7 +113,9 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
}
__set_current_state(TASK_RUNNING);
bch2_io_timer_del(clock, &wait.timer);
del_singleshot_timer_sync(&wait.cpu_timer);
destroy_timer_on_stack(&wait.cpu_timer);
bch2_io_timer_del(clock, &wait.io_timer);
}
static struct io_timer *get_expired_timer(struct io_clock *clock,

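bch2_kthread_io_clock_wait() now takes two deadlines: one in IO-clock time (advanced by bch2_increment_clock() as IO is issued) and one in wall-clock jiffies, waking on whichever comes first or when the kthread is asked to stop; MAX_SCHEDULE_TIMEOUT leaves the CPU-side timer unarmed. A call-site sketch; the io_clock `now` field and c->io_clock[WRITE] are assumptions about code not shown in this diff:

    /* wait until roughly 1024 more sectors of writes have been charged to the
     * IO clock, but give up after at most 10 seconds of wall-clock time */
    bch2_kthread_io_clock_wait(&c->io_clock[WRITE],
                               atomic_long_read(&c->io_clock[WRITE].now) + 1024,
                               jiffies + 10 * HZ);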
View File

@ -3,7 +3,8 @@
void bch2_io_timer_add(struct io_clock *, struct io_timer *);
void bch2_io_timer_del(struct io_clock *, struct io_timer *);
void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long);
void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long,
unsigned long);
void bch2_increment_clock(struct bch_fs *, unsigned, int);
void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);

View File

@ -480,7 +480,7 @@ static const unsigned bch2_compression_opt_to_feature[] = {
#undef BCH_FEATURE_NONE
int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
{
int ret = 0;
@ -529,26 +529,6 @@ void bch2_fs_compress_exit(struct bch_fs *c)
mempool_exit(&c->compression_bounce[READ]);
}
static void *mempool_kvpmalloc(gfp_t gfp_mask, void *pool_data)
{
size_t size = (size_t)pool_data;
return kvpmalloc(size, gfp_mask);
}
void mempool_kvpfree(void *element, void *pool_data)
{
size_t size = (size_t)pool_data;
kvpfree(element, size);
}
static int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
{
return !mempool_initialized(pool)
? mempool_init(pool, min_nr, mempool_kvpmalloc,
mempool_kvpfree, (void *) size)
: 0;
}
static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
{
size_t max_extent = c->sb.encoded_extent_max << 9;
@ -611,6 +591,9 @@ have_compressed:
if (i->decompress_workspace)
decompress_workspace_needed = true;
if (mempool_initialized(&c->compress_workspace[i->type]))
continue;
ret = mempool_init_kvpmalloc_pool(
&c->compress_workspace[i->type],
1, i->compress_workspace);

View File

@ -16,7 +16,7 @@ static int group_cmp(const void *_l, const void *_r)
strncmp(l->label, r->label, sizeof(l->label));
}
const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
struct bch_sb_field *f)
{
struct bch_sb_field_disk_groups *groups =
@ -162,7 +162,8 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
}
}
old_g = c->disk_groups;
old_g = rcu_dereference_protected(c->disk_groups,
lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->disk_groups, cpu_g);
if (old_g)
kfree_rcu(old_g, rcu);
@ -193,6 +194,36 @@ const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned targe
}
}
bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
{
struct target t = target_decode(target);
switch (t.type) {
case TARGET_NULL:
return false;
case TARGET_DEV:
return dev == t.dev;
case TARGET_GROUP: {
struct bch_disk_groups_cpu *g;
const struct bch_devs_mask *m;
bool ret;
rcu_read_lock();
g = rcu_dereference(c->disk_groups);
m = t.group < g->nr && !g->entries[t.group].deleted
? &g->entries[t.group].devs
: NULL;
ret = m ? test_bit(dev, m->d) : false;
rcu_read_unlock();
return ret;
}
default:
BUG();
}
}
static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
unsigned parent,
const char *name, unsigned namelen)

View File

@ -53,34 +53,8 @@ static inline struct target target_decode(unsigned target)
return (struct target) { .type = TARGET_NULL };
}
static inline bool dev_in_target(struct bch_dev *ca, unsigned target)
{
struct target t = target_decode(target);
switch (t.type) {
case TARGET_NULL:
return false;
case TARGET_DEV:
return ca->dev_idx == t.dev;
case TARGET_GROUP:
return ca->mi.group && ca->mi.group - 1 == t.group;
default:
BUG();
}
}
static inline bool dev_idx_in_target(struct bch_fs *c, unsigned dev, unsigned target)
{
bool ret;
rcu_read_lock();
ret = dev_in_target(rcu_dereference(c->devs[dev]), target);
rcu_read_unlock();
return ret;
}
const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);
int bch2_disk_path_find(struct bch_sb_handle *, const char *);
int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);

View File

@ -144,7 +144,7 @@ bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group
const struct bch_extent_ptr *ptr;
extent_for_each_ptr(e, ptr) {
struct bch_dev *ca = c->devs[ptr->dev];
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (ca->mi.group &&
ca->mi.group - 1 == group)
@ -159,13 +159,11 @@ bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned targ
{
const struct bch_extent_ptr *ptr;
extent_for_each_ptr(e, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (dev_in_target(ca, target) &&
(!ptr->cached || !ptr_stale(ca, ptr)))
extent_for_each_ptr(e, ptr)
if (bch2_dev_in_target(c, ptr->dev, target) &&
(!ptr->cached ||
!ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
return ptr;
}
return NULL;
}
@ -732,7 +730,7 @@ err:
bch2_fs_bug(c, "%s btree pointer %s: bucket %zi "
"gen %i mark %08x",
err, buf, PTR_BUCKET_NR(ca, ptr),
mark.gen, (unsigned) mark.counter);
mark.gen, (unsigned) mark.v.counter);
}
void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
@ -2024,7 +2022,7 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
int n = bch2_extent_ptr_durability(c, ptr);
if (n && n <= extra &&
!dev_in_target(c->devs[ptr->dev], target)) {
!bch2_dev_in_target(c, ptr->dev, target)) {
ptr->cached = true;
extra -= n;
}

View File

@ -278,24 +278,38 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
.uncompressed_size = k->size,
.live_size = k->size,
};
case BCH_EXTENT_CRC32:
return (struct bch_extent_crc_unpacked) {
case BCH_EXTENT_CRC32: {
struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
common_fields(crc->crc32),
.csum.lo = (__force __le64) crc->crc32.csum,
};
case BCH_EXTENT_CRC64:
return (struct bch_extent_crc_unpacked) {
*((__le32 *) &ret.csum.lo) = crc->crc32.csum;
memcpy(&ret.csum.lo, &crc->crc32.csum,
sizeof(crc->crc32.csum));
return ret;
}
case BCH_EXTENT_CRC64: {
struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
common_fields(crc->crc64),
.nonce = crc->crc64.nonce,
.csum.lo = (__force __le64) crc->crc64.csum_lo,
.csum.hi = (__force __le64) crc->crc64.csum_hi,
};
case BCH_EXTENT_CRC128:
return (struct bch_extent_crc_unpacked) {
*((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi;
return ret;
}
case BCH_EXTENT_CRC128: {
struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
common_fields(crc->crc128),
.nonce = crc->crc128.nonce,
.csum = crc->crc128.csum,
};
return ret;
}
default:
BUG();
}
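The reworked switch above is part of this commit's big/little-endian fixes: assigning a narrow checksum into a 64-bit on-disk field by integer widening lands the meaningful bytes in a different half of the field depending on host byte order, so the unpack now places the bytes explicitly. The following standalone sketch (plain C with a made-up value, not the actual bcachefs types) shows the general difference between the two approaches:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint32_t csum32 = 0x11223344;		/* hypothetical 32-bit checksum */
	uint64_t lo_assign = 0, lo_copy = 0;

	lo_assign = csum32;			/* widening assignment: value-preserving */
	memcpy(&lo_copy, &csum32, sizeof(csum32)); /* byte copy into the first 4 bytes */

	/*
	 * On a little-endian host both prints match; on a big-endian host they
	 * differ, because the widened value's bytes sit in the last four bytes
	 * of lo_assign instead of the first four.
	 */
	printf("assign %016llx  copy %016llx\n",
	       (unsigned long long) lo_assign, (unsigned long long) lo_copy);
	return 0;
}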

View File

@ -678,7 +678,7 @@ static void bch2_clear_page_bits(struct page *page)
if (!PagePrivate(page))
return;
s = xchg(page_state(page), (struct bch_page_state) { .v = 0 });
s.v = xchg(&page_state(page)->v, 0);
ClearPagePrivate(page);
if (s.dirty_sectors)
@ -1020,12 +1020,12 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
if (bkey_extent_is_data(k.k)) {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
const union bch_extent_entry *i;
extent_for_each_ptr_crc(e, ptr, crc)
want_full_extent |= !!crc.csum_type |
!!crc.compression_type;
extent_for_each_crc(e, crc, i)
want_full_extent |= ((crc.csum_type != 0) |
(crc.compression_type != 0));
}
readpage_bio_extend(readpages_iter,
@ -1850,8 +1850,7 @@ err_wait_io:
dio->loop = true;
if (!dio->sync) {
continue_at_noreturn(&dio->cl,
bch2_dio_write_loop_async, NULL);
continue_at(&dio->cl, bch2_dio_write_loop_async, NULL);
return -EIOCBQUEUED;
}

View File

@ -610,7 +610,8 @@ static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr)
static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr)
{
if (nr >= b->size) {
size_t new_size = max(max(PAGE_SIZE * 8,
size_t new_size = max_t(size_t, max_t(size_t,
PAGE_SIZE * 8,
b->size * 2),
nr + 1);
void *n;
@ -642,7 +643,7 @@ struct pathbuf {
static int path_down(struct pathbuf *p, u64 inum)
{
if (p->nr == p->size) {
size_t new_size = max(256UL, p->size * 2);
size_t new_size = max_t(size_t, 256UL, p->size * 2);
void *n = krealloc(p->entries,
new_size * sizeof(p->entries[0]),
GFP_KERNEL);
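The max() to max_t() changes above are part of the 32-bit fixes: the kernel's max() insists both operands have the same type, and on 32-bit builds an expression like PAGE_SIZE * 8 (unsigned long) and a size_t field need not agree. max_t() simply casts both operands to one named type first. A rough userspace approximation, with all names and numbers as stand-ins:

#include <stddef.h>
#include <stdio.h>

/* simplified stand-in for the kernel macro; double evaluation ignored here */
#define max_t(type, a, b)	((type) (a) > (type) (b) ? (type) (a) : (type) (b))

int main(void)
{
	unsigned long page_bits	= 4096UL * 8;	/* stand-in for PAGE_SIZE * 8 */
	size_t cur_size		= 100000;

	size_t new_size = max_t(size_t, max_t(size_t, page_bits, cur_size * 2),
				42 + 1);	/* 42 + 1 stands in for nr + 1 */
	printf("new size: %zu\n", new_size);	/* prints 200000 */
	return 0;
}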

View File

@ -21,10 +21,10 @@
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "rebalance.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
#include "tier.h"
#include <linux/blkdev.h>
#include <linux/random.h>
@ -269,7 +269,7 @@ static void bch2_write_done(struct closure *cl)
percpu_ref_put(&c->writes);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
bch2_time_stats_update(&c->data_write_time, op->start_time);
bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
closure_return(cl);
}
@ -842,20 +842,24 @@ again:
} while (ret);
continue_at(cl, bch2_write_index, index_update_wq(op));
return;
err:
op->error = ret;
continue_at(cl, !bch2_keylist_empty(&op->insert_keys)
? bch2_write_index
: bch2_write_done, index_update_wq(op));
return;
flush_io:
closure_sync(cl);
if (!bch2_keylist_empty(&op->insert_keys)) {
__bch2_write_index(op);
if (op->error)
if (op->error) {
continue_at_nobarrier(cl, bch2_write_done, NULL);
return;
}
}
goto again;
@ -901,6 +905,7 @@ void bch2_write(struct closure *cl)
if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
bch2_disk_reservation_put(c, &op->res);
closure_return(cl);
return;
}
bch2_increment_clock(c, bio_sectors(&op->wbio.bio), WRITE);
@ -974,7 +979,8 @@ static void promote_done(struct closure *cl)
container_of(cl, struct promote_op, cl);
struct bch_fs *c = op->write.op.c;
bch2_time_stats_update(&c->data_promote_time, op->start_time);
bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
op->start_time);
bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
promote_free(c, op);
@ -1048,7 +1054,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
(*rbio)->bio.bi_iter.bi_size = rbio_sectors << 9;
bch2_bio_map(&(*rbio)->bio, NULL);
if (bio_alloc_pages(&(*rbio)->bio, GFP_NOIO))
if (bch2_bio_alloc_pages(&(*rbio)->bio, GFP_NOIO))
goto err;
(*rbio)->bounce = true;
@ -1174,7 +1180,8 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
bch2_time_stats_update(&rbio->c->data_read_time, rbio->start_time);
bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
rbio->start_time);
bio_endio(&rbio->bio);
}
@ -1486,7 +1493,7 @@ csum_err:
}
bch2_dev_io_error(ca,
"data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)",
"data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
csum.hi, csum.lo, crc.csum_type);

View File

@ -365,6 +365,8 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
ssize_t bch2_journal_print_debug(struct journal *, char *);
ssize_t bch2_journal_print_pins(struct journal *, char *);
int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
unsigned nr);
int bch2_dev_journal_alloc(struct bch_dev *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);

View File

@ -324,7 +324,7 @@ struct jset_entry_ops {
struct jset_entry *, int);
};
const struct jset_entry_ops bch2_jset_entry_ops[] = {
static const struct jset_entry_ops bch2_jset_entry_ops[] = {
#define x(f, nr) \
[BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \
.validate = journal_entry_validate_##f, \
@ -696,6 +696,7 @@ out:
kvpfree(buf.data, buf.size);
percpu_ref_put(&ca->io_ref);
closure_return(cl);
return;
err:
mutex_lock(&jlist->lock);
jlist->ret = ret;
@ -716,19 +717,6 @@ void bch2_journal_entries_free(struct list_head *list)
}
}
static inline bool journal_has_keys(struct list_head *list)
{
struct journal_replay *i;
struct jset_entry *entry;
struct bkey_i *k, *_n;
list_for_each_entry(i, list, list)
for_each_jset_key(k, _n, entry, &i->j)
return true;
return false;
}
int bch2_journal_read(struct bch_fs *c, struct list_head *list)
{
struct journal *j = &c->journal;
@ -737,8 +725,9 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
struct journal_entry_pin_list *p;
struct bch_dev *ca;
u64 cur_seq, end_seq, seq;
unsigned iter, keys = 0, entries = 0;
size_t nr;
unsigned iter;
size_t entries = 0;
u64 nr, keys = 0;
bool degraded = false;
int ret = 0;
@ -772,9 +761,6 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
return BCH_FSCK_REPAIR_IMPOSSIBLE;
}
fsck_err_on(c->sb.clean && journal_has_keys(list), c,
"filesystem marked clean but journal has keys to replay");
list_for_each_entry(i, list, list) {
ret = jset_validate_entries(c, &i->j, READ);
if (ret)
@ -797,15 +783,27 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
}
}
list_for_each_entry(i, list, list) {
struct jset_entry *entry;
struct bkey_i *k, *_n;
for_each_jset_key(k, _n, entry, &i->j)
keys++;
}
i = list_last_entry(list, struct journal_replay, list);
nr = le64_to_cpu(i->j.seq) - le64_to_cpu(i->j.last_seq) + 1;
fsck_err_on(c->sb.clean && (keys || nr > 1), c,
"filesystem marked clean but journal not empty (%llu keys in %llu entries)",
keys, nr);
if (nr > j->pin.size) {
free_fifo(&j->pin);
init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
if (!j->pin.data) {
bch_err(c, "error reallocating journal fifo (%zu open entries)", nr);
bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
return -ENOMEM;
}
}
@ -844,8 +842,6 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
struct journal_replay, list)->j.seq);
list_for_each_entry(i, list, list) {
struct jset_entry *entry;
struct bkey_i *k, *_n;
bool blacklisted;
mutex_lock(&j->blacklist_lock);
@ -867,13 +863,10 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
journal_last_seq(j), end_seq);
cur_seq = le64_to_cpu(i->j.seq) + 1;
for_each_jset_key(k, _n, entry, &i->j)
keys++;
entries++;
}
bch_info(c, "journal read done, %i keys in %i entries, seq %llu",
bch_info(c, "journal read done, %llu keys in %zu entries, seq %llu",
keys, entries, journal_cur_seq(j));
fsck_err:
return ret;
@ -1361,6 +1354,7 @@ void bch2_journal_write(struct closure *cl)
bch_err(c, "Unable to allocate journal write");
bch2_fatal_error(c);
continue_at(cl, journal_write_done, system_highpri_wq);
return;
}
/*
@ -1417,6 +1411,7 @@ no_io:
ptr->offset += sectors;
continue_at(cl, journal_write_done, system_highpri_wq);
return;
err:
bch2_inconsistent_error(c);
continue_at(cl, journal_write_done, system_highpri_wq);

View File

@ -247,7 +247,7 @@ int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b)
if (!bl->nr_entries ||
is_power_of_2(bl->nr_entries)) {
n = krealloc(bl->entries,
max(bl->nr_entries * 2, 8UL) * sizeof(*n),
max_t(size_t, bl->nr_entries * 2, 8) * sizeof(*n),
GFP_KERNEL);
if (!n) {
ret = -ENOMEM;
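The hunk above keeps the existing growth strategy, reallocating only when the entry count is zero or a power of two, and just fixes the mixed-type max(). A standalone sketch of that growth pattern, with hypothetical names (need_grow, append_u64):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static bool need_grow(size_t nr)
{
	return !nr || !(nr & (nr - 1));		/* zero or a power of two */
}

static int append_u64(uint64_t **arr, size_t *nr, uint64_t v)
{
	if (need_grow(*nr)) {
		size_t new_nr = *nr * 2 > 8 ? *nr * 2 : 8;
		uint64_t *n = realloc(*arr, new_nr * sizeof(**arr));

		if (!n)
			return -1;		/* caller keeps the old array */
		*arr = n;
	}
	(*arr)[(*nr)++] = v;
	return 0;
}

int main(void)
{
	uint64_t *entries = NULL;
	size_t nr = 0;

	for (uint64_t seq = 0; seq < 100; seq++)
		if (append_u64(&entries, &nr, seq))
			return 1;
	/* reallocations happened only at counts 0, 1, 2, 4, ..., 64 */
	printf("%zu entries appended\n", nr);
	free(entries);
	return 0;
}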

View File

@ -55,9 +55,6 @@ static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
_k != (_keylist)->top; \
_k = bkey_next(_k))
#define keylist_single(k) \
((struct keylist) { .keys = k, .top = bkey_next(k) })
static inline u64 keylist_sectors(struct keylist *keys)
{
struct bkey_i *k;

View File

@ -306,8 +306,11 @@ static void move_write(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
if (likely(!io->rbio.bio.bi_status &&
!io->rbio.hole)) {
if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
closure_return_with_destructor(cl, move_free);
return;
}
bch2_migrate_read_done(&io->write, &io->rbio);
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
@ -315,9 +318,6 @@ static void move_write(struct closure *cl)
continue_at(cl, move_write_done, NULL);
}
closure_return_with_destructor(cl, move_free);
}
static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
{
struct moving_io *io =
@ -411,7 +411,7 @@ static int bch2_move_extent(struct bch_fs *c,
io->write.op.wbio.bio.bi_iter.bi_size = sectors << 9;
bch2_bio_map(&io->write.op.wbio.bio, NULL);
if (bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL))
if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL))
goto err_free;
io->rbio.opts = io_opts;

View File

@ -4,6 +4,7 @@
#include "btree_iter.h"
#include "buckets.h"
#include "io_types.h"
#include "move_types.h"
struct bch_read_bio;
struct moving_context;
@ -48,16 +49,6 @@ typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
enum bkey_type, struct bkey_s_c_extent,
struct bch_io_opts *, struct data_opts *);
struct bch_move_stats {
enum bch_data_type data_type;
struct btree_iter iter;
atomic64_t keys_moved;
atomic64_t sectors_moved;
atomic64_t sectors_seen;
atomic64_t sectors_raced;
};
int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
struct write_point_specifier,
struct bpos, struct bpos,

libbcachefs/move_types.h (new file, 14 lines)
View File

@ -0,0 +1,14 @@
#ifndef _BCACHEFS_MOVE_TYPES_H
#define _BCACHEFS_MOVE_TYPES_H
struct bch_move_stats {
enum bch_data_type data_type;
struct btree_iter iter;
atomic64_t keys_moved;
atomic64_t sectors_moved;
atomic64_t sectors_seen;
atomic64_t sectors_raced;
};
#endif /* _BCACHEFS_MOVE_TYPES_H */

View File

@ -241,7 +241,8 @@ static int bch2_copygc_thread(void *arg)
ca->mi.bucket_size;
if (available > reserve) {
next = last + available - reserve;
bch2_kthread_io_clock_wait(clock, next);
bch2_kthread_io_clock_wait(clock, next,
MAX_SCHEDULE_TIMEOUT);
continue;
}
@ -252,7 +253,8 @@ static int bch2_copygc_thread(void *arg)
fragmented = usage.sectors_fragmented;
if (fragmented < reserve) {
next = last + reserve - fragmented;
bch2_kthread_io_clock_wait(clock, next);
bch2_kthread_io_clock_wait(clock, next,
MAX_SCHEDULE_TIMEOUT);
continue;
}

libbcachefs/rebalance.c (new file, 341 lines)
View File

@ -0,0 +1,341 @@
#include "bcachefs.h"
#include "alloc.h"
#include "btree_iter.h"
#include "buckets.h"
#include "clock.h"
#include "disk_groups.h"
#include "extents.h"
#include "io.h"
#include "move.h"
#include "rebalance.h"
#include "super-io.h"
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched/cputime.h>
#include <trace/events/bcachefs.h>
static inline bool rebalance_ptr_pred(struct bch_fs *c,
const struct bch_extent_ptr *ptr,
struct bch_extent_crc_unpacked crc,
struct bch_io_opts *io_opts)
{
if (io_opts->background_target &&
!bch2_dev_in_target(c, ptr->dev, io_opts->background_target) &&
!ptr->cached)
return true;
if (io_opts->background_compression &&
crc.compression_type !=
bch2_compression_opt_to_type[io_opts->background_compression])
return true;
return false;
}
void bch2_rebalance_add_key(struct bch_fs *c,
struct bkey_s_c k,
struct bch_io_opts *io_opts)
{
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
struct bkey_s_c_extent e;
if (!bkey_extent_is_data(k.k))
return;
if (!io_opts->background_target &&
!io_opts->background_compression)
return;
e = bkey_s_c_to_extent(k);
extent_for_each_ptr_crc(e, ptr, crc)
if (rebalance_ptr_pred(c, ptr, crc, io_opts)) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (atomic64_add_return(crc.compressed_size,
&ca->rebalance_work) ==
crc.compressed_size)
rebalance_wakeup(c);
}
}
void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
{
if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
sectors)
rebalance_wakeup(c);
}
static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
enum bkey_type type,
struct bkey_s_c_extent e,
struct bch_io_opts *io_opts,
struct data_opts *data_opts)
{
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
/* Make sure we have room to add a new pointer: */
if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
BKEY_EXTENT_VAL_U64s_MAX)
return DATA_SKIP;
extent_for_each_ptr_crc(e, ptr, crc)
if (rebalance_ptr_pred(c, ptr, crc, io_opts))
goto found;
return DATA_SKIP;
found:
data_opts->target = io_opts->background_target;
data_opts->btree_insert_flags = 0;
return DATA_ADD_REPLICAS;
}
struct rebalance_work {
int dev_most_full_idx;
unsigned dev_most_full_percent;
u64 dev_most_full_work;
u64 dev_most_full_capacity;
u64 total_work;
};
static void rebalance_work_accumulate(struct rebalance_work *w,
u64 dev_work, u64 unknown_dev, u64 capacity, int idx)
{
unsigned percent_full;
u64 work = dev_work + unknown_dev;
if (work < dev_work || work < unknown_dev)
work = U64_MAX;
work = min(work, capacity);
percent_full = div_u64(work * 100, capacity);
if (percent_full >= w->dev_most_full_percent) {
w->dev_most_full_idx = idx;
w->dev_most_full_percent = percent_full;
w->dev_most_full_work = work;
w->dev_most_full_capacity = capacity;
}
if (w->total_work + dev_work >= w->total_work &&
w->total_work + dev_work >= dev_work)
w->total_work += dev_work;
}
static struct rebalance_work rebalance_work(struct bch_fs *c)
{
struct bch_dev *ca;
struct rebalance_work ret = { .dev_most_full_idx = -1 };
u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev);
unsigned i;
for_each_online_member(ca, c, i)
rebalance_work_accumulate(&ret,
atomic64_read(&ca->rebalance_work),
unknown_dev,
bucket_to_sector(ca, ca->mi.nbuckets -
ca->mi.first_bucket),
i);
rebalance_work_accumulate(&ret,
unknown_dev, 0, c->capacity, -1);
return ret;
}
static void rebalance_work_reset(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
for_each_online_member(ca, c, i)
atomic64_set(&ca->rebalance_work, 0);
atomic64_set(&c->rebalance.work_unknown_dev, 0);
}
static unsigned long curr_cputime(void)
{
u64 utime, stime;
task_cputime_adjusted(current, &utime, &stime);
return nsecs_to_jiffies(utime + stime);
}
static int bch2_rebalance_thread(void *arg)
{
struct bch_fs *c = arg;
struct bch_fs_rebalance *r = &c->rebalance;
struct io_clock *clock = &c->io_clock[WRITE];
struct rebalance_work w, p;
unsigned long start, prev_start;
unsigned long prev_run_time, prev_run_cputime;
unsigned long cputime, prev_cputime;
unsigned long io_start;
long throttle;
set_freezable();
io_start = atomic_long_read(&clock->now);
p = rebalance_work(c);
prev_start = jiffies;
prev_cputime = curr_cputime();
while (!kthread_wait_freezable(r->enabled)) {
start = jiffies;
cputime = curr_cputime();
prev_run_time = start - prev_start;
prev_run_cputime = cputime - prev_cputime;
w = rebalance_work(c);
BUG_ON(!w.dev_most_full_capacity);
if (!w.total_work) {
r->state = REBALANCE_WAITING;
kthread_wait_freezable(rebalance_work(c).total_work);
continue;
}
/*
* If there isn't much work to do, throttle cpu usage:
*/
throttle = prev_run_cputime * 100 /
max(1U, w.dev_most_full_percent) -
prev_run_time;
if (w.dev_most_full_percent < 20 && throttle > 0) {
r->state = REBALANCE_THROTTLED;
r->throttled_until_iotime = io_start +
div_u64(w.dev_most_full_capacity *
(20 - w.dev_most_full_percent),
50);
r->throttled_until_cputime = start + throttle;
bch2_kthread_io_clock_wait(clock,
r->throttled_until_iotime,
throttle);
continue;
}
/* minimum 1 mb/sec: */
r->pd.rate.rate =
max_t(u64, 1 << 11,
r->pd.rate.rate *
max(p.dev_most_full_percent, 1U) /
max(w.dev_most_full_percent, 1U));
io_start = atomic_long_read(&clock->now);
p = w;
prev_start = start;
prev_cputime = cputime;
r->state = REBALANCE_RUNNING;
memset(&r->move_stats, 0, sizeof(r->move_stats));
rebalance_work_reset(c);
bch2_move_data(c,
/* ratelimiting disabled for now */
NULL, /* &r->pd.rate, */
writepoint_ptr(&c->rebalance_write_point),
POS_MIN, POS_MAX,
rebalance_pred, NULL,
&r->move_stats);
}
return 0;
}
ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf)
{
char *out = buf, *end = out + PAGE_SIZE;
struct bch_fs_rebalance *r = &c->rebalance;
struct rebalance_work w = rebalance_work(c);
char h1[21], h2[21];
bch2_hprint(h1, w.dev_most_full_work << 9);
bch2_hprint(h2, w.dev_most_full_capacity << 9);
out += scnprintf(out, end - out,
"fullest_dev (%i):\t%s/%s\n",
w.dev_most_full_idx, h1, h2);
bch2_hprint(h1, w.total_work << 9);
bch2_hprint(h2, c->capacity << 9);
out += scnprintf(out, end - out,
"total work:\t\t%s/%s\n",
h1, h2);
out += scnprintf(out, end - out,
"rate:\t\t\t%u\n",
r->pd.rate.rate);
switch (r->state) {
case REBALANCE_WAITING:
out += scnprintf(out, end - out, "waiting\n");
break;
case REBALANCE_THROTTLED:
bch2_hprint(h1,
(r->throttled_until_iotime -
atomic_long_read(&c->io_clock[WRITE].now)) << 9);
out += scnprintf(out, end - out,
"throttled for %lu sec or %s io\n",
(r->throttled_until_cputime - jiffies) / HZ,
h1);
break;
case REBALANCE_RUNNING:
out += scnprintf(out, end - out, "running\n");
out += scnprintf(out, end - out, "pos %llu:%llu\n",
r->move_stats.iter.pos.inode,
r->move_stats.iter.pos.offset);
break;
}
return out - buf;
}
void bch2_rebalance_stop(struct bch_fs *c)
{
struct task_struct *p;
c->rebalance.pd.rate.rate = UINT_MAX;
bch2_ratelimit_reset(&c->rebalance.pd.rate);
p = rcu_dereference_protected(c->rebalance.thread, 1);
c->rebalance.thread = NULL;
if (p) {
/* for synchronizing with rebalance_wakeup() */

synchronize_rcu();
kthread_stop(p);
put_task_struct(p);
}
}
int bch2_rebalance_start(struct bch_fs *c)
{
struct task_struct *p;
if (c->opts.nochanges)
return 0;
p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
if (IS_ERR(p))
return PTR_ERR(p);
get_task_struct(p);
rcu_assign_pointer(c->rebalance.thread, p);
wake_up_process(p);
return 0;
}
void bch2_fs_rebalance_init(struct bch_fs *c)
{
bch2_pd_controller_init(&c->rebalance.pd);
atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX);
}
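The throttling logic in bch2_rebalance_thread() above roughly matches the thread's CPU duty cycle to the fullest device's work percentage while work is light (under 20%). A hedged worked example of the same formula, with made-up jiffies values:

#include <stdio.h>

int main(void)
{
	long prev_run_time	= 1000;	/* wall-clock jiffies spent last iteration */
	long prev_run_cputime	= 300;	/* CPU jiffies consumed last iteration */
	unsigned most_full_pct	= 10;	/* fullest device is 10% "full" of work */

	/*
	 * Same shape as the thread's computation: a positive result means the
	 * thread ran hotter than the ~10% duty cycle it is aiming for, so it
	 * sleeps for that long (or until enough IO has happened).
	 */
	long throttle = prev_run_cputime * 100 / most_full_pct - prev_run_time;

	printf("throttle = %ld jiffies\n", throttle);	/* 300*100/10 - 1000 = 2000 */
	return 0;
}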

View File

@ -1,12 +1,14 @@
#ifndef _BCACHEFS_TIER_H
#define _BCACHEFS_TIER_H
#ifndef _BCACHEFS_REBALANCE_H
#define _BCACHEFS_REBALANCE_H
#include "rebalance_types.h"
static inline void rebalance_wakeup(struct bch_fs *c)
{
struct task_struct *p;
rcu_read_lock();
p = rcu_dereference(c->rebalance_thread);
p = rcu_dereference(c->rebalance.thread);
if (p)
wake_up_process(p);
rcu_read_unlock();
@ -16,8 +18,10 @@ void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c,
struct bch_io_opts *);
void bch2_rebalance_add_work(struct bch_fs *, u64);
ssize_t bch2_rebalance_work_show(struct bch_fs *, char *);
void bch2_rebalance_stop(struct bch_fs *);
int bch2_rebalance_start(struct bch_fs *);
void bch2_fs_rebalance_init(struct bch_fs *);
#endif /* _BCACHEFS_TIER_H */
#endif /* _BCACHEFS_REBALANCE_H */

View File

@ -0,0 +1,26 @@
#ifndef _BCACHEFS_REBALANCE_TYPES_H
#define _BCACHEFS_REBALANCE_TYPES_H
#include "move_types.h"
enum rebalance_state {
REBALANCE_WAITING,
REBALANCE_THROTTLED,
REBALANCE_RUNNING,
};
struct bch_fs_rebalance {
struct task_struct __rcu *thread;
struct bch_pd_controller pd;
atomic64_t work_unknown_dev;
enum rebalance_state state;
unsigned long throttled_until_iotime;
unsigned long throttled_until_cputime;
struct bch_move_stats move_stats;
unsigned enabled:1;
};
#endif /* _BCACHEFS_REBALANCE_TYPES_H */

View File

@ -146,6 +146,8 @@ struct six_lock_waiter {
/* This is probably up there with the more evil things I've done */
#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
#ifdef CONFIG_LOCK_SPIN_ON_OWNER
static inline int six_can_spin_on_owner(struct six_lock *lock)
{
struct task_struct *owner;
@ -257,6 +259,15 @@ fail:
return false;
}
#else /* CONFIG_LOCK_SPIN_ON_OWNER */
static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
{
return false;
}
#endif
noinline
static void __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type)
{

View File

@ -624,7 +624,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_iter.bi_sector = le64_to_cpu(sb->offset);
bio->bi_iter.bi_size =
roundup(vstruct_bytes(sb),
roundup((size_t) vstruct_bytes(sb),
bdev_logical_block_size(ca->disk_sb.bdev));
bio->bi_end_io = write_super_endio;
bio->bi_private = ca;

View File

@ -73,11 +73,6 @@ static inline __u64 jset_magic(struct bch_fs *c)
return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC);
}
static inline __u64 pset_magic(struct bch_fs *c)
{
return __le64_to_cpu(bch2_sb_magic(c) ^ PSET_MAGIC);
}
static inline __u64 bset_magic(struct bch_fs *c)
{
return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC);
@ -136,4 +131,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
};
}
size_t bch2_sb_field_to_text(char *, size_t, struct bch_sb *,
struct bch_sb_field *);
#endif /* _BCACHEFS_SUPER_IO_H */

View File

@ -33,11 +33,11 @@
#include "migrate.h"
#include "movinggc.h"
#include "quota.h"
#include "rebalance.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
#include "tier.h"
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
@ -398,10 +398,10 @@ err:
static void bch2_fs_free(struct bch_fs *c)
{
#define BCH_TIME_STAT(name) \
bch2_time_stats_exit(&c->name##_time);
BCH_TIME_STATS()
#undef BCH_TIME_STAT
unsigned i;
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_exit(&c->times[i]);
bch2_fs_quota_exit(c);
bch2_fs_fsio_exit(c);
@ -565,10 +565,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
init_rwsem(&c->gc_lock);
#define BCH_TIME_STAT(name) \
bch2_time_stats_init(&c->name##_time);
BCH_TIME_STATS()
#undef BCH_TIME_STAT
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_init(&c->times[i]);
bch2_fs_allocator_init(c);
bch2_fs_rebalance_init(c);
@ -592,14 +590,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
seqcount_init(&c->gc_pos_lock);
c->copy_gc_enabled = 1;
c->rebalance_enabled = 1;
c->rebalance_percent = 10;
c->rebalance.enabled = 1;
c->promote_whole_extents = true;
c->journal.write_time = &c->journal_write_time;
c->journal.delay_time = &c->journal_delay_time;
c->journal.blocked_time = &c->journal_blocked_time;
c->journal.flush_seq_time = &c->journal_flush_seq_time;
c->journal.write_time = &c->times[BCH_TIME_journal_write];
c->journal.delay_time = &c->times[BCH_TIME_journal_delay];
c->journal.blocked_time = &c->times[BCH_TIME_journal_blocked];
c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
bch2_fs_btree_cache_init_early(&c->btree_cache);
@ -647,7 +644,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
BIOSET_NEED_BVECS) ||
!(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
lg_lock_init(&c->usage_lock) ||
mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
btree_bytes(c)) ||
bch2_io_clock_init(&c->io_clock[READ]) ||
bch2_io_clock_init(&c->io_clock[WRITE]) ||
bch2_fs_journal_init(&c->journal) ||

View File

@ -24,9 +24,9 @@
#include "keylist.h"
#include "move.h"
#include "opts.h"
#include "rebalance.h"
#include "replicas.h"
#include "super-io.h"
#include "tier.h"
#include <linux/blkdev.h>
#include <linux/sort.h>
@ -183,8 +183,8 @@ rw_attribute(copy_gc_enabled);
sysfs_pd_controller_attribute(copy_gc);
rw_attribute(rebalance_enabled);
rw_attribute(rebalance_percent);
sysfs_pd_controller_attribute(rebalance);
read_attribute(rebalance_work);
rw_attribute(promote_whole_extents);
rw_attribute(pd_controllers_update_seconds);
@ -198,11 +198,11 @@ read_attribute(data_replicas_have);
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM
#define BCH_TIME_STAT(_name) \
#define x(_name) \
static struct attribute sysfs_time_stat_##_name = \
{ .name = #_name, .mode = S_IRUGO };
BCH_TIME_STATS()
#undef BCH_TIME_STAT
#undef x
static struct attribute sysfs_state_rw = {
.name = "state",
@ -340,9 +340,11 @@ SHOW(bch2_fs)
sysfs_print(pd_controllers_update_seconds,
c->pd_controllers_update_seconds);
sysfs_printf(rebalance_enabled, "%i", c->rebalance_enabled);
sysfs_print(rebalance_percent, c->rebalance_percent);
sysfs_pd_controller_show(rebalance, &c->rebalance_pd); /* XXX */
sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled);
sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */
if (attr == &sysfs_rebalance_work)
return bch2_rebalance_work_show(c, buf);
sysfs_print(promote_whole_extents, c->promote_whole_extents);
@ -404,7 +406,7 @@ STORE(__bch2_fs)
}
if (attr == &sysfs_rebalance_enabled) {
ssize_t ret = strtoul_safe(buf, c->rebalance_enabled)
ssize_t ret = strtoul_safe(buf, c->rebalance.enabled)
?: (ssize_t) size;
rebalance_wakeup(c);
@ -413,9 +415,7 @@ STORE(__bch2_fs)
sysfs_strtoul(pd_controllers_update_seconds,
c->pd_controllers_update_seconds);
sysfs_strtoul(rebalance_percent, c->rebalance_percent);
sysfs_pd_controller_store(rebalance, &c->rebalance_pd);
sysfs_pd_controller_store(rebalance, &c->rebalance.pd);
sysfs_strtoul(promote_whole_extents, c->promote_whole_extents);
@ -474,7 +474,6 @@ struct attribute *bch2_fs_files[] = {
&sysfs_journal_write_delay_ms,
&sysfs_journal_reclaim_delay_ms,
&sysfs_rebalance_percent,
&sysfs_promote_whole_extents,
&sysfs_compression_stats,
@ -513,8 +512,11 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_prune_cache,
&sysfs_copy_gc_enabled,
&sysfs_rebalance_enabled,
&sysfs_rebalance_work,
sysfs_pd_controller_files(rebalance),
&sysfs_internal_uuid,
#define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
@ -613,11 +615,12 @@ SHOW(bch2_fs_time_stats)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
#define BCH_TIME_STAT(name) \
#define x(name) \
if (attr == &sysfs_time_stat_##name) \
return bch2_time_stats_print(&c->name##_time, buf, PAGE_SIZE);
return bch2_time_stats_print(&c->times[BCH_TIME_##name],\
buf, PAGE_SIZE);
BCH_TIME_STATS()
#undef BCH_TIME_STAT
#undef x
return 0;
}
@ -629,10 +632,10 @@ STORE(bch2_fs_time_stats)
SYSFS_OPS(bch2_fs_time_stats);
struct attribute *bch2_fs_time_stats_files[] = {
#define BCH_TIME_STAT(name) \
#define x(name) \
&sysfs_time_stat_##name,
BCH_TIME_STATS()
#undef BCH_TIME_STAT
#undef x
NULL
};
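The sysfs conversion above switches the per-filesystem time statistics from individually named struct members to a c->times[] array indexed by a BCH_TIME_* enum, with both the enum and the attribute list generated from a single x-macro list. A minimal standalone sketch of that pattern; the real BCH_TIME_STATS() list is defined elsewhere in the tree and is longer, and the time_stat_names[] array is only for illustration:

#include <stdio.h>

#define BCH_TIME_STATS()	\
	x(journal_write)	\
	x(data_read)

enum {
#define x(name)	BCH_TIME_##name,
	BCH_TIME_STATS()
#undef x
	BCH_TIME_STAT_NR
};

static const char *time_stat_names[] = {
#define x(name)	#name,
	BCH_TIME_STATS()
#undef x
};

int main(void)
{
	/* the enum indices and the name strings can never drift apart */
	for (int i = 0; i < BCH_TIME_STAT_NR; i++)
		printf("%d: %s\n", i, time_stat_names[i]);
	return 0;
}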

View File

@ -1,259 +0,0 @@
#include "bcachefs.h"
#include "alloc.h"
#include "btree_iter.h"
#include "buckets.h"
#include "clock.h"
#include "disk_groups.h"
#include "extents.h"
#include "io.h"
#include "move.h"
#include "super-io.h"
#include "tier.h"
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched/cputime.h>
#include <trace/events/bcachefs.h>
static inline bool rebalance_ptr_pred(struct bch_fs *c,
const struct bch_extent_ptr *ptr,
struct bch_extent_crc_unpacked crc,
struct bch_io_opts *io_opts)
{
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (io_opts->background_target &&
!dev_in_target(ca, io_opts->background_target) &&
!ptr->cached)
return true;
if (io_opts->background_compression &&
crc.compression_type !=
bch2_compression_opt_to_type[io_opts->background_compression])
return true;
return false;
}
void bch2_rebalance_add_key(struct bch_fs *c,
struct bkey_s_c k,
struct bch_io_opts *io_opts)
{
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
struct bkey_s_c_extent e;
if (!bkey_extent_is_data(k.k))
return;
if (!io_opts->background_target &&
!io_opts->background_compression)
return;
e = bkey_s_c_to_extent(k);
extent_for_each_ptr_crc(e, ptr, crc)
if (rebalance_ptr_pred(c, ptr, crc, io_opts)) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (!atomic64_add_return(crc.compressed_size,
&ca->rebalance_work))
rebalance_wakeup(c);
}
}
void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
{
if (!atomic64_add_return(sectors, &c->rebalance_work_unknown_dev))
rebalance_wakeup(c);
}
static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
enum bkey_type type,
struct bkey_s_c_extent e,
struct bch_io_opts *io_opts,
struct data_opts *data_opts)
{
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
/* Make sure we have room to add a new pointer: */
if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
BKEY_EXTENT_VAL_U64s_MAX)
return DATA_SKIP;
extent_for_each_ptr_crc(e, ptr, crc)
if (rebalance_ptr_pred(c, ptr, crc, io_opts))
goto found;
return DATA_SKIP;
found:
data_opts->target = io_opts->background_target;
data_opts->btree_insert_flags = 0;
return DATA_ADD_REPLICAS;
}
struct rebalance_work {
unsigned dev_most_full_percent;
u64 dev_most_full_work;
u64 dev_most_full_capacity;
u64 total_work;
};
static struct rebalance_work rebalance_work(struct bch_fs *c)
{
struct bch_dev *ca;
struct rebalance_work ret = { 0 };
unsigned i;
for_each_online_member(ca, c, i) {
u64 capacity = bucket_to_sector(ca, ca->mi.nbuckets -
ca->mi.first_bucket);
u64 work = atomic64_read(&ca->rebalance_work) +
atomic64_read(&c->rebalance_work_unknown_dev);
unsigned percent_full = div_u64(work * 100, capacity);
if (percent_full > ret.dev_most_full_percent) {
ret.dev_most_full_percent = percent_full;
ret.dev_most_full_work = work;
ret.dev_most_full_capacity = capacity;
}
ret.total_work += atomic64_read(&ca->rebalance_work);
}
ret.total_work += atomic64_read(&c->rebalance_work_unknown_dev);
return ret;
}
static void rebalance_work_reset(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
for_each_online_member(ca, c, i)
atomic64_set(&ca->rebalance_work, 0);
atomic64_set(&c->rebalance_work_unknown_dev, 0);
}
static unsigned long curr_cputime(void)
{
u64 utime, stime;
task_cputime_adjusted(current, &utime, &stime);
return nsecs_to_jiffies(utime + stime);
}
static int bch2_rebalance_thread(void *arg)
{
struct bch_fs *c = arg;
struct io_clock *clock = &c->io_clock[WRITE];
struct rebalance_work w, p;
unsigned long start, prev_start;
unsigned long prev_run_time, prev_run_cputime;
unsigned long cputime, prev_cputime;
set_freezable();
p = rebalance_work(c);
prev_start = jiffies;
prev_cputime = curr_cputime();
while (!kthread_wait_freezable(c->rebalance_enabled)) {
struct bch_move_stats move_stats = { 0 };
w = rebalance_work(c);
start = jiffies;
cputime = curr_cputime();
prev_run_time = start - prev_start;
prev_run_cputime = cputime - prev_cputime;
if (!w.total_work) {
kthread_wait_freezable(rebalance_work(c).total_work);
continue;
}
if (w.dev_most_full_percent < 20 &&
prev_run_cputime * 5 > prev_run_time) {
if (w.dev_most_full_capacity) {
bch2_kthread_io_clock_wait(clock,
atomic_long_read(&clock->now) +
div_u64(w.dev_most_full_capacity, 5));
} else {
set_current_state(TASK_INTERRUPTIBLE);
if (kthread_should_stop())
break;
schedule_timeout(prev_run_cputime * 5 -
prev_run_time);
continue;
}
}
/* minimum 1 mb/sec: */
c->rebalance_pd.rate.rate =
max_t(u64, 1 << 11,
c->rebalance_pd.rate.rate *
max(p.dev_most_full_percent, 1U) /
max(w.dev_most_full_percent, 1U));
rebalance_work_reset(c);
bch2_move_data(c, &c->rebalance_pd.rate,
writepoint_ptr(&c->rebalance_write_point),
POS_MIN, POS_MAX,
rebalance_pred, NULL,
&move_stats);
}
return 0;
}
void bch2_rebalance_stop(struct bch_fs *c)
{
struct task_struct *p;
c->rebalance_pd.rate.rate = UINT_MAX;
bch2_ratelimit_reset(&c->rebalance_pd.rate);
p = c->rebalance_thread;
c->rebalance_thread = NULL;
if (p) {
/* for synchronizing with rebalance_wakeup() */
synchronize_rcu();
kthread_stop(p);
put_task_struct(p);
}
}
int bch2_rebalance_start(struct bch_fs *c)
{
struct task_struct *p;
if (c->opts.nochanges)
return 0;
p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
if (IS_ERR(p))
return PTR_ERR(p);
get_task_struct(p);
rcu_assign_pointer(c->rebalance_thread, p);
wake_up_process(c->rebalance_thread);
return 0;
}
void bch2_fs_rebalance_init(struct bch_fs *c)
{
bch2_pd_controller_init(&c->rebalance_pd);
atomic64_set(&c->rebalance_work_unknown_dev, S64_MAX);
}

View File

@ -203,7 +203,7 @@ bool bch2_is_zero(const void *_p, size_t n)
return true;
}
void bch2_quantiles_update(struct quantiles *q, u64 v)
static void bch2_quantiles_update(struct quantiles *q, u64 v)
{
unsigned i = 0;
@ -569,6 +569,23 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
}
}
int bch2_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
{
int i;
struct bio_vec *bv;
bio_for_each_segment_all(bv, bio, i) {
bv->bv_page = alloc_page(gfp_mask);
if (!bv->bv_page) {
while (--bv >= bio->bi_io_vec)
__free_page(bv->bv_page);
return -ENOMEM;
}
}
return 0;
}
size_t bch2_rand_range(size_t max)
{
size_t rand;
@ -771,20 +788,28 @@ void sort_cmp_size(void *base, size_t num, size_t size,
}
}
void mempool_free_vp(void *element, void *pool_data)
static void mempool_free_vp(void *element, void *pool_data)
{
size_t size = (size_t) pool_data;
vpfree(element, size);
}
void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
{
size_t size = (size_t) pool_data;
return vpmalloc(size, gfp_mask);
}
int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
{
return size < PAGE_SIZE
? mempool_init_kmalloc_pool(pool, min_nr, size)
: mempool_init(pool, min_nr, mempool_alloc_vp,
mempool_free_vp, (void *) size);
}
#if 0
void eytzinger1_test(void)
{

View File

@ -68,9 +68,9 @@ struct closure;
#define __flatten
#endif
#ifdef __LITTLE_ENDIAN
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define CPU_BIG_ENDIAN 0
#else
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define CPU_BIG_ENDIAN 1
#endif
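The endianness check above is another big-endian fix. In userspace, <endian.h> defines both __LITTLE_ENDIAN and __BIG_ENDIAN as plain numeric constants on every host, so an #ifdef on __LITTLE_ENDIAN does not actually detect byte order; that is presumably why the check now uses the compiler-predefined __BYTE_ORDER__ macro (GCC/Clang). A standalone check along the same lines:

#include <stdio.h>

int main(void)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	puts("little-endian build");
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	puts("big-endian build");
#else
#error "unknown byte order"
#endif
	return 0;
}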
@ -113,14 +113,7 @@ static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
: vpmalloc(size, gfp_mask);
}
void mempool_free_vp(void *element, void *pool_data);
void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data);
static inline int mempool_init_vp_pool(mempool_t *pool, int min_nr, size_t size)
{
return mempool_init(pool, min_nr, mempool_alloc_vp,
mempool_free_vp, (void *) size);
}
int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
#define HEAP(type) \
struct { \
@ -610,6 +603,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
}
void bch2_bio_map(struct bio *bio, void *base);
int bch2_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask);
static inline sector_t bdev_sectors(struct block_device *bdev)
{

View File

@ -5,8 +5,8 @@
#include "compress.h"
#include "extents.h"
#include "fs.h"
#include "rebalance.h"
#include "str_hash.h"
#include "tier.h"
#include "xattr.h"
#include <linux/dcache.h>

View File

@ -40,14 +40,22 @@ void schedule(void)
v, NULL, NULL, 0);
}
static void process_timeout(unsigned long __data)
struct process_timer {
struct timer_list timer;
struct task_struct *task;
};
static void process_timeout(struct timer_list *t)
{
wake_up_process((struct task_struct *)__data);
struct process_timer *timeout =
container_of(t, struct process_timer, timer);
wake_up_process(timeout->task);
}
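This conversion follows the newer kernel timer API (timer_setup(), introduced around v4.14/v4.15): the callback receives a pointer to the timer itself and recovers its containing structure with container_of(), instead of being handed an opaque unsigned long data value. A standalone illustration of the container_of() pattern, using hypothetical types rather than the shim's struct timer_list:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *) ((char *) (ptr) - offsetof(type, member)))

struct timer {
	int pending;
};

struct process_timer {
	struct timer	timer;		/* embedded, like timer_list in the patch */
	const char	*task_name;
};

static void timeout_cb(struct timer *t)
{
	/* recover the enclosing structure from the embedded member */
	struct process_timer *p = container_of(t, struct process_timer, timer);

	printf("waking %s\n", p->task_name);
}

int main(void)
{
	struct process_timer pt = { .task_name = "worker" };

	timeout_cb(&pt.timer);		/* simulate the timer firing */
	return 0;
}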
long schedule_timeout(long timeout)
{
struct timer_list timer;
struct process_timer timer;
unsigned long expire;
switch (timeout)
@ -80,10 +88,11 @@ long schedule_timeout(long timeout)
expire = timeout + jiffies;
setup_timer(&timer, process_timeout, (unsigned long)current);
mod_timer(&timer, expire);
timer.task = current;
timer_setup_on_stack(&timer.timer, process_timeout, 0);
mod_timer(&timer.timer, expire);
schedule();
del_timer_sync(&timer);
del_timer_sync(&timer.timer);
timeout = expire - jiffies;
out:

View File

@ -273,7 +273,7 @@ static int timer_thread(void *arg)
BUG_ON(!timer_running());
pthread_mutex_unlock(&timer_lock);
timer->function(timer->data);
timer->function(timer);
pthread_mutex_lock(&timer_lock);
timer_seq++;

View File

@ -55,9 +55,10 @@ bool queue_work(struct workqueue_struct *wq, struct work_struct *work)
return ret;
}
void delayed_work_timer_fn(unsigned long __data)
void delayed_work_timer_fn(struct timer_list *timer)
{
struct delayed_work *dwork = (struct delayed_work *) __data;
struct delayed_work *dwork =
container_of(timer, struct delayed_work, timer);
pthread_mutex_lock(&wq_lock);
__queue_work(dwork->wq, &dwork->work);
@ -71,8 +72,7 @@ static void __queue_delayed_work(struct workqueue_struct *wq,
struct timer_list *timer = &dwork->timer;
struct work_struct *work = &dwork->work;
BUG_ON(timer->function != delayed_work_timer_fn ||
timer->data != (unsigned long)dwork);
BUG_ON(timer->function != delayed_work_timer_fn);
BUG_ON(timer_pending(timer));
BUG_ON(!list_empty(&work->entry));