Update bcachefs sources to e82e656279 bcachefs: Cleanups for building in userspace

Kent Overstreet 2017-10-05 14:41:44 -08:00
parent e7c2bb91bc
commit 85ee972555
85 changed files with 1300 additions and 1765 deletions

View File

@ -1 +1 @@
6a25f7a00d08c45b35bed3d649c05286ec60f7f6
e82e65627960a46945b78a5e5e946b23b8f08972

View File

@ -197,24 +197,22 @@ DECLARE_EVENT_CLASS(btree_node,
TP_STRUCT__entry(
__array(char, uuid, 16 )
__field(u64, bucket )
__field(u8, level )
__field(u8, id )
__field(u32, inode )
__field(u64, inode )
__field(u64, offset )
),
TP_fast_assign(
memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
__entry->bucket = PTR_BUCKET_NR_TRACE(c, &b->key, 0);
__entry->level = b->level;
__entry->id = b->btree_id;
__entry->inode = b->key.k.p.inode;
__entry->offset = b->key.k.p.offset;
),
TP_printk("%pU bucket %llu(%u) id %u: %u:%llu",
__entry->uuid, __entry->bucket, __entry->level, __entry->id,
TP_printk("%pU %u id %u %llu:%llu",
__entry->uuid, __entry->level, __entry->id,
__entry->inode, __entry->offset)
);
@ -253,21 +251,9 @@ DEFINE_EVENT(btree_node, btree_node_free,
TP_ARGS(c, b)
);
TRACE_EVENT(btree_node_reap,
TP_PROTO(struct bch_fs *c, struct btree *b, int ret),
TP_ARGS(c, b, ret),
TP_STRUCT__entry(
__field(u64, bucket )
__field(int, ret )
),
TP_fast_assign(
__entry->bucket = PTR_BUCKET_NR_TRACE(c, &b->key, 0);
__entry->ret = ret;
),
TP_printk("bucket %llu ret %d", __entry->bucket, __entry->ret)
DEFINE_EVENT(btree_node, btree_node_reap,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
DECLARE_EVENT_CLASS(btree_node_cannibalize_lock,
@ -330,68 +316,31 @@ TRACE_EVENT(btree_insert_key,
TP_ARGS(c, b, k),
TP_STRUCT__entry(
__field(u64, b_bucket )
__field(u64, b_offset )
__field(u64, offset )
__field(u32, b_inode )
__field(u32, inode )
__field(u32, size )
__field(u8, level )
__field(u8, id )
__field(u64, inode )
__field(u64, offset )
__field(u32, size )
),
TP_fast_assign(
__entry->b_bucket = PTR_BUCKET_NR_TRACE(c, &b->key, 0);
__entry->level = b->level;
__entry->id = b->btree_id;
__entry->b_inode = b->key.k.p.inode;
__entry->b_offset = b->key.k.p.offset;
__entry->inode = k->k.p.inode;
__entry->offset = k->k.p.offset;
__entry->size = k->k.size;
),
TP_printk("bucket %llu(%u) id %u: %u:%llu %u:%llu len %u",
__entry->b_bucket, __entry->level, __entry->id,
__entry->b_inode, __entry->b_offset,
TP_printk("btree %u: %llu:%llu len %u", __entry->id,
__entry->inode, __entry->offset, __entry->size)
);
DECLARE_EVENT_CLASS(btree_split,
TP_PROTO(struct bch_fs *c, struct btree *b, unsigned keys),
TP_ARGS(c, b, keys),
TP_STRUCT__entry(
__field(u64, bucket )
__field(u8, level )
__field(u8, id )
__field(u32, inode )
__field(u64, offset )
__field(u32, keys )
),
TP_fast_assign(
__entry->bucket = PTR_BUCKET_NR_TRACE(c, &b->key, 0);
__entry->level = b->level;
__entry->id = b->btree_id;
__entry->inode = b->key.k.p.inode;
__entry->offset = b->key.k.p.offset;
__entry->keys = keys;
),
TP_printk("bucket %llu(%u) id %u: %u:%llu keys %u",
__entry->bucket, __entry->level, __entry->id,
__entry->inode, __entry->offset, __entry->keys)
DEFINE_EVENT(btree_node, btree_split,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
DEFINE_EVENT(btree_split, btree_node_split,
TP_PROTO(struct bch_fs *c, struct btree *b, unsigned keys),
TP_ARGS(c, b, keys)
);
DEFINE_EVENT(btree_split, btree_node_compact,
TP_PROTO(struct bch_fs *c, struct btree *b, unsigned keys),
TP_ARGS(c, b, keys)
DEFINE_EVENT(btree_node, btree_compact,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
DEFINE_EVENT(btree_node, btree_set_root,
@ -401,31 +350,9 @@ DEFINE_EVENT(btree_node, btree_set_root,
/* Garbage collection */
TRACE_EVENT(btree_gc_coalesce,
TP_PROTO(struct bch_fs *c, struct btree *b, unsigned nodes),
TP_ARGS(c, b, nodes),
TP_STRUCT__entry(
__field(u64, bucket )
__field(u8, level )
__field(u8, id )
__field(u32, inode )
__field(u64, offset )
__field(unsigned, nodes )
),
TP_fast_assign(
__entry->bucket = PTR_BUCKET_NR_TRACE(c, &b->key, 0);
__entry->level = b->level;
__entry->id = b->btree_id;
__entry->inode = b->key.k.p.inode;
__entry->offset = b->key.k.p.offset;
__entry->nodes = nodes;
),
TP_printk("bucket %llu(%u) id %u: %u:%llu nodes %u",
__entry->bucket, __entry->level, __entry->id,
__entry->inode, __entry->offset, __entry->nodes)
DEFINE_EVENT(btree_node, btree_gc_coalesce,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
TRACE_EVENT(btree_gc_coalesce_fail,
@ -523,8 +450,8 @@ DEFINE_EVENT(bch_dev, prio_write_end,
);
TRACE_EVENT(invalidate,
TP_PROTO(struct bch_dev *ca, size_t bucket, unsigned sectors),
TP_ARGS(ca, bucket, sectors),
TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors),
TP_ARGS(ca, offset, sectors),
TP_STRUCT__entry(
__field(unsigned, sectors )
@ -534,7 +461,7 @@ TRACE_EVENT(invalidate,
TP_fast_assign(
__entry->dev = ca->disk_sb.bdev->bd_dev;
__entry->offset = bucket << ca->bucket_bits;
__entry->offset = offset;
__entry->sectors = sectors;
),

View File

@ -1,9 +1,12 @@
#ifndef NO_BCACHEFS_FS
#include "bcachefs.h"
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include "xattr.h"
#include "acl.h"
@ -223,3 +226,5 @@ int bch2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
return ret;
}
#endif /* NO_BCACHEFS_FS */

View File

@ -1,10 +1,7 @@
/*
File: fs/bch/acl.h
#ifndef _BCACHEFS_ACL_H
#define _BCACHEFS_ACL_H
(C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/
#include <linux/posix_acl_xattr.h>
#ifndef NO_BCACHEFS_FS
#define BCH_ACL_VERSION 0x0001
@ -52,5 +49,11 @@ static inline int bch2_acl_count(size_t size)
}
}
struct posix_acl;
extern struct posix_acl *bch2_get_acl(struct inode *, int);
extern int bch2_set_acl(struct inode *, struct posix_acl *, int);
#endif /* NO_BCACHEFS_FS */
#endif /* _BCACHEFS_ACL_H */

View File

@ -77,42 +77,6 @@
static void bch2_recalc_min_prio(struct bch_dev *, int);
/* Allocation groups: */
void bch2_dev_group_remove(struct dev_group *grp, struct bch_dev *ca)
{
unsigned i;
spin_lock(&grp->lock);
for (i = 0; i < grp->nr; i++)
if (grp->d[i].dev == ca) {
grp->nr--;
memmove(&grp->d[i],
&grp->d[i + 1],
(grp->nr - i) * sizeof(grp->d[0]));
break;
}
spin_unlock(&grp->lock);
}
void bch2_dev_group_add(struct dev_group *grp, struct bch_dev *ca)
{
unsigned i;
spin_lock(&grp->lock);
for (i = 0; i < grp->nr; i++)
if (grp->d[i].dev == ca)
goto out;
BUG_ON(grp->nr >= BCH_SB_MEMBERS_MAX);
grp->d[grp->nr++].dev = ca;
out:
spin_unlock(&grp->lock);
}
/* Ratelimiting/PD controllers */
static void pd_controllers_update(struct work_struct *work)
@ -139,24 +103,24 @@ static void pd_controllers_update(struct work_struct *work)
faster_tiers_dirty,
-1);
spin_lock(&c->tiers[i].devs.lock);
group_for_each_dev(ca, &c->tiers[i].devs, iter) {
for_each_member_device_rcu(ca, c, iter, &c->tiers[i].devs) {
struct bch_dev_usage stats = bch2_dev_usage_read(ca);
unsigned bucket_bits = ca->bucket_bits + 9;
u64 size = (ca->mi.nbuckets -
ca->mi.first_bucket) << bucket_bits;
u64 dirty = stats.buckets[S_DIRTY] << bucket_bits;
u64 free = __dev_buckets_free(ca, stats) << bucket_bits;
u64 size = bucket_to_sector(ca, ca->mi.nbuckets -
ca->mi.first_bucket) << 9;
u64 dirty = bucket_to_sector(ca,
stats.buckets[S_DIRTY]) << 9;
u64 free = bucket_to_sector(ca,
__dev_buckets_free(ca, stats)) << 9;
/*
* Bytes of internal fragmentation, which can be
* reclaimed by copy GC
*/
s64 fragmented = ((stats.buckets[S_DIRTY] +
stats.buckets_cached) <<
bucket_bits) -
((stats.sectors[S_DIRTY] +
stats.sectors_cached) << 9);
s64 fragmented = (bucket_to_sector(ca,
stats.buckets[S_DIRTY] +
stats.buckets_cached) -
(stats.sectors[S_DIRTY] +
stats.sectors_cached)) << 9;
fragmented = max(0LL, fragmented);
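For context, the controller arithmetic above drops the raw ca->bucket_bits shifts in favour of bucket_to_sector(), so it no longer bakes in power-of-two bucket sizes (the bucket_bits member is removed from struct bch_dev later in this commit). A minimal standalone sketch of the same fragmentation figure, with plain integers in place of the bcachefs types:

#include <stdint.h>

/* Toy model of the fragmentation estimate above: bytes copygc could
 * reclaim = capacity of the dirty + cached buckets minus the live
 * sectors actually stored in them, clamped at zero like the real code.
 * bucket_size_sectors and live_sectors are in 512-byte sectors. */
static int64_t toy_fragmented_bytes(uint64_t bucket_size_sectors,
				    uint64_t dirty_buckets,
				    uint64_t cached_buckets,
				    uint64_t live_sectors)
{
	int64_t f = (int64_t) ((dirty_buckets + cached_buckets) * bucket_size_sectors)
		  - (int64_t) live_sectors;

	return f > 0 ? f << 9 : 0;
}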
@ -174,7 +138,6 @@ static void pd_controllers_update(struct work_struct *work)
copygc_can_free += fragmented;
}
spin_unlock(&c->tiers[i].devs.lock);
}
rcu_read_unlock();
@ -427,19 +390,22 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
return ret;
}
int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca, u64 *journal_seq)
static int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca, u64 *journal_seq)
{
struct btree_iter iter;
struct bucket *g;
unsigned long bucket;
int ret = 0;
bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
BTREE_ITER_INTENT);
for_each_bucket(g, ca) {
ret = __bch2_alloc_write_key(c, ca, g, &iter, journal_seq);
for_each_set_bit(bucket, ca->bucket_dirty, ca->mi.nbuckets) {
ret = __bch2_alloc_write_key(c, ca, ca->buckets + bucket,
&iter, journal_seq);
if (ret)
break;
clear_bit(bucket, ca->bucket_dirty);
}
bch2_btree_iter_unlock(&iter);
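Note that bch2_alloc_write() no longer walks every bucket; it only flushes buckets flagged in the new ca->bucket_dirty bitmap and clears each bit once the write succeeds. A toy version of that pattern, with a single 64-bit word standing in for the kernel bitmap helpers:

#include <stdint.h>

/* Visit only the buckets whose dirty bit is set, clear the bit after a
 * successful write, stop on the first error (mirrors the
 * for_each_set_bit() loop above). write_one is a stand-in callback. */
static int toy_flush_dirty_buckets(uint64_t *dirty,
				   int (*write_one)(unsigned bucket))
{
	while (*dirty) {
		unsigned bucket = (unsigned) __builtin_ctzll(*dirty);
		int ret = write_one(bucket);

		if (ret)
			return ret;

		*dirty &= ~(1ULL << bucket);
	}

	return 0;
}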
@ -926,8 +892,10 @@ static int bch2_allocator_thread(void *arg)
ca->nr_invalidated = ret;
if (ca->nr_invalidated == fifo_used(&ca->free_inc))
if (ca->nr_invalidated == fifo_used(&ca->free_inc)) {
ca->alloc_thread_started = true;
bch2_alloc_write(c, ca, &journal_seq);
}
if (ca->allocator_invalidating_data)
bch2_journal_flush_seq(&c->journal, journal_seq);
@ -996,6 +964,21 @@ static int bch2_allocator_thread(void *arg)
/* Allocation */
/*
* XXX: allocation on startup is still sketchy. There is insufficient
* synchronization for bch2_bucket_alloc_startup() to work correctly after
* bch2_alloc_write() has been called, and we aren't currently doing anything
* to guarantee that this won't happen.
*
* Even aside from that, it's really difficult to avoid situations where on
* startup we write out a pointer to a freshly allocated bucket before the
* corresponding gen - when we're still digging ourself out of the "i need to
* allocate to write bucket gens, but i need to write bucket gens to allocate"
* hole.
*
* Fortunately, bch2_btree_mark_key_initial() will detect and repair this
* easily enough...
*/
static long bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
{
struct bucket *g;
@ -1012,6 +995,7 @@ static long bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
is_available_bucket(g->mark) &&
bch2_mark_alloc_bucket_startup(ca, g)) {
r = g - ca->buckets;
set_bit(r, ca->bucket_dirty);
break;
}
out:
@ -1055,6 +1039,7 @@ long bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
spin_unlock(&ca->freelist_lock);
if (unlikely(!ca->alloc_thread_started) &&
(reserve == RESERVE_ALLOC) &&
(r = bch2_bucket_alloc_startup(c, ca)) >= 0) {
verify_not_on_freelist(ca, r);
goto out2;
@ -1081,92 +1066,87 @@ enum bucket_alloc_ret {
FREELIST_EMPTY, /* Allocator thread not keeping up */
};
static void recalc_alloc_group_weights(struct bch_fs *c,
struct dev_group *devs)
struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
struct write_point *wp,
struct bch_devs_mask *devs)
{
struct bch_dev *ca;
u64 available_buckets = 1; /* avoid a divide by zero... */
unsigned i;
struct dev_alloc_list ret = { .nr = 0 };
struct bch_dev *ca, *ca2;
unsigned i, j;
for (i = 0; i < devs->nr; i++) {
ca = devs->d[i].dev;
for_each_member_device_rcu(ca, c, i, devs) {
for (j = 0; j < ret.nr; j++) {
unsigned idx = ret.devs[j];
devs->d[i].weight = dev_buckets_free(ca);
available_buckets += devs->d[i].weight;
ca2 = rcu_dereference(c->devs[idx]);
if (!ca2)
break;
if (ca->mi.tier < ca2->mi.tier)
break;
if (ca->mi.tier == ca2->mi.tier &&
wp->next_alloc[i] < wp->next_alloc[idx])
break;
}
for (i = 0; i < devs->nr; i++) {
const unsigned min_weight = U32_MAX >> 4;
const unsigned max_weight = U32_MAX;
devs->d[i].weight =
min_weight +
div64_u64(devs->d[i].weight *
devs->nr *
(max_weight - min_weight),
available_buckets);
devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight);
memmove(&ret.devs[j + 1],
&ret.devs[j],
sizeof(ret.devs[0]) * (ret.nr - j));
ret.nr++;
ret.devs[j] = i;
}
return ret;
}
static enum bucket_alloc_ret bch2_bucket_alloc_group(struct bch_fs *c,
struct open_bucket *ob,
enum alloc_reserve reserve,
unsigned nr_replicas,
struct dev_group *devs,
long *devs_used)
void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
struct write_point *wp)
{
enum bucket_alloc_ret ret;
unsigned fail_idx = -1, i;
unsigned available = 0;
unsigned i;
for (i = 0; i < ARRAY_SIZE(wp->next_alloc); i++)
wp->next_alloc[i] >>= 1;
}
static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
struct write_point *wp,
struct open_bucket *ob,
unsigned nr_replicas,
enum alloc_reserve reserve,
struct bch_devs_mask *devs)
{
enum bucket_alloc_ret ret = NO_DEVICES;
struct dev_alloc_list devs_sorted;
unsigned i;
BUG_ON(nr_replicas > ARRAY_SIZE(ob->ptrs));
if (ob->nr_ptrs >= nr_replicas)
return ALLOC_SUCCESS;
spin_lock(&devs->lock);
rcu_read_lock();
devs_sorted = bch2_wp_alloc_list(c, wp, devs);
for (i = 0; i < devs->nr; i++)
available += !test_bit(devs->d[i].dev->dev_idx,
devs_used);
recalc_alloc_group_weights(c, devs);
i = devs->cur_device;
while (ob->nr_ptrs < nr_replicas) {
struct bch_dev *ca;
for (i = 0; i < devs_sorted.nr; i++) {
struct bch_dev *ca =
rcu_dereference(c->devs[devs_sorted.devs[i]]);
long bucket;
if (!available) {
ret = NO_DEVICES;
goto err;
}
i++;
i %= devs->nr;
ret = FREELIST_EMPTY;
if (i == fail_idx)
goto err;
ca = devs->d[i].dev;
if (test_bit(ca->dev_idx, devs_used))
continue;
if (fail_idx == -1 &&
get_random_int() > devs->d[i].weight)
if (!ca)
continue;
bucket = bch2_bucket_alloc(c, ca, reserve);
if (bucket < 0) {
if (fail_idx == -1)
fail_idx = i;
ret = FREELIST_EMPTY;
continue;
}
wp->next_alloc[ca->dev_idx] +=
div64_u64(U64_MAX, dev_buckets_free(ca) *
ca->mi.bucket_size);
bch2_wp_rescale(c, ca, wp);
/*
* open_bucket_add_buckets expects new pointers at the head of
* the list:
@ -1185,56 +1165,28 @@ static enum bucket_alloc_ret bch2_bucket_alloc_group(struct bch_fs *c,
};
ob->ptr_offset[0] = 0;
__set_bit(ca->dev_idx, devs_used);
available--;
devs->cur_device = i;
if (ob->nr_ptrs == nr_replicas) {
ret = ALLOC_SUCCESS;
break;
}
}
ret = ALLOC_SUCCESS;
err:
EBUG_ON(ret != ALLOC_SUCCESS && reserve == RESERVE_MOVINGGC);
spin_unlock(&devs->lock);
rcu_read_unlock();
return ret;
}
static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
struct write_point *wp,
struct open_bucket *ob,
unsigned nr_replicas,
enum alloc_reserve reserve,
long *devs_used)
{
struct bch_tier *tier;
/*
* this should implement policy - for a given type of allocation, decide
* which devices to allocate from:
*
* XXX: switch off wp->type and do something more intelligent here
*/
if (wp->group)
return bch2_bucket_alloc_group(c, ob, reserve, nr_replicas,
wp->group, devs_used);
/* foreground writes: prefer fastest tier: */
tier = READ_ONCE(c->fastest_tier);
if (tier)
bch2_bucket_alloc_group(c, ob, reserve, nr_replicas,
&tier->devs, devs_used);
return bch2_bucket_alloc_group(c, ob, reserve, nr_replicas,
&c->all_devs, devs_used);
}
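The policy the removed comment asks for is what the new write-point code above implements: bch2_wp_alloc_list() sorts candidate devices by tier and then by a per-device next_alloc counter, each successful allocation charges that device inversely to its free space, and bch2_wp_rescale() halves all counters so they stay bounded. A single-tier toy model of that weighting (free_space is a stand-in for dev_buckets_free() * bucket_size):

#include <stdint.h>

#define TOY_NR_DEVS 4

struct toy_wp { uint64_t next_alloc[TOY_NR_DEVS]; };

/* Pick the device with the smallest next_alloc counter, charge it
 * inversely to its free space, then halve every counter. */
static unsigned toy_pick_dev(struct toy_wp *wp, const uint64_t *free_space)
{
	unsigned i, best = 0;

	for (i = 1; i < TOY_NR_DEVS; i++)
		if (wp->next_alloc[i] < wp->next_alloc[best])
			best = i;

	wp->next_alloc[best] += UINT64_MAX / (free_space[best] ? free_space[best] : 1);

	for (i = 0; i < TOY_NR_DEVS; i++)
		wp->next_alloc[i] >>= 1;

	return best;
}

Devices with more free space pay a smaller penalty per allocation, so over time writes spread roughly in proportion to free space.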
static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp,
struct open_bucket *ob, unsigned nr_replicas,
enum alloc_reserve reserve, long *devs_used,
enum alloc_reserve reserve,
struct bch_devs_mask *devs,
struct closure *cl)
{
bool waiting = false;
while (1) {
switch (__bch2_bucket_alloc_set(c, wp, ob, nr_replicas,
reserve, devs_used)) {
reserve, devs)) {
case ALLOC_SUCCESS:
if (waiting)
closure_wake_up(&c->freelist_wait);
@ -1354,13 +1306,12 @@ static unsigned ob_ptr_sectors_free(struct bch_fs *c,
{
struct bch_dev *ca = c->devs[ptr->dev];
unsigned i = ptr - ob->ptrs;
unsigned bucket_size = ca->mi.bucket_size;
unsigned used = (ptr->offset & (bucket_size - 1)) +
unsigned used = bucket_remainder(ca, ptr->offset) +
ob->ptr_offset[i];
BUG_ON(used > bucket_size);
BUG_ON(used > ca->mi.bucket_size);
return bucket_size - used;
return ca->mi.bucket_size - used;
}
static unsigned open_bucket_sectors_free(struct bch_fs *c,
@ -1432,28 +1383,22 @@ static int open_bucket_add_buckets(struct bch_fs *c,
enum alloc_reserve reserve,
struct closure *cl)
{
long devs_used[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
struct bch_devs_mask devs = c->rw_devs[wp->type];
unsigned i;
int ret;
/*
* We might be allocating pointers to add to an existing extent
* (tiering/copygc/migration) - if so, some of the pointers in our
* existing open bucket might duplicate devices we already have. This is
* moderately annoying.
*/
/* Short circuit all the fun stuff if possible: */
if (ob->nr_ptrs >= nr_replicas)
return 0;
memset(devs_used, 0, sizeof(devs_used));
/* Don't allocate from devices we already have pointers to: */
for (i = 0; i < ob->nr_ptrs; i++)
__set_bit(ob->ptrs[i].dev, devs_used);
__clear_bit(ob->ptrs[i].dev, devs.d);
if (wp->group)
bitmap_and(devs.d, devs.d, wp->group->d, BCH_SB_MEMBERS_MAX);
ret = bch2_bucket_alloc_set(c, wp, ob, nr_replicas,
reserve, devs_used, cl);
reserve, &devs, cl);
if (ret == -EROFS &&
ob->nr_ptrs >= nr_replicas_required)
@ -1568,8 +1513,6 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e,
extent_ptr_append(e, tmp);
ob->ptr_offset[i] += sectors;
this_cpu_add(*c->devs[tmp.dev]->sectors_written, sectors);
}
}
@ -1651,6 +1594,8 @@ void bch2_recalc_capacity(struct bch_fs *c)
unsigned long ra_pages = 0;
unsigned i, j;
lockdep_assert_held(&c->state_lock);
for_each_online_member(ca, c, i) {
struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi;
@ -1663,7 +1608,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
for (tier = c->tiers;
tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) {
if (!tier->devs.nr)
if (!dev_mask_nr(&tier->devs))
continue;
if (!fastest_tier)
fastest_tier = tier;
@ -1681,8 +1626,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
* Capacity of the filesystem is the capacity of all the devices in the
* slowest (highest) tier - we don't include lower tier devices.
*/
spin_lock(&slowest_tier->devs.lock);
group_for_each_dev(ca, &slowest_tier->devs, i) {
for_each_member_device_rcu(ca, c, i, &slowest_tier->devs) {
size_t reserve = 0;
/*
@ -1712,13 +1656,11 @@ void bch2_recalc_capacity(struct bch_fs *c)
reserve += 1; /* tiering write point */
reserve += 1; /* btree write point */
reserved_sectors += reserve << ca->bucket_bits;
reserved_sectors += bucket_to_sector(ca, reserve);
capacity += (ca->mi.nbuckets -
ca->mi.first_bucket) <<
ca->bucket_bits;
capacity += bucket_to_sector(ca, ca->mi.nbuckets -
ca->mi.first_bucket);
}
spin_unlock(&slowest_tier->devs.lock);
set_capacity:
total_capacity = capacity;
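For context on the loop above: each slowest-tier device contributes bucket_to_sector(nbuckets - first_bucket) usable sectors, less a small per-device reserve (copygc, tiering and btree write points). The hunk ends before the reserve is actually applied, so the following is only a rough sketch of how the two sums relate, with plain integers in place of the bcachefs types:

#include <stdint.h>

/* Toy capacity estimate for one tier, everything in 512-byte sectors:
 * raw device capacity minus the per-device reserve. */
static uint64_t toy_tier_capacity(const uint64_t *nbuckets,
				  const uint64_t *first_bucket,
				  const uint64_t *reserve_buckets,
				  uint64_t bucket_size_sectors,
				  unsigned nr_devs)
{
	uint64_t capacity = 0, reserved = 0;
	unsigned i;

	for (i = 0; i < nr_devs; i++) {
		capacity += (nbuckets[i] - first_bucket[i]) * bucket_size_sectors;
		reserved += reserve_buckets[i] * bucket_size_sectors;
	}

	return capacity > reserved ? capacity - reserved : 0;
}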
@ -1795,7 +1737,6 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
/* device goes ro: */
void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
struct dev_group *tier = &c->tiers[ca->mi.tier].devs;
struct closure cl;
unsigned i;
@ -1805,9 +1746,9 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
/* First, remove device from allocation groups: */
bch2_dev_group_remove(&c->journal.devs, ca);
bch2_dev_group_remove(tier, ca);
bch2_dev_group_remove(&c->all_devs, ca);
clear_bit(ca->dev_idx, c->tiers[ca->mi.tier].devs.d);
for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
clear_bit(ca->dev_idx, c->rw_devs[i].d);
/*
* Capacity is calculated based off of devices in allocation groups:
@ -1820,7 +1761,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
bch2_stop_write_point(c, ca, &ca->copygc_write_point);
bch2_stop_write_point(c, ca, &c->promote_write_point);
bch2_stop_write_point(c, ca, &ca->tiering_write_point);
bch2_stop_write_point(c, ca, &c->tiers[ca->mi.tier].wp);
bch2_stop_write_point(c, ca, &c->migration_write_point);
bch2_stop_write_point(c, ca, &c->btree_write_point);
@ -1862,21 +1803,12 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
/* device goes rw: */
void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
{
struct dev_group *tier = &c->tiers[ca->mi.tier].devs;
struct bch_sb_field_journal *journal_buckets;
bool has_journal;
unsigned i;
bch2_dev_group_add(&c->all_devs, ca);
bch2_dev_group_add(tier, ca);
mutex_lock(&c->sb_lock);
journal_buckets = bch2_sb_get_journal(ca->disk_sb.sb);
has_journal = bch2_nr_journal_buckets(journal_buckets) >=
BCH_JOURNAL_BUCKETS_MIN;
mutex_unlock(&c->sb_lock);
if (has_journal)
bch2_dev_group_add(&c->journal.devs, ca);
for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
if (ca->mi.data_allowed & (1 << i))
set_bit(ca->dev_idx, c->rw_devs[i].d);
set_bit(ca->dev_idx, c->tiers[ca->mi.tier].devs.d);
}
/* stop allocator thread: */
@ -1942,13 +1874,17 @@ void bch2_fs_allocator_init(struct bch_fs *c)
list_add(&c->open_buckets[i].list, &c->open_buckets_free);
}
spin_lock_init(&c->all_devs.lock);
c->journal.wp.type = BCH_DATA_JOURNAL;
c->btree_write_point.type = BCH_DATA_BTREE;
for (i = 0; i < ARRAY_SIZE(c->tiers); i++)
spin_lock_init(&c->tiers[i].devs.lock);
c->tiers[i].wp.type = BCH_DATA_USER;
for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
c->write_points[i].throttle = true;
c->write_points[i].type = BCH_DATA_USER;
c->promote_write_point.type = BCH_DATA_USER;
c->migration_write_point.type = BCH_DATA_USER;
c->pd_controllers_update_seconds = 5;
INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);

View File

@ -1,5 +1,5 @@
#ifndef _BCACHE_ALLOC_H
#define _BCACHE_ALLOC_H
#ifndef _BCACHEFS_ALLOC_H
#define _BCACHEFS_ALLOC_H
#include "bcachefs.h"
#include "alloc_types.h"
@ -10,11 +10,18 @@ struct bch_dev;
struct bch_fs;
struct dev_group;
void bch2_dev_group_remove(struct dev_group *, struct bch_dev *);
void bch2_dev_group_add(struct dev_group *, struct bch_dev *);
struct dev_alloc_list {
unsigned nr;
u8 devs[BCH_SB_MEMBERS_MAX];
};
struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *,
struct write_point *,
struct bch_devs_mask *);
void bch2_wp_rescale(struct bch_fs *, struct bch_dev *,
struct write_point *);
int bch2_alloc_read(struct bch_fs *, struct list_head *);
int bch2_alloc_write(struct bch_fs *, struct bch_dev *, u64 *);
int bch2_alloc_replay_key(struct bch_fs *, struct bpos);
long bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve);
@ -46,24 +53,6 @@ static inline void bch2_wake_allocator(struct bch_dev *ca)
rcu_read_unlock();
}
static inline struct bch_dev *dev_group_next(struct dev_group *devs,
unsigned *iter)
{
struct bch_dev *ret = NULL;
while (*iter < devs->nr &&
!(ret = rcu_dereference_check(devs->d[*iter].dev,
lockdep_is_held(&devs->lock))))
(*iter)++;
return ret;
}
#define group_for_each_dev(ca, devs, iter) \
for ((iter) = 0; \
((ca) = dev_group_next((devs), &(iter))); \
(iter)++)
#define open_bucket_for_each_ptr(_ob, _ptr) \
for ((_ptr) = (_ob)->ptrs; \
(_ptr) < (_ob)->ptrs + (_ob)->nr_ptrs; \
@ -81,4 +70,4 @@ void bch2_fs_allocator_init(struct bch_fs *);
extern const struct bkey_ops bch2_bkey_alloc_ops;
#endif /* _BCACHE_ALLOC_H */
#endif /* _BCACHEFS_ALLOC_H */

View File

@ -1,5 +1,5 @@
#ifndef _BCACHE_ALLOC_TYPES_H
#define _BCACHE_ALLOC_TYPES_H
#ifndef _BCACHEFS_ALLOC_TYPES_H
#define _BCACHEFS_ALLOC_TYPES_H
#include <linux/mutex.h>
@ -42,16 +42,6 @@ enum alloc_reserve {
RESERVE_NR = 3,
};
struct dev_group {
spinlock_t lock;
unsigned nr;
unsigned cur_device;
struct {
u64 weight;
struct bch_dev *dev;
} d[BCH_SB_MEMBERS_MAX];
};
/* Enough for 16 cache devices, 2 tiers and some left over for pipelining */
#define OPEN_BUCKETS_COUNT 256
@ -74,22 +64,18 @@ struct open_bucket {
struct write_point {
struct open_bucket *b;
/*
* Throttle writes to this write point if tier 0 is full?
*/
bool throttle;
enum bch_data_type type;
/*
* If not NULL, cache group for tiering, promotion and moving GC -
* always allocates a single replica
*/
struct dev_group *group;
/*
*
* Otherwise do a normal replicated bucket allocation that could come
* from any device in tier 0 (foreground write)
*/
struct bch_devs_mask *group;
u64 next_alloc[BCH_SB_MEMBERS_MAX];
};
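The comment above is the key behavioural split: a write point with a group mask allocates a single replica from just those devices (tiering, promotion, copygc), while a NULL group means a normal replicated foreground allocation from any rw device of that data type. A toy of how the candidate set gets narrowed, mirroring open_bucket_add_buckets() earlier in this commit, with 64-bit words standing in for struct bch_devs_mask:

#include <stdint.h>

/* Toy candidate-device mask: start from the rw devices allowed for this
 * data type, optionally restrict to the write point's group, and drop
 * devices we already hold a pointer to. */
static uint64_t toy_candidate_devs(uint64_t rw_devs_for_type,
				   const uint64_t *group,	/* NULL: no restriction */
				   uint64_t already_used)
{
	uint64_t devs = rw_devs_for_type;

	if (group)
		devs &= *group;

	return devs & ~already_used;
}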
struct alloc_heap_entry {
@ -99,4 +85,4 @@ struct alloc_heap_entry {
typedef HEAP(struct alloc_heap_entry) alloc_heap;
#endif /* _BCACHE_ALLOC_TYPES_H */
#endif /* _BCACHEFS_ALLOC_TYPES_H */

View File

@ -284,7 +284,6 @@ do { \
#include "clock_types.h"
#include "journal_types.h"
#include "keylist_types.h"
#include "move_types.h"
#include "super_types.h"
/* 256k, in sectors */
@ -330,6 +329,7 @@ struct bch_member_cpu {
u8 tier;
u8 replacement;
u8 discard;
u8 data_allowed;
u8 valid;
};
@ -345,6 +345,10 @@ struct bch_replicas_cpu {
struct bch_replicas_cpu_entry entries[];
};
struct io_count {
u64 sectors[2][BCH_DATA_NR];
};
struct bch_dev {
struct kobject kobj;
struct percpu_ref ref;
@ -366,7 +370,7 @@ struct bch_dev {
struct bcache_superblock disk_sb;
int sb_write_error;
struct dev_group self;
struct bch_devs_mask self;
/* biosets used in cloned bios for replicas and moving_gc */
struct bio_set replica_set;
@ -387,7 +391,6 @@ struct bch_dev {
spinlock_t freelist_lock;
unsigned nr_invalidated;
bool alloc_thread_started;
bool need_alloc_write;
size_t fifo_last_bucket;
@ -396,7 +399,7 @@ struct bch_dev {
/* most out of date gen in the btree */
u8 *oldest_gens;
struct bucket *buckets;
unsigned short bucket_bits; /* ilog2(bucket_size) */
unsigned long *bucket_dirty;
/* last calculated minimum prio */
u16 min_prio[2];
@ -423,9 +426,6 @@ struct bch_dev {
struct bch_pd_controller moving_gc_pd;
/* Tiering: */
struct write_point tiering_write_point;
struct write_point copygc_write_point;
struct journal_device journal;
@ -433,9 +433,7 @@ struct bch_dev {
struct work_struct io_error_work;
/* The rest of this all shows up in sysfs */
atomic64_t meta_sectors_written;
atomic64_t btree_sectors_written;
u64 __percpu *sectors_written;
struct io_count __percpu *io_done;
};
/*
@ -472,7 +470,8 @@ struct bch_tier {
struct task_struct *migrate;
struct bch_pd_controller pd;
struct dev_group devs;
struct bch_devs_mask devs;
struct write_point wp;
};
enum bch_fs_state {
@ -520,6 +519,7 @@ struct bch_fs {
u16 block_size;
u16 btree_node_size;
u16 encoded_extent_max;
u8 nr_devices;
u8 clean;
@ -621,7 +621,7 @@ struct bch_fs {
* These contain all r/w devices - i.e. devices we can currently
* allocate from:
*/
struct dev_group all_devs;
struct bch_devs_mask rw_devs[BCH_DATA_NR];
struct bch_tier tiers[BCH_TIER_MAX];
/* NULL if we only have devices in one tier: */
struct bch_tier *fastest_tier;
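The single all_devs group becomes one device mask per data type: a device only appears in rw_devs[type] if its data_allowed flags permit that type, as bch2_dev_allocator_add() earlier in this commit does using the new BCH_MEMBER_DATA_ALLOWED superblock field. A toy version with plain 64-bit masks:

#include <stdint.h>

enum toy_data_type { TOY_DATA_SB, TOY_DATA_JOURNAL, TOY_DATA_BTREE, TOY_DATA_USER, TOY_DATA_NR };

/* Toy per-data-type rw masks: one bitmap of device indices per data
 * type, populated from a per-device "data allowed" bitmap. */
struct toy_fs { uint64_t rw_devs[TOY_DATA_NR]; };

static void toy_dev_make_rw(struct toy_fs *c, unsigned dev_idx, unsigned data_allowed)
{
	unsigned t;

	for (t = 0; t < TOY_DATA_NR; t++)
		if (data_allowed & (1U << t))
			c->rw_devs[t] |= 1ULL << dev_idx;
}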
@ -789,11 +789,6 @@ static inline bool bch2_fs_running(struct bch_fs *c)
return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
}
static inline unsigned bucket_pages(const struct bch_dev *ca)
{
return ca->mi.bucket_size / PAGE_SECTORS;
}
static inline unsigned bucket_bytes(const struct bch_dev *ca)
{
return ca->mi.bucket_size << 9;

View File

@ -77,6 +77,7 @@ struct bpos {
#define KEY_INODE_MAX ((__u64)~0ULL)
#define KEY_OFFSET_MAX ((__u64)~0ULL)
#define KEY_SNAPSHOT_MAX ((__u32)~0U)
#define KEY_SIZE_MAX ((__u32)~0U)
static inline struct bpos POS(__u64 inode, __u64 offset)
{
@ -177,8 +178,6 @@ struct bkey_packed {
#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64))
#define KEY_PACKED_BITS_START 24
#define KEY_SIZE_MAX ((__u32)~0U)
#define KEY_FORMAT_LOCAL_BTREE 0
#define KEY_FORMAT_CURRENT 1
@ -359,14 +358,16 @@ struct bch_csum {
__le64 hi;
} __attribute__((packed, aligned(8)));
#define BCH_CSUM_NONE 0U
#define BCH_CSUM_CRC32C 1U
#define BCH_CSUM_CRC64 2U
#define BCH_CSUM_CHACHA20_POLY1305_80 3U
#define BCH_CSUM_CHACHA20_POLY1305_128 4U
#define BCH_CSUM_NR 5U
enum bch_csum_type {
BCH_CSUM_NONE = 0,
BCH_CSUM_CRC32C = 1,
BCH_CSUM_CRC64 = 2,
BCH_CSUM_CHACHA20_POLY1305_80 = 3,
BCH_CSUM_CHACHA20_POLY1305_128 = 4,
BCH_CSUM_NR = 5,
};
static inline _Bool bch2_csum_type_is_encryption(unsigned type)
static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
{
switch (type) {
case BCH_CSUM_CHACHA20_POLY1305_80:
@ -377,6 +378,14 @@ static inline _Bool bch2_csum_type_is_encryption(unsigned type)
}
}
enum bch_compression_type {
BCH_COMPRESSION_NONE = 0,
BCH_COMPRESSION_LZ4_OLD = 1,
BCH_COMPRESSION_GZIP = 2,
BCH_COMPRESSION_LZ4 = 3,
BCH_COMPRESSION_NR = 4,
};
enum bch_extent_entry_type {
BCH_EXTENT_ENTRY_ptr = 0,
BCH_EXTENT_ENTRY_crc32 = 1,
@ -462,12 +471,6 @@ struct bch_extent_crc128 {
#define CRC128_SIZE_MAX (1U << 13)
#define CRC128_NONCE_MAX ((1U << 13) - 1)
/*
* Max size of an extent that may require bouncing to read or write
* (checksummed, compressed): 64k
*/
#define BCH_ENCODED_EXTENT_MAX 128U
/*
* @reservation - pointer hasn't been written to, just reserved
*/
@ -578,11 +581,12 @@ BKEY_VAL_TYPE(reservation, BCH_RESERVATION);
#define BLOCKDEV_INODE_MAX 4096
#define BCACHE_ROOT_INO 4096
#define BCACHEFS_ROOT_INO 4096
enum bch_inode_types {
BCH_INODE_FS = 128,
BCH_INODE_BLOCKDEV = 129,
BCH_INODE_GENERATION = 130,
};
struct bch_inode {
@ -595,6 +599,15 @@ struct bch_inode {
} __attribute__((packed, aligned(8)));
BKEY_VAL_TYPE(inode, BCH_INODE_FS);
struct bch_inode_generation {
struct bch_val v;
__le32 i_generation;
__le32 pad;
} __attribute__((packed, aligned(8)));
BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION);
#define BCH_INODE_FIELDS() \
BCH_INODE_FIELD(i_atime, 64) \
BCH_INODE_FIELD(i_ctime, 64) \
@ -735,24 +748,14 @@ BKEY_VAL_TYPE(alloc, BCH_ALLOC);
/* Superblock */
/* Version 0: Cache device
* Version 1: Backing device
* Version 2: Seed pointer into btree node checksum
* Version 3: Cache device with new UUID format
* Version 4: Backing device with data offset
* Version 5: All the incompat changes
* Version 6: Cache device UUIDs all in superblock, another incompat bset change
* Version 7: Encryption (expanded checksum fields), other random things
/*
* Version 8: BCH_SB_ENCODED_EXTENT_MAX_BITS
* BCH_MEMBER_DATA_ALLOWED
*/
#define BCACHE_SB_VERSION_CDEV_V0 0
#define BCACHE_SB_VERSION_BDEV 1
#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3
#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4
#define BCACHE_SB_VERSION_CDEV_V2 5
#define BCACHE_SB_VERSION_CDEV_V3 6
#define BCACHE_SB_VERSION_CDEV_V4 7
#define BCACHE_SB_VERSION_CDEV 7
#define BCACHE_SB_MAX_VERSION 7
#define BCH_SB_VERSION_MIN 7
#define BCH_SB_VERSION_EXTENT_MAX 8
#define BCH_SB_VERSION_MAX 8
#define BCH_SB_SECTOR 8
#define BCH_SB_LABEL_SIZE 32
@ -774,6 +777,7 @@ LE64_BITMASK(BCH_MEMBER_TIER, struct bch_member, flags[0], 4, 8)
/* 8-10 unused, was HAS_(META)DATA */
LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14)
LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15);
LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20);
#if 0
LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
@ -880,7 +884,7 @@ LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16);
LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
enum bch_data_types {
enum bch_data_type {
BCH_DATA_NONE = 0,
BCH_DATA_SB = 1,
BCH_DATA_JOURNAL = 2,
@ -981,7 +985,12 @@ LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9);
LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10);
LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14);
/* 14-20 unused, was JOURNAL_ENTRY_SIZE */
/*
* Max size of an extent that may require bouncing to read or write
* (checksummed, compressed): 64k
*/
LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS,
struct bch_sb, flags[1], 14, 20);
LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24);
LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28);
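This replaces the old compile-time BCH_ENCODED_EXTENT_MAX (128 sectors, i.e. 64k, removed from this header above) with a superblock field holding the limit as a power of two; presumably the runtime c->sb.encoded_extent_max used elsewhere in this commit is recovered as 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(sb), though that derivation is not shown in this diff. A toy decoder for a field laid out like this one (6 bits at offset 14 of flags[1]):

#include <stdint.h>

/* Toy decode of an LE64_BITMASK-style field: bits 14..19 hold log2 of
 * the maximum encoded extent size, in 512-byte sectors. */
static unsigned toy_encoded_extent_max_sectors(uint64_t flags1)
{
	unsigned bits = (unsigned) ((flags1 >> 14) & 0x3f);

	return 1U << bits;	/* e.g. bits = 7 -> 128 sectors = 64 KiB */
}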
@ -1032,10 +1041,10 @@ enum bch_str_hash_opts {
};
enum bch_compression_opts {
BCH_COMPRESSION_NONE = 0,
BCH_COMPRESSION_LZ4 = 1,
BCH_COMPRESSION_GZIP = 2,
BCH_COMPRESSION_NR = 3,
BCH_COMPRESSION_OPT_NONE = 0,
BCH_COMPRESSION_OPT_LZ4 = 1,
BCH_COMPRESSION_OPT_GZIP = 2,
BCH_COMPRESSION_OPT_NR = 3,
};
/*
@ -1049,7 +1058,7 @@ enum bch_compression_opts {
UUID_LE(0xf67385c6, 0x1a4e, 0xca45, \
0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81)
#define BCACHE_STATFS_MAGIC 0xca451a4e
#define BCACHEFS_STATFS_MAGIC 0xca451a4e
#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL)
#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL)

View File

@ -287,7 +287,7 @@ struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format,
return out;
}
#ifndef HAVE_BCACHE_COMPILED_UNPACK
#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
struct bpos __bkey_unpack_pos(const struct bkey_format *format,
const struct bkey_packed *in)
{

View File

@ -1,5 +1,5 @@
#ifndef _BCACHE_BKEY_H
#define _BCACHE_BKEY_H
#ifndef _BCACHEFS_BKEY_H
#define _BCACHEFS_BKEY_H
#include <linux/bug.h>
#include "bcachefs_format.h"
@ -345,7 +345,7 @@ bool bch2_bkey_transform(const struct bkey_format *,
struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,
const struct bkey_packed *);
#ifndef HAVE_BCACHE_COMPILED_UNPACK
#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
struct bpos __bkey_unpack_pos(const struct bkey_format *,
const struct bkey_packed *);
#endif
@ -382,7 +382,7 @@ static inline u64 bkey_field_max(const struct bkey_format *f,
}
#ifdef CONFIG_X86_64
#define HAVE_BCACHE_COMPILED_UNPACK 1
#define HAVE_BCACHEFS_COMPILED_UNPACK 1
int bch2_compile_bkey_format(const struct bkey_format *, void *);
@ -575,6 +575,7 @@ BKEY_VAL_ACCESSORS(reservation, BCH_RESERVATION);
BKEY_VAL_ACCESSORS(inode, BCH_INODE_FS);
BKEY_VAL_ACCESSORS(inode_blockdev, BCH_INODE_BLOCKDEV);
BKEY_VAL_ACCESSORS(inode_generation, BCH_INODE_GENERATION);
BKEY_VAL_ACCESSORS(dirent, BCH_DIRENT);
@ -612,4 +613,4 @@ void bch2_bkey_pack_test(void);
static inline void bch2_bkey_pack_test(void) {}
#endif
#endif /* _BCACHE_BKEY_H */
#endif /* _BCACHEFS_BKEY_H */

View File

@ -1,5 +1,5 @@
#ifndef _BCACHE_BKEY_METHODS_H
#define _BCACHE_BKEY_METHODS_H
#ifndef _BCACHEFS_BKEY_METHODS_H
#define _BCACHEFS_BKEY_METHODS_H
#include "bkey.h"
@ -10,6 +10,8 @@ enum bkey_type {
BKEY_TYPE_BTREE,
};
#undef DEF_BTREE_ID
/* Type of a key in btree @id at level @level: */
static inline enum bkey_type bkey_type(unsigned level, enum btree_id id)
{
@ -77,6 +79,4 @@ void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,
extern const struct bkey_ops *bch2_bkey_ops[];
#undef DEF_BTREE_ID
#endif /* _BCACHE_BKEY_METHODS_H */
#endif /* _BCACHEFS_BKEY_METHODS_H */

View File

@ -691,7 +691,7 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
struct bkey_packed *l, *r;
unsigned bits = j < BFLOAT_32BIT_NR ? 32 : 16;
unsigned mantissa;
int shift, exponent;
int shift, exponent, high_bit;
EBUG_ON(bkey_next(p) != m);
@ -737,7 +737,8 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
*/
if (!bkey_packed(l) || !bkey_packed(r) ||
!bkey_packed(p) || !bkey_packed(m)) {
!bkey_packed(p) || !bkey_packed(m) ||
!b->nr_key_bits) {
f->exponent = BFLOAT_FAILED_UNPACKED;
return;
}
@ -752,7 +753,9 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
* Note that this may be negative - we may be running off the low end
* of the key: we handle this later:
*/
exponent = (int) bch2_bkey_greatest_differing_bit(b, l, r) - (bits - 1);
high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r),
min_t(unsigned, bits, b->nr_key_bits) - 1);
exponent = high_bit - (bits - 1);
/*
* Then we calculate the actual shift value, from the start of the key
@ -761,16 +764,16 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
#ifdef __LITTLE_ENDIAN
shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
EBUG_ON(shift + bits > b->format.key_u64s * 64);
BUG_ON(shift + bits > b->format.key_u64s * 64);
#else
shift = high_bit_offset +
b->nr_key_bits -
exponent -
bits;
EBUG_ON(shift < KEY_PACKED_BITS_START);
BUG_ON(shift < KEY_PACKED_BITS_START);
#endif
EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
BUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
f->exponent = shift;
mantissa = bkey_mantissa(m, f, j);
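To make the new exponent and shift arithmetic concrete, here is one worked set of numbers (illustrative only, not taken from the source): a 16-bit bfloat over a key format with 40 key bits packed into two 64-bit words, where the greatest differing bit between l and r is 3.

#include <assert.h>

int main(void)
{
	const int bits = 16, nr_key_bits = 40, key_u64s = 2;
	const int greatest_diff_bit = 3;
	const int min_bits = bits < nr_key_bits ? bits : nr_key_bits;
	/* with nr_key_bits >= bits the clamp keeps the exponent >= 0 */
	const int high_bit = greatest_diff_bit > min_bits - 1
			   ? greatest_diff_bit : min_bits - 1;
	const int exponent = high_bit - (bits - 1);
	/* little-endian branch of the shift calculation above */
	const int shift = (key_u64s * 64 - nr_key_bits) + exponent;

	assert(high_bit == 15 && exponent == 0 && shift == 88);
	assert(shift + bits <= key_u64s * 64);	/* the new BUG_ON holds */

	return 0;
}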

View File

@ -1,5 +1,5 @@
#ifndef _BCACHE_BSET_H
#define _BCACHE_BSET_H
#ifndef _BCACHEFS_BSET_H
#define _BCACHEFS_BSET_H
#include <linux/kernel.h>
#include <linux/types.h>
@ -183,7 +183,7 @@ bkey_unpack_key_format_checked(const struct btree *b,
{
struct bkey dst;
#ifdef HAVE_BCACHE_COMPILED_UNPACK
#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
{
compiled_unpack_fn unpack_fn = b->aux_data;
unpack_fn(&dst, src);
@ -221,7 +221,7 @@ static inline struct bpos
bkey_unpack_pos_format_checked(const struct btree *b,
const struct bkey_packed *src)
{
#ifdef HAVE_BCACHE_COMPILED_UNPACK
#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
return bkey_unpack_key_format_checked(b, src).p;
#else
return __bkey_unpack_pos(&b->format, src);
@ -618,4 +618,4 @@ static inline void bch2_verify_btree_nr_keys(struct btree *b)
__bch2_verify_btree_nr_keys(b);
}
#endif
#endif /* _BCACHEFS_BSET_H */

View File

@ -180,8 +180,8 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
btree_node_wait_on_io(b);
}
out:
if (PTR_HASH(&b->key))
trace_btree_node_reap(c, b, ret);
if (PTR_HASH(&b->key) && !ret)
trace_btree_node_reap(c, b);
return ret;
out_unlock:
six_unlock_write(&b->lock);

View File

@ -1,5 +1,5 @@
#ifndef _BCACHE_BTREE_CACHE_H
#define _BCACHE_BTREE_CACHE_H
#ifndef _BCACHEFS_BTREE_CACHE_H
#define _BCACHEFS_BTREE_CACHE_H
#include "bcachefs.h"
#include "btree_types.h"
@ -59,14 +59,14 @@ static inline size_t btree_max_u64s(struct bch_fs *c)
return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64);
}
static inline size_t btree_pages(struct bch_fs *c)
{
return c->sb.btree_node_size >> (PAGE_SHIFT - 9);
}
static inline size_t btree_page_order(struct bch_fs *c)
{
return ilog2(btree_pages(c));
return get_order(btree_bytes(c));
}
static inline size_t btree_pages(struct bch_fs *c)
{
return 1 << btree_page_order(c);
}
static inline unsigned btree_blocks(struct bch_fs *c)
@ -86,4 +86,4 @@ static inline unsigned btree_blocks(struct bch_fs *c)
int bch2_print_btree_node(struct bch_fs *, struct btree *,
char *, size_t);
#endif /* _BCACHE_BTREE_CACHE_H */
#endif /* _BCACHEFS_BTREE_CACHE_H */

View File

@ -129,7 +129,7 @@ static u8 bch2_btree_mark_key(struct bch_fs *c, enum bkey_type type,
int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k)
{
enum bch_data_types data_type = type == BKEY_TYPE_BTREE
enum bch_data_type data_type = type == BKEY_TYPE_BTREE
? BCH_DATA_BTREE : BCH_DATA_USER;
int ret = 0;
@ -152,20 +152,23 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
struct bch_dev *ca = c->devs[ptr->dev];
struct bucket *g = PTR_BUCKET(ca, ptr);
if (!g->mark.gen_valid) {
if (mustfix_fsck_err_on(!g->mark.gen_valid, c,
"found ptr with missing gen in alloc btree,\n"
"type %s gen %u",
bch2_data_types[data_type],
ptr->gen)) {
g->_mark.gen = ptr->gen;
g->_mark.gen_valid = 1;
ca->need_alloc_write = true;
set_bit(g - ca->buckets, ca->bucket_dirty);
}
if (fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
"%s ptr gen in the future: %u > %u",
type == BKEY_TYPE_BTREE
? "btree" : "data",
bch2_data_types[data_type],
ptr->gen, g->mark.gen)) {
g->_mark.gen = ptr->gen;
g->_mark.gen_valid = 1;
ca->need_alloc_write = true;
set_bit(g - ca->buckets, ca->bucket_dirty);
set_bit(BCH_FS_FIXED_GENS, &c->flags);
}
@ -308,12 +311,12 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
static void mark_metadata_sectors(struct bch_dev *ca, u64 start, u64 end,
enum bucket_data_type type)
{
u64 b = start >> ca->bucket_bits;
u64 b = sector_to_bucket(ca, start);
do {
bch2_mark_metadata_bucket(ca, ca->buckets + b, type, true);
b++;
} while (b < end >> ca->bucket_bits);
} while (b < sector_to_bucket(ca, end));
}
static void bch2_dev_mark_superblocks(struct bch_dev *ca)
@ -608,7 +611,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
return;
}
trace_btree_gc_coalesce(c, parent, nr_old_nodes);
trace_btree_gc_coalesce(c, old_nodes[0]);
for (i = 0; i < nr_old_nodes; i++)
bch2_btree_interior_update_will_free_node(as, old_nodes[i]);

View File

@ -1,5 +1,5 @@
#ifndef _BCACHE_GC_H
#define _BCACHE_GC_H
#ifndef _BCACHEFS_BTREE_GC_H
#define _BCACHEFS_BTREE_GC_H
#include "btree_types.h"
@ -101,4 +101,4 @@ static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos)
return ret;
}
#endif
#endif /* _BCACHEFS_BTREE_GC_H */

View File

@ -1292,6 +1292,9 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
bio->bi_iter.bi_size = btree_bytes(c);
bch2_bio_map(bio, b->data);
this_cpu_add(pick.ca->io_done->sectors[READ][BCH_DATA_BTREE],
bio_sectors(bio));
set_btree_node_read_in_flight(b);
if (sync) {
@ -1702,13 +1705,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
extent_for_each_ptr(e, ptr)
ptr->offset += b->written;
extent_for_each_ptr(e, ptr)
atomic64_add(sectors_to_write,
&c->devs[ptr->dev]->btree_sectors_written);
b->written += sectors_to_write;
bch2_submit_wbio_replicas(wbio, c, &k.key);
bch2_submit_wbio_replicas(wbio, c, BCH_DATA_BTREE, &k.key);
return;
err:
set_btree_node_noevict(b);

View File

@ -1,5 +1,5 @@
#ifndef _BCACHE_BTREE_IO_H
#define _BCACHE_BTREE_IO_H
#ifndef _BCACHEFS_BTREE_IO_H
#define _BCACHEFS_BTREE_IO_H
#include "extents.h"
@ -109,4 +109,4 @@ do { \
void bch2_btree_verify_flushed(struct bch_fs *);
#endif /* _BCACHE_BTREE_IO_H */
#endif /* _BCACHEFS_BTREE_IO_H */

View File

@ -249,10 +249,10 @@ fail:
static void __bch2_btree_iter_unlock(struct btree_iter *iter)
{
iter->flags &= ~BTREE_ITER_UPTODATE;
while (iter->nodes_locked)
btree_node_unlock(iter, __ffs(iter->nodes_locked));
iter->flags &= ~BTREE_ITER_UPTODATE;
}
int bch2_btree_iter_unlock(struct btree_iter *iter)
@ -627,9 +627,9 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
unsigned level = b->level;
if (iter->nodes[level] == b) {
iter->flags &= ~BTREE_ITER_UPTODATE;
btree_node_unlock(iter, level);
iter->nodes[level] = BTREE_ITER_NOT_END;
iter->flags &= ~BTREE_ITER_UPTODATE;
}
}
@ -840,6 +840,11 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
{
unsigned depth_want = iter->level;
if (unlikely(!iter->nodes[iter->level]))
return 0;
iter->flags &= ~(BTREE_ITER_UPTODATE|BTREE_ITER_AT_END_OF_LEAF);
/* make sure we have all the intent locks we need - ugh */
if (unlikely(iter->nodes[iter->level] &&
iter->level + 1 < iter->locks_want)) {
@ -893,6 +898,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
: btree_iter_lock_root(iter, depth_want);
if (unlikely(ret)) {
iter->level = depth_want;
iter->nodes[iter->level] = BTREE_ITER_NOT_END;
return ret;
}
}
@ -904,13 +910,6 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *iter)
{
int ret;
iter->flags &= ~BTREE_ITER_UPTODATE;
if (unlikely(!iter->nodes[iter->level]))
return 0;
iter->flags &= ~BTREE_ITER_AT_END_OF_LEAF;
ret = __bch2_btree_iter_traverse(iter);
if (unlikely(ret))
ret = btree_iter_traverse_error(iter, ret);
@ -1068,6 +1067,8 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
.v = bkeyp_val(&b->format, k)
};
EBUG_ON(!btree_node_locked(iter, 0));
if (debug_check_bkeys(iter->c))
bch2_bkey_debugcheck(iter->c, b, ret);
return ret;

View File

@ -1,5 +1,5 @@
#ifndef _BCACHE_BTREE_ITER_H
#define _BCACHE_BTREE_ITER_H
#ifndef _BCACHEFS_BTREE_ITER_H
#define _BCACHEFS_BTREE_ITER_H
#include "btree_types.h"
@ -263,4 +263,4 @@ static inline void bch2_btree_iter_cond_resched(struct btree_iter *iter)
}
}
#endif /* _BCACHE_BTREE_ITER_H */
#endif /* _BCACHEFS_BTREE_ITER_H */

View File

@ -1,5 +1,5 @@
#ifndef _BCACHE_BTREE_LOCKING_H
#define _BCACHE_BTREE_LOCKING_H
#ifndef _BCACHEFS_BTREE_LOCKING_H
#define _BCACHEFS_BTREE_LOCKING_H
/*
* Only for internal btree use:
@ -91,6 +91,8 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
{
int lock_type = btree_node_locked_type(iter, level);
EBUG_ON(iter->flags & BTREE_ITER_UPTODATE);
if (lock_type != BTREE_NODE_UNLOCKED)
six_unlock_type(&iter->nodes[level]->lock, lock_type);
mark_btree_node_unlocked(iter, level);
@ -113,4 +115,4 @@ bool bch2_btree_node_relock(struct btree_iter *, unsigned);
void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
void bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
#endif /* _BCACHE_BTREE_LOCKING_H */
#endif /* _BCACHEFS_BTREE_LOCKING_H */

View File

@ -1,5 +1,5 @@
#ifndef _BCACHE_BTREE_TYPES_H
#define _BCACHE_BTREE_TYPES_H
#ifndef _BCACHEFS_BTREE_TYPES_H
#define _BCACHEFS_BTREE_TYPES_H
#include <linux/list.h>
#include <linux/rhashtable.h>
@ -321,4 +321,4 @@ typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
struct btree *,
struct btree_node_iter *);
#endif /* _BCACHE_BTREE_TYPES_H */
#endif /* _BCACHEFS_BTREE_TYPES_H */

View File

@ -1,5 +1,5 @@
#ifndef _BCACHE_BTREE_UPDATE_H
#define _BCACHE_BTREE_UPDATE_H
#ifndef _BCACHEFS_BTREE_UPDATE_H
#define _BCACHEFS_BTREE_UPDATE_H
#include "btree_iter.h"
#include "journal.h"
@ -133,5 +133,4 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
int bch2_btree_node_update_key(struct bch_fs *, struct btree *,
struct bkey_i_extent *);
#endif /* _BCACHE_BTREE_UPDATE_H */
#endif /* _BCACHEFS_BTREE_UPDATE_H */

View File

@ -1310,7 +1310,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
btree_split_insert_keys(as, n1, iter, keys);
if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) {
trace_btree_node_split(c, b, b->nr.live_u64s);
trace_btree_split(c, b);
n2 = __btree_split_node(as, n1, iter);
@ -1340,7 +1340,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
bch2_btree_node_write(c, n3, &as->cl, SIX_LOCK_intent);
}
} else {
trace_btree_node_compact(c, b, b->nr.live_u64s);
trace_btree_compact(c, b);
bch2_btree_build_aux_trees(n1);
six_unlock_write(&n1->lock);
@ -1882,12 +1882,13 @@ retry:
if (new_hash) {
mutex_lock(&c->btree_cache_lock);
bch2_btree_node_hash_remove(c, new_hash);
bch2_btree_node_hash_remove(c, b);
bkey_copy(&b->key, &new_key->k_i);
__bch2_btree_node_hash_insert(c, b);
bch2_btree_node_hash_remove(c, new_hash);
ret = __bch2_btree_node_hash_insert(c, b);
BUG_ON(ret);
mutex_unlock(&c->btree_cache_lock);
} else {
bkey_copy(&b->key, &new_key->k_i);
@ -1959,7 +1960,10 @@ int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
while (1) {
/* XXX haven't calculated capacity yet :/ */
as = bch2_btree_update_start(c, id, 1, 0, &cl);
as = bch2_btree_update_start(c, id, 1,
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE,
&cl);
if (!IS_ERR(as))
break;

View File

@ -1,5 +1,5 @@
#ifndef _BCACHE_BTREE_UPDATE_INTERIOR_H
#define _BCACHE_BTREE_UPDATE_INTERIOR_H
#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H
#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H
#include "btree_cache.h"
#include "btree_update.h"
@ -309,4 +309,4 @@ static inline bool journal_res_insert_fits(struct btree_insert *trans,
return u64s <= trans->journal_res.u64s;
}
#endif /* _BCACHE_BTREE_UPDATE_INTERIOR_H */
#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */

View File

@ -314,7 +314,8 @@ bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
}));
if (!old->owned_by_allocator && old->cached_sectors)
trace_invalidate(ca, g - ca->buckets, old->cached_sectors);
trace_invalidate(ca, bucket_to_sector(ca, g - ca->buckets),
old->cached_sectors);
return true;
}
@ -522,7 +523,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
if (saturated &&
atomic_long_add_return(saturated,
&ca->saturated_count) >=
ca->free_inc.size << ca->bucket_bits) {
bucket_to_sector(ca, ca->free_inc.size)) {
if (c->gc_thread) {
trace_gc_sectors_saturated(c);
wake_up_process(c->gc_thread);

View File

@ -45,28 +45,6 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
return sector_to_bucket(ca, ptr->offset);
}
/*
* Returns 0 if no pointers or device offline - only for tracepoints!
*/
static inline size_t PTR_BUCKET_NR_TRACE(const struct bch_fs *c,
const struct bkey_i *k,
unsigned ptr)
{
size_t bucket = 0;
#if 0
if (bkey_extent_is_data(&k->k)) {
const struct bch_extent_ptr *ptr;
extent_for_each_ptr(bkey_i_to_s_c_extent(k), ptr) {
const struct bch_dev *ca = c->devs[ptr->dev];
bucket = PTR_BUCKET_NR(ca, ptr);
break;
}
}
#endif
return bucket;
}
static inline struct bucket *PTR_BUCKET(const struct bch_dev *ca,
const struct bch_extent_ptr *ptr)
{

View File

@ -3,7 +3,7 @@
#include "util.h"
/* kill, switch to bch_data_types */
/* kill, switch to bch_data_type */
enum bucket_data_type {
BUCKET_DATA = 0,
BUCKET_BTREE,

View File

@ -1,3 +1,5 @@
#ifndef NO_BCACHEFS_CHARDEV
#include "bcachefs.h"
#include "bcachefs_ioctl.h"
#include "super.h"
@ -404,3 +406,5 @@ int __init bch2_chardev_init(void)
return 0;
}
#endif /* NO_BCACHEFS_CHARDEV */

View File

@ -1,7 +1,7 @@
#ifndef _BCACHE_CHARDEV_H
#define _BCACHE_CHARDEV_H
#ifndef _BCACHEFS_CHARDEV_H
#define _BCACHEFS_CHARDEV_H
#ifndef NO_BCACHE_CHARDEV
#ifndef NO_BCACHEFS_FS
long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *);
@ -25,6 +25,6 @@ static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; }
static inline void bch2_chardev_exit(void) {}
static inline int __init bch2_chardev_init(void) { return 0; }
#endif
#endif /* NO_BCACHEFS_FS */
#endif /* _BCACHE_CHARDEV_H */
#endif /* _BCACHEFS_CHARDEV_H */

View File

@ -1,5 +1,5 @@
#ifndef _BCACHE_CHECKSUM_H
#define _BCACHE_CHECKSUM_H
#ifndef _BCACHEFS_CHECKSUM_H
#define _BCACHEFS_CHECKSUM_H
#include "bcachefs.h"
#include "super-io.h"
@ -46,21 +46,51 @@ int bch2_enable_encryption(struct bch_fs *, bool);
void bch2_fs_encryption_exit(struct bch_fs *);
int bch2_fs_encryption_init(struct bch_fs *);
static inline unsigned bch2_data_checksum_type(struct bch_fs *c)
static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type)
{
switch (type) {
case BCH_CSUM_OPT_NONE:
return BCH_CSUM_NONE;
case BCH_CSUM_OPT_CRC32C:
return BCH_CSUM_CRC32C;
case BCH_CSUM_OPT_CRC64:
return BCH_CSUM_CRC64;
default:
BUG();
}
}
static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c)
{
if (c->sb.encryption_type)
return c->opts.wide_macs
? BCH_CSUM_CHACHA20_POLY1305_128
: BCH_CSUM_CHACHA20_POLY1305_80;
return c->opts.data_checksum;
return bch2_csum_opt_to_type(c->opts.data_checksum);
}
static inline unsigned bch2_meta_checksum_type(struct bch_fs *c)
static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
{
return c->sb.encryption_type
? BCH_CSUM_CHACHA20_POLY1305_128
: c->opts.metadata_checksum;
if (c->sb.encryption_type)
return BCH_CSUM_CHACHA20_POLY1305_128;
return bch2_csum_opt_to_type(c->opts.metadata_checksum);
}
static inline enum bch_compression_type
bch2_compression_opt_to_type(enum bch_compression_opts type)
{
switch (type) {
case BCH_COMPRESSION_OPT_NONE:
return BCH_COMPRESSION_NONE;
case BCH_COMPRESSION_OPT_LZ4:
return BCH_COMPRESSION_LZ4;
case BCH_COMPRESSION_OPT_GZIP:
return BCH_COMPRESSION_GZIP;
default:
BUG();
}
}
static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
@ -130,4 +160,4 @@ static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c)
}};
}
#endif /* _BCACHE_CHECKSUM_H */
#endif /* _BCACHEFS_CHECKSUM_H */

View File

@ -1,5 +1,5 @@
#ifndef _BCACHE_CLOCK_H
#define _BCACHE_CLOCK_H
#ifndef _BCACHEFS_CLOCK_H
#define _BCACHEFS_CLOCK_H
void bch2_io_timer_add(struct io_clock *, struct io_timer *);
void bch2_io_timer_del(struct io_clock *, struct io_timer *);
@ -20,4 +20,4 @@ void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
void bch2_io_clock_exit(struct io_clock *);
int bch2_io_clock_init(struct io_clock *);
#endif /* _BCACHE_CLOCK_H */
#endif /* _BCACHEFS_CLOCK_H */

View File

@ -1,5 +1,5 @@
#ifndef _BCACHE_CLOCK_TYPES_H
#define _BCACHE_CLOCK_TYPES_H
#ifndef _BCACHEFS_CLOCK_TYPES_H
#define _BCACHEFS_CLOCK_TYPES_H
#include "util.h"
@ -32,5 +32,4 @@ struct io_clock {
io_timer_heap timers;
};
#endif /* _BCACHE_CLOCK_TYPES_H */
#endif /* _BCACHEFS_CLOCK_TYPES_H */

View File

@ -5,45 +5,53 @@
#include "super-io.h"
#include "lz4.h"
#include <linux/lz4.h>
#include <linux/zlib.h>
enum bounced {
BOUNCED_CONTIG,
BOUNCED_MAPPED,
BOUNCED_KMALLOCED,
BOUNCED_VMALLOCED,
BOUNCED_MEMPOOLED,
/* Bounce buffer: */
struct bbuf {
void *b;
enum {
BB_NONE,
BB_VMAP,
BB_KMALLOC,
BB_VMALLOC,
BB_MEMPOOL,
} type;
int rw;
};
static void *__bounce_alloc(struct bch_fs *c, unsigned size,
unsigned *bounced, int direction)
static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw)
{
void *data;
void *b;
*bounced = BOUNCED_KMALLOCED;
data = kmalloc(size, GFP_NOIO|__GFP_NOWARN);
if (data)
return data;
BUG_ON(size > c->sb.encoded_extent_max);
*bounced = BOUNCED_MEMPOOLED;
data = mempool_alloc(&c->compression_bounce[direction], GFP_NOWAIT);
if (data)
return page_address(data);
b = kmalloc(size, GFP_NOIO|__GFP_NOWARN);
if (b)
return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw };
*bounced = BOUNCED_VMALLOCED;
data = vmalloc(size);
if (data)
return data;
b = mempool_alloc(&c->compression_bounce[rw], GFP_NOWAIT);
b = b ? page_address(b) : NULL;
if (b)
return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
*bounced = BOUNCED_MEMPOOLED;
data = mempool_alloc(&c->compression_bounce[direction], GFP_NOIO);
return page_address(data);
b = vmalloc(size);
if (b)
return (struct bbuf) { .b = b, .type = BB_VMALLOC, .rw = rw };
b = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO);
b = b ? page_address(b) : NULL;
if (b)
return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
BUG();
}
static void *__bio_map_or_bounce(struct bch_fs *c,
struct bio *bio, struct bvec_iter start,
unsigned *bounced, int direction)
static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
struct bvec_iter start, int rw)
{
struct bbuf ret;
struct bio_vec bv;
struct bvec_iter iter;
unsigned nr_pages = 0;
@ -53,18 +61,17 @@ static void *__bio_map_or_bounce(struct bch_fs *c,
unsigned prev_end = PAGE_SIZE;
void *data;
BUG_ON(bvec_iter_sectors(start) > BCH_ENCODED_EXTENT_MAX);
BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max);
#ifndef CONFIG_HIGHMEM
*bounced = BOUNCED_CONTIG;
__bio_for_each_contig_segment(bv, bio, iter, start) {
if (bv.bv_len == start.bi_size)
return page_address(bv.bv_page) + bv.bv_offset;
return (struct bbuf) {
.b = page_address(bv.bv_page) + bv.bv_offset,
.type = BB_NONE, .rw = rw
};
}
#endif
*bounced = BOUNCED_MAPPED;
__bio_for_each_segment(bv, bio, iter, start) {
if ((!first && bv.bv_offset) ||
prev_end != PAGE_SIZE)
@ -90,41 +97,43 @@ static void *__bio_map_or_bounce(struct bch_fs *c,
if (pages != stack_pages)
kfree(pages);
return data + bio_iter_offset(bio, start);
if (data)
return (struct bbuf) {
.b = data + bio_iter_offset(bio, start),
.type = BB_VMAP, .rw = rw
};
bounce:
data = __bounce_alloc(c, start.bi_size, bounced, direction);
ret = __bounce_alloc(c, start.bi_size, rw);
if (direction == READ)
memcpy_from_bio(data, bio, start);
if (rw == READ)
memcpy_from_bio(ret.b, bio, start);
return data;
return ret;
}
static void *bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
unsigned *bounced, int direction)
static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw)
{
return __bio_map_or_bounce(c, bio, bio->bi_iter, bounced, direction);
return __bio_map_or_bounce(c, bio, bio->bi_iter, rw);
}
static void bio_unmap_or_unbounce(struct bch_fs *c, void *data,
unsigned bounced, int direction)
static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf)
{
if (!data)
return;
switch (bounced) {
case BOUNCED_MAPPED:
vunmap((void *) ((unsigned long) data & PAGE_MASK));
return;
case BOUNCED_KMALLOCED:
kfree(data);
return;
case BOUNCED_VMALLOCED:
vfree(data);
return;
case BOUNCED_MEMPOOLED:
mempool_free(virt_to_page(data), &c->compression_bounce[direction]);
return;
switch (buf.type) {
case BB_NONE:
break;
case BB_VMAP:
vunmap((void *) ((unsigned long) buf.b & PAGE_MASK));
break;
case BB_KMALLOC:
kfree(buf.b);
break;
case BB_VMALLOC:
vfree(buf.b);
break;
case BB_MEMPOOL:
mempool_free(virt_to_page(buf.b),
&c->compression_bounce[buf.rw]);
break;
}
}
@ -138,23 +147,30 @@ static inline void zlib_set_workspace(z_stream *strm, void *workspace)
static int __bio_uncompress(struct bch_fs *c, struct bio *src,
void *dst_data, struct bch_extent_crc128 crc)
{
void *src_data = NULL;
unsigned src_bounced;
struct bbuf src_data = { NULL };
size_t src_len = src->bi_iter.bi_size;
size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
int ret;
src_data = bio_map_or_bounce(c, src, &src_bounced, READ);
src_data = bio_map_or_bounce(c, src, READ);
switch (crc.compression_type) {
case BCH_COMPRESSION_LZ4:
ret = lz4_decompress(src_data, &src_len,
case BCH_COMPRESSION_LZ4_OLD:
ret = bch2_lz4_decompress(src_data.b, &src_len,
dst_data, dst_len);
if (ret) {
ret = -EIO;
goto err;
}
break;
case BCH_COMPRESSION_LZ4:
ret = LZ4_decompress_safe(src_data.b, dst_data,
src_len, dst_len);
if (ret != dst_len) {
ret = -EIO;
goto err;
}
break;
case BCH_COMPRESSION_GZIP: {
void *workspace;
z_stream strm;
@ -166,7 +182,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
workspace = c->zlib_workspace;
}
strm.next_in = src_data;
strm.next_in = src_data.b;
strm.avail_in = src_len;
strm.next_out = dst_data;
strm.avail_out = dst_len;
@ -191,7 +207,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
}
ret = 0;
err:
bio_unmap_or_unbounce(c, src_data, src_bounced, READ);
bio_unmap_or_unbounce(c, src_data);
return ret;
}
@ -199,21 +215,19 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
unsigned live_data_sectors,
struct bch_extent_crc128 crc)
{
void *dst_data = NULL;
struct bbuf dst_data = { NULL };
size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
int ret = -ENOMEM;
BUG_ON(DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS) > bio->bi_max_vecs);
/* XXX mempoolify */
dst_data = kmalloc(dst_len, GFP_NOIO|__GFP_NOWARN);
if (!dst_data) {
dst_data = vmalloc(dst_len);
if (!dst_data)
goto err;
}
if (crc_uncompressed_size(NULL, &crc) > c->sb.encoded_extent_max ||
crc_compressed_size(NULL, &crc) > c->sb.encoded_extent_max)
return -EIO;
ret = __bio_uncompress(c, bio, dst_data, crc);
dst_data = __bounce_alloc(c, dst_len, WRITE);
ret = __bio_uncompress(c, bio, dst_data.b, crc);
if (ret)
goto err;
@ -231,9 +245,9 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
bio->bi_iter.bi_size = live_data_sectors << 9;
copy_data:
memcpy_to_bio(bio, bio->bi_iter, dst_data + (crc.offset << 9));
memcpy_to_bio(bio, bio->bi_iter, dst_data.b + (crc.offset << 9));
err:
kvfree(dst_data);
bio_unmap_or_unbounce(c, dst_data);
return ret;
use_mempool:
/*
@ -251,67 +265,72 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
struct bio *dst, struct bvec_iter dst_iter,
struct bch_extent_crc128 crc)
{
void *dst_data = NULL;
unsigned dst_bounced;
struct bbuf dst_data = { NULL };
size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
int ret = -ENOMEM;
dst_data = dst_len == dst_iter.bi_size
? __bio_map_or_bounce(c, dst, dst_iter, &dst_bounced, WRITE)
: __bounce_alloc(c, dst_len, &dst_bounced, WRITE);
if (crc_uncompressed_size(NULL, &crc) < c->sb.encoded_extent_max)
return -EIO;
ret = __bio_uncompress(c, src, dst_data, crc);
dst_data = dst_len == dst_iter.bi_size
? __bio_map_or_bounce(c, dst, dst_iter, WRITE)
: __bounce_alloc(c, dst_len, WRITE);
ret = __bio_uncompress(c, src, dst_data.b, crc);
if (ret)
goto err;
if (dst_bounced)
memcpy_to_bio(dst, dst_iter, dst_data + (crc.offset << 9));
if (dst_data.type != BB_NONE)
memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9));
err:
bio_unmap_or_unbounce(c, dst_data, dst_bounced, WRITE);
bio_unmap_or_unbounce(c, dst_data);
return ret;
}
static int __bio_compress(struct bch_fs *c,
struct bio *dst, size_t *dst_len,
struct bio *src, size_t *src_len,
unsigned compression_type)
unsigned *compression_type)
{
void *src_data = NULL, *dst_data = NULL;
unsigned src_bounced, dst_bounced, pad;
int ret = -1;
struct bbuf src_data = { NULL }, dst_data = { NULL };
unsigned pad;
int ret;
dst_data = bio_map_or_bounce(c, dst, &dst_bounced, WRITE);
src_data = bio_map_or_bounce(c, src, &src_bounced, READ);
dst_data = bio_map_or_bounce(c, dst, WRITE);
src_data = bio_map_or_bounce(c, src, READ);
switch (*compression_type) {
case BCH_COMPRESSION_LZ4_OLD:
*compression_type = BCH_COMPRESSION_LZ4;
switch (compression_type) {
case BCH_COMPRESSION_LZ4: {
void *workspace;
int len = src->bi_iter.bi_size;
*dst_len = dst->bi_iter.bi_size;
*src_len = src->bi_iter.bi_size;
ret = 0;
workspace = mempool_alloc(&c->lz4_workspace_pool, GFP_NOIO);
while (*src_len > block_bytes(c) &&
(ret = lz4_compress(src_data, *src_len,
dst_data, dst_len,
workspace))) {
while (len > block_bytes(c) &&
(!(ret = LZ4_compress_destSize(
src_data.b, dst_data.b,
&len, dst->bi_iter.bi_size,
workspace)) ||
(len & (block_bytes(c) - 1)))) {
/*
* On error, the compressed data was bigger than
* dst_len, and -ret is the amount of data we were able
* to compress - round down to nearest block and try
* again:
* dst_len - round down to nearest block and try again:
*/
BUG_ON(ret > 0);
BUG_ON(-ret >= *src_len);
*src_len = round_down(-ret, block_bytes(c));
len = round_down(len, block_bytes(c));
}
mempool_free(workspace, &c->lz4_workspace_pool);
if (ret)
if (!ret)
goto err;
*src_len = len;
*dst_len = ret;
break;
}
case BCH_COMPRESSION_GZIP: {
@ -326,10 +345,10 @@ static int __bio_compress(struct bch_fs *c,
workspace = c->zlib_workspace;
}
strm.next_in = src_data;
strm.next_in = src_data.b;
strm.avail_in = min(src->bi_iter.bi_size,
dst->bi_iter.bi_size);
strm.next_out = dst_data;
strm.next_out = dst_data.b;
strm.avail_out = dst->bi_iter.bi_size;
zlib_set_workspace(&strm, workspace);
zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
@ -366,29 +385,24 @@ zlib_err:
BUG();
}
BUG_ON(!*dst_len);
BUG_ON(*dst_len > dst->bi_iter.bi_size);
BUG_ON(*src_len & (block_bytes(c) - 1));
BUG_ON(*src_len > src->bi_iter.bi_size);
/* Didn't get smaller: */
if (round_up(*dst_len, block_bytes(c)) >= *src_len) {
ret = -1;
if (round_up(*dst_len, block_bytes(c)) >= *src_len)
goto err;
}
pad = round_up(*dst_len, block_bytes(c)) - *dst_len;
memset(dst_data + *dst_len, 0, pad);
memset(dst_data.b + *dst_len, 0, pad);
*dst_len += pad;
if (dst_bounced)
memcpy_to_bio(dst, dst->bi_iter, dst_data);
err:
bio_unmap_or_unbounce(c, src_data, src_bounced, READ);
bio_unmap_or_unbounce(c, dst_data, dst_bounced, WRITE);
if (dst_data.type != BB_NONE)
memcpy_to_bio(dst, dst->bi_iter, dst_data.b);
out:
bio_unmap_or_unbounce(c, src_data);
bio_unmap_or_unbounce(c, dst_data);
return ret;
err:
ret = -1;
goto out;
}
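In the LZ4 path above the compressor is retried until it consumes a whole number of blocks (rounding the source length down), and at the end the output is zero-padded up to a block boundary (rounding *dst_len up). A small worked example of both roundings, with an arbitrarily chosen 4096-byte block size:

#include <stdio.h>

/* Kernel-style helpers, valid for power-of-two sizes: */
#define round_down(x, y)	((x) & ~((__typeof__(x))(y) - 1))
#define round_up(x, y)		((((x) - 1) | ((__typeof__(x))(y) - 1)) + 1)

int main(void)
{
	unsigned block_bytes = 4096;	/* assumed block size */
	unsigned consumed    = 10000;	/* input bytes the compressor consumed */
	unsigned produced    = 5000;	/* compressed bytes it produced */

	/* Retry the compressor with a whole number of input blocks: */
	printf("src_len -> %u\n", round_down(consumed, block_bytes));	/* 8192 */

	/* Zero-pad the output up to the next block boundary: */
	printf("dst_len -> %u\n", round_up(produced, block_bytes));	/* 8192 */
	return 0;
}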
void bch2_bio_compress(struct bch_fs *c,
@ -400,8 +414,8 @@ void bch2_bio_compress(struct bch_fs *c,
unsigned orig_src = src->bi_iter.bi_size;
/* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
src->bi_iter.bi_size =
min(src->bi_iter.bi_size, BCH_ENCODED_EXTENT_MAX << 9);
src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size,
c->sb.encoded_extent_max << 9);
/* Don't generate a bigger output than input: */
dst->bi_iter.bi_size =
@ -410,7 +424,7 @@ void bch2_bio_compress(struct bch_fs *c,
/* If it's only one block, don't bother trying to compress: */
if (*compression_type != BCH_COMPRESSION_NONE &&
bio_sectors(src) > c->sb.block_size &&
!__bio_compress(c, dst, dst_len, src, src_len, *compression_type))
!__bio_compress(c, dst, dst_len, src, src_len, compression_type))
goto out;
/* If compressing failed (didn't get smaller), just copy: */
@ -420,6 +434,11 @@ void bch2_bio_compress(struct bch_fs *c,
out:
dst->bi_iter.bi_size = orig_dst;
src->bi_iter.bi_size = orig_src;
BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
BUG_ON(*dst_len & (block_bytes(c) - 1));
BUG_ON(*src_len & (block_bytes(c) - 1));
}
/* doesn't write superblock: */
@ -460,7 +479,7 @@ void bch2_fs_compress_exit(struct bch_fs *c)
int bch2_fs_compress_init(struct bch_fs *c)
{
unsigned order = get_order(BCH_ENCODED_EXTENT_MAX << 9);
unsigned order = get_order(c->sb.encoded_extent_max << 9);
int ret;
if (!bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) &&


@ -1,5 +1,5 @@
#ifndef _BCACHE_COMPRESS_H
#define _BCACHE_COMPRESS_H
#ifndef _BCACHEFS_COMPRESS_H
#define _BCACHEFS_COMPRESS_H
int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
unsigned, struct bch_extent_crc128);
@ -12,4 +12,4 @@ int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
void bch2_fs_compress_exit(struct bch_fs *);
int bch2_fs_compress_init(struct bch_fs *);
#endif /* _BCACHE_COMPRESS_H */
#endif /* _BCACHEFS_COMPRESS_H */


@ -1,5 +1,5 @@
#ifndef _BCACHE_DEBUG_H
#define _BCACHE_DEBUG_H
#ifndef _BCACHEFS_DEBUG_H
#define _BCACHEFS_DEBUG_H
#include "bcachefs.h"
@ -59,4 +59,4 @@ static inline void bch2_fs_debug_init(struct bch_fs *c) {}
void bch2_debug_exit(void);
int bch2_debug_init(void);
#endif
#endif /* _BCACHEFS_DEBUG_H */


@ -1,5 +1,5 @@
#ifndef _BCACHE_DIRENT_H
#define _BCACHE_DIRENT_H
#ifndef _BCACHEFS_DIRENT_H
#define _BCACHEFS_DIRENT_H
#include "str_hash.h"
@ -35,5 +35,4 @@ u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *,
int bch2_empty_dir(struct bch_fs *, u64);
int bch2_readdir(struct bch_fs *, struct file *, struct dir_context *);
#endif /* _BCACHE_DIRENT_H */
#endif /* _BCACHEFS_DIRENT_H */


@ -1,5 +1,5 @@
#ifndef _BCACHE_ERROR_H
#define _BCACHE_ERROR_H
#ifndef _BCACHEFS_ERROR_H
#define _BCACHEFS_ERROR_H
#include <linux/printk.h>
@ -220,4 +220,4 @@ do { \
(bio)->bi_error = -EIO; \
} while (0)
#endif /* _BCACHE_ERROR_H */
#endif /* _BCACHEFS_ERROR_H */


@ -435,13 +435,13 @@ static const char *extent_ptr_invalid(const struct bch_fs *c,
if (ptr != ptr2 && ptr->dev == ptr2->dev)
return "multiple pointers to same device";
if (ptr->offset + size_ondisk > ca->mi.bucket_size * ca->mi.nbuckets)
if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets))
return "offset past end of device";
if (ptr->offset < ca->mi.bucket_size * ca->mi.first_bucket)
if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket))
return "offset before first bucket";
if ((ptr->offset & (ca->mi.bucket_size - 1)) +
if (bucket_remainder(ca, ptr->offset) +
size_ondisk > ca->mi.bucket_size)
return "spans multiple buckets";
@ -2126,7 +2126,7 @@ static enum merge_result bch2_extent_merge(struct bch_fs *c,
extent_for_each_entry(el, en_l) {
struct bch_extent_ptr *lp, *rp;
unsigned bucket_size;
struct bch_dev *ca;
en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data);
@ -2144,10 +2144,9 @@ static enum merge_result bch2_extent_merge(struct bch_fs *c,
return BCH_MERGE_NOMERGE;
/* We don't allow extents to straddle buckets: */
bucket_size = c->devs[lp->dev]->mi.bucket_size;
ca = c->devs[lp->dev];
if ((lp->offset & ~((u64) bucket_size - 1)) !=
(rp->offset & ~((u64) bucket_size - 1)))
if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
return BCH_MERGE_NOMERGE;
}
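The two hunks above replace open-coded bucket arithmetic (multiplying by bucket_size, masking by bucket_size - 1) with the bucket_to_sector(), bucket_remainder() and PTR_BUCKET_NR() helpers. Roughly, and assuming a power-of-two bucket size, they reduce to the following sketch (not the in-tree definitions, which take a struct bch_dev *):

#include <stdio.h>
#include <stdint.h>

static uint64_t ex_bucket_to_sector(uint64_t bucket, unsigned bucket_size)
{
	return bucket * bucket_size;
}

static uint64_t ex_bucket_remainder(uint64_t sector, unsigned bucket_size)
{
	return sector & (bucket_size - 1);
}

static uint64_t ex_ptr_bucket_nr(uint64_t sector, unsigned bucket_size)
{
	return sector / bucket_size;
}

int main(void)
{
	unsigned bucket_size = 1024;		/* sectors per bucket, illustrative */
	uint64_t offset = 5 * 1024 + 1000;	/* pointer offset, in sectors */

	printf("device end: %llu\n",
	       (unsigned long long) ex_bucket_to_sector(8, bucket_size));	/* 8192 */
	printf("bucket: %llu, within bucket: %llu\n",
	       (unsigned long long) ex_ptr_bucket_nr(offset, bucket_size),	/* 5 */
	       (unsigned long long) ex_bucket_remainder(offset, bucket_size));	/* 1000 */

	/* An extent of 100 sectors starting here would span buckets: */
	printf("spans: %d\n",
	       ex_bucket_remainder(offset, bucket_size) + 100 > bucket_size);	/* 1 */
	return 0;
}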


@ -1,5 +1,5 @@
#ifndef _BCACHE_EXTENTS_H
#define _BCACHE_EXTENTS_H
#ifndef _BCACHEFS_EXTENTS_H
#define _BCACHEFS_EXTENTS_H
#include "bcachefs.h"
#include "bkey.h"
@ -565,4 +565,4 @@ bool bch2_cut_front(struct bpos, struct bkey_i *);
bool bch2_cut_back(struct bpos, struct bkey *);
void bch2_key_resize(struct bkey *, unsigned);
#endif /* _BCACHE_EXTENTS_H */
#endif /* _BCACHEFS_EXTENTS_H */


@ -259,29 +259,31 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size));
}
#define eytzinger0_find(base, _nr, _size, _cmp, _search) \
({ \
void *_base = base; \
size_t _i = 0; \
int _res; \
\
while (_i < (_nr) && \
(_res = _cmp(_search, _base + _i * (_size), _size))) \
_i = eytzinger0_child(_i, _res > 0); \
\
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { \
bool found1 = _i < _nr, found2 = false; \
unsigned _j; \
\
for (_j = 0; _j < _nr; _j++) \
if (!_cmp(_base + _j * (_size), _search, _size))\
found2 = true; \
\
BUG_ON(found1 != found2); \
} \
\
_i; \
})
typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
static inline size_t eytzinger0_find(void *base, size_t nr, size_t size,
eytzinger_cmp_fn cmp, void *search)
{
size_t i = 0;
int res;
while (i < nr &&
(res = cmp(search, base + i * size, size)))
i = eytzinger0_child(i, res > 0);
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
bool found1 = i < nr, found2 = false;
size_t j;
for (j = 0; j < nr; j++)
if (!cmp(base + j * size, search, size))
found2 = true;
BUG_ON(found1 != found2);
}
return i;
}
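eytzinger0_find() above is now a plain inline function: it walks an array stored in eytzinger (breadth-first, 0-based) order, descending left or right from index 0 according to the comparison result. A toy, self-contained version of the same descent; the child rule 2*i + 1 + right is an assumption here, chosen to match the layout used in the example:

#include <stdio.h>

/* Toy stand-in for eytzinger0_child(); assumed to be 2*i + 1 + right. */
static size_t ey_child(size_t i, int right)
{
	return 2 * i + 1 + right;
}

int main(void)
{
	/* Values 1..7 laid out in 0-based eytzinger (BFS) order: */
	unsigned a[] = { 4, 2, 6, 1, 3, 5, 7 };
	size_t nr = sizeof(a) / sizeof(a[0]);
	unsigned search = 5;
	size_t i = 0;
	int res;

	while (i < nr &&
	       (res = (search > a[i]) - (search < a[i])))
		i = ey_child(i, res > 0);

	if (i < nr)
		printf("found %u at index %zu\n", search, i);	/* index 5 */
	return 0;
}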
void eytzinger0_sort(void *, size_t, size_t,
int (*cmp_func)(const void *, const void *, size_t),


@ -1,5 +1,5 @@
#ifndef _BCACHE_FIFO_H
#define _BCACHE_FIFO_H
#ifndef _BCACHEFS_FIFO_H
#define _BCACHEFS_FIFO_H
#include "util.h"
@ -111,5 +111,4 @@ do { \
(_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \
_iter++)
#endif /* _BCACHE_FIFO_H */
#endif /* _BCACHEFS_FIFO_H */


@ -1,3 +1,4 @@
#ifndef NO_BCACHEFS_FS
#include "bcachefs.h"
#include "btree_update.h"
@ -520,7 +521,7 @@ int bch2_set_page_dirty(struct page *page)
static bool bio_can_add_page_contig(struct bio *bio, struct page *page)
{
sector_t offset = (sector_t) page->index << (PAGE_SHIFT - 9);
sector_t offset = (sector_t) page->index << PAGE_SECTOR_SHIFT;
return bio->bi_vcnt < bio->bi_max_vecs &&
bio_end_sector(bio) == offset;
@ -539,7 +540,7 @@ static void __bio_add_page(struct bio *bio, struct page *page)
static int bio_add_page_contig(struct bio *bio, struct page *page)
{
sector_t offset = (sector_t) page->index << (PAGE_SHIFT - 9);
sector_t offset = (sector_t) page->index << PAGE_SECTOR_SHIFT;
BUG_ON(!bio->bi_max_vecs);
@ -798,9 +799,10 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
pagecache_add_get(&mapping->add_lock);
while ((page = readpage_iter_next(&readpages_iter))) {
unsigned n = max(min_t(unsigned, readpages_iter.nr_pages + 1,
unsigned n = max_t(unsigned,
min_t(unsigned, readpages_iter.nr_pages + 1,
BIO_MAX_PAGES),
BCH_ENCODED_EXTENT_MAX >> PAGE_SECTOR_SHIFT);
c->sb.encoded_extent_max >> PAGE_SECTOR_SHIFT);
struct bch_read_bio *rbio =
to_rbio(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read));
@ -976,9 +978,10 @@ alloc_io:
(struct disk_reservation) {
.nr_replicas = c->opts.data_replicas,
},
foreground_write_point(c, inum),
foreground_write_point(c, ei->last_dirtied),
POS(inum, 0),
&ei->journal_seq, 0);
&ei->journal_seq,
BCH_WRITE_THROTTLE);
w->io->op.op.index_update_fn = bchfs_write_index_update;
}
@ -1327,6 +1330,7 @@ int bch2_write_end(struct file *filp, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = page->mapping->host;
struct bch_inode_info *ei = to_bch_ei(inode);
struct bch_fs *c = inode->i_sb->s_fs_info;
lockdep_assert_held(&inode->i_rwsem);
@ -1350,6 +1354,8 @@ int bch2_write_end(struct file *filp, struct address_space *mapping,
SetPageUptodate(page);
if (!PageDirty(page))
set_page_dirty(page);
ei->last_dirtied = (unsigned long) current;
} else {
bch2_put_page_reservation(c, page);
}
@ -1546,9 +1552,10 @@ static void bch2_do_direct_IO_write(struct dio_write *dio)
dio->iop.is_dio = true;
dio->iop.new_i_size = U64_MAX;
bch2_write_op_init(&dio->iop.op, dio->c, dio->res,
foreground_write_point(dio->c, inode->i_ino),
foreground_write_point(dio->c, (unsigned long) current),
POS(inode->i_ino, (dio->offset + dio->written) >> 9),
&ei->journal_seq, flags);
&ei->journal_seq,
flags|BCH_WRITE_THROTTLE);
dio->iop.op.index_update_fn = bchfs_write_index_update;
dio->res.sectors -= bio_sectors(bio);
@ -1900,10 +1907,10 @@ static int __bch2_truncate_page(struct address_space *mapping,
*/
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
POS(inode->i_ino,
index << (PAGE_SHIFT - 9)), 0, k) {
index << PAGE_SECTOR_SHIFT), 0, k) {
if (bkey_cmp(bkey_start_pos(k.k),
POS(inode->i_ino,
(index + 1) << (PAGE_SHIFT - 9))) >= 0)
(index + 1) << PAGE_SECTOR_SHIFT)) >= 0)
break;
if (k.k->type != KEY_TYPE_DISCARD &&
@ -2022,17 +2029,12 @@ int bch2_truncate(struct inode *inode, struct iattr *iattr)
mutex_lock(&ei->update_lock);
setattr_copy(inode, iattr);
inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
err:
/* clear I_SIZE_DIRTY: */
i_size_dirty_put(ei);
ret = bch2_write_inode_size(c, ei, inode->i_size);
mutex_unlock(&ei->update_lock);
pagecache_block_put(&mapping->add_lock);
return 0;
err:
i_size_dirty_put(ei);
err_put_pagecache:
pagecache_block_put(&mapping->add_lock);
return ret;
@ -2566,3 +2568,5 @@ loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
return -EINVAL;
}
#endif /* NO_BCACHEFS_FS */


@ -1,5 +1,5 @@
#ifndef _BCACHE_FS_IO_H
#define _BCACHE_FS_IO_H
#ifndef _BCACHEFS_FS_IO_H
#define _BCACHEFS_FS_IO_H
#include "buckets.h"
#include <linux/uio.h>
@ -91,4 +91,4 @@ struct dio_read {
extern struct bio_set *bch2_dio_read_bioset;
#endif /* _BCACHE_FS_IO_H */
#endif /* _BCACHEFS_FS_IO_H */


@ -1,3 +1,4 @@
#ifndef NO_BCACHEFS_FS
#include "bcachefs.h"
#include "acl.h"
@ -18,8 +19,10 @@
#include <linux/aio.h>
#include <linux/backing-dev.h>
#include <linux/compat.h>
#include <linux/exportfs.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/posix_acl.h>
#include <linux/random.h>
#include <linux/statfs.h>
#include <linux/xattr.h>
@ -208,7 +211,6 @@ static struct inode *bch2_vfs_inode_create(struct bch_fs *c,
struct posix_acl *default_acl = NULL, *acl = NULL;
struct bch_inode_info *ei;
struct bch_inode_unpacked inode_u;
struct bkey_inode_buf inode_p;
int ret;
inode = new_inode(parent->i_sb);
@ -227,9 +229,7 @@ static struct inode *bch2_vfs_inode_create(struct bch_fs *c,
bch2_inode_init(c, &inode_u, i_uid_read(inode),
i_gid_read(inode), inode->i_mode, rdev);
bch2_inode_pack(&inode_p, &inode_u);
ret = bch2_inode_create(c, &inode_p.inode.k_i,
ret = bch2_inode_create(c, &inode_u,
BLOCKDEV_INODE_MAX, 0,
&c->unused_inode_hint);
if (unlikely(ret)) {
@ -241,7 +241,6 @@ static struct inode *bch2_vfs_inode_create(struct bch_fs *c,
goto err;
}
inode_u.inum = inode_p.inode.k.p.inode;
bch2_vfs_inode_init(c, ei, &inode_u);
if (default_acl) {
@ -1022,6 +1021,45 @@ static const struct address_space_operations bch_address_space_operations = {
.error_remove_page = generic_error_remove_page,
};
static struct inode *bch2_nfs_get_inode(struct super_block *sb,
u64 ino, u32 generation)
{
struct inode *inode;
if (ino < BCACHEFS_ROOT_INO)
return ERR_PTR(-ESTALE);
inode = bch2_vfs_inode_get(sb, ino);
if (IS_ERR(inode))
return ERR_CAST(inode);
if (generation && inode->i_generation != generation) {
/* we didn't find the right inode.. */
iput(inode);
return ERR_PTR(-ESTALE);
}
return inode;
}
static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid,
int fh_len, int fh_type)
{
return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
bch2_nfs_get_inode);
}
static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid,
int fh_len, int fh_type)
{
return generic_fh_to_parent(sb, fid, fh_len, fh_type,
bch2_nfs_get_inode);
}
static const struct export_operations bch_export_ops = {
.fh_to_dentry = bch2_fh_to_dentry,
.fh_to_parent = bch2_fh_to_parent,
//.get_parent = bch2_get_parent,
};
static void bch2_vfs_inode_init(struct bch_fs *c,
struct bch_inode_info *ei,
struct bch_inode_unpacked *bi)
@ -1154,7 +1192,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
struct bch_fs *c = sb->s_fs_info;
u64 fsid;
buf->f_type = BCACHE_STATFS_MAGIC;
buf->f_type = BCACHEFS_STATFS_MAGIC;
buf->f_bsize = sb->s_blocksize;
buf->f_blocks = c->capacity >> PAGE_SECTOR_SHIFT;
buf->f_bfree = (c->capacity - bch2_fs_sectors_used(c)) >> PAGE_SECTOR_SHIFT;
@ -1371,8 +1409,9 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_op = &bch_super_operations;
sb->s_export_op = &bch_export_ops;
sb->s_xattr = bch2_xattr_handlers;
sb->s_magic = BCACHE_STATFS_MAGIC;
sb->s_magic = BCACHEFS_STATFS_MAGIC;
sb->s_time_gran = c->sb.time_precision;
c->vfs_sb = sb;
sb->s_bdi = &c->bdi;
@ -1393,7 +1432,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
else
sb->s_flags |= opts.posix_acl ? MS_POSIXACL : 0;
inode = bch2_vfs_inode_get(sb, BCACHE_ROOT_INO);
inode = bch2_vfs_inode_get(sb, BCACHEFS_ROOT_INO);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
goto err_put_super;
@ -1480,3 +1519,5 @@ err:
bch2_vfs_exit();
return ret;
}
#endif /* NO_BCACHEFS_FS */


@ -1,5 +1,5 @@
#ifndef _BCACHE_FS_H
#define _BCACHE_FS_H
#ifndef _BCACHEFS_FS_H
#define _BCACHEFS_FS_H
#include "str_hash.h"
@ -25,6 +25,8 @@ struct bch_inode_info {
atomic64_t i_sectors;
struct bch_hash_info str_hash;
unsigned long last_dirtied;
};
#define to_bch_ei(_inode) \
@ -42,7 +44,7 @@ static inline unsigned nlink_bias(umode_t mode)
struct bch_inode_unpacked;
#ifndef NO_BCACHE_FS
#ifndef NO_BCACHEFS_FS
/* returns 0 if we want to do the update, or error is passed up */
typedef int (*inode_set_fn)(struct bch_inode_info *,
@ -61,6 +63,6 @@ int bch2_vfs_init(void);
static inline void bch2_vfs_exit(void) {}
static inline int bch2_vfs_init(void) { return 0; }
#endif
#endif /* NO_BCACHEFS_FS */
#endif /* _BCACHE_FS_H */
#endif /* _BCACHEFS_FS_H */


@ -251,7 +251,7 @@ static int check_extents(struct bch_fs *c)
int ret = 0;
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
POS(BCACHE_ROOT_INO, 0), 0, k) {
POS(BCACHEFS_ROOT_INO, 0), 0, k) {
if (k.k->type == KEY_TYPE_DISCARD)
continue;
@ -310,7 +310,7 @@ static int check_dirents(struct bch_fs *c)
hash_check_init(bch2_dirent_hash_desc, &h, c);
for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
POS(BCACHE_ROOT_INO, 0), 0, k) {
POS(BCACHEFS_ROOT_INO, 0), 0, k) {
struct bkey_s_c_dirent d;
struct bch_inode_unpacked target;
bool have_target;
@ -444,7 +444,7 @@ static int check_xattrs(struct bch_fs *c)
hash_check_init(bch2_xattr_hash_desc, &h, c);
for_each_btree_key(&iter, c, BTREE_ID_XATTRS,
POS(BCACHE_ROOT_INO, 0), 0, k) {
POS(BCACHEFS_ROOT_INO, 0), 0, k) {
ret = walk_inode(c, &w, k.k->p.inode);
if (ret)
break;
@ -478,7 +478,7 @@ static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
struct bkey_inode_buf packed;
int ret;
ret = bch2_inode_find_by_inum(c, BCACHE_ROOT_INO, root_inode);
ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, root_inode);
if (ret && ret != -ENOENT)
return ret;
@ -494,7 +494,7 @@ fsck_err:
return ret;
create_root:
bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
root_inode->inum = BCACHE_ROOT_INO;
root_inode->inum = BCACHEFS_ROOT_INO;
bch2_inode_pack(&packed, root_inode);
@ -514,7 +514,7 @@ static int check_lostfound(struct bch_fs *c,
u64 inum;
int ret;
inum = bch2_dirent_lookup(c, BCACHE_ROOT_INO, &root_hash_info,
inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info,
&lostfound);
if (!inum) {
bch_notice(c, "creating lost+found");
@ -546,16 +546,13 @@ create_lostfound:
return ret;
bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
bch2_inode_pack(&packed, lostfound_inode);
ret = bch2_inode_create(c, &packed.inode.k_i, BLOCKDEV_INODE_MAX, 0,
ret = bch2_inode_create(c, lostfound_inode, BLOCKDEV_INODE_MAX, 0,
&c->unused_inode_hint);
if (ret)
return ret;
lostfound_inode->inum = packed.inode.k.p.inode;
ret = bch2_dirent_create(c, BCACHE_ROOT_INO, &root_hash_info, DT_DIR,
ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR,
&lostfound, lostfound_inode->inum, NULL,
BTREE_INSERT_NOFAIL);
if (ret)
@ -645,13 +642,13 @@ static int check_directory_structure(struct bch_fs *c,
restart_dfs:
had_unreachable = false;
ret = inode_bitmap_set(&dirs_done, BCACHE_ROOT_INO);
ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO);
if (ret) {
bch_err(c, "memory allocation failure in inode_bitmap_set()");
goto err;
}
ret = path_down(&path, BCACHE_ROOT_INO);
ret = path_down(&path, BCACHEFS_ROOT_INO);
if (ret) {
return ret;
}
@ -792,7 +789,7 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links,
u64 d_inum;
int ret;
inc_link(c, links, range_start, range_end, BCACHE_ROOT_INO, false);
inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false);
for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, 0, k) {
switch (k.k->type) {


@ -1,7 +1,7 @@
#ifndef _BCACHE_FS_GC_H
#define _BCACHE_FS_GC_H
#ifndef _BCACHEFS_FSCK_H
#define _BCACHEFS_FSCK_H
s64 bch2_count_inode_sectors(struct bch_fs *, u64);
int bch2_fsck(struct bch_fs *, bool);
#endif /* _BCACHE_FS_GC_H */
#endif /* _BCACHEFS_FSCK_H */


@ -206,6 +206,11 @@ static const char *bch2_inode_invalid(const struct bch_fs *c,
if (k.k->p.inode >= BLOCKDEV_INODE_MAX)
return "blockdev inode in fs range";
return NULL;
case BCH_INODE_GENERATION:
if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation))
return "incorrect value size";
return NULL;
default:
return "invalid type";
@ -257,9 +262,10 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
inode_u->i_otime = now;
}
int bch2_inode_create(struct bch_fs *c, struct bkey_i *inode,
int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
u64 min, u64 max, u64 *hint)
{
struct bkey_inode_buf inode_p;
struct btree_iter iter;
bool searched_from_start = false;
int ret;
@ -281,6 +287,7 @@ again:
while (1) {
struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter);
u32 i_generation = 0;
ret = btree_iter_err(k);
if (ret) {
@ -288,31 +295,51 @@ again:
return ret;
}
if (k.k->type < BCH_INODE_FS) {
inode->k.p = k.k->p;
switch (k.k->type) {
case BCH_INODE_BLOCKDEV:
case BCH_INODE_FS:
/* slot used */
if (iter.pos.inode == max)
goto out;
pr_debug("inserting inode %llu (size %u)",
inode->k.p.inode, inode->k.u64s);
bch2_btree_iter_advance_pos(&iter);
break;
case BCH_INODE_GENERATION: {
struct bkey_s_c_inode_generation g =
bkey_s_c_to_inode_generation(k);
i_generation = le32_to_cpu(g.v->i_generation);
/* fallthrough: */
}
default:
inode_u->i_generation = i_generation;
bch2_inode_pack(&inode_p, inode_u);
inode_p.inode.k.p = k.k->p;
ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
BTREE_INSERT_ATOMIC,
BTREE_INSERT_ENTRY(&iter, inode));
BTREE_INSERT_ENTRY(&iter,
&inode_p.inode.k_i));
if (ret != -EINTR) {
bch2_btree_iter_unlock(&iter);
if (!ret) {
inode_u->inum =
inode_p.inode.k.p.inode;
*hint = inode_p.inode.k.p.inode + 1;
}
return ret;
}
if (ret == -EINTR)
continue;
bch2_btree_iter_unlock(&iter);
if (!ret)
*hint = k.k->p.inode + 1;
return ret;
} else {
if (iter.pos.inode == max)
break;
/* slot used */
bch2_btree_iter_advance_pos(&iter);
}
}
out:
bch2_btree_iter_unlock(&iter);
if (!searched_from_start) {
@ -337,7 +364,8 @@ int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size,
int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
{
struct bkey_i delete;
struct btree_iter iter;
struct bkey_i_inode_generation delete;
int ret;
ret = bch2_inode_truncate(c, inode_nr, 0, NULL, NULL);
@ -366,11 +394,51 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
if (ret < 0)
return ret;
bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inode_nr, 0),
BTREE_ITER_INTENT);
do {
struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter);
u32 i_generation = 0;
ret = btree_iter_err(k);
if (ret) {
bch2_btree_iter_unlock(&iter);
return ret;
}
switch (k.k->type) {
case BCH_INODE_FS: {
struct bch_inode_unpacked inode_u;
if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u))
i_generation = cpu_to_le32(inode_u.i_generation) + 1;
break;
}
case BCH_INODE_GENERATION: {
struct bkey_s_c_inode_generation g =
bkey_s_c_to_inode_generation(k);
i_generation = le32_to_cpu(g.v->i_generation);
break;
}
}
if (!i_generation) {
bkey_init(&delete.k);
delete.k.p.inode = inode_nr;
} else {
bkey_inode_generation_init(&delete.k_i);
delete.k.p.inode = inode_nr;
delete.v.i_generation = cpu_to_le32(i_generation);
}
return bch2_btree_insert(c, BTREE_ID_INODES, &delete, NULL,
NULL, NULL, BTREE_INSERT_NOFAIL);
ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL,
BTREE_INSERT_ENTRY(&iter, &delete.k_i));
} while (ret == -EINTR);
bch2_btree_iter_unlock(&iter);
return ret;
}
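With this change bch2_inode_rm() no longer always leaves an empty slot: when a generation is known it writes a BCH_INODE_GENERATION key in its place, and bch2_inode_create() seeds a reused inode number from that key, so handles referring to the previous incarnation (as checked by the new NFS export hooks) can be detected. A toy model of the idea, with illustrative names:

#include <stdio.h>

/* One inode slot, plus the generation left behind when it is removed. */
struct slot {
	int	 in_use;
	unsigned generation;	/* of the live inode, or of the last one removed */
};

static unsigned create_inode(struct slot *s)
{
	/* A new incarnation picks up where the last one left off: */
	s->in_use = 1;
	return s->generation;
}

static void remove_inode(struct slot *s)
{
	/* Leave a bumped generation behind instead of plain empty space: */
	s->in_use = 0;
	s->generation++;
}

int main(void)
{
	struct slot s = { 0 };
	unsigned handle_gen = create_inode(&s);	/* an NFS handle remembers this */

	remove_inode(&s);
	create_inode(&s);			/* inode number reused */

	if (handle_gen != s.generation)
		printf("stale handle\n");	/* old handle no longer valid */
	return 0;
}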
int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,


@ -1,5 +1,5 @@
#ifndef _BCACHE_INODE_H
#define _BCACHE_INODE_H
#ifndef _BCACHEFS_INODE_H
#define _BCACHEFS_INODE_H
#include <linux/math64.h>
@ -29,7 +29,8 @@ int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
uid_t, gid_t, umode_t, dev_t);
int bch2_inode_create(struct bch_fs *, struct bkey_i *, u64, u64, u64 *);
int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *,
u64, u64, u64 *);
int bch2_inode_truncate(struct bch_fs *, u64, u64,
struct extent_insert_hook *, u64 *);
int bch2_inode_rm(struct bch_fs *, u64);
@ -60,4 +61,4 @@ void bch2_inode_pack_test(void);
static inline void bch2_inode_pack_test(void) {}
#endif
#endif
#endif /* _BCACHEFS_INODE_H */


@ -79,6 +79,7 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
/* Bios with headers */
void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
enum bch_data_type type,
const struct bkey_i *k)
{
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
@ -122,6 +123,9 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
n->bio.bi_opf |= REQ_FUA;
if (likely(percpu_ref_tryget(&ca->io_ref))) {
this_cpu_add(ca->io_done->sectors[WRITE][type],
bio_sectors(&n->bio));
n->have_io_ref = true;
n->bio.bi_bdev = ca->disk_sb.bdev;
submit_bio(&n->bio);
@ -423,17 +427,12 @@ static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob)
orig, &src_len,
&fragment_compression_type);
BUG_ON(!dst_len || dst_len > bio->bi_iter.bi_size);
BUG_ON(!src_len || src_len > orig->bi_iter.bi_size);
BUG_ON(dst_len & (block_bytes(c) - 1));
BUG_ON(src_len & (block_bytes(c) - 1));
swap(bio->bi_iter.bi_size, dst_len);
nonce = extent_nonce(op->version,
crc_nonce,
src_len >> 9,
fragment_compression_type),
fragment_compression_type);
swap(bio->bi_iter.bi_size, dst_len);
bch2_encrypt_bio(c, csum_type, nonce, bio);
csum = bch2_checksum_bio(c, csum_type, nonce, bio);
@ -496,7 +495,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob)
closure_get(bio->bi_private);
bch2_submit_wbio_replicas(to_wbio(bio), c, key_to_write);
bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER,
key_to_write);
return more;
}
@ -661,9 +661,9 @@ void bch2_write(struct closure *cl)
/* Don't call bch2_next_delay() if rate is >= 1 GB/sec */
if (c->foreground_write_ratelimit_enabled &&
c->foreground_write_pd.rate.rate < (1 << 30) &&
op->wp->throttle) {
if ((op->flags & BCH_WRITE_THROTTLE) &&
c->foreground_write_ratelimit_enabled &&
c->foreground_write_pd.rate.rate < (1 << 30)) {
unsigned long flags;
u64 delay;
@ -715,7 +715,8 @@ void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
op->error = 0;
op->flags = flags;
op->csum_type = bch2_data_checksum_type(c);
op->compression_type = c->opts.compression;
op->compression_type =
bch2_compression_opt_to_type(c->opts.compression);
op->nr_replicas = res.nr_replicas;
op->alloc_reserve = RESERVE_NONE;
op->nonce = 0;
@ -1203,6 +1204,9 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
if (bounce)
trace_read_bounce(&rbio->bio);
this_cpu_add(pick->ca->io_done->sectors[READ][BCH_DATA_USER],
bio_sectors(&rbio->bio));
if (likely(!(flags & BCH_READ_IN_RETRY))) {
submit_bio(&rbio->bio);
} else {


@ -1,5 +1,5 @@
#ifndef _BCACHE_IO_H
#define _BCACHE_IO_H
#ifndef _BCACHEFS_IO_H
#define _BCACHEFS_IO_H
#include <linux/hash.h>
#include "io_types.h"
@ -14,19 +14,19 @@ void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
const struct bkey_i *);
enum bch_data_type, const struct bkey_i *);
enum bch_write_flags {
BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
BCH_WRITE_CACHED = (1 << 1),
BCH_WRITE_FLUSH = (1 << 2),
BCH_WRITE_DATA_COMPRESSED = (1 << 3),
BCH_WRITE_THROTTLE = (1 << 4),
/* Internal: */
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 4),
BCH_WRITE_DONE = (1 << 5),
BCH_WRITE_LOOPED = (1 << 6),
__BCH_WRITE_KEYLIST_LOCKED = 8,
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 5),
BCH_WRITE_DONE = (1 << 6),
BCH_WRITE_LOOPED = (1 << 7),
};
static inline u64 *op_journal_seq(struct bch_write_op *op)
@ -105,4 +105,4 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio)
return rbio;
}
#endif /* _BCACHE_IO_H */
#endif /* _BCACHEFS_IO_H */


@ -1,5 +1,5 @@
#ifndef _BCACHE_IO_TYPES_H
#define _BCACHE_IO_TYPES_H
#ifndef _BCACHEFS_IO_TYPES_H
#define _BCACHEFS_IO_TYPES_H
#include "btree_types.h"
#include "buckets_types.h"
@ -148,4 +148,4 @@ struct bch_write_op {
struct bch_write_bio wbio;
};
#endif /* _BCACHE_IO_TYPES_H */
#endif /* _BCACHEFS_IO_TYPES_H */


@ -1274,10 +1274,15 @@ static int journal_entry_sectors(struct journal *j)
lockdep_assert_held(&j->lock);
spin_lock(&j->devs.lock);
group_for_each_dev(ca, &j->devs, i) {
rcu_read_lock();
for_each_member_device_rcu(ca, c, i,
&c->rw_devs[BCH_DATA_JOURNAL]) {
struct journal_device *ja = &ca->journal;
unsigned buckets_required = 0;
if (!ja->nr)
continue;
sectors_available = min_t(unsigned, sectors_available,
ca->mi.bucket_size);
@ -1288,11 +1293,11 @@ static int journal_entry_sectors(struct journal *j)
* it too:
*/
if (bch2_extent_has_device(e.c, ca->dev_idx)) {
if (j->prev_buf_sectors > ca->journal.sectors_free)
if (j->prev_buf_sectors > ja->sectors_free)
buckets_required++;
if (j->prev_buf_sectors + sectors_available >
ca->journal.sectors_free)
ja->sectors_free)
buckets_required++;
} else {
if (j->prev_buf_sectors + sectors_available >
@ -1306,7 +1311,7 @@ static int journal_entry_sectors(struct journal *j)
nr_devs++;
nr_online++;
}
spin_unlock(&j->devs.lock);
rcu_read_unlock();
if (nr_online < c->opts.metadata_replicas_required)
return -EROFS;
@ -1542,7 +1547,7 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
*/
if (bch2_disk_reservation_get(c, &disk_res,
(nr - ja->nr) << ca->bucket_bits, 0))
bucket_to_sector(ca, nr - ja->nr), 0))
return -ENOSPC;
mutex_lock(&c->sb_lock);
@ -1566,7 +1571,7 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
while (ja->nr < nr) {
/* must happen under journal lock, to avoid racing with gc: */
long b = bch2_bucket_alloc(c, ca, RESERVE_NONE);
long b = bch2_bucket_alloc(c, ca, RESERVE_ALLOC);
if (b < 0) {
if (!closure_wait(&c->freelist_wait, &cl)) {
spin_unlock(&j->lock);
@ -1969,7 +1974,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
struct bch_extent_ptr *ptr;
struct journal_device *ja;
struct bch_dev *ca;
bool swapped;
struct dev_alloc_list devs_sorted;
unsigned i, replicas, replicas_want =
READ_ONCE(c->opts.metadata_replicas);
@ -1996,26 +2001,18 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
replicas = bch2_extent_nr_ptrs(e.c);
spin_lock(&j->devs.lock);
rcu_read_lock();
devs_sorted = bch2_wp_alloc_list(c, &j->wp,
&c->rw_devs[BCH_DATA_JOURNAL]);
/* Sort by tier: */
do {
swapped = false;
for (i = 0; i < devs_sorted.nr; i++) {
ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
if (!ca)
continue;
for (i = 0; i + 1 < j->devs.nr; i++)
if (j->devs.d[i + 0].dev->mi.tier >
j->devs.d[i + 1].dev->mi.tier) {
swap(j->devs.d[i], j->devs.d[i + 1]);
swapped = true;
}
} while (swapped);
/*
* Pick devices for next journal write:
* XXX: sort devices by free journal space?
*/
group_for_each_dev(ca, &j->devs, i) {
ja = &ca->journal;
if (!ja->nr)
continue;
if (replicas >= replicas_want)
break;
@ -2029,6 +2026,9 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
sectors > ca->mi.bucket_size)
continue;
j->wp.next_alloc[ca->dev_idx] += U32_MAX;
bch2_wp_rescale(c, ca, &j->wp);
ja->sectors_free = ca->mi.bucket_size - sectors;
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
ja->bucket_seq[ja->cur_idx] = atomic64_read(&j->seq);
@ -2041,7 +2041,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
});
replicas++;
}
spin_unlock(&j->devs.lock);
rcu_read_unlock();
j->prev_buf_sectors = 0;
spin_unlock(&j->lock);
@ -2280,7 +2280,8 @@ static void journal_write(struct closure *cl)
continue;
}
atomic64_add(sectors, &ca->meta_sectors_written);
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_JOURNAL],
sectors);
ca->journal.ptr_idx = ptr_idx++;
bio = ca->journal.bio;
@ -2682,6 +2683,7 @@ int bch2_journal_flush(struct journal *j)
ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
union journal_res_state *s = &j->reservations;
struct bch_dev *ca;
unsigned iter;
@ -2714,10 +2716,13 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
journal_entry_is_open(j),
test_bit(JOURNAL_REPLAY_DONE, &j->flags));
spin_lock(&j->devs.lock);
group_for_each_dev(ca, &j->devs, iter) {
for_each_member_device_rcu(ca, c, iter,
&c->rw_devs[BCH_DATA_JOURNAL]) {
struct journal_device *ja = &ca->journal;
if (!ja->nr)
continue;
ret += scnprintf(buf + ret, PAGE_SIZE - ret,
"dev %u:\n"
"\tnr\t\t%u\n"
@ -2727,7 +2732,6 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
ja->cur_idx, ja->bucket_seq[ja->cur_idx],
ja->last_idx, ja->bucket_seq[ja->last_idx]);
}
spin_unlock(&j->devs.lock);
spin_unlock(&j->lock);
rcu_read_unlock();
@ -2911,7 +2915,6 @@ int bch2_fs_journal_init(struct journal *j)
INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work);
mutex_init(&j->blacklist_lock);
INIT_LIST_HEAD(&j->seq_blacklist);
spin_lock_init(&j->devs.lock);
mutex_init(&j->reclaim_lock);
lockdep_init_map(&j->res_map, "journal res", &res_key, 0);


@ -1,5 +1,5 @@
#ifndef _BCACHE_JOURNAL_H
#define _BCACHE_JOURNAL_H
#ifndef _BCACHEFS_JOURNAL_H
#define _BCACHEFS_JOURNAL_H
/*
* THE JOURNAL:
@ -402,4 +402,4 @@ int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
void bch2_fs_journal_exit(struct journal *);
int bch2_fs_journal_init(struct journal *);
#endif /* _BCACHE_JOURNAL_H */
#endif /* _BCACHEFS_JOURNAL_H */


@ -1,10 +1,11 @@
#ifndef _BCACHE_JOURNAL_TYPES_H
#define _BCACHE_JOURNAL_TYPES_H
#ifndef _BCACHEFS_JOURNAL_TYPES_H
#define _BCACHEFS_JOURNAL_TYPES_H
#include <linux/cache.h>
#include <linux/workqueue.h>
#include "alloc_types.h"
#include "super_types.h"
#include "fifo.h"
struct journal_res;
@ -176,7 +177,7 @@ struct journal {
struct list_head seq_blacklist;
BKEY_PADDED(key);
struct dev_group devs;
struct write_point wp;
struct delayed_work reclaim_work;
unsigned long last_flushed;
@ -234,4 +235,4 @@ struct journal_device {
struct closure read;
};
#endif /* _BCACHE_JOURNAL_TYPES_H */
#endif /* _BCACHEFS_JOURNAL_TYPES_H */


@ -1,5 +1,5 @@
#ifndef _BCACHE_KEYLIST_H
#define _BCACHE_KEYLIST_H
#ifndef _BCACHEFS_KEYLIST_H
#define _BCACHEFS_KEYLIST_H
#include "keylist_types.h"
@ -65,4 +65,4 @@ void bch2_verify_keylist_sorted(struct keylist *);
static inline void bch2_verify_keylist_sorted(struct keylist *l) {}
#endif
#endif /* _BCACHE_KEYLIST_H */
#endif /* _BCACHEFS_KEYLIST_H */


@ -1,5 +1,5 @@
#ifndef _BCACHE_KEYLIST_TYPES_H
#define _BCACHE_KEYLIST_TYPES_H
#ifndef _BCACHEFS_KEYLIST_TYPES_H
#define _BCACHEFS_KEYLIST_TYPES_H
struct keylist {
union {
@ -12,4 +12,4 @@ struct keylist {
};
};
#endif /* _BCACHE_KEYLIST_TYPES_H */
#endif /* _BCACHEFS_KEYLIST_TYPES_H */


@ -1,87 +1,7 @@
#ifndef __LZ4_H__
#define __LZ4_H__
/*
* LZ4 Kernel Interface
*
* Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#define LZ4_MEM_COMPRESS (16384)
#define LZ4HC_MEM_COMPRESS (262144 + (2 * sizeof(unsigned char *)))
#ifndef __BCH_LZ4_H__
#define __BCH_LZ4_H__
/*
* lz4_compressbound()
* Provides the maximum size that LZ4 may output in a "worst case" scenario
* (input data not compressible)
*/
static inline size_t lz4_compressbound(size_t isize)
{
return isize + (isize / 255) + 16;
}
/*
* lz4_compress()
* src : source address of the original data
* src_len : size of the original data
* dst : output buffer address of the compressed data
* This requires 'dst' of size LZ4_COMPRESSBOUND.
* dst_len : is the output size, which is returned after compress done
* workmem : address of the working memory.
* This requires 'workmem' of size LZ4_MEM_COMPRESS.
* return : Success if return 0
* Error if return (< 0)
* note : Destination buffer and workmem must be already allocated with
* the defined size.
*/
int lz4_compress(const unsigned char *src, size_t src_len,
unsigned char *dst, size_t *dst_len, void *wrkmem);
/*
* lz4hc_compress()
* src : source address of the original data
* src_len : size of the original data
* dst : output buffer address of the compressed data
* This requires 'dst' of size LZ4_COMPRESSBOUND.
* dst_len : is the output size, which is returned after compress done
* workmem : address of the working memory.
* This requires 'workmem' of size LZ4HC_MEM_COMPRESS.
* return : Success if return 0
* Error if return (< 0)
* note : Destination buffer and workmem must be already allocated with
* the defined size.
*/
int lz4hc_compress(const unsigned char *src, size_t src_len,
unsigned char *dst, size_t *dst_len, void *wrkmem);
/*
* lz4_decompress()
* src : source address of the compressed data
* src_len : is the input size, which is returned after decompress done
* dest : output buffer address of the decompressed data
* actual_dest_len: is the size of uncompressed data, supposing it's known
* return : Success if return 0
* Error if return (< 0)
* note : Destination buffer must be already allocated.
* slightly faster than lz4_decompress_unknownoutputsize()
*/
int lz4_decompress(const unsigned char *src, size_t *src_len,
int bch2_lz4_decompress(const unsigned char *src, size_t *src_len,
unsigned char *dest, size_t actual_dest_len);
/*
* lz4_decompress_unknownoutputsize()
* src : source address of the compressed data
* src_len : is the input size, therefore the compressed size
* dest : output buffer address of the decompressed data
* dest_len: is the max size of the destination buffer, which is
* returned with actual size of decompressed data after
* decompress done
* return : Success if return 0
* Error if return (< 0)
* note : Destination buffer must be already allocated.
*/
int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len,
unsigned char *dest, size_t *dest_len);
#endif


@ -1,228 +0,0 @@
/*
* LZ4 - Fast LZ compression algorithm
* Copyright (C) 2011-2012, Yann Collet.
* BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* You can contact the author at :
* - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
* - LZ4 source repository : http://code.google.com/p/lz4/
*
* Changed for kernel use by:
* Chanho Min <chanho.min@lge.com>
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <asm/unaligned.h>
#include "lz4.h"
#include "lz4defs.h"
#define LZ4_HASH_VALUE(p, _table) \
__HASH_VALUE(p, MEMORY_USAGE - ilog2(sizeof(_table[0])))
struct lz4_hash_table {
const u8 *(*add)(const struct lz4_hash_table, const u8 *);
void *ctx;
const u8 *base;
};
#if __SIZEOF_POINTER__ == 4
static inline const u8 *hash_table_add32(const struct lz4_hash_table hash,
const u8 *ip)
{
const u8 **table = hash.ctx;
swap(table[LZ4_HASH_VALUE(ip, table)], ip);
return ip;
}
#else
static inline const u8 *hash_table_add32(const struct lz4_hash_table hash,
const u8 *ip)
{
u32 *table = hash.ctx;
size_t offset = ip - hash.base;
swap(table[LZ4_HASH_VALUE(ip, table)], offset);
return hash.base + offset;
}
#endif
static inline const u8 *hash_table_add16(const struct lz4_hash_table hash,
const u8 *ip)
{
u16 *table = hash.ctx;
size_t offset = ip - hash.base;
swap(table[LZ4_HASH_VALUE(ip, table)], offset);
return hash.base + offset;
}
static inline const u8 *find_match(const struct lz4_hash_table hash,
const u8 **ip, const u8 *anchor,
const u8 *start, const u8 *mflimit)
{
int findmatchattempts = (1U << SKIPSTRENGTH) + 3;
while (*ip <= mflimit) {
const u8 *ref = hash.add(hash, *ip);
if (ref >= *ip - MAX_DISTANCE && A32(ref) == A32(*ip)) {
/* found match: */
while (*ip > anchor &&
ref > start &&
unlikely((*ip)[-1] == ref[-1])) {
(*ip)--;
ref--;
}
return ref;
}
*ip += findmatchattempts++ >> SKIPSTRENGTH;
}
return NULL;
}
static inline int length_len(unsigned length)
{
return length / 255 + 1;
}
/*
* LZ4_compressCtx :
* -----------------
* Compress 'isize' bytes from 'source' into an output buffer 'dest' of
* maximum size 'maxOutputSize'. If it cannot achieve it, compression
* will stop, and result of the function will be zero.
* return : the number of bytes written in buffer 'dest', or 0 if the
* compression fails
*/
static inline int lz4_compressctx(const struct lz4_hash_table hash,
const u8 *src, size_t src_len,
u8 *dst, size_t *dst_len)
{
const u8 *ip = src, *anchor = ip, *ref;
const u8 *const iend = ip + src_len;
const u8 *const mflimit = iend - MFLIMIT;
const u8 *const matchlimit = iend - LASTLITERALS;
u8 *op = dst, *token;
u8 *const oend = op + *dst_len;
size_t literal_len, match_len, match_offset;
/* Init */
memset(hash.ctx, 0, LZ4_MEM_COMPRESS);
hash.add(hash, ip);
/* Always start with a literal: */
ip++;
while ((ref = find_match(hash, &ip, anchor, src, mflimit))) {
/*
* We found a match; @ip now points to the match and @ref points
* to the prior part of the input we matched with. Everything up
* to @anchor has been encoded; the range from @anchor to @ip
* didn't match and now has to be encoded as a literal:
*/
literal_len = ip - anchor;
match_offset = ip - ref;
/* MINMATCH bytes already matched from find_match(): */
ip += MINMATCH;
ref += MINMATCH;
match_len = common_length(ip, ref, matchlimit);
ip += match_len;
/* check output limit */
if (unlikely(op +
1 + /* token */
2 + /* match offset */
literal_len +
length_len(literal_len) +
length_len(match_len) +
LASTLITERALS > oend))
break;
token = op++;
*token = encode_length(&op, literal_len) << ML_BITS;
MEMCPY_ADVANCE_CHUNKED(op, anchor, literal_len);
PUT_LE16_ADVANCE(op, match_offset);
*token += encode_length(&op, match_len);
anchor = ip;
}
/* Encode remaining input as literal: */
literal_len = iend - anchor;
if (unlikely(op +
1 +
literal_len +
length_len(literal_len) > oend)) {
/* Return how much would be able to fit: */
ssize_t remaining = oend - op;
ssize_t encoded = anchor - src;
remaining -= length_len(remaining) + 1;
return -max(encoded + remaining, 1L);
}
token = op++;
*token = encode_length(&op, literal_len) << ML_BITS;
MEMCPY_ADVANCE(op, anchor, literal_len);
/* End */
BUG_ON(op > oend);
*dst_len = op - dst;
return 0;
}
__attribute__((flatten))
int lz4_compress(const unsigned char *src, size_t src_len,
unsigned char *dst, size_t *dst_len, void *wrkmem)
{
if (src_len < LZ4_64KLIMIT) {
const struct lz4_hash_table hash = {
.add = hash_table_add16,
.ctx = wrkmem,
.base = src,
};
return lz4_compressctx(hash, src, src_len, dst, dst_len);
} else {
const struct lz4_hash_table hash = {
.add = hash_table_add32,
.ctx = wrkmem,
.base = src,
};
return lz4_compressctx(hash, src, src_len, dst, dst_len);
}
}
EXPORT_SYMBOL(lz4_compress);
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("LZ4 compressor");


@ -43,7 +43,110 @@
#endif
#include "lz4.h"
#include "lz4defs.h"
/*
* Detects 64 bits mode
*/
#if defined(CONFIG_64BIT)
#define LZ4_ARCH64 1
#else
#define LZ4_ARCH64 0
#endif
#include <asm/unaligned.h>
#include <linux/log2.h>
#include <linux/string.h>
#define A32(_p) get_unaligned((u32 *) (_p))
#define A16(_p) get_unaligned((u16 *) (_p))
#define GET_LE16_ADVANCE(_src) \
({ \
u16 _r = get_unaligned_le16(_src); \
(_src) += 2; \
_r; \
})
#define PUT_LE16_ADVANCE(_dst, _v) \
do { \
put_unaligned_le16((_v), (_dst)); \
(_dst) += 2; \
} while (0)
#define LENGTH_LONG 15
#define COPYLENGTH 8
#define ML_BITS 4
#define ML_MASK ((1U << ML_BITS) - 1)
#define RUN_BITS (8 - ML_BITS)
#define RUN_MASK ((1U << RUN_BITS) - 1)
#define MEMORY_USAGE 14
#define MINMATCH 4
#define SKIPSTRENGTH 6
#define LASTLITERALS 5
#define MFLIMIT (COPYLENGTH + MINMATCH)
#define MINLENGTH (MFLIMIT + 1)
#define MAXD_LOG 16
#define MAXD (1 << MAXD_LOG)
#define MAXD_MASK (u32)(MAXD - 1)
#define MAX_DISTANCE (MAXD - 1)
#define HASH_LOG (MAXD_LOG - 1)
#define HASHTABLESIZE (1 << HASH_LOG)
#define MAX_NB_ATTEMPTS 256
#define OPTIMAL_ML (int)((ML_MASK-1)+MINMATCH)
#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT - 1))
#define __HASH_VALUE(p, bits) \
(((A32(p)) * 2654435761U) >> (32 - (bits)))
#define HASH_VALUE(p) __HASH_VALUE(p, HASH_LOG)
#define MEMCPY_ADVANCE(_dst, _src, length) \
do { \
typeof(length) _length = (length); \
memcpy(_dst, _src, _length); \
_src += _length; \
_dst += _length; \
} while (0)
#define MEMCPY_ADVANCE_BYTES(_dst, _src, _length) \
do { \
const u8 *_end = (_src) + (_length); \
while ((_src) < _end) \
*_dst++ = *_src++; \
} while (0)
#define STEPSIZE __SIZEOF_LONG__
#define LZ4_COPYPACKET(_src, _dst) \
do { \
MEMCPY_ADVANCE(_dst, _src, STEPSIZE); \
MEMCPY_ADVANCE(_dst, _src, COPYLENGTH - STEPSIZE);\
} while (0)
/*
* Equivalent to MEMCPY_ADVANCE - except may overrun @_dst and @_src by
* COPYLENGTH:
*
* Note: src and dst may overlap (with src < dst) - we must do the copy in
* STEPSIZE chunks for correctness
*
* Note also: length may be negative - we must not call memcpy if length is
* negative, but still adjust dst and src by length
*/
#define MEMCPY_ADVANCE_CHUNKED(_dst, _src, _length) \
do { \
u8 *_end = (_dst) + (_length); \
while ((_dst) < _end) \
LZ4_COPYPACKET(_src, _dst); \
_src -= (_dst) - _end; \
_dst = _end; \
} while (0)
#define MEMCPY_ADVANCE_CHUNKED_NOFIXUP(_dst, _src, _end)\
do { \
while ((_dst) < (_end)) \
LZ4_COPYPACKET((_src), (_dst)); \
} while (0)
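The comment on MEMCPY_ADVANCE_CHUNKED above is the key constraint of LZ4 match expansion: the match source may sit only a few bytes behind the output position, and a forward overlapping copy is exactly what repeats the recent bytes. A byte-at-a-time illustration of that overlap (the real macro copies in word-sized chunks and may overrun, but the forward order is what matters):

#include <stdio.h>

int main(void)
{
	/*
	 * "abc" followed by a match of length 6 at offset 3 should expand
	 * to "abcabcabc": a forward overlapping copy repeats the run.
	 */
	char buf[16] = "abc";
	char *src = buf;		/* match source, 3 bytes behind dst */
	char *dst = buf + 3;		/* decompression output position */
	size_t len = 6;

	while (len--)
		*dst++ = *src++;	/* must copy forward; memcpy would not do */

	buf[9] = '\0';
	printf("%s\n", buf);		/* prints abcabcabc */
	return 0;
}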
static const int dec32table[8] = {0, 3, 2, 3, 0, 0, 0, 0};
#if LZ4_ARCH64
@ -157,123 +260,7 @@ _output_error:
return -1;
}
static inline ssize_t get_length_safe(const u8 **ip, ssize_t length)
{
if (length == 15) {
size_t len;
do {
length += (len = *(*ip)++);
if (unlikely((ssize_t) length < 0))
return -1;
length += len;
} while (len == 255);
}
return length;
}
static int lz4_uncompress_unknownoutputsize(const u8 *source, u8 *dest,
int isize, size_t maxoutputsize)
{
const u8 *ip = source;
const u8 *const iend = ip + isize;
const u8 *ref;
u8 *op = dest;
u8 * const oend = op + maxoutputsize;
u8 *cpy;
unsigned token, offset;
size_t length;
/* Main Loop */
while (ip < iend) {
/* get runlength */
token = *ip++;
length = get_length_safe(&ip, token >> ML_BITS);
if (unlikely((ssize_t) length < 0))
goto _output_error;
/* copy literals */
if ((op + length > oend - COPYLENGTH) ||
(ip + length > iend - COPYLENGTH)) {
if (op + length > oend)
goto _output_error;/* writes beyond buffer */
if (ip + length != iend)
goto _output_error;/*
* Error: LZ4 format requires
* to consume all input
* at this stage
*/
MEMCPY_ADVANCE(op, ip, length);
break;/* Necessarily EOF, due to parsing restrictions */
}
MEMCPY_ADVANCE_CHUNKED(op, ip, length);
/* get match offset */
offset = GET_LE16_ADVANCE(ip);
ref = op - offset;
/* Error: offset creates reference outside destination buffer */
if (ref < (u8 * const) dest)
goto _output_error;
/* get match length */
length = get_length_safe(&ip, token & ML_MASK);
if (unlikely((ssize_t) length < 0))
goto _output_error;
length += MINMATCH;
/* copy first STEPSIZE bytes of match: */
if (unlikely(offset < STEPSIZE)) {
MEMCPY_ADVANCE_BYTES(op, ref, 4);
ref -= dec32table[offset];
memcpy(op, ref, 4);
op += STEPSIZE - 4;
ref -= dec64table[offset];
} else {
MEMCPY_ADVANCE(op, ref, STEPSIZE);
}
length -= STEPSIZE;
/* copy rest of match: */
cpy = op + length;
if (cpy > oend - COPYLENGTH) {
/* Error: request to write beyond destination buffer */
if (cpy > oend ||
ref + COPYLENGTH > oend)
goto _output_error;
#if !LZ4_ARCH64
if (op + COPYLENGTH > oend)
goto _output_error;
#endif
MEMCPY_ADVANCE_CHUNKED_NOFIXUP(op, ref, oend - COPYLENGTH);
while (op < cpy)
*op++ = *ref++;
op = cpy;
/*
* Check EOF (should never happen, since last 5 bytes
* are supposed to be literals)
*/
if (op == oend)
goto _output_error;
} else {
MEMCPY_ADVANCE_CHUNKED(op, ref, length);
}
}
/* end of decoding */
return op - dest;
/* write overflow error detected */
_output_error:
return -1;
}
int lz4_decompress(const unsigned char *src, size_t *src_len,
int bch2_lz4_decompress(const unsigned char *src, size_t *src_len,
unsigned char *dest, size_t actual_dest_len)
{
int ret = -1;
@ -288,29 +275,3 @@ int lz4_decompress(const unsigned char *src, size_t *src_len,
exit_0:
return ret;
}
#ifndef STATIC
EXPORT_SYMBOL(lz4_decompress);
#endif
int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len,
unsigned char *dest, size_t *dest_len)
{
int ret = -1;
int out_len = 0;
out_len = lz4_uncompress_unknownoutputsize(src, dest, src_len,
*dest_len);
if (out_len < 0)
goto exit_0;
*dest_len = out_len;
return 0;
exit_0:
return ret;
}
#ifndef STATIC
EXPORT_SYMBOL(lz4_decompress_unknownoutputsize);
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("LZ4 Decompressor");
#endif


@ -1,182 +0,0 @@
/*
* lz4defs.h -- architecture specific defines
*
* Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
/*
* Detects 64 bits mode
*/
#if defined(CONFIG_64BIT)
#define LZ4_ARCH64 1
#else
#define LZ4_ARCH64 0
#endif
#include <asm/unaligned.h>
#include <linux/log2.h>
#include <linux/string.h>
#define A32(_p) get_unaligned((u32 *) (_p))
#define A16(_p) get_unaligned((u16 *) (_p))
#define GET_LE16_ADVANCE(_src) \
({ \
u16 _r = get_unaligned_le16(_src); \
(_src) += 2; \
_r; \
})
#define PUT_LE16_ADVANCE(_dst, _v) \
do { \
put_unaligned_le16((_v), (_dst)); \
(_dst) += 2; \
} while (0)
#define LENGTH_LONG 15
#define COPYLENGTH 8
#define ML_BITS 4
#define ML_MASK ((1U << ML_BITS) - 1)
#define RUN_BITS (8 - ML_BITS)
#define RUN_MASK ((1U << RUN_BITS) - 1)
#define MEMORY_USAGE 14
#define MINMATCH 4
#define SKIPSTRENGTH 6
#define LASTLITERALS 5
#define MFLIMIT (COPYLENGTH + MINMATCH)
#define MINLENGTH (MFLIMIT + 1)
#define MAXD_LOG 16
#define MAXD (1 << MAXD_LOG)
#define MAXD_MASK (u32)(MAXD - 1)
#define MAX_DISTANCE (MAXD - 1)
#define HASH_LOG (MAXD_LOG - 1)
#define HASHTABLESIZE (1 << HASH_LOG)
#define MAX_NB_ATTEMPTS 256
#define OPTIMAL_ML (int)((ML_MASK-1)+MINMATCH)
#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT - 1))
#define __HASH_VALUE(p, bits) \
(((A32(p)) * 2654435761U) >> (32 - (bits)))
#define HASH_VALUE(p) __HASH_VALUE(p, HASH_LOG)
#define MEMCPY_ADVANCE(_dst, _src, length) \
do { \
typeof(length) _length = (length); \
memcpy(_dst, _src, _length); \
_src += _length; \
_dst += _length; \
} while (0)
#define MEMCPY_ADVANCE_BYTES(_dst, _src, _length) \
do { \
const u8 *_end = (_src) + (_length); \
while ((_src) < _end) \
*_dst++ = *_src++; \
} while (0)
#define STEPSIZE __SIZEOF_LONG__
#define LZ4_COPYPACKET(_src, _dst) \
do { \
MEMCPY_ADVANCE(_dst, _src, STEPSIZE); \
MEMCPY_ADVANCE(_dst, _src, COPYLENGTH - STEPSIZE);\
} while (0)
/*
* Equivalent to MEMCPY_ADVANCE - except may overrun @_dst and @_src by
* COPYLENGTH:
*
* Note: src and dst may overlap (with src < dst) - we must do the copy in
* STEPSIZE chunks for correctness
*
* Note also: length may be negative - we must not call memcpy if length is
* negative, but still adjust dst and src by length
*/
#define MEMCPY_ADVANCE_CHUNKED(_dst, _src, _length) \
do { \
u8 *_end = (_dst) + (_length); \
while ((_dst) < _end) \
LZ4_COPYPACKET(_src, _dst); \
_src -= (_dst) - _end; \
_dst = _end; \
} while (0)
#define MEMCPY_ADVANCE_CHUNKED_NOFIXUP(_dst, _src, _end)\
do { \
while ((_dst) < (_end)) \
LZ4_COPYPACKET((_src), (_dst)); \
} while (0)
struct lz4_hashtable {
#if LZ4_ARCH64
const u8 * const base;
u32 *table;
#else
const int base;
const u8 *table;
#endif
};
#if LZ4_ARCH64
#define HTYPE u32
#else /* 32-bit */
#define HTYPE const u8*
#endif
#ifdef __BIG_ENDIAN
#define LZ4_NBCOMMONBYTES(val) (__builtin_clzl(val) >> 3)
#else
#define LZ4_NBCOMMONBYTES(val) (__builtin_ctzl(val) >> 3)
#endif
static inline unsigned common_length(const u8 *l, const u8 *r,
const u8 *const l_end)
{
const u8 *l_start = l;
while (likely(l <= l_end - sizeof(long))) {
unsigned long diff =
get_unaligned((unsigned long *) l) ^
get_unaligned((unsigned long *) r);
if (diff)
return l + LZ4_NBCOMMONBYTES(diff) - l_start;
l += sizeof(long);
r += sizeof(long);
}
#if LZ4_ARCH64
if (l <= l_end - 4 && A32(r) == A32(l)) {
l += 4;
r += 4;
}
#endif
if (l <= l_end - 2 && A16(r) == A16(l)) {
l += 2;
r += 2;
}
if (l <= l_end - 1 && *r == *l) {
l++;
r++;
}
return l - l_start;
}
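common_length() above counts matching leading bytes a word at a time: XOR the two words and, on little endian, the lowest set bit of the difference falls in the first differing byte, so ctz(diff) / 8 is the count. A standalone check of that trick (assumes little endian and the GCC/Clang ctz builtin):

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* First difference is at byte index 3: */
	const char a[] = "abcXefgh", b[] = "abcYefgh";
	unsigned long wa, wb;

	memcpy(&wa, a, sizeof(wa));
	memcpy(&wb, b, sizeof(wb));

	/*
	 * Little endian: the lowest differing bit of the XOR lies in the
	 * first differing byte, so counting trailing zero bits and dividing
	 * by 8 gives the number of equal leading bytes:
	 */
	printf("%d common bytes\n", __builtin_ctzl(wa ^ wb) >> 3);	/* 3 */
	return 0;
}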
static inline unsigned encode_length(u8 **op, unsigned length)
{
if (length >= LENGTH_LONG) {
length -= LENGTH_LONG;
for (; length > 254 ; length -= 255)
*(*op)++ = 255;
*(*op)++ = length;
return LENGTH_LONG;
} else
return length;
}
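encode_length() implements the LZ4 run/match length encoding: values below 15 fit in the 4-bit token field, anything larger spills into bytes of 255 terminated by a final smaller byte. A simplified, self-contained round trip of that scheme (no bounds or overflow checks):

#include <stdio.h>

/* Returns the 4-bit token value; longer lengths continue as extra bytes. */
static unsigned encode_len(unsigned char **op, unsigned length)
{
	if (length < 15)
		return length;

	length -= 15;
	for (; length > 254; length -= 255)
		*(*op)++ = 255;
	*(*op)++ = (unsigned char) length;
	return 15;
}

static unsigned decode_len(const unsigned char **ip, unsigned nibble)
{
	unsigned length = nibble;

	if (nibble == 15) {
		unsigned char b;

		do {
			b = *(*ip)++;
			length += b;
		} while (b == 255);
	}
	return length;
}

int main(void)
{
	unsigned char buf[8], *op = buf;
	const unsigned char *ip = buf;
	unsigned nibble = encode_len(&op, 300);	/* emits 255, 30 */

	printf("%u\n", decode_len(&ip, nibble));	/* prints 300 */
	return 0;
}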


@ -1,8 +1,8 @@
#ifndef _BCACHE_MIGRATE_H
#define _BCACHE_MIGRATE_H
#ifndef _BCACHEFS_MIGRATE_H
#define _BCACHEFS_MIGRATE_H
int bch2_move_data_off_device(struct bch_dev *);
int bch2_move_metadata_off_device(struct bch_dev *);
int bch2_flag_data_bad(struct bch_dev *);
#endif /* _BCACHE_MIGRATE_H */
#endif /* _BCACHEFS_MIGRATE_H */

View File

@ -17,13 +17,13 @@ static struct bch_extent_ptr *bkey_find_ptr(struct bch_fs *c,
struct bch_extent_ptr ptr)
{
struct bch_extent_ptr *ptr2;
unsigned bucket_bits = c->devs[ptr.dev]->bucket_bits;
struct bch_dev *ca = c->devs[ptr.dev];
extent_for_each_ptr(e, ptr2)
if (ptr2->dev == ptr.dev &&
ptr2->gen == ptr.gen &&
(ptr2->offset >> bucket_bits) ==
(ptr.offset >> bucket_bits))
PTR_BUCKET_NR(ca, ptr2) ==
PTR_BUCKET_NR(ca, &ptr))
return ptr2;
return NULL;

View File

@ -1,9 +1,8 @@
#ifndef _BCACHE_MOVE_H
#define _BCACHE_MOVE_H
#ifndef _BCACHEFS_MOVE_H
#define _BCACHEFS_MOVE_H
#include "buckets.h"
#include "io_types.h"
#include "move_types.h"
enum moving_flag_bitnos {
MOVING_FLAG_BITNO_READ = 0,
@ -83,4 +82,4 @@ void bch2_move_ctxt_exit(struct moving_context *);
void bch2_move_ctxt_init(struct moving_context *, struct bch_ratelimit *,
unsigned);
#endif /* _BCACHE_MOVE_H */
#endif /* _BCACHEFS_MOVE_H */

View File

@ -1,4 +0,0 @@
#ifndef _BCACHE_MOVE_TYPES_H
#define _BCACHE_MOVE_TYPES_H
#endif /* _BCACHE_MOVE_TYPES_H */

View File

@ -1,5 +1,5 @@
#ifndef _BCACHE_MOVINGGC_H
#define _BCACHE_MOVINGGC_H
#ifndef _BCACHEFS_MOVINGGC_H
#define _BCACHEFS_MOVINGGC_H
/*
* We can't use the entire copygc reserve in one iteration of copygc: we may
@ -27,4 +27,4 @@ void bch2_moving_gc_stop(struct bch_dev *);
int bch2_moving_gc_start(struct bch_dev *);
void bch2_dev_moving_gc_init(struct bch_dev *);
#endif
#endif /* _BCACHEFS_MOVINGGC_H */

View File

@ -32,6 +32,15 @@ const char * const bch2_str_hash_types[] = {
NULL
};
const char * const bch2_data_types[] = {
"none",
"sb",
"journal",
"btree",
"data",
NULL
};
const char * const bch2_cache_replacement_policies[] = {
"lru",
"fifo",
@ -237,6 +246,6 @@ ssize_t bch2_opt_show(struct bch_opts *opts, const char *name,
opt = &bch2_opt_table[id];
return opt->type == BCH_OPT_STR
? bch2_snprint_string_list(buf, size, opt->choices, v)
: snprintf(buf, size, "%lli\n", v);
? bch2_scnprint_string_list(buf, size, opt->choices, v)
: scnprintf(buf, size, "%lli", v);
}

View File

@ -1,5 +1,5 @@
#ifndef _BCACHE_OPTS_H
#define _BCACHE_OPTS_H
#ifndef _BCACHEFS_OPTS_H
#define _BCACHEFS_OPTS_H
#include <linux/bug.h>
#include <linux/log2.h>
@ -10,6 +10,7 @@ extern const char * const bch2_error_actions[];
extern const char * const bch2_csum_types[];
extern const char * const bch2_compression_types[];
extern const char * const bch2_str_hash_types[];
extern const char * const bch2_data_types[];
extern const char * const bch2_cache_replacement_policies[];
extern const char * const bch2_cache_modes[];
extern const char * const bch2_dev_state[];
@ -167,4 +168,4 @@ enum bch_opt_id bch2_parse_sysfs_opt(const char *, const char *, u64 *);
ssize_t bch2_opt_show(struct bch_opts *, const char *, char *, size_t);
#endif /* _BCACHE_OPTS_H */
#endif /* _BCACHEFS_OPTS_H */

View File

@ -1,6 +1,5 @@
#ifndef _BCACHE_SIX_H
#define _BCACHE_SIX_H
#ifndef _BCACHEFS_SIX_H
#define _BCACHEFS_SIX_H
#include <linux/lockdep.h>
#include <linux/osq_lock.h>
@ -133,4 +132,4 @@ __SIX_LOCK(read)
__SIX_LOCK(intent)
__SIX_LOCK(write)
#endif /* _BCACHE_SIX_H */
#endif /* _BCACHEFS_SIX_H */

View File

@ -1,5 +1,5 @@
#ifndef _BCACHE_STR_HASH_H
#define _BCACHE_STR_HASH_H
#ifndef _BCACHEFS_STR_HASH_H
#define _BCACHEFS_STR_HASH_H
#include "btree_iter.h"
#include "btree_update.h"
@ -404,4 +404,4 @@ err:
return ret;
}
#endif /* _BCACHE_STR_HASH_H */
#endif /* _BCACHEFS_STR_HASH_H */

View File

@ -314,16 +314,12 @@ const char *bch2_sb_validate(struct bcache_superblock *disk_sb)
const char *err;
u16 block_size;
switch (le64_to_cpu(sb->version)) {
case BCACHE_SB_VERSION_CDEV_V4:
break;
default:
if (le64_to_cpu(sb->version) < BCH_SB_VERSION_MIN ||
le64_to_cpu(sb->version) > BCH_SB_VERSION_MAX)
return"Unsupported superblock version";
}
if (BCH_SB_INITIALIZED(sb) &&
le64_to_cpu(sb->version) != BCACHE_SB_VERSION_CDEV_V4)
return "Unsupported superblock version";
if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX)
SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb, 7);
block_size = le16_to_cpu(sb->block_size);
@ -397,15 +393,22 @@ const char *bch2_sb_validate(struct bcache_superblock *disk_sb)
sb_mi = bch2_sb_get_members(sb);
mi = bch2_mi_to_cpu(sb_mi->members + sb->dev_idx);
if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX) {
struct bch_member *m;
for (m = sb_mi->members;
m < sb_mi->members + sb->nr_devices;
m++)
SET_BCH_MEMBER_DATA_ALLOWED(m, ~0);
}
if (mi.nbuckets > LONG_MAX)
return "Too many buckets";
if (mi.nbuckets - mi.first_bucket < 1 << 10)
return "Not enough buckets";
if (!is_power_of_2(mi.bucket_size) ||
mi.bucket_size < PAGE_SECTORS ||
mi.bucket_size < block_size)
if (mi.bucket_size < block_size)
return "Bad bucket size";
if (get_capacity(disk_sb->bdev->bd_disk) <
@ -420,6 +423,8 @@ const char *bch2_sb_validate(struct bcache_superblock *disk_sb)
if (err)
return err;
sb->version = cpu_to_le64(BCH_SB_VERSION_MAX);
return NULL;
}
@ -463,6 +468,7 @@ static void bch2_sb_update(struct bch_fs *c)
c->sb.clean = BCH_SB_CLEAN(src);
c->sb.str_hash_type = BCH_SB_STR_HASH_TYPE(src);
c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src);
c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src);
c->sb.time_base_lo = le64_to_cpu(src->time_base_lo);
c->sb.time_base_hi = le32_to_cpu(src->time_base_hi);
c->sb.time_precision = le32_to_cpu(src->time_precision);
@ -570,8 +576,9 @@ reread:
if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC))
return "Not a bcachefs superblock";
if (le64_to_cpu(sb->sb->version) != BCACHE_SB_VERSION_CDEV_V4)
return "Unsupported superblock version";
if (le64_to_cpu(sb->sb->version) < BCH_SB_VERSION_MIN ||
le64_to_cpu(sb->sb->version) > BCH_SB_VERSION_MAX)
return"Unsupported superblock version";
bytes = vstruct_bytes(sb->sb);
@ -729,6 +736,9 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
bch2_bio_map(bio, sb);
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_SB],
bio_sectors(bio));
percpu_ref_get(&ca->io_ref);
closure_bio_submit(bio, &c->sb_write);
}
@ -784,7 +794,7 @@ void bch2_write_super(struct bch_fs *c)
if (ca->sb_write_error)
__clear_bit(ca->dev_idx, sb_written.d);
nr_wrote = bitmap_weight(sb_written.d, BCH_SB_MEMBERS_MAX);
nr_wrote = dev_mask_nr(&sb_written);
can_mount_with_written =
bch2_have_enough_devs(c,
@ -824,17 +834,6 @@ cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
return (void *) r->entries + r->entry_size * i;
}
static inline struct bch_replicas_entry *
replicas_entry_next(struct bch_replicas_entry *i)
{
return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
}
#define for_each_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
(_i) = replicas_entry_next(_i))
static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
unsigned dev)
{
@ -939,7 +938,7 @@ static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
}
static void bkey_to_replicas(struct bkey_s_c_extent e,
enum bch_data_types data_type,
enum bch_data_type data_type,
struct bch_replicas_cpu_entry *r,
unsigned *max_dev)
{
@ -967,7 +966,7 @@ static void bkey_to_replicas(struct bkey_s_c_extent e,
static int bch2_update_gc_replicas(struct bch_fs *c,
struct bch_replicas_cpu *gc_r,
struct bkey_s_c_extent e,
enum bch_data_types data_type)
enum bch_data_type data_type)
{
struct bch_replicas_cpu_entry new_e;
struct bch_replicas_cpu *new;
@ -1009,7 +1008,7 @@ static int bch2_update_gc_replicas(struct bch_fs *c,
static bool replicas_has_extent(struct bch_replicas_cpu *r,
struct bkey_s_c_extent e,
enum bch_data_types data_type)
enum bch_data_type data_type)
{
struct bch_replicas_cpu_entry search;
unsigned max_dev;
@ -1023,7 +1022,7 @@ static bool replicas_has_extent(struct bch_replicas_cpu *r,
}
bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e,
enum bch_data_types data_type)
enum bch_data_type data_type)
{
bool ret;
@ -1038,7 +1037,7 @@ bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e,
noinline
static int bch2_check_mark_super_slowpath(struct bch_fs *c,
struct bkey_s_c_extent e,
enum bch_data_types data_type)
enum bch_data_type data_type)
{
struct bch_replicas_cpu *gc_r;
const struct bch_extent_ptr *ptr;
@ -1103,7 +1102,7 @@ err:
}
int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
enum bch_data_types data_type)
enum bch_data_type data_type)
{
struct bch_replicas_cpu *gc_r;
bool marked;

View File

@ -1,5 +1,5 @@
#ifndef _BCACHE_SUPER_IO_H
#define _BCACHE_SUPER_IO_H
#ifndef _BCACHEFS_SUPER_IO_H
#define _BCACHEFS_SUPER_IO_H
#include "extents.h"
#include "eytzinger.h"
@ -104,6 +104,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
.tier = BCH_MEMBER_TIER(mi),
.replacement = BCH_MEMBER_REPLACEMENT(mi),
.discard = BCH_MEMBER_DISCARD(mi),
.data_allowed = BCH_MEMBER_DATA_ALLOWED(mi),
.valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)),
};
}
@ -122,10 +123,25 @@ const char *bch2_read_super(struct bcache_superblock *,
struct bch_opts, const char *);
void bch2_write_super(struct bch_fs *);
/* replicas: */
/* iterate over bch_sb_field_replicas: */
static inline struct bch_replicas_entry *
replicas_entry_next(struct bch_replicas_entry *i)
{
return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
}
#define for_each_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
(_i) = replicas_entry_next(_i))
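The iterator that just moved into this header walks variable-length records: each bch_replicas_entry is a short header followed by nr one-byte device indices, so the stride to the next entry is computed per record rather than being a fixed sizeof(). A simplified standalone sketch of that walking pattern, with a stripped-down entry layout and made-up data (the real entries carry more fields and live inside a superblock field):

/*
 * Sketch: walking variable-length entries the way for_each_replicas_entry
 * does -- a fixed header plus `nr` trailing bytes per record.  Uses
 * kernel-style void-pointer arithmetic, like the helper above.
 */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct demo_entry {
	uint8_t		data_type;
	uint8_t		nr;
	uint8_t		devs[];		/* nr device indices follow */
};

static struct demo_entry *demo_entry_next(struct demo_entry *i)
{
	return (void *) i + offsetof(struct demo_entry, devs) + i->nr;
}

int main(void)
{
	/* two packed entries: {type 1, devs 0,2} then {type 2, dev 1} */
	uint8_t buf[] = { 1, 2, 0, 2,  2, 1, 1 };
	struct demo_entry *i = (void *) buf;
	void *end = buf + sizeof(buf);

	for (; (void *) i < end && i->data_type; i = demo_entry_next(i)) {
		printf("type %u devs:", i->data_type);
		for (unsigned d = 0; d < i->nr; d++)
			printf(" %u", i->devs[d]);
		printf("\n");
	}
	return 0;
}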
bool bch2_sb_has_replicas(struct bch_fs *, struct bkey_s_c_extent,
enum bch_data_types);
enum bch_data_type);
int bch2_check_mark_super(struct bch_fs *, struct bkey_s_c_extent,
enum bch_data_types);
enum bch_data_type);
struct replicas_status {
struct {
@ -145,4 +161,4 @@ unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
int bch2_replicas_gc_end(struct bch_fs *, int);
int bch2_replicas_gc_start(struct bch_fs *, unsigned);
#endif /* _BCACHE_SUPER_IO_H */
#endif /* _BCACHEFS_SUPER_IO_H */

View File

@ -100,7 +100,7 @@ struct bch_fs *bch2_bdev_to_fs(struct block_device *bdev)
rcu_read_lock();
list_for_each_entry(c, &bch_fs_list, list)
for_each_member_device_rcu(ca, c, i)
for_each_member_device_rcu(ca, c, i, NULL)
if (ca->disk_sb.bdev == bdev) {
closure_get(&c->cl);
goto found;
@ -159,10 +159,11 @@ int bch2_congested(struct bch_fs *c, int bdi_bits)
} else {
/* Writes prefer fastest tier: */
struct bch_tier *tier = READ_ONCE(c->fastest_tier);
struct dev_group *grp = tier ? &tier->devs : &c->all_devs;
struct bch_devs_mask *devs =
tier ? &tier->devs : &c->rw_devs[BCH_DATA_USER];
rcu_read_lock();
group_for_each_dev(ca, grp, i) {
for_each_member_device_rcu(ca, c, i, devs) {
bdi = ca->disk_sb.bdev->bd_bdi;
if (bdi_congested(bdi, bdi_bits)) {
@ -554,6 +555,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
goto err;
}
c->block_bits = ilog2(c->sb.block_size);
mutex_unlock(&c->sb_lock);
scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid);
@ -564,8 +567,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->opts.nochanges |= c->opts.noreplay;
c->opts.read_only |= c->opts.nochanges;
c->block_bits = ilog2(c->sb.block_size);
if (bch2_fs_init_fault("fs_alloc"))
goto err;
@ -590,7 +591,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
mempool_init_page_pool(&c->bio_bounce_pages,
max_t(unsigned,
c->sb.btree_node_size,
BCH_ENCODED_EXTENT_MAX) /
c->sb.encoded_extent_max) /
PAGE_SECTORS, 0) ||
!(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
lg_lock_init(&c->usage_lock) ||
@ -662,7 +663,7 @@ static const char *__bch2_fs_online(struct bch_fs *c)
mutex_lock(&c->state_lock);
err = "error creating sysfs objects";
__for_each_member_device(ca, c, i)
__for_each_member_device(ca, c, i, NULL)
if (bch2_dev_sysfs_online(ca))
goto err;
@ -692,7 +693,6 @@ static const char *__bch2_fs_start(struct bch_fs *c)
LIST_HEAD(journal);
struct jset *j;
struct closure cl;
u64 journal_seq = 0;
time64_t now;
unsigned i;
int ret = -EINVAL;
@ -790,17 +790,6 @@ static const char *__bch2_fs_start(struct bch_fs *c)
if (ret)
goto err;
bch_verbose(c, "fsck done");
for_each_rw_member(ca, c, i)
if (ca->need_alloc_write) {
ret = bch2_alloc_write(c, ca, &journal_seq);
if (ret) {
percpu_ref_put(&ca->io_ref);
goto err;
}
}
bch2_journal_flush_seq(&c->journal, journal_seq);
} else {
struct bch_inode_unpacked inode;
struct bkey_inode_buf packed_inode;
@ -842,7 +831,7 @@ static const char *__bch2_fs_start(struct bch_fs *c)
bch2_inode_init(c, &inode, 0, 0,
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
inode.inum = BCACHE_ROOT_INO;
inode.inum = BCACHEFS_ROOT_INO;
bch2_inode_pack(&packed_inode, &inode);
@ -878,7 +867,6 @@ recovery_done:
SET_BCH_SB_INITIALIZED(c->disk_sb, true);
SET_BCH_SB_CLEAN(c->disk_sb, false);
c->disk_sb->version = BCACHE_SB_VERSION_CDEV;
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@ -988,9 +976,10 @@ static void bch2_dev_free(struct bch_dev *ca)
bch2_free_super(&ca->disk_sb);
bch2_dev_journal_exit(ca);
free_percpu(ca->sectors_written);
free_percpu(ca->io_done);
bioset_exit(&ca->replica_set);
free_percpu(ca->usage_percpu);
kvpfree(ca->bucket_dirty, BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
kvpfree(ca->buckets, ca->mi.nbuckets * sizeof(struct bucket));
kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
free_heap(&ca->copygc_heap);
@ -1108,10 +1097,10 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
init_completion(&ca->stop_complete);
init_completion(&ca->offline_complete);
spin_lock_init(&ca->self.lock);
ca->self.nr = 1;
rcu_assign_pointer(ca->self.d[0].dev, ca);
ca->dev_idx = dev_idx;
__set_bit(ca->dev_idx, ca->self.d);
ca->copygc_write_point.type = BCH_DATA_USER;
spin_lock_init(&ca->freelist_lock);
bch2_dev_moving_gc_init(ca);
@ -1125,7 +1114,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
ca->mi = bch2_mi_to_cpu(member);
ca->uuid = member->uuid;
ca->bucket_bits = ilog2(ca->mi.bucket_size);
scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
/* XXX: tune these */
@ -1161,10 +1149,13 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
!(ca->buckets = kvpmalloc(ca->mi.nbuckets *
sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO)) ||
!(ca->bucket_dirty = kvpmalloc(BITS_TO_LONGS(ca->mi.nbuckets) *
sizeof(unsigned long),
GFP_KERNEL|__GFP_ZERO)) ||
!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) ||
bioset_init(&ca->replica_set, 4,
offsetof(struct bch_write_bio, bio)) ||
!(ca->sectors_written = alloc_percpu(*ca->sectors_written)))
!(ca->io_done = alloc_percpu(*ca->io_done)))
goto err;
total_reserve = ca->free_inc.size;
@ -1172,7 +1163,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
total_reserve += ca->free[i].size;
ca->copygc_write_point.group = &ca->self;
ca->tiering_write_point.group = &ca->self;
ca->fs = c;
rcu_assign_pointer(c->devs[ca->dev_idx], ca);
@ -1238,19 +1228,8 @@ static int __bch2_dev_online(struct bch_fs *c, struct bcache_superblock *sb)
bch2_mark_dev_metadata(c, ca);
lg_local_unlock(&c->usage_lock);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
struct bch_sb_field_journal *journal_buckets =
bch2_sb_get_journal(ca->disk_sb.sb);
bool has_journal =
bch2_nr_journal_buckets(journal_buckets) >=
BCH_JOURNAL_BUCKETS_MIN;
bch2_dev_group_add(&c->tiers[ca->mi.tier].devs, ca);
bch2_dev_group_add(&c->all_devs, ca);
if (has_journal)
bch2_dev_group_add(&c->journal.devs, ca);
}
if (ca->mi.state == BCH_MEMBER_STATE_RW)
bch2_dev_allocator_add(c, ca);
percpu_ref_reinit(&ca->io_ref);
return 0;

View File

@ -1,35 +1,28 @@
#ifndef _BCACHE_SUPER_H
#define _BCACHE_SUPER_H
#ifndef _BCACHEFS_SUPER_H
#define _BCACHEFS_SUPER_H
#include "extents.h"
#include "bcachefs_ioctl.h"
#include <linux/math64.h>
static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s)
{
return s >> ca->bucket_bits;
return div_u64(s, ca->mi.bucket_size);
}
static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
{
return ((sector_t) b) << ca->bucket_bits;
return ((sector_t) b) * ca->mi.bucket_size;
}
static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
{
return s & (ca->mi.bucket_size - 1);
}
u32 remainder;
static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter)
{
struct bch_dev *ca = NULL;
while (*iter < c->sb.nr_devices &&
!(ca = rcu_dereference_check(c->devs[*iter],
lockdep_is_held(&c->state_lock))))
(*iter)++;
return ca;
div_u64_rem(s, ca->mi.bucket_size, &remainder);
return remainder;
}
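The rewritten helpers above drop the assumption that bucket_size is a power of two: the old code shifted and masked by ca->bucket_bits, the new code divides by ca->mi.bucket_size. A standalone sketch of why that matters, with a made-up non-power-of-two bucket size:

/*
 * Sketch: shift/mask bucket math vs. division-based bucket math.  With a
 * non-power-of-two bucket size the old formulas give wrong answers.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t bucket_size = 1536;	/* sectors, not a power of two */
	uint64_t s = 4000;		/* some sector offset on the device */

	/* division-based, as in the new sector_to_bucket()/bucket_remainder() */
	printf("div:   bucket %llu, remainder %llu\n",
	       (unsigned long long) (s / bucket_size),
	       (unsigned long long) (s % bucket_size));		/* 2, 928 */

	/* old shift/mask math, only valid for power-of-two bucket sizes */
	unsigned bucket_bits = 10;				/* ilog2(1536) */
	printf("shift: bucket %llu, remainder %llu\n",
	       (unsigned long long) (s >> bucket_bits),
	       (unsigned long long) (s & (bucket_size - 1)));	/* wrong: 3, 1440 */
	return 0;
}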
static inline bool bch2_dev_is_online(struct bch_dev *ca)
@ -37,18 +30,38 @@ static inline bool bch2_dev_is_online(struct bch_dev *ca)
return !percpu_ref_is_zero(&ca->io_ref);
}
#define __for_each_member_device(ca, c, iter) \
for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter))); (iter)++)
static inline unsigned dev_mask_nr(struct bch_devs_mask *devs)
{
return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
}
#define for_each_member_device_rcu(ca, c, iter) \
__for_each_member_device(ca, c, iter)
static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
struct bch_devs_mask *mask)
{
struct bch_dev *ca = NULL;
while ((*iter = mask
? find_next_bit(mask->d, c->sb.nr_devices, *iter)
: *iter) < c->sb.nr_devices &&
!(ca = rcu_dereference_check(c->devs[*iter],
lockdep_is_held(&c->state_lock))))
(*iter)++;
return ca;
}
#define __for_each_member_device(ca, c, iter, mask) \
for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++)
#define for_each_member_device_rcu(ca, c, iter, mask) \
__for_each_member_device(ca, c, iter, mask)
static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter)
{
struct bch_dev *ca;
rcu_read_lock();
if ((ca = __bch2_next_dev(c, iter)))
if ((ca = __bch2_next_dev(c, iter, NULL)))
percpu_ref_get(&ca->ref);
rcu_read_unlock();
@ -70,7 +83,7 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
struct bch_dev *ca;
rcu_read_lock();
while ((ca = __bch2_next_dev(c, iter)) &&
while ((ca = __bch2_next_dev(c, iter, NULL)) &&
(!((1 << ca->mi.state) & state_mask) ||
!percpu_ref_tryget(&ca->io_ref)))
(*iter)++;
@ -94,6 +107,7 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
__for_each_online_member(ca, c, iter, \
(1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO))
/* XXX kill, move to struct bch_fs */
static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
{
struct bch_devs_mask devs;
@ -135,4 +149,4 @@ const char *bch2_fs_open(char * const *, unsigned, struct bch_opts,
struct bch_fs **);
const char *bch2_fs_open_incremental(const char *path);
#endif /* _BCACHE_SUPER_H */
#endif /* _BCACHEFS_SUPER_H */

View File

@ -1,5 +1,5 @@
#ifndef _BCACHE_SUPER_TYPES_H
#define _BCACHE_SUPER_TYPES_H
#ifndef _BCACHEFS_SUPER_TYPES_H
#define _BCACHEFS_SUPER_TYPES_H
struct bcache_superblock {
struct bch_sb *sb;
@ -13,4 +13,4 @@ struct bch_devs_mask {
unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
};
#endif /* _BCACHE_SUPER_TYPES_H */
#endif /* _BCACHEFS_SUPER_TYPES_H */

View File

@ -5,6 +5,8 @@
* Copyright 2012 Google, Inc.
*/
#ifndef NO_BCACHEFS_SYSFS
#include "bcachefs.h"
#include "alloc.h"
#include "compress.h"
@ -53,7 +55,7 @@ static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
#define sysfs_printf(file, fmt, ...) \
do { \
if (attr == &sysfs_ ## file) \
return snprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__); \
return scnprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);\
} while (0)
#define sysfs_print(file, var) \
@ -134,6 +136,7 @@ read_attribute(block_size);
read_attribute(btree_node_size);
read_attribute(first_bucket);
read_attribute(nbuckets);
read_attribute(iostats);
read_attribute(read_priority_stats);
read_attribute(write_priority_stats);
read_attribute(fragmentation_stats);
@ -141,9 +144,6 @@ read_attribute(oldest_gen_stats);
read_attribute(reserve_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
read_attribute(written);
read_attribute(btree_written);
read_attribute(metadata_written);
read_attribute(journal_debug);
read_attribute(journal_pins);
@ -160,7 +160,6 @@ read_attribute(cached_buckets);
read_attribute(meta_buckets);
read_attribute(alloc_buckets);
read_attribute(has_data);
read_attribute(has_metadata);
read_attribute(alloc_debug);
read_attribute(read_realloc_races);
@ -301,7 +300,7 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
}
bch2_btree_iter_unlock(&iter);
return snprintf(buf, PAGE_SIZE,
return scnprintf(buf, PAGE_SIZE,
"uncompressed data:\n"
" nr extents: %llu\n"
" size (bytes): %llu\n"
@ -527,9 +526,13 @@ struct attribute *bch2_fs_internal_files[] = {
SHOW(bch2_fs_opts_dir)
{
char *out = buf, *end = buf + PAGE_SIZE;
struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
return bch2_opt_show(&c->opts, attr->name, buf, PAGE_SIZE);
out += bch2_opt_show(&c->opts, attr->name, out, end - out);
out += scnprintf(out, end - out, "\n");
return out - buf;
}
STORE(bch2_fs_opts_dir)
@ -728,15 +731,32 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
c->open_buckets_wait.list.first ? "waiting" : "empty");
}
static u64 sectors_written(struct bch_dev *ca)
const char * const bch2_rw[] = {
"read",
"write",
NULL
};
static ssize_t show_dev_iostats(struct bch_dev *ca, char *buf)
{
u64 ret = 0;
int cpu;
char *out = buf, *end = buf + PAGE_SIZE;
int rw, i, cpu;
for (rw = 0; rw < 2; rw++) {
out += scnprintf(out, end - out, "%s:\n", bch2_rw[rw]);
for (i = 1; i < BCH_DATA_NR; i++) {
u64 n = 0;
for_each_possible_cpu(cpu)
ret += *per_cpu_ptr(ca->sectors_written, cpu);
n += per_cpu_ptr(ca->io_done, cpu)->sectors[rw][i];
return ret;
out += scnprintf(out, end - out, "%-12s:%12llu\n",
bch2_data_types[i], n << 9);
}
}
return out - buf;
}
SHOW(bch2_dev)
@ -744,6 +764,7 @@ SHOW(bch2_dev)
struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
struct bch_fs *c = ca->fs;
struct bch_dev_usage stats = bch2_dev_usage_read(ca);
char *out = buf, *end = buf + PAGE_SIZE;
sysfs_printf(uuid, "%pU\n", ca->uuid.b);
@ -752,12 +773,6 @@ SHOW(bch2_dev)
sysfs_print(first_bucket, ca->mi.first_bucket);
sysfs_print(nbuckets, ca->mi.nbuckets);
sysfs_print(discard, ca->mi.discard);
sysfs_hprint(written, sectors_written(ca) << 9);
sysfs_hprint(btree_written,
atomic64_read(&ca->btree_sectors_written) << 9);
sysfs_hprint(metadata_written,
(atomic64_read(&ca->meta_sectors_written) +
atomic64_read(&ca->btree_sectors_written)) << 9);
sysfs_hprint(dirty_data, stats.sectors[S_DIRTY] << 9);
sysfs_print(dirty_bytes, stats.sectors[S_DIRTY] << 9);
@ -769,26 +784,37 @@ SHOW(bch2_dev)
sysfs_print(alloc_buckets, stats.buckets_alloc);
sysfs_print(available_buckets, dev_buckets_available(ca));
sysfs_print(free_buckets, dev_buckets_free(ca));
sysfs_print(has_data, bch2_dev_has_data(c, ca) &
(1 << BCH_DATA_USER));
sysfs_print(has_metadata, bch2_dev_has_data(c, ca) &
((1 << BCH_DATA_JOURNAL)|
(1 << BCH_DATA_BTREE)));
if (attr == &sysfs_has_data) {
out += bch2_scnprint_flag_list(out, end - out,
bch2_data_types,
bch2_dev_has_data(c, ca));
out += scnprintf(out, end - out, "\n");
return out - buf;
}
sysfs_pd_controller_show(copy_gc, &ca->moving_gc_pd);
if (attr == &sysfs_cache_replacement_policy)
return bch2_snprint_string_list(buf, PAGE_SIZE,
if (attr == &sysfs_cache_replacement_policy) {
out += bch2_scnprint_string_list(out, end - out,
bch2_cache_replacement_policies,
ca->mi.replacement);
out += scnprintf(out, end - out, "\n");
return out - buf;
}
sysfs_print(tier, ca->mi.tier);
if (attr == &sysfs_state_rw)
return bch2_snprint_string_list(buf, PAGE_SIZE,
if (attr == &sysfs_state_rw) {
out += bch2_scnprint_string_list(out, end - out,
bch2_dev_state,
ca->mi.state);
out += scnprintf(out, end - out, "\n");
return out - buf;
}
if (attr == &sysfs_iostats)
return show_dev_iostats(ca, buf);
if (attr == &sysfs_read_priority_stats)
return show_quantiles(ca, buf, bucket_priority_fn, (void *) 0);
if (attr == &sysfs_write_priority_stats)
@ -859,8 +885,8 @@ STORE(bch2_dev)
SET_BCH_MEMBER_TIER(mi, v);
bch2_write_super(c);
bch2_dev_group_remove(&c->tiers[prev_tier].devs, ca);
bch2_dev_group_add(&c->tiers[ca->mi.tier].devs, ca);
clear_bit(ca->dev_idx, c->tiers[prev_tier].devs.d);
set_bit(ca->dev_idx, c->tiers[ca->mi.tier].devs.d);
mutex_unlock(&c->sb_lock);
bch2_recalc_capacity(c);
@ -885,12 +911,7 @@ struct attribute *bch2_dev_files[] = {
&sysfs_state_rw,
&sysfs_has_data,
&sysfs_has_metadata,
/* io stats: */
&sysfs_written,
&sysfs_btree_written,
&sysfs_metadata_written,
&sysfs_iostats,
/* alloc info - data: */
&sysfs_dirty_data,
@ -919,3 +940,5 @@ struct attribute *bch2_dev_files[] = {
sysfs_pd_controller_files(copy_gc),
NULL
};
#endif /* NO_BCACHEFS_SYSFS */

View File

@ -1,9 +1,9 @@
#ifndef _BCACHE_SYSFS_H_
#define _BCACHE_SYSFS_H_
#ifndef _BCACHEFS_SYSFS_H_
#define _BCACHEFS_SYSFS_H_
#include <linux/sysfs.h>
#ifndef NO_BCACHE_SYSFS
#ifndef NO_BCACHEFS_SYSFS
struct attribute;
struct sysfs_ops;
@ -34,6 +34,6 @@ static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
static const struct sysfs_ops bch2_dev_sysfs_ops;
#endif
#endif /* NO_BCACHEFS_SYSFS */
#endif /* _BCACHE_SYSFS_H_ */
#endif /* _BCACHEFS_SYSFS_H_ */

View File

@ -24,7 +24,7 @@ struct tiering_state {
};
static bool tiering_pred(struct bch_fs *c,
struct tiering_state *s,
struct bch_tier *tier,
struct bkey_s_c k)
{
if (bkey_extent_is_data(k.k)) {
@ -38,7 +38,7 @@ static bool tiering_pred(struct bch_fs *c,
return false;
extent_for_each_ptr(e, ptr)
if (c->devs[ptr->dev]->mi.tier >= s->tier->idx)
if (c->devs[ptr->dev]->mi.tier >= tier->idx)
replicas++;
return replicas < c->opts.data_replicas;
@ -47,49 +47,18 @@ static bool tiering_pred(struct bch_fs *c,
return false;
}
static void tier_put_device(struct tiering_state *s)
{
if (s->ca)
percpu_ref_put(&s->ca->io_ref);
s->ca = NULL;
}
/**
* refill_next - move on to refilling the next cache's tiering keylist
*/
static void tier_next_device(struct bch_fs *c, struct tiering_state *s)
{
if (!s->ca || s->sectors > s->stripe_size) {
tier_put_device(s);
s->sectors = 0;
s->dev_idx++;
spin_lock(&s->tier->devs.lock);
if (s->dev_idx >= s->tier->devs.nr)
s->dev_idx = 0;
if (s->tier->devs.nr) {
s->ca = s->tier->devs.d[s->dev_idx].dev;
percpu_ref_get(&s->ca->io_ref);
}
spin_unlock(&s->tier->devs.lock);
}
}
static int issue_tiering_move(struct bch_fs *c,
struct tiering_state *s,
struct bch_tier *tier,
struct moving_context *ctxt,
struct bkey_s_c k)
{
int ret;
ret = bch2_data_move(c, ctxt, &s->ca->tiering_write_point, k, NULL);
if (!ret) {
ret = bch2_data_move(c, ctxt, &tier->wp, k, NULL);
if (!ret)
trace_tiering_copy(k.k);
s->sectors += k.k->size;
} else {
else
trace_tiering_alloc_fail(c, k.k->size);
}
return ret;
}
@ -101,10 +70,9 @@ static int issue_tiering_move(struct bch_fs *c,
static s64 read_tiering(struct bch_fs *c, struct bch_tier *tier)
{
struct moving_context ctxt;
struct tiering_state s;
struct btree_iter iter;
struct bkey_s_c k;
unsigned nr_devices = READ_ONCE(tier->devs.nr);
unsigned nr_devices = dev_mask_nr(&tier->devs);
int ret;
if (!nr_devices)
@ -112,10 +80,6 @@ static s64 read_tiering(struct bch_fs *c, struct bch_tier *tier)
trace_tiering_start(c);
memset(&s, 0, sizeof(s));
s.tier = tier;
s.stripe_size = 2048; /* 1 mb for now */
bch2_move_ctxt_init(&ctxt, &tier->pd.rate,
nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE);
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
@ -125,14 +89,10 @@ static s64 read_tiering(struct bch_fs *c, struct bch_tier *tier)
!bch2_move_ctxt_wait(&ctxt) &&
(k = bch2_btree_iter_peek(&iter)).k &&
!btree_iter_err(k)) {
if (!tiering_pred(c, &s, k))
if (!tiering_pred(c, tier, k))
goto next;
tier_next_device(c, &s);
if (!s.ca)
break;
ret = issue_tiering_move(c, &s, &ctxt, k);
ret = issue_tiering_move(c, tier, &ctxt, k);
if (ret) {
bch2_btree_iter_unlock(&iter);
@ -150,7 +110,6 @@ next:
}
bch2_btree_iter_unlock(&iter);
tier_put_device(&s);
bch2_move_ctxt_exit(&ctxt);
trace_tiering_end(c, ctxt.sectors_moved, ctxt.keys_moved);
@ -171,7 +130,7 @@ static int bch2_tiering_thread(void *arg)
while (!kthread_should_stop()) {
if (kthread_wait_freezable(c->tiering_enabled &&
tier->devs.nr))
dev_mask_nr(&tier->devs)))
break;
while (1) {
@ -183,15 +142,18 @@ static int bch2_tiering_thread(void *arg)
for (faster_tier = c->tiers;
faster_tier != tier;
faster_tier++) {
spin_lock(&faster_tier->devs.lock);
group_for_each_dev(ca, &faster_tier->devs, i) {
rcu_read_lock();
for_each_member_device_rcu(ca, c, i,
&faster_tier->devs) {
tier_capacity +=
(ca->mi.nbuckets -
ca->mi.first_bucket) << ca->bucket_bits;
bucket_to_sector(ca,
ca->mi.nbuckets -
ca->mi.first_bucket);
available_sectors +=
dev_buckets_available(ca) << ca->bucket_bits;
bucket_to_sector(ca,
dev_buckets_available(ca));
}
spin_unlock(&faster_tier->devs.lock);
rcu_read_unlock();
}
if (available_sectors < (tier_capacity >> 1))
@ -255,7 +217,7 @@ int bch2_tiering_start(struct bch_fs *c)
return 0;
for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) {
if (!tier->devs.nr)
if (!dev_mask_nr(&tier->devs))
continue;
if (have_faster_tier) {
@ -279,5 +241,6 @@ void bch2_fs_tiering_init(struct bch_fs *c)
for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
c->tiers[i].idx = i;
bch2_pd_controller_init(&c->tiers[i].pd);
c->tiers[i].wp.group = &c->tiers[i].devs;
}
}

View File

@ -1,8 +1,8 @@
#ifndef _BCACHE_TIER_H
#define _BCACHE_TIER_H
#ifndef _BCACHEFS_TIER_H
#define _BCACHEFS_TIER_H
void bch2_tiering_stop(struct bch_fs *);
int bch2_tiering_start(struct bch_fs *);
void bch2_fs_tiering_init(struct bch_fs *);
#endif
#endif /* _BCACHEFS_TIER_H */

View File

@ -98,44 +98,95 @@ ssize_t bch2_hprint(char *buf, s64 v)
* to turn it into [-9, 9]
*/
if (v < 100 && v > -100)
snprintf(dec, sizeof(dec), ".%i", t / 103);
scnprintf(dec, sizeof(dec), ".%i", t / 103);
return sprintf(buf, "%lli%s%c", v, dec, units[u]);
}
ssize_t bch2_snprint_string_list(char *buf, size_t size, const char * const list[],
ssize_t bch2_scnprint_string_list(char *buf, size_t size,
const char * const list[],
size_t selected)
{
char *out = buf;
size_t i;
if (size)
*out = '\0';
for (i = 0; list[i]; i++)
out += snprintf(out, buf + size - out,
out += scnprintf(out, buf + size - out,
i == selected ? "[%s] " : "%s ", list[i]);
out[-1] = '\n';
if (out != buf)
*--out = '\0';
return out - buf;
}
ssize_t bch2_read_string_list(const char *buf, const char * const list[])
{
size_t i;
char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL);
size_t i, len;
buf = skip_spaces(buf);
len = strlen(buf);
while (len && isspace(buf[len - 1]))
--len;
for (i = 0; list[i]; i++)
if (strlen(list[i]) == len &&
!memcmp(buf, list[i], len))
break;
return list[i] ? i : -EINVAL;
}
ssize_t bch2_scnprint_flag_list(char *buf, size_t size,
const char * const list[], u64 flags)
{
char *out = buf, *end = buf + size;
unsigned bit, nr = 0;
while (list[nr])
nr++;
if (size)
*out = '\0';
while (flags && (bit = __ffs(flags)) < nr) {
out += scnprintf(out, end - out, "%s,", list[bit]);
flags ^= 1 << bit;
}
if (out != buf)
*--out = '\0';
return out - buf;
}
u64 bch2_read_flag_list(char *opt, const char * const list[])
{
u64 ret = 0;
char *p, *s, *d = kstrndup(opt, PAGE_SIZE - 1, GFP_KERNEL);
if (!d)
return -ENOMEM;
s = strim(d);
for (i = 0; list[i]; i++)
if (!strcmp(list[i], s))
while ((p = strsep(&s, ","))) {
int flag = bch2_read_string_list(p, list);
if (flag < 0) {
ret = -1;
break;
}
ret |= 1 << flag;
}
kfree(d);
if (!list[i])
return -EINVAL;
return i;
return ret;
}
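The new flag-list helpers are inverses of each other: bch2_scnprint_flag_list() turns set bits into a comma-separated list of names (this is what the has_data sysfs attribute now prints), and bch2_read_flag_list() parses such a string back into a bitmask. A standalone sketch of that round trip, reusing the bch2_data_types strings from this commit purely as example input:

/*
 * Sketch: flag-list print/parse round trip in the style of
 * bch2_scnprint_flag_list() / bch2_read_flag_list().
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

static const char * const data_types[] = {
	"none", "sb", "journal", "btree", "data", NULL
};

int main(void)
{
	uint64_t flags = (1 << 2) | (1 << 3);	/* journal | btree */
	char buf[64], *out = buf;
	uint64_t parsed = 0;

	/* print: one name per set bit, comma separated */
	for (unsigned bit = 0; data_types[bit]; bit++)
		if (flags & (1ULL << bit))
			out += sprintf(out, "%s,", data_types[bit]);
	if (out != buf)
		*--out = '\0';			/* drop trailing comma */
	printf("%s\n", buf);			/* "journal,btree" */

	/* parse: look each token up again */
	for (char *p = strtok(buf, ","); p; p = strtok(NULL, ","))
		for (unsigned i = 0; data_types[i]; i++)
			if (!strcmp(p, data_types[i]))
				parsed |= 1ULL << i;
	printf("%llu\n", (unsigned long long) parsed);	/* 12 */
	return 0;
}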
bool bch2_is_zero(const void *_p, size_t n)

View File

@ -1,5 +1,5 @@
#ifndef _BCACHE_UTIL_H
#define _BCACHE_UTIL_H
#ifndef _BCACHEFS_UTIL_H
#define _BCACHEFS_UTIL_H
#include <linux/bio.h>
#include <linux/blkdev.h>
@ -356,10 +356,12 @@ ssize_t bch2_hprint(char *buf, s64 v);
bool bch2_is_zero(const void *, size_t);
ssize_t bch2_snprint_string_list(char *buf, size_t size, const char * const list[],
size_t selected);
ssize_t bch2_scnprint_string_list(char *, size_t, const char * const[], size_t);
ssize_t bch2_read_string_list(const char *buf, const char * const list[]);
ssize_t bch2_read_string_list(const char *, const char * const[]);
ssize_t bch2_scnprint_flag_list(char *, size_t, const char * const[], u64);
u64 bch2_read_flag_list(char *, const char * const[]);
struct time_stats {
spinlock_t lock;
@ -787,4 +789,4 @@ void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));
#endif /* _BCACHE_UTIL_H */
#endif /* _BCACHEFS_UTIL_H */

View File

@ -1,5 +1,5 @@
#ifndef _BCACHE_XATTR_H
#define _BCACHE_XATTR_H
#ifndef _BCACHEFS_XATTR_H
#define _BCACHEFS_XATTR_H
#include "str_hash.h"
@ -20,4 +20,4 @@ ssize_t bch2_xattr_list(struct dentry *, char *, size_t);
extern const struct xattr_handler *bch2_xattr_handlers[];
#endif /* _BCACHE_XATTR_H */
#endif /* _BCACHEFS_XATTR_H */