Update bcachefs sources to da7fefde29 bcachefs: shim for userspace raid library

Kent Overstreet 2018-11-23 03:04:34 -05:00
parent c416528eaa
commit bca8b084ad
41 changed files with 3295 additions and 1018 deletions
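
The diff below adds the erasure-coding plumbing; the userspace raid shim named in the title is not in the hunks shown here. As a rough sketch only, assuming the shim exposes the kernel's lib/raid6 interface (raid6_call and friends) — an assumption, not something visible in this diff — parity generation for a stripe would look something like this; demo_stripe_gen_parity is a made-up name for illustration:

#include <linux/raid/pq.h>

/*
 * Illustration only (not part of this commit): generate P/Q parity for a
 * stripe. raid6 expects the data block pointers followed by P and Q, so the
 * disks argument is nr_data + 2.
 */
static void demo_stripe_gen_parity(void **blocks, int nr_data, size_t bytes)
{
	raid6_call.gen_syndrome(nr_data + 2, bytes, blocks);
}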

View File

@ -1 +1 @@
a9f14c773fb122a4b283fc7b79d9f98703a18890
da7fefde294e3c56359ee498a62a77182a4733cd

View File

@ -6,6 +6,8 @@
#include <linux/kobject.h>
#include <linux/types.h>
#define BIO_MAX_PAGES 256
typedef unsigned fmode_t;
struct bio;

View File

@ -9,6 +9,7 @@
#include "buckets.h"
#include "clock.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "journal_io.h"
@ -82,7 +83,8 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
case BCH_ALLOC: {
struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
if (bch_alloc_val_u64s(a.v) != bkey_val_u64s(a.k))
/* allow for unknown fields */
if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v))
return "incorrect value size";
break;
}
@ -235,6 +237,7 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
__BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key;
struct bucket *g;
struct bkey_i_alloc *a;
int ret;
u8 *d;
percpu_down_read_preempt_disable(&c->usage_lock);
@ -258,32 +261,50 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
bch2_btree_iter_set_pos(iter, a->k.p);
return bch2_btree_insert_at(c, NULL, journal_seq,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE|
flags,
BTREE_INSERT_ENTRY(iter, &a->k_i));
ret = bch2_btree_insert_at(c, NULL, journal_seq,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE|
flags,
BTREE_INSERT_ENTRY(iter, &a->k_i));
if (!ret && ca->buckets_written)
set_bit(b, ca->buckets_written);
return ret;
}
int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
{
struct bch_dev *ca;
struct btree_iter iter;
int ret;
if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode])
if (k->k.p.inode >= c->sb.nr_devices ||
!c->devs[k->k.p.inode])
return 0;
ca = bch_dev_bkey_exists(c, pos.inode);
ca = bch_dev_bkey_exists(c, k->k.p.inode);
if (pos.offset >= ca->mi.nbuckets)
if (k->k.p.offset >= ca->mi.nbuckets)
return 0;
bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, k->k.p,
BTREE_ITER_INTENT);
ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, NULL, 0);
ret = bch2_btree_iter_traverse(&iter);
if (ret)
goto err;
/* check buckets_written with btree node locked: */
ret = test_bit(k->k.p.offset, ca->buckets_written)
? 0
: bch2_btree_insert_at(c, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_JOURNAL_REPLAY,
BTREE_INSERT_ENTRY(&iter, k));
err:
bch2_btree_iter_unlock(&iter);
return ret;
}
@ -909,12 +930,6 @@ static int bch2_allocator_thread(void *arg)
pr_debug("free_inc now empty");
do {
if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
up_read(&c->gc_lock);
bch_err(ca, "gc failure");
goto stop;
}
/*
* Find some buckets that we can invalidate, either
* they're completely unused, or only contain clean data
@ -1112,6 +1127,24 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
}
mutex_unlock(&c->btree_reserve_cache_lock);
while (1) {
struct open_bucket *ob;
spin_lock(&c->freelist_lock);
if (!ca->open_buckets_partial_nr) {
spin_unlock(&c->freelist_lock);
break;
}
ob = c->open_buckets +
ca->open_buckets_partial[--ca->open_buckets_partial_nr];
ob->on_partial_list = false;
spin_unlock(&c->freelist_lock);
bch2_open_bucket_put(c, ob);
}
bch2_ec_stop_dev(c, ca);
/*
* Wake up threads that were blocked on allocation, so they can notice
* the device can no longer be removed and the capacity has changed:
@ -1254,9 +1287,6 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
bool invalidating_data = false;
int ret = 0;
if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
return -1;
if (test_alloc_startup(c)) {
invalidating_data = true;
goto not_enough;
@ -1264,51 +1294,47 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
/* Scan for buckets that are already invalidated: */
for_each_rw_member(ca, c, dev_iter) {
struct btree_iter iter;
struct bucket_array *buckets;
struct bucket_mark m;
struct bkey_s_c k;
for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), 0, k) {
if (k.k->type != BCH_ALLOC)
down_read(&ca->bucket_lock);
percpu_down_read_preempt_disable(&c->usage_lock);
buckets = bucket_array(ca);
for (bu = buckets->first_bucket;
bu < buckets->nbuckets; bu++) {
m = READ_ONCE(buckets->b[bu].mark);
if (!m.gen_valid ||
!is_available_bucket(m) ||
m.cached_sectors)
continue;
bu = k.k->p.offset;
m = READ_ONCE(bucket(ca, bu)->mark);
if (!is_available_bucket(m) || m.cached_sectors)
continue;
percpu_down_read_preempt_disable(&c->usage_lock);
bch2_mark_alloc_bucket(c, ca, bu, true,
gc_pos_alloc(c, NULL),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
percpu_up_read_preempt_enable(&c->usage_lock);
gc_pos_alloc(c, NULL), 0);
fifo_push(&ca->free_inc, bu);
if (fifo_full(&ca->free_inc))
discard_invalidated_buckets(c, ca);
if (fifo_full(&ca->free[RESERVE_BTREE]))
break;
}
bch2_btree_iter_unlock(&iter);
percpu_up_read_preempt_enable(&c->usage_lock);
up_read(&ca->bucket_lock);
}
/* did we find enough buckets? */
for_each_rw_member(ca, c, dev_iter)
if (fifo_used(&ca->free_inc) < ca->free[RESERVE_BTREE].size) {
if (!fifo_full(&ca->free[RESERVE_BTREE])) {
percpu_ref_put(&ca->io_ref);
goto not_enough;
}
return 0;
not_enough:
pr_debug("did not find enough empty buckets; issuing discards");
/* clear out free_inc, we'll be using it again below: */
for_each_rw_member(ca, c, dev_iter)
discard_invalidated_buckets(c, ca);
pr_debug("scanning for reclaimable buckets");
pr_debug("not enough empty buckets; scanning for reclaimable buckets");
for_each_rw_member(ca, c, dev_iter) {
find_reclaimable_buckets(c, ca);

View File

@ -16,7 +16,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
}
int bch2_alloc_read(struct bch_fs *, struct list_head *);
int bch2_alloc_replay_key(struct bch_fs *, struct bpos);
int bch2_alloc_replay_key(struct bch_fs *, struct bkey_i *);
static inline void bch2_wake_allocator(struct bch_dev *ca)
{

View File

@ -61,6 +61,7 @@
#include "clock.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
#include "io.h"
#include <linux/math64.h>
@ -94,6 +95,11 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
if (ob->ec) {
bch2_ec_bucket_written(c, ob);
return;
}
percpu_down_read_preempt_disable(&c->usage_lock);
spin_lock(&ob->lock);
@ -113,6 +119,19 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
closure_wake_up(&c->open_buckets_wait);
}
void bch2_open_bucket_write_error(struct bch_fs *c,
struct open_buckets *obs,
unsigned dev)
{
struct open_bucket *ob;
unsigned i;
open_bucket_for_each(c, obs, ob, i)
if (ob->ptr.dev == dev &&
ob->ec)
bch2_ec_bucket_cancel(c, ob);
}
static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
{
struct open_bucket *ob;
@ -128,15 +147,17 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
}
static void open_bucket_free_unused(struct bch_fs *c,
struct write_point *wp,
struct open_bucket *ob)
struct open_bucket *ob,
bool may_realloc)
{
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
BUG_ON(ca->open_buckets_partial_nr >=
ARRAY_SIZE(ca->open_buckets_partial));
if (wp->type == BCH_DATA_USER) {
if (ca->open_buckets_partial_nr <
ARRAY_SIZE(ca->open_buckets_partial) &&
may_realloc) {
spin_lock(&c->freelist_lock);
ob->on_partial_list = true;
ca->open_buckets_partial[ca->open_buckets_partial_nr++] =
@ -284,18 +305,18 @@ out:
return ob;
}
static int __dev_alloc_cmp(struct write_point *wp,
unsigned l, unsigned r)
static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
unsigned l, unsigned r)
{
return ((wp->next_alloc[l] > wp->next_alloc[r]) -
(wp->next_alloc[l] < wp->next_alloc[r]));
return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
(stripe->next_alloc[l] < stripe->next_alloc[r]));
}
#define dev_alloc_cmp(l, r) __dev_alloc_cmp(wp, l, r)
#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
struct write_point *wp,
struct bch_devs_mask *devs)
struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
struct dev_stripe_state *stripe,
struct bch_devs_mask *devs)
{
struct dev_alloc_list ret = { .nr = 0 };
struct bch_dev *ca;
@ -304,14 +325,14 @@ struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
for_each_member_device_rcu(ca, c, i, devs)
ret.devs[ret.nr++] = i;
bubble_sort(ret.devs, ret.nr, dev_alloc_cmp);
bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
return ret;
}
void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
struct write_point *wp)
void bch2_dev_stripe_increment(struct bch_fs *c, struct bch_dev *ca,
struct dev_stripe_state *stripe)
{
u64 *v = wp->next_alloc + ca->dev_idx;
u64 *v = stripe->next_alloc + ca->dev_idx;
u64 free_space = dev_buckets_free(c, ca);
u64 free_space_inv = free_space
? div64_u64(1ULL << 48, free_space)
@ -323,26 +344,30 @@ void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
else
*v = U64_MAX;
for (v = wp->next_alloc;
v < wp->next_alloc + ARRAY_SIZE(wp->next_alloc); v++)
for (v = stripe->next_alloc;
v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
*v = *v < scale ? 0 : *v - scale;
}
#define BUCKET_MAY_ALLOC_PARTIAL (1 << 0)
#define BUCKET_ALLOC_USE_DURABILITY (1 << 1)
static int bch2_bucket_alloc_set(struct bch_fs *c,
struct open_buckets *ptrs,
struct write_point *wp,
struct dev_stripe_state *stripe,
struct bch_devs_mask *devs_may_alloc,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
enum alloc_reserve reserve,
unsigned flags,
struct closure *cl)
{
struct dev_alloc_list devs_sorted =
bch2_wp_alloc_list(c, wp, devs_may_alloc);
bch2_dev_alloc_list(c, stripe, devs_may_alloc);
struct bch_dev *ca;
bool alloc_failure = false;
unsigned i;
unsigned i, durability;
BUG_ON(*nr_effective >= nr_replicas);
@ -353,13 +378,11 @@ static int bch2_bucket_alloc_set(struct bch_fs *c,
if (!ca)
continue;
if (!ca->mi.durability &&
(*have_cache ||
wp->type != BCH_DATA_USER))
if (!ca->mi.durability && *have_cache)
continue;
ob = bch2_bucket_alloc(c, ca, reserve,
wp->type == BCH_DATA_USER, cl);
flags & BUCKET_MAY_ALLOC_PARTIAL, cl);
if (IS_ERR(ob)) {
enum bucket_alloc_ret ret = -PTR_ERR(ob);
@ -374,13 +397,16 @@ static int bch2_bucket_alloc_set(struct bch_fs *c,
continue;
}
durability = (flags & BUCKET_ALLOC_USE_DURABILITY)
? ca->mi.durability : 1;
__clear_bit(ca->dev_idx, devs_may_alloc->d);
*nr_effective += ca->mi.durability;
*have_cache |= !ca->mi.durability;
*nr_effective += durability;
*have_cache |= !durability;
ob_push(c, ptrs, ob);
bch2_wp_rescale(c, ca, wp);
bch2_dev_stripe_increment(c, ca, stripe);
if (*nr_effective >= nr_replicas)
return 0;
@ -389,15 +415,150 @@ static int bch2_bucket_alloc_set(struct bch_fs *c,
return alloc_failure ? -ENOSPC : -EROFS;
}
/* Allocate from stripes: */
/*
* XXX: use a higher watermark for allocating open buckets here:
*/
static int ec_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
{
struct bch_devs_mask devs;
struct open_bucket *ob;
unsigned i, nr_have = 0, nr_data =
min_t(unsigned, h->nr_active_devs,
EC_STRIPE_MAX) - h->redundancy;
bool have_cache = true;
int ret = 0;
BUG_ON(h->blocks.nr > nr_data);
BUG_ON(h->parity.nr > h->redundancy);
devs = h->devs;
open_bucket_for_each(c, &h->parity, ob, i)
__clear_bit(ob->ptr.dev, devs.d);
open_bucket_for_each(c, &h->blocks, ob, i)
__clear_bit(ob->ptr.dev, devs.d);
percpu_down_read_preempt_disable(&c->usage_lock);
rcu_read_lock();
if (h->parity.nr < h->redundancy) {
nr_have = h->parity.nr;
ret = bch2_bucket_alloc_set(c, &h->parity,
&h->parity_stripe,
&devs,
h->redundancy,
&nr_have,
&have_cache,
RESERVE_NONE,
0,
NULL);
if (ret)
goto err;
}
if (h->blocks.nr < nr_data) {
nr_have = h->blocks.nr;
ret = bch2_bucket_alloc_set(c, &h->blocks,
&h->block_stripe,
&devs,
nr_data,
&nr_have,
&have_cache,
RESERVE_NONE,
0,
NULL);
if (ret)
goto err;
}
rcu_read_unlock();
percpu_up_read_preempt_enable(&c->usage_lock);
return bch2_ec_stripe_new_alloc(c, h);
err:
rcu_read_unlock();
percpu_up_read_preempt_enable(&c->usage_lock);
return -1;
}
/*
* if we can't allocate a new stripe because there are already too many
* partially filled stripes, force allocating from an existing stripe even when
* it's to a device we don't want:
*/
static void bucket_alloc_from_stripe(struct bch_fs *c,
struct open_buckets *ptrs,
struct write_point *wp,
struct bch_devs_mask *devs_may_alloc,
u16 target,
unsigned erasure_code,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache)
{
struct dev_alloc_list devs_sorted;
struct ec_stripe_head *h;
struct open_bucket *ob;
struct bch_dev *ca;
unsigned i, ec_idx;
if (!erasure_code)
return;
if (nr_replicas < 2)
return;
if (ec_open_bucket(c, ptrs))
return;
h = bch2_ec_stripe_head_get(c, target, erasure_code, nr_replicas - 1);
if (!h)
return;
if (!h->s && ec_stripe_alloc(c, h))
goto out_put_head;
rcu_read_lock();
devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
rcu_read_unlock();
for (i = 0; i < devs_sorted.nr; i++)
open_bucket_for_each(c, &h->s->blocks, ob, ec_idx)
if (ob->ptr.dev == devs_sorted.devs[i] &&
!test_and_set_bit(ec_idx, h->s->blocks_allocated))
goto got_bucket;
goto out_put_head;
got_bucket:
ca = bch_dev_bkey_exists(c, ob->ptr.dev);
ob->ec_idx = ec_idx;
ob->ec = h->s;
__clear_bit(ob->ptr.dev, devs_may_alloc->d);
*nr_effective += ca->mi.durability;
*have_cache |= !ca->mi.durability;
ob_push(c, ptrs, ob);
atomic_inc(&h->s->pin);
out_put_head:
bch2_ec_stripe_head_put(h);
}
/* Sector allocator */
static int get_buckets_from_writepoint(struct bch_fs *c,
struct open_buckets *ptrs,
struct write_point *wp,
struct bch_devs_mask *devs_may_alloc,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache)
static void get_buckets_from_writepoint(struct bch_fs *c,
struct open_buckets *ptrs,
struct write_point *wp,
struct bch_devs_mask *devs_may_alloc,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
bool need_ec)
{
struct open_buckets ptrs_skip = { .nr = 0 };
struct open_bucket *ob;
@ -409,7 +570,8 @@ static int get_buckets_from_writepoint(struct bch_fs *c,
if (*nr_effective < nr_replicas &&
test_bit(ob->ptr.dev, devs_may_alloc->d) &&
(ca->mi.durability ||
(wp->type == BCH_DATA_USER && !*have_cache))) {
(wp->type == BCH_DATA_USER && !*have_cache)) &&
(ob->ec || !need_ec)) {
__clear_bit(ob->ptr.dev, devs_may_alloc->d);
*nr_effective += ca->mi.durability;
*have_cache |= !ca->mi.durability;
@ -420,8 +582,6 @@ static int get_buckets_from_writepoint(struct bch_fs *c,
}
}
wp->ptrs = ptrs_skip;
return *nr_effective < nr_replicas ? -ENOSPC : 0;
}
static int open_bucket_add_buckets(struct bch_fs *c,
@ -429,22 +589,25 @@ static int open_bucket_add_buckets(struct bch_fs *c,
struct write_point *wp,
struct bch_devs_list *devs_have,
u16 target,
unsigned erasure_code,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
enum alloc_reserve reserve,
struct closure *cl)
struct closure *_cl)
{
struct bch_devs_mask devs;
const struct bch_devs_mask *t;
struct open_bucket *ob;
unsigned i;
struct closure *cl = NULL;
unsigned i, flags = BUCKET_ALLOC_USE_DURABILITY;
int ret;
percpu_down_read_preempt_disable(&c->usage_lock);
rcu_read_lock();
if (wp->type == BCH_DATA_USER)
flags |= BUCKET_MAY_ALLOC_PARTIAL;
devs = c->rw_devs[wp->type];
rcu_read_lock();
devs = target_rw_devs(c, wp->type, target);
rcu_read_unlock();
/* Don't allocate from devices we already have pointers to: */
for (i = 0; i < devs_have->nr; i++)
@ -453,50 +616,83 @@ static int open_bucket_add_buckets(struct bch_fs *c,
open_bucket_for_each(c, ptrs, ob, i)
__clear_bit(ob->ptr.dev, devs.d);
t = bch2_target_to_mask(c, target);
if (t)
bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
if (erasure_code) {
get_buckets_from_writepoint(c, ptrs, wp, &devs,
nr_replicas, nr_effective,
have_cache, true);
if (*nr_effective >= nr_replicas)
return 0;
ret = get_buckets_from_writepoint(c, ptrs, wp, &devs,
nr_replicas, nr_effective, have_cache);
if (!ret)
goto out;
bucket_alloc_from_stripe(c, ptrs, wp, &devs,
target, erasure_code,
nr_replicas, nr_effective,
have_cache);
if (*nr_effective >= nr_replicas)
return 0;
}
get_buckets_from_writepoint(c, ptrs, wp, &devs,
nr_replicas, nr_effective,
have_cache, false);
if (*nr_effective >= nr_replicas)
return 0;
percpu_down_read_preempt_disable(&c->usage_lock);
rcu_read_lock();
retry_blocking:
/*
* Try nonblocking first, so that if one device is full we'll try from
* other devices:
*/
ret = bch2_bucket_alloc_set(c, ptrs, wp, &devs,
ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs,
nr_replicas, nr_effective, have_cache,
reserve, NULL);
if (!ret || ret == -EROFS || !cl)
goto out;
reserve, flags, cl);
if (ret && ret != -EROFS && !cl && _cl) {
cl = _cl;
goto retry_blocking;
}
ret = bch2_bucket_alloc_set(c, ptrs, wp, &devs,
nr_replicas, nr_effective, have_cache,
reserve, cl);
out:
rcu_read_unlock();
percpu_up_read_preempt_enable(&c->usage_lock);
return ret;
}
void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca,
struct open_buckets *obs,
enum bch_data_type data_type)
{
struct open_buckets ptrs = { .nr = 0 };
struct open_bucket *ob, *ob2;
unsigned i, j;
open_bucket_for_each(c, obs, ob, i) {
bool drop = !ca || ob->ptr.dev == ca->dev_idx;
if (!drop && ob->ec) {
mutex_lock(&ob->ec->lock);
open_bucket_for_each(c, &ob->ec->blocks, ob2, j)
drop |= ob2->ptr.dev == ca->dev_idx;
open_bucket_for_each(c, &ob->ec->parity, ob2, j)
drop |= ob2->ptr.dev == ca->dev_idx;
mutex_unlock(&ob->ec->lock);
}
if (drop)
bch2_open_bucket_put(c, ob);
else
ob_push(c, &ptrs, ob);
}
*obs = ptrs;
}
void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
struct write_point *wp)
{
struct open_buckets ptrs = { .nr = 0 };
struct open_bucket *ob;
unsigned i;
mutex_lock(&wp->lock);
open_bucket_for_each(c, &wp->ptrs, ob, i)
if (!ca || ob->ptr.dev == ca->dev_idx)
open_bucket_free_unused(c, wp, ob);
else
ob_push(c, &ptrs, ob);
wp->ptrs = ptrs;
bch2_open_buckets_stop_dev(c, ca, &wp->ptrs, wp->type);
mutex_unlock(&wp->lock);
}
@ -629,6 +825,7 @@ out:
*/
struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
unsigned target,
unsigned erasure_code,
struct write_point_specifier write_point,
struct bch_devs_list *devs_have,
unsigned nr_replicas,
@ -648,26 +845,37 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
BUG_ON(!nr_replicas || !nr_replicas_required);
retry:
write_points_nr = c->write_points_nr;
wp = writepoint_find(c, write_point.v);
/* metadata may not allocate on cache devices: */
if (wp->type != BCH_DATA_USER)
have_cache = true;
if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, target,
ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
target, erasure_code,
nr_replicas, &nr_effective,
&have_cache, reserve, cl);
} else {
ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, target,
ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
target, erasure_code,
nr_replicas, &nr_effective,
&have_cache, reserve, NULL);
if (!ret)
goto alloc_done;
ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, 0,
ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
0, erasure_code,
nr_replicas, &nr_effective,
&have_cache, reserve, cl);
}
alloc_done:
BUG_ON(!ret && nr_effective < nr_replicas);
if (erasure_code && !ec_open_bucket(c, &ptrs))
pr_debug("failed to get ec bucket: ret %u", ret);
if (ret == -EROFS &&
nr_effective >= nr_replicas_required)
ret = 0;
@ -677,7 +885,7 @@ alloc_done:
/* Free buckets we didn't use: */
open_bucket_for_each(c, &wp->ptrs, ob, i)
open_bucket_free_unused(c, wp, ob);
open_bucket_free_unused(c, ob, wp->type == BCH_DATA_USER);
wp->ptrs = ptrs;
@ -696,7 +904,8 @@ err:
if (ptrs.nr < ARRAY_SIZE(ptrs.v))
ob_push(c, &ptrs, ob);
else
open_bucket_free_unused(c, wp, ob);
open_bucket_free_unused(c, ob,
wp->type == BCH_DATA_USER);
wp->ptrs = ptrs;
mutex_unlock(&wp->lock);
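
The write-point changes above replace the per-write-point next_alloc array with a shared struct dev_stripe_state (bch2_dev_alloc_list / bch2_dev_stripe_increment). A simplified sketch of the weighting idea, with made-up demo_* names: each device accumulates cost inversely proportional to its free space and the lowest-cost device is picked next, so emptier devices are chosen more often while writes still rotate; the real code additionally rescales all counters downward so they never overflow:

struct demo_stripe_state { u64 next_alloc[BCH_SB_MEMBERS_MAX]; };

static unsigned demo_pick_dev(struct demo_stripe_state *s,
			      const u64 *buckets_free, unsigned nr_devs)
{
	unsigned i, best = 0;

	/* prefer the device with the lowest accumulated cost: */
	for (i = 1; i < nr_devs; i++)
		if (s->next_alloc[i] < s->next_alloc[best])
			best = i;

	/* charge the chosen device inversely to its free space: */
	s->next_alloc[best] += buckets_free[best]
		? div64_u64(1ULL << 48, buckets_free[best])
		: U64_MAX;
	return best;
}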

View File

@ -16,11 +16,11 @@ struct dev_alloc_list {
u8 devs[BCH_SB_MEMBERS_MAX];
};
struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *,
struct write_point *,
struct bch_devs_mask *);
void bch2_wp_rescale(struct bch_fs *, struct bch_dev *,
struct write_point *);
struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
struct dev_stripe_state *,
struct bch_devs_mask *);
void bch2_dev_stripe_increment(struct bch_fs *, struct bch_dev *,
struct dev_stripe_state *);
long bch2_bucket_alloc_new_fs(struct bch_dev *);
@ -42,6 +42,22 @@ static inline void ob_push(struct bch_fs *c, struct open_buckets *obs,
((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \
(_i)++)
static inline struct open_bucket *ec_open_bucket(struct bch_fs *c,
struct open_buckets *obs)
{
struct open_bucket *ob;
unsigned i;
open_bucket_for_each(c, obs, ob, i)
if (ob->ec)
return ob;
return NULL;
}
void bch2_open_bucket_write_error(struct bch_fs *,
struct open_buckets *, unsigned);
void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
@ -75,7 +91,7 @@ static inline void bch2_open_bucket_get(struct bch_fs *c,
}
struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
unsigned,
unsigned, unsigned,
struct write_point_specifier,
struct bch_devs_list *,
unsigned, unsigned,
@ -87,6 +103,9 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
struct bkey_i_extent *, unsigned);
void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *,
struct open_buckets *, enum bch_data_type);
void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *,
struct write_point *);

View File

@ -7,6 +7,8 @@
#include "clock_types.h"
#include "fifo.h"
struct ec_bucket_buf;
/* There's two of these clocks, one for reads and one for writes: */
struct bucket_clock {
/*
@ -55,8 +57,10 @@ struct open_bucket {
u8 freelist;
bool valid;
bool on_partial_list;
u8 ec_idx;
unsigned sectors_free;
struct bch_extent_ptr ptr;
struct ec_stripe_new *ec;
};
#define OPEN_BUCKET_LIST_MAX 15
@ -66,18 +70,23 @@ struct open_buckets {
u8 v[OPEN_BUCKET_LIST_MAX];
};
struct dev_stripe_state {
u64 next_alloc[BCH_SB_MEMBERS_MAX];
};
struct write_point {
struct hlist_node node;
struct mutex lock;
u64 last_used;
unsigned long write_point;
enum bch_data_type type;
bool is_ec;
/* calculated based on how many pointers we're actually going to use: */
unsigned sectors_free;
struct open_buckets ptrs;
u64 next_alloc[BCH_SB_MEMBERS_MAX];
struct dev_stripe_state stripe;
};
struct write_point_specifier {

View File

@ -201,7 +201,7 @@
#include <linux/dynamic_fault.h>
#define bch2_fs_init_fault(name) \
#define bch2_fs_init_fault(name) \
dynamic_fault("bcachefs:bch_fs_init:" name)
#define bch2_meta_read_fault(name) \
dynamic_fault("bcachefs:meta:read:" name)
@ -270,7 +270,10 @@ do { \
BCH_DEBUG_PARAM(test_alloc_startup, \
"Force allocator startup to use the slowpath where it" \
"can't find enough free buckets without invalidating" \
"cached data")
"cached data") \
BCH_DEBUG_PARAM(force_reconstruct_read, \
"Force reads to use the reconstruct path, when reading" \
"from erasure coded extents")
#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
@ -308,6 +311,7 @@ enum bch_time_stats {
#include "btree_types.h"
#include "buckets_types.h"
#include "clock_types.h"
#include "ec_types.h"
#include "journal_types.h"
#include "keylist_types.h"
#include "quota_types.h"
@ -330,13 +334,16 @@ enum gc_phase {
GC_PHASE_START,
GC_PHASE_SB,
#define DEF_BTREE_ID(kwd, val, name) GC_PHASE_BTREE_##kwd,
DEFINE_BCH_BTREE_IDS()
#undef DEF_BTREE_ID
GC_PHASE_BTREE_EC,
GC_PHASE_BTREE_EXTENTS,
GC_PHASE_BTREE_INODES,
GC_PHASE_BTREE_DIRENTS,
GC_PHASE_BTREE_XATTRS,
GC_PHASE_BTREE_ALLOC,
GC_PHASE_BTREE_QUOTAS,
GC_PHASE_PENDING_DELETE,
GC_PHASE_ALLOC,
GC_PHASE_DONE
};
struct gc_pos {
@ -381,14 +388,14 @@ struct bch_dev {
* gc_lock, for device resize - holding any is sufficient for access:
* Or rcu_read_lock(), but only for ptr_stale():
*/
struct bucket_array __rcu *buckets;
struct bucket_array __rcu *buckets[2];
unsigned long *buckets_dirty;
unsigned long *buckets_written;
/* most out of date gen in the btree */
u8 *oldest_gens;
struct rw_semaphore bucket_lock;
struct bch_dev_usage __percpu *usage_percpu;
struct bch_dev_usage usage_cached;
struct bch_dev_usage __percpu *usage[2];
/* Allocator: */
struct task_struct __rcu *alloc_thread;
@ -466,7 +473,6 @@ enum {
/* errors: */
BCH_FS_ERROR,
BCH_FS_GC_FAILURE,
/* misc: */
BCH_FS_BDEV_MOUNTED,
@ -602,8 +608,8 @@ struct bch_fs {
atomic64_t sectors_available;
struct bch_fs_usage __percpu *usage_percpu;
struct bch_fs_usage usage_cached;
struct bch_fs_usage __percpu *usage[2];
struct percpu_rw_semaphore usage_lock;
struct closure_waitlist freelist_wait;
@ -644,9 +650,6 @@ struct bch_fs {
*
* gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.)
*
* gc_cur_phase == GC_PHASE_DONE indicates that gc is finished/not
* currently running, and gc marks are currently valid
*
* Protected by gc_pos_lock. Only written to by GC thread, so GC thread
* can read without a lock.
*/
@ -681,6 +684,21 @@ struct bch_fs {
/* REBALANCE */
struct bch_fs_rebalance rebalance;
/* ERASURE CODING */
struct list_head ec_new_stripe_list;
struct mutex ec_new_stripe_lock;
GENRADIX(struct ec_stripe) ec_stripes;
struct mutex ec_stripes_lock;
ec_stripes_heap ec_stripes_heap;
spinlock_t ec_stripes_heap_lock;
struct bio_set ec_bioset;
struct work_struct ec_stripe_delete_work;
struct llist_head ec_stripe_delete_list;
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
struct bio_set dio_write_bioset;

View File

@ -233,6 +233,9 @@ struct bkey_packed {
} __attribute__((packed, aligned(8)));
#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64))
#define BKEY_U64s_MAX U8_MAX
#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s)
#define KEY_PACKED_BITS_START 24
#define KEY_FORMAT_LOCAL_BTREE 0
@ -460,8 +463,9 @@ enum bch_compression_type {
x(ptr, 0) \
x(crc32, 1) \
x(crc64, 2) \
x(crc128, 3)
#define BCH_EXTENT_ENTRY_MAX 4
x(crc128, 3) \
x(stripe_ptr, 4)
#define BCH_EXTENT_ENTRY_MAX 5
enum bch_extent_entry_type {
#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
@ -552,7 +556,7 @@ struct bch_extent_ptr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:1,
cached:1,
erasure_coded:1,
unused:1,
reservation:1,
offset:44, /* 8 petabytes */
dev:8,
@ -562,23 +566,35 @@ struct bch_extent_ptr {
dev:8,
offset:44,
reservation:1,
erasure_coded:1,
unused:1,
cached:1,
type:1;
#endif
} __attribute__((packed, aligned(8)));
struct bch_extent_reservation {
struct bch_extent_stripe_ptr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:5,
unused:23,
block:8,
idx:51;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 idx:51,
block:8,
type:5;
#endif
};
struct bch_extent_reservation {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:6,
unused:22,
replicas:4,
generation:32;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 generation:32,
replicas:4,
unused:23,
type:5;
unused:22,
type:6;
#endif
};
@ -701,7 +717,8 @@ BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION);
BCH_INODE_FIELD(bi_data_replicas, 8) \
BCH_INODE_FIELD(bi_promote_target, 16) \
BCH_INODE_FIELD(bi_foreground_target, 16) \
BCH_INODE_FIELD(bi_background_target, 16)
BCH_INODE_FIELD(bi_background_target, 16) \
BCH_INODE_FIELD(bi_erasure_code, 16)
#define BCH_INODE_FIELDS_INHERIT() \
BCH_INODE_FIELD(bi_data_checksum) \
@ -711,7 +728,8 @@ BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION);
BCH_INODE_FIELD(bi_data_replicas) \
BCH_INODE_FIELD(bi_promote_target) \
BCH_INODE_FIELD(bi_foreground_target) \
BCH_INODE_FIELD(bi_background_target)
BCH_INODE_FIELD(bi_background_target) \
BCH_INODE_FIELD(bi_erasure_code)
enum {
/*
@ -871,6 +889,27 @@ struct bch_quota {
} __attribute__((packed, aligned(8)));
BKEY_VAL_TYPE(quota, BCH_QUOTA);
/* Erasure coding */
enum {
BCH_STRIPE = 128,
};
struct bch_stripe {
struct bch_val v;
__le16 sectors;
__u8 algorithm;
__u8 nr_blocks;
__u8 nr_redundant;
__u8 csum_granularity_bits;
__u8 csum_type;
__u8 pad;
struct bch_extent_ptr ptrs[0];
} __attribute__((packed, aligned(8)));
BKEY_VAL_TYPE(stripe, BCH_STRIPE);
/* Optional/variable size superblock sections: */
struct bch_sb_field {
@ -1060,7 +1099,7 @@ struct bch_sb_field_quota {
struct bch_disk_group {
__u8 label[BCH_SB_LABEL_SIZE];
__le64 flags[2];
};
} __attribute__((packed, aligned(8)));
LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
@ -1069,7 +1108,7 @@ LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24)
struct bch_sb_field_disk_groups {
struct bch_sb_field field;
struct bch_disk_group entries[0];
};
} __attribute__((packed, aligned(8)));
/*
* On clean shutdown, store btree roots and current journal sequence number in
@ -1235,12 +1274,15 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
struct bch_sb, flags[2], 0, 4);
LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64);
LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
/* Features: */
enum bch_sb_features {
BCH_FEATURE_LZ4 = 0,
BCH_FEATURE_GZIP = 1,
BCH_FEATURE_ZSTD = 2,
BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */
BCH_FEATURE_EC = 4,
BCH_FEATURE_NR,
};
@ -1407,7 +1449,8 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
DEF_BTREE_ID(DIRENTS, 2, "dirents") \
DEF_BTREE_ID(XATTRS, 3, "xattrs") \
DEF_BTREE_ID(ALLOC, 4, "alloc") \
DEF_BTREE_ID(QUOTAS, 5, "quotas")
DEF_BTREE_ID(QUOTAS, 5, "quotas") \
DEF_BTREE_ID(EC, 6, "erasure_coding")
#define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val,
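
struct bch_stripe above is a variable-length value: nr_blocks pointers follow the fixed header, and per-block checksums (governed by csum_granularity_bits and csum_type) presumably follow the pointer array, though that layout is not shown in this hunk. A hedged sketch of how the value size would work out under that assumption, with a made-up helper name and a placeholder checksum width:

/*
 * Illustration only, based on assumptions about the on-disk layout: one
 * checksum per (1 << csum_granularity_bits) sectors per block, stored after
 * the pointer array.
 */
static unsigned demo_stripe_val_bytes(const struct bch_stripe *s)
{
	unsigned csums_per_block =
		DIV_ROUND_UP(le16_to_cpu(s->sectors),
			     1U << s->csum_granularity_bits);
	unsigned csum_bytes = 4;	/* e.g. crc32c; real width depends on csum_type */

	return sizeof(*s) +
	       s->nr_blocks * sizeof(struct bch_extent_ptr) +
	       s->nr_blocks * csums_per_block * csum_bytes;
}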

View File

@ -579,6 +579,8 @@ BKEY_VAL_ACCESSORS(alloc, BCH_ALLOC);
BKEY_VAL_ACCESSORS(quota, BCH_QUOTA);
BKEY_VAL_ACCESSORS(stripe, BCH_STRIPE);
/* byte order helpers */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__

View File

@ -4,6 +4,7 @@
#include "btree_types.h"
#include "alloc_background.h"
#include "dirent.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
#include "inode.h"
@ -17,6 +18,7 @@ const struct bkey_ops bch2_bkey_ops[] = {
[BKEY_TYPE_XATTRS] = bch2_bkey_xattr_ops,
[BKEY_TYPE_ALLOC] = bch2_bkey_alloc_ops,
[BKEY_TYPE_QUOTAS] = bch2_bkey_quota_ops,
[BKEY_TYPE_EC] = bch2_bkey_ec_ops,
[BKEY_TYPE_BTREE] = bch2_bkey_btree_ops,
};

View File

@ -14,6 +14,7 @@
#include "buckets.h"
#include "clock.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
@ -113,6 +114,7 @@ static bool bkey_type_needs_gc(enum bkey_type type)
switch (type) {
case BKEY_TYPE_BTREE:
case BKEY_TYPE_EXTENTS:
case BKEY_TYPE_EC:
return true;
default:
return false;
@ -153,6 +155,17 @@ static u8 ptr_gens_recalc_oldest(struct bch_fs *c,
}
}
break;
case BKEY_TYPE_EC:
switch (k.k->type) {
case BCH_STRIPE: {
struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
for (ptr = s.v->ptrs;
ptr < s.v->ptrs + s.v->nr_blocks;
ptr++)
ptr_gen_recalc_oldest(c, ptr, &max_stale);
}
}
default:
break;
}
@ -214,6 +227,21 @@ static int ptr_gens_check(struct bch_fs *c, enum bkey_type type,
}
}
break;
case BKEY_TYPE_EC:
switch (k.k->type) {
case BCH_STRIPE: {
struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
for (ptr = s.v->ptrs;
ptr < s.v->ptrs + s.v->nr_blocks;
ptr++) {
ret = ptr_gen_check(c, type, ptr);
if (ret)
return ret;
}
}
}
break;
default:
break;
}
@ -229,8 +257,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
{
struct gc_pos pos = { 0 };
unsigned flags =
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD|
BCH_BUCKET_MARK_GC|
(initial ? BCH_BUCKET_MARK_NOATOMIC : 0);
int ret = 0;
@ -359,15 +386,27 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
return 0;
}
static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
{
return (int) btree_id_to_gc_phase(l) -
(int) btree_id_to_gc_phase(r);
}
static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal,
bool initial)
{
enum btree_id ids[BTREE_ID_NR];
unsigned i;
for (i = 0; i < BTREE_ID_NR; i++) {
enum bkey_type type = bkey_type(0, i);
for (i = 0; i < BTREE_ID_NR; i++)
ids[i] = i;
bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
int ret = bch2_gc_btree(c, i, initial);
for (i = 0; i < BTREE_ID_NR; i++) {
enum btree_id id = ids[i];
enum bkey_type type = bkey_type(0, id);
int ret = bch2_gc_btree(c, id, initial);
if (ret)
return ret;
@ -441,9 +480,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
BCH_DATA_SB, flags);
}
if (c)
spin_lock(&c->journal.lock);
for (i = 0; i < ca->journal.nr; i++) {
b = ca->journal.buckets[i];
bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL,
@ -453,7 +489,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
if (c) {
percpu_up_read_preempt_enable(&c->usage_lock);
spin_unlock(&c->journal.lock);
} else {
preempt_enable();
}
@ -468,9 +503,7 @@ static void bch2_mark_superblocks(struct bch_fs *c)
gc_pos_set(c, gc_phase(GC_PHASE_SB));
for_each_online_member(ca, c, i)
bch2_mark_dev_superblock(c, ca,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_GC);
mutex_unlock(&c->sb_lock);
}
@ -478,7 +511,6 @@ static void bch2_mark_superblocks(struct bch_fs *c)
static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
{
struct gc_pos pos = { 0 };
struct bch_fs_usage stats = { 0 };
struct btree_update *as;
struct pending_btree_node_free *d;
@ -490,13 +522,8 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
bch2_mark_key(c, BKEY_TYPE_BTREE,
bkey_i_to_s_c(&d->key),
true, 0,
pos, &stats, 0,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
/*
* Don't apply stats - pending deletes aren't tracked in
* bch_alloc_stats:
*/
pos, NULL, 0,
BCH_BUCKET_MARK_GC);
mutex_unlock(&c->btree_interior_update_lock);
}
@ -517,8 +544,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
fifo_for_each_entry(i, &ca->free_inc, iter)
bch2_mark_alloc_bucket(c, ca, i, true,
gc_pos_alloc(c, NULL),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
BCH_BUCKET_MARK_GC);
@ -526,8 +552,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
fifo_for_each_entry(i, &ca->free[j], iter)
bch2_mark_alloc_bucket(c, ca, i, true,
gc_pos_alloc(c, NULL),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
BCH_BUCKET_MARK_GC);
}
spin_unlock(&c->freelist_lock);
@ -541,8 +566,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
ca = bch_dev_bkey_exists(c, ob->ptr.dev);
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true,
gc_pos_alloc(c, ob),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
BCH_BUCKET_MARK_GC);
}
spin_unlock(&ob->lock);
}
@ -550,121 +574,310 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
percpu_up_read_preempt_enable(&c->usage_lock);
}
static void bch2_gc_start(struct bch_fs *c)
static void bch2_gc_free(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
for_each_member_device(ca, c, i) {
kvpfree(rcu_dereference_protected(ca->buckets[1], 1),
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
ca->buckets[1] = NULL;
free_percpu(ca->usage[1]);
ca->usage[1] = NULL;
}
free_percpu(c->usage[1]);
c->usage[1] = NULL;
}
static void bch2_gc_done_nocheck(struct bch_fs *c)
{
struct bch_dev *ca;
struct bucket_array *buckets;
struct bucket_mark new;
unsigned i;
size_t b;
int cpu;
for_each_member_device(ca, c, i) {
struct bucket_array *src = __bucket_array(ca, 1);
memcpy(__bucket_array(ca, 0), src,
sizeof(struct bucket_array) +
sizeof(struct bucket) * src->nbuckets);
};
for_each_member_device(ca, c, i) {
struct bch_dev_usage *p;
for_each_possible_cpu(cpu) {
p = per_cpu_ptr(ca->usage[0], cpu);
memset(p, 0, sizeof(*p));
}
preempt_disable();
*this_cpu_ptr(ca->usage[0]) = __bch2_dev_usage_read(ca, 1);
preempt_enable();
}
{
struct bch_fs_usage src = __bch2_fs_usage_read(c, 1);
struct bch_fs_usage *p;
for_each_possible_cpu(cpu) {
p = per_cpu_ptr(c->usage[0], cpu);
memset(p, 0, offsetof(typeof(*p), online_reserved));
}
preempt_disable();
memcpy(this_cpu_ptr(c->usage[0]),
&src,
offsetof(typeof(*p), online_reserved));
preempt_enable();
}
}
static void bch2_gc_done(struct bch_fs *c, bool initial)
{
struct bch_dev *ca;
unsigned i;
int cpu;
#define copy_field(_f, _msg, ...) \
if (dst._f != src._f) { \
pr_info(_msg ": got %llu, should be %llu, fixing" \
, ##__VA_ARGS__, dst._f, src._f); \
dst._f = src._f; \
}
#define copy_bucket_field(_f) \
if (dst->b[b].mark._f != src->b[b].mark._f) { \
pr_info("dev %u bucket %zu has wrong " #_f \
": got %u, should be %u, fixing", \
i, b, dst->b[b].mark._f, src->b[b].mark._f); \
dst->b[b]._mark._f = src->b[b].mark._f; \
}
#define copy_dev_field(_f, _msg, ...) \
copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
#define copy_fs_field(_f, _msg, ...) \
copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
percpu_down_write(&c->usage_lock);
/*
* Indicates to buckets code that gc is now in progress - done under
* usage_lock to avoid racing with bch2_mark_key():
*/
__gc_pos_set(c, gc_phase(GC_PHASE_START));
if (initial) {
bch2_gc_done_nocheck(c);
goto out;
}
/* Save a copy of the existing bucket stats while we recompute them: */
for_each_member_device(ca, c, i) {
ca->usage_cached = __bch2_dev_usage_read(ca);
struct bucket_array *dst = __bucket_array(ca, 0);
struct bucket_array *src = __bucket_array(ca, 1);
size_t b;
if (initial) {
memcpy(dst, src,
sizeof(struct bucket_array) +
sizeof(struct bucket) * dst->nbuckets);
}
for (b = 0; b < src->nbuckets; b++) {
copy_bucket_field(gen);
copy_bucket_field(data_type);
copy_bucket_field(owned_by_allocator);
copy_bucket_field(stripe);
copy_bucket_field(dirty_sectors);
copy_bucket_field(cached_sectors);
}
};
for_each_member_device(ca, c, i) {
struct bch_dev_usage dst = __bch2_dev_usage_read(ca, 0);
struct bch_dev_usage src = __bch2_dev_usage_read(ca, 1);
struct bch_dev_usage *p;
unsigned b;
for (b = 0; b < BCH_DATA_NR; b++)
copy_dev_field(buckets[b],
"buckets[%s]", bch2_data_types[b]);
copy_dev_field(buckets_alloc, "buckets_alloc");
copy_dev_field(buckets_ec, "buckets_ec");
for (b = 0; b < BCH_DATA_NR; b++)
copy_dev_field(sectors[b],
"sectors[%s]", bch2_data_types[b]);
copy_dev_field(sectors_fragmented,
"sectors_fragmented");
for_each_possible_cpu(cpu) {
struct bch_dev_usage *p =
per_cpu_ptr(ca->usage_percpu, cpu);
p = per_cpu_ptr(ca->usage[0], cpu);
memset(p, 0, sizeof(*p));
}
preempt_disable();
p = this_cpu_ptr(ca->usage[0]);
*p = dst;
preempt_enable();
}
{
struct bch_fs_usage dst = __bch2_fs_usage_read(c, 0);
struct bch_fs_usage src = __bch2_fs_usage_read(c, 1);
struct bch_fs_usage *p;
unsigned r, b;
for (r = 0; r < BCH_REPLICAS_MAX; r++) {
for (b = 0; b < BCH_DATA_NR; b++)
copy_fs_field(replicas[r].data[b],
"replicas[%i].data[%s]",
r, bch2_data_types[b]);
copy_fs_field(replicas[r].ec_data,
"replicas[%i].ec_data", r);
copy_fs_field(replicas[r].persistent_reserved,
"replicas[%i].persistent_reserved", r);
}
for (b = 0; b < BCH_DATA_NR; b++)
copy_fs_field(buckets[b],
"buckets[%s]", bch2_data_types[b]);
for_each_possible_cpu(cpu) {
p = per_cpu_ptr(c->usage[0], cpu);
memset(p, 0, offsetof(typeof(*p), online_reserved));
}
preempt_disable();
p = this_cpu_ptr(c->usage[0]);
memcpy(p, &dst, offsetof(typeof(*p), online_reserved));
preempt_enable();
}
out:
percpu_up_write(&c->usage_lock);
#undef copy_field
#undef copy_fs_field
#undef copy_dev_field
#undef copy_bucket_field
}
static int bch2_gc_start(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
BUG_ON(c->usage[1]);
c->usage[1] = alloc_percpu(struct bch_fs_usage);
if (!c->usage[1])
return -ENOMEM;
for_each_member_device(ca, c, i) {
BUG_ON(ca->buckets[1]);
BUG_ON(ca->usage[1]);
ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO);
if (!ca->buckets[1]) {
percpu_ref_put(&ca->ref);
return -ENOMEM;
}
ca->usage[1] = alloc_percpu(struct bch_dev_usage);
if (!ca->usage[1]) {
percpu_ref_put(&ca->ref);
return -ENOMEM;
}
}
c->usage_cached = __bch2_fs_usage_read(c);
for_each_possible_cpu(cpu) {
struct bch_fs_usage *p =
per_cpu_ptr(c->usage_percpu, cpu);
percpu_down_write(&c->usage_lock);
memset(p->replicas, 0, sizeof(p->replicas));
memset(p->buckets, 0, sizeof(p->buckets));
}
for_each_member_device(ca, c, i) {
struct bucket_array *dst = __bucket_array(ca, 1);
struct bucket_array *src = __bucket_array(ca, 0);
size_t b;
dst->first_bucket = src->first_bucket;
dst->nbuckets = src->nbuckets;
for (b = 0; b < src->nbuckets; b++)
dst->b[b]._mark.gen = src->b[b].mark.gen;
};
percpu_up_write(&c->usage_lock);
/* Clear bucket marks: */
for_each_member_device(ca, c, i) {
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
bucket_cmpxchg(buckets->b + b, new, ({
new.owned_by_allocator = 0;
new.data_type = 0;
new.cached_sectors = 0;
new.dirty_sectors = 0;
}));
ca->oldest_gens[b] = new.gen;
}
up_read(&ca->bucket_lock);
}
return 0;
}
/**
* bch_gc - recompute bucket marks and oldest_gen, rewrite btree nodes
* bch2_gc - walk _all_ references to buckets, and recompute them:
*
* Order matters here:
* - Concurrent GC relies on the fact that we have a total ordering for
* everything that GC walks - see gc_will_visit_node(),
* gc_will_visit_root()
*
* - also, references move around in the course of index updates and
* various other crap: everything needs to agree on the ordering
* references are allowed to move around in - e.g., we're allowed to
* start with a reference owned by an open_bucket (the allocator) and
* move it to the btree, but not the reverse.
*
* This is necessary to ensure that gc doesn't miss references that
* move around - if references move backwards in the ordering GC
* uses, GC could skip past them
*/
void bch2_gc(struct bch_fs *c)
int bch2_gc(struct bch_fs *c, struct list_head *journal, bool initial)
{
struct bch_dev *ca;
u64 start_time = local_clock();
unsigned i;
unsigned i, iter = 0;
int ret;
/*
* Walk _all_ references to buckets, and recompute them:
*
* Order matters here:
* - Concurrent GC relies on the fact that we have a total ordering for
* everything that GC walks - see gc_will_visit_node(),
* gc_will_visit_root()
*
* - also, references move around in the course of index updates and
* various other crap: everything needs to agree on the ordering
* references are allowed to move around in - e.g., we're allowed to
* start with a reference owned by an open_bucket (the allocator) and
* move it to the btree, but not the reverse.
*
* This is necessary to ensure that gc doesn't miss references that
* move around - if references move backwards in the ordering GC
* uses, GC could skip past them
*/
trace_gc_start(c);
/*
* Do this before taking gc_lock - bch2_disk_reservation_get() blocks on
* gc_lock if sectors_available goes to 0:
*/
bch2_recalc_sectors_available(c);
down_write(&c->gc_lock);
if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
again:
ret = bch2_gc_start(c);
if (ret)
goto out;
bch2_gc_start(c);
bch2_mark_superblocks(c);
ret = bch2_gc_btrees(c, NULL, false);
if (ret) {
bch_err(c, "btree gc failed: %d", ret);
set_bit(BCH_FS_GC_FAILURE, &c->flags);
ret = bch2_gc_btrees(c, journal, initial);
if (ret)
goto out;
}
bch2_mark_pending_btree_node_frees(c);
bch2_mark_allocator_buckets(c);
/* Indicates that gc is no longer in progress: */
gc_pos_set(c, gc_phase(GC_PHASE_DONE));
c->gc_count++;
out:
if (!ret && test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
/*
* XXX: make sure gens we fixed got saved
*/
if (iter++ <= 2) {
bch_info(c, "Fixed gens, restarting mark and sweep:");
clear_bit(BCH_FS_FIXED_GENS, &c->flags);
goto again;
}
bch_info(c, "Unable to fix bucket gens, looping");
ret = -EINVAL;
}
if (!ret)
bch2_gc_done(c, initial);
/* Indicates that gc is no longer in progress: */
__gc_pos_set(c, gc_phase(GC_PHASE_START));
bch2_gc_free(c);
up_write(&c->gc_lock);
if (!ret && initial)
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
trace_gc_end(c);
bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
@ -680,6 +893,7 @@ out:
* allocator thread - issue wakeup in case they blocked on gc_lock:
*/
closure_wake_up(&c->freelist_wait);
return ret;
}
/* Btree coalescing */
@ -995,9 +1209,6 @@ void bch2_coalesce(struct bch_fs *c)
{
enum btree_id id;
if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
return;
down_read(&c->gc_lock);
trace_gc_coalesce_start(c);
@ -1009,7 +1220,6 @@ void bch2_coalesce(struct bch_fs *c)
if (ret) {
if (ret != -ESHUTDOWN)
bch_err(c, "btree coalescing failed: %d", ret);
set_bit(BCH_FS_GC_FAILURE, &c->flags);
return;
}
}
@ -1024,6 +1234,7 @@ static int bch2_gc_thread(void *arg)
struct io_clock *clock = &c->io_clock[WRITE];
unsigned long last = atomic_long_read(&clock->now);
unsigned last_kick = atomic_read(&c->kick_gc);
int ret;
set_freezable();
@ -1057,7 +1268,9 @@ static int bch2_gc_thread(void *arg)
last = atomic_long_read(&clock->now);
last_kick = atomic_read(&c->kick_gc);
bch2_gc(c);
ret = bch2_gc(c, NULL, false);
if (ret)
bch_err(c, "btree gc failed: %i", ret);
debug_check_no_locks_held();
}
@ -1098,30 +1311,7 @@ int bch2_gc_thread_start(struct bch_fs *c)
int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
{
unsigned iter = 0;
int ret = 0;
down_write(&c->gc_lock);
again:
bch2_gc_start(c);
bch2_mark_superblocks(c);
ret = bch2_gc_btrees(c, journal, true);
if (ret)
goto err;
if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
if (iter++ > 2) {
bch_info(c, "Unable to fix bucket gens, looping");
ret = -EINVAL;
goto err;
}
bch_info(c, "Fixed gens, restarting initial mark and sweep:");
clear_bit(BCH_FS_FIXED_GENS, &c->flags);
goto again;
}
int ret = bch2_gc(c, journal, true);
/*
* Skip past versions that might have possibly been used (as nonces),
@ -1130,9 +1320,5 @@ again:
if (c->sb.encryption_type)
atomic64_add(1 << 16, &c->key_version);
gc_pos_set(c, gc_phase(GC_PHASE_DONE));
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
err:
up_write(&c->gc_lock);
return ret;
}
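
The gc rework above keeps two copies of bucket marks and usage counters (index 0 live, index 1 recomputed by gc) and reconciles them in bch2_gc_done() via the copy_field()/copy_bucket_field() macros. The same pattern, distilled into a standalone, hypothetical helper purely for illustration:

/*
 * Recompute counters into a second copy during gc, then reconcile with the
 * live copy, logging and fixing any drift.
 */
static void demo_gc_reconcile(u64 *live, const u64 *gc, unsigned nr,
			      const char *what)
{
	unsigned i;

	for (i = 0; i < nr; i++)
		if (live[i] != gc[i]) {
			pr_info("%s[%u]: got %llu, should be %llu, fixing",
				what, i, live[i], gc[i]);
			live[i] = gc[i];
		}
}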

View File

@ -6,7 +6,7 @@
enum bkey_type;
void bch2_coalesce(struct bch_fs *);
void bch2_gc(struct bch_fs *);
int bch2_gc(struct bch_fs *, struct list_head *, bool);
void bch2_gc_thread_stop(struct bch_fs *);
int bch2_gc_thread_start(struct bch_fs *);
int bch2_initial_gc(struct bch_fs *, struct list_head *);
@ -54,11 +54,22 @@ static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
return 0;
}
static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id)
{
switch (id) {
#define DEF_BTREE_ID(n, v, s) case BTREE_ID_##n: return GC_PHASE_BTREE_##n;
DEFINE_BCH_BTREE_IDS()
#undef DEF_BTREE_ID
default:
BUG();
}
}
static inline struct gc_pos gc_pos_btree(enum btree_id id,
struct bpos pos, unsigned level)
{
return (struct gc_pos) {
.phase = GC_PHASE_BTREE_EXTENTS + id,
.phase = btree_id_to_gc_phase(id),
.pos = pos,
.level = level,
};
@ -93,14 +104,14 @@ static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *o
};
}
static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos)
static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
{
unsigned seq;
bool ret;
do {
seq = read_seqcount_begin(&c->gc_pos_lock);
ret = gc_pos_cmp(c->gc_pos, pos) < 0;
ret = gc_pos_cmp(pos, c->gc_pos) <= 0;
} while (read_seqcount_retry(&c->gc_pos_lock, seq));
return ret;

View File

@ -817,7 +817,7 @@ static inline int btree_iter_lock_root(struct btree_iter *iter,
*/
iter->level = depth_want;
iter->l[iter->level].b = NULL;
return 0;
return 1;
}
lock_type = __btree_lock_want(iter, iter->level);
@ -1044,6 +1044,9 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
? btree_iter_down(iter)
: btree_iter_lock_root(iter, depth_want);
if (unlikely(ret)) {
if (ret == 1)
return 0;
iter->level = depth_want;
iter->l[iter->level].b = BTREE_ITER_NOT_END;
return ret;

View File

@ -159,7 +159,6 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
{
struct bch_fs *c = as->c;
struct pending_btree_node_free *d;
unsigned replicas;
/*
* btree_update lock is only needed here to avoid racing with
@ -177,15 +176,6 @@ found:
BUG_ON(d->index_update_done);
d->index_update_done = true;
/*
* Btree nodes are accounted as freed in bch_alloc_stats when they're
* freed from the index:
*/
replicas = bch2_extent_nr_dirty_ptrs(k);
if (replicas)
stats->replicas[replicas - 1].data[BCH_DATA_BTREE] -=
c->opts.btree_node_size * replicas;
/*
* We're dropping @k from the btree, but it's still live until the
* index update is persistent so we need to keep a reference around for
@ -207,15 +197,16 @@ found:
* bch2_mark_key() compares the current gc pos to the pos we're
* moving this reference from, hence one comparison here:
*/
if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
struct bch_fs_usage tmp = { 0 };
if (gc_pos_cmp(c->gc_pos, b
? gc_pos_btree_node(b)
: gc_pos_btree_root(as->btree_id)) >= 0 &&
gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
struct gc_pos pos = { 0 };
bch2_mark_key(c, BKEY_TYPE_BTREE,
bkey_i_to_s_c(&d->key),
false, 0, b
? gc_pos_btree_node(b)
: gc_pos_btree_root(as->btree_id),
&tmp, 0, 0);
false, 0, pos,
NULL, 0, BCH_BUCKET_MARK_GC);
/*
* Don't apply tmp - pending deletes aren't tracked in
* bch_alloc_stats:
@ -286,19 +277,13 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
static void bch2_btree_node_free_ondisk(struct bch_fs *c,
struct pending_btree_node_free *pending)
{
struct bch_fs_usage stats = { 0 };
BUG_ON(!pending->index_update_done);
bch2_mark_key(c, BKEY_TYPE_BTREE,
bkey_i_to_s_c(&pending->key),
false, 0,
gc_phase(GC_PHASE_PENDING_DELETE),
&stats, 0, 0);
/*
* Don't apply stats - pending deletes aren't tracked in
* bch_alloc_stats:
*/
NULL, 0, 0);
}
static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
@ -339,7 +324,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
mutex_unlock(&c->btree_reserve_cache_lock);
retry:
wp = bch2_alloc_sectors_start(c, c->opts.foreground_target,
wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0,
writepoint_ptr(&c->btree_write_point),
&devs_have,
res->nr_replicas,
@ -637,12 +622,12 @@ static void btree_update_wait_on_journal(struct closure *cl)
int ret;
ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
if (ret < 0)
goto err;
if (!ret) {
if (ret == -EAGAIN) {
continue_at(cl, btree_update_wait_on_journal, system_wq);
return;
}
if (ret < 0)
goto err;
bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
err:

View File

@ -343,19 +343,40 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
trans_for_each_entry(trans, i)
BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
u64s = 0;
trans_for_each_entry(trans, i)
u64s += jset_u64s(i->k->k.u64s);
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)
? bch2_journal_res_get(&c->journal,
&trans->journal_res,
u64s, u64s)
: 0;
if (ret)
return ret;
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
u64s = 0;
trans_for_each_entry(trans, i)
u64s += jset_u64s(i->k->k.u64s);
while ((ret = bch2_journal_res_get(&c->journal,
&trans->journal_res, u64s,
JOURNAL_RES_GET_NONBLOCK)) == -EAGAIN) {
struct btree_iter *iter = trans->entries[0].iter;
struct closure cl;
bch2_btree_iter_unlock(iter);
closure_init_stack(&cl);
while ((ret = bch2_journal_open_seq_async(&c->journal,
trans->journal_res.seq,
&cl)) == -EAGAIN)
closure_sync(&cl);
if (ret)
return ret;
if (!bch2_btree_iter_relock(iter)) {
trans_restart(" (iter relock after journal res get blocked)");
return -EINTR;
}
}
if (ret)
return ret;
}
multi_lock_write(c, trans);

View File

@ -68,6 +68,7 @@
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
#include "ec.h"
#include "error.h"
#include "movinggc.h"
@ -83,8 +84,7 @@ static inline u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
static void bch2_fs_stats_verify(struct bch_fs *c)
{
struct bch_fs_usage stats =
__bch2_fs_usage_read(c);
struct bch_fs_usage stats = __bch2_fs_usage_read(c, 0);
unsigned i, j;
for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) {
@ -207,43 +207,24 @@ do { \
_acc; \
})
#define bch2_usage_read_cached(_c, _cached, _uncached) \
({ \
typeof(_cached) _ret; \
unsigned _seq; \
\
do { \
_seq = read_seqcount_begin(&(_c)->gc_pos_lock); \
_ret = (_c)->gc_pos.phase == GC_PHASE_DONE \
? bch2_usage_read_raw(_uncached) \
: (_cached); \
} while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq)); \
\
_ret; \
})
struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca)
struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca, bool gc)
{
return bch2_usage_read_raw(ca->usage_percpu);
return bch2_usage_read_raw(ca->usage[gc]);
}
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
{
return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu);
return bch2_usage_read_raw(ca->usage[0]);
}
struct bch_fs_usage
__bch2_fs_usage_read(struct bch_fs *c)
struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *c, bool gc)
{
return bch2_usage_read_raw(c->usage_percpu);
return bch2_usage_read_raw(c->usage[gc]);
}
struct bch_fs_usage
bch2_fs_usage_read(struct bch_fs *c)
struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *c)
{
return bch2_usage_read_cached(c,
c->usage_cached,
c->usage_percpu);
return bch2_usage_read_raw(c->usage[0]);
}
struct fs_usage_sum {
@ -269,6 +250,7 @@ static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats)
for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) {
sum.data += stats.replicas[i].data[BCH_DATA_BTREE];
sum.data += stats.replicas[i].data[BCH_DATA_USER];
sum.data += stats.replicas[i].ec_data;
sum.cached += stats.replicas[i].data[BCH_DATA_CACHED];
sum.reserved += stats.replicas[i].persistent_reserved;
}
@ -324,13 +306,11 @@ static inline enum bch_data_type bucket_type(struct bucket_mark m)
: m.data_type;
}
static bool bucket_became_unavailable(struct bch_fs *c,
struct bucket_mark old,
static bool bucket_became_unavailable(struct bucket_mark old,
struct bucket_mark new)
{
return is_available_bucket(old) &&
!is_available_bucket(new) &&
(!c || c->gc_pos.phase == GC_PHASE_DONE);
!is_available_bucket(new);
}
void bch2_fs_usage_apply(struct bch_fs *c,
@ -360,12 +340,14 @@ void bch2_fs_usage_apply(struct bch_fs *c,
percpu_down_read_preempt_disable(&c->usage_lock);
/* online_reserved not subject to gc: */
this_cpu_ptr(c->usage_percpu)->online_reserved +=
this_cpu_ptr(c->usage[0])->online_reserved +=
stats->online_reserved;
stats->online_reserved = 0;
if (!gc_will_visit(c, gc_pos))
bch2_usage_add(this_cpu_ptr(c->usage_percpu), stats);
bch2_usage_add(this_cpu_ptr(c->usage[0]), stats);
if (gc_visited(c, gc_pos))
bch2_usage_add(this_cpu_ptr(c->usage[1]), stats);
bch2_fs_stats_verify(c);
percpu_up_read_preempt_enable(&c->usage_lock);
@ -374,8 +356,9 @@ void bch2_fs_usage_apply(struct bch_fs *c,
}
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
struct bch_fs_usage *stats,
struct bucket_mark old, struct bucket_mark new)
struct bch_fs_usage *fs_usage,
struct bucket_mark old, struct bucket_mark new,
bool gc)
{
struct bch_dev_usage *dev_usage;
@ -387,16 +370,22 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
bch2_data_types[old.data_type],
bch2_data_types[new.data_type]);
stats->buckets[bucket_type(old)] -= ca->mi.bucket_size;
stats->buckets[bucket_type(new)] += ca->mi.bucket_size;
dev_usage = this_cpu_ptr(ca->usage[gc]);
dev_usage = this_cpu_ptr(ca->usage_percpu);
dev_usage->buckets[bucket_type(old)]--;
dev_usage->buckets[bucket_type(new)]++;
if (bucket_type(old) != bucket_type(new)) {
if (bucket_type(old)) {
fs_usage->buckets[bucket_type(old)] -= ca->mi.bucket_size;
dev_usage->buckets[bucket_type(old)]--;
} else {
fs_usage->buckets[bucket_type(new)] += ca->mi.bucket_size;
dev_usage->buckets[bucket_type(new)]++;
}
}
dev_usage->buckets_alloc +=
(int) new.owned_by_allocator - (int) old.owned_by_allocator;
dev_usage->buckets_ec +=
(int) new.stripe - (int) old.stripe;
dev_usage->buckets_unavailable +=
is_unavailable_bucket(new) - is_unavailable_bucket(old);
@ -417,21 +406,18 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
({ \
struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
\
bch2_dev_usage_update(c, ca, stats, _old, new); \
bch2_dev_usage_update(c, ca, stats, _old, new, gc); \
_old; \
})
void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark *old)
static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark *old,
bool gc)
{
struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu);
struct bucket *g;
struct bch_fs_usage *stats = this_cpu_ptr(c->usage[gc]);
struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark new;
percpu_rwsem_assert_held(&c->usage_lock);
g = bucket(ca, b);
*old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
BUG_ON(!is_available_bucket(new));
@ -442,38 +428,49 @@ void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
new.gen++;
}));
/*
* This isn't actually correct yet, since fs usage is still
* uncompressed sectors:
*/
stats->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors;
}
void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark *old)
{
percpu_rwsem_assert_held(&c->usage_lock);
__bch2_invalidate_bucket(c, ca, b, old, false);
if (!old->owned_by_allocator && old->cached_sectors)
trace_invalidate(ca, bucket_to_sector(ca, b),
old->cached_sectors);
}
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator,
struct gc_pos pos, unsigned flags)
static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator,
bool gc)
{
struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu);
struct bucket *g;
struct bch_fs_usage *stats = this_cpu_ptr(c->usage[gc]);
struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark old, new;
percpu_rwsem_assert_held(&c->usage_lock);
g = bucket(ca, b);
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
gc_will_visit(c, pos))
return;
old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
new.owned_by_allocator = owned_by_allocator;
}));
BUG_ON(!owned_by_allocator && !old.owned_by_allocator &&
c->gc_pos.phase == GC_PHASE_DONE);
BUG_ON(!gc &&
!owned_by_allocator && !old.owned_by_allocator);
}
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator,
struct gc_pos pos, unsigned flags)
{
percpu_rwsem_assert_held(&c->usage_lock);
if (!(flags & BCH_BUCKET_MARK_GC))
__bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, false);
if ((flags & BCH_BUCKET_MARK_GC) ||
gc_visited(c, pos))
__bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, true);
}
#define checked_add(a, b) \
@ -483,35 +480,47 @@ do { \
BUG_ON((a) != _res); \
} while (0)
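
checked_add() above guards against truncation: bucket sector counts are narrow fields (u16 in struct bucket_mark), so the sum must still fit after the store. A standalone sketch of the same pattern, assuming a u16 counter (the name checked_add_u16 is illustrative, not from the tree):

#include <assert.h>
#include <stdint.h>

static inline void checked_add_u16(uint16_t *a, unsigned b)
{
	unsigned res = *a + b;

	/* in the kernel this is BUG_ON((a) != _res) */
	assert(res == (uint16_t) res);
	*a = (uint16_t) res;
}
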
static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, enum bch_data_type type,
unsigned sectors, bool gc)
{
struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark old, new;
BUG_ON(type != BCH_DATA_SB &&
type != BCH_DATA_JOURNAL);
old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
new.data_type = type;
checked_add(new.dirty_sectors, sectors);
}));
fs_usage->replicas[0].data[type] += sectors;
}
void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, enum bch_data_type type,
unsigned sectors, struct gc_pos pos,
unsigned flags)
{
struct bch_fs_usage *stats;
struct bucket *g;
struct bucket_mark old, new;
BUG_ON(type != BCH_DATA_SB &&
type != BCH_DATA_JOURNAL);
if (likely(c)) {
percpu_rwsem_assert_held(&c->usage_lock);
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
gc_will_visit(c, pos))
return;
stats = this_cpu_ptr(c->usage_percpu);
g = bucket(ca, b);
old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
new.data_type = type;
checked_add(new.dirty_sectors, sectors);
}));
stats->replicas[0].data[type] += sectors;
if (!(flags & BCH_BUCKET_MARK_GC))
__bch2_mark_metadata_bucket(c, ca, b, type, sectors,
false);
if ((flags & BCH_BUCKET_MARK_GC) ||
gc_visited(c, pos))
__bch2_mark_metadata_bucket(c, ca, b, type, sectors,
true);
} else {
struct bucket *g;
struct bucket_mark old, new;
rcu_read_lock();
g = bucket(ca, b);
@ -522,9 +531,6 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
rcu_read_unlock();
}
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
bucket_became_unavailable(c, old, new));
}
static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors)
@ -569,23 +575,15 @@ static void bch2_mark_pointer(struct bch_fs *c,
struct extent_ptr_decoded p,
s64 sectors, enum bch_data_type data_type,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags)
u64 journal_seq, unsigned flags,
bool gc)
{
struct bucket_mark old, new;
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
struct bucket *g = PTR_BUCKET(ca, &p.ptr);
size_t b = PTR_BUCKET_NR(ca, &p.ptr);
struct bucket *g = __bucket(ca, b, gc);
u64 v;
if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) {
if (journal_seq)
bucket_cmpxchg(g, new, ({
new.journal_seq_valid = 1;
new.journal_seq = journal_seq;
}));
return;
}
v = atomic64_read(&g->_mark.v);
do {
new.v.counter = old.v.counter = v;
@ -627,17 +625,59 @@ static void bch2_mark_pointer(struct bch_fs *c,
old.v.counter,
new.v.counter)) != old.v.counter);
bch2_dev_usage_update(c, ca, fs_usage, old, new);
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
bucket_became_unavailable(c, old, new));
BUG_ON(!gc && bucket_became_unavailable(old, new));
}
static void bch2_mark_stripe_ptr(struct bch_fs *c,
struct bch_extent_stripe_ptr p,
s64 sectors, unsigned flags,
s64 *adjusted_disk_sectors,
unsigned *redundancy)
{
struct ec_stripe *m;
unsigned old, new, nr_data;
int blocks_nonempty_delta;
s64 parity_sectors;
m = genradix_ptr(&c->ec_stripes, p.idx);
if (WARN_ON(!m))
return;
if (WARN_ON(!m->alive))
return;
nr_data = m->nr_blocks - m->nr_redundant;
parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data);
if (sectors < 0)
parity_sectors = -parity_sectors;
*adjusted_disk_sectors += parity_sectors;
*redundancy = max_t(unsigned, *redundancy, m->nr_redundant + 1);
new = atomic_add_return(sectors, &m->block_sectors[p.block]);
old = new - sectors;
blocks_nonempty_delta = (int) !!new - (int) !!old;
if (!blocks_nonempty_delta)
return;
atomic_add(blocks_nonempty_delta, &m->blocks_nonempty);
BUG_ON(atomic_read(&m->blocks_nonempty) < 0);
bch2_stripes_heap_update(c, m, p.idx);
}
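
The parity adjustment above charges a data write its share of the stripe's parity, DIV_ROUND_UP(sectors * nr_redundant, nr_data). A worked example with a hypothetical 4+2 stripe (illustrative only, not part of the patch):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned nr_blocks = 6, nr_redundant = 2;	/* a 4+2 stripe */
	unsigned nr_data = nr_blocks - nr_redundant;
	long sectors = 8;				/* data sectors written */
	long parity = DIV_ROUND_UP(sectors * nr_redundant, nr_data);

	/* 8 data sectors + 4 parity sectors = 12 adjusted disk sectors */
	printf("adjusted disk sectors: %ld\n", sectors + parity);
	return 0;
}
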
static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
s64 sectors, enum bch_data_type data_type,
struct gc_pos pos,
struct bch_fs_usage *stats,
u64 journal_seq, unsigned flags)
u64 journal_seq, unsigned flags,
bool gc)
{
BUG_ON(!sectors);
@ -649,28 +689,43 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
struct extent_ptr_decoded p;
s64 cached_sectors = 0;
s64 dirty_sectors = 0;
s64 ec_sectors = 0;
unsigned replicas = 0;
unsigned ec_redundancy = 0;
unsigned i;
extent_for_each_ptr_decode(e, p, entry) {
s64 disk_sectors = ptr_disk_sectors(e, p, sectors);
s64 adjusted_disk_sectors = disk_sectors;
bch2_mark_pointer(c, e, p, disk_sectors, data_type,
stats, journal_seq, flags);
stats, journal_seq, flags, gc);
if (!p.ptr.cached)
for (i = 0; i < p.ec_nr; i++)
bch2_mark_stripe_ptr(c, p.ec[i],
disk_sectors, flags,
&adjusted_disk_sectors,
&ec_redundancy);
if (!p.ptr.cached)
replicas++;
if (p.ptr.cached)
cached_sectors += disk_sectors;
cached_sectors += adjusted_disk_sectors;
else if (!p.ec_nr)
dirty_sectors += adjusted_disk_sectors;
else
dirty_sectors += disk_sectors;
ec_sectors += adjusted_disk_sectors;
}
replicas = clamp_t(unsigned, replicas,
1, ARRAY_SIZE(stats->replicas));
ec_redundancy = clamp_t(unsigned, ec_redundancy,
1, ARRAY_SIZE(stats->replicas));
stats->replicas[0].data[BCH_DATA_CACHED] += cached_sectors;
stats->replicas[replicas - 1].data[data_type] += dirty_sectors;
stats->replicas[ec_redundancy - 1].ec_data += ec_sectors;
break;
}
case BCH_RESERVATION: {
@ -686,6 +741,105 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
}
}
static void bucket_set_stripe(struct bch_fs *c,
const struct bch_stripe *v,
bool enabled,
struct bch_fs_usage *fs_usage,
u64 journal_seq,
bool gc)
{
unsigned i;
for (i = 0; i < v->nr_blocks; i++) {
const struct bch_extent_ptr *ptr = v->ptrs + i;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
size_t b = PTR_BUCKET_NR(ca, ptr);
struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark new, old;
BUG_ON(ptr_stale(ca, ptr));
old = bucket_cmpxchg(g, new, ({
new.stripe = enabled;
if (journal_seq) {
new.journal_seq_valid = 1;
new.journal_seq = journal_seq;
}
}));
BUG_ON(old.stripe == enabled);
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
}
}
static void bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
bool inserting,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags,
bool gc)
{
switch (k.k->type) {
case BCH_STRIPE: {
struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
size_t idx = s.k->p.offset;
struct ec_stripe *m = genradix_ptr(&c->ec_stripes, idx);
unsigned i;
BUG_ON(!m);
BUG_ON(m->alive == inserting);
BUG_ON(atomic_read(&m->blocks_nonempty));
for (i = 0; i < EC_STRIPE_MAX; i++)
BUG_ON(atomic_read(&m->block_sectors[i]));
if (inserting) {
m->sectors = le16_to_cpu(s.v->sectors);
m->algorithm = s.v->algorithm;
m->nr_blocks = s.v->nr_blocks;
m->nr_redundant = s.v->nr_redundant;
}
if (inserting)
bch2_stripes_heap_insert(c, m, idx);
else
bch2_stripes_heap_del(c, m, idx);
bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc);
break;
}
}
}
static void __bch2_mark_key(struct bch_fs *c,
enum bkey_type type, struct bkey_s_c k,
bool inserting, s64 sectors,
struct bch_fs_usage *stats,
u64 journal_seq, unsigned flags,
bool gc)
{
switch (type) {
case BKEY_TYPE_BTREE:
bch2_mark_extent(c, k, inserting
? c->opts.btree_node_size
: -c->opts.btree_node_size,
BCH_DATA_BTREE,
stats, journal_seq, flags, gc);
break;
case BKEY_TYPE_EXTENTS:
bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
stats, journal_seq, flags, gc);
break;
case BKEY_TYPE_EC:
bch2_mark_stripe(c, k, inserting,
stats, journal_seq, flags, gc);
break;
default:
break;
}
}
void bch2_mark_key(struct bch_fs *c,
enum bkey_type type, struct bkey_s_c k,
bool inserting, s64 sectors,
@ -693,57 +847,23 @@ void bch2_mark_key(struct bch_fs *c,
struct bch_fs_usage *stats,
u64 journal_seq, unsigned flags)
{
/*
* synchronization w.r.t. GC:
*
* Normally, bucket sector counts/marks are updated on the fly, as
* references are added/removed from the btree, the lists of buckets the
* allocator owns, other metadata buckets, etc.
*
* When GC is in progress and going to mark this reference, we do _not_
* mark this reference here, to avoid double counting - GC will count it
* when it gets to it.
*
* To know whether we should mark a given reference (GC either isn't
* running, or has already marked references at this position) we
* construct a total order for everything GC walks. Then, we can simply
* compare the position of the reference we're marking - @pos - with
* GC's current position. If GC is going to mark this reference, GC's
* current position will be less than @pos; if GC's current position is
* greater than @pos GC has either already walked this position, or
* isn't running.
*
* To avoid racing with GC's position changing, we have to deal with
* - GC's position being set to GC_POS_MIN when GC starts:
* usage_lock guards against this
* - GC's position overtaking @pos: we guard against this with
* whatever lock protects the data structure the reference lives in
* (e.g. the btree node lock, or the relevant allocator lock).
*/
percpu_down_read_preempt_disable(&c->usage_lock);
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
gc_will_visit(c, pos))
flags |= BCH_BUCKET_MARK_GC_WILL_VISIT;
if (!stats)
stats = this_cpu_ptr(c->usage_percpu);
if (!(flags & BCH_BUCKET_MARK_GC)) {
if (!stats)
stats = this_cpu_ptr(c->usage[0]);
switch (type) {
case BKEY_TYPE_BTREE:
bch2_mark_extent(c, k, inserting
? c->opts.btree_node_size
: -c->opts.btree_node_size,
BCH_DATA_BTREE,
pos, stats, journal_seq, flags);
break;
case BKEY_TYPE_EXTENTS:
bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
pos, stats, journal_seq, flags);
break;
default:
break;
__bch2_mark_key(c, type, k, inserting, sectors,
stats, journal_seq, flags, false);
}
if ((flags & BCH_BUCKET_MARK_GC) ||
gc_visited(c, pos)) {
__bch2_mark_key(c, type, k, inserting, sectors,
this_cpu_ptr(c->usage[1]),
journal_seq, flags, true);
}
percpu_up_read_preempt_enable(&c->usage_lock);
}
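
bch2_mark_key() now dispatches the same update to up to two sets of counters: usage[0] is always the live copy, and usage[1] additionally receives the update when GC has already visited this position, or when the caller passes BCH_BUCKET_MARK_GC. A minimal sketch of that pattern reduced to plain counters (illustrative only, not bcachefs code):

#include <stdbool.h>

struct counters {
	long live;	/* plays the role of usage[0] */
	long gc;	/* plays the role of usage[1] */
};

static void mark(struct counters *c, long delta, bool gc_only, bool gc_visited)
{
	if (!gc_only)
		c->live += delta;

	/* keep GC's recount consistent with updates that race with it */
	if (gc_only || gc_visited)
		c->gc += delta;
}
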
@ -819,28 +939,20 @@ void bch2_mark_update(struct btree_insert *trans,
/* Disk reservations: */
static u64 __recalc_sectors_available(struct bch_fs *c)
static u64 bch2_recalc_sectors_available(struct bch_fs *c)
{
int cpu;
for_each_possible_cpu(cpu)
per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;
per_cpu_ptr(c->usage[0], cpu)->available_cache = 0;
return avail_factor(bch2_fs_sectors_free(c, bch2_fs_usage_read(c)));
}
/* Used by gc when it's starting: */
void bch2_recalc_sectors_available(struct bch_fs *c)
{
percpu_down_write(&c->usage_lock);
atomic64_set(&c->sectors_available, __recalc_sectors_available(c));
percpu_up_write(&c->usage_lock);
}
void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
{
percpu_down_read_preempt_disable(&c->usage_lock);
this_cpu_sub(c->usage_percpu->online_reserved,
this_cpu_sub(c->usage[0]->online_reserved,
res->sectors);
bch2_fs_stats_verify(c);
@ -860,7 +972,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
int ret;
percpu_down_read_preempt_disable(&c->usage_lock);
stats = this_cpu_ptr(c->usage_percpu);
stats = this_cpu_ptr(c->usage[0]);
if (sectors <= stats->available_cache)
goto out;
@ -908,7 +1020,7 @@ recalculate:
}
percpu_down_write(&c->usage_lock);
sectors_available = __recalc_sectors_available(c);
sectors_available = bch2_recalc_sectors_available(c);
if (sectors <= sectors_available ||
(flags & BCH_DISK_RESERVATION_NOFAIL)) {
@ -949,6 +1061,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
struct bucket_array *buckets = NULL, *old_buckets = NULL;
unsigned long *buckets_dirty = NULL;
unsigned long *buckets_written = NULL;
u8 *oldest_gens = NULL;
alloc_fifo free[RESERVE_NR];
alloc_fifo free_inc;
@ -962,7 +1075,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7);
size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12),
btree_reserve);
bool resize = ca->buckets != NULL,
bool resize = ca->buckets[0] != NULL,
start_copygc = ca->copygc_thread != NULL;
int ret = -ENOMEM;
unsigned i;
@ -980,6 +1093,9 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
!(buckets_dirty = kvpmalloc(BITS_TO_LONGS(nbuckets) *
sizeof(unsigned long),
GFP_KERNEL|__GFP_ZERO)) ||
!(buckets_written = kvpmalloc(BITS_TO_LONGS(nbuckets) *
sizeof(unsigned long),
GFP_KERNEL|__GFP_ZERO)) ||
!init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) ||
!init_fifo(&free[RESERVE_MOVINGGC],
copygc_reserve, GFP_KERNEL) ||
@ -1014,13 +1130,17 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
memcpy(buckets_dirty,
ca->buckets_dirty,
BITS_TO_LONGS(n) * sizeof(unsigned long));
memcpy(buckets_written,
ca->buckets_written,
BITS_TO_LONGS(n) * sizeof(unsigned long));
}
rcu_assign_pointer(ca->buckets, buckets);
rcu_assign_pointer(ca->buckets[0], buckets);
buckets = old_buckets;
swap(ca->oldest_gens, oldest_gens);
swap(ca->buckets_dirty, buckets_dirty);
swap(ca->buckets_written, buckets_written);
if (resize)
percpu_up_write(&c->usage_lock);
@ -1060,6 +1180,8 @@ err:
free_fifo(&free[i]);
kvpfree(buckets_dirty,
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
kvpfree(buckets_written,
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
kvpfree(oldest_gens,
nbuckets * sizeof(u8));
if (buckets)
@ -1077,19 +1199,21 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
free_fifo(&ca->free_inc);
for (i = 0; i < RESERVE_NR; i++)
free_fifo(&ca->free[i]);
kvpfree(ca->buckets_written,
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
kvpfree(ca->buckets_dirty,
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
kvpfree(rcu_dereference_protected(ca->buckets, 1),
kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
free_percpu(ca->usage_percpu);
free_percpu(ca->usage[0]);
}
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
if (!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)))
if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
return -ENOMEM;
return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);


@ -28,23 +28,34 @@
_old; \
})
static inline struct bucket_array *bucket_array(struct bch_dev *ca)
static inline struct bucket_array *__bucket_array(struct bch_dev *ca,
bool gc)
{
return rcu_dereference_check(ca->buckets,
return rcu_dereference_check(ca->buckets[gc],
!ca->fs ||
percpu_rwsem_is_held(&ca->fs->usage_lock) ||
lockdep_is_held(&ca->fs->gc_lock) ||
lockdep_is_held(&ca->bucket_lock));
}
static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
static inline struct bucket_array *bucket_array(struct bch_dev *ca)
{
struct bucket_array *buckets = bucket_array(ca);
return __bucket_array(ca, false);
}
static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc)
{
struct bucket_array *buckets = __bucket_array(ca, gc);
BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
return buckets->b + b;
}
static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
{
return __bucket(ca, b, false);
}
static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca,
size_t b, int rw)
{
@ -128,7 +139,7 @@ static inline bool bucket_unused(struct bucket_mark mark)
/* Device usage: */
struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *);
struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *, bool);
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
static inline u64 __dev_buckets_available(struct bch_dev *ca,
@ -167,7 +178,7 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
/* Filesystem usage: */
struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *);
struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *, bool);
struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *);
void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
struct disk_reservation *, struct gc_pos);
@ -184,6 +195,7 @@ static inline bool is_available_bucket(struct bucket_mark mark)
{
return (!mark.owned_by_allocator &&
!mark.dirty_sectors &&
!mark.stripe &&
!mark.nouse);
}
@ -205,17 +217,13 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
struct gc_pos, unsigned);
#define BCH_BUCKET_MARK_NOATOMIC (1 << 0)
#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 1)
#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 2)
#define BCH_BUCKET_MARK_GC_LOCK_HELD (1 << 3)
#define BCH_BUCKET_MARK_GC (1 << 1)
void bch2_mark_key(struct bch_fs *, enum bkey_type, struct bkey_s_c,
bool, s64, struct gc_pos,
struct bch_fs_usage *, u64, unsigned);
void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *);
void bch2_recalc_sectors_available(struct bch_fs *);
void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
static inline void bch2_disk_reservation_put(struct bch_fs *c,


@ -18,7 +18,8 @@ struct bucket_mark {
gen_valid:1,
owned_by_allocator:1,
nouse:1,
journal_seq_valid:1;
journal_seq_valid:1,
stripe:1;
u16 dirty_sectors;
u16 cached_sectors;
@ -52,6 +53,7 @@ struct bucket_array {
struct bch_dev_usage {
u64 buckets[BCH_DATA_NR];
u64 buckets_alloc;
u64 buckets_ec;
u64 buckets_unavailable;
/* _compressed_ sectors: */
@ -61,15 +63,18 @@ struct bch_dev_usage {
struct bch_fs_usage {
/* all fields are in units of 512 byte sectors: */
u64 online_reserved;
u64 available_cache;
struct {
u64 data[BCH_DATA_NR];
u64 ec_data;
u64 persistent_reserved;
} replicas[BCH_REPLICAS_MAX];
u64 buckets[BCH_DATA_NR];
/* fields starting here aren't touched by gc: */
u64 online_reserved;
u64 available_cache;
};
/*


@ -601,11 +601,13 @@ have_compressed:
goto out;
}
ret = mempool_init_kmalloc_pool(
&c->decompress_workspace,
1, decompress_workspace_size);
if (ret)
goto out;
if (!mempool_initialized(&c->decompress_workspace)) {
ret = mempool_init_kmalloc_pool(
&c->decompress_workspace,
1, decompress_workspace_size);
if (ret)
goto out;
}
out:
pr_verbose_init(c->opts, "ret %i", ret);
return ret;


@ -54,6 +54,19 @@ static inline struct target target_decode(unsigned target)
}
const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c,
enum bch_data_type data_type,
u16 target)
{
struct bch_devs_mask devs = c->rw_devs[data_type];
const struct bch_devs_mask *t = bch2_target_to_mask(c, target);
if (t)
bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
return devs;
}
bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);
int bch2_disk_path_find(struct bch_sb_handle *, const char *);

libbcachefs/ec.c: new file, 1283 lines (diff suppressed because it is too large)

libbcachefs/ec.h: new file, 108 lines

@ -0,0 +1,108 @@
#ifndef _BCACHEFS_EC_H
#define _BCACHEFS_EC_H
#include "ec_types.h"
#include "keylist_types.h"
const char *bch2_ec_key_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_ec_key_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
#define bch2_bkey_ec_ops (struct bkey_ops) { \
.key_invalid = bch2_ec_key_invalid, \
.val_to_text = bch2_ec_key_to_text, \
}
struct bch_read_bio;
struct ec_stripe_buf {
/* might not be buffering the entire stripe: */
unsigned offset;
unsigned size;
unsigned long valid[BITS_TO_LONGS(EC_STRIPE_MAX)];
void *data[EC_STRIPE_MAX];
union {
struct bkey_i_stripe key;
u64 pad[255];
};
};
struct ec_stripe_head;
struct ec_stripe_new {
struct bch_fs *c;
struct ec_stripe_head *h;
struct mutex lock;
struct list_head list;
/* counts in-flight writes; the stripe is created when pin == 0 */
atomic_t pin;
int err;
unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)];
struct open_buckets blocks;
struct open_buckets parity;
struct keylist keys;
u64 inline_keys[BKEY_U64s * 8];
struct ec_stripe_buf stripe;
};
struct ec_stripe_head {
struct list_head list;
struct mutex lock;
struct list_head stripes;
unsigned target;
unsigned algo;
unsigned redundancy;
struct bch_devs_mask devs;
unsigned nr_active_devs;
unsigned blocksize;
struct dev_stripe_state block_stripe;
struct dev_stripe_state parity_stripe;
struct open_buckets blocks;
struct open_buckets parity;
struct ec_stripe_new *s;
};
int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *);
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
void bch2_ec_add_backpointer(struct bch_fs *, struct write_point *,
struct bpos, unsigned);
void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *);
void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
void bch2_ec_stripe_head_put(struct ec_stripe_head *);
struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned,
unsigned, unsigned);
void bch2_stripes_heap_update(struct bch_fs *, struct ec_stripe *, size_t);
void bch2_stripes_heap_del(struct bch_fs *, struct ec_stripe *, size_t);
void bch2_stripes_heap_insert(struct bch_fs *, struct ec_stripe *, size_t);
void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
void bch2_ec_flush_new_stripes(struct bch_fs *);
int bch2_fs_ec_start(struct bch_fs *);
void bch2_fs_ec_exit(struct bch_fs *);
int bch2_fs_ec_init(struct bch_fs *);
#endif /* _BCACHEFS_EC_H */

libbcachefs/ec_types.h: new file, 30 lines

@ -0,0 +1,30 @@
#ifndef _BCACHEFS_EC_TYPES_H
#define _BCACHEFS_EC_TYPES_H
#include <linux/llist.h>
#define EC_STRIPE_MAX 16
struct ec_stripe {
size_t heap_idx;
u16 sectors;
u8 algorithm;
u8 nr_blocks;
u8 nr_redundant;
u8 alive;
atomic_t pin;
atomic_t blocks_nonempty;
atomic_t block_sectors[EC_STRIPE_MAX];
};
struct ec_stripe_heap_entry {
size_t idx;
unsigned blocks_nonempty;
};
typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap;
#endif /* _BCACHEFS_EC_TYPES_H */
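
As bch2_mark_stripe_ptr() earlier in this patch maintains it, blocks_nonempty equals the number of blocks whose block_sectors count is nonzero. A standalone restatement of that invariant (illustrative helper, not from the tree):

static unsigned count_nonempty_blocks(const unsigned block_sectors[],
				      unsigned nr_blocks)
{
	unsigned i, nonempty = 0;

	for (i = 0; i < nr_blocks; i++)
		nonempty += block_sectors[i] != 0;

	/* equals the blocks_nonempty counter the marking code maintains */
	return nonempty;
}
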


@ -193,29 +193,41 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
return nr_ptrs;
}
unsigned bch2_extent_ptr_durability(struct bch_fs *c,
const struct bch_extent_ptr *ptr)
static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
struct extent_ptr_decoded p)
{
unsigned i, durability = 0;
struct bch_dev *ca;
if (ptr->cached)
if (p.ptr.cached)
return 0;
ca = bch_dev_bkey_exists(c, ptr->dev);
ca = bch_dev_bkey_exists(c, p.ptr.dev);
if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
return 0;
if (ca->mi.state != BCH_MEMBER_STATE_FAILED)
durability = max_t(unsigned, durability, ca->mi.durability);
return ca->mi.durability;
for (i = 0; i < p.ec_nr; i++) {
struct ec_stripe *s =
genradix_ptr(&c->ec_stripes, p.idx);
if (WARN_ON(!s))
continue;
durability = max_t(unsigned, durability, s->nr_redundant);
}
return durability;
}
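
The durability rule above: a cached pointer contributes nothing; otherwise a pointer contributes its device's durability (unless the device is failed), raised to the redundancy of any stripe it participates in. A standalone restatement, simplified to a single stripe (illustrative only):

#include <stdbool.h>

static unsigned ptr_durability(bool cached, bool dev_failed,
			       unsigned dev_durability,
			       unsigned stripe_redundancy)
{
	unsigned d = 0;

	if (cached)
		return 0;

	if (!dev_failed)
		d = dev_durability;

	/* a pointer in a stripe can also be rebuilt from parity: */
	if (stripe_redundancy > d)
		d = stripe_redundancy;

	return d;
}
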
unsigned bch2_extent_durability(struct bch_fs *c, struct bkey_s_c_extent e)
{
const struct bch_extent_ptr *ptr;
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
unsigned durability = 0;
extent_for_each_ptr(e, ptr)
durability += bch2_extent_ptr_durability(c, ptr);
extent_for_each_ptr_decode(e, p, entry)
durability += bch2_extent_ptr_durability(c, p);
return durability;
}
@ -258,30 +270,46 @@ bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e,
return false;
}
static union bch_extent_entry *extent_entry_prev(struct bkey_s_extent e,
union bch_extent_entry *entry)
{
union bch_extent_entry *i = e.v->start;
if (i == entry)
return NULL;
while (extent_entry_next(i) != entry)
i = extent_entry_next(i);
return i;
}
union bch_extent_entry *bch2_extent_drop_ptr(struct bkey_s_extent e,
struct bch_extent_ptr *ptr)
{
union bch_extent_entry *dst;
union bch_extent_entry *src;
union bch_extent_entry *dst, *src, *prev;
bool drop_crc = true;
EBUG_ON(ptr < &e.v->start->ptr ||
ptr >= &extent_entry_last(e)->ptr);
EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
src = to_entry(ptr + 1);
src = extent_entry_next(to_entry(ptr));
if (src != extent_entry_last(e) &&
extent_entry_type(src) == BCH_EXTENT_ENTRY_ptr) {
dst = to_entry(ptr);
} else {
extent_for_each_entry(e, dst) {
if (dst == to_entry(ptr))
break;
!extent_entry_is_crc(src))
drop_crc = false;
if (extent_entry_next(dst) == to_entry(ptr) &&
extent_entry_is_crc(dst))
break;
dst = to_entry(ptr);
while ((prev = extent_entry_prev(e, dst))) {
if (extent_entry_is_ptr(prev))
break;
if (extent_entry_is_crc(prev)) {
if (drop_crc)
dst = prev;
break;
}
dst = prev;
}
memmove_u64s_down(dst, src,
@ -423,6 +451,8 @@ void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
entry->crc128.csum.lo = (__force __le64)
swab64((__force u64) entry->crc128.csum.lo);
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
break;
}
}
break;
@ -470,6 +500,7 @@ static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c,
const union bch_extent_entry *entry;
struct bch_extent_crc_unpacked crc;
const struct bch_extent_ptr *ptr;
const struct bch_extent_stripe_ptr *ec;
struct bch_dev *ca;
bool first = true;
@ -478,6 +509,18 @@ static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c,
pr_buf(out, " ");
switch (__extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
ptr = entry_to_ptr(entry);
ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
? bch_dev_bkey_exists(c, ptr->dev)
: NULL;
pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev,
(u64) ptr->offset, ptr->gen,
ptr->cached ? " cached" : "",
ca && ptr_stale(ca, ptr)
? " stale" : "");
break;
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
case BCH_EXTENT_ENTRY_crc128:
@ -490,17 +533,11 @@ static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c,
crc.csum_type,
crc.compression_type);
break;
case BCH_EXTENT_ENTRY_ptr:
ptr = entry_to_ptr(entry);
ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
? bch_dev_bkey_exists(c, ptr->dev)
: NULL;
case BCH_EXTENT_ENTRY_stripe_ptr:
ec = &entry->stripe_ptr;
pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev,
(u64) ptr->offset, ptr->gen,
ptr->cached ? " cached" : "",
ca && ptr_stale(ca, ptr)
? " stale" : "");
pr_buf(out, "ec: idx %llu block %u",
(u64) ec->idx, ec->block);
break;
default:
pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
@ -536,6 +573,11 @@ void bch2_mark_io_failure(struct bch_io_failures *failed,
f = &failed->devs[failed->nr++];
f->dev = p->ptr.dev;
f->idx = p->idx;
f->nr_failed = 1;
f->nr_retries = 0;
} else if (p->idx != f->idx) {
f->idx = p->idx;
f->nr_failed = 1;
f->nr_retries = 0;
} else {
@ -550,15 +592,22 @@ static inline bool ptr_better(struct bch_fs *c,
const struct extent_ptr_decoded p1,
const struct extent_ptr_decoded p2)
{
struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev);
struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev);
if (likely(!p1.idx && !p2.idx)) {
struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev);
struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev);
u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
/* Pick at random, biased in favor of the faster device: */
/* Pick at random, biased in favor of the faster device: */
return bch2_rand_range(l1 + l2) > l1;
return bch2_rand_range(l1 + l2) > l1;
}
if (force_reconstruct_read(c))
return p1.idx > p2.idx;
return p1.idx < p2.idx;
}
static int extent_pick_read_device(struct bch_fs *c,
@ -579,7 +628,20 @@ static int extent_pick_read_device(struct bch_fs *c,
continue;
f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL;
if (f && f->nr_failed >= f->nr_retries)
if (f)
p.idx = f->nr_failed < f->nr_retries
? f->idx
: f->idx + 1;
if (!p.idx &&
!bch2_dev_is_readable(ca))
p.idx++;
if (force_reconstruct_read(c) &&
!p.idx && p.ec_nr)
p.idx++;
if (p.idx >= p.ec_nr + 1)
continue;
if (ret && !ptr_better(c, p, *pick))
@ -616,8 +678,8 @@ const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
return "invalid extent entry type";
if (extent_entry_is_crc(entry))
return "has crc field";
if (!extent_entry_is_ptr(entry))
return "has non ptr field";
}
extent_for_each_ptr(e, ptr) {
@ -754,6 +816,8 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
case BCH_EXTENT_ENTRY_crc128:
entry->crc128.offset += e.k->size - len;
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
break;
}
if (extent_entry_is_crc(entry))
@ -1512,7 +1576,18 @@ const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
return "invalid extent entry type";
if (extent_entry_is_crc(entry)) {
switch (extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
ptr = entry_to_ptr(entry);
reason = extent_ptr_invalid(c, e, &entry->ptr,
size_ondisk, false);
if (reason)
return reason;
break;
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
case BCH_EXTENT_ENTRY_crc128:
crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
if (crc.offset + e.k->size >
@ -1533,13 +1608,9 @@ const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
else if (nonce != crc.offset + crc.nonce)
return "incorrect nonce";
}
} else {
ptr = entry_to_ptr(entry);
reason = extent_ptr_invalid(c, e, &entry->ptr,
size_ondisk, false);
if (reason)
return reason;
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
break;
}
}
@ -1744,6 +1815,7 @@ void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e,
{
struct bch_extent_crc_unpacked crc;
union bch_extent_entry *pos;
unsigned i;
extent_for_each_crc(extent_i_to_s(e), crc, pos)
if (!bch2_crc_unpacked_cmp(crc, p->crc))
@ -1754,6 +1826,11 @@ void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e,
found:
p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
__extent_entry_insert(e, pos, to_entry(&p->ptr));
for (i = 0; i < p->ec_nr; i++) {
p->ec[i].type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
__extent_entry_insert(e, pos, to_entry(&p->ec[i]));
}
}
/*
@ -1808,26 +1885,27 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
unsigned target,
unsigned nr_desired_replicas)
{
struct bch_extent_ptr *ptr;
union bch_extent_entry *entry;
struct extent_ptr_decoded p;
int extra = bch2_extent_durability(c, e.c) - nr_desired_replicas;
if (target && extra > 0)
extent_for_each_ptr(e, ptr) {
int n = bch2_extent_ptr_durability(c, ptr);
extent_for_each_ptr_decode(e, p, entry) {
int n = bch2_extent_ptr_durability(c, p);
if (n && n <= extra &&
!bch2_dev_in_target(c, ptr->dev, target)) {
ptr->cached = true;
!bch2_dev_in_target(c, p.ptr.dev, target)) {
entry->ptr.cached = true;
extra -= n;
}
}
if (extra > 0)
extent_for_each_ptr(e, ptr) {
int n = bch2_extent_ptr_durability(c, ptr);
extent_for_each_ptr_decode(e, p, entry) {
int n = bch2_extent_ptr_durability(c, p);
if (n && n <= extra) {
ptr->cached = true;
entry->ptr.cached = true;
extra -= n;
}
}
@ -1903,7 +1981,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b,
if ((extent_entry_type(en_l) !=
extent_entry_type(en_r)) ||
extent_entry_is_crc(en_l))
!extent_entry_is_ptr(en_l))
return BCH_MERGE_NOMERGE;
lp = &en_l->ptr;


@ -95,8 +95,6 @@ unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
unsigned bch2_extent_is_compressed(struct bkey_s_c);
unsigned bch2_extent_ptr_durability(struct bch_fs *,
const struct bch_extent_ptr *);
unsigned bch2_extent_durability(struct bch_fs *, struct bkey_s_c_extent);
bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
@ -361,20 +359,13 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
/* Iterate over pointers, with crcs: */
static inline struct extent_ptr_decoded
__extent_ptr_decoded_init(const struct bkey *k)
{
return (struct extent_ptr_decoded) {
.crc = bch2_extent_crc_unpack(k, NULL),
};
}
#define EXTENT_ITERATE_EC (1 << 0)
#define __extent_ptr_next_decode(_e, _ptr, _entry) \
({ \
__label__ out; \
\
(_ptr).idx = 0; \
(_ptr).ec_nr = 0; \
\
extent_for_each_entry_from(_e, _entry, _entry) \
switch (extent_entry_type(_entry)) { \
case BCH_EXTENT_ENTRY_ptr: \
@ -386,14 +377,16 @@ __extent_ptr_decoded_init(const struct bkey *k)
(_ptr).crc = bch2_extent_crc_unpack((_e).k, \
entry_to_crc(_entry)); \
break; \
case BCH_EXTENT_ENTRY_stripe_ptr: \
(_ptr).ec[(_ptr).ec_nr++] = _entry->stripe_ptr; \
break; \
} \
\
out: \
_entry < extent_entry_last(_e); \
})
#define extent_for_each_ptr_decode(_e, _ptr, _entry) \
for ((_ptr) = __extent_ptr_decoded_init((_e).k), \
for ((_ptr).crc = bch2_extent_crc_unpack((_e).k, NULL), \
(_entry) = (_e).v->start; \
__extent_ptr_next_decode(_e, _ptr, _entry); \
(_entry) = extent_entry_next(_entry))


@ -19,14 +19,18 @@ struct bch_extent_crc_unpacked {
};
struct extent_ptr_decoded {
unsigned idx;
unsigned ec_nr;
struct bch_extent_crc_unpacked crc;
struct bch_extent_ptr ptr;
struct bch_extent_stripe_ptr ec[4];
};
struct bch_io_failures {
u8 nr;
struct bch_dev_io_failures {
u8 dev;
u8 idx;
u8 nr_failed;
u8 nr_retries;
} devs[BCH_REPLICAS_MAX];


@ -454,12 +454,12 @@ struct bch_page_state {
union { struct {
/* existing data: */
unsigned sectors:PAGE_SECTOR_SHIFT + 1;
unsigned nr_replicas:4;
unsigned compressed:1;
/* Owns PAGE_SECTORS sized reservation: */
unsigned reserved:1;
unsigned reservation_replicas:4;
/* Uncompressed, fully allocated replicas: */
unsigned nr_replicas:4;
/* Owns PAGE_SECTORS * replicas_reserved sized reservation: */
unsigned replicas_reserved:4;
/* Owns PAGE_SECTORS sized quota reservation: */
unsigned quota_reserved:1;
@ -506,7 +506,7 @@ static inline struct bch_page_state *page_state(struct page *page)
static inline unsigned page_res_sectors(struct bch_page_state s)
{
return s.reserved ? s.reservation_replicas * PAGE_SECTORS : 0;
return s.replicas_reserved * PAGE_SECTORS;
}
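
With the reworked page state, a page's disk reservation is simply replicas_reserved times a page's worth of sectors. Worked arithmetic, assuming 4 KiB pages and 512-byte sectors so PAGE_SECTORS is 8 (illustrative only):

#include <stdio.h>

int main(void)
{
	unsigned page_sectors = 4096 / 512;	/* PAGE_SECTORS on 4 KiB pages */
	unsigned replicas_reserved = 2;

	/* 2 reserved replicas -> 16 sectors of on-disk reservation */
	printf("page reservation: %u sectors\n",
	       replicas_reserved * page_sectors);
	return 0;
}
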
static void __bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *inode,
@ -524,8 +524,10 @@ static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *i
{
struct bch_page_state s;
EBUG_ON(!PageLocked(page));
s = page_state_cmpxchg(page_state(page), s, {
s.reserved = 0;
s.replicas_reserved = 0;
s.quota_reserved = 0;
});
@ -535,62 +537,46 @@ static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *i
static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *inode,
struct page *page, bool check_enospc)
{
struct bch_page_state *s = page_state(page), new, old;
struct bch_page_state *s = page_state(page), new;
/* XXX: this should not be open coded */
unsigned nr_replicas = inode->ei_inode.bi_data_replicas
? inode->ei_inode.bi_data_replicas - 1
: c->opts.data_replicas;
struct disk_reservation disk_res = bch2_disk_reservation_init(c,
nr_replicas);
struct disk_reservation disk_res;
struct quota_res quota_res = { 0 };
int ret = 0;
int ret;
/*
* XXX: this could likely be quite a bit simpler, page reservations
* _should_ only be manipulated with page locked:
*/
EBUG_ON(!PageLocked(page));
old = page_state_cmpxchg(s, new, {
if (new.reserved
? (new.reservation_replicas < disk_res.nr_replicas)
: (new.sectors < PAGE_SECTORS ||
new.nr_replicas < disk_res.nr_replicas ||
new.compressed)) {
int sectors = (disk_res.nr_replicas * PAGE_SECTORS -
page_res_sectors(new) -
disk_res.sectors);
if (s->replicas_reserved < nr_replicas) {
ret = bch2_disk_reservation_get(c, &disk_res, PAGE_SECTORS,
nr_replicas - s->replicas_reserved,
!check_enospc ? BCH_DISK_RESERVATION_NOFAIL : 0);
if (unlikely(ret))
return ret;
if (sectors > 0) {
ret = bch2_disk_reservation_add(c, &disk_res, sectors,
!check_enospc
? BCH_DISK_RESERVATION_NOFAIL : 0);
if (unlikely(ret))
goto err;
}
page_state_cmpxchg(s, new, ({
BUG_ON(new.replicas_reserved +
disk_res.nr_replicas != nr_replicas);
new.replicas_reserved += disk_res.nr_replicas;
}));
}
new.reserved = 1;
new.reservation_replicas = disk_res.nr_replicas;
}
if (!new.quota_reserved &&
new.sectors + new.dirty_sectors < PAGE_SECTORS) {
ret = bch2_quota_reservation_add(c, inode, &quota_res,
PAGE_SECTORS - quota_res.sectors,
check_enospc);
if (unlikely(ret))
goto err;
if (!s->quota_reserved &&
s->sectors + s->dirty_sectors < PAGE_SECTORS) {
ret = bch2_quota_reservation_add(c, inode, &quota_res,
PAGE_SECTORS,
check_enospc);
if (unlikely(ret))
return ret;
page_state_cmpxchg(s, new, ({
BUG_ON(new.quota_reserved);
new.quota_reserved = 1;
}
});
}));
}
quota_res.sectors -= (new.quota_reserved - old.quota_reserved) * PAGE_SECTORS;
disk_res.sectors -= page_res_sectors(new) - page_res_sectors(old);
err:
bch2_quota_reservation_put(c, inode, &quota_res);
bch2_disk_reservation_put(c, &disk_res);
return ret;
}
@ -600,6 +586,8 @@ static void bch2_clear_page_bits(struct page *page)
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_page_state s;
EBUG_ON(!PageLocked(page));
if (!PagePrivate(page))
return;
@ -710,6 +698,9 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
{
int ret;
EBUG_ON(!PageLocked(page));
EBUG_ON(!PageLocked(newpage));
ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
if (ret != MIGRATEPAGE_SUCCESS)
return ret;
@ -856,10 +847,13 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
{
struct bvec_iter iter;
struct bio_vec bv;
bool compressed = bch2_extent_is_compressed(k);
unsigned nr_ptrs = bch2_extent_nr_dirty_ptrs(k);
unsigned nr_ptrs = !bch2_extent_is_compressed(k)
? bch2_extent_nr_dirty_ptrs(k)
: 0;
bio_for_each_segment(bv, bio, iter) {
/* brand new pages, don't need to be locked: */
struct bch_page_state *s = page_state(bv.bv_page);
/* sectors in @k from the start of this page: */
@ -867,14 +861,11 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
unsigned page_sectors = min(bv.bv_len >> 9, k_sectors);
s->nr_replicas = !s->sectors
? nr_ptrs
: min_t(unsigned, s->nr_replicas, nr_ptrs);
s->nr_replicas = page_sectors == PAGE_SECTORS
? nr_ptrs : 0;
BUG_ON(s->sectors + page_sectors > PAGE_SECTORS);
s->sectors += page_sectors;
s->compressed |= compressed;
}
}
@ -1214,7 +1205,7 @@ static int __bch2_writepage(struct page *page,
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_writepage_state *w = data;
struct bch_page_state new, old;
unsigned offset;
unsigned offset, nr_replicas_this_write;
loff_t i_size = i_size_read(&inode->v);
pgoff_t end_index = i_size >> PAGE_SHIFT;
@ -1240,19 +1231,31 @@ static int __bch2_writepage(struct page *page,
*/
zero_user_segment(page, offset, PAGE_SIZE);
do_io:
EBUG_ON(!PageLocked(page));
/* Before unlocking the page, transfer reservation to w->io: */
old = page_state_cmpxchg(page_state(page), new, {
EBUG_ON(!new.reserved &&
(new.sectors != PAGE_SECTORS ||
new.compressed));
/*
* If we didn't get a reservation, we can only write out the
* number of (fully allocated) replicas that currently exist,
* and only if the entire page has been written:
*/
nr_replicas_this_write =
max_t(unsigned,
new.replicas_reserved,
(new.sectors == PAGE_SECTORS
? new.nr_replicas : 0));
if (new.reserved)
new.nr_replicas = new.reservation_replicas;
new.reserved = 0;
BUG_ON(!nr_replicas_this_write);
new.compressed |= w->opts.compression != 0;
new.nr_replicas = w->opts.compression
? 0
: nr_replicas_this_write;
new.replicas_reserved = 0;
new.sectors += new.dirty_sectors;
BUG_ON(new.sectors != PAGE_SECTORS);
new.dirty_sectors = 0;
});
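
The computation above restated as a standalone helper: writeback may rely on a reservation it holds, and on already existing replicas only when the whole page is allocated (the helper and its parameter names are illustrative):

static unsigned replicas_this_write(unsigned replicas_reserved,
				    unsigned sectors_allocated,
				    unsigned page_sectors,
				    unsigned nr_existing_replicas)
{
	unsigned n = replicas_reserved;

	/* without a reservation we can only rely on existing replicas,
	 * and only if the whole page is already allocated: */
	if (sectors_allocated == page_sectors && nr_existing_replicas > n)
		n = nr_existing_replicas;

	return n;
}
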
@ -1261,21 +1264,20 @@ do_io:
unlock_page(page);
if (w->io &&
(w->io->op.op.res.nr_replicas != new.nr_replicas ||
(w->io->op.op.res.nr_replicas != nr_replicas_this_write ||
!bio_can_add_page_contig(&w->io->op.op.wbio.bio, page)))
bch2_writepage_do_io(w);
if (!w->io)
bch2_writepage_io_alloc(c, w, inode, page, new.nr_replicas);
bch2_writepage_io_alloc(c, w, inode, page,
nr_replicas_this_write);
w->io->new_sectors += new.sectors - old.sectors;
BUG_ON(inode != w->io->op.inode);
BUG_ON(bio_add_page_contig(&w->io->op.op.wbio.bio, page));
if (old.reserved)
w->io->op.op.res.sectors += old.reservation_replicas * PAGE_SECTORS;
w->io->op.op.res.sectors += old.replicas_reserved * PAGE_SECTORS;
w->io->op.new_i_size = i_size;
if (wbc->sync_mode == WB_SYNC_ALL)
@ -2547,10 +2549,9 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
&disk_res, &quota_res,
iter, &reservation.k_i,
0, true, true, NULL);
btree_iter_err:
bch2_quota_reservation_put(c, inode, &quota_res);
bch2_disk_reservation_put(c, &disk_res);
btree_iter_err:
if (ret == -EINTR)
ret = 0;
if (ret)
@ -2612,6 +2613,8 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
static bool page_is_data(struct page *page)
{
EBUG_ON(!PageLocked(page));
/* XXX: should only have to check PageDirty */
return PagePrivate(page) &&
(page_state(page)->sectors ||


@ -15,6 +15,7 @@
#include "clock.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
#include "io.h"
@ -302,6 +303,7 @@ static void __bch2_write_index(struct bch_write_op *op)
struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
struct bkey_i *src, *dst = keys->keys, *n, *k;
unsigned dev;
int ret;
for (src = keys->keys; src != keys->top; src = n) {
@ -345,6 +347,10 @@ static void __bch2_write_index(struct bch_write_op *op)
}
}
out:
/* If a bucket wasn't written, we can't erasure code it: */
for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
bch2_open_bucket_write_error(c, &op->open_buckets, dev);
bch2_open_buckets_put(c, &op->open_buckets);
return;
err:
@ -421,7 +427,8 @@ static void init_append_extent(struct bch_write_op *op,
static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
struct write_point *wp,
struct bio *src,
bool *page_alloc_failed)
bool *page_alloc_failed,
void *buf)
{
struct bch_write_bio *wbio;
struct bio *bio;
@ -431,11 +438,18 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
wbio = wbio_init(bio);
wbio->bounce = true;
wbio->put_bio = true;
/* copy WRITE_SYNC flag */
wbio->bio.bi_opf = src->bi_opf;
if (buf) {
bio->bi_iter.bi_size = output_available;
bch2_bio_map(bio, buf);
return bio;
}
wbio->bounce = true;
/*
* We can't use mempool for more than c->sb.encoded_extent_max
* worth of pages, but we'd like to allocate more if we can:
@ -600,14 +614,18 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
struct bio *src = &op->wbio.bio, *dst = src;
struct bvec_iter saved_iter;
struct bkey_i *key_to_write;
void *ec_buf;
unsigned key_to_write_offset = op->insert_keys.top_p -
op->insert_keys.keys_p;
unsigned total_output = 0;
bool bounce = false, page_alloc_failed = false;
unsigned total_output = 0, total_input = 0;
bool bounce = false;
bool page_alloc_failed = false;
int ret, more = 0;
BUG_ON(!bio_sectors(src));
ec_buf = bch2_writepoint_ec_buf(c, wp);
switch (bch2_write_prep_encoded_data(op, wp)) {
case PREP_ENCODED_OK:
break;
@ -617,16 +635,26 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
case PREP_ENCODED_CHECKSUM_ERR:
goto csum_err;
case PREP_ENCODED_DO_WRITE:
if (ec_buf) {
dst = bch2_write_bio_alloc(c, wp, src,
&page_alloc_failed,
ec_buf);
bio_copy_data(dst, src);
bounce = true;
}
init_append_extent(op, wp, op->version, op->crc);
goto do_write;
}
if (op->compression_type ||
if (ec_buf ||
op->compression_type ||
(op->csum_type &&
!(op->flags & BCH_WRITE_PAGES_STABLE)) ||
(bch2_csum_type_is_encryption(op->csum_type) &&
!(op->flags & BCH_WRITE_PAGES_OWNED))) {
dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed);
dst = bch2_write_bio_alloc(c, wp, src,
&page_alloc_failed,
ec_buf);
bounce = true;
}
@ -729,7 +757,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
if (dst != src)
bio_advance(dst, dst_len);
bio_advance(src, src_len);
total_output += dst_len;
total_output += dst_len;
total_input += src_len;
} while (dst->bi_iter.bi_size &&
src->bi_iter.bi_size &&
wp->sectors_free &&
@ -742,16 +771,20 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
dst->bi_iter = saved_iter;
if (!bounce && more) {
dst = bio_split(src, total_output >> 9,
if (dst == src && more) {
BUG_ON(total_output != total_input);
dst = bio_split(src, total_input >> 9,
GFP_NOIO, &c->bio_write);
wbio_init(dst)->put_bio = true;
wbio_init(dst)->put_bio = true;
/* copy WRITE_SYNC flag */
dst->bi_opf = src->bi_opf;
}
dst->bi_iter.bi_size = total_output;
/* Free unneeded pages after compressing: */
if (bounce)
if (to_wbio(dst)->bounce)
while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE))
mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page,
&c->bio_bounce_pages);
@ -760,6 +793,10 @@ do_write:
key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
bch2_ec_add_backpointer(c, wp,
bkey_start_pos(&key_to_write->k),
total_input >> 9);
dst->bi_end_io = bch2_write_endio;
dst->bi_private = &op->cl;
bio_set_op_attrs(dst, REQ_OP_WRITE, 0);
@ -774,10 +811,10 @@ csum_err:
"rewriting existing data (memory corruption?)");
ret = -EIO;
err:
if (bounce) {
if (to_wbio(dst)->bounce)
bch2_bio_free_pages_pool(c, dst);
if (to_wbio(dst)->put_bio)
bio_put(dst);
}
return ret;
}
@ -789,6 +826,8 @@ static void __bch2_write(struct closure *cl)
struct write_point *wp;
int ret;
again:
memset(&op->failed, 0, sizeof(op->failed));
do {
/* +1 for possible cache device: */
if (op->open_buckets.nr + op->nr_replicas + 1 >
@ -803,6 +842,7 @@ again:
wp = bch2_alloc_sectors_start(c,
op->target,
op->opts.erasure_code,
op->write_point,
&op->devs_have,
op->nr_replicas,
@ -882,8 +922,6 @@ void bch2_write(struct closure *cl)
op->start_time = local_clock();
memset(&op->failed, 0, sizeof(op->failed));
bch2_keylist_init(&op->insert_keys, op->inline_keys);
wbio_init(&op->wbio.bio)->put_bio = false;
@ -1557,8 +1595,10 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
if (!pick_ret)
goto hole;
if (pick_ret < 0)
goto no_device;
if (pick_ret < 0) {
__bcache_io_error(c, "no device to read from");
goto err;
}
if (pick_ret > 0)
ca = bch_dev_bkey_exists(c, pick.ptr.dev);
@ -1683,31 +1723,46 @@ noclone:
bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
if (!rbio->have_ioref)
goto no_device_postclone;
percpu_down_read_preempt_disable(&c->usage_lock);
bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
percpu_up_read_preempt_enable(&c->usage_lock);
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
bio_sectors(&rbio->bio));
if (likely(!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT)))) {
bio_inc_remaining(&orig->bio);
trace_read_split(&orig->bio);
}
bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
if (likely(!(flags & BCH_READ_IN_RETRY))) {
if (!(flags & BCH_READ_LAST_FRAGMENT)) {
bio_inc_remaining(&orig->bio);
trace_read_split(&orig->bio);
if (!rbio->pick.idx) {
if (!rbio->have_ioref) {
__bcache_io_error(c, "no device to read from");
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
}
submit_bio(&rbio->bio);
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
bio_sectors(&rbio->bio));
bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
if (likely(!(flags & BCH_READ_IN_RETRY)))
submit_bio(&rbio->bio);
else
submit_bio_wait(&rbio->bio);
} else {
/* Attempting reconstruct read: */
if (bch2_ec_read_extent(c, rbio)) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
}
if (likely(!(flags & BCH_READ_IN_RETRY)))
bio_endio(&rbio->bio);
}
out:
if (likely(!(flags & BCH_READ_IN_RETRY))) {
return 0;
} else {
int ret;
submit_bio_wait(&rbio->bio);
rbio->context = RBIO_CONTEXT_UNBOUND;
bch2_read_endio(&rbio->bio);
@ -1722,22 +1777,12 @@ noclone:
return ret;
}
no_device_postclone:
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
bch2_rbio_free(rbio);
no_device:
__bcache_io_error(c, "no device to read from");
if (likely(!(flags & BCH_READ_IN_RETRY))) {
orig->bio.bi_status = BLK_STS_IOERR;
if (flags & BCH_READ_LAST_FRAGMENT)
bch2_rbio_done(orig);
return 0;
} else {
err:
if (flags & BCH_READ_IN_RETRY)
return READ_ERR;
}
orig->bio.bi_status = BLK_STS_IOERR;
goto out_read_done;
hole:
/*
@ -1749,7 +1794,7 @@ hole:
orig->hole = true;
zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
if (flags & BCH_READ_LAST_FRAGMENT)
bch2_rbio_done(orig);
return 0;


@ -134,6 +134,8 @@ static enum {
c->opts.block_size;
BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
bkey_extent_init(&buf->key);
/*
* We have to set last_seq here, _before_ opening a new journal entry:
*
@ -334,15 +336,14 @@ u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
}
static int __journal_res_get(struct journal *j, struct journal_res *res,
unsigned u64s_min, unsigned u64s_max)
unsigned flags)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf;
int ret;
retry:
ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
if (ret)
return ret;
if (journal_res_get_fast(j, res))
return 0;
spin_lock(&j->lock);
/*
@ -350,10 +351,9 @@ retry:
* that just did journal_entry_open() and call journal_entry_close()
* unnecessarily
*/
ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
if (ret) {
if (journal_res_get_fast(j, res)) {
spin_unlock(&j->lock);
return 1;
return 0;
}
/*
@ -376,7 +376,12 @@ retry:
spin_unlock(&j->lock);
return -EROFS;
case JOURNAL_ENTRY_INUSE:
/* haven't finished writing out the previous one: */
/*
* haven't finished writing out the previous entry, can't start
* another yet:
* signal to caller which sequence number we're trying to open:
*/
res->seq = journal_cur_seq(j) + 1;
spin_unlock(&j->lock);
trace_journal_entry_full(c);
goto blocked;
@ -388,6 +393,8 @@ retry:
/* We now have a new, closed journal buf - see if we can open it: */
ret = journal_entry_open(j);
if (!ret)
res->seq = journal_cur_seq(j);
spin_unlock(&j->lock);
if (ret < 0)
@ -407,7 +414,7 @@ retry:
blocked:
if (!j->res_get_blocked_start)
j->res_get_blocked_start = local_clock() ?: 1;
return 0;
return -EAGAIN;
}
/*
@ -421,14 +428,14 @@ blocked:
* btree node write locks.
*/
int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
unsigned u64s_min, unsigned u64s_max)
unsigned flags)
{
int ret;
wait_event(j->wait,
(ret = __journal_res_get(j, res, u64s_min,
u64s_max)));
return ret < 0 ? ret : 0;
(ret = __journal_res_get(j, res, flags)) != -EAGAIN ||
(flags & JOURNAL_RES_GET_NONBLOCK));
return ret;
}
u64 bch2_journal_last_unwritten_seq(struct journal *j)
@ -452,28 +459,55 @@ u64 bch2_journal_last_unwritten_seq(struct journal *j)
* btree root - every journal entry contains the roots of all the btrees, so it
* doesn't need to bother with getting a journal reservation
*/
int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *parent)
int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl)
{
int ret;
struct bch_fs *c = container_of(j, struct bch_fs, journal);
bool need_reclaim = false;
retry:
spin_lock(&j->lock);
BUG_ON(seq > journal_cur_seq(j));
if (seq < journal_cur_seq(j) ||
journal_entry_is_open(j)) {
spin_unlock(&j->lock);
return 1;
return 0;
}
if (journal_cur_seq(j) < seq) {
switch (journal_buf_switch(j, false)) {
case JOURNAL_ENTRY_ERROR:
spin_unlock(&j->lock);
return -EROFS;
case JOURNAL_ENTRY_INUSE:
/* haven't finished writing out the previous one: */
trace_journal_entry_full(c);
goto blocked;
case JOURNAL_ENTRY_CLOSED:
break;
case JOURNAL_UNLOCKED:
goto retry;
}
}
BUG_ON(journal_cur_seq(j) < seq);
if (!journal_entry_open(j)) {
need_reclaim = true;
goto blocked;
}
ret = journal_entry_open(j);
if (!ret)
closure_wait(&j->async_wait, parent);
spin_unlock(&j->lock);
if (!ret)
bch2_journal_reclaim_work(&j->reclaim_work.work);
return 0;
blocked:
if (!j->res_get_blocked_start)
j->res_get_blocked_start = local_clock() ?: 1;
return ret;
closure_wait(&j->async_wait, cl);
spin_unlock(&j->lock);
if (need_reclaim)
bch2_journal_reclaim_work(&j->reclaim_work.work);
return -EAGAIN;
}
static int journal_seq_error(struct journal *j, u64 seq)
@ -593,11 +627,10 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
void bch2_journal_meta_async(struct journal *j, struct closure *parent)
{
struct journal_res res;
unsigned u64s = jset_u64s(0);
memset(&res, 0, sizeof(res));
bch2_journal_res_get(j, &res, u64s, u64s);
bch2_journal_res_get(j, &res, jset_u64s(0), 0);
bch2_journal_res_put(j, &res);
bch2_journal_flush_seq_async(j, res.seq, parent);
@ -606,12 +639,11 @@ void bch2_journal_meta_async(struct journal *j, struct closure *parent)
int bch2_journal_meta(struct journal *j)
{
struct journal_res res;
unsigned u64s = jset_u64s(0);
int ret;
memset(&res, 0, sizeof(res));
ret = bch2_journal_res_get(j, &res, u64s, u64s);
ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
if (ret)
return ret;
@ -751,9 +783,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
ca->mi.bucket_size,
gc_phase(GC_PHASE_SB),
new_fs
? BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE
: 0);
0);
if (c) {
spin_unlock(&c->journal.lock);
@ -861,10 +891,6 @@ static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
{
spin_lock(&j->lock);
bch2_extent_drop_device(bkey_i_to_s_extent(&j->key), ca->dev_idx);
spin_unlock(&j->lock);
wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx));
}
@ -1000,8 +1026,6 @@ int bch2_fs_journal_init(struct journal *j)
j->write_delay_ms = 1000;
j->reclaim_delay_ms = 100;
bkey_extent_init(&j->key);
atomic64_set(&j->reservations.counter,
((union journal_res_state)
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);


@ -269,12 +269,10 @@ static inline void bch2_journal_res_put(struct journal *j,
}
int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
unsigned, unsigned);
unsigned);
static inline int journal_res_get_fast(struct journal *j,
struct journal_res *res,
unsigned u64s_min,
unsigned u64s_max)
struct journal_res *res)
{
union journal_res_state old, new;
u64 v = atomic64_read(&j->reservations.counter);
@ -286,37 +284,37 @@ static inline int journal_res_get_fast(struct journal *j,
* Check if there is still room in the current journal
* entry:
*/
if (old.cur_entry_offset + u64s_min > j->cur_entry_u64s)
if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
return 0;
res->offset = old.cur_entry_offset;
res->u64s = min(u64s_max, j->cur_entry_u64s -
old.cur_entry_offset);
journal_state_inc(&new);
new.cur_entry_offset += res->u64s;
journal_state_inc(&new);
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
res->ref = true;
res->idx = new.idx;
res->seq = le64_to_cpu(j->buf[res->idx].data->seq);
res->ref = true;
res->idx = old.idx;
res->offset = old.cur_entry_offset;
res->seq = le64_to_cpu(j->buf[old.idx].data->seq);
return 1;
}
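
journal_res_get_fast() claims journal space with a single cmpxchg on the packed reservation state, and with this patch the caller supplies an exact res->u64s rather than a min/max range. A reduced standalone sketch of the same claim loop over one atomic offset (illustrative only, C11 atomics):

#include <stdatomic.h>
#include <stdbool.h>

static bool res_get_fast(_Atomic unsigned *cur_offset,
			 unsigned entry_u64s, unsigned u64s,
			 unsigned *res_offset)
{
	unsigned old = atomic_load(cur_offset), new;

	do {
		new = old + u64s;
		if (new > entry_u64s)
			return false;	/* no room left in the open entry */
	} while (!atomic_compare_exchange_weak(cur_offset, &old, new));

	*res_offset = old;	/* caller writes its u64s at this offset */
	return true;
}
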
#define JOURNAL_RES_GET_NONBLOCK (1 << 0)
static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
unsigned u64s_min, unsigned u64s_max)
unsigned u64s, unsigned flags)
{
int ret;
EBUG_ON(res->ref);
EBUG_ON(u64s_max < u64s_min);
EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
if (journal_res_get_fast(j, res, u64s_min, u64s_max))
res->u64s = u64s;
if (journal_res_get_fast(j, res))
goto out;
ret = bch2_journal_res_get_slowpath(j, res, u64s_min, u64s_max);
ret = bch2_journal_res_get_slowpath(j, res, flags);
if (ret)
return ret;
out:

View File

@ -426,7 +426,7 @@ static int journal_read_buf_realloc(struct journal_read_buf *b,
static int journal_read_bucket(struct bch_dev *ca,
struct journal_read_buf *buf,
struct journal_list *jlist,
unsigned bucket, u64 *seq, bool *entries_found)
unsigned bucket)
{
struct bch_fs *c = ca->fs;
struct journal_device *ja = &ca->journal;
@ -511,7 +511,6 @@ reread:
switch (ret) {
case JOURNAL_ENTRY_ADD_OK:
*entries_found = true;
break;
case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
break;
@ -519,9 +518,6 @@ reread:
return ret;
}
if (le64_to_cpu(j->seq) > *seq)
*seq = le64_to_cpu(j->seq);
sectors = vstruct_sectors(j, c->block_bits);
next_block:
pr_debug("next");
@ -535,120 +531,51 @@ next_block:
static void bch2_journal_read_device(struct closure *cl)
{
#define read_bucket(b) \
({ \
bool entries_found = false; \
ret = journal_read_bucket(ca, &buf, jlist, b, &seq, \
&entries_found); \
if (ret) \
goto err; \
__set_bit(b, bitmap); \
entries_found; \
})
struct journal_device *ja =
container_of(cl, struct journal_device, read);
struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
struct journal_list *jlist =
container_of(cl->parent, struct journal_list, cl);
struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev);
struct journal_read_buf buf = { NULL, 0 };
DECLARE_BITMAP(bitmap, ja->nr);
unsigned i, l, r;
u64 seq = 0;
u64 min_seq = U64_MAX;
unsigned i;
int ret;
if (!ja->nr)
goto out;
bitmap_zero(bitmap, ja->nr);
ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
if (ret)
goto err;
pr_debug("%u journal buckets", ja->nr);
/*
* If the device supports discard but not secure discard, we can't do
* the fancy fibonacci hash/binary search because the live journal
* entries might not form a contiguous range:
*/
for (i = 0; i < ja->nr; i++)
read_bucket(i);
goto search_done;
if (!blk_queue_nonrot(q))
goto linear_scan;
/*
* Read journal buckets ordered by golden ratio hash to quickly
* find a sequence of buckets with valid journal entries
*/
for (i = 0; i < ja->nr; i++) {
l = (i * 2654435769U) % ja->nr;
ret = journal_read_bucket(ca, &buf, jlist, i);
if (ret)
goto err;
}
if (test_bit(l, bitmap))
break;
/* Find the journal bucket with the highest sequence number: */
for (i = 0; i < ja->nr; i++) {
if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx])
ja->cur_idx = i;
if (read_bucket(l))
goto bsearch;
min_seq = min(ja->bucket_seq[i], min_seq);
}
/*
* If that fails, check all the buckets we haven't checked
* already
*/
pr_debug("falling back to linear search");
linear_scan:
for (l = find_first_zero_bit(bitmap, ja->nr);
l < ja->nr;
l = find_next_zero_bit(bitmap, ja->nr, l + 1))
if (read_bucket(l))
goto bsearch;
/* no journal entries on this device? */
if (l == ja->nr)
goto out;
bsearch:
/* Binary search */
r = find_next_bit(bitmap, ja->nr, l + 1);
pr_debug("starting binary search, l %u r %u", l, r);
while (l + 1 < r) {
unsigned m = (l + r) >> 1;
u64 cur_seq = seq;
read_bucket(m);
if (cur_seq != seq)
l = m;
else
r = m;
}
search_done:
/*
* Find the journal bucket with the highest sequence number:
*
* If there's duplicate journal entries in multiple buckets (which
* definitely isn't supposed to happen, but...) - make sure to start
* cur_idx at the last of those buckets, so we don't deadlock trying to
* allocate
*/
seq = 0;
while (ja->bucket_seq[ja->cur_idx] > min_seq &&
ja->bucket_seq[ja->cur_idx] >
ja->bucket_seq[(ja->cur_idx + 1) % ja->nr])
ja->cur_idx++;
for (i = 0; i < ja->nr; i++)
if (ja->bucket_seq[i] >= seq &&
ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) {
/*
* When journal_next_bucket() goes to allocate for
* the first time, it'll use the bucket after
* ja->cur_idx
*/
ja->cur_idx = i;
seq = ja->bucket_seq[i];
}
ja->sectors_free = 0;
/*
* Set last_idx to indicate the entire journal is full and needs to be
@ -656,17 +583,6 @@ search_done:
* pinned when it first runs:
*/
ja->last_idx = (ja->cur_idx + 1) % ja->nr;
/*
* Read buckets in reverse order until we stop finding more journal
* entries:
*/
for (i = (ja->cur_idx + ja->nr - 1) % ja->nr;
i != ja->cur_idx;
i = (i + ja->nr - 1) % ja->nr)
if (!test_bit(i, bitmap) &&
!read_bucket(i))
break;
out:
kvpfree(buf.data, buf.size);
percpu_ref_put(&ca->io_ref);
@ -677,7 +593,6 @@ err:
jlist->ret = ret;
mutex_unlock(&jlist->lock);
goto out;
#undef read_bucket
}
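
The refactor above drops the golden-ratio hash, binary search and on-stack bitmap in favour of simply reading every journal bucket, recording the highest sequence number seen in each, and resuming writes after the newest one. A rough standalone illustration of that selection step, with invented names and fixed example data rather than the real on-disk logic:

	#include <stdio.h>

	/* Return the index of the bucket holding the newest journal entry. */
	static unsigned pick_cur_idx(const unsigned long long *bucket_seq, unsigned nr)
	{
		unsigned i, cur = 0;

		for (i = 0; i < nr; i++)
			if (bucket_seq[i] > bucket_seq[cur])
				cur = i;
		return cur;
	}

	int main(void)
	{
		unsigned long long seq[] = { 40, 41, 42, 37, 38, 39 };

		/* newest entry is in bucket 2, so writing resumes after it */
		printf("cur_idx = %u\n", pick_cur_idx(seq, 6));
		return 0;
	}
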
void bch2_journal_entries_free(struct list_head *list)
@ -865,7 +780,6 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
int ret = 0;
list_for_each_entry_safe(i, n, list, list) {
j->replay_journal_seq = le64_to_cpu(i->j.seq);
for_each_jset_key(k, _n, entry, &i->j) {
@ -875,7 +789,7 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
* allocation code handles replay for
* BTREE_ID_ALLOC keys:
*/
ret = bch2_alloc_replay_key(c, k->k.p);
ret = bch2_alloc_replay_key(c, k);
} else {
/*
* We might cause compressed extents to be
@ -886,9 +800,9 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
bch2_disk_reservation_init(c, 0);
ret = bch2_btree_insert(c, entry->btree_id, k,
&disk_res, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_JOURNAL_REPLAY);
&disk_res, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_JOURNAL_REPLAY);
}
if (ret) {
@ -932,32 +846,18 @@ static void bch2_journal_add_btree_root(struct journal_buf *buf,
}
static unsigned journal_dev_buckets_available(struct journal *j,
struct bch_dev *ca)
struct journal_device *ja)
{
struct journal_device *ja = &ca->journal;
unsigned next = (ja->cur_idx + 1) % ja->nr;
unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
/*
* Hack to avoid a deadlock during journal replay:
* journal replay might require setting a new btree
* root, which requires writing another journal entry -
* thus, if the journal is full (and this happens when
* replaying the first journal bucket's entries) we're
* screwed.
*
* So don't let the journal fill up unless we're in
* replay:
*/
if (test_bit(JOURNAL_REPLAY_DONE, &j->flags))
available = max((int) available - 2, 0);
/*
* Don't use the last bucket unless writing the new last_seq
* will make another bucket available:
*/
if (ja->bucket_seq[ja->last_idx] >= journal_last_seq(j))
available = max((int) available - 1, 0);
if (available &&
journal_last_seq(j) <= ja->bucket_seq[ja->last_idx])
--available;
return available;
}
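
For the arithmetic in journal_dev_buckets_available(): the journal buckets form a ring, with cur_idx the bucket currently being written and last_idx the oldest bucket still pinned by unreclaimed entries, so the free-bucket count is the distance from the slot after cur_idx to last_idx, modulo nr. A tiny sketch with made-up numbers (not the kernel code):

	#include <stdio.h>

	static unsigned buckets_available(unsigned nr, unsigned cur_idx,
					  unsigned last_idx)
	{
		unsigned next = (cur_idx + 1) % nr;

		return (last_idx + nr - next) % nr;
	}

	int main(void)
	{
		/* 8 buckets, currently writing bucket 5, reclaim has advanced to bucket 2 */
		printf("%u buckets available\n", buckets_available(8, 5, 2));	/* prints 4 */
		return 0;
	}
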
@ -967,7 +867,6 @@ int bch2_journal_entry_sectors(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
unsigned sectors_available = UINT_MAX;
unsigned i, nr_online = 0, nr_devs = 0;
@ -977,38 +876,39 @@ int bch2_journal_entry_sectors(struct journal *j)
for_each_member_device_rcu(ca, c, i,
&c->rw_devs[BCH_DATA_JOURNAL]) {
struct journal_device *ja = &ca->journal;
unsigned buckets_required = 0;
unsigned buckets_this_device, sectors_this_device;
if (!ja->nr)
continue;
sectors_available = min_t(unsigned, sectors_available,
ca->mi.bucket_size);
buckets_this_device = journal_dev_buckets_available(j, ja);
sectors_this_device = ja->sectors_free;
nr_online++;
/*
* Note that we don't allocate the space for a journal entry
* until we write it out - thus, if we haven't started the write
* for the previous entry we have to make sure we have space for
* it too:
* Note that we don't allocate the space for a journal entry
* until we write it out - thus, account for it here:
*/
if (bch2_extent_has_device(e.c, ca->dev_idx)) {
if (j->prev_buf_sectors > ja->sectors_free)
buckets_required++;
if (j->prev_buf_sectors >= sectors_this_device) {
if (!buckets_this_device)
continue;
if (j->prev_buf_sectors + sectors_available >
ja->sectors_free)
buckets_required++;
} else {
if (j->prev_buf_sectors + sectors_available >
ca->mi.bucket_size)
buckets_required++;
buckets_required++;
buckets_this_device--;
sectors_this_device = ca->mi.bucket_size;
}
if (journal_dev_buckets_available(j, ca) >= buckets_required)
nr_devs++;
nr_online++;
sectors_this_device -= j->prev_buf_sectors;
if (buckets_this_device)
sectors_this_device = ca->mi.bucket_size;
if (!sectors_this_device)
continue;
sectors_available = min(sectors_available,
sectors_this_device);
nr_devs++;
}
rcu_read_unlock();
@ -1021,6 +921,61 @@ int bch2_journal_entry_sectors(struct journal *j)
return sectors_available;
}
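
As a rough model of the per-device loop above: start from the space left in the device's current journal bucket, fall back to a fresh bucket when the still-unwritten previous entry wouldn't fit, subtract that previous entry, and let any remaining free bucket raise the contribution to a full bucket; the entry size is then the minimum over all devices. The sketch below uses invented types and a single device, so treat it as an illustration of the accounting rather than the real function:

	#include <stdio.h>

	struct fake_jdev {
		unsigned bucket_size;
		unsigned sectors_free;		/* left in the current bucket */
		unsigned buckets_available;	/* free buckets after this one */
	};

	static unsigned dev_sectors(const struct fake_jdev *d, unsigned prev_buf_sectors)
	{
		unsigned buckets = d->buckets_available;
		unsigned sectors = d->sectors_free;

		/* previous (unwritten) entry doesn't fit: it takes a fresh bucket */
		if (prev_buf_sectors >= sectors) {
			if (!buckets)
				return 0;
			buckets--;
			sectors = d->bucket_size;
		}

		sectors -= prev_buf_sectors;

		/* another free bucket means the new entry can get a whole bucket */
		if (buckets)
			sectors = d->bucket_size;

		return sectors;
	}

	int main(void)
	{
		struct fake_jdev d = { .bucket_size = 256, .sectors_free = 32,
				       .buckets_available = 3 };

		/* previous entry of 64 sectors doesn't fit in the current bucket */
		printf("%u sectors usable\n", dev_sectors(&d, 64));	/* prints 256 */
		return 0;
	}
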
static void __journal_write_alloc(struct journal *j,
struct journal_buf *w,
struct dev_alloc_list *devs_sorted,
unsigned sectors,
unsigned *replicas,
unsigned replicas_want)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bkey_i_extent *e = bkey_i_to_extent(&w->key);
struct journal_device *ja;
struct bch_dev *ca;
unsigned i;
if (*replicas >= replicas_want)
return;
for (i = 0; i < devs_sorted->nr; i++) {
ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
if (!ca)
continue;
ja = &ca->journal;
/*
* Check that we can use this device, and aren't already using
* it:
*/
if (!ca->mi.durability ||
ca->mi.state != BCH_MEMBER_STATE_RW ||
!ja->nr ||
bch2_extent_has_device(extent_i_to_s_c(e), ca->dev_idx) ||
sectors > ja->sectors_free)
continue;
bch2_dev_stripe_increment(c, ca, &j->wp.stripe);
extent_ptr_append(e,
(struct bch_extent_ptr) {
.offset = bucket_to_sector(ca,
ja->buckets[ja->cur_idx]) +
ca->mi.bucket_size -
ja->sectors_free,
.dev = ca->dev_idx,
});
ja->sectors_free -= sectors;
ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
*replicas += ca->mi.durability;
if (*replicas >= replicas_want)
break;
}
}
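
The helper above walks candidate devices in stripe order and appends a pointer per usable device until enough durability has been accumulated. A hedged, self-contained approximation of that selection loop (fake device array, no extent keys, invented names):

	#include <stdbool.h>
	#include <stdio.h>

	struct fake_dev {
		unsigned durability;
		unsigned sectors_free;
		bool	 already_used;
	};

	static unsigned pick_replicas(struct fake_dev *devs, unsigned nr,
				      unsigned sectors, unsigned replicas_want)
	{
		unsigned i, replicas = 0;

		for (i = 0; i < nr && replicas < replicas_want; i++) {
			struct fake_dev *d = &devs[i];

			/* skip devices we can't use or are already writing to */
			if (!d->durability || d->already_used ||
			    sectors > d->sectors_free)
				continue;

			d->already_used = true;
			d->sectors_free -= sectors;
			replicas += d->durability;
		}
		return replicas;
	}

	int main(void)
	{
		struct fake_dev devs[] = {
			{ .durability = 1, .sectors_free = 128 },
			{ .durability = 0, .sectors_free = 512 },	/* skipped */
			{ .durability = 1, .sectors_free = 512 },
		};

		printf("got %u replicas\n", pick_replicas(devs, 3, 64, 2));	/* prints 2 */
		return 0;
	}
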
/**
* journal_next_bucket - move on to the next journal bucket if possible
*/
@ -1028,100 +983,49 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
unsigned sectors)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
struct journal_device *ja;
struct bch_dev *ca;
struct dev_alloc_list devs_sorted;
unsigned i, replicas, replicas_want =
unsigned i, replicas = 0, replicas_want =
READ_ONCE(c->opts.metadata_replicas);
spin_lock(&j->lock);
e = bkey_i_to_s_extent(&j->key);
/*
* Drop any pointers to devices that have been removed, are no longer
* empty, or filled up their current journal bucket:
*
* Note that a device may have had a small amount of free space (perhaps
* one sector) that wasn't enough for the smallest possible journal
* entry - that's why we drop pointers to devices <= current free space,
* i.e. whichever device was limiting the current journal entry size.
*/
bch2_extent_drop_ptrs(e, ptr, ({
ca = bch_dev_bkey_exists(c, ptr->dev);
ca->mi.state != BCH_MEMBER_STATE_RW ||
ca->journal.sectors_free <= sectors;
}));
extent_for_each_ptr(e, ptr) {
ca = bch_dev_bkey_exists(c, ptr->dev);
BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW ||
ca->journal.sectors_free <= sectors);
ca->journal.sectors_free -= sectors;
}
replicas = bch2_extent_nr_ptrs(e.c);
rcu_read_lock();
devs_sorted = bch2_wp_alloc_list(c, &j->wp,
&c->rw_devs[BCH_DATA_JOURNAL]);
devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
&c->rw_devs[BCH_DATA_JOURNAL]);
spin_lock(&j->lock);
__journal_write_alloc(j, w, &devs_sorted,
sectors, &replicas, replicas_want);
if (replicas >= replicas_want)
goto done;
for (i = 0; i < devs_sorted.nr; i++) {
ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
if (!ca)
continue;
if (!ca->mi.durability)
continue;
ja = &ca->journal;
if (!ja->nr)
continue;
if (replicas >= replicas_want)
break;
/*
* Check that we can use this device, and aren't already using
* it:
*/
if (bch2_extent_has_device(e.c, ca->dev_idx) ||
!journal_dev_buckets_available(j, ca) ||
sectors > ca->mi.bucket_size)
continue;
j->wp.next_alloc[ca->dev_idx] += U32_MAX;
bch2_wp_rescale(c, ca, &j->wp);
ja->sectors_free = ca->mi.bucket_size - sectors;
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
extent_ptr_append(bkey_i_to_extent(&j->key),
(struct bch_extent_ptr) {
.offset = bucket_to_sector(ca,
ja->buckets[ja->cur_idx]),
.dev = ca->dev_idx,
});
replicas += ca->mi.durability;
if (sectors > ja->sectors_free &&
sectors <= ca->mi.bucket_size &&
journal_dev_buckets_available(j, ja)) {
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
ja->sectors_free = ca->mi.bucket_size;
}
}
__journal_write_alloc(j, w, &devs_sorted,
sectors, &replicas, replicas_want);
done:
if (replicas >= replicas_want)
j->prev_buf_sectors = 0;
spin_unlock(&j->lock);
rcu_read_unlock();
j->prev_buf_sectors = 0;
bkey_copy(&w->key, &j->key);
spin_unlock(&j->lock);
if (replicas < c->opts.metadata_replicas_required)
return -EROFS;
BUG_ON(!replicas);
return 0;
return replicas >= replicas_want ? 0 : -EROFS;
}
static void journal_write_compact(struct jset *jset)
@ -1376,9 +1280,6 @@ void bch2_journal_write(struct closure *cl)
}
no_io:
extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr)
ptr->offset += sectors;
bch2_bucket_seq_cleanup(c);
continue_at(cl, journal_write_done, system_highpri_wq);

View File

@ -125,7 +125,8 @@ void bch2_journal_reclaim_fast(struct journal *j)
* Unpin journal entries whose reference counts reached zero, meaning
* all btree nodes got written out
*/
while (!atomic_read(&fifo_peek_front(&j->pin).count)) {
while (!fifo_empty(&j->pin) &&
!atomic_read(&fifo_peek_front(&j->pin).count)) {
BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
BUG_ON(!fifo_pop(&j->pin, temp));
popped = true;

View File

@ -184,7 +184,6 @@ struct journal {
struct list_head seq_blacklist;
struct journal_seq_blacklist *new_blacklist;
BKEY_PADDED(key);
struct write_point wp;
spinlock_t err_lock;

View File

@ -278,11 +278,37 @@ int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)
case Opt_background_compression:
ret = bch2_check_set_has_compressed_data(c, v);
break;
case Opt_erasure_code:
if (v &&
!(c->sb.features & (1ULL << BCH_FEATURE_EC))) {
mutex_lock(&c->sb_lock);
c->disk_sb.sb->features[0] |=
cpu_to_le64(1ULL << BCH_FEATURE_EC);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}
break;
}
return ret;
}
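
The new Opt_erasure_code case follows a check-and-set pattern: enabling an option that depends on an on-disk feature first sets the corresponding feature bit in the superblock and writes it out. A simplified illustration of just that pattern, with an invented feature constant and no locking or superblock I/O:

	#include <stdint.h>
	#include <stdio.h>

	#define FAKE_FEATURE_EC 4

	static uint64_t sb_features;

	static void maybe_set_feature(int enable, unsigned feature_bit)
	{
		if (enable && !(sb_features & (1ULL << feature_bit))) {
			sb_features |= 1ULL << feature_bit;
			/* the real code writes the superblock at this point */
		}
	}

	int main(void)
	{
		maybe_set_feature(1, FAKE_FEATURE_EC);
		printf("features: 0x%llx\n", (unsigned long long) sb_features);
		return 0;
	}
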
int bch2_opts_check_may_set(struct bch_fs *c)
{
unsigned i;
int ret;
for (i = 0; i < bch2_opts_nr; i++) {
ret = bch2_opt_check_may_set(c, i,
bch2_opt_get_by_id(&c->opts, i));
if (ret)
return ret;
}
return 0;
}
int bch2_parse_mount_opts(struct bch_opts *opts, char *options)
{
char *opt, *name, *val;

View File

@ -110,6 +110,9 @@ enum opt_type {
BCH_OPT(promote_target, u16, OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_PROMOTE_TARGET, 0) \
BCH_OPT(erasure_code, u16, OPT_RUNTIME, \
OPT_BOOL(), \
BCH_SB_ERASURE_CODE, false) \
BCH_OPT(inodes_32bit, u8, OPT_RUNTIME, \
OPT_BOOL(), \
BCH_SB_INODE_32BIT, false) \
@ -266,6 +269,7 @@ void bch2_opt_to_text(struct printbuf *, struct bch_fs *,
const struct bch_option *, u64, unsigned);
int bch2_opt_check_may_set(struct bch_fs *, int, u64);
int bch2_opts_check_may_set(struct bch_fs *);
int bch2_parse_mount_opts(struct bch_opts *, char *);
/* inode opts: */
@ -277,7 +281,8 @@ int bch2_parse_mount_opts(struct bch_opts *, char *);
BCH_INODE_OPT(data_replicas, 8) \
BCH_INODE_OPT(promote_target, 16) \
BCH_INODE_OPT(foreground_target, 16) \
BCH_INODE_OPT(background_target, 16)
BCH_INODE_OPT(background_target, 16) \
BCH_INODE_OPT(erasure_code, 16)
struct bch_io_opts {
#define BCH_INODE_OPT(_name, _bits) unsigned _name##_defined:1;

View File

@ -6,6 +6,7 @@
#include "btree_update_interior.h"
#include "btree_io.h"
#include "dirent.h"
#include "ec.h"
#include "error.h"
#include "fsck.h"
#include "journal_io.h"
@ -212,6 +213,11 @@ int bch2_fs_recovery(struct bch_fs *c)
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
err = "cannot allocate memory";
ret = bch2_fs_ec_start(c);
if (ret)
goto err;
bch_verbose(c, "starting mark and sweep:");
err = "error in recovery";
ret = bch2_initial_gc(c, &journal);

View File

@ -79,9 +79,33 @@ static void extent_to_replicas(struct bkey_s_c k,
r->nr_required = 1;
extent_for_each_ptr_decode(e, p, entry)
if (!p.ptr.cached)
r->devs[r->nr_devs++] = p.ptr.dev;
extent_for_each_ptr_decode(e, p, entry) {
if (p.ptr.cached)
continue;
if (p.ec_nr) {
r->nr_devs = 0;
break;
}
r->devs[r->nr_devs++] = p.ptr.dev;
}
}
}
static void stripe_to_replicas(struct bkey_s_c k,
struct bch_replicas_entry *r)
{
if (k.k->type == BCH_STRIPE) {
struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
const struct bch_extent_ptr *ptr;
r->nr_required = s.v->nr_blocks - s.v->nr_redundant;
for (ptr = s.v->ptrs;
ptr < s.v->ptrs + s.v->nr_blocks;
ptr++)
r->devs[r->nr_devs++] = ptr->dev;
}
}
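
To make the new accounting concrete: every block of a stripe is a device the data lives on, but an erasure coded stripe with nr_blocks total blocks and nr_redundant parity blocks only needs nr_blocks - nr_redundant of them to reconstruct the data, which is what nr_required records. A minimal sketch with invented types (not the bcachefs structures):

	#include <stdio.h>

	struct fake_stripe {
		unsigned nr_blocks;
		unsigned nr_redundant;
		unsigned devs[8];
	};

	struct fake_replicas_entry {
		unsigned nr_required;
		unsigned nr_devs;
		unsigned devs[8];
	};

	static void fake_stripe_to_replicas(const struct fake_stripe *s,
					    struct fake_replicas_entry *r)
	{
		unsigned i;

		r->nr_required = s->nr_blocks - s->nr_redundant;
		r->nr_devs = 0;
		for (i = 0; i < s->nr_blocks; i++)
			r->devs[r->nr_devs++] = s->devs[i];
	}

	int main(void)
	{
		struct fake_stripe s = { 6, 2, { 0, 1, 2, 3, 4, 5 } };
		struct fake_replicas_entry r;

		fake_stripe_to_replicas(&s, &r);
		printf("%u devs, %u required\n", r.nr_devs, r.nr_required);	/* 6, 4 */
		return 0;
	}
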
@ -100,6 +124,10 @@ static void bkey_to_replicas(enum bkey_type type,
e->data_type = BCH_DATA_USER;
extent_to_replicas(k, e);
break;
case BKEY_TYPE_EC:
e->data_type = BCH_DATA_USER;
stripe_to_replicas(k, e);
break;
default:
break;
}

View File

@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "checksum.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io.h"
#include "journal.h"

View File

@ -19,6 +19,7 @@
#include "compress.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "fs.h"
#include "fs-io.h"
@ -395,6 +396,7 @@ static void bch2_fs_free(struct bch_fs *c)
bch2_fs_quota_exit(c);
bch2_fs_fsio_exit(c);
bch2_fs_ec_exit(c);
bch2_fs_encryption_exit(c);
bch2_fs_io_exit(c);
bch2_fs_btree_cache_exit(c);
@ -403,7 +405,7 @@ static void bch2_fs_free(struct bch_fs *c)
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
percpu_free_rwsem(&c->usage_lock);
free_percpu(c->usage_percpu);
free_percpu(c->usage[0]);
mempool_exit(&c->btree_iters_pool);
mempool_exit(&c->btree_bounce_pool);
bioset_exit(&c->btree_bio);
@ -576,6 +578,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
INIT_LIST_HEAD(&c->fsck_errors);
mutex_init(&c->fsck_error_lock);
INIT_LIST_HEAD(&c->ec_new_stripe_list);
mutex_init(&c->ec_new_stripe_lock);
mutex_init(&c->ec_stripes_lock);
spin_lock_init(&c->ec_stripes_heap_lock);
seqcount_init(&c->gc_pos_lock);
c->copy_gc_enabled = 1;
@ -631,7 +638,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
max(offsetof(struct btree_read_bio, bio),
offsetof(struct btree_write_bio, wbio.bio)),
BIOSET_NEED_BVECS) ||
!(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
!(c->usage[0] = alloc_percpu(struct bch_fs_usage)) ||
percpu_init_rwsem(&c->usage_lock) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
btree_bytes(c)) ||
@ -644,6 +651,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_io_init(c) ||
bch2_fs_encryption_init(c) ||
bch2_fs_compress_init(c) ||
bch2_fs_ec_init(c) ||
bch2_fs_fsio_init(c))
goto err;
@ -715,6 +723,10 @@ const char *bch2_fs_start(struct bch_fs *c)
if (ret)
goto err;
ret = bch2_opts_check_may_set(c);
if (ret)
goto err;
err = "dynamic fault";
if (bch2_fs_init_fault("fs_start"))
goto err;
@ -1054,8 +1066,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
return ret;
mutex_lock(&c->sb_lock);
bch2_mark_dev_superblock(ca->fs, ca,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
bch2_mark_dev_superblock(ca->fs, ca, 0);
mutex_unlock(&c->sb_lock);
bch2_dev_sysfs_online(c, ca);
@ -1340,7 +1351,7 @@ static void dev_usage_clear(struct bch_dev *ca)
for_each_possible_cpu(cpu) {
struct bch_dev_usage *p =
per_cpu_ptr(ca->usage_percpu, cpu);
per_cpu_ptr(ca->usage[0], cpu);
memset(p, 0, sizeof(*p));
}
@ -1401,8 +1412,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
* allocate the journal, reset all the marks, then remark after we
* attach...
*/
bch2_mark_dev_superblock(ca->fs, ca,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
bch2_mark_dev_superblock(ca->fs, ca, 0);
err = "journal alloc failed";
ret = bch2_dev_journal_alloc(ca);
@ -1461,8 +1471,7 @@ have_slot:
ca->disk_sb.sb->dev_idx = dev_idx;
bch2_dev_attach(c, ca, dev_idx);
bch2_mark_dev_superblock(c, ca,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
bch2_mark_dev_superblock(c, ca, 0);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);

View File

@ -18,6 +18,7 @@
#include "btree_gc.h"
#include "buckets.h"
#include "disk_groups.h"
#include "ec.h"
#include "inode.h"
#include "journal.h"
#include "keylist.h"
@ -187,6 +188,8 @@ sysfs_pd_controller_attribute(rebalance);
read_attribute(rebalance_work);
rw_attribute(promote_whole_extents);
read_attribute(new_stripes);
rw_attribute(pd_controllers_update_seconds);
read_attribute(meta_replicas_have);
@ -241,6 +244,8 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
pr_buf(&out, "\t%s:\t\t%llu\n",
bch2_data_types[type],
stats.replicas[replicas].data[type]);
pr_buf(&out, "\terasure coded:\t%llu\n",
stats.replicas[replicas].ec_data);
pr_buf(&out, "\treserved:\t%llu\n",
stats.replicas[replicas].persistent_reserved);
}
@ -309,6 +314,41 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
compressed_sectors_uncompressed << 9);
}
static ssize_t bch2_new_stripes(struct bch_fs *c, char *buf)
{
char *out = buf, *end = buf + PAGE_SIZE;
struct ec_stripe_head *h;
struct ec_stripe_new *s;
mutex_lock(&c->ec_new_stripe_lock);
list_for_each_entry(h, &c->ec_new_stripe_list, list) {
out += scnprintf(out, end - out,
"target %u algo %u redundancy %u:\n",
h->target, h->algo, h->redundancy);
if (h->s)
out += scnprintf(out, end - out,
"\tpending: blocks %u allocated %u\n",
h->s->blocks.nr,
bitmap_weight(h->s->blocks_allocated,
h->s->blocks.nr));
mutex_lock(&h->lock);
list_for_each_entry(s, &h->stripes, list)
out += scnprintf(out, end - out,
"\tin flight: blocks %u allocated %u pin %u\n",
s->blocks.nr,
bitmap_weight(s->blocks_allocated,
s->blocks.nr),
atomic_read(&s->pin));
mutex_unlock(&h->lock);
}
mutex_unlock(&c->ec_new_stripe_lock);
return out - buf;
}
SHOW(bch2_fs)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
@ -368,6 +408,9 @@ SHOW(bch2_fs)
if (attr == &sysfs_compression_stats)
return bch2_compression_stats(c, buf);
if (attr == &sysfs_new_stripes)
return bch2_new_stripes(c, buf);
#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name);
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM
@ -434,7 +477,7 @@ STORE(__bch2_fs)
bch2_coalesce(c);
if (attr == &sysfs_trigger_gc)
bch2_gc(c);
bch2_gc(c, NULL, false);
if (attr == &sysfs_prune_cache) {
struct shrink_control sc;
@ -536,6 +579,8 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_rebalance_work,
sysfs_pd_controller_files(rebalance),
&sysfs_new_stripes,
&sysfs_internal_uuid,
#define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
@ -764,6 +809,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
" meta: %llu\n"
" user: %llu\n"
" cached: %llu\n"
" erasure coded: %llu\n"
" available: %lli\n"
"sectors:\n"
" sb: %llu\n"
@ -787,6 +833,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
stats.buckets[BCH_DATA_BTREE],
stats.buckets[BCH_DATA_USER],
stats.buckets[BCH_DATA_CACHED],
stats.buckets_ec,
ca->mi.nbuckets - ca->mi.first_bucket - stats.buckets_unavailable,
stats.sectors[BCH_DATA_SB],
stats.sectors[BCH_DATA_JOURNAL],