From 4de98a2712764bceb9e0f67b1ac2f2c7862feb77 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Jan 2018 06:41:59 -0500 Subject: [PATCH] Update bcachefs sources to 02ae70070a bcachefs: Allocate new btree roots lazily --- .bcachefs_revision | 2 +- cmd_debug.c | 16 +- cmd_device.c | 8 +- cmd_format.c | 8 +- cmd_fsck.c | 8 +- cmd_key.c | 26 +- cmd_migrate.c | 18 +- include/linux/generic-radix-tree.h | 10 +- libbcachefs.c | 5 + libbcachefs/alloc.c | 409 ++++++++++----- libbcachefs/alloc.h | 1 + libbcachefs/bcachefs.h | 26 +- libbcachefs/bcachefs_format.h | 64 ++- libbcachefs/bkey.h | 9 +- libbcachefs/bkey_methods.c | 2 + libbcachefs/bset.c | 3 - libbcachefs/btree_cache.h | 4 +- libbcachefs/btree_gc.c | 28 +- libbcachefs/btree_io.c | 110 ++-- libbcachefs/btree_io.h | 8 + libbcachefs/btree_iter.c | 25 +- libbcachefs/btree_iter.h | 4 +- libbcachefs/btree_locking.h | 3 + libbcachefs/btree_types.h | 2 + libbcachefs/btree_update_interior.c | 65 +-- libbcachefs/btree_update_interior.h | 5 +- libbcachefs/buckets.c | 48 +- libbcachefs/buckets.h | 8 +- libbcachefs/buckets_types.h | 3 +- libbcachefs/chardev.c | 14 +- libbcachefs/debug.c | 2 +- libbcachefs/error.h | 3 - libbcachefs/extents.c | 4 +- libbcachefs/extents.h | 11 + libbcachefs/fifo.h | 1 + libbcachefs/fs-io.c | 298 +++++------ libbcachefs/fs-ioctl.c | 28 + libbcachefs/fs.c | 236 ++++++--- libbcachefs/fs.h | 2 + libbcachefs/fsck.c | 46 +- libbcachefs/io.c | 3 +- libbcachefs/io_types.h | 6 +- libbcachefs/journal.c | 321 ++++++------ libbcachefs/journal.h | 4 +- libbcachefs/journal_types.h | 6 - libbcachefs/migrate.c | 197 ++----- libbcachefs/move.c | 81 ++- libbcachefs/move.h | 12 +- libbcachefs/movinggc.c | 7 +- libbcachefs/opts.c | 32 +- libbcachefs/opts.h | 11 +- libbcachefs/quota.c | 786 ++++++++++++++++++++++++++++ libbcachefs/quota.h | 48 ++ libbcachefs/quota_types.h | 36 ++ libbcachefs/super-io.c | 219 ++++---- libbcachefs/super-io.h | 15 +- libbcachefs/super.c | 218 ++++---- libbcachefs/super.h | 3 +- libbcachefs/tier.c | 6 +- linux/kthread.c | 2 + 60 files changed, 2353 insertions(+), 1233 deletions(-) create mode 100644 libbcachefs/quota.c create mode 100644 libbcachefs/quota.h create mode 100644 libbcachefs/quota_types.h diff --git a/.bcachefs_revision b/.bcachefs_revision index 699d6f22..92bf9ad4 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -0b8c5d0fb7b5de6fb99030565cd2d0411da37f2b +02ae70070acc3bc4740d221efa5ff5425cf6fce5 diff --git a/cmd_debug.c b/cmd_debug.c index 1a2c1dbd..6e395bab 100644 --- a/cmd_debug.c +++ b/cmd_debug.c @@ -80,9 +80,7 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd) int cmd_dump(int argc, char *argv[]) { struct bch_opts opts = bch2_opts_empty(); - struct bch_fs *c = NULL; struct bch_dev *ca; - const char *err; char *out = NULL; unsigned i, nr_devices = 0; bool force = false; @@ -112,9 +110,9 @@ int cmd_dump(int argc, char *argv[]) if (!out) die("Please supply output filename"); - err = bch2_fs_open(argv + optind, argc - optind, opts, &c); - if (err) - die("error opening %s: %s", argv[optind], err); + struct bch_fs *c = bch2_fs_open(argv + optind, argc - optind, opts); + if (IS_ERR(c)) + die("error opening %s: %s", argv[optind], strerror(-PTR_ERR(c))); down_read(&c->gc_lock); @@ -258,10 +256,8 @@ static const char * const list_modes[] = { int cmd_list(int argc, char *argv[]) { struct bch_opts opts = bch2_opts_empty(); - struct bch_fs *c = NULL; enum btree_id btree_id = BTREE_ID_EXTENTS; struct bpos start = POS_MIN, end = POS_MAX; - const char *err; 
u64 inum; int mode = 0, opt; @@ -307,9 +303,9 @@ int cmd_list(int argc, char *argv[]) if (optind >= argc) die("Please supply device(s) to check"); - err = bch2_fs_open(argv + optind, argc - optind, opts, &c); - if (err) - die("error opening %s: %s", argv[optind], err); + struct bch_fs *c = bch2_fs_open(argv + optind, argc - optind, opts); + if (IS_ERR(c)) + die("error opening %s: %s", argv[optind], strerror(-PTR_ERR(c))); switch (mode) { case 0: diff --git a/cmd_device.c b/cmd_device.c index 22ab016f..390c48ad 100644 --- a/cmd_device.c +++ b/cmd_device.c @@ -528,11 +528,9 @@ int cmd_device_resize(int argc, char *argv[]) } else { printf("Doing offline resize of %s\n", dev); - struct bch_fs *c = NULL; - struct bch_opts opts = bch2_opts_empty(); - const char *err = bch2_fs_open(&dev, 1, opts, &c); - if (err) - die("error opening %s: %s", dev, err); + struct bch_fs *c = bch2_fs_open(&dev, 1, bch2_opts_empty()); + if (IS_ERR(c)) + die("error opening %s: %s", dev, strerror(-PTR_ERR(c))); struct bch_dev *ca, *resize = NULL; unsigned i; diff --git a/cmd_format.c b/cmd_format.c index 47617660..42e8d1a6 100644 --- a/cmd_format.c +++ b/cmd_format.c @@ -328,11 +328,11 @@ int cmd_show_super(int argc, char *argv[]) if (argc) die("too many arguments"); - const char *err; + struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb; - err = bch2_read_super(dev, bch2_opts_empty(), &sb); - if (err) - die("Error opening %s: %s", dev, err); + int ret = bch2_read_super(dev, &opts, &sb); + if (ret) + die("Error opening %s: %s", dev, strerror(-ret)); bch2_sb_print(sb.sb, print_layout, fields, HUMAN_READABLE); bch2_free_super(&sb); diff --git a/cmd_fsck.c b/cmd_fsck.c index 556a4e1b..6f873b1f 100644 --- a/cmd_fsck.c +++ b/cmd_fsck.c @@ -23,8 +23,6 @@ static void usage(void) int cmd_fsck(int argc, char *argv[]) { struct bch_opts opts = bch2_opts_empty(); - struct bch_fs *c = NULL; - const char *err; int opt; opt_set(opts, degraded, true); @@ -56,9 +54,9 @@ int cmd_fsck(int argc, char *argv[]) if (optind >= argc) die("Please supply device(s) to check"); - err = bch2_fs_open(argv + optind, argc - optind, opts, &c); - if (err) - die("error opening %s: %s", argv[optind], err); + struct bch_fs *c = bch2_fs_open(argv + optind, argc - optind, opts); + if (IS_ERR(c)) + die("error opening %s: %s", argv[optind], strerror(-PTR_ERR(c))); bch2_fs_stop(c); return 0; diff --git a/cmd_key.c b/cmd_key.c index 879163f1..e670b508 100644 --- a/cmd_key.c +++ b/cmd_key.c @@ -9,16 +9,16 @@ int cmd_unlock(int argc, char *argv[]) { + struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb; - const char *err; char *passphrase; if (argc != 2) die("Please supply a single device"); - err = bch2_read_super(argv[1], bch2_opts_empty(), &sb); - if (err) - die("Error opening %s: %s", argv[1], err); + int ret = bch2_read_super(argv[1], &opts, &sb); + if (ret) + die("Error opening %s: %s", argv[1], strerror(-ret)); passphrase = read_passphrase("Enter passphrase: "); @@ -32,16 +32,15 @@ int cmd_unlock(int argc, char *argv[]) int cmd_set_passphrase(int argc, char *argv[]) { struct bch_opts opts = bch2_opts_empty(); - struct bch_fs *c = NULL; - const char *err; + struct bch_fs *c; if (argc < 2) die("Please supply one or more devices"); opt_set(opts, nostart, true); - err = bch2_fs_open(argv + 1, argc - 1, opts, &c); - if (err) - die("Error opening %s: %s", argv[1], err); + c = bch2_fs_open(argv + 1, argc - 1, opts); + if (IS_ERR(c)) + die("Error opening %s: %s", argv[1], strerror(-PTR_ERR(c))); struct bch_sb_field_crypt *crypt = 
bch2_sb_get_crypt(c->disk_sb); if (!crypt) @@ -70,16 +69,15 @@ int cmd_set_passphrase(int argc, char *argv[]) int cmd_remove_passphrase(int argc, char *argv[]) { struct bch_opts opts = bch2_opts_empty(); - struct bch_fs *c = NULL; - const char *err; + struct bch_fs *c; if (argc < 2) die("Please supply one or more devices"); opt_set(opts, nostart, true); - err = bch2_fs_open(argv + 1, argc - 1, opts, &c); - if (err) - die("Error opening %s: %s", argv[1], err); + c = bch2_fs_open(argv + 1, argc - 1, opts); + if (IS_ERR(c)) + die("Error opening %s: %s", argv[1], strerror(-PTR_ERR(c))); struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb); if (!crypt) diff --git a/cmd_migrate.c b/cmd_migrate.c index d82fee6d..1c449554 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -334,7 +334,8 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst, die("error reserving space in new filesystem: %s", strerror(-ret)); - bch2_check_mark_super(c, extent_i_to_s_c(e), false); + bch2_check_mark_super(c, BCH_DATA_USER, + bch2_bkey_devs(extent_i_to_s_c(e).s_c)); ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &e->k_i, &res, NULL, NULL, 0); @@ -734,19 +735,18 @@ int cmd_migrate(int argc, char *argv[]) struct bch_opts opts = bch2_opts_empty(); struct bch_fs *c = NULL; char *path[1] = { dev.path }; - const char *err; opt_set(opts, sb, sb_offset); opt_set(opts, nostart, true); opt_set(opts, noexcl, true); - err = bch2_fs_open(path, 1, opts, &c); - if (err) - die("Error opening new filesystem: %s", err); + c = bch2_fs_open(path, 1, opts); + if (IS_ERR(c)) + die("Error opening new filesystem: %s", strerror(-PTR_ERR(c))); mark_unreserved_space(c, extents); - err = bch2_fs_start(c); + const char *err = bch2_fs_start(c); if (err) die("Error starting new filesystem: %s", err); @@ -758,9 +758,9 @@ int cmd_migrate(int argc, char *argv[]) opt_set(opts, nostart, false); opt_set(opts, nochanges, true); - err = bch2_fs_open(path, 1, opts, &c); - if (err) - die("Error opening new filesystem: %s", err); + c = bch2_fs_open(path, 1, opts); + if (IS_ERR(c)) + die("Error opening new filesystem: %s", strerror(-PTR_ERR(c))); bch2_fs_stop(c); printf("fsck complete\n"); diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h index 6ea2deb2..7f637e17 100644 --- a/include/linux/generic-radix-tree.h +++ b/include/linux/generic-radix-tree.h @@ -99,11 +99,11 @@ struct genradix_iter { size_t pos; }; -static inline void genradix_iter_init(struct genradix_iter *iter) -{ - iter->offset = 0; - iter->pos = 0; -} +#define genradix_iter_init(_radix, _idx) \ + ((struct genradix_iter) { \ + .pos = (_idx), \ + .offset = __genradix_idx_to_offset((_radix), (_idx)),\ + }) void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t); diff --git a/libbcachefs.c b/libbcachefs.c index 1481ef38..3632e30d 100644 --- a/libbcachefs.c +++ b/libbcachefs.c @@ -454,6 +454,11 @@ static void bch2_sb_print_replicas(struct bch_sb *sb, struct bch_sb_field *f, } } +static void bch2_sb_print_quota(struct bch_sb *sb, struct bch_sb_field *f, + enum units units) +{ +} + typedef void (*sb_field_print_fn)(struct bch_sb *, struct bch_sb_field *, enum units); struct bch_sb_field_ops { diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c index ec02adc0..f7ff8027 100644 --- a/libbcachefs/alloc.c +++ b/libbcachefs/alloc.c @@ -55,6 +55,8 @@ #include "bcachefs.h" #include "alloc.h" +#include "btree_cache.h" +#include "btree_io.h" #include "btree_update.h" #include "btree_gc.h" #include "buckets.h" @@ -290,9 +292,6 
@@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) unsigned i; int ret; - if (!c->btree_roots[BTREE_ID_ALLOC].b) - return 0; - for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS_MIN, 0, k) { bch2_alloc_read_key(c, k); bch2_btree_iter_cond_resched(&iter); @@ -401,7 +400,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos) return ret; } -static int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca, u64 *journal_seq) +static int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca) { struct btree_iter iter; unsigned long bucket; @@ -412,7 +411,7 @@ static int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca, u64 *journal_s down_read(&ca->bucket_lock); for_each_set_bit(bucket, ca->buckets_dirty, ca->mi.nbuckets) { - ret = __bch2_alloc_write_key(c, ca, bucket, &iter, journal_seq); + ret = __bch2_alloc_write_key(c, ca, bucket, &iter, NULL); if (ret) break; @@ -537,7 +536,8 @@ static void bch2_prio_timer_init(struct bch_fs *c, int rw) static void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, size_t bucket) { - if (expensive_debug_checks(c)) { + if (expensive_debug_checks(c) && + test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) { size_t iter; long i; unsigned j; @@ -692,7 +692,7 @@ static inline int bucket_alloc_cmp(alloc_heap *h, return (l.key > r.key) - (l.key < r.key); } -static void invalidate_buckets_lru(struct bch_fs *c, struct bch_dev *ca) +static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) { struct bucket_array *buckets; struct alloc_heap_entry e; @@ -740,7 +740,7 @@ static void invalidate_buckets_lru(struct bch_fs *c, struct bch_dev *ca) bch2_invalidate_one_bucket(c, ca, e.bucket); } -static void invalidate_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) +static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) { struct bucket_array *buckets = bucket_array(ca); struct bucket_mark m; @@ -762,7 +762,7 @@ static void invalidate_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) } } -static void invalidate_buckets_random(struct bch_fs *c, struct bch_dev *ca) +static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca) { struct bucket_array *buckets = bucket_array(ca); struct bucket_mark m; @@ -782,21 +782,21 @@ static void invalidate_buckets_random(struct bch_fs *c, struct bch_dev *ca) } } -static void invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) +static void find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) { ca->inc_gen_needs_gc = 0; ca->inc_gen_really_needs_gc = 0; switch (ca->mi.replacement) { - case CACHE_REPLACEMENT_LRU: - invalidate_buckets_lru(c, ca); - break; - case CACHE_REPLACEMENT_FIFO: - invalidate_buckets_fifo(c, ca); - break; - case CACHE_REPLACEMENT_RANDOM: - invalidate_buckets_random(c, ca); - break; + case CACHE_REPLACEMENT_LRU: + find_reclaimable_buckets_lru(c, ca); + break; + case CACHE_REPLACEMENT_FIFO: + find_reclaimable_buckets_fifo(c, ca); + break; + case CACHE_REPLACEMENT_RANDOM: + find_reclaimable_buckets_random(c, ca); + break; } } @@ -807,79 +807,119 @@ static int size_t_cmp(const void *_l, const void *_r) return (*l > *r) - (*l < *r); } +static void sort_free_inc(struct bch_fs *c, struct bch_dev *ca) +{ + BUG_ON(ca->free_inc.front); + + spin_lock(&c->freelist_lock); + sort(ca->free_inc.data, + ca->free_inc.back, + sizeof(ca->free_inc.data[0]), + size_t_cmp, NULL); + spin_unlock(&c->freelist_lock); +} + static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca, - u64 *journal_seq) + u64 
*journal_seq, size_t nr) { struct btree_iter iter; - unsigned nr_invalidated = 0; - size_t b, i; int ret = 0; bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), BTREE_ITER_INTENT); - fifo_for_each_entry(b, &ca->free_inc, i) { + /* + * XXX: if ca->nr_invalidated != 0, just return if we'd block doing the + * btree update or journal_res_get + */ + while (ca->nr_invalidated < min(nr, fifo_used(&ca->free_inc))) { + size_t b = fifo_idx_entry(&ca->free_inc, ca->nr_invalidated); + ret = __bch2_alloc_write_key(c, ca, b, &iter, journal_seq); if (ret) break; - nr_invalidated++; + ca->nr_invalidated++; } bch2_btree_iter_unlock(&iter); - return nr_invalidated ?: ret; + return ret; } -/* - * Given an invalidated, ready to use bucket: issue a discard to it if enabled, - * then add it to the freelist, waiting until there's room if necessary: - */ -static void discard_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, - long bucket) +static bool __push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) { - if (ca->mi.discard && - blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, bucket), - ca->mi.bucket_size, GFP_NOIO, 0); + unsigned i; + + /* + * Don't remove from free_inc until after it's added to + * freelist, so gc can find it: + */ + spin_lock(&c->freelist_lock); + for (i = 0; i < RESERVE_NR; i++) + if (fifo_push(&ca->free[i], bucket)) { + fifo_pop(&ca->free_inc, bucket); + --ca->nr_invalidated; + closure_wake_up(&c->freelist_wait); + spin_unlock(&c->freelist_lock); + return true; + } + spin_unlock(&c->freelist_lock); + + return false; +} + +static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) +{ + int ret = 0; while (1) { - bool pushed = false; - unsigned i; - set_current_state(TASK_INTERRUPTIBLE); - /* - * Don't remove from free_inc until after it's added to - * freelist, so gc can find it: - */ - spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) - if (fifo_push(&ca->free[i], bucket)) { - fifo_pop(&ca->free_inc, bucket); - closure_wake_up(&c->freelist_wait); - pushed = true; - break; - } - spin_unlock(&c->freelist_lock); - - if (pushed) + if (__push_invalidated_bucket(c, ca, bucket)) break; - if (kthread_should_stop()) + if ((current->flags & PF_KTHREAD) && + kthread_should_stop()) { + ret = -1; break; + } schedule(); try_to_freeze(); } __set_current_state(TASK_RUNNING); + return ret; +} + +/* + * Given an invalidated, ready to use bucket: issue a discard to it if enabled, + * then add it to the freelist, waiting until there's room if necessary: + */ +static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) +{ + while (ca->nr_invalidated) { + size_t bucket = fifo_peek(&ca->free_inc); + + BUG_ON(fifo_empty(&ca->free_inc) || !ca->nr_invalidated); + + if (ca->mi.discard && + blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, bucket), + ca->mi.bucket_size, GFP_NOIO, 0); + + if (push_invalidated_bucket(c, ca, bucket)) + return -1; + } + + return 0; } /** * bch_allocator_thread - move buckets from free_inc to reserves * - * The free_inc FIFO is populated by invalidate_buckets(), and + * The free_inc FIFO is populated by find_reclaimable_buckets(), and * the reserves are depleted by bucket allocation. When we run out * of free_inc, try to invalidate some buckets and write out * prios and gens. 
@@ -889,43 +929,36 @@ static int bch2_allocator_thread(void *arg) struct bch_dev *ca = arg; struct bch_fs *c = ca->fs; u64 journal_seq; - size_t bucket; int ret; set_freezable(); while (1) { while (1) { - while (ca->nr_invalidated) { - BUG_ON(fifo_empty(&ca->free_inc)); - - bucket = fifo_peek(&ca->free_inc); - discard_invalidated_bucket(c, ca, bucket); - if (kthread_should_stop()) - return 0; - --ca->nr_invalidated; - } + ret = discard_invalidated_buckets(c, ca); + if (ret) + return 0; if (fifo_empty(&ca->free_inc)) break; journal_seq = 0; - ret = bch2_invalidate_free_inc(c, ca, &journal_seq); - if (ret < 0) + ret = bch2_invalidate_free_inc(c, ca, &journal_seq, SIZE_MAX); + if (ret) return 0; - ca->nr_invalidated = ret; - - if (ca->nr_invalidated == fifo_used(&ca->free_inc)) { - ca->alloc_thread_started = true; - bch2_alloc_write(c, ca, &journal_seq); - } - if (ca->allocator_invalidating_data) - bch2_journal_flush_seq(&c->journal, journal_seq); + ret = bch2_journal_flush_seq(&c->journal, journal_seq); else if (ca->allocator_journal_seq_flush) - bch2_journal_flush_seq(&c->journal, + ret = bch2_journal_flush_seq(&c->journal, ca->allocator_journal_seq_flush); + + /* + * journal error - buckets haven't actually been + * invalidated, can't discard them: + */ + if (ret) + return 0; } /* Reset front/back so we can easily sort fifo entries later: */ @@ -947,7 +980,7 @@ static int bch2_allocator_thread(void *arg) * another cache tier */ - invalidate_buckets(c, ca); + find_reclaimable_buckets(c, ca); trace_alloc_batch(ca, fifo_used(&ca->free_inc), ca->free_inc.size); @@ -970,14 +1003,7 @@ static int bch2_allocator_thread(void *arg) } up_read(&c->gc_lock); - BUG_ON(ca->free_inc.front); - - spin_lock(&c->freelist_lock); - sort(ca->free_inc.data, - ca->free_inc.back, - sizeof(ca->free_inc.data[0]), - size_t_cmp, NULL); - spin_unlock(&c->freelist_lock); + sort_free_inc(c, ca); /* * free_inc is now full of newly-invalidated buckets: next, @@ -1037,51 +1063,27 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) return ob; } -/* - * XXX: allocation on startup is still sketchy. There is insufficient - * synchronization for bch2_bucket_alloc_startup() to work correctly after - * bch2_alloc_write() has been called, and we aren't currently doing anything - * to guarantee that this won't happen. - * - * Even aside from that, it's really difficult to avoid situations where on - * startup we write out a pointer to a freshly allocated bucket before the - * corresponding gen - when we're still digging ourself out of the "i need to - * allocate to write bucket gens, but i need to write bucket gens to allocate" - * hole. - * - * Fortunately, bch2_btree_mark_key_initial() will detect and repair this - * easily enough... 
- */ -static long bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca) +/* _only_ for allocating the journal and btree roots on a brand new fs: */ +int bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca) { struct bucket_array *buckets; ssize_t b; - if (!down_read_trylock(&c->gc_lock)) - return -1; - - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) { - up_read(&c->gc_lock); - return -1; - } - - spin_unlock(&c->freelist_lock); - - down_read(&ca->bucket_lock); + rcu_read_lock(); buckets = bucket_array(ca); - spin_lock(&c->freelist_lock); - for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) - if (is_startup_available_bucket(buckets->b[b].mark) && - bch2_mark_alloc_bucket_startup(c, ca, b)) { + if (is_available_bucket(buckets->b[b].mark)) { + bch2_mark_alloc_bucket(c, ca, b, true, + gc_pos_alloc(c, NULL), + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); set_bit(b, ca->buckets_dirty); goto success; } b = -1; success: - up_read(&ca->bucket_lock); - up_read(&c->gc_lock); + rcu_read_unlock(); return b; } @@ -1150,8 +1152,7 @@ int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, break; } - if (unlikely(!ca->alloc_thread_started) && - (reserve == RESERVE_ALLOC) && + if (unlikely(test_bit(BCH_FS_BRAND_NEW_FS, &c->flags)) && (bucket = bch2_bucket_alloc_startup(c, ca)) >= 0) goto out; @@ -1858,6 +1859,172 @@ int bch2_dev_allocator_start(struct bch_dev *ca) return 0; } +static int __bch2_fs_allocator_start(struct bch_fs *c) +{ + struct bch_dev *ca; + size_t bu, i, devs_have_enough = 0; + unsigned dev_iter; + u64 journal_seq = 0; + bool invalidating_data = false; + int ret = 0; + + if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) + return -1; + + /* Scan for buckets that are already invalidated: */ + for_each_rw_member(ca, c, dev_iter) { + struct btree_iter iter; + struct bucket_mark m; + struct bkey_s_c k; + + for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), 0, k) { + if (k.k->type != BCH_ALLOC) + continue; + + bu = k.k->p.offset; + m = READ_ONCE(bucket(ca, bu)->mark); + + if (!is_available_bucket(m) || m.cached_sectors) + continue; + + bch2_mark_alloc_bucket(c, ca, bu, true, + gc_pos_alloc(c, NULL), + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); + + fifo_push(&ca->free_inc, bu); + ca->nr_invalidated++; + + if (fifo_full(&ca->free_inc)) + break; + } + bch2_btree_iter_unlock(&iter); + } + + /* did we find enough buckets? */ + for_each_rw_member(ca, c, dev_iter) + devs_have_enough += (fifo_used(&ca->free_inc) >= + ca->free[RESERVE_BTREE].size); + + if (devs_have_enough >= c->opts.metadata_replicas) + return 0; + + /* clear out free_inc - find_reclaimable_buckets() assumes it's empty */ + for_each_rw_member(ca, c, dev_iter) + discard_invalidated_buckets(c, ca); + + for_each_rw_member(ca, c, dev_iter) { + BUG_ON(!fifo_empty(&ca->free_inc)); + ca->free_inc.front = ca->free_inc.back = 0; + + find_reclaimable_buckets(c, ca); + sort_free_inc(c, ca); + + invalidating_data |= ca->allocator_invalidating_data; + + fifo_for_each_entry(bu, &ca->free_inc, i) + if (!fifo_push(&ca->free[RESERVE_BTREE], bu)) + break; + } + + /* + * We're moving buckets to freelists _before_ they've been marked as + * invalidated on disk - we have to so that we can allocate new btree + * nodes to mark them as invalidated on disk. 
+ * + * However, we can't _write_ to any of these buckets yet - they might + * have cached data in them, which is live until they're marked as + * invalidated on disk: + */ + if (invalidating_data) + set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); + + /* + * XXX: it's possible for this to deadlock waiting on journal reclaim, + * since we're holding btree writes. What then? + */ + + for_each_rw_member(ca, c, dev_iter) { + ret = bch2_invalidate_free_inc(c, ca, &journal_seq, + ca->free[RESERVE_BTREE].size); + if (ret) { + percpu_ref_put(&ca->io_ref); + return ret; + } + } + + if (invalidating_data) { + ret = bch2_journal_flush_seq(&c->journal, journal_seq); + if (ret) + return ret; + } + + for_each_rw_member(ca, c, dev_iter) + while (ca->nr_invalidated) { + BUG_ON(!fifo_pop(&ca->free_inc, bu)); + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, bu), + ca->mi.bucket_size, GFP_NOIO, 0); + ca->nr_invalidated--; + } + + set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags); + + /* now flush dirty btree nodes: */ + if (invalidating_data) { + struct bucket_table *tbl; + struct rhash_head *pos; + struct btree *b; + + clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); +again: + rcu_read_lock(); + for_each_cached_btree(b, c, tbl, i, pos) + if (btree_node_dirty(b) && (!b->written || b->level)) { + rcu_read_unlock(); + six_lock_read(&b->lock); + bch2_btree_node_write(c, b, NULL, SIX_LOCK_read); + six_unlock_read(&b->lock); + goto again; + } + rcu_read_unlock(); + } + + return 0; +} + +int bch2_fs_allocator_start(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + int ret; + + down_read(&c->gc_lock); + ret = __bch2_fs_allocator_start(c); + up_read(&c->gc_lock); + + if (ret) + return ret; + + for_each_rw_member(ca, c, i) { + ret = bch2_dev_allocator_start(ca); + if (ret) { + percpu_ref_put(&ca->io_ref); + return ret; + } + } + + for_each_rw_member(ca, c, i) { + ret = bch2_alloc_write(c, ca); + if (ret) { + percpu_ref_put(&ca->io_ref); + return ret; + } + } + + return 0; +} + void bch2_fs_allocator_init(struct bch_fs *c) { struct open_bucket *ob; diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h index ee771ee1..1b9d960b 100644 --- a/libbcachefs/alloc.h +++ b/libbcachefs/alloc.h @@ -118,6 +118,7 @@ static inline void writepoint_init(struct write_point *wp, wp->type = type; } +int bch2_fs_allocator_start(struct bch_fs *); void bch2_fs_allocator_init(struct bch_fs *); extern const struct bkey_ops bch2_bkey_alloc_ops; diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 02e38410..78c427fa 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -281,11 +281,9 @@ do { \ #include "clock_types.h" #include "journal_types.h" #include "keylist_types.h" +#include "quota_types.h" #include "super_types.h" -/* 256k, in sectors */ -#define BTREE_NODE_SIZE_MAX 512 - /* * Number of nodes we might have to allocate in a worst case btree split * operation - we split all the way up to the root, then allocate a new root. @@ -380,7 +378,6 @@ struct bch_dev { alloc_fifo free_inc; spinlock_t freelist_lock; unsigned nr_invalidated; - bool alloc_thread_started; u8 open_buckets_partial[OPEN_BUCKETS_COUNT]; unsigned open_buckets_partial_nr; @@ -423,18 +420,28 @@ struct bch_dev { * won't automatically reattach). 
*/ enum { + /* startup: */ + BCH_FS_BRAND_NEW_FS, BCH_FS_ALLOC_READ_DONE, + BCH_FS_ALLOCATOR_STARTED, BCH_FS_INITIAL_GC_DONE, + BCH_FS_FSCK_DONE, + + /* shutdown: */ BCH_FS_EMERGENCY_RO, BCH_FS_WRITE_DISABLE_COMPLETE, BCH_FS_GC_STOPPING, - BCH_FS_GC_FAILURE, - BCH_FS_BDEV_MOUNTED, + + /* errors: */ BCH_FS_ERROR, + BCH_FS_GC_FAILURE, + + /* misc: */ + BCH_FS_BDEV_MOUNTED, BCH_FS_FSCK_FIXED_ERRORS, - BCH_FS_FSCK_DONE, BCH_FS_FIXED_GENS, BCH_FS_REBUILD_REPLICAS, + BCH_FS_HOLD_BTREE_WRITES, }; struct btree_debug { @@ -517,7 +524,7 @@ struct bch_fs { struct mutex sb_lock; /* BTREE CACHE */ - struct bio_set btree_read_bio; + struct bio_set btree_bio; struct btree_root btree_roots[BTREE_ID_NR]; bool btree_roots_dirty; @@ -665,6 +672,9 @@ struct bch_fs { unsigned writeback_pages_max; atomic_long_t nr_inodes; + /* QUOTAS */ + struct bch_memquota_type quotas[QTYP_NR]; + /* DEBUG JUNK */ struct dentry *debug; struct btree_debug btree_debug[BTREE_ID_NR]; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index d65b5e66..cb9e450b 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -606,11 +606,13 @@ BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION); BCH_INODE_FIELD(bi_generation, 32) \ BCH_INODE_FIELD(bi_dev, 32) \ BCH_INODE_FIELD(bi_data_checksum, 8) \ - BCH_INODE_FIELD(bi_compression, 8) + BCH_INODE_FIELD(bi_compression, 8) \ + BCH_INODE_FIELD(bi_project, 32) #define BCH_INODE_FIELDS_INHERIT() \ BCH_INODE_FIELD(bi_data_checksum) \ - BCH_INODE_FIELD(bi_compression) + BCH_INODE_FIELD(bi_compression) \ + BCH_INODE_FIELD(bi_project) enum { /* @@ -737,6 +739,36 @@ struct bch_alloc { } __attribute__((packed, aligned(8))); BKEY_VAL_TYPE(alloc, BCH_ALLOC); +/* Quotas: */ + +enum { + BCH_QUOTA = 128, +}; + +enum quota_types { + QTYP_USR = 0, + QTYP_GRP = 1, + QTYP_PRJ = 2, + QTYP_NR = 3, +}; + +enum quota_counters { + Q_SPC = 0, + Q_INO = 1, + Q_COUNTERS = 2, +}; + +struct bch_quota_counter { + __le64 hardlimit; + __le64 softlimit; +}; + +struct bch_quota { + struct bch_val v; + struct bch_quota_counter c[Q_COUNTERS]; +} __attribute__((packed, aligned(8))); +BKEY_VAL_TYPE(quota, BCH_QUOTA); + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -749,7 +781,8 @@ struct bch_sb_field { x(journal, 0) \ x(members, 1) \ x(crypt, 2) \ - x(replicas, 3) + x(replicas, 3) \ + x(quota, 4) enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -883,6 +916,23 @@ struct bch_sb_field_replicas { struct bch_replicas_entry entries[0]; }; +/* BCH_SB_FIELD_quota: */ + +struct bch_sb_quota_counter { + __le32 timelimit; + __le32 warnlimit; +}; + +struct bch_sb_quota_type { + __le64 flags; + struct bch_sb_quota_counter c[Q_COUNTERS]; +}; + +struct bch_sb_field_quota { + struct bch_sb_field field; + struct bch_sb_quota_type q[QTYP_NR]; +} __attribute__((packed, aligned(8))); + /* Superblock: */ /* @@ -986,6 +1036,11 @@ LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52); LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56); LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57); +LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58); +LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59); +LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); + +/* 60-64 unused */ LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8); @@ -1181,7 +1236,8 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, 
flags, 4, 5); DEF_BTREE_ID(INODES, 1, "inodes") \ DEF_BTREE_ID(DIRENTS, 2, "dirents") \ DEF_BTREE_ID(XATTRS, 3, "xattrs") \ - DEF_BTREE_ID(ALLOC, 4, "alloc") + DEF_BTREE_ID(ALLOC, 4, "alloc") \ + DEF_BTREE_ID(QUOTAS, 5, "quotas") #define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val, diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index 89697956..f665e2e1 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -7,6 +7,10 @@ #include "util.h" #include "vstructs.h" +#ifdef CONFIG_X86_64 +#define HAVE_BCACHEFS_COMPILED_UNPACK 1 +#endif + void bch2_to_binary(char *, const u64 *, unsigned); #define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX) @@ -381,8 +385,7 @@ static inline u64 bkey_field_max(const struct bkey_format *f, : U64_MAX; } -#ifdef CONFIG_X86_64 -#define HAVE_BCACHEFS_COMPILED_UNPACK 1 +#ifdef HAVE_BCACHEFS_COMPILED_UNPACK int bch2_compile_bkey_format(const struct bkey_format *, void *); @@ -583,6 +586,8 @@ BKEY_VAL_ACCESSORS(xattr, BCH_XATTR); BKEY_VAL_ACCESSORS(alloc, BCH_ALLOC); +BKEY_VAL_ACCESSORS(quota, BCH_QUOTA); + /* byte order helpers */ #if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN) diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index 1736a483..3b3a09eb 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -7,6 +7,7 @@ #include "error.h" #include "extents.h" #include "inode.h" +#include "quota.h" #include "xattr.h" const struct bkey_ops *bch2_bkey_ops[] = { @@ -15,6 +16,7 @@ const struct bkey_ops *bch2_bkey_ops[] = { [BKEY_TYPE_DIRENTS] = &bch2_bkey_dirent_ops, [BKEY_TYPE_XATTRS] = &bch2_bkey_xattr_ops, [BKEY_TYPE_ALLOC] = &bch2_bkey_alloc_ops, + [BKEY_TYPE_QUOTAS] = &bch2_bkey_quota_ops, [BKEY_TYPE_BTREE] = &bch2_bkey_btree_ops, }; diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index 10f3f3f3..02be5bb4 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -1550,9 +1550,6 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, __bch2_btree_node_iter_init(iter, is_extents); - //if (bkey_cmp(search, b->curr_max_key) > 0) - // return; - switch (bch2_bkey_pack_pos_lossy(&p, search, b)) { case BKEY_PACK_POS_EXACT: packed_search = &p; diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index 46d536eb..e021d6e9 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -45,8 +45,8 @@ static inline bool btree_node_hashed(struct btree *b) } #define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ - for ((_tbl) = rht_dereference_rcu((_c)->btree_cache_table.tbl, \ - &(_c)->btree_cache_table), \ + for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \ + &(_c)->btree_cache.table), \ _iter = 0; _iter < (_tbl)->size; _iter++) \ rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 7d1be86f..9f1071e5 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -148,23 +148,24 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, { enum bch_data_type data_type = type == BKEY_TYPE_BTREE ? 
BCH_DATA_BTREE : BCH_DATA_USER; + struct bch_devs_list devs = bch2_bkey_devs(k); int ret = 0; + if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || + fsck_err_on(!bch2_sb_has_replicas(c, data_type, devs), c, + "superblock not marked as containing replicas (type %u)", + data_type)) { + ret = bch2_check_mark_super(c, data_type, devs); + if (ret) + return ret; + } + switch (k.k->type) { case BCH_EXTENT: case BCH_EXTENT_CACHED: { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const struct bch_extent_ptr *ptr; - if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_sb_has_replicas(c, e, data_type), c, - "superblock not marked as containing replicas (type %u)", - data_type)) { - ret = bch2_check_mark_super(c, e, data_type); - if (ret) - return ret; - } - extent_for_each_ptr(e, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); size_t b = PTR_BUCKET_NR(ca, ptr); @@ -284,7 +285,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id) mutex_lock(&c->btree_root_lock); b = c->btree_roots[btree_id].b; - bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0); + if (!btree_node_fake(b)) + bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0); gc_pos_set(c, gc_pos_btree_root(b->btree_id)); mutex_unlock(&c->btree_root_lock); @@ -991,8 +993,10 @@ static int bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id) if (!c->btree_roots[id].b) return 0; - ret = bch2_btree_mark_key_initial(c, BKEY_TYPE_BTREE, - bkey_i_to_s_c(&c->btree_roots[id].b->key)); + b = c->btree_roots[id].b; + if (!btree_node_fake(b)) + ret = bch2_btree_mark_key_initial(c, BKEY_TYPE_BTREE, + bkey_i_to_s_c(&b->key)); if (ret) return ret; diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 87a8ddf9..3f87e91e 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1352,7 +1352,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, return; } - bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio); + bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio); rb = container_of(bio, struct btree_read_bio, bio); rb->c = c; rb->start_time = local_clock(); @@ -1438,9 +1438,9 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b) } static void bch2_btree_node_write_error(struct bch_fs *c, - struct bch_write_bio *wbio) + struct btree_write_bio *wbio) { - struct btree *b = wbio->bio.bi_private; + struct btree *b = wbio->wbio.bio.bi_private; struct closure *cl = wbio->cl; __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; struct bkey_i_extent *new_key; @@ -1473,7 +1473,7 @@ retry: new_key = bkey_i_to_extent(&tmp.k); e = extent_i_to_s(new_key); extent_for_each_ptr_backwards(e, ptr) - if (bch2_dev_list_has_dev(wbio->failed, ptr->dev)) + if (bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)) bch2_extent_drop_ptr(e, ptr); if (!bch2_extent_nr_ptrs(e.c)) @@ -1486,7 +1486,7 @@ retry: goto err; out: bch2_btree_iter_unlock(&iter); - bio_put(&wbio->bio); + bio_put(&wbio->wbio.bio); btree_node_write_done(c, b); if (cl) closure_put(cl); @@ -1511,17 +1511,46 @@ void bch2_btree_write_error_work(struct work_struct *work) if (!bio) break; - bch2_btree_node_write_error(c, to_wbio(bio)); + bch2_btree_node_write_error(c, + container_of(bio, struct btree_write_bio, wbio.bio)); } } +static void btree_node_write_work(struct work_struct *work) +{ + struct btree_write_bio *wbio = + container_of(work, struct btree_write_bio, work); + struct closure *cl = wbio->cl; + struct bch_fs *c = wbio->wbio.c; + struct btree *b = 
wbio->wbio.bio.bi_private; + + btree_bounce_free(c, + wbio->wbio.order, + wbio->wbio.used_mempool, + wbio->data); + + if (wbio->wbio.failed.nr) { + unsigned long flags; + + spin_lock_irqsave(&c->btree_write_error_lock, flags); + bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio); + spin_unlock_irqrestore(&c->btree_write_error_lock, flags); + + queue_work(c->wq, &c->btree_write_error_work); + return; + } + + bio_put(&wbio->wbio.bio); + btree_node_write_done(c, b); + if (cl) + closure_put(cl); +} + static void btree_node_write_endio(struct bio *bio) { - struct btree *b = bio->bi_private; struct bch_write_bio *wbio = to_wbio(bio); struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; struct bch_write_bio *orig = parent ?: wbio; - struct closure *cl = !wbio->split ? wbio->cl : NULL; struct bch_fs *c = wbio->c; struct bch_dev *ca = wbio->ca; unsigned long flags; @@ -1542,27 +1571,13 @@ static void btree_node_write_endio(struct bio *bio) if (parent) { bio_put(bio); bio_endio(&parent->bio); - return; + } else { + struct btree_write_bio *wb = + container_of(orig, struct btree_write_bio, wbio); + + INIT_WORK(&wb->work, btree_node_write_work); + schedule_work(&wb->work); } - - btree_bounce_free(c, - wbio->order, - wbio->used_mempool, - wbio->data); - - if (wbio->failed.nr) { - spin_lock_irqsave(&c->btree_write_error_lock, flags); - bio_list_add(&c->btree_write_error_list, &wbio->bio); - spin_unlock_irqrestore(&c->btree_write_error_lock, flags); - - queue_work(c->wq, &c->btree_write_error_work); - return; - } - - bio_put(bio); - btree_node_write_done(c, b); - if (cl) - closure_put(cl); } static int validate_bset_for_write(struct bch_fs *c, struct btree *b, @@ -1586,7 +1601,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, struct closure *parent, enum six_lock_type lock_type_held) { - struct bch_write_bio *wbio; + struct btree_write_bio *wbio; struct bset_tree *t; struct bset *i; struct btree_node *bn = NULL; @@ -1602,6 +1617,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned long old, new; void *data; + if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) + return; + /* * We may only have a read lock on the btree node - the dirty bit is our * "lock" against racing with other threads that may be trying to start @@ -1631,6 +1649,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, new ^= (1 << BTREE_NODE_write_idx); } while (cmpxchg_acquire(&b->flags, old, new) != old); + BUG_ON(btree_node_fake(b)); BUG_ON(!list_empty(&b->write_blocked)); BUG_ON((b->will_make_reachable != NULL) != !b->written); @@ -1763,21 +1782,22 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, trace_btree_write(b, bytes_to_write, sectors_to_write); - wbio = wbio_init(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write)); - wbio->cl = parent; - wbio->failed.nr = 0; - wbio->order = order; - wbio->used_mempool = used_mempool; - wbio->data = data; - wbio->bio.bi_opf = REQ_OP_WRITE|REQ_META|REQ_FUA; - wbio->bio.bi_iter.bi_size = sectors_to_write << 9; - wbio->bio.bi_end_io = btree_node_write_endio; - wbio->bio.bi_private = b; + wbio = container_of(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->btree_bio), + struct btree_write_bio, wbio.bio); + wbio_init(&wbio->wbio.bio); + wbio->data = data; + wbio->cl = parent; + wbio->wbio.order = order; + wbio->wbio.used_mempool = used_mempool; + wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META|REQ_FUA; + wbio->wbio.bio.bi_iter.bi_size = sectors_to_write << 9; + wbio->wbio.bio.bi_end_io = btree_node_write_endio; 
+ wbio->wbio.bio.bi_private = b; if (parent) closure_get(parent); - bch2_bio_map(&wbio->bio, data); + bch2_bio_map(&wbio->wbio.bio, data); /* * If we're appending to a leaf node, we don't technically need FUA - @@ -1802,7 +1822,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, b->written += sectors_to_write; - bch2_submit_wbio_replicas(wbio, c, BCH_DATA_BTREE, &k.key); + bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_BTREE, &k.key); return; err: set_btree_node_noevict(b); @@ -1905,11 +1925,7 @@ void bch2_btree_verify_flushed(struct bch_fs *c) unsigned i; rcu_read_lock(); - tbl = rht_dereference_rcu(c->btree_cache.table.tbl, - &c->btree_cache.table); - - for (i = 0; i < tbl->size; i++) - rht_for_each_entry_rcu(b, pos, tbl, i, hash) - BUG_ON(btree_node_dirty(b)); + for_each_cached_btree(b, c, tbl, i, pos) + BUG_ON(btree_node_dirty(b)); rcu_read_unlock(); } diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index 61165a63..c8417ac3 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -2,6 +2,7 @@ #define _BCACHEFS_BTREE_IO_H #include "extents.h" +#include "io_types.h" struct bch_fs; struct btree_write; @@ -17,6 +18,13 @@ struct btree_read_bio { struct bio bio; }; +struct btree_write_bio { + struct closure *cl; + void *data; + struct work_struct work; + struct bch_write_bio wbio; +}; + static inline void btree_node_io_unlock(struct btree *b) { EBUG_ON(!btree_node_write_in_flight(b)); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 0b505a73..ee463f36 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -202,21 +202,20 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, /* Btree iterator locking: */ - static void btree_iter_drop_extra_locks(struct btree_iter *iter) { unsigned l; while (iter->nodes_locked && (l = __fls(iter->nodes_locked)) > iter->locks_want) { - if (!btree_node_locked(iter, l)) - panic("l %u nodes_locked %u\n", l, iter->nodes_locked); - if (l > iter->level) { btree_node_unlock(iter, l); - } else if (btree_node_intent_locked(iter, l)) { - six_lock_downgrade(&iter->nodes[l]->lock); - iter->nodes_intent_locked ^= 1 << l; + } else { + if (btree_node_intent_locked(iter, l)) { + six_lock_downgrade(&iter->nodes[l]->lock); + iter->nodes_intent_locked ^= 1 << l; + } + break; } } } @@ -861,7 +860,8 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) i < iter->locks_want && iter->nodes[i]; i++) if (!bch2_btree_node_relock(iter, i)) { - while (iter->nodes[iter->level] && + while (iter->level < BTREE_MAX_DEPTH && + iter->nodes[iter->level] && iter->level + 1 < iter->locks_want) btree_iter_up(iter); break; @@ -872,7 +872,8 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) * If the current node isn't locked, go up until we have a locked node * or run out of nodes: */ - while (iter->nodes[iter->level] && + while (iter->level < BTREE_MAX_DEPTH && + iter->nodes[iter->level] && !(is_btree_node(iter, iter->level) && bch2_btree_node_relock(iter, iter->level) && btree_iter_pos_cmp(iter->pos, @@ -884,7 +885,8 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) * If we've got a btree node locked (i.e. 
we aren't about to relock the * root) - advance its node iterator if necessary: */ - if (iter->nodes[iter->level]) { + if (iter->level < BTREE_MAX_DEPTH && + iter->nodes[iter->level]) { struct bkey_s_c k; while ((k = __btree_iter_peek_all(iter)).k && @@ -956,7 +958,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) btree_iter_up(iter); - if (!iter->nodes[iter->level]) + if (iter->level == BTREE_MAX_DEPTH || + !iter->nodes[iter->level]) return NULL; /* parent node usually won't be locked: redo traversal if necessary */ diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index a7fdba82..eb196a3a 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -50,10 +50,8 @@ struct btree_iter { * always fail (but since freeing a btree node takes a write lock on the * node, which increments the node's lock seq, that's not actually * necessary in that example). - * - * One extra slot for a sentinel NULL: */ - struct btree *nodes[BTREE_MAX_DEPTH + 1]; + struct btree *nodes[BTREE_MAX_DEPTH]; struct btree_node_iter node_iters[BTREE_MAX_DEPTH]; /* diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index acfe5b59..ca2992ba 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -92,6 +92,7 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) int lock_type = btree_node_locked_type(iter, level); EBUG_ON(!level && iter->flags & BTREE_ITER_UPTODATE); + EBUG_ON(level >= BTREE_MAX_DEPTH); if (lock_type != BTREE_NODE_UNLOCKED) six_unlock_type(&iter->nodes[level]->lock, lock_type); @@ -106,6 +107,8 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos, struct btree_iter *iter, enum six_lock_type type) { + EBUG_ON(level >= BTREE_MAX_DEPTH); + return likely(six_trylock_type(&b->lock, type)) || __bch2_btree_node_lock(b, pos, level, iter, type); } diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index f0e6896a..fb2f7e21 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -197,6 +197,7 @@ enum btree_flags { BTREE_NODE_write_in_flight, BTREE_NODE_just_written, BTREE_NODE_dying, + BTREE_NODE_fake, }; BTREE_FLAG(read_in_flight); @@ -209,6 +210,7 @@ BTREE_FLAG(accessed); BTREE_FLAG(write_in_flight); BTREE_FLAG(just_written); BTREE_FLAG(dying); +BTREE_FLAG(fake); static inline struct btree_write *btree_current_write(struct btree *b) { diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 04854532..a0f37c4c 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -546,8 +546,8 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c, goto err_free; } - ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key), - BCH_DATA_BTREE); + ret = bch2_check_mark_super(c, BCH_DATA_BTREE, + bch2_bkey_devs(bkey_i_to_s_c(&b->key))); if (ret) goto err_free; @@ -915,6 +915,10 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, struct bset_tree *t; set_btree_node_dying(b); + + if (btree_node_fake(b)) + return; + btree_interior_update_add_node_reference(as, b); /* @@ -1052,7 +1056,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) gc_pos_btree_root(b->btree_id), &stats, 0, 0); - if (old) + if (old && !btree_node_fake(old)) bch2_btree_node_free_index(as, NULL, bkey_i_to_s_c(&old->key), &stats); @@ -1422,7 +1426,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, 
bch2_btree_node_lock_for_insert(c, b, iter); - if (bch_keylist_u64s(keys) > bch_btree_keys_u64s_remaining(c, b)) { + if (!bch2_btree_node_insert_fits(c, b, bch_keylist_u64s(keys))) { bch2_btree_node_unlock_write(b, iter); return -1; } @@ -1957,7 +1961,8 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, goto err; } - ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE); + ret = bch2_check_mark_super(c, BCH_DATA_BTREE, + bch2_extent_devs(extent_i_to_s_c(new_key))); if (ret) goto err_free_update; @@ -1993,45 +1998,43 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) bch2_btree_set_root_ondisk(c, b, READ); } -int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id, - struct closure *writes) +void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) { - struct btree_update *as; struct closure cl; struct btree *b; + int ret; - memset(&as, 0, sizeof(as)); closure_init_stack(&cl); - while (1) { - /* XXX haven't calculated capacity yet :/ */ - as = bch2_btree_update_start(c, id, 1, - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE, - &cl); + do { + ret = bch2_btree_cache_cannibalize_lock(c, &cl); closure_sync(&cl); + } while (ret); - if (!IS_ERR(as)) - break; + b = bch2_btree_node_mem_alloc(c); + bch2_btree_cache_cannibalize_unlock(c); - if (PTR_ERR(as) == -ENOSPC) - return PTR_ERR(as); - } + set_btree_node_fake(b); + b->level = 0; + b->btree_id = id; - b = __btree_root_alloc(as, 0); + bkey_extent_init(&b->key); + b->key.k.p = POS_MAX; + bkey_i_to_extent(&b->key)->v._data[0] = U64_MAX - id; - bch2_btree_node_write(c, b, writes, SIX_LOCK_intent); - btree_update_drop_new_node(c, b); + bch2_bset_init_first(b, &b->data->keys); + bch2_btree_build_aux_trees(b); - BUG_ON(btree_node_root(c, b)); + b->data->min_key = POS_MIN; + b->data->max_key = POS_MAX; + b->data->format = bch2_btree_calc_format(b); + btree_node_set_format(b, b->data->format); - bch2_btree_set_root_inmem(as, b); - bch2_btree_set_root_ondisk(c, b, WRITE); + ret = bch2_btree_node_hash_insert(&c->btree_cache, b, b->level, b->btree_id); + BUG_ON(ret); - bch2_btree_open_bucket_put(c, b); + __bch2_btree_set_root_inmem(c, b); + + six_unlock_write(&b->lock); six_unlock_intent(&b->lock); - - bch2_btree_update_free(as); - - return 0; } diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index e129b24e..23ee3980 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -150,7 +150,7 @@ int bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, enum btree_node_sibling); void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); -int bch2_btree_root_alloc(struct bch_fs *, enum btree_id, struct closure *); +void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); static inline unsigned btree_update_reserve_required(struct bch_fs *c, struct btree *b) @@ -280,6 +280,9 @@ static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, struct btree *b, unsigned u64s) { + if (unlikely(btree_node_fake(b))) + return false; + if (btree_node_is_extents(b)) { /* The insert key might split an existing key * (bch2_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case: diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 2dbe7d37..43133cbb 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -258,6 +258,11 @@ static u64 reserve_factor(u64 r) return r + (round_up(r, (1 << RESERVE_FACTOR)) 
>> RESERVE_FACTOR); } +static u64 avail_factor(u64 r) +{ + return (r << RESERVE_FACTOR) / (1 << RESERVE_FACTOR) + 1; +} + u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats) { struct fs_usage_sum sum = __fs_usage_sum(stats); @@ -270,6 +275,11 @@ u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats) return min(c->capacity, __bch2_fs_sectors_used(c, stats)); } +u64 bch2_fs_sectors_free(struct bch_fs *c, struct bch_fs_usage stats) +{ + return avail_factor(c->capacity - bch2_fs_sectors_used(c, stats)); +} + static inline int is_unavailable_bucket(struct bucket_mark m) { return !is_available_bucket(m); @@ -382,7 +392,6 @@ bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, } new.owned_by_allocator = 1; - new.touched_this_mount = 1; new.data_type = 0; new.cached_sectors = 0; new.dirty_sectors = 0; @@ -396,29 +405,6 @@ bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, return true; } -bool bch2_mark_alloc_bucket_startup(struct bch_fs *c, struct bch_dev *ca, - size_t b) -{ - struct bucket *g; - struct bucket_mark new, old; - - lg_local_lock(&c->usage_lock); - g = bucket(ca, b); - - old = bucket_data_cmpxchg(c, ca, g, new, ({ - if (!is_startup_available_bucket(new)) { - lg_local_unlock(&c->usage_lock); - return false; - } - - new.owned_by_allocator = 1; - new.touched_this_mount = 1; - })); - lg_local_unlock(&c->usage_lock); - - return true; -} - void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, bool owned_by_allocator, struct gc_pos pos, unsigned flags) @@ -436,7 +422,6 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, } old = bucket_data_cmpxchg(c, ca, g, new, ({ - new.touched_this_mount = 1; new.owned_by_allocator = owned_by_allocator; })); lg_local_unlock(&c->usage_lock); @@ -481,7 +466,6 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, saturated_add(ca, new.dirty_sectors, sectors, GC_MAX_SECTORS_USED); new.data_type = type; - new.touched_this_mount = 1; })); lg_local_unlock(&c->usage_lock); @@ -539,7 +523,6 @@ static void bch2_mark_pointer(struct bch_fs *c, if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) { if (journal_seq) bucket_cmpxchg(g, new, ({ - new.touched_this_mount = 1; new.journal_seq_valid = 1; new.journal_seq = journal_seq; })); @@ -588,8 +571,6 @@ static void bch2_mark_pointer(struct bch_fs *c, new.data_type = data_type; } - new.touched_this_mount = 1; - if (flags & BCH_BUCKET_MARK_NOATOMIC) { g->_mark = new; break; @@ -694,17 +675,12 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, static u64 __recalc_sectors_available(struct bch_fs *c) { - u64 avail; int cpu; for_each_possible_cpu(cpu) per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0; - avail = c->capacity - bch2_fs_sectors_used(c, bch2_fs_usage_read(c)); - - avail <<= RESERVE_FACTOR; - avail /= (1 << RESERVE_FACTOR) + 1; - return avail; + return bch2_fs_sectors_free(c, bch2_fs_usage_read(c)); } /* Used by gc when it's starting: */ @@ -839,7 +815,7 @@ static void buckets_free_rcu(struct rcu_head *rcu) int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { - struct bucket_array *buckets = NULL, *old_buckets; + struct bucket_array *buckets = NULL, *old_buckets = NULL; unsigned long *buckets_dirty = NULL; u8 *oldest_gens = NULL; alloc_fifo free[RESERVE_NR]; diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 78243129..86e72829 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -184,6 +184,7 @@ void bch2_fs_usage_apply(struct bch_fs *, 
struct bch_fs_usage *, u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); +u64 bch2_fs_sectors_free(struct bch_fs *, struct bch_fs_usage); static inline bool is_available_bucket(struct bucket_mark mark) { @@ -192,11 +193,6 @@ static inline bool is_available_bucket(struct bucket_mark mark) !mark.nouse); } -static inline bool is_startup_available_bucket(struct bucket_mark mark) -{ - return !mark.touched_this_mount && is_available_bucket(mark); -} - static inline bool bucket_needs_journal_commit(struct bucket_mark m, u16 last_seq_ondisk) { @@ -208,8 +204,6 @@ void bch2_bucket_seq_cleanup(struct bch_fs *); bool bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, size_t, struct bucket_mark *); -bool bch2_mark_alloc_bucket_startup(struct bch_fs *, struct bch_dev *, - size_t); void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool, struct gc_pos, unsigned); void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 7cd8439a..6f52a109 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -15,8 +15,7 @@ struct bucket_mark { gen_valid:1, owned_by_allocator:1, nouse:1, - journal_seq_valid:1, - touched_this_mount:1; + journal_seq_valid:1; u16 dirty_sectors; u16 cached_sectors; diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index 1618ffe7..1498832b 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -64,7 +64,7 @@ found: static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) { struct bch_ioctl_assemble arg; - const char *err; + struct bch_fs *c; u64 *user_devs = NULL; char **devs = NULL; unsigned i; @@ -96,14 +96,10 @@ static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) } } - err = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty(), NULL); - if (err) { - pr_err("Could not open filesystem: %s", err); - ret = -EINVAL; - goto err; - } - - ret = 0; + c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty()); + ret = PTR_ERR_OR_ZERO(c); + if (!ret) + closure_put(&c->cl); err: if (devs) for (i = 0; i < arg.nr_devs; i++) diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index ccfb0386..0f090ca5 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -58,7 +58,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) if (IS_ERR_OR_NULL(pick.ca)) return; - bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio); + bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio); bio->bi_bdev = pick.ca->disk_sb.bdev; bio->bi_opf = REQ_OP_READ|REQ_META; bio->bi_iter.bi_sector = pick.ptr.offset; diff --git a/libbcachefs/error.h b/libbcachefs/error.h index 28fe4fce..ac3e96d2 100644 --- a/libbcachefs/error.h +++ b/libbcachefs/error.h @@ -143,9 +143,6 @@ void bch2_flush_fsck_errs(struct bch_fs *); #define __fsck_err_on(cond, c, _flags, ...) \ ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) -#define unfixable_fsck_err_on(cond, c, ...) \ - __fsck_err_on(cond, c, FSCK_CAN_IGNORE, ##__VA_ARGS__) - #define need_fsck_err_on(cond, c, ...) 
\ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 2b4a2dc2..bceea486 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -666,7 +666,7 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, goto err; } - if (!bch2_sb_has_replicas(c, e, BCH_DATA_BTREE)) { + if (!bch2_sb_has_replicas(c, BCH_DATA_BTREE, bch2_extent_devs(e))) { bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k); bch2_fs_bug(c, @@ -1803,7 +1803,7 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, } if (!bkey_extent_is_cached(e.k) && - !bch2_sb_has_replicas(c, e, BCH_DATA_USER)) { + !bch2_sb_has_replicas(c, BCH_DATA_USER, bch2_extent_devs(e))) { bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), e.s_c); bch2_fs_bug(c, diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index aeae361d..eda34381 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -426,6 +426,17 @@ static inline struct bch_devs_list bch2_extent_dirty_devs(struct bkey_s_c_extent return ret; } +static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) +{ + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + return bch2_extent_devs(bkey_s_c_to_extent(k)); + default: + return (struct bch_devs_list) { .nr = 0 }; + } +} + bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent, struct bch_extent_crc_unpacked); bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked); diff --git a/libbcachefs/fifo.h b/libbcachefs/fifo.h index 98f22f6a..08739d26 100644 --- a/libbcachefs/fifo.h +++ b/libbcachefs/fifo.h @@ -57,6 +57,7 @@ do { \ #define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) #define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask) +#define fifo_idx_entry(fifo, i) (fifo)->data[((fifo)->front + (i)) & (fifo)->mask] #define fifo_push_back_ref(f) \ (fifo_full((f)) ? 
NULL : &(f)->data[(f)->back++ & (f)->mask]) diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 2c34a85c..66374a9c 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -12,6 +12,7 @@ #include "journal.h" #include "io.h" #include "keylist.h" +#include "quota.h" #include #include @@ -56,14 +57,13 @@ struct bch_writepage_io { struct dio_write { struct closure cl; struct kiocb *req; - struct bch_fs *c; - loff_t offset; - - struct iovec *iovec; - struct iovec inline_vecs[UIO_FASTIOV]; - struct iov_iter iter; - struct task_struct *task; + unsigned loop:1, + sync:1, + free_iov:1; + + struct iov_iter iter; + struct iovec inline_vecs[2]; /* must be last: */ struct bchfs_write_op iop; @@ -130,6 +130,7 @@ static int __must_check bch2_write_inode_size(struct bch_fs *c, static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, int sectors) { inode->v.i_blocks += sectors; + bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, BCH_QUOTA_WARN); } static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, int sectors) @@ -1286,7 +1287,8 @@ static int bch2_read_single_page(struct page *page, int ret; DECLARE_COMPLETION_ONSTACK(done); - rbio = to_rbio(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read)); + rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), + io_opts(c, inode)); rbio->bio.bi_private = &done; rbio->bio.bi_end_io = bch2_read_single_page_end_io; @@ -1439,13 +1441,15 @@ static void bch2_direct_IO_read_split_endio(struct bio *bio) bio_check_pages_dirty(bio); /* transfers ownership */ } -static int bch2_direct_IO_read(struct bch_fs *c, struct kiocb *req, - struct file *file, struct bch_inode_info *inode, - struct iov_iter *iter, loff_t offset) +static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) { + struct file *file = req->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts = io_opts(c, inode); struct dio_read *dio; struct bio *bio; + loff_t offset = req->ki_pos; bool sync = is_sync_kiocb(req); ssize_t ret; @@ -1525,103 +1529,128 @@ start: } } -static long __bch2_dio_write_complete(struct dio_write *dio) +static void bch2_dio_write_loop_async(struct closure *); + +static long bch2_dio_write_loop(struct dio_write *dio) { - struct file *file = dio->req->ki_filp; + struct kiocb *req = dio->req; + struct file *file = req->ki_filp; struct address_space *mapping = file->f_mapping; struct bch_inode_info *inode = file_bch_inode(file); - long ret = dio->iop.op.error ?: ((long) dio->iop.op.written << 9); - - bch2_disk_reservation_put(dio->c, &dio->iop.op.res); - - __pagecache_block_put(&mapping->add_lock); - inode_dio_end(&inode->v); - - if (dio->iovec && dio->iovec != dio->inline_vecs) - kfree(dio->iovec); - - bio_put(&dio->iop.op.wbio.bio); - return ret; -} - -static void bch2_dio_write_complete(struct closure *cl) -{ - struct dio_write *dio = container_of(cl, struct dio_write, cl); - struct kiocb *req = dio->req; - - req->ki_complete(req, __bch2_dio_write_complete(dio), 0); -} - -static void bch2_dio_write_done(struct dio_write *dio) -{ + struct bio *bio = &dio->iop.op.wbio.bio; struct bio_vec *bv; + bool sync; + long ret; int i; - bio_for_each_segment_all(bv, &dio->iop.op.wbio.bio, i) - put_page(bv->bv_page); + if (dio->loop) + goto loop; - if (dio->iter.count) - bio_reset(&dio->iop.op.wbio.bio); -} + inode_dio_begin(&inode->v); + __pagecache_block_get(&mapping->add_lock); -static void bch2_do_direct_IO_write(struct dio_write *dio) -{ - struct file 
*file = dio->req->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - struct bio *bio = &dio->iop.op.wbio.bio; - int ret; + /* Write and invalidate pagecache range that we're writing to: */ + ret = write_invalidate_inode_pages_range(mapping, req->ki_pos, + req->ki_pos + iov_iter_count(&dio->iter) - 1); + if (unlikely(ret)) + goto err; - ret = bio_iov_iter_get_pages(bio, &dio->iter); - if (ret < 0) { - dio->iop.op.error = ret; - return; + while (1) { + BUG_ON(current->pagecache_lock); + current->pagecache_lock = &mapping->add_lock; + if (current != dio->task) + use_mm(dio->task->mm); + + ret = bio_iov_iter_get_pages(bio, &dio->iter); + + if (current != dio->task) + unuse_mm(dio->task->mm); + current->pagecache_lock = NULL; + + if (unlikely(ret < 0)) + goto err; + + dio->iop.op.pos = POS(inode->v.i_ino, + (req->ki_pos >> 9) + dio->iop.op.written); + + task_io_account_write(bio->bi_iter.bi_size); + + closure_call(&dio->iop.op.cl, bch2_write, NULL, &dio->cl); + + if (!dio->sync && !dio->loop && dio->iter.count) { + struct iovec *iov = dio->inline_vecs; + + if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { + iov = kmalloc(dio->iter.nr_segs * sizeof(*iov), + GFP_KERNEL); + if (unlikely(!iov)) { + dio->iop.op.error = -ENOMEM; + goto err_wait_io; + } + + dio->free_iov = true; + } + + memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov)); + dio->iter.iov = iov; + } +err_wait_io: + dio->loop = true; + + if (!dio->sync) { + continue_at_noreturn(&dio->cl, + bch2_dio_write_loop_async, NULL); + return -EIOCBQUEUED; + } + + closure_sync(&dio->cl); +loop: + bio_for_each_segment_all(bv, bio, i) + put_page(bv->bv_page); + if (!dio->iter.count || dio->iop.op.error) + break; + bio_reset(bio); } - dio->iop.op.pos = POS(inode->v.i_ino, (dio->offset >> 9) + dio->iop.op.written); + ret = dio->iop.op.error ?: ((long) dio->iop.op.written << 9); +err: + __pagecache_block_put(&mapping->add_lock); + inode_dio_end(&inode->v); + bch2_disk_reservation_put(dio->iop.op.c, &dio->iop.op.res); - task_io_account_write(bio->bi_iter.bi_size); + if (dio->free_iov) + kfree(dio->iter.iov); - closure_call(&dio->iop.op.cl, bch2_write, NULL, &dio->cl); + closure_debug_destroy(&dio->cl); + + sync = dio->sync; + bio_put(bio); + + if (!sync) { + req->ki_complete(req, ret, 0); + ret = -EIOCBQUEUED; + } + return ret; } static void bch2_dio_write_loop_async(struct closure *cl) { - struct dio_write *dio = - container_of(cl, struct dio_write, cl); - struct address_space *mapping = dio->req->ki_filp->f_mapping; + struct dio_write *dio = container_of(cl, struct dio_write, cl); - bch2_dio_write_done(dio); - - if (dio->iter.count && !dio->iop.op.error) { - use_mm(dio->task->mm); - pagecache_block_get(&mapping->add_lock); - - bch2_do_direct_IO_write(dio); - - pagecache_block_put(&mapping->add_lock); - unuse_mm(dio->task->mm); - - continue_at(&dio->cl, bch2_dio_write_loop_async, NULL); - } else { -#if 0 - closure_return_with_destructor(cl, bch2_dio_write_complete); -#else - closure_debug_destroy(cl); - bch2_dio_write_complete(cl); -#endif - } + bch2_dio_write_loop(dio); } -static int bch2_direct_IO_write(struct bch_fs *c, - struct kiocb *req, struct file *file, - struct bch_inode_info *inode, - struct iov_iter *iter, loff_t offset) +static int bch2_direct_IO_write(struct kiocb *req, + struct iov_iter *iter, + bool swap) { - struct address_space *mapping = file->f_mapping; + struct file *file = req->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; struct 
dio_write *dio; struct bio *bio; + loff_t offset = req->ki_pos; ssize_t ret; - bool sync = is_sync_kiocb(req); lockdep_assert_held(&inode->v.i_rwsem); @@ -1637,95 +1666,49 @@ static int bch2_direct_IO_write(struct bch_fs *c, dio = container_of(bio, struct dio_write, iop.op.wbio.bio); closure_init(&dio->cl, NULL); dio->req = req; - dio->c = c; - dio->offset = offset; - dio->iovec = NULL; - dio->iter = *iter; dio->task = current; + dio->loop = false; + dio->sync = is_sync_kiocb(req) || + offset + iter->count > inode->v.i_size; + dio->free_iov = false; + dio->iter = *iter; bch2_fswrite_op_init(&dio->iop, c, inode, io_opts(c, inode), true); dio->iop.op.write_point = writepoint_hashed((unsigned long) dio->task); dio->iop.op.flags |= BCH_WRITE_NOPUT_RESERVATION; - if ((dio->req->ki_flags & IOCB_DSYNC) && + if ((req->ki_flags & IOCB_DSYNC) && !c->opts.journal_flush_disabled) dio->iop.op.flags |= BCH_WRITE_FLUSH; - if (offset + iter->count > inode->v.i_size) - sync = true; - - /* - * XXX: we shouldn't return -ENOSPC if we're overwriting existing data - - * if getting a reservation fails we should check if we are doing an - * overwrite. - * - * Have to then guard against racing with truncate (deleting data that - * we would have been overwriting) - */ ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9, 0); if (unlikely(ret)) { if (bch2_check_range_allocated(c, POS(inode->v.i_ino, offset >> 9), - iter->count >> 9)) { - closure_debug_destroy(&dio->cl); - bio_put(bio); - return ret; - } + iter->count >> 9)) + goto err; dio->iop.unalloc = true; } dio->iop.op.nr_replicas = dio->iop.op.res.nr_replicas; - inode_dio_begin(&inode->v); - __pagecache_block_get(&mapping->add_lock); - - if (sync) { - do { - bch2_do_direct_IO_write(dio); - - closure_sync(&dio->cl); - bch2_dio_write_done(dio); - } while (dio->iter.count && !dio->iop.op.error); - - closure_debug_destroy(&dio->cl); - return __bch2_dio_write_complete(dio); - } else { - bch2_do_direct_IO_write(dio); - - if (dio->iter.count && !dio->iop.op.error) { - if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { - dio->iovec = kmalloc(dio->iter.nr_segs * - sizeof(struct iovec), - GFP_KERNEL); - if (!dio->iovec) - dio->iop.op.error = -ENOMEM; - } else { - dio->iovec = dio->inline_vecs; - } - - memcpy(dio->iovec, - dio->iter.iov, - dio->iter.nr_segs * sizeof(struct iovec)); - dio->iter.iov = dio->iovec; - } - - continue_at_noreturn(&dio->cl, bch2_dio_write_loop_async, NULL); - return -EIOCBQUEUED; - } + return bch2_dio_write_loop(dio); +err: + bch2_disk_reservation_put(c, &dio->iop.op.res); + closure_debug_destroy(&dio->cl); + bio_put(bio); + return ret; } ssize_t bch2_direct_IO(struct kiocb *req, struct iov_iter *iter) { - struct file *file = req->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; struct blk_plug plug; ssize_t ret; blk_start_plug(&plug); - ret = ((iov_iter_rw(iter) == WRITE) - ? bch2_direct_IO_write - : bch2_direct_IO_read)(c, req, file, inode, iter, req->ki_pos); + ret = iov_iter_rw(iter) == WRITE + ? 
bch2_direct_IO_write(req, iter, false) + : bch2_direct_IO_read(req, iter); blk_finish_plug(&plug); return ret; @@ -1734,26 +1717,7 @@ ssize_t bch2_direct_IO(struct kiocb *req, struct iov_iter *iter) static ssize_t bch2_direct_write(struct kiocb *iocb, struct iov_iter *iter) { - struct file *file = iocb->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct address_space *mapping = file->f_mapping; - loff_t pos = iocb->ki_pos; - ssize_t ret; - - pagecache_block_get(&mapping->add_lock); - - /* Write and invalidate pagecache range that we're writing to: */ - ret = write_invalidate_inode_pages_range(file->f_mapping, pos, - pos + iov_iter_count(iter) - 1); - if (unlikely(ret)) - goto err; - - ret = bch2_direct_IO_write(c, iocb, file, inode, iter, pos); -err: - pagecache_block_put(&mapping->add_lock); - - return ret; + return bch2_direct_IO_write(iocb, iter, true); } static ssize_t __bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index 24228c8e..6ae67f92 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -4,6 +4,7 @@ #include "chardev.h" #include "fs.h" #include "fs-ioctl.h" +#include "quota.h" #include #include @@ -154,10 +155,32 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, struct fsxattr fa = { 0 }; fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); + fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; return copy_to_user(arg, &fa, sizeof(fa)); } +static int bch2_set_projid(struct bch_fs *c, + struct bch_inode_info *inode, + u32 projid) +{ + struct bch_qid qid = inode->ei_qid; + int ret; + + if (projid == inode->ei_qid.q[QTYP_PRJ]) + return 0; + + qid.q[QTYP_PRJ] = projid; + + ret = bch2_quota_transfer(c, 1 << QTYP_PRJ, qid, inode->ei_qid, + inode->v.i_blocks); + if (ret) + return ret; + + inode->ei_qid.q[QTYP_PRJ] = projid; + return 0; +} + static int bch2_ioc_fssetxattr(struct bch_fs *c, struct file *file, struct bch_inode_info *inode, @@ -185,9 +208,14 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, } mutex_lock(&inode->ei_update_lock); + ret = bch2_set_projid(c, inode, fa.fsx_projid); + if (ret) + goto err_unlock; + ret = __bch2_write_inode(c, inode, bch2_inode_flags_set, &flags); if (!ret) bch2_inode_flags_to_vfs(inode); +err_unlock: mutex_unlock(&inode->ei_update_lock); err: inode_unlock(&inode->v); diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 472df23a..8869ba0f 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -15,6 +15,7 @@ #include "io.h" #include "journal.h" #include "keylist.h" +#include "quota.h" #include "super.h" #include "xattr.h" @@ -116,6 +117,7 @@ int __must_check __bch2_write_inode(struct bch_fs *c, inode_u.bi_mode = inode->v.i_mode; inode_u.bi_uid = i_uid_read(&inode->v); inode_u.bi_gid = i_gid_read(&inode->v); + inode_u.bi_project = inode->ei_qid.q[QTYP_PRJ]; inode_u.bi_nlink= i_nlink - nlink_bias(inode->v.i_mode); inode_u.bi_dev = inode->v.i_rdev; inode_u.bi_atime= timespec_to_bch2_time(c, inode->v.i_atime); @@ -131,8 +133,10 @@ int __must_check __bch2_write_inode(struct bch_fs *c, BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i)); } while (ret == -EINTR); - if (!ret) + if (!ret) { inode->ei_inode = inode_u; + inode->ei_qid = bch_qid(&inode_u); + } out: bch2_btree_iter_unlock(&iter); @@ -215,7 +219,7 @@ static struct bch_inode_info *bch2_vfs_inode_create(struct bch_fs *c, ret = posix_acl_create(&dir->v, &inode->v.i_mode, &default_acl, &acl); if (ret) { 
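The bch2_set_projid() helper added in the fs-ioctl.c hunk above follows a transfer-then-switch pattern: usage is charged to the new project id first, and the inode's cached qid is only updated once the transfer succeeds, so a failed FS_IOC_FSSETXATTR leaves the accounting untouched. A simplified sketch of that shape (not part of the diff; the struct and prj_transfer() below are stand-ins, not the real bch2_quota_transfer() API):

	/* illustrative sketch only -- not part of the patch */
	struct prj_usage { long long sectors; };

	/*
	 * Stand-in for bch2_quota_transfer(); the real helper would enforce
	 * limits and return an error if the destination is over quota.
	 */
	static int prj_transfer(struct prj_usage *from, struct prj_usage *to,
				long long sectors)
	{
		from->sectors -= sectors;
		to->sectors   += sectors;
		return 0;
	}

	static int set_projid(unsigned *cur_projid, struct prj_usage *cur,
			      struct prj_usage *dst, unsigned projid,
			      long long i_blocks)
	{
		int ret;

		if (projid == *cur_projid)
			return 0;

		ret = prj_transfer(cur, dst, i_blocks);	/* move usage first */
		if (ret)
			return ret;			/* nothing changed on error */

		*cur_projid = projid;			/* switch only on success */
		return 0;
	}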
make_bad_inode(&inode->v); - goto err; + goto err_make_bad; } #endif @@ -225,16 +229,20 @@ static struct bch_inode_info *bch2_vfs_inode_create(struct bch_fs *c, inode->v.i_mode, rdev, &dir->ei_inode); + inode_u.bi_project = dir->ei_qid.q[QTYP_PRJ]; + + ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, BCH_QUOTA_PREALLOC); + if (ret) { + make_bad_inode(&inode->v); + goto err_make_bad; + } + ret = bch2_inode_create(c, &inode_u, BLOCKDEV_INODE_MAX, 0, &c->unused_inode_hint); if (unlikely(ret)) { - /* - * indicate to bch_evict_inode that the inode was never actually - * created: - */ - make_bad_inode(&inode->v); - goto err; + bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, BCH_QUOTA_WARN); + goto err_make_bad; } bch2_vfs_inode_init(c, inode, &inode_u); @@ -257,6 +265,12 @@ out: posix_acl_release(default_acl); posix_acl_release(acl); return inode; +err_make_bad: + /* + * indicate to bch_evict_inode that the inode was never actually + * created: + */ + make_bad_inode(&inode->v); err: clear_nlink(&inode->v); iput(&inode->v); @@ -604,11 +618,53 @@ static int bch2_rename2(struct inode *old_vdir, struct dentry *old_dentry, return bch2_rename(c, old_dir, old_dentry, new_dir, new_dentry); } +static int bch2_setattr_nonsize(struct bch_inode_info *inode, struct iattr *iattr) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_qid qid = inode->ei_qid; + unsigned qtypes = 0; + int ret; + + mutex_lock(&inode->ei_update_lock); + + if (c->opts.usrquota && + (iattr->ia_valid & ATTR_UID) && + !uid_eq(iattr->ia_uid, inode->v.i_uid)) { + qid.q[QTYP_USR] = from_kuid(&init_user_ns, iattr->ia_uid), + qtypes |= 1 << QTYP_USR; + } + + if (c->opts.grpquota && + (iattr->ia_valid & ATTR_GID) && + !gid_eq(iattr->ia_gid, inode->v.i_gid)) { + qid.q[QTYP_GRP] = from_kgid(&init_user_ns, iattr->ia_gid); + qtypes |= 1 << QTYP_GRP; + } + + if (qtypes) { + ret = bch2_quota_transfer(c, qtypes, qid, inode->ei_qid, + inode->v.i_blocks); + if (ret) + goto out_unlock; + } + + setattr_copy(&inode->v, iattr); + + ret = bch2_write_inode(c, inode); +out_unlock: + mutex_unlock(&inode->ei_update_lock); + + if (!ret && + iattr->ia_valid & ATTR_MODE) + ret = posix_acl_chmod(&inode->v, inode->v.i_mode); + + return ret; +} + static int bch2_setattr(struct dentry *dentry, struct iattr *iattr) { struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret = 0; + int ret; lockdep_assert_held(&inode->v.i_rwsem); @@ -616,22 +672,9 @@ static int bch2_setattr(struct dentry *dentry, struct iattr *iattr) if (ret) return ret; - if (iattr->ia_valid & ATTR_SIZE) { - ret = bch2_truncate(inode, iattr); - } else { - mutex_lock(&inode->ei_update_lock); - setattr_copy(&inode->v, iattr); - ret = bch2_write_inode(c, inode); - mutex_unlock(&inode->ei_update_lock); - } - - if (unlikely(ret)) - return ret; - - if (iattr->ia_valid & ATTR_MODE) - ret = posix_acl_chmod(&inode->v, inode->v.i_mode); - - return ret; + return iattr->ia_valid & ATTR_SIZE + ? 
bch2_truncate(inode, iattr) + : bch2_setattr_nonsize(inode, iattr); } static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode) @@ -910,6 +953,7 @@ static void bch2_vfs_inode_init(struct bch_fs *c, inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime); inode->ei_journal_seq = 0; + inode->ei_qid = bch_qid(bi); inode->ei_str_hash = bch2_hash_info_init(c, bi); inode->ei_inode = *bi; @@ -995,6 +1039,10 @@ static void bch2_evict_inode(struct inode *vinode) clear_inode(&inode->v); if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { + bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), + BCH_QUOTA_WARN); + bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, + BCH_QUOTA_WARN); bch2_inode_rm(c, inode->v.i_ino); atomic_long_dec(&c->nr_inodes); } @@ -1009,8 +1057,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = BCACHEFS_STATFS_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = c->capacity >> PAGE_SECTOR_SHIFT; - buf->f_bfree = (c->capacity - - bch2_fs_sectors_used(c, bch2_fs_usage_read(c))) >> + buf->f_bfree = bch2_fs_sectors_free(c, bch2_fs_usage_read(c)) >> PAGE_SECTOR_SHIFT; buf->f_bavail = buf->f_bfree; buf->f_files = atomic_long_read(&c->nr_inodes); @@ -1037,17 +1084,83 @@ static int bch2_sync_fs(struct super_block *sb, int wait) return bch2_journal_flush(&c->journal); } +static struct bch_fs *bch2_path_to_fs(const char *dev) +{ + struct bch_fs *c; + struct block_device *bdev = lookup_bdev(dev); + + if (IS_ERR(bdev)) + return ERR_CAST(bdev); + + c = bch2_bdev_to_fs(bdev); + bdput(bdev); + return c ?: ERR_PTR(-ENOENT); +} + +static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * const *devs, + unsigned nr_devs, struct bch_opts opts) +{ + struct bch_fs *c, *c1, *c2; + size_t i; + + if (!nr_devs) + return ERR_PTR(-EINVAL); + + c = bch2_fs_open(devs, nr_devs, opts); + + if (IS_ERR(c) && PTR_ERR(c) == -EBUSY) { + /* + * Already open? + * Look up each block device, make sure they all belong to a + * filesystem and they all belong to the _same_ filesystem + */ + + c1 = bch2_path_to_fs(devs[0]); + if (!c1) + return c; + + for (i = 1; i < nr_devs; i++) { + c2 = bch2_path_to_fs(devs[i]); + if (!IS_ERR(c2)) + closure_put(&c2->cl); + + if (c1 != c2) { + closure_put(&c1->cl); + return c; + } + } + + c = c1; + } + + if (IS_ERR(c)) + return c; + + mutex_lock(&c->state_lock); + + if (!bch2_fs_running(c)) { + mutex_unlock(&c->state_lock); + closure_put(&c->cl); + pr_err("err mounting %s: incomplete filesystem", dev_name); + return ERR_PTR(-EINVAL); + } + + mutex_unlock(&c->state_lock); + + set_bit(BCH_FS_BDEV_MOUNTED, &c->flags); + return c; +} + static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name, struct bch_opts opts) { - size_t nr_devs = 0, i = 0; - char *dev_name, *s, **devs; - struct bch_fs *c = NULL; - const char *err = "cannot allocate memory"; + char *dev_name = NULL, **devs = NULL, *s; + struct bch_fs *c = ERR_PTR(-ENOMEM); + size_t i, nr_devs = 0; dev_name = kstrdup(_dev_name, GFP_KERNEL); if (!dev_name) - return NULL; + goto err; for (s = dev_name; s; s = strchr(s + 1, ':')) nr_devs++; @@ -1061,57 +1174,10 @@ static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name, (s = strchr(s, ':')) && (*s++ = '\0')) devs[i++] = s; - err = bch2_fs_open(devs, nr_devs, opts, &c); - if (err) { - /* - * Already open? 
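bch2_open_as_blockdevs() above still receives the mount source as a single "dev1:dev2:dev3" string and splits it in place before handing the pieces to the new __bch2_open_as_blockdevs(). A self-contained sketch of that parse (not part of the diff; userspace-style allocation for brevity, with the caller freeing devs[0] and devs):

	/* illustrative sketch only -- not part of the patch */
	#include <stdlib.h>
	#include <string.h>

	static char **split_devs(const char *src, unsigned *nr)
	{
		char *buf = strdup(src), *s, **devs;
		unsigned i = 0, n = 1;

		if (!buf)
			return NULL;

		for (s = buf; (s = strchr(s, ':')) != NULL; s++)
			n++;			/* count ':' separators */

		devs = calloc(n, sizeof(*devs));
		if (!devs) {
			free(buf);
			return NULL;
		}

		devs[i++] = buf;
		for (s = buf; (s = strchr(s, ':')) != NULL; ) {
			*s++ = '\0';		/* terminate the previous name */
			devs[i++] = s;		/* next name starts right after */
		}

		*nr = n;
		return devs;
	}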
- * Look up each block device, make sure they all belong to a - * filesystem and they all belong to the _same_ filesystem - */ - - for (i = 0; i < nr_devs; i++) { - struct block_device *bdev = lookup_bdev(devs[i]); - struct bch_fs *c2; - - if (IS_ERR(bdev)) - goto err; - - c2 = bch2_bdev_to_fs(bdev); - bdput(bdev); - - if (!c) - c = c2; - else if (c2) - closure_put(&c2->cl); - - if (!c) - goto err; - if (c != c2) { - closure_put(&c->cl); - goto err; - } - } - - mutex_lock(&c->state_lock); - - if (!bch2_fs_running(c)) { - mutex_unlock(&c->state_lock); - closure_put(&c->cl); - err = "incomplete filesystem"; - c = NULL; - goto err; - } - - mutex_unlock(&c->state_lock); - } - - set_bit(BCH_FS_BDEV_MOUNTED, &c->flags); + c = __bch2_open_as_blockdevs(_dev_name, devs, nr_devs, opts); err: kfree(devs); kfree(dev_name); - - if (!c) - pr_err("bch_fs_open err %s", err); return c; } @@ -1234,8 +1300,8 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, return ERR_PTR(ret); c = bch2_open_as_blockdevs(dev_name, opts); - if (!c) - return ERR_PTR(-ENOENT); + if (IS_ERR(c)) + return ERR_CAST(c); sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|MS_NOSEC, c); if (IS_ERR(sb)) { @@ -1261,6 +1327,10 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_op = &bch_super_operations; sb->s_export_op = &bch_export_ops; +#ifdef CONFIG_BCACHEFS_QUOTA + sb->s_qcop = &bch2_quotactl_operations; + sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ; +#endif sb->s_xattr = bch2_xattr_handlers; sb->s_magic = BCACHEFS_STATFS_MAGIC; sb->s_time_gran = c->sb.time_precision; diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index 652105fb..dd0bd4ef 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -3,6 +3,7 @@ #include "opts.h" #include "str_hash.h" +#include "quota_types.h" #include #include @@ -13,6 +14,7 @@ struct bch_inode_info { struct mutex ei_update_lock; u64 ei_journal_seq; unsigned long ei_last_dirtied; + struct bch_qid ei_qid; struct bch_hash_info ei_str_hash; diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 696926fe..ef09c131 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -266,26 +266,60 @@ static int check_extents(struct bch_fs *c) !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c, "extent type %u for non regular file, inode %llu mode %o", k.k->type, k.k->p.inode, w.inode.bi_mode)) { - ret = bch2_btree_delete_at(&iter, 0); + bch2_btree_iter_unlock(&iter); + + ret = bch2_inode_truncate(c, k.k->p.inode, 0, NULL, NULL); if (ret) goto err; continue; } - unfixable_fsck_err_on(w.first_this_inode && + if (fsck_err_on(w.first_this_inode && w.have_inode && !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && w.inode.bi_sectors != (i_sectors = bch2_count_inode_sectors(c, w.cur_inum)), c, "i_sectors wrong: got %llu, should be %llu", - w.inode.bi_sectors, i_sectors); + w.inode.bi_sectors, i_sectors)) { + struct bkey_inode_buf p; - unfixable_fsck_err_on(w.have_inode && + w.inode.bi_sectors = i_sectors; + + bch2_btree_iter_unlock(&iter); + + bch2_inode_pack(&p, &w.inode); + + ret = bch2_btree_insert(c, BTREE_ID_INODES, + &p.inode.k_i, + NULL, + NULL, + NULL, + BTREE_INSERT_NOFAIL); + if (ret) { + bch_err(c, "error in fs gc: error %i " + "updating inode", ret); + goto err; + } + + /* revalidate iterator: */ + k = bch2_btree_iter_peek(&iter); + } + + if (fsck_err_on(w.have_inode && !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && k.k->type != BCH_RESERVATION && k.k->p.offset > round_up(w.inode.bi_size, 
PAGE_SIZE) >> 9, c, "extent type %u offset %llu past end of inode %llu, i_size %llu", - k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size); + k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { + bch2_btree_iter_unlock(&iter); + + ret = bch2_inode_truncate(c, k.k->p.inode, + round_up(w.inode.bi_size, PAGE_SIZE) >> 9, + NULL, NULL); + if (ret) + goto err; + continue; + } } err: fsck_err: @@ -999,7 +1033,7 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, u64 nlinks_pos; bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(range_start, 0), 0); - genradix_iter_init(&nlinks_iter); + nlinks_iter = genradix_iter_init(links, 0); while ((k = bch2_btree_iter_peek(&iter)).k && !btree_iter_err(k)) { diff --git a/libbcachefs/io.c b/libbcachefs/io.c index e045eb20..6f6d42fc 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -268,7 +268,8 @@ static void bch2_write_index(struct closure *cl) } if (!(op->flags & BCH_WRITE_NOMARK_REPLICAS)) { - ret = bch2_check_mark_super(c, e.c, BCH_DATA_USER); + ret = bch2_check_mark_super(c, BCH_DATA_USER, + bch2_extent_devs(e.c)); if (ret) goto err; } diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h index ff18fdc9..32ecac24 100644 --- a/libbcachefs/io_types.h +++ b/libbcachefs/io_types.h @@ -67,10 +67,7 @@ struct bch_read_bio { struct bch_write_bio { struct bch_fs *c; struct bch_dev *ca; - union { struct bch_write_bio *parent; - struct closure *cl; - }; struct bch_devs_list failed; u8 order; @@ -82,7 +79,6 @@ struct bch_write_bio { used_mempool:1; unsigned submit_time_us; - void *data; struct bio bio; }; @@ -94,7 +90,7 @@ struct bch_write_op { unsigned written; /* sectors */ u16 flags; - s8 error; + s16 error; /* dio write path expects it to hold -ERESTARTSYS... */ unsigned csum_type:4; unsigned compression_type:4; diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 0133a31e..811f7a5c 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -88,6 +88,9 @@ struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *c, struct jset *j, if (!entry) return NULL; + if (!entry->u64s) + return ERR_PTR(-EINVAL); + k = entry->start; *level = entry->level; *level = entry->level; @@ -415,6 +418,7 @@ static struct nonce journal_nonce(const struct jset *jset) }}; } +/* this fills in a range with empty jset_entries: */ static void journal_entry_null_range(void *start, void *end) { struct jset_entry *entry; @@ -423,7 +427,7 @@ static void journal_entry_null_range(void *start, void *end) memset(entry, 0, sizeof(*entry)); } -static int journal_validate_key(struct bch_fs *c, struct jset *j, +static int journal_validate_key(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, struct bkey_i *k, enum bkey_type key_type, const char *type) @@ -458,7 +462,7 @@ static int journal_validate_key(struct bch_fs *c, struct jset *j, return 0; } - if (JSET_BIG_ENDIAN(j) != CPU_BIG_ENDIAN) + if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN) bch2_bkey_swab(key_type, NULL, bkey_to_packed(k)); invalid = bch2_bkey_invalid(c, key_type, bkey_i_to_s_c(k)); @@ -497,26 +501,27 @@ fsck_err: #define journal_entry_err_on(cond, c, msg, ...) \ ((cond) ? 
journal_entry_err(c, msg, ##__VA_ARGS__) : false) -static int journal_entry_validate_entries(struct bch_fs *c, struct jset *j, +static int journal_entry_validate_entries(struct bch_fs *c, struct jset *jset, int write) { struct jset_entry *entry; int ret = 0; - vstruct_for_each(j, entry) { + vstruct_for_each(jset, entry) { + void *next = vstruct_next(entry); struct bkey_i *k; if (journal_entry_err_on(vstruct_next(entry) > - vstruct_last(j), c, + vstruct_last(jset), c, "journal entry extends past end of jset")) { - j->u64s = cpu_to_le32((u64 *) entry - j->_data); + jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); break; } switch (entry->type) { case JOURNAL_ENTRY_BTREE_KEYS: vstruct_for_each(entry, k) { - ret = journal_validate_key(c, j, entry, k, + ret = journal_validate_key(c, jset, entry, k, bkey_type(entry->level, entry->btree_id), "key"); @@ -531,12 +536,17 @@ static int journal_entry_validate_entries(struct bch_fs *c, struct jset *j, if (journal_entry_err_on(!entry->u64s || le16_to_cpu(entry->u64s) != k->k.u64s, c, "invalid btree root journal entry: wrong number of keys")) { - journal_entry_null_range(entry, - vstruct_next(entry)); + /* + * we don't want to null out this jset_entry, + * just the contents, so that later we can tell + * we were _supposed_ to have a btree root + */ + entry->u64s = 0; + journal_entry_null_range(vstruct_next(entry), next); continue; } - ret = journal_validate_key(c, j, entry, k, + ret = journal_validate_key(c, jset, entry, k, BKEY_TYPE_BTREE, "btree root"); if (ret) goto fsck_err; @@ -566,21 +576,21 @@ fsck_err: } static int journal_entry_validate(struct bch_fs *c, - struct jset *j, u64 sector, + struct jset *jset, u64 sector, unsigned bucket_sectors_left, unsigned sectors_read, int write) { - size_t bytes = vstruct_bytes(j); + size_t bytes = vstruct_bytes(jset); struct bch_csum csum; int ret = 0; - if (le64_to_cpu(j->magic) != jset_magic(c)) + if (le64_to_cpu(jset->magic) != jset_magic(c)) return JOURNAL_ENTRY_NONE; - if (le32_to_cpu(j->version) != BCACHE_JSET_VERSION) { + if (le32_to_cpu(jset->version) != BCACHE_JSET_VERSION) { bch_err(c, "unknown journal entry version %u", - le32_to_cpu(j->version)); + le32_to_cpu(jset->version)); return BCH_FSCK_UNKNOWN_VERSION; } @@ -594,26 +604,26 @@ static int journal_entry_validate(struct bch_fs *c, if (bytes > sectors_read << 9) return JOURNAL_ENTRY_REREAD; - if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)), c, + if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, "journal entry with unknown csum type %llu sector %lluu", - JSET_CSUM_TYPE(j), sector)) + JSET_CSUM_TYPE(jset), sector)) return JOURNAL_ENTRY_BAD; - csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j); - if (journal_entry_err_on(bch2_crc_cmp(csum, j->csum), c, + csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); + if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, "journal checksum bad, sector %llu", sector)) { /* XXX: retry IO, when we start retrying checksum errors */ /* XXX: note we might have missing journal entries */ return JOURNAL_ENTRY_BAD; } - bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), - j->encrypted_start, - vstruct_end(j) - (void *) j->encrypted_start); + bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + jset->encrypted_start, + vstruct_end(jset) - (void *) jset->encrypted_start); - if (journal_entry_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq), c, + if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, 
"invalid journal entry: last_seq > seq")) - j->last_seq = j->seq; + jset->last_seq = jset->seq; return 0; fsck_err: @@ -960,6 +970,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) struct bch_dev *ca; u64 cur_seq, end_seq; unsigned iter, keys = 0, entries = 0; + size_t nr; int ret = 0; closure_init_stack(&jlist.cl); @@ -994,12 +1005,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) goto fsck_err; if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_sb_has_replicas_devlist(c, &i->devs, - BCH_DATA_JOURNAL), c, + fsck_err_on(!bch2_sb_has_replicas(c, BCH_DATA_JOURNAL, + i->devs), c, "superblock not marked as containing replicas (type %u)", BCH_DATA_JOURNAL)) { - ret = bch2_check_mark_super_devlist(c, &i->devs, - BCH_DATA_JOURNAL); + ret = bch2_check_mark_super(c, BCH_DATA_JOURNAL, + i->devs); if (ret) return ret; } @@ -1007,9 +1018,16 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) i = list_last_entry(list, struct journal_replay, list); - unfixable_fsck_err_on(le64_to_cpu(i->j.seq) - - le64_to_cpu(i->j.last_seq) + 1 > j->pin.size, c, - "too many journal entries open for refcount fifo"); + nr = le64_to_cpu(i->j.seq) - le64_to_cpu(i->j.last_seq) + 1; + + if (nr > j->pin.size) { + free_fifo(&j->pin); + init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); + if (!j->pin.data) { + bch_err(c, "error reallocating journal fifo (%zu open entries)", nr); + return -ENOMEM; + } + } atomic64_set(&j->seq, le64_to_cpu(i->j.seq)); j->last_seq_ondisk = le64_to_cpu(i->j.last_seq); @@ -1131,18 +1149,19 @@ void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set) #endif } -static void __journal_entry_new(struct journal *j, int count) +static void journal_pin_new_entry(struct journal *j, int count) { - struct journal_entry_pin_list *p = fifo_push_ref(&j->pin); + struct journal_entry_pin_list *p; /* * The fifo_push() needs to happen at the same time as j->seq is * incremented for last_seq() to be calculated correctly */ + p = fifo_push_ref(&j->pin); atomic64_inc(&j->seq); - BUG_ON(journal_seq_pin(j, atomic64_read(&j->seq)) != - &fifo_peek_back(&j->pin)); + EBUG_ON(journal_seq_pin(j, atomic64_read(&j->seq)) != + &fifo_peek_back(&j->pin)); INIT_LIST_HEAD(&p->list); INIT_LIST_HEAD(&p->flushed); @@ -1150,13 +1169,10 @@ static void __journal_entry_new(struct journal *j, int count) p->devs.nr = 0; } -static void __bch2_journal_next_entry(struct journal *j) +static void bch2_journal_buf_init(struct journal *j) { - struct journal_buf *buf; + struct journal_buf *buf = journal_cur_buf(j); - __journal_entry_new(j, 1); - - buf = journal_cur_buf(j); memset(buf->has_inode, 0, sizeof(buf->has_inode)); memset(buf->data, 0, sizeof(*buf->data)); @@ -1208,22 +1224,24 @@ static enum { } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - journal_reclaim_fast(j); - clear_bit(JOURNAL_NEED_WRITE, &j->flags); buf = &j->buf[old.idx]; buf->data->u64s = cpu_to_le32(old.cur_entry_offset); - buf->data->last_seq = cpu_to_le64(last_seq(j)); j->prev_buf_sectors = vstruct_blocks_plus(buf->data, c->block_bits, journal_entry_u64s_reserve(buf)) * c->opts.block_size; - BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors); - __bch2_journal_next_entry(j); + journal_reclaim_fast(j); + /* XXX: why set this here, and not in journal_write()? 
*/ + buf->data->last_seq = cpu_to_le64(last_seq(j)); + + journal_pin_new_entry(j, 1); + + bch2_journal_buf_init(j); cancel_delayed_work(&j->write_work); spin_unlock(&j->lock); @@ -1352,12 +1370,20 @@ static int journal_entry_sectors(struct journal *j) /* * should _only_ called from journal_res_get() - when we actually want a * journal reservation - journal entry is open means journal is dirty: + * + * returns: + * 1: success + * 0: journal currently full (must wait) + * -EROFS: insufficient rw devices + * -EIO: journal error */ static int journal_entry_open(struct journal *j) { struct journal_buf *buf = journal_cur_buf(j); + union journal_res_state old, new; ssize_t u64s; - int ret = 0, sectors; + int sectors; + u64 v; lockdep_assert_held(&j->lock); BUG_ON(journal_entry_is_open(j)); @@ -1387,41 +1413,36 @@ static int journal_entry_open(struct journal *j) BUG_ON(u64s >= JOURNAL_ENTRY_CLOSED_VAL); - if (u64s > le32_to_cpu(buf->data->u64s)) { - union journal_res_state old, new; - u64 v = atomic64_read(&j->reservations.counter); + if (u64s <= le32_to_cpu(buf->data->u64s)) + return 0; - /* - * Must be set before marking the journal entry as open: - */ - j->cur_entry_u64s = u64s; + /* + * Must be set before marking the journal entry as open: + */ + j->cur_entry_u64s = u64s; - do { - old.v = new.v = v; + v = atomic64_read(&j->reservations.counter); + do { + old.v = new.v = v; - if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) - return false; + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) + return -EIO; - /* Handle any already added entries */ - new.cur_entry_offset = le32_to_cpu(buf->data->u64s); - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); - ret = 1; + /* Handle any already added entries */ + new.cur_entry_offset = le32_to_cpu(buf->data->u64s); + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); - wake_up(&j->wait); + if (j->res_get_blocked_start) + __bch2_time_stats_update(j->blocked_time, + j->res_get_blocked_start); + j->res_get_blocked_start = 0; - if (j->res_get_blocked_start) { - __bch2_time_stats_update(j->blocked_time, - j->res_get_blocked_start); - j->res_get_blocked_start = 0; - } - - mod_delayed_work(system_freezable_wq, - &j->write_work, - msecs_to_jiffies(j->write_delay_ms)); - } - - return ret; + mod_delayed_work(system_freezable_wq, + &j->write_work, + msecs_to_jiffies(j->write_delay_ms)); + wake_up(&j->wait); + return 1; } void bch2_journal_start(struct bch_fs *c) @@ -1438,14 +1459,15 @@ void bch2_journal_start(struct bch_fs *c) set_bit(JOURNAL_STARTED, &j->flags); while (atomic64_read(&j->seq) < new_seq) - __journal_entry_new(j, 0); + journal_pin_new_entry(j, 0); /* * journal_buf_switch() only inits the next journal entry when it * closes an open journal entry - the very first journal entry gets * initialized here: */ - __bch2_journal_next_entry(j); + journal_pin_new_entry(j, 1); + bch2_journal_buf_init(j); /* * Adding entries to the next journal entry before allocating space on @@ -1476,7 +1498,7 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) struct bkey_i *k, *_n; struct jset_entry *entry; struct journal_replay *i, *n; - int ret = 0, did_replay = 0; + int ret = 0; list_for_each_entry_safe(i, n, list, list) { j->replay_pin_list = @@ -1514,7 +1536,6 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) } cond_resched(); - did_replay = true; } if (atomic_dec_and_test(&j->replay_pin_list->count)) @@ -1524,22 +1545,7 @@ int bch2_journal_replay(struct bch_fs 
*c, struct list_head *list) j->replay_pin_list = NULL; bch2_journal_set_replay_done(j); - - if (did_replay) { - bch2_journal_flush_pins(&c->journal, U64_MAX); - - /* - * Write a new journal entry _before_ we start journalling new data - - * otherwise, we could end up with btree node bsets with journal seqs - * arbitrarily far in the future vs. the most recently written journal - * entry on disk, if we crash before writing the next journal entry: - */ - ret = bch2_journal_meta(j); - if (ret) { - bch_err(c, "journal replay: error %d flushing journal", ret); - goto err; - } - } + ret = bch2_journal_flush_all_pins(j); err: bch2_journal_entries_free(list); return ret; @@ -1654,7 +1660,7 @@ err: return ret; } -int bch2_dev_journal_alloc(struct bch_dev *ca) +int bch2_dev_journal_alloc(struct bch_fs *c, struct bch_dev *ca) { unsigned nr; @@ -1670,7 +1676,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) min(1 << 10, (1 << 20) / ca->mi.bucket_size)); - return bch2_set_nr_journal_buckets(ca->fs, ca, nr); + return bch2_set_nr_journal_buckets(c, ca, nr); } /* Journalling */ @@ -1723,6 +1729,7 @@ static inline void __journal_pin_add(struct journal *j, list_add(&pin->list, &pin_list->list); else INIT_LIST_HEAD(&pin->list); + wake_up(&j->wait); } static void journal_pin_add_entry(struct journal *j, @@ -1730,9 +1737,9 @@ static void journal_pin_add_entry(struct journal *j, struct journal_entry_pin *pin, journal_pin_flush_fn flush_fn) { - spin_lock_irq(&j->pin_lock); + spin_lock(&j->lock); __journal_pin_add(j, pin_list, pin, flush_fn); - spin_unlock_irq(&j->pin_lock); + spin_unlock(&j->lock); } void bch2_journal_pin_add(struct journal *j, @@ -1744,9 +1751,9 @@ void bch2_journal_pin_add(struct journal *j, ? journal_seq_pin(j, res->seq) : j->replay_pin_list; - spin_lock_irq(&j->pin_lock); + spin_lock(&j->lock); __journal_pin_add(j, pin_list, pin, flush_fn); - spin_unlock_irq(&j->pin_lock); + spin_unlock(&j->lock); } static inline bool __journal_pin_drop(struct journal *j, @@ -1766,13 +1773,12 @@ static inline bool __journal_pin_drop(struct journal *j, void bch2_journal_pin_drop(struct journal *j, struct journal_entry_pin *pin) { - unsigned long flags; bool wakeup = false; - spin_lock_irqsave(&j->pin_lock, flags); + spin_lock(&j->lock); if (journal_pin_active(pin)) wakeup = __journal_pin_drop(j, pin); - spin_unlock_irqrestore(&j->pin_lock, flags); + spin_unlock(&j->lock); /* * Unpinning a journal entry make make journal_next_bucket() succeed, if @@ -1789,7 +1795,7 @@ void bch2_journal_pin_add_if_older(struct journal *j, struct journal_entry_pin *pin, journal_pin_flush_fn flush_fn) { - spin_lock_irq(&j->pin_lock); + spin_lock(&j->lock); if (journal_pin_active(src_pin) && (!journal_pin_active(pin) || @@ -1800,24 +1806,19 @@ void bch2_journal_pin_add_if_older(struct journal *j, __journal_pin_add(j, src_pin->pin_list, pin, flush_fn); } - spin_unlock_irq(&j->pin_lock); + spin_unlock(&j->lock); } static struct journal_entry_pin * -journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) +__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) { struct journal_entry_pin_list *pin_list; - struct journal_entry_pin *ret = NULL; + struct journal_entry_pin *ret; unsigned iter; - /* so we don't iterate over empty fifo entries below: */ - if (!atomic_read(&fifo_peek_front(&j->pin).count)) { - spin_lock(&j->lock); - journal_reclaim_fast(j); - spin_unlock(&j->lock); - } + /* no need to iterate over empty fifo entries: */ + journal_reclaim_fast(j); - spin_lock_irq(&j->pin_lock); 
fifo_for_each_entry_ptr(pin_list, &j->pin, iter) { if (journal_pin_seq(j, pin_list) > seq_to_flush) break; @@ -1828,71 +1829,82 @@ journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) /* must be list_del_init(), see bch2_journal_pin_drop() */ list_move(&ret->list, &pin_list->flushed); *seq = journal_pin_seq(j, pin_list); - break; + return ret; } } - spin_unlock_irq(&j->pin_lock); - return ret; + return NULL; } -static bool journal_flush_done(struct journal *j, u64 seq_to_flush) +static struct journal_entry_pin * +journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) { - bool ret; + struct journal_entry_pin *ret; spin_lock(&j->lock); - journal_reclaim_fast(j); - - ret = (fifo_used(&j->pin) == 1 && - atomic_read(&fifo_peek_front(&j->pin).count) == 1) || - last_seq(j) > seq_to_flush; + ret = __journal_get_next_pin(j, seq_to_flush, seq); spin_unlock(&j->lock); return ret; } -void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) +static int journal_flush_done(struct journal *j, u64 seq_to_flush, + struct journal_entry_pin **pin, + u64 *pin_seq) { - struct journal_entry_pin *pin; - u64 pin_seq; + int ret; - if (!test_bit(JOURNAL_STARTED, &j->flags)) - return; + *pin = NULL; - while ((pin = journal_get_next_pin(j, seq_to_flush, &pin_seq))) - pin->flush(j, pin, pin_seq); + ret = bch2_journal_error(j); + if (ret) + return ret; + spin_lock(&j->lock); /* * If journal replay hasn't completed, the unreplayed journal entries - * hold refs on their corresponding sequence numbers and thus this would - * deadlock: + * hold refs on their corresponding sequence numbers */ - if (!test_bit(JOURNAL_REPLAY_DONE, &j->flags)) - return; + ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) || + !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || + last_seq(j) > seq_to_flush || + (fifo_used(&j->pin) == 1 && + atomic_read(&fifo_peek_front(&j->pin).count) == 1); + spin_unlock(&j->lock); - wait_event(j->wait, - journal_flush_done(j, seq_to_flush) || - bch2_journal_error(j)); + return ret; } -int bch2_journal_flush_all_pins(struct journal *j) +int bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) { struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_entry_pin *pin; + u64 pin_seq; bool flush; if (!test_bit(JOURNAL_STARTED, &j->flags)) return 0; - - bch2_journal_flush_pins(j, U64_MAX); +again: + wait_event(j->wait, journal_flush_done(j, seq_to_flush, &pin, &pin_seq)); + if (pin) { + /* flushing a journal pin might cause a new one to be added: */ + pin->flush(j, pin, pin_seq); + goto again; + } spin_lock(&j->lock); flush = last_seq(j) != j->last_seq_ondisk || - c->btree_roots_dirty; + (seq_to_flush == U64_MAX && c->btree_roots_dirty); spin_unlock(&j->lock); return flush ? 
bch2_journal_meta(j) : 0; } +int bch2_journal_flush_all_pins(struct journal *j) +{ + return bch2_journal_flush_pins(j, U64_MAX); +} + static bool should_discard_bucket(struct journal *j, struct journal_device *ja) { bool ret; @@ -2179,14 +2191,15 @@ static void journal_write_done(struct closure *cl) struct journal *j = container_of(cl, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *w = journal_prev_buf(j); - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&w->key); + struct bch_devs_list devs = + bch2_extent_devs(bkey_i_to_s_c_extent(&w->key)); - if (!bch2_extent_nr_ptrs(e)) { + if (!devs.nr) { bch_err(c, "unable to write journal to sufficient devices"); goto err; } - if (bch2_check_mark_super(c, e, BCH_DATA_JOURNAL)) + if (bch2_check_mark_super(c, BCH_DATA_JOURNAL, devs)) goto err; out: __bch2_time_stats_update(j->write_time, j->write_start_time); @@ -2194,8 +2207,7 @@ out: spin_lock(&j->lock); j->last_seq_ondisk = le64_to_cpu(w->data->last_seq); - journal_seq_pin(j, le64_to_cpu(w->data->seq))->devs = - bch2_extent_devs(bkey_i_to_s_c_extent(&w->key)); + journal_seq_pin(j, le64_to_cpu(w->data->seq))->devs = devs; /* * Updating last_seq_ondisk may let journal_reclaim_work() discard more @@ -2358,7 +2370,7 @@ static void journal_write(struct closure *cl) } no_io: - extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) + extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) ptr->offset += sectors; continue_at(cl, journal_write_done, system_highpri_wq); @@ -2737,7 +2749,9 @@ int bch2_journal_flush_device(struct journal *j, unsigned dev_idx) seq = journal_pin_seq(j, p); spin_unlock(&j->lock); - bch2_journal_flush_pins(j, seq); + ret = bch2_journal_flush_pins(j, seq); + if (ret) + return ret; mutex_lock(&c->replicas_gc_lock); bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL); @@ -2751,7 +2765,7 @@ int bch2_journal_flush_device(struct journal *j, unsigned dev_idx) seq++; spin_unlock(&j->lock); - ret = bch2_check_mark_super_devlist(c, &devs, BCH_DATA_JOURNAL); + ret = bch2_check_mark_super(c, BCH_DATA_JOURNAL, devs); spin_lock(&j->lock); } spin_unlock(&j->lock); @@ -2857,7 +2871,6 @@ int bch2_fs_journal_init(struct journal *j) static struct lock_class_key res_key; spin_lock_init(&j->lock); - spin_lock_init(&j->pin_lock); spin_lock_init(&j->err_lock); init_waitqueue_head(&j->wait); INIT_DELAYED_WORK(&j->write_work, journal_write_work); @@ -2956,7 +2969,7 @@ ssize_t bch2_journal_print_pins(struct journal *j, char *buf) ssize_t ret = 0; unsigned i; - spin_lock_irq(&j->pin_lock); + spin_lock(&j->lock); fifo_for_each_entry_ptr(pin_list, &j->pin, i) { ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%llu: count %u\n", @@ -2977,7 +2990,7 @@ ssize_t bch2_journal_print_pins(struct journal *j, char *buf) "\t%p %pf\n", pin, pin->flush); } - spin_unlock_irq(&j->pin_lock); + spin_unlock(&j->lock); return ret; } diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 61197e57..5abf356e 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -165,7 +165,7 @@ void bch2_journal_pin_add_if_older(struct journal *, struct journal_entry_pin *, struct journal_entry_pin *, journal_pin_flush_fn); -void bch2_journal_flush_pins(struct journal *, u64); +int bch2_journal_flush_pins(struct journal *, u64); int bch2_journal_flush_all_pins(struct journal *); struct closure; @@ -390,7 +390,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j) ssize_t bch2_journal_print_debug(struct journal *, char *); ssize_t bch2_journal_print_pins(struct 
journal *, char *); -int bch2_dev_journal_alloc(struct bch_dev *); +int bch2_dev_journal_alloc(struct bch_fs *, struct bch_dev *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); void bch2_fs_journal_stop(struct journal *); diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index 66923cf4..5eea6579 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -169,12 +169,6 @@ struct journal { DECLARE_FIFO(struct journal_entry_pin_list, pin); struct journal_entry_pin_list *replay_pin_list; - /* - * Protects the pin lists - the fifo itself is still protected by - * j->lock though: - */ - spinlock_t pin_lock; - struct mutex blacklist_lock; struct list_head seq_blacklist; diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index 328316a1..2033db81 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -16,13 +16,8 @@ static bool migrate_pred(void *arg, struct bkey_s_c_extent e) { struct bch_dev *ca = arg; - const struct bch_extent_ptr *ptr; - extent_for_each_ptr(e, ptr) - if (ptr->dev == ca->dev_idx) - return true; - - return false; + return bch2_extent_has_device(e, ca->dev_idx); } #define MAX_DATA_OFF_ITER 10 @@ -32,30 +27,17 @@ static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca, { struct btree_iter iter; struct bkey_s_c k; - u64 keys_moved, sectors_moved; + struct bch_move_stats stats; unsigned pass = 0; int ret = 0; - BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW); - if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_USER))) return 0; /* - * In theory, only one pass should be necessary as we've - * quiesced all writes before calling this. - * - * However, in practice, more than one pass may be necessary: - * - Some move fails due to an error. We can can find this out - * from the moving_context. - * - Some key swap failed because some of the pointers in the - * key in the tree changed due to caching behavior, btree gc - * pruning stale pointers, or tiering (if the device being - * removed is in tier 0). A smarter bkey_cmpxchg would - * handle these cases. - * - * Thus this scans the tree one more time than strictly necessary, - * but that can be viewed as a verification pass. 
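The migrate.c hunk that follows replaces the long rationale above with a terser XXX comment, but the mechanism is unchanged: user data is drained with a bounded retry loop, re-scanning while the previous pass still moved keys and giving up after MAX_DATA_OFF_ITER attempts. A compact sketch of that shape (not part of the diff; move_pass() is a hypothetical stand-in for a bch2_move_data() call plus its stats):

	/* illustrative sketch only -- not part of the patch */
	#define MAX_DATA_OFF_ITER	10

	/* returns keys moved by one pass, or a negative error */
	typedef long long (*move_pass_fn)(void *dev);

	static int drain_device(move_pass_fn move_pass, void *dev)
	{
		unsigned pass = 0;
		long long moved;

		do {
			moved = move_pass(dev);
			if (moved < 0)
				return (int) moved;	/* hard error from the pass */
		} while (moved && pass++ < MAX_DATA_OFF_ITER);

		return moved ? -1 : 0;	/* -1: data still left after all passes */
	}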
+ * XXX: we should be able to do this in one pass, but bch2_move_data() + * can spuriously fail to move an extent due to racing with other move + * operations */ do { ret = bch2_move_data(c, NULL, @@ -65,15 +47,14 @@ static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca, 0, ca->dev_idx, migrate_pred, ca, - &keys_moved, - §ors_moved); + &stats); if (ret) { bch_err(c, "error migrating data: %i", ret); return ret; } - } while (keys_moved && pass++ < MAX_DATA_OFF_ITER); + } while (atomic64_read(&stats.keys_moved) && pass++ < MAX_DATA_OFF_ITER); - if (keys_moved) { + if (atomic64_read(&stats.keys_moved)) { bch_err(c, "unable to migrate all data in %d iterations", MAX_DATA_OFF_ITER); return -1; @@ -83,11 +64,7 @@ static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca, bch2_replicas_gc_start(c, 1 << BCH_DATA_USER); for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH, k) { - if (!bkey_extent_is_data(k.k)) - continue; - - ret = bch2_check_mark_super(c, bkey_s_c_to_extent(k), - BCH_DATA_USER); + ret = bch2_check_mark_super(c, BCH_DATA_USER, bch2_bkey_devs(k)); if (ret) { bch_err(c, "error migrating data %i from check_mark_super()", ret); break; @@ -99,107 +76,34 @@ static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca, return ret; } -static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca, - enum btree_id id) -{ - struct btree_iter iter; - struct btree *b; - int ret; - - BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW); - - for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); - - if (!bch2_extent_has_device(e, ca->dev_idx)) - continue; - - ret = bch2_btree_node_rewrite(c, &iter, b->data->keys.seq, 0); - if (ret) { - bch2_btree_iter_unlock(&iter); - return ret; - } - - bch2_btree_iter_set_locks_want(&iter, 0); - } - ret = bch2_btree_iter_unlock(&iter); - if (ret) - return ret; /* btree IO error */ - - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { - for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); - - BUG_ON(bch2_extent_has_device(e, ca->dev_idx)); - } - bch2_btree_iter_unlock(&iter); - } - - return 0; -} - -/* - * This moves only the meta-data off, leaving the data (if any) in place. - * The data is moved off by bch_move_data_off_device, if desired, and - * called first. - * - * Before calling this, allocation of buckets to the device must have - * been disabled, as else we'll continue to write meta-data to the device - * when new buckets are picked for meta-data writes. - * In addition, the copying gc and allocator threads for the device - * must have been stopped. The allocator thread is the only thread - * that writes prio/gen information. - * - * Meta-data consists of: - * - Btree nodes - * - Prio/gen information - * - Journal entries - * - Superblock - * - * This has to move the btree nodes and the journal only: - * - prio/gen information is not written once the allocator thread is stopped. - * also, as the prio/gen information is per-device it is not moved. - * - the superblock will be written by the caller once after everything - * is stopped. - * - * Note that currently there is no way to stop btree node and journal - * meta-data writes to a device without moving the meta-data because - * once a bucket is open for a btree node, unless a replacement btree - * node is allocated (and the tree updated), the bucket will continue - * to be written with updates. 
Similarly for the journal (it gets - * written until filled). - * - * This routine leaves the data (if any) in place. Whether the data - * should be moved off is a decision independent of whether the meta - * data should be moved off and stopped: - * - * - For device removal, both data and meta-data are moved off, in - * that order. - * - * - However, for turning a device read-only without removing it, only - * meta-data is moved off since that's the only way to prevent it - * from being written. Data is left in the device, but no new data - * is written. - */ - static int bch2_dev_metadata_migrate(struct bch_fs *c, struct bch_dev *ca, int flags) { - unsigned i; + struct btree_iter iter; + struct btree *b; int ret = 0; + unsigned id; - BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW); - - if (!(bch2_dev_has_data(c, ca) & - ((1 << BCH_DATA_JOURNAL)| - (1 << BCH_DATA_BTREE)))) + if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_BTREE))) return 0; mutex_lock(&c->replicas_gc_lock); bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE); - for (i = 0; i < BTREE_ID_NR; i++) { - ret = bch2_move_btree_off(c, ca, i); + for (id = 0; id < BTREE_ID_NR; id++) { + for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { + struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); + + if (!bch2_extent_has_device(e, ca->dev_idx)) + continue; + + ret = bch2_btree_node_rewrite(c, &iter, b->data->keys.seq, 0); + if (ret) { + bch2_btree_iter_unlock(&iter); + goto err; + } + } + ret = bch2_btree_iter_unlock(&iter); if (ret) goto err; } @@ -211,6 +115,9 @@ err: int bch2_dev_data_migrate(struct bch_fs *c, struct bch_dev *ca, int flags) { + BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW && + bch2_dev_is_online(ca)); + return bch2_dev_usrdata_migrate(c, ca, flags) ?: bch2_dev_metadata_migrate(c, ca, flags); } @@ -233,17 +140,6 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e, return 0; } -/* - * This doesn't actually move any data -- it marks the keys as bad - * if they contain a pointer to a device that is forcibly removed - * and don't have other valid pointers. If there are valid pointers, - * the necessary pointers to the removed device are replaced with - * bad pointers instead. - * - * This is only called if bch_move_data_off_device above failed, meaning - * that we've already tried to move the data MAX_DATA_OFF_ITER times and - * are not likely to succeed if we try again. 
- */ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { struct bkey_s_c k; @@ -260,11 +156,15 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) while ((k = bch2_btree_iter_peek(&iter)).k && !(ret = btree_iter_err(k))) { - if (!bkey_extent_is_data(k.k)) - goto advance; - - if (!bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) - goto advance; + if (!bkey_extent_is_data(k.k) || + !bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) { + ret = bch2_check_mark_super(c, BCH_DATA_USER, + bch2_bkey_devs(k)); + if (ret) + break; + bch2_btree_iter_advance_pos(&iter); + continue; + } bkey_reassemble(&tmp.key, k); e = bkey_i_to_s_extent(&tmp.key); @@ -280,8 +180,9 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) */ bch2_extent_normalize(c, e.s); - if (bkey_extent_is_data(e.k) && - (ret = bch2_check_mark_super(c, e.c, BCH_DATA_USER))) + ret = bch2_check_mark_super(c, BCH_DATA_USER, + bch2_bkey_devs(bkey_i_to_s_c(&tmp.key))); + if (ret) break; iter.pos = bkey_start_pos(&tmp.key.k); @@ -300,16 +201,6 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) ret = 0; if (ret) break; - - continue; -advance: - if (bkey_extent_is_data(k.k)) { - ret = bch2_check_mark_super(c, bkey_s_c_to_extent(k), - BCH_DATA_USER); - if (ret) - break; - } - bch2_btree_iter_advance_pos(&iter); } bch2_btree_iter_unlock(&iter); @@ -346,8 +237,8 @@ retry: dev_idx)) { bch2_btree_iter_set_locks_want(&iter, 0); - ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key), - BCH_DATA_BTREE); + ret = bch2_check_mark_super(c, BCH_DATA_BTREE, + bch2_bkey_devs(bkey_i_to_s_c(&b->key))); if (ret) goto err; } else { diff --git a/libbcachefs/move.c b/libbcachefs/move.c index a3de3b05..7c7f436c 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -31,15 +31,10 @@ struct moving_context { /* Closure for waiting on all reads and writes to complete */ struct closure cl; - /* Key and sector moves issued, updated from submission context */ - u64 keys_moved; - u64 sectors_moved; - atomic64_t sectors_raced; + struct bch_move_stats *stats; struct list_head reads; - atomic_t sectors_in_flight; - wait_queue_head_t wait; }; @@ -116,8 +111,8 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bch2_extent_normalize(c, extent_i_to_s(insert).s); bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert)); - ret = bch2_check_mark_super(c, extent_i_to_s_c(insert), - BCH_DATA_USER); + ret = bch2_check_mark_super(c, BCH_DATA_USER, + bch2_extent_devs(extent_i_to_s_c(insert))); if (ret) break; @@ -145,7 +140,7 @@ next: nomatch: if (m->ctxt) atomic64_add(k.k->p.offset - iter.pos.offset, - &m->ctxt->sectors_raced); + &m->ctxt->stats->sectors_raced); atomic_long_inc(&c->extent_migrate_raced); trace_move_race(&new->k); bch2_btree_iter_advance_pos(&iter); @@ -303,8 +298,8 @@ static int bch2_move_extent(struct bch_fs *c, io->write.op.devs = devs; io->write.op.write_point = wp; - ctxt->keys_moved++; - ctxt->sectors_moved += k.k->size; + atomic64_inc(&ctxt->stats->keys_moved); + atomic64_add(k.k->size, &ctxt->stats->sectors_moved); trace_move_extent(k.k); @@ -353,24 +348,6 @@ static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) atomic_read(&ctxt->sectors_in_flight) != sectors_pending); } -static void bch2_move_ctxt_exit(struct moving_context *ctxt) -{ - move_ctxt_wait_event(ctxt, !atomic_read(&ctxt->sectors_in_flight)); - closure_sync(&ctxt->cl); - - EBUG_ON(!list_empty(&ctxt->reads)); - 
EBUG_ON(atomic_read(&ctxt->sectors_in_flight)); -} - -static void bch2_move_ctxt_init(struct moving_context *ctxt) -{ - memset(ctxt, 0, sizeof(*ctxt)); - closure_init_stack(&ctxt->cl); - - INIT_LIST_HEAD(&ctxt->reads); - init_waitqueue_head(&ctxt->wait); -} - int bch2_move_data(struct bch_fs *c, struct bch_ratelimit *rate, unsigned sectors_in_flight, @@ -379,20 +356,21 @@ int bch2_move_data(struct bch_fs *c, int btree_insert_flags, int move_device, move_pred_fn pred, void *arg, - u64 *keys_moved, - u64 *sectors_moved) + struct bch_move_stats *stats) { bool kthread = (current->flags & PF_KTHREAD) != 0; - struct moving_context ctxt; + struct moving_context ctxt = { .stats = stats }; struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts); - struct btree_iter iter; BKEY_PADDED(k) tmp; struct bkey_s_c k; u64 cur_inum = U64_MAX; int ret = 0; - bch2_move_ctxt_init(&ctxt); - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, + memset(stats, 0, sizeof(*stats)); + closure_init_stack(&ctxt.cl); + INIT_LIST_HEAD(&ctxt.reads); + init_waitqueue_head(&ctxt.wait); + bch2_btree_iter_init(&stats->iter, c, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH); if (rate) @@ -400,7 +378,7 @@ int bch2_move_data(struct bch_fs *c, while (!kthread || !(ret = kthread_should_stop())) { if (atomic_read(&ctxt.sectors_in_flight) >= sectors_in_flight) { - bch2_btree_iter_unlock(&iter); + bch2_btree_iter_unlock(&stats->iter); move_ctxt_wait_event(&ctxt, atomic_read(&ctxt.sectors_in_flight) < sectors_in_flight); @@ -408,11 +386,11 @@ int bch2_move_data(struct bch_fs *c, if (rate && bch2_ratelimit_delay(rate) && - (bch2_btree_iter_unlock(&iter), + (bch2_btree_iter_unlock(&stats->iter), (ret = bch2_ratelimit_wait_freezable_stoppable(rate)))) break; peek: - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek(&stats->iter); if (!k.k) break; ret = btree_iter_err(k); @@ -420,13 +398,13 @@ peek: break; if (!bkey_extent_is_data(k.k)) - goto next; + goto next_nondata; if (cur_inum != k.k->p.inode) { struct bch_inode_unpacked inode; /* don't hold btree locks while looking up inode: */ - bch2_btree_iter_unlock(&iter); + bch2_btree_iter_unlock(&stats->iter); opts = bch2_opts_to_inode_opts(c->opts); if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode)) @@ -441,7 +419,7 @@ peek: /* unlock before doing IO: */ bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); - bch2_btree_iter_unlock(&iter); + bch2_btree_iter_unlock(&stats->iter); if (bch2_move_extent(c, &ctxt, devs, wp, btree_insert_flags, @@ -454,17 +432,24 @@ peek: if (rate) bch2_ratelimit_increment(rate, k.k->size); next: - bch2_btree_iter_advance_pos(&iter); - bch2_btree_iter_cond_resched(&iter); + atomic64_add(k.k->size * bch2_extent_nr_dirty_ptrs(k), + &stats->sectors_seen); +next_nondata: + bch2_btree_iter_advance_pos(&stats->iter); + bch2_btree_iter_cond_resched(&stats->iter); } - bch2_btree_iter_unlock(&iter); - bch2_move_ctxt_exit(&ctxt); + bch2_btree_iter_unlock(&stats->iter); - trace_move_data(c, ctxt.sectors_moved, ctxt.keys_moved); + move_ctxt_wait_event(&ctxt, !atomic_read(&ctxt.sectors_in_flight)); + closure_sync(&ctxt.cl); - *keys_moved = ctxt.keys_moved; - *sectors_moved = ctxt.sectors_moved; + EBUG_ON(!list_empty(&ctxt.reads)); + EBUG_ON(atomic_read(&ctxt.sectors_in_flight)); + + trace_move_data(c, + atomic64_read(&stats->sectors_moved), + atomic64_read(&stats->keys_moved)); return ret; } diff --git a/libbcachefs/move.h b/libbcachefs/move.h index 2e884ce0..24d6ddfa 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -1,6 +1,7 @@ #ifndef 
_BCACHEFS_MOVE_H #define _BCACHEFS_MOVE_H +#include "btree_iter.h" #include "buckets.h" #include "io_types.h" @@ -25,10 +26,19 @@ void bch2_migrate_write_init(struct migrate_write *, struct bch_read_bio *); typedef bool (*move_pred_fn)(void *, struct bkey_s_c_extent); +struct bch_move_stats { + struct btree_iter iter; + + atomic64_t keys_moved; + atomic64_t sectors_moved; + atomic64_t sectors_seen; + atomic64_t sectors_raced; +}; + int bch2_move_data(struct bch_fs *, struct bch_ratelimit *, unsigned, struct bch_devs_mask *, struct write_point_specifier, int, int, move_pred_fn, void *, - u64 *, u64 *); + struct bch_move_stats *); #endif /* _BCACHEFS_MOVE_H */ diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 90eb4ca2..d6f2968e 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -100,7 +100,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) copygc_heap *h = &ca->copygc_heap; struct copygc_heap_entry e, *i; struct bucket_array *buckets; - u64 keys_moved, sectors_moved; + struct bch_move_stats move_stats; u64 sectors_to_move = 0, sectors_not_moved = 0; u64 buckets_to_move, buckets_not_moved = 0; size_t b; @@ -167,8 +167,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) BTREE_INSERT_USE_RESERVE, ca->dev_idx, copygc_pred, ca, - &keys_moved, - §ors_moved); + &move_stats); down_read(&ca->bucket_lock); buckets = bucket_array(ca); @@ -189,7 +188,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) buckets_not_moved, buckets_to_move); trace_copygc(ca, - sectors_moved, sectors_not_moved, + atomic64_read(&move_stats.sectors_moved), sectors_not_moved, buckets_to_move, buckets_not_moved); } diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index e6833d95..eae63cf8 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -167,6 +167,27 @@ int bch2_opt_lookup(const char *name) return -1; } +struct synonym { + const char *s1, *s2; +}; + +static const struct synonym bch_opt_synonyms[] = { + { "quota", "usrquota" }, +}; + +static int bch2_mount_opt_lookup(const char *name) +{ + const struct synonym *i; + + for (i = bch_opt_synonyms; + i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms); + i++) + if (!strcmp(name, i->s1)) + name = i->s2; + + return bch2_opt_lookup(name); +} + int bch2_opt_parse(const struct bch_option *opt, const char *val, u64 *res) { ssize_t ret; @@ -211,7 +232,7 @@ int bch2_parse_mount_opts(struct bch_opts *opts, char *options) val = opt; if (val) { - id = bch2_opt_lookup(name); + id = bch2_mount_opt_lookup(name); if (id < 0) goto bad_opt; @@ -219,12 +240,12 @@ int bch2_parse_mount_opts(struct bch_opts *opts, char *options) if (ret < 0) goto bad_val; } else { - id = bch2_opt_lookup(name); + id = bch2_mount_opt_lookup(name); v = 1; if (id < 0 && !strncmp("no", name, 2)) { - id = bch2_opt_lookup(name + 2); + id = bch2_mount_opt_lookup(name + 2); v = 0; } @@ -242,6 +263,11 @@ int bch2_parse_mount_opts(struct bch_opts *opts, char *options) !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) goto bad_opt; + if ((id == Opt_usrquota || + id == Opt_grpquota) && + !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) + goto bad_opt; + bch2_opt_set_by_id(opts, id, v); } diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 126056e6..5d42dd5f 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -112,6 +112,15 @@ enum opt_type { BCH_OPT(acl, u8, OPT_MOUNT, \ OPT_BOOL(), \ BCH_SB_POSIX_ACL, true) \ + BCH_OPT(usrquota, u8, OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_USRQUOTA, false) \ + BCH_OPT(grpquota, u8, OPT_MOUNT, \ + OPT_BOOL(), \ + 
BCH_SB_GRPQUOTA, false) \ + BCH_OPT(prjquota, u8, OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_PRJQUOTA, false) \ BCH_OPT(degraded, u8, OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false) \ @@ -171,7 +180,7 @@ static const struct bch_opts bch2_opts_default = { #define opt_defined(_opts, _name) ((_opts)._name##_defined) #define opt_get(_opts, _name) \ - (opt_defined(_opts, _name) ? _opts._name : bch2_opts_default._name) + (opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name) #define opt_set(_opts, _name, _v) \ do { \ diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c new file mode 100644 index 00000000..c550fd9e --- /dev/null +++ b/libbcachefs/quota.c @@ -0,0 +1,786 @@ +#include "bcachefs.h" +#include "btree_update.h" +#include "inode.h" +#include "quota.h" +#include "super-io.h" + +static const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_quota dq; + + if (k.k->p.inode >= QTYP_NR) + return "invalid quota type"; + + switch (k.k->type) { + case BCH_QUOTA: { + dq = bkey_s_c_to_quota(k); + + if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) + return "incorrect value size"; + + return NULL; + } + default: + return "invalid type"; + } +} + +static const char * const bch2_quota_counters[] = { + "space", + "inodes", +}; + +static void bch2_quota_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) +{ + char *out = buf, *end= buf + size; + struct bkey_s_c_quota dq; + unsigned i; + + switch (k.k->type) { + case BCH_QUOTA: + dq = bkey_s_c_to_quota(k); + + for (i = 0; i < Q_COUNTERS; i++) + out += scnprintf(out, end - out, "%s hardlimit %llu softlimit %llu", + bch2_quota_counters[i], + le64_to_cpu(dq.v->c[i].hardlimit), + le64_to_cpu(dq.v->c[i].softlimit)); + break; + } +} + +const struct bkey_ops bch2_bkey_quota_ops = { + .key_invalid = bch2_quota_invalid, + .val_to_text = bch2_quota_to_text, +}; + +#ifdef CONFIG_BCACHEFS_QUOTA + +#include +#include +#include + +static inline unsigned __next_qtype(unsigned i, unsigned qtypes) +{ + qtypes >>= i; + return qtypes ? 
i + __ffs(qtypes) : QTYP_NR; +} + +#define for_each_set_qtype(_c, _i, _q, _qtypes) \ + for (_i = 0; \ + (_i = __next_qtype(_i, _qtypes), \ + _q = &(_c)->quotas[_i], \ + _i < QTYP_NR); \ + _i++) + +static inline unsigned enabled_qtypes(struct bch_fs *c) +{ + return ((c->opts.usrquota << QTYP_USR)| + (c->opts.grpquota << QTYP_GRP)| + (c->opts.prjquota << QTYP_PRJ)); +} + +static bool ignore_hardlimit(struct bch_memquota_type *q) +{ + if (capable(CAP_SYS_RESOURCE)) + return true; +#if 0 + struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; + + return capable(CAP_SYS_RESOURCE) && + (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || + !(info->dqi_flags & DQF_ROOT_SQUASH)); +#endif + return false; +} + +enum quota_msg { + SOFTWARN, /* Softlimit reached */ + SOFTLONGWARN, /* Grace time expired */ + HARDWARN, /* Hardlimit reached */ + + HARDBELOW, /* Usage got below inode hardlimit */ + SOFTBELOW, /* Usage got below inode softlimit */ +}; + +static int quota_nl[][Q_COUNTERS] = { + [HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN, + [SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN, + [SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN, + [HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW, + [SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW, + + [HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN, + [SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN, + [SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN, + [HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW, + [SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW, +}; + +struct quota_msgs { + u8 nr; + struct { + u8 qtype; + u8 msg; + } m[QTYP_NR * Q_COUNTERS]; +}; + +static void prepare_msg(unsigned qtype, + enum quota_counters counter, + struct quota_msgs *msgs, + enum quota_msg msg_type) +{ + BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m)); + + msgs->m[msgs->nr].qtype = qtype; + msgs->m[msgs->nr].msg = quota_nl[msg_type][counter]; + msgs->nr++; +} + +static void prepare_warning(struct memquota_counter *qc, + unsigned qtype, + enum quota_counters counter, + struct quota_msgs *msgs, + enum quota_msg msg_type) +{ + if (qc->warning_issued & (1 << msg_type)) + return; + + prepare_msg(qtype, counter, msgs, msg_type); +} + +static void flush_warnings(struct bch_qid qid, + struct super_block *sb, + struct quota_msgs *msgs) +{ + unsigned i; + + for (i = 0; i < msgs->nr; i++) + quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]), + sb->s_dev, msgs->m[i].msg); +} + +static int bch2_quota_check_limit(struct bch_fs *c, + unsigned qtype, + struct bch_memquota *mq, + struct quota_msgs *msgs, + enum quota_counters counter, + s64 v, + enum quota_acct_mode mode) +{ + struct bch_memquota_type *q = &c->quotas[qtype]; + struct memquota_counter *qc = &mq->c[counter]; + u64 n = qc->v + v; + + BUG_ON((s64) n < 0); + + if (mode == BCH_QUOTA_NOCHECK) + return 0; + + if (v <= 0) { + if (n < qc->hardlimit && + (qc->warning_issued & (1 << HARDWARN))) { + qc->warning_issued &= ~(1 << HARDWARN); + prepare_msg(qtype, counter, msgs, HARDBELOW); + } + + if (n < qc->softlimit && + (qc->warning_issued & (1 << SOFTWARN))) { + qc->warning_issued &= ~(1 << SOFTWARN); + prepare_msg(qtype, counter, msgs, SOFTBELOW); + } + + qc->warning_issued = 0; + return 0; + } + + if (qc->hardlimit && + qc->hardlimit < n && + !ignore_hardlimit(q)) { + if (mode == BCH_QUOTA_PREALLOC) + return -EDQUOT; + + prepare_warning(qc, qtype, counter, msgs, HARDWARN); + } + + if (qc->softlimit && + qc->softlimit < n && + qc->timer && + ktime_get_real_seconds() >= qc->timer && + !ignore_hardlimit(q)) { + if (mode == BCH_QUOTA_PREALLOC) + return -EDQUOT; + + prepare_warning(qc, 
qtype, counter, msgs, SOFTLONGWARN); + } + + if (qc->softlimit && + qc->softlimit < n && + qc->timer == 0) { + if (mode == BCH_QUOTA_PREALLOC) + return -EDQUOT; + + prepare_warning(qc, qtype, counter, msgs, SOFTWARN); + + /* XXX is this the right one? */ + qc->timer = ktime_get_real_seconds() + + q->limits[counter].warnlimit; + } + + return 0; +} + +int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, + enum quota_counters counter, s64 v, + enum quota_acct_mode mode) +{ + unsigned qtypes = enabled_qtypes(c); + struct bch_memquota_type *q; + struct bch_memquota *mq[QTYP_NR]; + struct quota_msgs msgs; + unsigned i; + int ret = 0; + + memset(&msgs, 0, sizeof(msgs)); + + for_each_set_qtype(c, i, q, qtypes) + mutex_lock(&q->lock); + + for_each_set_qtype(c, i, q, qtypes) { + mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_NOFS); + if (!mq[i]) { + ret = -ENOMEM; + goto err; + } + + ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode); + if (ret) + goto err; + } + + for_each_set_qtype(c, i, q, qtypes) + mq[i]->c[counter].v += v; +err: + for_each_set_qtype(c, i, q, qtypes) + mutex_unlock(&q->lock); + + flush_warnings(qid, c->vfs_sb, &msgs); + + return ret; +} + +static void __bch2_quota_transfer(struct bch_memquota *src_q, + struct bch_memquota *dst_q, + enum quota_counters counter, s64 v) +{ + BUG_ON(v > src_q->c[counter].v); + BUG_ON(v + dst_q->c[counter].v < v); + + src_q->c[counter].v -= v; + dst_q->c[counter].v += v; +} + +int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, + struct bch_qid dst, + struct bch_qid src, u64 space) +{ + struct bch_memquota_type *q; + struct bch_memquota *src_q[3], *dst_q[3]; + struct quota_msgs msgs; + unsigned i; + int ret = 0; + + qtypes &= enabled_qtypes(c); + + memset(&msgs, 0, sizeof(msgs)); + + for_each_set_qtype(c, i, q, qtypes) + mutex_lock(&q->lock); + + for_each_set_qtype(c, i, q, qtypes) { + src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_NOFS); + dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_NOFS); + + if (!src_q[i] || !dst_q[i]) { + ret = -ENOMEM; + goto err; + } + + ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC, + dst_q[i]->c[Q_SPC].v + space, + BCH_QUOTA_PREALLOC); + if (ret) + goto err; + + ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO, + dst_q[i]->c[Q_INO].v + 1, + BCH_QUOTA_PREALLOC); + if (ret) + goto err; + } + + for_each_set_qtype(c, i, q, qtypes) { + __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space); + __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1); + } + +err: + for_each_set_qtype(c, i, q, qtypes) + mutex_unlock(&q->lock); + + flush_warnings(dst, c->vfs_sb, &msgs); + + return ret; +} + +static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_quota dq; + struct bch_memquota_type *q; + struct bch_memquota *mq; + unsigned i; + + BUG_ON(k.k->p.inode >= QTYP_NR); + + switch (k.k->type) { + case BCH_QUOTA: + dq = bkey_s_c_to_quota(k); + q = &c->quotas[k.k->p.inode]; + + mutex_lock(&q->lock); + mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL); + if (!mq) { + mutex_unlock(&q->lock); + return -ENOMEM; + } + + for (i = 0; i < Q_COUNTERS; i++) { + mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit); + mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit); + } + + mutex_unlock(&q->lock); + } + + return 0; +} + +static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key(&iter, c, BTREE_ID_QUOTAS, POS(type, 0), + 
BTREE_ITER_PREFETCH, k) { + if (k.k->p.inode != type) + break; + + ret = __bch2_quota_set(c, k); + if (ret) + break; + } + + return bch2_btree_iter_unlock(&iter) ?: ret; +} + +void bch2_fs_quota_exit(struct bch_fs *c) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(c->quotas); i++) + genradix_free(&c->quotas[i].table); +} + +void bch2_fs_quota_init(struct bch_fs *c) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(c->quotas); i++) + mutex_init(&c->quotas[i].lock); +} + +static void bch2_sb_quota_read(struct bch_fs *c) +{ + struct bch_sb_field_quota *sb_quota; + unsigned i, j; + + sb_quota = bch2_sb_get_quota(c->disk_sb); + if (!sb_quota) + return; + + for (i = 0; i < QTYP_NR; i++) { + struct bch_memquota_type *q = &c->quotas[i]; + + for (j = 0; j < Q_COUNTERS; j++) { + q->limits[j].timelimit = + le32_to_cpu(sb_quota->q[i].c[j].timelimit); + q->limits[j].warnlimit = + le32_to_cpu(sb_quota->q[i].c[j].warnlimit); + } + } +} + +int bch2_fs_quota_read(struct bch_fs *c) +{ + unsigned i, qtypes = enabled_qtypes(c); + struct bch_memquota_type *q; + struct btree_iter iter; + struct bch_inode_unpacked u; + struct bkey_s_c k; + int ret; + + mutex_lock(&c->sb_lock); + bch2_sb_quota_read(c); + mutex_unlock(&c->sb_lock); + + for_each_set_qtype(c, i, q, qtypes) { + ret = bch2_quota_init_type(c, i); + if (ret) + return ret; + } + + for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, + BTREE_ITER_PREFETCH, k) { + switch (k.k->type) { + case BCH_INODE_FS: + ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u); + if (ret) + return ret; + + bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, + BCH_QUOTA_NOCHECK); + bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, + BCH_QUOTA_NOCHECK); + } + } + return bch2_btree_iter_unlock(&iter) ?: ret; +} + +/* Enable/disable/delete quotas for an entire filesystem: */ + +static int bch2_quota_enable(struct super_block *sb, unsigned uflags) +{ + struct bch_fs *c = sb->s_fs_info; + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + + /* Accounting must be enabled at mount time: */ + if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT)) + return -EINVAL; + + /* Can't enable enforcement without accounting: */ + if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota) + return -EINVAL; + + if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota) + return -EINVAL; + + if (uflags & FS_QUOTA_PDQ_ENFD) + return -EINVAL; + + mutex_lock(&c->sb_lock); + if (uflags & FS_QUOTA_UDQ_ENFD) + SET_BCH_SB_USRQUOTA(c->disk_sb, true); + + if (uflags & FS_QUOTA_GDQ_ENFD) + SET_BCH_SB_GRPQUOTA(c->disk_sb, true); +#if 0 + if (uflags & FS_QUOTA_PDQ_ENFD) + SET_BCH_SB_PRJQUOTA(c->disk_sb, true); +#endif + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +} + +static int bch2_quota_disable(struct super_block *sb, unsigned uflags) +{ + struct bch_fs *c = sb->s_fs_info; + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + + mutex_lock(&c->sb_lock); + if (uflags & FS_QUOTA_UDQ_ENFD) + SET_BCH_SB_USRQUOTA(c->disk_sb, false); + + if (uflags & FS_QUOTA_GDQ_ENFD) + SET_BCH_SB_GRPQUOTA(c->disk_sb, false); + + if (uflags & FS_QUOTA_PDQ_ENFD) + SET_BCH_SB_PRJQUOTA(c->disk_sb, false); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +} + +static int bch2_quota_remove(struct super_block *sb, unsigned uflags) +{ + struct bch_fs *c = sb->s_fs_info; + int ret; + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + + if (uflags & FS_USER_QUOTA) { + if (c->opts.usrquota) + return -EINVAL; + + ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, + POS(QTYP_USR, 0), + POS(QTYP_USR + 1, 
0), + ZERO_VERSION, NULL, NULL, NULL); + if (ret) + return ret; + } + + if (uflags & FS_GROUP_QUOTA) { + if (c->opts.grpquota) + return -EINVAL; + + ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, + POS(QTYP_GRP, 0), + POS(QTYP_GRP + 1, 0), + ZERO_VERSION, NULL, NULL, NULL); + if (ret) + return ret; + } + + if (uflags & FS_PROJ_QUOTA) { + if (c->opts.prjquota) + return -EINVAL; + + ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, + POS(QTYP_PRJ, 0), + POS(QTYP_PRJ + 1, 0), + ZERO_VERSION, NULL, NULL, NULL); + if (ret) + return ret; + } + + return 0; +} + +/* + * Return quota status information, such as enforcements, quota file inode + * numbers etc. + */ +static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state) +{ + struct bch_fs *c = sb->s_fs_info; + unsigned qtypes = enabled_qtypes(c); + unsigned i; + + memset(state, 0, sizeof(*state)); + + for (i = 0; i < QTYP_NR; i++) { + state->s_state[i].flags |= QCI_SYSFILE; + + if (!(qtypes & (1 << i))) + continue; + + state->s_state[i].flags |= QCI_ACCT_ENABLED; + + state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit; + state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit; + + state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit; + state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit; + } + + return 0; +} + +/* + * Adjust quota timers & warnings + */ +static int bch2_quota_set_info(struct super_block *sb, int type, + struct qc_info *info) +{ + struct bch_fs *c = sb->s_fs_info; + struct bch_sb_field_quota *sb_quota; + struct bch_memquota_type *q; + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + + if (type >= QTYP_NR) + return -EINVAL; + + if (!((1 << type) & enabled_qtypes(c))) + return -ESRCH; + + if (info->i_fieldmask & + ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS)) + return -EINVAL; + + q = &c->quotas[type]; + + mutex_lock(&c->sb_lock); + sb_quota = bch2_sb_get_quota(c->disk_sb); + if (!sb_quota) { + sb_quota = bch2_fs_sb_resize_quota(c, sizeof(*sb_quota) / sizeof(u64)); + if (!sb_quota) + return -ENOSPC; + } + + if (info->i_fieldmask & QC_SPC_TIMER) + sb_quota->q[type].c[Q_SPC].timelimit = + cpu_to_le32(info->i_spc_timelimit); + + if (info->i_fieldmask & QC_SPC_WARNS) + sb_quota->q[type].c[Q_SPC].warnlimit = + cpu_to_le32(info->i_spc_warnlimit); + + if (info->i_fieldmask & QC_INO_TIMER) + sb_quota->q[type].c[Q_INO].timelimit = + cpu_to_le32(info->i_ino_timelimit); + + if (info->i_fieldmask & QC_INO_WARNS) + sb_quota->q[type].c[Q_INO].warnlimit = + cpu_to_le32(info->i_ino_warnlimit); + + bch2_sb_quota_read(c); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +} + +/* Get/set individual quotas: */ + +static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src) +{ + dst->d_space = src->c[Q_SPC].v << 9; + dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9; + dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9; + dst->d_spc_timer = src->c[Q_SPC].timer; + dst->d_spc_warns = src->c[Q_SPC].warns; + + dst->d_ino_count = src->c[Q_INO].v; + dst->d_ino_hardlimit = src->c[Q_INO].hardlimit; + dst->d_ino_softlimit = src->c[Q_INO].softlimit; + dst->d_ino_timer = src->c[Q_INO].timer; + dst->d_ino_warns = src->c[Q_INO].warns; +} + +static int bch2_get_quota(struct super_block *sb, struct kqid kqid, + struct qc_dqblk *qdq) +{ + struct bch_fs *c = sb->s_fs_info; + struct bch_memquota_type *q = &c->quotas[kqid.type]; + qid_t qid = from_kqid(&init_user_ns, kqid); + struct bch_memquota *mq; + + memset(qdq, 0, 
sizeof(*qdq)); + + mutex_lock(&q->lock); + mq = genradix_ptr(&q->table, qid); + if (mq) + __bch2_quota_get(qdq, mq); + mutex_unlock(&q->lock); + + return 0; +} + +static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid, + struct qc_dqblk *qdq) +{ + struct bch_fs *c = sb->s_fs_info; + struct bch_memquota_type *q = &c->quotas[kqid->type]; + qid_t qid = from_kqid(&init_user_ns, *kqid); + struct genradix_iter iter = genradix_iter_init(&q->table, qid); + struct bch_memquota *mq; + int ret = 0; + + mutex_lock(&q->lock); + + while ((mq = genradix_iter_peek(&iter, &q->table))) { + if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) { + __bch2_quota_get(qdq, mq); + *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos); + goto found; + } + + genradix_iter_advance(&iter, &q->table); + } + + ret = -ENOENT; +found: + mutex_unlock(&q->lock); + return ret; +} + +static int bch2_set_quota(struct super_block *sb, struct kqid qid, + struct qc_dqblk *qdq) +{ + struct bch_fs *c = sb->s_fs_info; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_quota new_quota; + int ret; + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + + bkey_quota_init(&new_quota.k_i); + new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); + + bch2_btree_iter_init(&iter, c, BTREE_ID_QUOTAS, new_quota.k.p, + BTREE_ITER_WITH_HOLES|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_with_holes(&iter); + + ret = btree_iter_err(k); + if (unlikely(ret)) + return ret; + + switch (k.k->type) { + case BCH_QUOTA: + new_quota.v = *bkey_s_c_to_quota(k).v; + break; + } + + if (qdq->d_fieldmask & QC_SPC_SOFT) + new_quota.v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit); + if (qdq->d_fieldmask & QC_SPC_HARD) + new_quota.v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit); + + if (qdq->d_fieldmask & QC_INO_SOFT) + new_quota.v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_spc_softlimit); + if (qdq->d_fieldmask & QC_INO_HARD) + new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit); + + ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, + BTREE_INSERT_ENTRY(&iter, &new_quota.k_i)); + bch2_btree_iter_unlock(&iter); + + if (ret) + return ret; + + ret = __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); + + return ret; +} + +const struct quotactl_ops bch2_quotactl_operations = { + .quota_enable = bch2_quota_enable, + .quota_disable = bch2_quota_disable, + .rm_xquota = bch2_quota_remove, + + .get_state = bch2_quota_get_state, + .set_info = bch2_quota_set_info, + + .get_dqblk = bch2_get_quota, + .get_nextdqblk = bch2_get_next_quota, + .set_dqblk = bch2_set_quota, +}; + +#endif /* CONFIG_BCACHEFS_QUOTA */ diff --git a/libbcachefs/quota.h b/libbcachefs/quota.h new file mode 100644 index 00000000..09d51a83 --- /dev/null +++ b/libbcachefs/quota.h @@ -0,0 +1,48 @@ +#ifndef _BCACHEFS_QUOTA_H +#define _BCACHEFS_QUOTA_H + +#include "quota_types.h" + +extern const struct bkey_ops bch2_bkey_quota_ops; + +enum quota_acct_mode { + BCH_QUOTA_PREALLOC, + BCH_QUOTA_WARN, + BCH_QUOTA_NOCHECK, +}; + +static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) +{ + return (struct bch_qid) { + .q[QTYP_USR] = u->bi_uid, + .q[QTYP_GRP] = u->bi_gid, + .q[QTYP_PRJ] = u->bi_project, + }; +} + +#ifdef CONFIG_BCACHEFS_QUOTA + +int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters, + s64, enum quota_acct_mode); + +int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid, + struct bch_qid, u64); + +void bch2_fs_quota_exit(struct bch_fs *); +void bch2_fs_quota_init(struct bch_fs *); +int 
bch2_fs_quota_read(struct bch_fs *); + +extern const struct quotactl_ops bch2_quotactl_operations; + +#else + +#define bch2_quota_acct(_c, _uid, _gid, _counter, _v) (0) +#define bch2_quota_transfer(_c, _type, _src, _dst, _v) (0) + +static inline void bch2_fs_quota_exit(struct bch_fs *c) {} +static inline void bch2_fs_quota_init(struct bch_fs *c) {} +static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; } + +#endif + +#endif /* _BCACHEFS_QUOTA_H */ diff --git a/libbcachefs/quota_types.h b/libbcachefs/quota_types.h new file mode 100644 index 00000000..bcaed4ea --- /dev/null +++ b/libbcachefs/quota_types.h @@ -0,0 +1,36 @@ +#ifndef _BCACHEFS_QUOTA_TYPES_H +#define _BCACHEFS_QUOTA_TYPES_H + +#include + +struct bch_qid { + u32 q[QTYP_NR]; +}; + +struct memquota_counter { + u64 v; + u64 hardlimit; + u64 softlimit; + s64 timer; + int warns; + int warning_issued; +}; + +struct bch_memquota { + struct memquota_counter c[Q_COUNTERS]; +}; + +typedef GENRADIX(struct bch_memquota) bch_memquota_table; + +struct quota_limit { + u32 timelimit; + u32 warnlimit; +}; + +struct bch_memquota_type { + struct quota_limit limits[Q_COUNTERS]; + bch_memquota_table table; + struct mutex lock; +}; + +#endif /* _BCACHEFS_QUOTA_TYPES_H */ diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 21720186..8dce7dc1 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -330,9 +330,6 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb))) return "Btree node size not a power of two"; - if (BCH_SB_BTREE_NODE_SIZE(sb) > BTREE_NODE_SIZE_MAX) - return "Btree node size too large"; - if (BCH_SB_GC_RESERVE(sb) < 5) return "gc reserve percentage too small"; @@ -383,27 +380,6 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) /* device open: */ -static const char *bch2_blkdev_open(const char *path, fmode_t mode, - void *holder, struct block_device **ret) -{ - struct block_device *bdev; - - *ret = NULL; - bdev = blkdev_get_by_path(path, mode, holder); - if (bdev == ERR_PTR(-EBUSY)) - return "device busy"; - - if (IS_ERR(bdev)) - return "failed to open device"; - - if (mode & FMODE_WRITE) - bdev_get_queue(bdev)->backing_dev_info->capabilities - |= BDI_CAP_STABLE_WRITES; - - *ret = bdev; - return NULL; -} - static void bch2_sb_update(struct bch_fs *c) { struct bch_sb *src = c->disk_sb; @@ -555,44 +531,55 @@ reread: return NULL; } -const char *bch2_read_super(const char *path, - struct bch_opts opts, - struct bch_sb_handle *ret) +int bch2_read_super(const char *path, struct bch_opts *opts, + struct bch_sb_handle *sb) { - u64 offset = opt_get(opts, sb); + u64 offset = opt_get(*opts, sb); struct bch_sb_layout layout; const char *err; - unsigned i; + __le64 *i; + int ret; - memset(ret, 0, sizeof(*ret)); - ret->mode = FMODE_READ; + memset(sb, 0, sizeof(*sb)); + sb->mode = FMODE_READ; - if (!opt_get(opts, noexcl)) - ret->mode |= FMODE_EXCL; + if (!opt_get(*opts, noexcl)) + sb->mode |= FMODE_EXCL; - if (!opt_get(opts, nochanges)) - ret->mode |= FMODE_WRITE; + if (!opt_get(*opts, nochanges)) + sb->mode |= FMODE_WRITE; - err = bch2_blkdev_open(path, ret->mode, ret, &ret->bdev); - if (err) - return err; + sb->bdev = blkdev_get_by_path(path, sb->mode, sb); + if (IS_ERR(sb->bdev) && + PTR_ERR(sb->bdev) == -EACCES && + opt_get(*opts, read_only)) { + sb->mode &= ~FMODE_WRITE; + + sb->bdev = blkdev_get_by_path(path, sb->mode, sb); + if (!IS_ERR(sb->bdev)) + opt_set(*opts, nochanges, true); + } + + if (IS_ERR(sb->bdev)) + return PTR_ERR(sb->bdev); 
err = "cannot allocate memory"; - if (__bch2_super_realloc(ret, 0)) + ret = __bch2_super_realloc(sb, 0); + if (ret) goto err; + ret = -EFAULT; err = "dynamic fault"; if (bch2_fs_init_fault("read_super")) goto err; - err = read_one_super(ret, offset); + ret = -EINVAL; + err = read_one_super(sb, offset); if (!err) goto got_super; - if (offset != BCH_SB_SECTOR) { - pr_err("error reading superblock: %s", err); + if (opt_defined(*opts, sb)) goto err; - } pr_err("error reading default superblock: %s", err); @@ -600,53 +587,57 @@ const char *bch2_read_super(const char *path, * Error reading primary superblock - read location of backup * superblocks: */ - bio_reset(ret->bio); - ret->bio->bi_bdev = ret->bdev; - ret->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; - ret->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout); - bio_set_op_attrs(ret->bio, REQ_OP_READ, REQ_SYNC|REQ_META); + bio_reset(sb->bio); + sb->bio->bi_bdev = sb->bdev; + sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; + sb->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout); + bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); /* * use sb buffer to read layout, since sb buffer is page aligned but * layout won't be: */ - bch2_bio_map(ret->bio, ret->sb); + bch2_bio_map(sb->bio, sb->sb); err = "IO error"; - if (submit_bio_wait(ret->bio)) + if (submit_bio_wait(sb->bio)) goto err; - memcpy(&layout, ret->sb, sizeof(layout)); + memcpy(&layout, sb->sb, sizeof(layout)); err = validate_sb_layout(&layout); if (err) goto err; - for (i = 0; i < layout.nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout.sb_offset[i]); + for (i = layout.sb_offset; + i < layout.sb_offset + layout.nr_superblocks; i++) { + offset = le64_to_cpu(*i); - if (offset == BCH_SB_SECTOR) + if (offset == opt_get(*opts, sb)) continue; - err = read_one_super(ret, offset); + err = read_one_super(sb, offset); if (!err) goto got_super; } - goto err; -got_super: - pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u", - le64_to_cpu(ret->sb->version), - le64_to_cpu(ret->sb->flags[0]), - le64_to_cpu(ret->sb->seq), - le32_to_cpu(ret->sb->u64s)); + ret = -EINVAL; + goto err; + +got_super: err = "Superblock block size smaller than device block size"; - if (le16_to_cpu(ret->sb->block_size) << 9 < - bdev_logical_block_size(ret->bdev)) + ret = -EINVAL; + if (le16_to_cpu(sb->sb->block_size) << 9 < + bdev_logical_block_size(sb->bdev)) goto err; - return NULL; + if (sb->mode & FMODE_WRITE) + bdev_get_queue(sb->bdev)->backing_dev_info->capabilities + |= BDI_CAP_STABLE_WRITES; + + return 0; err: - bch2_free_super(ret); - return err; + bch2_free_super(sb); + pr_err("error reading superblock: %s", err); + return ret; } /* write superblock: */ @@ -1108,13 +1099,20 @@ err: return ret; } -static inline int __bch2_check_mark_super(struct bch_fs *c, - struct bch_replicas_cpu_entry search, - unsigned max_dev) +int bch2_check_mark_super(struct bch_fs *c, + enum bch_data_type data_type, + struct bch_devs_list devs) { + struct bch_replicas_cpu_entry search; struct bch_replicas_cpu *r, *gc_r; + unsigned max_dev; bool marked; + if (!devs.nr) + return 0; + + devlist_to_replicas(devs, data_type, &search, &max_dev); + rcu_read_lock(); r = rcu_dereference(c->replicas); gc_r = rcu_dereference(c->replicas_gc); @@ -1126,32 +1124,6 @@ static inline int __bch2_check_mark_super(struct bch_fs *c, : bch2_check_mark_super_slowpath(c, search, max_dev); } -int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e, - enum bch_data_type data_type) -{ - struct bch_replicas_cpu_entry 
search; - unsigned max_dev; - - if (!bkey_to_replicas(e, data_type, &search, &max_dev)) - return 0; - - return __bch2_check_mark_super(c, search, max_dev); -} - -int bch2_check_mark_super_devlist(struct bch_fs *c, - struct bch_devs_list *devs, - enum bch_data_type data_type) -{ - struct bch_replicas_cpu_entry search; - unsigned max_dev; - - if (!devs->nr) - return 0; - - devlist_to_replicas(*devs, data_type, &search, &max_dev); - return __bch2_check_mark_super(c, search, max_dev); -} - int bch2_replicas_gc_end(struct bch_fs *c, int err) { struct bch_replicas_cpu *new_r, *old_r; @@ -1435,12 +1407,19 @@ int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t /* Query replicas: */ -static bool __bch2_sb_has_replicas(struct bch_fs *c, - struct bch_replicas_cpu_entry search, - unsigned max_dev) +bool bch2_sb_has_replicas(struct bch_fs *c, + enum bch_data_type data_type, + struct bch_devs_list devs) { + struct bch_replicas_cpu_entry search; + unsigned max_dev; bool ret; + if (!devs.nr) + return true; + + devlist_to_replicas(devs, data_type, &search, &max_dev); + rcu_read_lock(); ret = replicas_has_entry(rcu_dereference(c->replicas), search, max_dev); @@ -1449,31 +1428,6 @@ static bool __bch2_sb_has_replicas(struct bch_fs *c, return ret; } -bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e, - enum bch_data_type data_type) -{ - struct bch_replicas_cpu_entry search; - unsigned max_dev; - - if (!bkey_to_replicas(e, data_type, &search, &max_dev)) - return true; - - return __bch2_sb_has_replicas(c, search, max_dev); -} - -bool bch2_sb_has_replicas_devlist(struct bch_fs *c, struct bch_devs_list *devs, - enum bch_data_type data_type) -{ - struct bch_replicas_cpu_entry search; - unsigned max_dev; - - if (!devs->nr) - return true; - - devlist_to_replicas(*devs, data_type, &search, &max_dev); - return __bch2_sb_has_replicas(c, search, max_dev); -} - struct replicas_status __bch2_replicas_status(struct bch_fs *c, struct bch_devs_mask online_devs) { @@ -1579,12 +1533,23 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) goto out; for_each_cpu_replicas_entry(r, e) - if (replicas_test_dev(e, ca->dev_idx)) { + if (replicas_test_dev(e, ca->dev_idx)) ret |= 1 << e->data_type; - break; - } out: rcu_read_unlock(); return ret; } + +/* Quotas: */ + +static const char *bch2_sb_validate_quota(struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_quota *q = field_to_type(f, quota); + + if (vstruct_bytes(&q->field) != sizeof(*q)) + return "invalid field quota: wrong size"; + + return NULL; +} diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h index e0dd26e3..59a8b816 100644 --- a/libbcachefs/super-io.h +++ b/libbcachefs/super-io.h @@ -94,8 +94,7 @@ int bch2_super_realloc(struct bch_sb_handle *, unsigned); const char *bch2_sb_validate(struct bch_sb_handle *); -const char *bch2_read_super(const char *, struct bch_opts, - struct bch_sb_handle *); +int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); void bch2_write_super(struct bch_fs *); /* BCH_SB_FIELD_journal: */ @@ -139,14 +138,10 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) /* BCH_SB_FIELD_replicas: */ -bool bch2_sb_has_replicas(struct bch_fs *, struct bkey_s_c_extent, - enum bch_data_type); -bool bch2_sb_has_replicas_devlist(struct bch_fs *, struct bch_devs_list *, - enum bch_data_type); -int bch2_check_mark_super(struct bch_fs *, struct bkey_s_c_extent, - enum bch_data_type); -int bch2_check_mark_super_devlist(struct bch_fs *, 
struct bch_devs_list *, - enum bch_data_type); +bool bch2_sb_has_replicas(struct bch_fs *, enum bch_data_type, + struct bch_devs_list); +int bch2_check_mark_super(struct bch_fs *, enum bch_data_type, + struct bch_devs_list); int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *, char *, size_t); int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *, char *, size_t); diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 69290d27..29ffba65 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -29,6 +29,7 @@ #include "move.h" #include "migrate.h" #include "movinggc.h" +#include "quota.h" #include "super.h" #include "super-io.h" #include "sysfs.h" @@ -214,14 +215,15 @@ static void __bch2_fs_read_only(struct bch_fs *c) */ bch2_journal_flush_all_pins(&c->journal); - if (!bch2_journal_error(&c->journal)) - bch2_btree_verify_flushed(c); - for_each_member_device(ca, c, i) bch2_dev_allocator_stop(ca); bch2_fs_journal_stop(&c->journal); + if (!bch2_journal_error(&c->journal) && + !test_bit(BCH_FS_ERROR, &c->flags)) + bch2_btree_verify_flushed(c); + for_each_member_device(ca, c, i) bch2_dev_allocator_remove(c, ca); } @@ -366,6 +368,7 @@ err: static void bch2_fs_free(struct bch_fs *c) { + bch2_fs_quota_exit(c); bch2_fs_fsio_exit(c); bch2_fs_encryption_exit(c); bch2_fs_btree_cache_exit(c); @@ -380,7 +383,7 @@ static void bch2_fs_free(struct bch_fs *c) bioset_exit(&c->bio_write); bioset_exit(&c->bio_read_split); bioset_exit(&c->bio_read); - bioset_exit(&c->btree_read_bio); + bioset_exit(&c->btree_bio); mempool_exit(&c->btree_interior_update_pool); mempool_exit(&c->btree_reserve_pool); mempool_exit(&c->fill_iter); @@ -492,6 +495,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_allocator_init(c); bch2_fs_tiering_init(c); + bch2_fs_quota_init(c); INIT_LIST_HEAD(&c->list); @@ -561,8 +565,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, sizeof(struct btree_update)) || mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || - bioset_init(&c->btree_read_bio, 1, - offsetof(struct btree_read_bio, bio), + bioset_init(&c->btree_bio, 1, + max(offsetof(struct btree_read_bio, bio), + offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), BIOSET_NEED_BVECS) || @@ -671,13 +676,10 @@ static const char *__bch2_fs_start(struct bch_fs *c) struct bch_dev *ca; LIST_HEAD(journal); struct jset *j; - struct closure cl; time64_t now; unsigned i; int ret = -EINVAL; - closure_init_stack(&cl); - mutex_lock(&c->state_lock); BUG_ON(c->state != BCH_FS_STARTING); @@ -705,14 +707,14 @@ static const char *__bch2_fs_start(struct bch_fs *c) unsigned level; struct bkey_i *k; - err = "missing btree root"; k = bch2_journal_find_btree_root(c, j, i, &level); - if (!k && i < BTREE_ID_ALLOC) - goto err; - if (!k) continue; + err = "invalid btree root pointer"; + if (IS_ERR(k)) + goto err; + err = "error reading btree root"; if (bch2_btree_root_read(c, i, k, level)) { if (i != BTREE_ID_ALLOC) @@ -722,6 +724,10 @@ static const char *__bch2_fs_start(struct bch_fs *c) } } + for (i = 0; i < BTREE_ID_NR; i++) + if (!c->btree_roots[i].b) + bch2_btree_root_alloc(c, i); + err = "error reading allocation information"; ret = bch2_alloc_read(c, &journal); if (ret) @@ -739,14 +745,6 @@ static const char *__bch2_fs_start(struct bch_fs *c) if (c->opts.noreplay) goto recovery_done; - err = "cannot allocate new btree root"; - for 
(i = 0; i < BTREE_ID_NR; i++) - if (!c->btree_roots[i].b && - bch2_btree_root_alloc(c, i, &cl)) - goto err; - - closure_sync(&cl); - /* * bch2_journal_start() can't happen sooner, or btree_gc_finish() * will give spurious errors about oldest_gen > bucket_gen - @@ -754,12 +752,9 @@ static const char *__bch2_fs_start(struct bch_fs *c) */ bch2_journal_start(c); - err = "error starting allocator thread"; - for_each_rw_member(ca, c, i) - if (bch2_dev_allocator_start(ca)) { - percpu_ref_put(&ca->io_ref); - goto err; - } + err = "error starting allocator"; + if (bch2_fs_allocator_start(c)) + goto err; bch_verbose(c, "starting journal replay:"); err = "journal replay failed"; @@ -777,6 +772,14 @@ static const char *__bch2_fs_start(struct bch_fs *c) if (ret) goto err; bch_verbose(c, "fsck done"); + + if (c->opts.usrquota || c->opts.grpquota) { + bch_verbose(c, "reading quotas:"); + ret = bch2_fs_quota_read(c); + if (ret) + goto err; + bch_verbose(c, "quotas done"); + } } else { struct bch_inode_unpacked inode; struct bkey_inode_buf packed_inode; @@ -784,6 +787,7 @@ static const char *__bch2_fs_start(struct bch_fs *c) bch_notice(c, "initializing new filesystem"); set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + set_bit(BCH_FS_BRAND_NEW_FS, &c->flags); ret = bch2_initial_gc(c, &journal); if (ret) @@ -791,15 +795,15 @@ static const char *__bch2_fs_start(struct bch_fs *c) err = "unable to allocate journal buckets"; for_each_rw_member(ca, c, i) - if (bch2_dev_journal_alloc(ca)) { + if (bch2_dev_journal_alloc(c, ca)) { percpu_ref_put(&ca->io_ref); goto err; } - err = "cannot allocate new btree root"; + clear_bit(BCH_FS_BRAND_NEW_FS, &c->flags); + for (i = 0; i < BTREE_ID_NR; i++) - if (bch2_btree_root_alloc(c, i, &cl)) - goto err; + bch2_btree_root_alloc(c, i); /* * journal_res_get() will crash if called before this has @@ -808,15 +812,9 @@ static const char *__bch2_fs_start(struct bch_fs *c) bch2_journal_start(c); bch2_journal_set_replay_done(&c->journal); - err = "error starting allocator thread"; - for_each_rw_member(ca, c, i) - if (bch2_dev_allocator_start(ca)) { - percpu_ref_put(&ca->io_ref); - goto err; - } - - /* Wait for new btree roots to be written: */ - closure_sync(&cl); + err = "error starting allocator"; + if (bch2_fs_allocator_start(c)) + goto err; bch2_inode_init(c, &inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); @@ -830,6 +828,12 @@ static const char *__bch2_fs_start(struct bch_fs *c) NULL, NULL, NULL, 0)) goto err; + if (c->opts.usrquota || c->opts.grpquota) { + ret = bch2_fs_quota_read(c); + if (ret) + goto err; + } + err = "error writing first journal entry"; if (bch2_journal_meta(&c->journal)) goto err; @@ -867,8 +871,6 @@ out: return err; err: fsck_err: - closure_sync(&cl); - switch (ret) { case BCH_FSCK_ERRORS_NOT_FIXED: bch_err(c, "filesystem contains errors: please report this to the developers"); @@ -1107,6 +1109,8 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb) struct bch_dev *ca; int ret; + lockdep_assert_held(&c->state_lock); + if (le64_to_cpu(sb->sb->seq) > le64_to_cpu(c->disk_sb->seq)) bch2_sb_to_fs(c, sb->sb); @@ -1153,7 +1157,9 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb) bdevname(ca->disk_sb.bdev, c->name); bdevname(ca->disk_sb.bdev, ca->name); + mutex_lock(&c->sb_lock); bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); + mutex_unlock(&c->sb_lock); if (ca->mi.state == BCH_MEMBER_STATE_RW) bch2_dev_allocator_add(c, ca); @@ -1430,17 +1436,18 @@ err: /* Add new device to running 
filesystem: */ int bch2_dev_add(struct bch_fs *c, const char *path) { + struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb; const char *err; struct bch_dev *ca = NULL; struct bch_sb_field_members *mi, *dev_mi; struct bch_member saved_mi; unsigned dev_idx, nr_devices, u64s; - int ret = -EINVAL; + int ret; - err = bch2_read_super(path, bch2_opts_empty(), &sb); - if (err) - return -EINVAL; + ret = bch2_read_super(path, &opts, &sb); + if (ret) + return ret; err = bch2_sb_validate(&sb); if (err) @@ -1479,14 +1486,14 @@ have_slot: sizeof(struct bch_member) * nr_devices) / sizeof(u64); err = "no space in superblock for member info"; - mi = bch2_fs_sb_resize_members(c, u64s); - if (!mi) - goto err_unlock; - dev_mi = bch2_sb_resize_members(&sb, u64s); if (!dev_mi) goto err_unlock; + mi = bch2_fs_sb_resize_members(c, u64s); + if (!mi) + goto err_unlock; + memcpy(dev_mi, mi, u64s * sizeof(u64)); dev_mi->members[dev_idx] = saved_mi; @@ -1499,30 +1506,30 @@ have_slot: c->disk_sb->nr_devices = nr_devices; c->sb.nr_devices = nr_devices; + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + if (bch2_dev_alloc(c, dev_idx)) { err = "cannot allocate memory"; ret = -ENOMEM; - goto err_unlock; + goto err; } if (__bch2_dev_online(c, &sb)) { err = "bch2_dev_online() error"; ret = -ENOMEM; - goto err_unlock; + goto err; } - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - ca = bch_dev_locked(c, dev_idx); if (ca->mi.state == BCH_MEMBER_STATE_RW) { - err = "journal alloc failed"; - if (bch2_dev_journal_alloc(ca)) - goto err; - err = __bch2_dev_read_write(c, ca); if (err) goto err; + + err = "journal alloc failed"; + if (bch2_dev_journal_alloc(c, ca)) + goto err; } mutex_unlock(&c->state_lock); @@ -1540,16 +1547,20 @@ err: /* Hot add existing device to running filesystem: */ int bch2_dev_online(struct bch_fs *c, const char *path) { + struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb = { NULL }; struct bch_dev *ca; unsigned dev_idx; const char *err; + int ret; mutex_lock(&c->state_lock); - err = bch2_read_super(path, bch2_opts_empty(), &sb); - if (err) - goto err; + ret = bch2_read_super(path, &opts, &sb); + if (ret) { + mutex_unlock(&c->state_lock); + return ret; + } dev_idx = sb.sb->dev_idx; @@ -1557,13 +1568,10 @@ int bch2_dev_online(struct bch_fs *c, const char *path) if (err) goto err; - mutex_lock(&c->sb_lock); if (__bch2_dev_online(c, &sb)) { err = "__bch2_dev_online() error"; - mutex_unlock(&c->sb_lock); goto err; } - mutex_unlock(&c->sb_lock); ca = bch_dev_locked(c, dev_idx); if (ca->mi.state == BCH_MEMBER_STATE_RW) { @@ -1585,6 +1593,12 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) { mutex_lock(&c->state_lock); + if (!bch2_dev_is_online(ca)) { + bch_err(ca, "Already offline"); + mutex_unlock(&c->state_lock); + return 0; + } + if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { bch_err(ca, "Cannot offline required disk"); mutex_unlock(&c->state_lock); @@ -1617,9 +1631,19 @@ int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca) goto err; } + ret = bch2_journal_flush_device(&c->journal, ca->dev_idx); + if (ret) { + bch_err(ca, "Migrate failed: error %i flushing journal", ret); + goto err; + } + data = bch2_dev_has_data(c, ca); if (data) { - bch_err(ca, "Migrate error: data still present (%x)", data); + char buf[100]; + + bch2_scnprint_flag_list(buf, sizeof(buf), + bch2_data_types, data); + bch_err(ca, "Migrate failed, still has data (%s)", buf); ret = -EINVAL; goto err; } @@ -1670,33 +1694,33 @@ err: /* Filesystem open: */ 
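A caller-side illustration of the interface change in the hunk that follows: bch2_fs_open() now returns a struct bch_fs pointer, or an ERR_PTR-encoded errno, instead of filling an out-parameter and returning an error string. This is a minimal sketch under that assumption, not part of the patch; the helper name and the error message are illustrative only.

	/*
	 * Hypothetical caller of the post-patch bch2_fs_open(): errors now come
	 * back as a negative errno wrapped in ERR_PTR(), not as a string.
	 */
	static struct bch_fs *open_fs_sketch(char * const *devices, unsigned nr_devices)
	{
		struct bch_opts opts = bch2_opts_empty();
		struct bch_fs *c = bch2_fs_open(devices, nr_devices, opts);

		if (IS_ERR(c)) {
			pr_err("error opening filesystem: %ld", PTR_ERR(c));
			return NULL;
		}

		/* on success the caller owns the filesystem, presumably released later via bch2_fs_stop() */
		return c;
	}
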
-const char *bch2_fs_open(char * const *devices, unsigned nr_devices, - struct bch_opts opts, struct bch_fs **ret) +struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, + struct bch_opts opts) { - const char *err; + struct bch_sb_handle *sb = NULL; struct bch_fs *c = NULL; - struct bch_sb_handle *sb; unsigned i, best_sb = 0; + const char *err; + int ret = -ENOMEM; if (!nr_devices) - return "need at least one device"; + return ERR_PTR(-EINVAL); if (!try_module_get(THIS_MODULE)) - return "module unloading"; + return ERR_PTR(-ENODEV); - err = "cannot allocate memory"; sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); if (!sb) goto err; for (i = 0; i < nr_devices; i++) { - err = bch2_read_super(devices[i], opts, &sb[i]); - if (err) + ret = bch2_read_super(devices[i], &opts, &sb[i]); + if (ret) goto err; err = bch2_sb_validate(&sb[i]); if (err) - goto err; + goto err_print; } for (i = 1; i < nr_devices; i++) @@ -1707,56 +1731,53 @@ const char *bch2_fs_open(char * const *devices, unsigned nr_devices, for (i = 0; i < nr_devices; i++) { err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); if (err) - goto err; + goto err_print; } - err = "cannot allocate memory"; + ret = -ENOMEM; c = bch2_fs_alloc(sb[best_sb].sb, opts); if (!c) goto err; err = "bch2_dev_online() error"; - mutex_lock(&c->sb_lock); + mutex_lock(&c->state_lock); for (i = 0; i < nr_devices; i++) if (__bch2_dev_online(c, &sb[i])) { - mutex_unlock(&c->sb_lock); - goto err; + mutex_unlock(&c->state_lock); + goto err_print; } - mutex_unlock(&c->sb_lock); + mutex_unlock(&c->state_lock); err = "insufficient devices"; if (!bch2_fs_may_start(c)) - goto err; + goto err_print; if (!c->opts.nostart) { err = __bch2_fs_start(c); if (err) - goto err; + goto err_print; } err = bch2_fs_online(c); if (err) - goto err; + goto err_print; - if (ret) - *ret = c; - else - closure_put(&c->cl); - - err = NULL; -out: kfree(sb); module_put(THIS_MODULE); - if (err) - c = NULL; - return err; + return c; +err_print: + pr_err("bch_fs_open err opening %s: %s", + devices[0], err); + ret = -EINVAL; err: if (c) bch2_fs_stop(c); for (i = 0; i < nr_devices; i++) bch2_free_super(&sb[i]); - goto out; + kfree(sb); + module_put(THIS_MODULE); + return ERR_PTR(ret); } static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, @@ -1827,9 +1848,8 @@ const char *bch2_fs_open_incremental(const char *path) struct bch_opts opts = bch2_opts_empty(); const char *err; - err = bch2_read_super(path, opts, &sb); - if (err) - return err; + if (bch2_read_super(path, &opts, &sb)) + return "error reading superblock"; err = __bch2_fs_open_incremental(&sb, opts); bch2_free_super(&sb); diff --git a/libbcachefs/super.h b/libbcachefs/super.h index 6f628830..a35ee3db 100644 --- a/libbcachefs/super.h +++ b/libbcachefs/super.h @@ -198,8 +198,7 @@ const char *bch2_fs_read_write(struct bch_fs *); void bch2_fs_stop(struct bch_fs *); const char *bch2_fs_start(struct bch_fs *); -const char *bch2_fs_open(char * const *, unsigned, struct bch_opts, - struct bch_fs **); +struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); const char *bch2_fs_open_incremental(const char *path); #endif /* _BCACHEFS_SUPER_H */ diff --git a/libbcachefs/tier.c b/libbcachefs/tier.c index f5007864..6a581097 100644 --- a/libbcachefs/tier.c +++ b/libbcachefs/tier.c @@ -39,7 +39,8 @@ static int bch2_tiering_thread(void *arg) struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]); struct io_clock *clock = &c->io_clock[WRITE]; struct bch_dev *ca; - u64 tier_capacity, 
available_sectors, keys_moved, sectors_moved; + struct bch_move_stats move_stats; + u64 tier_capacity, available_sectors; unsigned long last; unsigned i, nr_devices; @@ -91,8 +92,7 @@ static int bch2_tiering_thread(void *arg) 0, -1, tiering_pred, tier, - &keys_moved, - §ors_moved); + &move_stats); } return 0; diff --git a/linux/kthread.c b/linux/kthread.c index 0f4b5715..80a9ac9a 100644 --- a/linux/kthread.c +++ b/linux/kthread.c @@ -64,6 +64,7 @@ struct task_struct *kthread_create(int (*thread_fn)(void *data), vsnprintf(p->comm, sizeof(p->comm), namefmt, args); va_end(args); + p->flags |= PF_KTHREAD; p->thread_fn = thread_fn; p->thread_data = thread_data; p->state = TASK_UNINTERRUPTIBLE; @@ -73,6 +74,7 @@ struct task_struct *kthread_create(int (*thread_fn)(void *data), init_completion(&p->exited); pthread_create(&p->thread, NULL, kthread_start_fn, p); + pthread_setname_np(p->thread, p->comm); return p; }
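
One thread running through the migrate.c, move.c, movinggc.c and tier.c hunks above is that bch2_move_data() no longer reports progress through u64 out-parameters: callers pass a struct bch_move_stats and read its atomic64_t counters after the call returns (bch2_move_data() zeroes the stats itself on entry). A minimal reporting sketch under that assumption; the helper name and message format are illustrative and not taken from the patch.

	/*
	 * Illustrative read-out of the new struct bch_move_stats counters after a
	 * bch2_move_data() / copygc / tiering pass has completed.
	 */
	static void report_move_stats(struct bch_move_stats *stats)
	{
		pr_info("moved %llu keys / %llu sectors (%llu sectors seen, %llu raced)",
			(u64) atomic64_read(&stats->keys_moved),
			(u64) atomic64_read(&stats->sectors_moved),
			(u64) atomic64_read(&stats->sectors_seen),
			(u64) atomic64_read(&stats->sectors_raced));
	}

bch2_dev_usrdata_migrate() above uses the same counters to decide whether another pass is needed: it keeps calling bch2_move_data() while keys_moved is nonzero, up to MAX_DATA_OFF_ITER iterations.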