diff --git a/.bcache_revision b/.bcache_revision index 58bdf2da..b86381a1 100644 --- a/.bcache_revision +++ b/.bcache_revision @@ -1 +1 @@ -BCACHE_REVISION=aa4471ac314a1f117957f9fc59c1bfbdf965a28c +BCACHE_REVISION=c1f1a9e1d9b9664db9c9c03cbac455c2750335bc diff --git a/Makefile b/Makefile index 2defed04..682bf8e7 100644 --- a/Makefile +++ b/Makefile @@ -56,6 +56,7 @@ OBJS=bcache.o \ cmd_fsck.o \ cmd_format.o \ cmd_key.o \ + cmd_migrate.o \ cmd_run.o \ crypto.o \ libbcache.o \ diff --git a/bcache.c b/bcache.c index ac9eb07e..a0fa860f 100644 --- a/bcache.c +++ b/bcache.c @@ -50,7 +50,12 @@ static void usage(void) "\n" "Debug:\n" " bcache dump Dump filesystem metadata to a qcow2 image\n" - " bcache list List filesystem metadata in textual form\n"); + " bcache list List filesystem metadata in textual form\n" + "\n" + "Migrate:\n" + " bcache migrate Migrate an existing filesystem to bcachefs, in place\n" + " bcache migrate_superblock\n" + " Add default superblock, after bcache migrate\n"); } int main(int argc, char *argv[]) @@ -104,6 +109,11 @@ int main(int argc, char *argv[]) if (!strcmp(cmd, "list")) return cmd_list(argc, argv); + if (!strcmp(cmd, "migrate")) + return cmd_migrate(argc, argv); + if (!strcmp(cmd, "migrate_superblock")) + return cmd_migrate_superblock(argc, argv); + usage(); return 0; } diff --git a/cmd_debug.c b/cmd_debug.c index 4f2586d4..ca0f4530 100644 --- a/cmd_debug.c +++ b/cmd_debug.c @@ -30,35 +30,35 @@ static void dump_usage(void) static void dump_one_device(struct cache_set *c, struct cache *ca, int fd) { struct bch_sb *sb = ca->disk_sb.sb; - sparse_data data; + ranges data; unsigned i; darray_init(data); /* Superblock: */ - data_add(&data, BCH_SB_LAYOUT_SECTOR << 9, - sizeof(struct bch_sb_layout)); + range_add(&data, BCH_SB_LAYOUT_SECTOR << 9, + sizeof(struct bch_sb_layout)); for (i = 0; i < sb->layout.nr_superblocks; i++) - data_add(&data, - le64_to_cpu(sb->layout.sb_offset[i]) << 9, - vstruct_bytes(sb)); + range_add(&data, + le64_to_cpu(sb->layout.sb_offset[i]) << 9, + vstruct_bytes(sb)); /* Journal: */ for (i = 0; i < ca->journal.nr; i++) if (ca->journal.bucket_seq[i] >= c->journal.last_seq_ondisk) { u64 bucket = ca->journal.buckets[i]; - data_add(&data, - bucket_bytes(ca) * bucket, - bucket_bytes(ca)); + range_add(&data, + bucket_bytes(ca) * bucket, + bucket_bytes(ca)); } /* Prios/gens: */ for (i = 0; i < prio_buckets(ca); i++) - data_add(&data, - bucket_bytes(ca) * ca->prio_last_buckets[i], - bucket_bytes(ca)); + range_add(&data, + bucket_bytes(ca) * ca->prio_last_buckets[i], + bucket_bytes(ca)); /* Btree: */ for (i = 0; i < BTREE_ID_NR; i++) { @@ -71,9 +71,9 @@ static void dump_one_device(struct cache_set *c, struct cache *ca, int fd) extent_for_each_ptr(e, ptr) if (ptr->dev == ca->dev_idx) - data_add(&data, - ptr->offset << 9, - b->written << 9); + range_add(&data, + ptr->offset << 9, + b->written << 9); } bch_btree_iter_unlock(&iter); } @@ -87,7 +87,7 @@ int cmd_dump(int argc, char *argv[]) struct bch_opts opts = bch_opts_empty(); struct cache_set *c = NULL; const char *err; - char *out = NULL, *buf; + char *out = NULL; unsigned i, nr_devices = 0; bool force = false; int fd, opt; @@ -116,9 +116,6 @@ int cmd_dump(int argc, char *argv[]) if (!out) die("Please supply output filename"); - buf = alloca(strlen(out) + 10); - strcpy(buf, out); - err = bch_fs_open(argv + optind, argc - optind, opts, &c); if (err) die("error opening %s: %s", argv[optind], err); @@ -140,12 +137,11 @@ int cmd_dump(int argc, char *argv[]) if (!c->cache[i]) continue; - if (nr_devices > 1) - 
sprintf(buf, "%s.%u", out, i); - - fd = open(buf, mode, 0600); - if (fd < 0) - die("error opening %s: %s", buf, strerror(errno)); + char *path = nr_devices > 1 + ? mprintf("%s.%u", out, i) + : strdup(out); + fd = xopen(path, mode, 0600); + free(path); dump_one_device(c, c->cache[i], fd); close(fd); @@ -153,7 +149,7 @@ int cmd_dump(int argc, char *argv[]) up_read(&c->gc_lock); - bch_fs_stop_sync(c); + bch_fs_stop(c); return 0; } @@ -213,14 +209,20 @@ static void list_keys_usage(void) "Usage: bcache list_keys [OPTION]... <devices>\n" "\n" "Options:\n" - " -b btree_id Integer btree id to list\n" - " -s start Start pos (as inode:offset)\n" - " -e end End pos\n" - " -m mode Mode for listing\n" - " -h Display this help and exit\n" + " -b (extents|inodes|dirents|xattrs) Btree to list from\n" + " -s inode:offset Start position to list from\n" + " -e inode:offset End position\n" + " -m (keys|formats) List mode\n" + " -h Display this help and exit\n" "Report bugs to <linux-bcache@vger.kernel.org>"); } +static const char * const list_modes[] = { + "keys", + "formats", + NULL +}; + int cmd_list(int argc, char *argv[]) { struct bch_opts opts = bch_opts_empty(); @@ -229,7 +231,6 @@ int cmd_list(int argc, char *argv[]) struct bpos start = POS_MIN, end = POS_MAX; const char *err; int mode = 0, opt; - u64 v; opts.nochanges = true; opts.norecovery = true; @@ -239,10 +240,8 @@ int cmd_list(int argc, char *argv[]) while ((opt = getopt(argc, argv, "b:s:e:m:h")) != -1) switch (opt) { case 'b': - if (kstrtoull(optarg, 10, &v) || - v >= BTREE_ID_NR) - die("invalid btree id"); - btree_id = v; + btree_id = read_string_list_or_die(optarg, + bch_btree_ids, "btree id"); break; case 's': start = parse_pos(optarg); @@ -251,6 +250,8 @@ int cmd_list(int argc, char *argv[]) end = parse_pos(optarg); break; case 'm': + mode = read_string_list_or_die(optarg, + list_modes, "list mode"); break; case 'h': list_keys_usage(); @@ -275,6 +276,6 @@ int cmd_list(int argc, char *argv[]) die("Invalid mode"); } - bch_fs_stop_sync(c); + bch_fs_stop(c); return 0; } diff --git a/cmd_device.c b/cmd_device.c index 1c5208af..505fedc4 100644 --- a/cmd_device.c +++ b/cmd_device.c @@ -121,10 +121,7 @@ int cmd_device_show(int argc, char *argv[]) char *dev_name = basename(dirname(link)); - int fd = openat(dirfd(fs.sysfs), entry->d_name, O_RDONLY); - if (fd < 0) - die("couldn't open device %s: %s\n", - entry->d_name, strerror(errno)); + int fd = xopenat(dirfd(fs.sysfs), entry->d_name, O_RDONLY); devices[nr_devices] = fill_dev(strdup(dev_name), nr, fd); tiers[devices[nr_devices].tier]++; diff --git a/cmd_format.c b/cmd_format.c index 2b1453ee..f222a8b7 100644 --- a/cmd_format.c +++ b/cmd_format.c @@ -34,10 +34,8 @@ static int open_for_format(const char *dev, bool force) blkid_probe pr; const char *fs_type = NULL, *fs_label = NULL; size_t fs_type_len, fs_label_len; - int fd; - if ((fd = open(dev, O_RDWR|O_EXCL)) == -1) - die("Can't open dev %s: %s\n", dev, strerror(errno)); + int fd = xopen(dev, O_RDWR|O_EXCL); if (force) return fd; @@ -70,8 +68,41 @@ static int open_for_format(const char *dev, bool force) return fd; } +#define OPTS \ +t("bcache format - create a new bcache filesystem on one or more devices") \ +t("Usage: bcache format [OPTION]... 
<devices>") \ +t("") \ +x('b', block_size, "size", NULL) \ +x(0, btree_node_size, "size", "Default 256k") \ +x(0, metadata_checksum_type, "(none|crc32c|crc64)", NULL) \ +x(0, data_checksum_type, "(none|crc32c|crc64)", NULL) \ +x(0, compression_type, "(none|lz4|gzip)", NULL) \ +x(0, encrypted, NULL, "Enable whole filesystem encryption (chacha20/poly1305)")\ +x(0, no_passphrase, NULL, "Don't encrypt master encryption key")\ +x('e', error_action, "(continue|readonly|panic)", NULL) \ +x(0, max_journal_entry_size, "size", NULL) \ +x('L', label, "label", NULL) \ +x('U', uuid, "uuid", NULL) \ +x('f', force, NULL, NULL) \ +t("") \ +t("Device specific options:") \ +x(0, fs_size, "size", "Size of filesystem on device")\ +x(0, bucket_size, "size", "Bucket size") \ +x('t', tier, "#", "Higher tier indicates slower devices")\ +x(0, discard, NULL, NULL) \ +t("Device specific options must come before corresponding devices, e.g.") \ +t(" bcache format --tier 0 /dev/sdb --tier 1 /dev/sdc") \ +t("") \ +x('h', help, NULL, "display this help and exit") + static void usage(void) { +#define t(text) puts(text "\n") +#define x(shortopt, longopt, arg, help) do { \ + OPTS +#undef x +#undef t + puts("bcache format - create a new bcache filesystem on one or more devices\n" "Usage: bcache format [OPTION]... <devices>\n" "\n" @@ -81,7 +112,8 @@ static void usage(void) " --metadata_checksum_type=(none|crc32c|crc64)\n" " --data_checksum_type=(none|crc32c|crc64)\n" " --compression_type=(none|lz4|gzip)\n" - " --encrypted\n" + " --encrypted Enable whole filesystem encryption (chacha20/poly1305)\n" + " --no_passphrase Don't encrypt master encryption key\n" " --error_action=(continue|readonly|panic)\n" " Action to take on filesystem error\n" " --max_journal_entry_size=size\n" @@ -103,37 +135,26 @@ static void usage(void) "Report bugs to <linux-bcache@vger.kernel.org>"); } -#define OPTS \ - OPT('b', block_size, required_argument) \ - OPT(0, btree_node_size, required_argument) \ - OPT(0, metadata_checksum_type, required_argument) \ - OPT(0, data_checksum_type, required_argument) \ - OPT(0, compression_type, required_argument) \ - OPT(0, encrypted, no_argument) \ - OPT('e', error_action, required_argument) \ - OPT(0, max_journal_entry_size, required_argument) \ - OPT('L', label, required_argument) \ - OPT('U', uuid, required_argument) \ - OPT('f', force, no_argument) \ - OPT(0, fs_size, required_argument) \ - OPT(0, bucket_size, required_argument) \ - OPT('t', tier, required_argument) \ - OPT(0, discard, no_argument) \ - OPT('h', help, no_argument) - enum { Opt_no_opt = 1, -#define OPT(shortopt, longopt, has_arg) Opt_##longopt, +#define t(text) +#define x(shortopt, longopt, arg, help) Opt_##longopt, OPTS -#undef OPT +#undef x +#undef t }; static const struct option format_opts[] = { -#define OPT(shortopt, longopt, has_arg) { \ - #longopt, has_arg, NULL, Opt_##longopt \ - }, +#define t(text) +#define x(shortopt, longopt, arg, help) { \ + .name = #longopt, \ + .has_arg = arg ? 
required_argument : no_argument, \ + .flag = NULL, \ + .val = Opt_##longopt, \ +}, OPTS -#undef OPT +#undef x +#undef t { NULL } }; @@ -161,29 +182,12 @@ static unsigned hatoi_validate(const char *s, const char *msg) int cmd_format(int argc, char *argv[]) { darray(struct dev_opts) devices; - struct dev_opts *dev; - unsigned block_size = 0; - unsigned btree_node_size = 0; - unsigned meta_csum_type = BCH_CSUM_CRC32C; - unsigned data_csum_type = BCH_CSUM_CRC32C; - unsigned compression_type = BCH_COMPRESSION_NONE; - bool encrypted = false; - unsigned on_error_action = BCH_ON_ERROR_RO; - char *label = NULL; - uuid_le uuid; - bool force = false; - - /* Device specific options: */ - u64 filesystem_size = 0; - unsigned bucket_size = 0; - unsigned tier = 0; - bool discard = false; - unsigned max_journal_entry_size = 0; - char *passphrase = NULL; + struct format_opts opts = format_opts_default(); + struct dev_opts dev_opts = { 0 }, *dev; + bool force = false, no_passphrase = false; int opt; darray_init(devices); - uuid_clear(uuid.b); while ((opt = getopt_long(argc, argv, "-b:e:L:U:ft:h", @@ -192,45 +196,52 @@ int cmd_format(int argc, char *argv[]) switch (opt) { case Opt_block_size: case 'b': - block_size = hatoi_validate(optarg, - "block size"); + opts.block_size = + hatoi_validate(optarg, "block size"); break; case Opt_btree_node_size: - btree_node_size = hatoi_validate(optarg, - "btree node size"); + opts.btree_node_size = + hatoi_validate(optarg, "btree node size"); break; case Opt_metadata_checksum_type: - meta_csum_type = read_string_list_or_die(optarg, + opts.meta_csum_type = + read_string_list_or_die(optarg, bch_csum_types, "checksum type"); break; case Opt_data_checksum_type: - data_csum_type = read_string_list_or_die(optarg, + opts.data_csum_type = + read_string_list_or_die(optarg, bch_csum_types, "checksum type"); break; case Opt_compression_type: - compression_type = read_string_list_or_die(optarg, + opts.compression_type = + read_string_list_or_die(optarg, bch_compression_types, "compression type"); break; case Opt_encrypted: - encrypted = true; + opts.encrypted = true; + break; + case Opt_no_passphrase: + no_passphrase = true; break; case Opt_error_action: case 'e': - on_error_action = read_string_list_or_die(optarg, + opts.on_error_action = + read_string_list_or_die(optarg, bch_error_actions, "error action"); break; case Opt_max_journal_entry_size: - max_journal_entry_size = hatoi_validate(optarg, - "journal entry size"); + opts.max_journal_entry_size = + hatoi_validate(optarg, "journal entry size"); break; case Opt_label: case 'L': - label = strdup(optarg); + opts.label = strdup(optarg); break; case Opt_uuid: case 'U': - if (uuid_parse(optarg, uuid.b)) + if (uuid_parse(optarg, opts.uuid.b)) die("Bad uuid"); break; case Opt_force: @@ -238,31 +249,28 @@ int cmd_format(int argc, char *argv[]) force = true; break; case Opt_fs_size: - if (bch_strtoull_h(optarg, &filesystem_size)) + if (bch_strtoull_h(optarg, &dev_opts.size)) die("invalid filesystem size"); - filesystem_size >>= 9; + dev_opts.size >>= 9; break; case Opt_bucket_size: - bucket_size = hatoi_validate(optarg, "bucket size"); + dev_opts.bucket_size = + hatoi_validate(optarg, "bucket size"); break; case Opt_tier: case 't': - if (kstrtouint(optarg, 10, &tier) || - tier >= BCH_TIER_MAX) + if (kstrtouint(optarg, 10, &dev_opts.tier) || + dev_opts.tier >= BCH_TIER_MAX) die("invalid tier"); break; case Opt_discard: - discard = true; + dev_opts.discard = true; break; case Opt_no_opt: - darray_append(devices, (struct dev_opts) { - .path 
= strdup(optarg), - .size = filesystem_size, - .bucket_size = bucket_size, - .tier = tier, - .discard = discard, - }); + dev_opts.path = strdup(optarg); + darray_append(devices, dev_opts); + dev_opts.size = 0; break; case Opt_help: case 'h': @@ -274,18 +282,16 @@ int cmd_format(int argc, char *argv[]) if (!darray_size(devices)) die("Please supply a device"); - if (uuid_is_null(uuid.b)) - uuid_generate(uuid.b); - - if (encrypted) { - passphrase = read_passphrase("Enter passphrase: "); + if (opts.encrypted && !no_passphrase) { + opts.passphrase = read_passphrase("Enter passphrase: "); if (isatty(STDIN_FILENO)) { char *pass2 = read_passphrase("Enter same passphrase again: "); - if (strcmp(passphrase, pass2)) { - memzero_explicit(passphrase, strlen(passphrase)); + if (strcmp(opts.passphrase, pass2)) { + memzero_explicit(opts.passphrase, + strlen(opts.passphrase)); memzero_explicit(pass2, strlen(pass2)); die("Passphrases do not match"); } @@ -298,23 +304,14 @@ int cmd_format(int argc, char *argv[]) darray_foreach(dev, devices) dev->fd = open_for_format(dev->path, force); - bcache_format(devices.item, darray_size(devices), - block_size, - btree_node_size, - meta_csum_type, - data_csum_type, - compression_type, - passphrase, - 1, - 1, - on_error_action, - max_journal_entry_size, - label, - uuid); + struct bch_sb *sb = + bcache_format(opts, devices.item, darray_size(devices)); + bcache_super_print(sb, HUMAN_READABLE); + free(sb); - if (passphrase) { - memzero_explicit(passphrase, strlen(passphrase)); - free(passphrase); + if (opts.passphrase) { + memzero_explicit(opts.passphrase, strlen(opts.passphrase)); + free(opts.passphrase); } return 0; diff --git a/cmd_fsck.c b/cmd_fsck.c index a8c8dc53..6af56692 100644 --- a/cmd_fsck.c +++ b/cmd_fsck.c @@ -56,6 +56,6 @@ int cmd_fsck(int argc, char *argv[]) if (err) die("error opening %s: %s", argv[optind], err); - bch_fs_stop_sync(c); + bch_fs_stop(c); return 0; } diff --git a/cmd_key.c b/cmd_key.c index 587ecbe3..654ad774 100644 --- a/cmd_key.c +++ b/cmd_key.c @@ -1,6 +1,5 @@ #include <errno.h> #include <unistd.h> -#include <keyutils.h> #include <uuid/uuid.h> #include "cmds.h" @@ -10,52 +9,18 @@ int cmd_unlock(int argc, char *argv[]) { - struct bch_encrypted_key sb_key; - struct bch_key passphrase_key; struct bch_sb *sb; - struct bch_sb_field_crypt *crypt; char *passphrase; - char uuid[40]; - char description[60]; if (argc != 2) die("please supply a single device"); sb = bcache_super_read(argv[1]); - crypt = bch_sb_get_crypt(sb); - if (!crypt) - die("filesystem is not encrypted"); - - sb_key = crypt->key; - - if (!bch_key_is_encrypted(&sb_key)) - die("filesystem does not have encryption key"); - passphrase = read_passphrase("Enter passphrase: "); - derive_passphrase(crypt, &passphrase_key, passphrase); - /* Check if the user supplied the correct passphrase: */ - if (bch_chacha_encrypt_key(&passphrase_key, __bch_sb_key_nonce(sb), - &sb_key, sizeof(sb_key))) - die("error encrypting key"); + add_bcache_key(sb, passphrase); - if (bch_key_is_encrypted(&sb_key)) - die("incorrect passphrase"); - - uuid_unparse_lower(sb->user_uuid.b, uuid); - sprintf(description, "bcache:%s", uuid); - - if (add_key("logon", description, - &passphrase_key, sizeof(passphrase_key), - KEY_SPEC_USER_KEYRING) < 0 || - add_key("user", description, - &passphrase_key, sizeof(passphrase_key), - KEY_SPEC_USER_KEYRING) < 0) - die("add_key error: %s", strerror(errno)); - - memzero_explicit(&sb_key, sizeof(sb_key)); - memzero_explicit(&passphrase_key, sizeof(passphrase_key)); 
memzero_explicit(passphrase, strlen(passphrase)); free(passphrase); return 0; diff --git a/cmd_migrate.c b/cmd_migrate.c new file mode 100644 index 00000000..9a02cb9f --- /dev/null +++ b/cmd_migrate.c @@ -0,0 +1,835 @@ +#include </usr/include/dirent.h> +#include <errno.h> +#include <fcntl.h> +#include <getopt.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <sys/sysmacros.h> +#include <sys/types.h> +#include <sys/vfs.h> +#include <unistd.h> +#include <attr/xattr.h> + +#include <linux/fiemap.h> +#include <linux/fs.h> +#include <linux/stat.h> + +#include <uuid/uuid.h> + +#include "cmds.h" +#include "crypto.h" +#include "libbcache.h" +#include "linux/bcache.h" + +#include <linux/dcache.h> +#include <linux/generic-radix-tree.h> +#include <linux/xattr.h> +#include "btree_update.h" +#include "buckets.h" +#include "dirent.h" +#include "fs.h" +#include "inode.h" +#include "io.h" +#include "str_hash.h" +#include "super.h" +#include "xattr.h" + +static char *dev_t_to_path(dev_t dev) +{ + char link[PATH_MAX], *p; + int ret; + + char *sysfs_dev = mprintf("/sys/dev/block/%u:%u", + major(dev), minor(dev)); + ret = readlink(sysfs_dev, link, sizeof(link)); + free(sysfs_dev); + + if (ret < 0 || ret >= sizeof(link)) + die("readlink error while looking up block device: %s", strerror(errno)); + + link[ret] = '\0'; + + p = strrchr(link, '/'); + if (!p) + die("error looking up device name"); + p++; + + return mprintf("/dev/%s", p); +} + +static bool path_is_fs_root(char *path) +{ + char *line = NULL, *p, *mount; + size_t n = 0; + FILE *f; + bool ret = true; + + f = fopen("/proc/self/mountinfo", "r"); + if (!f) + die("Error getting mount information"); + + while (getline(&line, &n, f) != -1) { + p = line; + + strsep(&p, " "); /* mount id */ + strsep(&p, " "); /* parent id */ + strsep(&p, " "); /* dev */ + strsep(&p, " "); /* root */ + mount = strsep(&p, " "); + strsep(&p, " "); + + if (mount && !strcmp(path, mount)) + goto found; + } + + ret = false; +found: + fclose(f); + free(line); + return ret; +} + +static void mark_unreserved_space(struct cache_set *c, ranges extents) +{ + struct cache *ca = c->cache[0]; + struct hole_iter iter; + struct range i; + + for_each_hole(iter, extents, bucket_to_sector(ca, ca->mi.nbuckets) << 9, i) { + struct bucket_mark new; + u64 b; + + if (i.start == i.end) + return; + + b = sector_to_bucket(ca, i.start >> 9); + do { + bucket_cmpxchg(&ca->buckets[b], new, new.nouse = 1); + b++; + } while (bucket_to_sector(ca, b) << 9 < i.end); + } +} + +static void update_inode(struct cache_set *c, + struct bch_inode_unpacked *inode) +{ + struct bkey_inode_buf packed; + int ret; + + bch_inode_pack(&packed, inode); + ret = bch_btree_update(c, BTREE_ID_INODES, &packed.inode.k_i, NULL); + if (ret) + die("error creating file: %s", strerror(-ret)); +} + +static void create_dirent(struct cache_set *c, + struct bch_inode_unpacked *parent, + const char *name, u64 inum, mode_t mode) +{ + struct bch_hash_info parent_hash_info = bch_hash_info_init(parent); + struct qstr qname = { { { .len = strlen(name), } }, .name = name }; + + int ret = bch_dirent_create(c, parent->inum, &parent_hash_info, + mode_to_type(mode), &qname, + inum, NULL, BCH_HASH_SET_MUST_CREATE); + if (ret) + die("error creating file: %s", strerror(-ret)); + + if (S_ISDIR(mode)) + parent->i_nlink++; +} + +static void create_link(struct cache_set *c, + struct bch_inode_unpacked *parent, + const char *name, u64 inum, mode_t mode) +{ + struct bch_inode_unpacked inode; + int ret = bch_inode_find_by_inum(c, inum, 
				       &inode);
+	if (ret)
+		die("error looking up hardlink: %s", strerror(-ret));
+
+	inode.i_nlink++;
+	update_inode(c, &inode);
+
+	create_dirent(c, parent, name, inum, mode);
+}
+
+static struct bch_inode_unpacked create_file(struct cache_set *c,
+					     struct bch_inode_unpacked *parent,
+					     const char *name,
+					     uid_t uid, gid_t gid,
+					     mode_t mode, dev_t rdev)
+{
+	struct bch_inode_unpacked new_inode;
+	struct bkey_inode_buf packed;
+	int ret;
+
+	bch_inode_init(c, &new_inode, uid, gid, mode, rdev);
+	bch_inode_pack(&packed, &new_inode);
+
+	ret = bch_inode_create(c, &packed.inode.k_i, BLOCKDEV_INODE_MAX, 0,
+			       &c->unused_inode_hint);
+	if (ret)
+		die("error creating file: %s", strerror(-ret));
+
+	new_inode.inum = packed.inode.k.p.inode;
+	create_dirent(c, parent, name, new_inode.inum, mode);
+
+	return new_inode;
+}
+
+#define for_each_xattr_handler(handlers, handler)		\
+	if (handlers)						\
+		for ((handler) = *(handlers)++;			\
+		     (handler) != NULL;				\
+		     (handler) = *(handlers)++)
+
+static const struct xattr_handler *xattr_resolve_name(const char **name)
+{
+	const struct xattr_handler **handlers = bch_xattr_handlers;
+	const struct xattr_handler *handler;
+
+	for_each_xattr_handler(handlers, handler) {
+		const char *n;
+
+		n = strcmp_prefix(*name, xattr_prefix(handler));
+		if (n) {
+			if (!handler->prefix ^ !*n) {
+				if (*n)
+					continue;
+				return ERR_PTR(-EINVAL);
+			}
+			*name = n;
+			return handler;
+		}
+	}
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static void copy_times(struct cache_set *c, struct bch_inode_unpacked *dst,
+		       struct stat *src)
+{
+	dst->i_atime = timespec_to_bch_time(c, src->st_atim);
+	dst->i_mtime = timespec_to_bch_time(c, src->st_mtim);
+	dst->i_ctime = timespec_to_bch_time(c, src->st_ctim);
+}
+
+static void copy_xattrs(struct cache_set *c, struct bch_inode_unpacked *dst,
+			char *src)
+{
+	struct bch_hash_info hash_info = bch_hash_info_init(dst);
+	ssize_t size = llistxattr(src, NULL, 0);
+	if (size < 0)
+		die("listxattr error: %s", strerror(errno));
+
+	if (!size)
+		return;
+
+	char *buf = malloc(size);
+	size = llistxattr(src, buf, size);
+	if (size < 0)
+		die("listxattr error: %s", strerror(errno));
+
+	for (const char *next, *attr = buf;
+	     attr < buf + size;
+	     attr = next) {
+		next = attr + strlen(attr) + 1;
+
+		/* max possible xattr val: */
+		static char val[64 << 10];
+		ssize_t val_size = lgetxattr(src, attr, val, sizeof(val));
+
+		if (val_size < 0)
+			die("error getting xattr val: %s", strerror(errno));
+
+		const struct xattr_handler *h = xattr_resolve_name(&attr);
+
+		int ret = __bch_xattr_set(c, dst->inum, &hash_info, attr,
+					  val, val_size, 0, h->flags, NULL);
+		if (ret < 0)
+			die("error creating xattr: %s", strerror(-ret));
+	}
+
+	free(buf);
+}
+
+static void write_data(struct cache_set *c,
+		       struct bch_inode_unpacked *dst_inode,
+		       u64 dst_offset, void *buf, size_t len)
+{
+	struct disk_reservation res;
+	struct bch_write_op op;
+	struct bch_write_bio bio;
+	struct bio_vec bv;
+	struct closure cl;
+
+	BUG_ON(dst_offset & (block_bytes(c) - 1));
+	BUG_ON(len & (block_bytes(c) - 1));
+
+	closure_init_stack(&cl);
+
+	bio_init(&bio.bio);
+	bio.bio.bi_max_vecs = 1;
+	bio.bio.bi_io_vec = &bv;
+	bio.bio.bi_iter.bi_size = len;
+	bch_bio_map(&bio.bio, buf);
+
+	int ret = bch_disk_reservation_get(c, &res, len >> 9, 0);
+	if (ret)
+		die("error reserving space in new filesystem: %s", strerror(-ret));
+
+	bch_write_op_init(&op, c, &bio, res, c->write_points,
+			  POS(dst_inode->inum, dst_offset >> 9), NULL, 0);
+	closure_call(&op.cl, bch_write, NULL, &cl);
+	closure_sync(&cl);
+
dst_inode->i_sectors += len >> 9; +} + +static char buf[1 << 20] __aligned(PAGE_SIZE); + +static void copy_data(struct cache_set *c, + struct bch_inode_unpacked *dst_inode, + int src_fd, u64 start, u64 end) +{ + while (start < end) { + unsigned len = min_t(u64, end - start, sizeof(buf)); + + xpread(src_fd, buf, len, start); + write_data(c, dst_inode, start, buf, len); + start += len; + } +} + +static void link_data(struct cache_set *c, struct bch_inode_unpacked *dst, + u64 logical, u64 physical, u64 length) +{ + struct cache *ca = c->cache[0]; + + BUG_ON(logical & (block_bytes(c) - 1)); + BUG_ON(physical & (block_bytes(c) - 1)); + BUG_ON(length & (block_bytes(c) - 1)); + + logical >>= 9; + physical >>= 9; + length >>= 9; + + BUG_ON(physical + length > bucket_to_sector(ca, ca->mi.nbuckets)); + + while (length) { + struct bkey_i_extent *e; + BKEY_PADDED(k) k; + u64 b = sector_to_bucket(ca, physical >> 9); + struct disk_reservation res; + unsigned sectors; + int ret; + + sectors = min(ca->mi.bucket_size - + (physical & (ca->mi.bucket_size - 1)), + length); + + e = bkey_extent_init(&k.k); + e->k.p.inode = dst->inum; + e->k.p.offset = logical + sectors; + e->k.size = sectors; + extent_ptr_append(e, (struct bch_extent_ptr) { + .offset = physical, + .dev = 0, + .gen = ca->buckets[b].mark.gen, + }); + + ret = bch_disk_reservation_get(c, &res, sectors, + BCH_DISK_RESERVATION_NOFAIL); + if (ret) + die("error reserving space in new filesystem: %s", + strerror(-ret)); + + ret = bch_btree_insert(c, BTREE_ID_EXTENTS, &e->k_i, + &res, NULL, NULL, 0); + if (ret) + die("btree insert error %s", strerror(-ret)); + + bch_disk_reservation_put(c, &res); + + dst->i_sectors += sectors; + logical += sectors; + physical += sectors; + length -= sectors; + } +} + +static void copy_link(struct cache_set *c, struct bch_inode_unpacked *dst, + char *src) +{ + ssize_t ret = readlink(src, buf, sizeof(buf)); + if (ret < 0) + die("readlink error: %s", strerror(errno)); + + write_data(c, dst, 0, buf, round_up(ret, block_bytes(c))); +} + +static void copy_file(struct cache_set *c, struct bch_inode_unpacked *dst, + int src, char *src_path, ranges *extents) +{ + struct fiemap_iter iter; + struct fiemap_extent e; + + fiemap_for_each(src, iter, e) + if (e.fe_flags & FIEMAP_EXTENT_UNKNOWN) { + fsync(src); + break; + } + + fiemap_for_each(src, iter, e) { + if ((e.fe_logical & (block_bytes(c) - 1)) || + (e.fe_length & (block_bytes(c) - 1))) + die("Unaligned extent in %s - can't handle", src_path); + + if (e.fe_flags & (FIEMAP_EXTENT_UNKNOWN| + FIEMAP_EXTENT_ENCODED| + FIEMAP_EXTENT_NOT_ALIGNED| + FIEMAP_EXTENT_DATA_INLINE)) { + copy_data(c, dst, + src, + round_down(e.fe_logical, block_bytes(c)), + round_up(e.fe_logical + e.fe_length, + block_bytes(c))); + continue; + } + + if ((e.fe_physical & (block_bytes(c) - 1))) + die("Unaligned extent in %s - can't handle", src_path); + + range_add(extents, e.fe_physical, e.fe_length); + link_data(c, dst, e.fe_logical, e.fe_physical, e.fe_length); + } +} + +struct copy_fs_state { + u64 bcachefs_inum; + dev_t dev; + + GENRADIX(u64) hardlinks; + ranges extents; +}; + +static void copy_dir(struct copy_fs_state *s, + struct cache_set *c, + struct bch_inode_unpacked *dst, + int src_fd, const char *src_path) +{ + DIR *dir = fdopendir(src_fd); + struct dirent *d; + + while ((errno = 0), (d = readdir(dir))) { + struct bch_inode_unpacked inode; + int fd; + + if (fchdir(src_fd)) + die("chdir error: %s", strerror(errno)); + + struct stat stat = + xfstatat(src_fd, d->d_name, AT_SYMLINK_NOFOLLOW); + + if 
(!strcmp(d->d_name, ".") || + !strcmp(d->d_name, "..") || + stat.st_ino == s->bcachefs_inum) + continue; + + char *child_path = mprintf("%s/%s", src_path, d->d_name); + + if (stat.st_dev != s->dev) + die("%s does not have correct st_dev!", child_path); + + u64 *dst_inum = S_ISREG(stat.st_mode) + ? genradix_ptr_alloc(&s->hardlinks, stat.st_ino, GFP_KERNEL) + : NULL; + + if (dst_inum && *dst_inum) { + create_link(c, dst, d->d_name, *dst_inum, S_IFREG); + goto next; + } + + inode = create_file(c, dst, d->d_name, + stat.st_uid, stat.st_gid, + stat.st_mode, stat.st_rdev); + + if (dst_inum) + *dst_inum = inode.inum; + + copy_times(c, &inode, &stat); + copy_xattrs(c, &inode, d->d_name); + + /* copy xattrs */ + + switch (mode_to_type(stat.st_mode)) { + case DT_DIR: + fd = xopen(d->d_name, O_RDONLY|O_NOATIME); + copy_dir(s, c, &inode, fd, child_path); + close(fd); + break; + case DT_REG: + inode.i_size = stat.st_size; + + fd = xopen(d->d_name, O_RDONLY|O_NOATIME); + copy_file(c, &inode, fd, child_path, &s->extents); + close(fd); + break; + case DT_LNK: + inode.i_size = stat.st_size; + + copy_link(c, &inode, d->d_name); + break; + case DT_FIFO: + case DT_CHR: + case DT_BLK: + case DT_SOCK: + case DT_WHT: + /* nothing else to copy for these: */ + break; + default: + BUG(); + } + + update_inode(c, &inode); +next: + free(child_path); + } + + if (errno) + die("readdir error: %s", strerror(errno)); +} + +static ranges reserve_new_fs_space(const char *file_path, unsigned block_size, + u64 size, u64 *bcachefs_inum, dev_t dev) +{ + int fd = open(file_path, O_RDWR|O_CREAT|O_EXCL, 0600); + if (fd < 0) + die("Error creating %s for bcachefs metadata: %s", + file_path, strerror(errno)); + + struct stat statbuf = xfstat(fd); + + if (statbuf.st_dev != dev) + die("bcachefs file has incorrect device"); + + *bcachefs_inum = statbuf.st_ino; + + if (fallocate(fd, 0, 0, size)) + die("Error reserving space for bcachefs metadata: %s", + strerror(errno)); + + fsync(fd); + + struct fiemap_iter iter; + struct fiemap_extent e; + ranges extents = { NULL }; + + fiemap_for_each(fd, iter, e) { + if (e.fe_flags & (FIEMAP_EXTENT_UNKNOWN| + FIEMAP_EXTENT_ENCODED| + FIEMAP_EXTENT_NOT_ALIGNED| + FIEMAP_EXTENT_DATA_INLINE)) + die("Unable to continue: metadata file not fully mapped"); + + if ((e.fe_physical & (block_size - 1)) || + (e.fe_length & (block_size - 1))) + die("Unable to continue: unaligned extents in metadata file"); + + range_add(&extents, e.fe_physical, e.fe_length); + } + close(fd); + + ranges_sort_merge(&extents); + return extents; +} + +static void reserve_old_fs_space(struct cache_set *c, + struct bch_inode_unpacked *root_inode, + ranges *extents) +{ + struct cache *ca = c->cache[0]; + struct bch_inode_unpacked dst; + struct hole_iter iter; + struct range i; + + dst = create_file(c, root_inode, "old_migrated_filesystem", + 0, 0, S_IFREG|0400, 0); + dst.i_size = bucket_to_sector(ca, ca->mi.nbuckets) << 9; + + ranges_sort_merge(extents); + + for_each_hole(iter, *extents, bucket_to_sector(ca, ca->mi.nbuckets) << 9, i) + link_data(c, &dst, i.start, i.start, i.end - i.start); + + update_inode(c, &dst); +} + +static void copy_fs(struct cache_set *c, int src_fd, const char *src_path, + u64 bcachefs_inum, ranges *extents) +{ + syncfs(src_fd); + + struct bch_inode_unpacked root_inode; + int ret = bch_inode_find_by_inum(c, BCACHE_ROOT_INO, &root_inode); + if (ret) + die("error looking up root directory: %s", strerror(-ret)); + + if (fchdir(src_fd)) + die("chdir error: %s", strerror(errno)); + + struct stat stat = xfstat(src_fd); 
+	copy_times(c, &root_inode, &stat);
+	copy_xattrs(c, &root_inode, ".");
+
+	struct copy_fs_state s = {
+		.bcachefs_inum	= bcachefs_inum,
+		.dev		= stat.st_dev,
+		.extents	= *extents,
+	};
+
+	/* now, copy: */
+	copy_dir(&s, c, &root_inode, src_fd, src_path);
+
+	reserve_old_fs_space(c, &root_inode, &s.extents);
+
+	update_inode(c, &root_inode);
+
+	darray_free(s.extents);
+	genradix_free(&s.hardlinks);
+}
+
+static void find_superblock_space(ranges extents, struct dev_opts *dev)
+{
+	struct range *i;
+	darray_foreach(i, extents) {
+		u64 offset = max(256ULL << 10, i->start);
+
+		if (offset + (128 << 10) <= i->end) {
+			dev->sb_offset	= offset >> 9;
+			dev->sb_end	= dev->sb_offset + 256;
+			return;
+		}
+	}
+
+	die("Couldn't find a valid location for superblock");
+}
+
+static void migrate_usage(void)
+{
+	puts("bcache migrate - migrate an existing filesystem to bcachefs\n"
+	     "Usage: bcache migrate [OPTION]...\n"
+	     "\n"
+	     "Options:\n"
+	     "  -f fs            Root of filesystem to migrate\n"
+	     "      --encrypted      Enable whole filesystem encryption (chacha20/poly1305)\n"
+	     "      --no_passphrase  Don't encrypt master encryption key\n"
+	     "  -h               Display this help and exit\n"
+	     "Report bugs to <linux-bcache@vger.kernel.org>");
+}
+
+static const struct option migrate_opts[] = {
+	{ "encrypted",		no_argument, NULL, 'e' },
+	{ "no_passphrase",	no_argument, NULL, 'p' },
+	{ NULL }
+};
+
+int cmd_migrate(int argc, char *argv[])
+{
+	struct format_opts format_opts = format_opts_default();
+	char *fs_path = NULL;
+	unsigned block_size;
+	bool no_passphrase = false;
+	int opt;
+
+	while ((opt = getopt_long(argc, argv, "f:h",
+				  migrate_opts, NULL)) != -1)
+		switch (opt) {
+		case 'f':
+			fs_path = optarg;
+			break;
+		case 'e':
+			format_opts.encrypted = true;
+			break;
+		case 'p':
+			no_passphrase = true;
+			break;
+		case 'h':
+			migrate_usage();
+			exit(EXIT_SUCCESS);
+		}
+
+	if (!fs_path)
+		die("Please specify a filesystem to migrate");
+
+	if (!path_is_fs_root(fs_path))
+		die("%s is not a filesystem root", fs_path);
+
+	int fs_fd = xopen(fs_path, O_RDONLY|O_NOATIME);
+	struct stat stat = xfstat(fs_fd);
+
+	if (!S_ISDIR(stat.st_mode))
+		die("%s is not a directory", fs_path);
+
+	struct dev_opts dev = { 0 };
+
+	dev.path = dev_t_to_path(stat.st_dev);
+	dev.fd = xopen(dev.path, O_RDWR);
+
+	block_size = min_t(unsigned, stat.st_blksize,
+			   get_blocksize(dev.path, dev.fd) << 9);
+
+	BUG_ON(!is_power_of_2(block_size) || block_size < 512);
+	format_opts.block_size = block_size >> 9;
+
+	u64 bcachefs_inum;
+	char *file_path = mprintf("%s/bcachefs", fs_path);
+
+	ranges extents = reserve_new_fs_space(file_path,
+				block_size, get_size(dev.path, dev.fd) / 5,
+				&bcachefs_inum, stat.st_dev);
+
+	find_superblock_space(extents, &dev);
+
+	if (format_opts.encrypted && !no_passphrase) {
+		format_opts.passphrase = read_passphrase("Enter passphrase: ");
+
+		if (isatty(STDIN_FILENO)) {
+			char *pass2 =
+				read_passphrase("Enter same passphrase again: ");
+
+			if (strcmp(format_opts.passphrase, pass2)) {
+				memzero_explicit(format_opts.passphrase,
+						 strlen(format_opts.passphrase));
+				memzero_explicit(pass2, strlen(pass2));
+				die("Passphrases do not match");
+			}
+
+			memzero_explicit(pass2, strlen(pass2));
+			free(pass2);
+		}
+	}
+
+	struct bch_sb *sb = bcache_format(format_opts, &dev, 1);
+	u64 sb_offset = le64_to_cpu(sb->layout.sb_offset[0]);
+
+	if (format_opts.passphrase)
+		add_bcache_key(sb, format_opts.passphrase);
+
+	free(sb);
+
+	printf("Creating new filesystem on %s in space reserved at %s\n"
+	       "To mount, run\n"
+	       "  mount -t bcache -o sb=%llu %s dir\n"
+	       "\n"
+	       "After verifying that the new filesystem is correct, to create a\n"
+	       "superblock at the default offset and finish the migration run\n"
+	       "  bcache migrate_superblock -d %s -o %llu\n"
+	       "\n"
+	       "The new filesystem will have a file at /old_migrated_filesystem\n"
+	       "referencing all disk space that might be used by the existing\n"
+	       "filesystem. That file can be deleted once the old filesystem is\n"
+	       "no longer needed (and should be deleted prior to running\n"
+	       "bcache migrate_superblock)\n",
+	       dev.path, file_path, sb_offset, dev.path,
+	       dev.path, sb_offset);
+
+	struct bch_opts opts = bch_opts_empty();
+	struct cache_set *c = NULL;
+	char *path[1] = { dev.path };
+	const char *err;
+
+	opts.sb = sb_offset;
+	opts.nostart = true;
+	opts.noexcl = true;
+
+	err = bch_fs_open(path, 1, opts, &c);
+	if (err)
+		die("Error opening new filesystem: %s", err);
+
+	mark_unreserved_space(c, extents);
+
+	err = bch_fs_start(c);
+	if (err)
+		die("Error starting new filesystem: %s", err);
+
+	copy_fs(c, fs_fd, fs_path, bcachefs_inum, &extents);
+
+	bch_fs_stop(c);
+
+	printf("Migrate complete, running fsck:\n");
+	opts.nostart = false;
+	opts.nochanges = true;
+	fsck_err_opt = FSCK_ERR_NO;
+
+	err = bch_fs_open(path, 1, opts, &c);
+	if (err)
+		die("Error opening new filesystem: %s", err);
+
+	bch_fs_stop(c);
+	printf("fsck complete\n");
+	return 0;
+}
+
+static void migrate_superblock_usage(void)
+{
+	puts("bcache migrate_superblock - create default superblock after migrating\n"
+	     "Usage: bcache migrate_superblock [OPTION]...\n"
+	     "\n"
+	     "Options:\n"
+	     "  -d device  Device to create superblock for\n"
+	     "  -o offset  Offset of existing superblock\n"
+	     "  -h         Display this help and exit\n"
+	     "Report bugs to <linux-bcache@vger.kernel.org>");
+}
+
+int cmd_migrate_superblock(int argc, char *argv[])
+{
+	char *dev = NULL;
+	u64 offset = 0;
+	int opt, ret;
+
+	while ((opt = getopt(argc, argv, "d:o:h")) != -1)
+		switch (opt) {
+		case 'd':
+			dev = optarg;
+			break;
+		case 'o':
+			ret = kstrtou64(optarg, 10, &offset);
+			if (ret)
+				die("Invalid offset");
+			break;
+		case 'h':
+			migrate_superblock_usage();
+			exit(EXIT_SUCCESS);
+		}
+
+	if (!dev)
+		die("Please specify a device");
+
+	if (!offset)
+		die("Please specify offset of existing superblock");
+
+	int fd = xopen(dev, O_RDWR);
+	struct bch_sb *sb = __bcache_super_read(fd, offset);
+
+	if (sb->layout.nr_superblocks >= ARRAY_SIZE(sb->layout.sb_offset))
+		die("Can't add superblock: no space left in superblock layout");
+
+	for (unsigned i = 0; i < sb->layout.nr_superblocks; i++)
+		if (le64_to_cpu(sb->layout.sb_offset[i]) == BCH_SB_SECTOR)
+			die("Superblock layout already has default superblock");
+
+	memmove(&sb->layout.sb_offset[1],
+		&sb->layout.sb_offset[0],
+		sb->layout.nr_superblocks * sizeof(u64));
+	sb->layout.nr_superblocks++;
+
+	sb->layout.sb_offset[0] = cpu_to_le64(BCH_SB_SECTOR);
+
+	bcache_super_write(fd, sb);
+	close(fd);
+
+	return 0;
+}
diff --git a/cmd_run.c b/cmd_run.c
index 74f32480..6fb1c4f9 100644
--- a/cmd_run.c
+++ b/cmd_run.c
@@ -25,9 +25,6 @@ int cmd_stop(int argc, char *argv[])
 		die("Please supply a filesystem");
 
 	struct bcache_handle fs = bcache_fs_open(argv[1]);
-
-	if (ioctl(fs.ioctl_fd, BCH_IOCTL_STOP))
-		die("BCH_IOCTL_STOP error: %s", strerror(errno));
-
+	xioctl(fs.ioctl_fd, BCH_IOCTL_STOP);
 	return 0;
 }
diff --git a/cmds.h b/cmds.h
index 946acfda..120e83f9 100644
--- a/cmds.h
+++ b/cmds.h
@@ -29,4 +29,7 @@ int cmd_fsck(int argc, char *argv[]);
 int cmd_dump(int argc, char *argv[]);
 int cmd_list(int argc, char *argv[]);
+int cmd_migrate(int argc, 
char *argv[]); +int cmd_migrate_superblock(int argc, char *argv[]); + #endif /* _CMDS_H */ diff --git a/crypto.c b/crypto.c index 86da70a1..f38a359d 100644 --- a/crypto.c +++ b/crypto.c @@ -10,8 +10,10 @@ #include <time.h> #include <unistd.h> +#include <keyutils.h> #include <linux/random.h> #include <libscrypt.h> +#include <uuid/uuid.h> #include "checksum.h" #include "crypto.h" @@ -75,29 +77,71 @@ void derive_passphrase(struct bch_sb_field_crypt *crypt, } } +void add_bcache_key(struct bch_sb *sb, const char *passphrase) +{ + struct bch_sb_field_crypt *crypt = bch_sb_get_crypt(sb); + if (!crypt) + die("filesystem is not encrypted"); + + struct bch_encrypted_key sb_key = crypt->key; + if (!bch_key_is_encrypted(&sb_key)) + die("filesystem does not have encryption key"); + + struct bch_key passphrase_key; + derive_passphrase(crypt, &passphrase_key, passphrase); + + /* Check if the user supplied the correct passphrase: */ + if (bch_chacha_encrypt_key(&passphrase_key, __bch_sb_key_nonce(sb), + &sb_key, sizeof(sb_key))) + die("error encrypting key"); + + if (bch_key_is_encrypted(&sb_key)) + die("incorrect passphrase"); + + char uuid[40]; + uuid_unparse_lower(sb->user_uuid.b, uuid); + + char *description = mprintf("bcache:%s", uuid); + + if (add_key("logon", description, + &passphrase_key, sizeof(passphrase_key), + KEY_SPEC_USER_KEYRING) < 0 || + add_key("user", description, + &passphrase_key, sizeof(passphrase_key), + KEY_SPEC_USER_KEYRING) < 0) + die("add_key error: %s", strerror(errno)); + + memzero_explicit(description, strlen(description)); + free(description); + memzero_explicit(&passphrase_key, sizeof(passphrase_key)); + memzero_explicit(&sb_key, sizeof(sb_key)); +} + void bch_sb_crypt_init(struct bch_sb *sb, struct bch_sb_field_crypt *crypt, const char *passphrase) { - struct bch_key passphrase_key; - - SET_BCH_CRYPT_KDF_TYPE(crypt, BCH_KDF_SCRYPT); - SET_BCH_KDF_SCRYPT_N(crypt, ilog2(SCRYPT_N)); - SET_BCH_KDF_SCRYPT_R(crypt, ilog2(SCRYPT_r)); - SET_BCH_KDF_SCRYPT_P(crypt, ilog2(SCRYPT_p)); - - derive_passphrase(crypt, &passphrase_key, passphrase); - crypt->key.magic = BCH_KEY_MAGIC; get_random_bytes(&crypt->key.key, sizeof(crypt->key.key)); - assert(!bch_key_is_encrypted(&crypt->key)); + if (passphrase) { + struct bch_key passphrase_key; - if (bch_chacha_encrypt_key(&passphrase_key, __bch_sb_key_nonce(sb), - &crypt->key, sizeof(crypt->key))) - die("error encrypting key"); + SET_BCH_CRYPT_KDF_TYPE(crypt, BCH_KDF_SCRYPT); + SET_BCH_KDF_SCRYPT_N(crypt, ilog2(SCRYPT_N)); + SET_BCH_KDF_SCRYPT_R(crypt, ilog2(SCRYPT_r)); + SET_BCH_KDF_SCRYPT_P(crypt, ilog2(SCRYPT_p)); - assert(bch_key_is_encrypted(&crypt->key)); + derive_passphrase(crypt, &passphrase_key, passphrase); - memzero_explicit(&passphrase_key, sizeof(passphrase_key)); + assert(!bch_key_is_encrypted(&crypt->key)); + + if (bch_chacha_encrypt_key(&passphrase_key, __bch_sb_key_nonce(sb), + &crypt->key, sizeof(crypt->key))) + die("error encrypting key"); + + assert(bch_key_is_encrypted(&crypt->key)); + + memzero_explicit(&passphrase_key, sizeof(passphrase_key)); + } } diff --git a/crypto.h b/crypto.h index 643073eb..91a8b9fc 100644 --- a/crypto.h +++ b/crypto.h @@ -1,12 +1,16 @@ #ifndef _CRYPTO_H #define _CRYPTO_H -#include "super-io.h" #include "tools-util.h" +struct bch_sb; +struct bch_sb_field_crypt; +struct bch_key; + char *read_passphrase(const char *); void derive_passphrase(struct bch_sb_field_crypt *, struct bch_key *, const char *); +void add_bcache_key(struct bch_sb *, const char *); void bch_sb_crypt_init(struct bch_sb *sb, 
		       struct bch_sb_field_crypt *, const char *);
diff --git a/include/linux/bcache.h b/include/linux/bcache.h
index dbb02742..d70e2e32 100644
--- a/include/linux/bcache.h
+++ b/include/linux/bcache.h
@@ -821,7 +821,7 @@ struct bch_sb_field {
 	__le32			type;
 };
 
-enum bch_sb_field_types {
+enum bch_sb_field_type {
 	BCH_SB_FIELD_journal	= 0,
 	BCH_SB_FIELD_members	= 1,
 	BCH_SB_FIELD_crypt	= 2,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3c185945..217ff094 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -110,6 +110,7 @@ struct super_block {
  * NOTE! These match bits 12..15 of stat.st_mode
  * (ie "(i_mode >> 12) & 15").
  */
+#ifndef DT_UNKNOWN
 #define DT_UNKNOWN	0
 #define DT_FIFO		1
 #define DT_CHR		2
@@ -119,6 +120,7 @@ struct super_block {
 #define DT_LNK		10
 #define DT_SOCK		12
 #define DT_WHT		14
+#endif
 
 /*
  * This is the "filldir" function type, used by readdir() to let
diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
index 1a951e97..6ea2deb2 100644
--- a/include/linux/generic-radix-tree.h
+++ b/include/linux/generic-radix-tree.h
@@ -8,7 +8,6 @@
  * interior nodes.
  */
 
-#include <linux/page.h>
 #include <linux/bug.h>
 #include <linux/kernel.h>
 #include <linux/log2.h>
@@ -41,20 +40,14 @@ struct __genradix {
  * genradix.
  */
 
-#define DECLARE_GENRADIX_TYPE(_name, _type)		\
-struct _name {						\
+#define GENRADIX(_type)					\
+struct {						\
 	struct __genradix	tree;			\
 	_type			type[0] __aligned(1);	\
 }
 
-#define DECLARE_GENRADIX(_name, _type)			\
-struct {						\
-	struct __genradix	tree;			\
-	_type			type[0] __aligned(1);	\
-} _name
-
 #define DEFINE_GENRADIX(_name, _type)			\
-	DECLARE_GENRADIX(_name, _type) = __GENRADIX_INITIALIZER
+	GENRADIX(_type) _name = __GENRADIX_INITIALIZER
 
 #define genradix_init(_radix)				\
 do {							\
diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 5a986188..2bbd0979 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -180,4 +180,9 @@ static inline bool percpu_ref_is_zero(struct percpu_ref *ref)
 	return !atomic_long_read(&ref->count);
 }
 
+static inline bool percpu_ref_is_dying(struct percpu_ref *ref)
+{
+	return percpu_ref_is_zero(ref);
+}
+
 #endif /* __TOOLS_LINUX_PERCPU_REFCOUNT_H */
diff --git a/libbcache.c b/libbcache.c
index 6908ead9..0cfafbbc 100644
--- a/libbcache.c
+++ b/libbcache.c
@@ -23,66 +23,82 @@
 
 #define BCH_MIN_NR_NBUCKETS	(1 << 10)
 
-/* first bucket should start 1 mb in, in sectors: */
-#define FIRST_BUCKET_OFFSET	(1 << 11)
-
 /* minimum size filesystem we can create, given a bucket size: */
 static u64 min_size(unsigned bucket_size)
 {
-	return (DIV_ROUND_UP(FIRST_BUCKET_OFFSET, bucket_size) +
-		BCH_MIN_NR_NBUCKETS) * bucket_size;
+	return BCH_MIN_NR_NBUCKETS * bucket_size;
 }
 
-static void init_layout(struct bch_sb_layout *l)
+static void init_layout(struct bch_sb_layout *l, unsigned block_size,
+			u64 start, u64 end)
 {
+	unsigned sb_size;
+	u64 backup; /* offset of 2nd sb */
+
 	memset(l, 0, sizeof(*l));
 
+	if (start != BCH_SB_SECTOR)
+		start = round_up(start, block_size);
+	end = round_down(end, block_size);
+
+	if (start >= end)
+		die("insufficient space for superblocks");
+
+	/*
+	 * Create two superblocks in the allowed range: reserve a maximum of 64k
+	 */
+	sb_size = min_t(u64, 128, (end - start) / 2);
+
+	backup = start + sb_size;
+	backup = round_up(backup, block_size);
+
+	backup = min(backup, end);
+
+	sb_size = min(end - backup, backup - start);
+	sb_size = rounddown_pow_of_two(sb_size);
+
+	if (sb_size < 8)
+		die("insufficient space for superblocks");
+
 	l->magic = 
BCACHE_MAGIC; l->layout_type = 0; l->nr_superblocks = 2; - l->sb_max_size_bits = 7; - l->sb_offset[0] = cpu_to_le64(BCH_SB_SECTOR); - l->sb_offset[1] = cpu_to_le64(BCH_SB_SECTOR + - (1 << l->sb_max_size_bits)); + l->sb_max_size_bits = ilog2(sb_size); + l->sb_offset[0] = cpu_to_le64(start); + l->sb_offset[1] = cpu_to_le64(backup); } -void bcache_format(struct dev_opts *devs, size_t nr_devs, - unsigned block_size, - unsigned btree_node_size, - unsigned meta_csum_type, - unsigned data_csum_type, - unsigned compression_type, - const char *passphrase, - unsigned meta_replicas, - unsigned data_replicas, - unsigned on_error_action, - unsigned max_journal_entry_size, - char *label, - uuid_le uuid) +struct bch_sb *bcache_format(struct format_opts opts, + struct dev_opts *devs, size_t nr_devs) { struct bch_sb *sb; struct dev_opts *i; struct bch_sb_field_members *mi; - unsigned u64s, j; + unsigned u64s; /* calculate block size: */ - if (!block_size) + if (!opts.block_size) for (i = devs; i < devs + nr_devs; i++) - block_size = max(block_size, - get_blocksize(i->path, i->fd)); + opts.block_size = max(opts.block_size, + get_blocksize(i->path, i->fd)); /* calculate bucket sizes: */ for (i = devs; i < devs + nr_devs; i++) { + if (!i->sb_offset) { + i->sb_offset = BCH_SB_SECTOR; + i->sb_end = BCH_SB_SECTOR + 256; + } + if (!i->size) i->size = get_size(i->path, i->fd) >> 9; if (!i->bucket_size) { - if (i->size < min_size(block_size)) + if (i->size < min_size(opts.block_size)) die("cannot format %s, too small (%llu sectors, min %llu)", - i->path, i->size, min_size(block_size)); + i->path, i->size, min_size(opts.block_size)); /* Want a bucket size of at least 128k, if possible: */ - i->bucket_size = max(block_size, 256U); + i->bucket_size = max(opts.block_size, 256U); if (i->size >= min_size(i->bucket_size)) { unsigned scale = max(1, @@ -99,34 +115,36 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs, } } - /* first bucket: 1 mb in */ - i->first_bucket = DIV_ROUND_UP(FIRST_BUCKET_OFFSET, i->bucket_size); i->nbuckets = i->size / i->bucket_size; - if (i->bucket_size < block_size) + if (i->bucket_size < opts.block_size) die("Bucket size cannot be smaller than block size"); - if (i->nbuckets - i->first_bucket < BCH_MIN_NR_NBUCKETS) + if (i->nbuckets < BCH_MIN_NR_NBUCKETS) die("Not enough buckets: %llu, need %u (bucket size %u)", - i->nbuckets - i->first_bucket, BCH_MIN_NR_NBUCKETS, - i->bucket_size); + i->nbuckets, BCH_MIN_NR_NBUCKETS, i->bucket_size); } /* calculate btree node size: */ - if (!btree_node_size) { + if (!opts.btree_node_size) { /* 256k default btree node size */ - btree_node_size = 512; + opts.btree_node_size = 512; for (i = devs; i < devs + nr_devs; i++) - btree_node_size = min(btree_node_size, i->bucket_size); + opts.btree_node_size = + min(opts.btree_node_size, i->bucket_size); } - if (!max_journal_entry_size) { + if (!opts.max_journal_entry_size) { /* 2 MB default: */ - max_journal_entry_size = 4096; + opts.max_journal_entry_size = 4096; } - max_journal_entry_size = roundup_pow_of_two(max_journal_entry_size); + opts.max_journal_entry_size = + roundup_pow_of_two(opts.max_journal_entry_size); + + if (uuid_is_null(opts.uuid.b)) + uuid_generate(opts.uuid.b); sb = calloc(1, sizeof(*sb) + sizeof(struct bch_sb_field_members) + @@ -135,35 +153,29 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs, sb->version = cpu_to_le64(BCACHE_SB_VERSION_CDEV_V4); sb->magic = BCACHE_MAGIC; - sb->block_size = cpu_to_le16(block_size); - sb->user_uuid = uuid; + sb->block_size = 
cpu_to_le16(opts.block_size); + sb->user_uuid = opts.uuid; sb->nr_devices = nr_devs; - init_layout(&sb->layout); - uuid_generate(sb->uuid.b); - if (label) - strncpy((char *) sb->label, label, sizeof(sb->label)); + if (opts.label) + strncpy((char *) sb->label, opts.label, sizeof(sb->label)); - /* - * don't have a userspace crc32c implementation handy, just always use - * crc64 - */ - SET_BCH_SB_CSUM_TYPE(sb, BCH_CSUM_CRC64); - SET_BCH_SB_META_CSUM_TYPE(sb, meta_csum_type); - SET_BCH_SB_DATA_CSUM_TYPE(sb, data_csum_type); - SET_BCH_SB_COMPRESSION_TYPE(sb, compression_type); + SET_BCH_SB_CSUM_TYPE(sb, opts.meta_csum_type); + SET_BCH_SB_META_CSUM_TYPE(sb, opts.meta_csum_type); + SET_BCH_SB_DATA_CSUM_TYPE(sb, opts.data_csum_type); + SET_BCH_SB_COMPRESSION_TYPE(sb, opts.compression_type); - SET_BCH_SB_BTREE_NODE_SIZE(sb, btree_node_size); + SET_BCH_SB_BTREE_NODE_SIZE(sb, opts.btree_node_size); SET_BCH_SB_GC_RESERVE(sb, 8); - SET_BCH_SB_META_REPLICAS_WANT(sb, meta_replicas); - SET_BCH_SB_META_REPLICAS_HAVE(sb, meta_replicas); - SET_BCH_SB_DATA_REPLICAS_WANT(sb, data_replicas); - SET_BCH_SB_DATA_REPLICAS_HAVE(sb, data_replicas); - SET_BCH_SB_ERROR_ACTION(sb, on_error_action); + SET_BCH_SB_META_REPLICAS_WANT(sb, opts.meta_replicas); + SET_BCH_SB_META_REPLICAS_HAVE(sb, opts.meta_replicas); + SET_BCH_SB_DATA_REPLICAS_WANT(sb, opts.data_replicas); + SET_BCH_SB_DATA_REPLICAS_HAVE(sb, opts.data_replicas); + SET_BCH_SB_ERROR_ACTION(sb, opts.on_error_action); SET_BCH_SB_STR_HASH_TYPE(sb, BCH_STR_HASH_SIPHASH); - SET_BCH_SB_JOURNAL_ENTRY_SIZE(sb, ilog2(max_journal_entry_size)); + SET_BCH_SB_JOURNAL_ENTRY_SIZE(sb, ilog2(opts.max_journal_entry_size)); struct timespec now; if (clock_gettime(CLOCK_REALTIME, &now)) @@ -172,7 +184,7 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs, sb->time_base_lo = cpu_to_le64(now.tv_sec * NSEC_PER_SEC + now.tv_nsec); sb->time_precision = cpu_to_le32(1); - if (passphrase) { + if (opts.encrypted) { struct bch_sb_field_crypt *crypt = vstruct_end(sb); u64s = sizeof(struct bch_sb_field_crypt) / sizeof(u64); @@ -181,7 +193,7 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs, crypt->field.u64s = cpu_to_le32(u64s); crypt->field.type = BCH_SB_FIELD_crypt; - bch_sb_crypt_init(sb, crypt, passphrase); + bch_sb_crypt_init(sb, crypt, opts.passphrase); SET_BCH_SB_ENCRYPTION_TYPE(sb, 1); } @@ -198,7 +210,7 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs, uuid_generate(m->uuid.b); m->nbuckets = cpu_to_le64(i->nbuckets); - m->first_bucket = cpu_to_le16(i->first_bucket); + m->first_bucket = 0; m->bucket_size = cpu_to_le16(i->bucket_size); SET_BCH_MEMBER_TIER(m, i->tier); @@ -209,42 +221,49 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs, for (i = devs; i < devs + nr_devs; i++) { sb->dev_idx = i - devs; - static const char zeroes[BCH_SB_SECTOR << 9]; - struct nonce nonce = { 0 }; + init_layout(&sb->layout, opts.block_size, + i->sb_offset, i->sb_end); - /* Zero start of disk */ - xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0); + if (i->sb_offset == BCH_SB_SECTOR) { + /* Zero start of disk */ + static const char zeroes[BCH_SB_SECTOR << 9]; - xpwrite(i->fd, &sb->layout, sizeof(sb->layout), - BCH_SB_LAYOUT_SECTOR << 9); - - for (j = 0; j < sb->layout.nr_superblocks; j++) { - sb->offset = sb->layout.sb_offset[j]; - - sb->csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb), - nonce, sb); - xpwrite(i->fd, sb, vstruct_bytes(sb), - le64_to_cpu(sb->offset) << 9); + xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0); } - fsync(i->fd); + bcache_super_write(i->fd, sb); 
close(i->fd); } - bcache_super_print(sb, HUMAN_READABLE); - - free(sb); + return sb; } -struct bch_sb *bcache_super_read(const char *path) +void bcache_super_write(int fd, struct bch_sb *sb) +{ + struct nonce nonce = { 0 }; + + for (unsigned i = 0; i < sb->layout.nr_superblocks; i++) { + sb->offset = sb->layout.sb_offset[i]; + + if (sb->offset == BCH_SB_SECTOR) { + /* Write backup layout */ + xpwrite(fd, &sb->layout, sizeof(sb->layout), + BCH_SB_LAYOUT_SECTOR << 9); + } + + sb->csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb), nonce, sb); + xpwrite(fd, sb, vstruct_bytes(sb), + le64_to_cpu(sb->offset) << 9); + } + + fsync(fd); +} + +struct bch_sb *__bcache_super_read(int fd, u64 sector) { struct bch_sb sb, *ret; - int fd = open(path, O_RDONLY); - if (fd < 0) - die("couldn't open %s", path); - - xpread(fd, &sb, sizeof(sb), BCH_SB_SECTOR << 9); + xpread(fd, &sb, sizeof(sb), sector << 9); if (memcmp(&sb.magic, &BCACHE_MAGIC, sizeof(sb.magic))) die("not a bcache superblock"); @@ -253,11 +272,19 @@ struct bch_sb *bcache_super_read(const char *path) ret = malloc(bytes); - xpread(fd, ret, bytes, BCH_SB_SECTOR << 9); + xpread(fd, ret, bytes, sector << 9); return ret; } +struct bch_sb *bcache_super_read(const char *path) +{ + int fd = xopen(path, O_RDONLY); + struct bch_sb *sb = __bcache_super_read(fd, BCH_SB_SECTOR); + close(fd); + return sb; +} + void bcache_super_print(struct bch_sb *sb, int units) { struct bch_sb_field_members *mi; diff --git a/libbcache.h b/libbcache.h index 6ec3f42d..779b4708 100644 --- a/libbcache.h +++ b/libbcache.h @@ -1,6 +1,7 @@ #ifndef _LIBBCACHE_H #define _LIBBCACHE_H +#include <linux/bcache.h> #include <linux/uuid.h> #include "tools-util.h" #include "vstructs.h" @@ -18,32 +19,56 @@ enum fsck_err_opts { extern enum fsck_err_opts fsck_err_opt; +struct format_opts { + char *label; + uuid_le uuid; + + unsigned on_error_action; + unsigned max_journal_entry_size; /* will be removed */ + + unsigned block_size; + unsigned btree_node_size; + + unsigned meta_replicas; + unsigned data_replicas; + + unsigned meta_csum_type; + unsigned data_csum_type; + unsigned compression_type; + + bool encrypted; + char *passphrase; +}; + +static inline struct format_opts format_opts_default() +{ + return (struct format_opts) { + .on_error_action = BCH_ON_ERROR_RO, + .meta_csum_type = BCH_CSUM_CRC32C, + .data_csum_type = BCH_CSUM_CRC32C, + .meta_replicas = 1, + .data_replicas = 1, + }; +} + struct dev_opts { int fd; - const char *path; + char *path; u64 size; /* 512 byte sectors */ unsigned bucket_size; unsigned tier; bool discard; - u64 first_bucket; u64 nbuckets; + + u64 sb_offset; + u64 sb_end; }; -void bcache_format(struct dev_opts *devs, size_t nr_devs, - unsigned block_size, - unsigned btree_node_size, - unsigned meta_csum_type, - unsigned data_csum_type, - unsigned compression_type, - const char *passphrase, - unsigned meta_replicas, - unsigned data_replicas, - unsigned on_error_action, - unsigned max_journal_entry_size, - char *label, - uuid_le uuid); +struct bch_sb *bcache_format(struct format_opts, struct dev_opts *, size_t); +void bcache_super_write(int, struct bch_sb *); +struct bch_sb *__bcache_super_read(int, u64); struct bch_sb *bcache_super_read(const char *); void bcache_super_print(struct bch_sb *, int); diff --git a/libbcache/alloc.c b/libbcache/alloc.c index 8cb31944..93f0c2f1 100644 --- a/libbcache/alloc.c +++ b/libbcache/alloc.c @@ -73,7 +73,6 @@ #include <linux/rcupdate.h> #include <trace/events/bcache.h> -static size_t bch_bucket_alloc(struct cache *, enum alloc_reserve); 
static void __bch_bucket_free(struct cache *, struct bucket *); /* Allocation groups: */ @@ -84,12 +83,12 @@ void bch_dev_group_remove(struct cache_group *grp, struct cache *ca) spin_lock(&grp->lock); - for (i = 0; i < grp->nr_devices; i++) + for (i = 0; i < grp->nr; i++) if (rcu_access_pointer(grp->d[i].dev) == ca) { - grp->nr_devices--; + grp->nr--; memmove(&grp->d[i], &grp->d[i + 1], - (grp->nr_devices - i) * sizeof(grp->d[0])); + (grp->nr- i) * sizeof(grp->d[0])); break; } @@ -101,13 +100,13 @@ void bch_dev_group_add(struct cache_group *grp, struct cache *ca) unsigned i; spin_lock(&grp->lock); - for (i = 0; i < grp->nr_devices; i++) + for (i = 0; i < grp->nr; i++) if (rcu_access_pointer(grp->d[i].dev) == ca) goto out; - BUG_ON(grp->nr_devices >= BCH_SB_MEMBERS_MAX); + BUG_ON(grp->nr>= BCH_SB_MEMBERS_MAX); - rcu_assign_pointer(grp->d[grp->nr_devices++].dev, ca); + rcu_assign_pointer(grp->d[grp->nr++].dev, ca); out: spin_unlock(&grp->lock); } @@ -120,25 +119,32 @@ static void pd_controllers_update(struct work_struct *work) struct cache_set, pd_controllers_update); struct cache *ca; - unsigned iter; - int i; + unsigned i, iter; /* All units are in bytes */ - u64 tier_size[BCH_TIER_MAX]; - u64 tier_free[BCH_TIER_MAX]; - u64 tier_dirty[BCH_TIER_MAX]; - u64 tier0_can_free = 0; + u64 faster_tiers_size = 0; + u64 faster_tiers_dirty = 0; - memset(tier_size, 0, sizeof(tier_size)); - memset(tier_free, 0, sizeof(tier_free)); - memset(tier_dirty, 0, sizeof(tier_dirty)); + u64 fastest_tier_size = 0; + u64 fastest_tier_free = 0; + u64 copygc_can_free = 0; rcu_read_lock(); - for (i = BCH_TIER_MAX - 1; i >= 0; --i) - group_for_each_cache_rcu(ca, &c->cache_tiers[i], iter) { + for (i = 0; i < ARRAY_SIZE(c->tiers); i++) { + bch_pd_controller_update(&c->tiers[i].pd, + div_u64(faster_tiers_size * + c->tiering_percent, 100), + faster_tiers_dirty, + -1); + + group_for_each_cache_rcu(ca, &c->tiers[i].devs, iter) { struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca); unsigned bucket_bits = ca->bucket_bits + 9; + u64 size = (ca->mi.nbuckets - + ca->mi.first_bucket) << bucket_bits; + u64 dirty = stats.buckets_dirty << bucket_bits; + u64 free = __buckets_free_cache(ca, stats) << bucket_bits; /* * Bytes of internal fragmentation, which can be * reclaimed by copy GC @@ -149,41 +155,30 @@ static void pd_controllers_update(struct work_struct *work) ((stats.sectors_dirty + stats.sectors_cached) << 9); - u64 dev_size = (ca->mi.nbuckets - - ca->mi.first_bucket) << bucket_bits; - - u64 free = __buckets_free_cache(ca, stats) << bucket_bits; - if (fragmented < 0) fragmented = 0; bch_pd_controller_update(&ca->moving_gc_pd, free, fragmented, -1); - if (i == 0) - tier0_can_free += fragmented; + faster_tiers_size += size; + faster_tiers_dirty += dirty; - tier_size[i] += dev_size; - tier_free[i] += free; - tier_dirty[i] += stats.buckets_dirty << bucket_bits; + if (!c->fastest_tier || + c->fastest_tier == &c->tiers[i]) { + fastest_tier_size += size; + fastest_tier_free += free; + } + + copygc_can_free += fragmented; } - rcu_read_unlock(); - - if (tier_size[1]) { - u64 target = div_u64(tier_size[0] * c->tiering_percent, 100); - - tier0_can_free = max_t(s64, 0, tier_dirty[0] - target); - - bch_pd_controller_update(&c->tiering_pd, - target, - tier_dirty[0], - -1); } + rcu_read_unlock(); + /* * Throttle foreground writes if tier 0 is running out of free buckets, - * and either tiering or copygc can free up space (but don't take both - * into account). + * and either tiering or copygc can free up space. 
* * Target will be small if there isn't any work to do - we don't want to * throttle foreground writes if we currently have all the free space @@ -192,12 +187,15 @@ static void pd_controllers_update(struct work_struct *work) * Otherwise, if there's work to do, try to keep 20% of tier0 available * for foreground writes. */ + if (c->fastest_tier) + copygc_can_free = U64_MAX; + bch_pd_controller_update(&c->foreground_write_pd, - min(tier0_can_free, - div_u64(tier_size[0] * + min(copygc_can_free, + div_u64(fastest_tier_size * c->foreground_target_percent, 100)), - tier_free[0], + fastest_tier_free, -1); schedule_delayed_work(&c->pd_controllers_update, @@ -301,7 +299,8 @@ static int bch_prio_write(struct cache *ca) * it getting gc'd from under us */ ca->prio_buckets[i] = r; - bch_mark_metadata_bucket(ca, ca->buckets + r, false); + bch_mark_metadata_bucket(ca, ca->buckets + r, + BUCKET_PRIOS, false); spin_unlock(&ca->prio_buckets_lock); SET_PSET_CSUM_TYPE(p, bch_meta_checksum_type(c)); @@ -334,6 +333,9 @@ static int bch_prio_write(struct cache *ca) do { unsigned u64s = jset_u64s(0); + if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) + break; + ret = bch_journal_res_get(j, &res, u64s, u64s); if (ret) return ret; @@ -815,8 +817,7 @@ static void bch_find_empty_buckets(struct cache_set *c, struct cache *ca) if (is_available_bucket(m) && !m.cached_sectors && !m.had_metadata && - (!m.wait_on_journal || - ((s16) last_seq_ondisk - (s16) m.journal_seq >= 0))) { + !bucket_needs_journal_commit(m, last_seq_ondisk)) { spin_lock(&ca->freelist_lock); bch_mark_alloc_bucket(ca, g, true); @@ -850,6 +851,8 @@ static int bch_allocator_thread(void *arg) set_freezable(); + bch_find_empty_buckets(c, ca); + while (1) { /* * First, we pull buckets off of the free_inc list, possibly @@ -894,7 +897,7 @@ static int bch_allocator_thread(void *arg) * See if we have buckets we can reuse without invalidating them * or forcing a journal commit: */ - bch_find_empty_buckets(c, ca); + //bch_find_empty_buckets(c, ca); if (fifo_used(&ca->free_inc) * 2 > ca->free_inc.size) { up_read(&c->gc_lock); @@ -967,7 +970,7 @@ out: * * Returns index of bucket on success, 0 on failure * */ -static size_t bch_bucket_alloc(struct cache *ca, enum alloc_reserve reserve) +size_t bch_bucket_alloc(struct cache *ca, enum alloc_reserve reserve) { struct bucket *g; long r; @@ -1018,21 +1021,21 @@ static void recalc_alloc_group_weights(struct cache_set *c, u64 available_buckets = 1; /* avoid a divide by zero... 
*/ unsigned i; - for (i = 0; i < devs->nr_devices; i++) { + for (i = 0; i < devs->nr; i++) { ca = devs->d[i].dev; devs->d[i].weight = buckets_free_cache(ca); available_buckets += devs->d[i].weight; } - for (i = 0; i < devs->nr_devices; i++) { + for (i = 0; i < devs->nr; i++) { const unsigned min_weight = U32_MAX >> 4; const unsigned max_weight = U32_MAX; devs->d[i].weight = min_weight + div64_u64(devs->d[i].weight * - devs->nr_devices * + devs->nr * (max_weight - min_weight), available_buckets); devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight); @@ -1058,7 +1061,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c, rcu_read_lock(); spin_lock(&devs->lock); - for (i = 0; i < devs->nr_devices; i++) + for (i = 0; i < devs->nr; i++) available += !test_bit(devs->d[i].dev->dev_idx, caches_used); @@ -1076,7 +1079,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c, } i++; - i %= devs->nr_devices; + i %= devs->nr; ret = FREELIST_EMPTY; if (i == fail_idx) @@ -1136,20 +1139,25 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c, enum alloc_reserve reserve, long *caches_used) { + struct bch_tier *tier; /* * this should implement policy - for a given type of allocation, decide * which devices to allocate from: * * XXX: switch off wp->type and do something more intelligent here */ + if (wp->group) + return bch_bucket_alloc_group(c, ob, reserve, nr_replicas, + wp->group, caches_used); - /* foreground writes: prefer tier 0: */ - if (wp->group == &c->cache_all) + /* foreground writes: prefer fastest tier: */ + tier = READ_ONCE(c->fastest_tier); + if (tier) bch_bucket_alloc_group(c, ob, reserve, nr_replicas, - &c->cache_tiers[0], caches_used); + &tier->devs, caches_used); return bch_bucket_alloc_group(c, ob, reserve, nr_replicas, - wp->group, caches_used); + &c->cache_all, caches_used); } static int bch_bucket_alloc_set(struct cache_set *c, struct write_point *wp, @@ -1413,7 +1421,6 @@ struct open_bucket *bch_alloc_sectors_start(struct cache_set *c, ? 
0 : BTREE_NODE_RESERVE; int ret; - BUG_ON(!wp->group); BUG_ON(!reserve); BUG_ON(!nr_replicas); retry: @@ -1481,7 +1488,7 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e, unsigned nr_replicas, struct open_bucket *ob, unsigned sectors) { - struct bch_extent_ptr tmp, *ptr; + struct bch_extent_ptr tmp; struct cache *ca; bool has_data = false; unsigned i; @@ -1501,6 +1508,8 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e, if (nr_replicas < ob->nr_ptrs) has_data = true; + rcu_read_lock(); + for (i = 0; i < nr_replicas; i++) { EBUG_ON(bch_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev)); @@ -1510,10 +1519,12 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e, extent_ptr_append(e, tmp); ob->ptr_offset[i] += sectors; + + if ((ca = PTR_CACHE(c, &ob->ptrs[i]))) + this_cpu_add(*ca->sectors_written, sectors); } - open_bucket_for_each_online_device(c, ob, ptr, ca) - this_cpu_add(*ca->sectors_written, sectors); + rcu_read_unlock(); } /* @@ -1586,9 +1597,9 @@ struct open_bucket *bch_alloc_sectors(struct cache_set *c, /* Startup/shutdown (ro/rw): */ -static void bch_recalc_capacity(struct cache_set *c) +void bch_recalc_capacity(struct cache_set *c) { - struct cache_group *tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers); + struct bch_tier *fastest_tier = NULL, *slowest_tier = NULL, *tier; struct cache *ca; u64 total_capacity, capacity = 0, reserved_sectors = 0; unsigned long ra_pages = 0; @@ -1604,16 +1615,29 @@ static void bch_recalc_capacity(struct cache_set *c) c->bdi.ra_pages = ra_pages; + /* Find fastest, slowest tiers with devices: */ + + for (tier = c->tiers; + tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) { + if (!tier->devs.nr) + continue; + if (!fastest_tier) + fastest_tier = tier; + slowest_tier = tier; + } + + c->fastest_tier = fastest_tier != slowest_tier ? fastest_tier : NULL; + + c->promote_write_point.group = &fastest_tier->devs; + + if (!fastest_tier) + goto set_capacity; + /* * Capacity of the cache set is the capacity of all the devices in the * slowest (highest) tier - we don't include lower tier devices. 
*/ - for (tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers) - 1; - tier > c->cache_tiers && !tier->nr_devices; - --tier) - ; - - group_for_each_cache_rcu(ca, tier, i) { + group_for_each_cache_rcu(ca, &slowest_tier->devs, i) { size_t reserve = 0; /* @@ -1649,8 +1673,8 @@ static void bch_recalc_capacity(struct cache_set *c) ca->mi.first_bucket) << ca->bucket_bits; } +set_capacity: rcu_read_unlock(); - total_capacity = capacity; capacity *= (100 - c->opts.gc_reserve_percent); @@ -1727,7 +1751,7 @@ static bool bch_dev_has_open_write_point(struct cache *ca) void bch_dev_allocator_stop(struct cache *ca) { struct cache_set *c = ca->set; - struct cache_group *tier = &c->cache_tiers[ca->mi.tier]; + struct cache_group *tier = &c->tiers[ca->mi.tier].devs; struct task_struct *p; struct closure cl; unsigned i; @@ -1808,7 +1832,7 @@ void bch_dev_allocator_stop(struct cache *ca) int bch_dev_allocator_start(struct cache *ca) { struct cache_set *c = ca->set; - struct cache_group *tier = &c->cache_tiers[ca->mi.tier]; + struct cache_group *tier = &c->tiers[ca->mi.tier].devs; struct task_struct *k; /* @@ -1826,6 +1850,7 @@ int bch_dev_allocator_start(struct cache *ca) bch_dev_group_add(tier, ca); bch_dev_group_add(&c->cache_all, ca); + bch_dev_group_add(&c->journal.devs, ca); bch_recalc_capacity(c); @@ -1838,7 +1863,7 @@ int bch_dev_allocator_start(struct cache *ca) return 0; } -void bch_open_buckets_init(struct cache_set *c) +void bch_fs_allocator_init(struct cache_set *c) { unsigned i; @@ -1860,19 +1885,11 @@ void bch_open_buckets_init(struct cache_set *c) spin_lock_init(&c->cache_all.lock); - for (i = 0; i < ARRAY_SIZE(c->write_points); i++) { + for (i = 0; i < ARRAY_SIZE(c->tiers); i++) + spin_lock_init(&c->tiers[i].devs.lock); + + for (i = 0; i < ARRAY_SIZE(c->write_points); i++) c->write_points[i].throttle = true; - c->write_points[i].group = &c->cache_tiers[0]; - } - - for (i = 0; i < ARRAY_SIZE(c->cache_tiers); i++) - spin_lock_init(&c->cache_tiers[i].lock); - - c->promote_write_point.group = &c->cache_tiers[0]; - - c->migration_write_point.group = &c->cache_all; - - c->btree_write_point.group = &c->cache_all; c->pd_controllers_update_seconds = 5; INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); diff --git a/libbcache/alloc.h b/libbcache/alloc.h index 09139a59..9573dd2c 100644 --- a/libbcache/alloc.h +++ b/libbcache/alloc.h @@ -27,6 +27,8 @@ int bch_prio_read(struct cache *); void bch_recalc_min_prio(struct cache *, int); +size_t bch_bucket_alloc(struct cache *, enum alloc_reserve); + void bch_open_bucket_put(struct cache_set *, struct open_bucket *); struct open_bucket *bch_alloc_sectors_start(struct cache_set *, @@ -58,7 +60,7 @@ static inline struct cache *cache_group_next_rcu(struct cache_group *devs, { struct cache *ret = NULL; - while (*iter < devs->nr_devices && + while (*iter < devs->nr && !(ret = rcu_dereference(devs->d[*iter].dev))) (*iter)++; @@ -103,8 +105,9 @@ static inline struct cache *cache_group_next(struct cache_group *devs, ((_ca) = __open_bucket_next_online_device(_c, _ob, _ptr, _ca));\ (_ptr)++) +void bch_recalc_capacity(struct cache_set *); void bch_dev_allocator_stop(struct cache *); int bch_dev_allocator_start(struct cache *); -void bch_open_buckets_init(struct cache_set *); +void bch_fs_allocator_init(struct cache_set *); #endif /* _BCACHE_ALLOC_H */ diff --git a/libbcache/alloc_types.h b/libbcache/alloc_types.h index fbe8b75c..f408bd97 100644 --- a/libbcache/alloc_types.h +++ b/libbcache/alloc_types.h @@ -51,7 +51,7 @@ static inline bool 
allocation_is_metadata(enum alloc_reserve id) struct cache_group { spinlock_t lock; - unsigned nr_devices; + unsigned nr; unsigned cur_device; struct { u64 weight; diff --git a/libbcache/bcache.h b/libbcache/bcache.h index babc08db..5b668c71 100644 --- a/libbcache/bcache.h +++ b/libbcache/bcache.h @@ -464,24 +464,10 @@ struct cache { * BCH_FS_UNREGISTERING means we're not just shutting down, we're detaching * all the backing devices first (their cached data gets invalidated, and they * won't automatically reattach). - * - * BCH_FS_STOPPING always gets set first when we're closing down a cache set; - * we'll continue to run normally for awhile with BCH_FS_STOPPING set (i.e. - * flushing dirty data). - * - * BCH_FS_RUNNING means all cache devices have been registered and journal - * replay is complete. */ enum { - /* Startup: */ BCH_FS_INITIAL_GC_DONE, - BCH_FS_RUNNING, - - /* Shutdown: */ BCH_FS_DETACHING, - BCH_FS_STOPPING, - BCH_FS_RO, - BCH_FS_RO_COMPLETE, BCH_FS_EMERGENCY_RO, BCH_FS_WRITE_DISABLE_COMPLETE, BCH_FS_GC_STOPPING, @@ -498,6 +484,21 @@ struct btree_debug { struct dentry *failed; }; +struct bch_tier { + unsigned idx; + struct task_struct *migrate; + struct bch_pd_controller pd; + + struct cache_group devs; +}; + +enum bch_fs_state { + BCH_FS_STARTING = 0, + BCH_FS_STOPPING, + BCH_FS_RO, + BCH_FS_RW, +}; + struct cache_set { struct closure cl; @@ -506,7 +507,6 @@ struct cache_set { struct kobject internal; struct kobject opts_dir; struct kobject time_stats; - struct completion *stop_completion; unsigned long flags; int minor; @@ -514,6 +514,10 @@ struct cache_set { struct super_block *vfs_sb; char name[40]; + /* ro/rw, add/remove devices: */ + struct mutex state_lock; + enum bch_fs_state state; + /* Counts outstanding writes, for clean transition to read-only */ struct percpu_ref writes; struct work_struct read_only_work; @@ -640,7 +644,9 @@ struct cache_set { * allocate from: */ struct cache_group cache_all; - struct cache_group cache_tiers[BCH_TIER_MAX]; + struct bch_tier tiers[BCH_TIER_MAX]; + /* NULL if we only have devices in one tier: */ + struct bch_tier *fastest_tier; u64 capacity; /* sectors */ @@ -753,10 +759,6 @@ struct cache_set { unsigned writeback_pages_max; atomic_long_t nr_inodes; - /* TIERING */ - struct task_struct *tiering_read; - struct bch_pd_controller tiering_pd; - /* NOTIFICATIONS */ struct mutex uevent_lock; struct kobj_uevent_env uevent_env; @@ -828,6 +830,11 @@ struct cache_set { #undef BCH_TIME_STAT }; +static inline bool bch_fs_running(struct cache_set *c) +{ + return c->state == BCH_FS_RO || c->state == BCH_FS_RW; +} + static inline unsigned bucket_pages(const struct cache *ca) { return ca->mi.bucket_size / PAGE_SECTORS; diff --git a/libbcache/blockdev.c b/libbcache/blockdev.c index 82b07f59..ba2e9a8c 100644 --- a/libbcache/blockdev.c +++ b/libbcache/blockdev.c @@ -375,6 +375,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) bool found; int ret; + lockdep_assert_held(&c->state_lock); + bdevname(dc->disk_sb.bdev, buf); if (memcmp(&dc->disk_sb.sb->set_uuid, @@ -387,11 +389,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) return -EINVAL; } - if (!test_bit(BCH_FS_RUNNING, &c->flags)) - return 0; - - if (test_bit(BCH_FS_STOPPING, &c->flags)) { - pr_err("Can't attach %s: shutting down", buf); + if (!bch_fs_running(c)) { + pr_err("Can't attach %s: not running", buf); return -EINVAL; } @@ -497,6 +496,7 @@ void bch_attach_backing_devs(struct cache_set *c) struct cached_dev *dc, *t; 
lockdep_assert_held(&bch_register_lock); + lockdep_assert_held(&c->state_lock); list_for_each_entry_safe(dc, t, &uncached_devices, list) bch_cached_dev_attach(dc, c); @@ -742,7 +742,7 @@ int bch_blockdev_volumes_start(struct cache_set *c) struct bkey_s_c_inode_blockdev inode; int ret = 0; - if (test_bit(BCH_FS_STOPPING, &c->flags)) + if (!bch_fs_running(c)) return -EINVAL; for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, k) { diff --git a/libbcache/btree_cache.c b/libbcache/btree_cache.c index 4d5efdbd..4d0c6d4d 100644 --- a/libbcache/btree_cache.c +++ b/libbcache/btree_cache.c @@ -11,8 +11,9 @@ #define DEF_BTREE_ID(kwd, val, name) name, -const char *bch_btree_id_names[BTREE_ID_NR] = { +const char * const bch_btree_ids[] = { DEFINE_BCH_BTREE_IDS() + NULL }; #undef DEF_BTREE_ID @@ -311,7 +312,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink, return mca_can_free(c) * btree_pages(c); } -void bch_btree_cache_free(struct cache_set *c) +void bch_fs_btree_exit(struct cache_set *c) { struct btree *b; unsigned i; @@ -358,7 +359,7 @@ void bch_btree_cache_free(struct cache_set *c) rhashtable_destroy(&c->btree_cache_table); } -int bch_btree_cache_alloc(struct cache_set *c) +int bch_fs_btree_init(struct cache_set *c) { unsigned i; int ret; diff --git a/libbcache/btree_cache.h b/libbcache/btree_cache.h index c26489d1..4d67704b 100644 --- a/libbcache/btree_cache.h +++ b/libbcache/btree_cache.h @@ -6,7 +6,7 @@ struct btree_iter; -extern const char *bch_btree_id_names[BTREE_ID_NR]; +extern const char * const bch_btree_ids[]; void bch_recalc_btree_reserve(struct cache_set *); @@ -22,8 +22,8 @@ struct btree *mca_alloc(struct cache_set *); struct btree *bch_btree_node_get(struct btree_iter *, const struct bkey_i *, unsigned, enum six_lock_type); -void bch_btree_cache_free(struct cache_set *); -int bch_btree_cache_alloc(struct cache_set *); +void bch_fs_btree_exit(struct cache_set *); +int bch_fs_btree_init(struct cache_set *); #define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ for ((_tbl) = rht_dereference_rcu((_c)->btree_cache_table.tbl, \ diff --git a/libbcache/btree_gc.c b/libbcache/btree_gc.c index 0eb7290c..b90807f7 100644 --- a/libbcache/btree_gc.c +++ b/libbcache/btree_gc.c @@ -262,30 +262,72 @@ static void bch_mark_allocator_buckets(struct cache_set *c) } } +static void mark_metadata_sectors(struct cache *ca, u64 start, u64 end, + enum bucket_data_type type) +{ + u64 b = start >> ca->bucket_bits; + + do { + bch_mark_metadata_bucket(ca, ca->buckets + b, type, true); + b++; + } while (b < end >> ca->bucket_bits); +} + /* * Mark non btree metadata - prios, journal */ +static void bch_mark_dev_metadata(struct cache_set *c, struct cache *ca) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + unsigned i; + u64 b; + + /* Mark superblocks: */ + for (i = 0; i < layout->nr_superblocks; i++) { + if (layout->sb_offset[i] == BCH_SB_SECTOR) + mark_metadata_sectors(ca, 0, BCH_SB_SECTOR, + BUCKET_SB); + + mark_metadata_sectors(ca, + layout->sb_offset[i], + layout->sb_offset[i] + + (1 << layout->sb_max_size_bits), + BUCKET_SB); + } + + spin_lock(&c->journal.lock); + + for (i = 0; i < ca->journal.nr; i++) { + b = ca->journal.buckets[i]; + bch_mark_metadata_bucket(ca, ca->buckets + b, + BUCKET_JOURNAL, true); + } + + spin_unlock(&c->journal.lock); + + spin_lock(&ca->prio_buckets_lock); + + for (i = 0; i < prio_buckets(ca) * 2; i++) { + b = ca->prio_buckets[i]; + if (b) + bch_mark_metadata_bucket(ca, ca->buckets + b, + BUCKET_PRIOS, true); + } + + 
spin_unlock(&ca->prio_buckets_lock); +} + static void bch_mark_metadata(struct cache_set *c) { struct cache *ca; - unsigned i, j; - u64 b; + unsigned i; - for_each_cache(ca, c, i) { - for (j = 0; j < ca->journal.nr; j++) { - b = ca->journal.buckets[j]; - bch_mark_metadata_bucket(ca, ca->buckets + b, true); - } + mutex_lock(&c->sb_lock); - spin_lock(&ca->prio_buckets_lock); + for_each_cache(ca, c, i) + bch_mark_dev_metadata(c, ca); - for (j = 0; j < prio_buckets(ca) * 2; j++) { - b = ca->prio_buckets[j]; - bch_mark_metadata_bucket(ca, ca->buckets + b, true); - } - - spin_unlock(&ca->prio_buckets_lock); - } + mutex_unlock(&c->sb_lock); } /* Also see bch_pending_btree_node_free_insert_done() */ @@ -389,7 +431,7 @@ void bch_gc(struct cache_set *c) for_each_bucket(g, ca) { bucket_cmpxchg(g, new, ({ new.owned_by_allocator = 0; - new.is_metadata = 0; + new.data_type = 0; new.cached_sectors = 0; new.dirty_sectors = 0; })); @@ -750,9 +792,6 @@ void bch_coalesce(struct cache_set *c) u64 start_time; enum btree_id id; - if (btree_gc_coalesce_disabled(c)) - return; - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) return; @@ -811,7 +850,8 @@ static int bch_gc_thread(void *arg) last_kick = atomic_read(&c->kick_gc); bch_gc(c); - bch_coalesce(c); + if (!btree_gc_coalesce_disabled(c)) + bch_coalesce(c); debug_check_no_locks_held(); } @@ -823,18 +863,24 @@ void bch_gc_thread_stop(struct cache_set *c) { set_bit(BCH_FS_GC_STOPPING, &c->flags); - if (!IS_ERR_OR_NULL(c->gc_thread)) + if (c->gc_thread) kthread_stop(c->gc_thread); + + c->gc_thread = NULL; + clear_bit(BCH_FS_GC_STOPPING, &c->flags); } int bch_gc_thread_start(struct cache_set *c) { - clear_bit(BCH_FS_GC_STOPPING, &c->flags); + struct task_struct *p; - c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc"); - if (IS_ERR(c->gc_thread)) - return PTR_ERR(c->gc_thread); + BUG_ON(c->gc_thread); + p = kthread_create(bch_gc_thread, c, "bcache_gc"); + if (IS_ERR(p)) + return PTR_ERR(p); + + c->gc_thread = p; wake_up_process(c->gc_thread); return 0; } @@ -883,12 +929,13 @@ int bch_initial_gc(struct cache_set *c, struct list_head *journal) { enum btree_id id; - if (journal) { - for (id = 0; id < BTREE_ID_NR; id++) - bch_initial_gc_btree(c, id); + bch_mark_metadata(c); + for (id = 0; id < BTREE_ID_NR; id++) + bch_initial_gc_btree(c, id); + + if (journal) bch_journal_mark(c, journal); - } /* * Skip past versions that might have possibly been used (as nonces), @@ -897,8 +944,6 @@ int bch_initial_gc(struct cache_set *c, struct list_head *journal) if (c->sb.encryption_type) atomic64_add(1 << 16, &c->key_version); - bch_mark_metadata(c); - gc_pos_set(c, gc_phase(GC_PHASE_DONE)); set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); diff --git a/libbcache/buckets.c b/libbcache/buckets.c index 315cfbec..ec4ee54a 100644 --- a/libbcache/buckets.c +++ b/libbcache/buckets.c @@ -66,6 +66,7 @@ #include "alloc.h" #include "btree_gc.h" #include "buckets.h" +#include "error.h" #include <linux/preempt.h> #include <trace/events/bcache.h> @@ -102,6 +103,10 @@ static void bch_fs_stats_verify(struct cache_set *c) {} #endif +/* + * Clear journal_seq_valid for buckets for which it's not needed, to prevent + * wraparound: + */ void bch_bucket_seq_cleanup(struct cache_set *c) { u16 last_seq_ondisk = c->journal.last_seq_ondisk; @@ -113,12 +118,11 @@ void bch_bucket_seq_cleanup(struct cache_set *c) for_each_cache(ca, c, i) for_each_bucket(g, ca) { bucket_cmpxchg(g, m, ({ - if (!m.wait_on_journal || - ((s16) last_seq_ondisk - - (s16) m.journal_seq < 0)) + if (!m.journal_seq_valid || + 
bucket_needs_journal_commit(m, last_seq_ondisk)) break; - m.wait_on_journal = 0; + m.journal_seq_valid = 0; })); } } @@ -186,17 +190,18 @@ bch_bucket_stats_read_cache_set(struct cache_set *c) static inline int is_meta_bucket(struct bucket_mark m) { - return !m.owned_by_allocator && m.is_metadata; + return m.data_type != BUCKET_DATA; } static inline int is_dirty_bucket(struct bucket_mark m) { - return !m.owned_by_allocator && !m.is_metadata && !!m.dirty_sectors; + return m.data_type == BUCKET_DATA && !!m.dirty_sectors; } static inline int is_cached_bucket(struct bucket_mark m) { - return !m.owned_by_allocator && !m.dirty_sectors && !!m.cached_sectors; + return m.data_type == BUCKET_DATA && + !m.dirty_sectors && !!m.cached_sectors; } void bch_fs_stats_apply(struct cache_set *c, @@ -236,29 +241,37 @@ void bch_fs_stats_apply(struct cache_set *c, memset(stats, 0, sizeof(*stats)); } +static bool bucket_became_unavailable(struct cache_set *c, + struct bucket_mark old, + struct bucket_mark new) +{ + return is_available_bucket(old) && + !is_available_bucket(new) && + c->gc_pos.phase == GC_PHASE_DONE; +} + static void bucket_stats_update(struct cache *ca, struct bucket_mark old, struct bucket_mark new, - bool may_make_unavailable, struct bucket_stats_cache_set *bch_alloc_stats) { struct cache_set *c = ca->set; struct bucket_stats_cache *cache_stats; - BUG_ON(!may_make_unavailable && - is_available_bucket(old) && - !is_available_bucket(new) && - c->gc_pos.phase == GC_PHASE_DONE); + bch_fs_inconsistent_on(old.data_type && new.data_type && + old.data_type != new.data_type, c, + "different types of metadata in same bucket: %u, %u", + old.data_type, new.data_type); if (bch_alloc_stats) { bch_alloc_stats->s[S_COMPRESSED][S_CACHED] += (int) new.cached_sectors - (int) old.cached_sectors; bch_alloc_stats->s[S_COMPRESSED] - [old.is_metadata ? S_META : S_DIRTY] -= + [is_meta_bucket(old) ? S_META : S_DIRTY] -= old.dirty_sectors; bch_alloc_stats->s[S_COMPRESSED] - [new.is_metadata ? S_META : S_DIRTY] += + [is_meta_bucket(new) ? 
S_META : S_DIRTY] += new.dirty_sectors; } @@ -268,12 +281,12 @@ static void bucket_stats_update(struct cache *ca, cache_stats->sectors_cached += (int) new.cached_sectors - (int) old.cached_sectors; - if (old.is_metadata) + if (is_meta_bucket(old)) cache_stats->sectors_meta -= old.dirty_sectors; else cache_stats->sectors_dirty -= old.dirty_sectors; - if (new.is_metadata) + if (is_meta_bucket(new)) cache_stats->sectors_meta += new.dirty_sectors; else cache_stats->sectors_dirty += new.dirty_sectors; @@ -290,6 +303,15 @@ static void bucket_stats_update(struct cache *ca, bch_wake_allocator(ca); } +#define bucket_data_cmpxchg(ca, g, new, expr) \ +({ \ + struct bucket_stats_cache_set _stats = { 0 }; \ + struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \ + \ + bucket_stats_update(ca, _old, new, &_stats); \ + _old; \ +}) + void bch_invalidate_bucket(struct cache *ca, struct bucket *g) { struct bucket_stats_cache_set stats = { 0 }; @@ -297,16 +319,17 @@ void bch_invalidate_bucket(struct cache *ca, struct bucket *g) old = bucket_cmpxchg(g, new, ({ new.owned_by_allocator = 1; - new.is_metadata = 0; + new.had_metadata = 0; + new.data_type = 0; new.cached_sectors = 0; new.dirty_sectors = 0; new.copygc = 0; new.gen++; })); - BUG_ON(old.dirty_sectors); + bucket_stats_update(ca, old, new, &stats); - bucket_stats_update(ca, old, new, true, &stats); + BUG_ON(old.dirty_sectors); /* * Ick: @@ -329,45 +352,45 @@ void bch_invalidate_bucket(struct cache *ca, struct bucket *g) void bch_mark_free_bucket(struct cache *ca, struct bucket *g) { - struct bucket_stats_cache_set stats = { 0 }; struct bucket_mark old, new; - old = bucket_cmpxchg(g, new, ({ + old = bucket_data_cmpxchg(ca, g, new, ({ new.owned_by_allocator = 0; - new.is_metadata = 0; + new.data_type = 0; new.cached_sectors = 0; new.dirty_sectors = 0; })); - bucket_stats_update(ca, old, new, false, &stats); + BUG_ON(bucket_became_unavailable(ca->set, old, new)); } void bch_mark_alloc_bucket(struct cache *ca, struct bucket *g, bool owned_by_allocator) { - struct bucket_stats_cache_set stats = { 0 }; - struct bucket_mark old, new; + struct bucket_mark new; - old = bucket_cmpxchg(g, new, new.owned_by_allocator = owned_by_allocator); - - bucket_stats_update(ca, old, new, true, &stats); + bucket_data_cmpxchg(ca, g, new, ({ + new.owned_by_allocator = owned_by_allocator; + })); } void bch_mark_metadata_bucket(struct cache *ca, struct bucket *g, + enum bucket_data_type type, bool may_make_unavailable) { - struct bucket_stats_cache_set stats = { 0 }; struct bucket_mark old, new; - old = bucket_cmpxchg(g, new, ({ - new.is_metadata = 1; + BUG_ON(!type); + + old = bucket_data_cmpxchg(ca, g, new, ({ + new.data_type = type; new.had_metadata = 1; })); BUG_ON(old.cached_sectors); BUG_ON(old.dirty_sectors); - - bucket_stats_update(ca, old, new, may_make_unavailable, &stats); + BUG_ON(!may_make_unavailable && + bucket_became_unavailable(ca->set, old, new)); } #define saturated_add(ca, dst, src, max) \ @@ -487,22 +510,26 @@ static void bch_mark_pointer(struct cache_set *c, if (!new.dirty_sectors && !new.cached_sectors) { - new.is_metadata = false; + new.data_type = 0; if (journal_seq) { - new.wait_on_journal = true; + new.journal_seq_valid = 1; new.journal_seq = journal_seq; } } else { - new.is_metadata = (type == S_META); + new.data_type = type == S_META + ? 
BUCKET_BTREE : BUCKET_DATA; } - new.had_metadata |= new.is_metadata; + new.had_metadata |= is_meta_bucket(new); } while ((v = cmpxchg(&g->_mark.counter, old.counter, new.counter)) != old.counter); - bucket_stats_update(ca, old, new, may_make_unavailable, NULL); + bucket_stats_update(ca, old, new, NULL); + + BUG_ON(!may_make_unavailable && + bucket_became_unavailable(c, old, new)); if (saturated && atomic_long_add_return(saturated, diff --git a/libbcache/buckets.h b/libbcache/buckets.h index 9c6e4385..6d70103e 100644 --- a/libbcache/buckets.h +++ b/libbcache/buckets.h @@ -235,8 +235,16 @@ static inline u64 sectors_available(struct cache_set *c) static inline bool is_available_bucket(struct bucket_mark mark) { return (!mark.owned_by_allocator && - !mark.is_metadata && - !mark.dirty_sectors); + mark.data_type == BUCKET_DATA && + !mark.dirty_sectors && + !mark.nouse); +} + +static inline bool bucket_needs_journal_commit(struct bucket_mark m, + u16 last_seq_ondisk) +{ + return m.journal_seq_valid && + ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); } void bch_bucket_seq_cleanup(struct cache_set *); @@ -244,7 +252,8 @@ void bch_bucket_seq_cleanup(struct cache_set *); void bch_invalidate_bucket(struct cache *, struct bucket *); void bch_mark_free_bucket(struct cache *, struct bucket *); void bch_mark_alloc_bucket(struct cache *, struct bucket *, bool); -void bch_mark_metadata_bucket(struct cache *, struct bucket *, bool); +void bch_mark_metadata_bucket(struct cache *, struct bucket *, + enum bucket_data_type, bool); void __bch_gc_mark_key(struct cache_set *, struct bkey_s_c, s64, bool, struct bucket_stats_cache_set *); diff --git a/libbcache/buckets_types.h b/libbcache/buckets_types.h index 6bbdcd26..f42e09d8 100644 --- a/libbcache/buckets_types.h +++ b/libbcache/buckets_types.h @@ -1,6 +1,14 @@ #ifndef _BUCKETS_TYPES_H #define _BUCKETS_TYPES_H +enum bucket_data_type { + BUCKET_DATA = 0, + BUCKET_BTREE, + BUCKET_PRIOS, + BUCKET_JOURNAL, + BUCKET_SB, +}; + struct bucket_mark { union { struct { @@ -12,23 +20,30 @@ struct bucket_mark { /* generation copygc is going to move this bucket into */ unsigned copygc:1; - unsigned wait_on_journal:1; + + unsigned journal_seq_valid:1; /* - * If this bucket ever had metadata in it, the allocator must - * increment its gen before we reuse it: + * If this bucket had metadata while at the current generation + * number, the allocator must increment its gen before we reuse + * it: */ unsigned had_metadata:1; unsigned owned_by_allocator:1; - unsigned is_metadata:1; - u16 cached_sectors; + unsigned data_type:3; + + unsigned nouse:1; + u16 dirty_sectors; + u16 cached_sectors; /* * low bits of journal sequence number when this bucket was most - * recently modified: + * recently modified: if journal_seq_valid is set, this bucket + * can't be reused until the journal sequence number written to + * disk is >= the bucket's journal sequence number: */ u16 journal_seq; }; diff --git a/libbcache/chardev.c b/libbcache/chardev.c index b142d7b2..049aa910 100644 --- a/libbcache/chardev.c +++ b/libbcache/chardev.c @@ -107,7 +107,7 @@ static long bch_global_ioctl(unsigned cmd, void __user *arg) static long bch_ioctl_stop(struct cache_set *c) { - bch_fs_stop(c); + bch_fs_stop_async(c); return 0; } diff --git a/libbcache/checksum.c b/libbcache/checksum.c index dae52d49..92036db4 100644 --- a/libbcache/checksum.c +++ b/libbcache/checksum.c @@ -539,15 +539,12 @@ int bch_enable_encryption(struct cache_set *c, bool keyed) if (ret) goto err; - crypt = 
container_of_or_null(bch_fs_sb_field_resize(c, NULL, - sizeof(*crypt) / sizeof(u64)), - struct bch_sb_field_crypt, field); + crypt = bch_fs_sb_resize_crypt(c, sizeof(*crypt) / sizeof(u64)); if (!crypt) { ret = -ENOMEM; /* XXX this technically could be -ENOSPC */ goto err; } - crypt->field.type = BCH_SB_FIELD_crypt; crypt->key = key; /* write superblock */ @@ -560,7 +557,7 @@ err: return ret; } -void bch_fs_encryption_free(struct cache_set *c) +void bch_fs_encryption_exit(struct cache_set *c) { if (!IS_ERR_OR_NULL(c->poly1305)) crypto_free_shash(c->poly1305); diff --git a/libbcache/checksum.h b/libbcache/checksum.h index 137c9155..9d4da08d 100644 --- a/libbcache/checksum.h +++ b/libbcache/checksum.h @@ -43,7 +43,7 @@ void bch_encrypt_bio(struct cache_set *, unsigned, int bch_disable_encryption(struct cache_set *); int bch_enable_encryption(struct cache_set *, bool); -void bch_fs_encryption_free(struct cache_set *); +void bch_fs_encryption_exit(struct cache_set *); int bch_fs_encryption_init(struct cache_set *); static inline unsigned bch_data_checksum_type(struct cache_set *c) diff --git a/libbcache/compress.c b/libbcache/compress.c index f81a8143..89da31e5 100644 --- a/libbcache/compress.c +++ b/libbcache/compress.c @@ -434,10 +434,10 @@ int bch_check_set_has_compressed_data(struct cache_set *c, break; } - return bch_compress_init(c); + return bch_fs_compress_init(c); } -void bch_compress_free(struct cache_set *c) +void bch_fs_compress_exit(struct cache_set *c) { vfree(c->zlib_workspace); mempool_exit(&c->lz4_workspace_pool); @@ -450,15 +450,11 @@ void bch_compress_free(struct cache_set *c) max_t(size_t, zlib_inflate_workspacesize(), \ zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL)) -int bch_compress_init(struct cache_set *c) +int bch_fs_compress_init(struct cache_set *c) { unsigned order = get_order(BCH_ENCODED_EXTENT_MAX << 9); int ret, cpu; - if (!bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) && - !bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) - return 0; - if (!c->bio_decompress_worker) { c->bio_decompress_worker = alloc_percpu(*c->bio_decompress_worker); if (!c->bio_decompress_worker) @@ -474,6 +470,10 @@ int bch_compress_init(struct cache_set *c) } } + if (!bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) && + !bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) + return 0; + if (!mempool_initialized(&c->compression_bounce[READ])) { ret = mempool_init_page_pool(&c->compression_bounce[READ], 1, order); diff --git a/libbcache/compress.h b/libbcache/compress.h index 485acd95..4604b065 100644 --- a/libbcache/compress.h +++ b/libbcache/compress.h @@ -9,7 +9,7 @@ void bch_bio_compress(struct cache_set *, struct bio *, size_t *, struct bio *, size_t *, unsigned *); int bch_check_set_has_compressed_data(struct cache_set *, unsigned); -void bch_compress_free(struct cache_set *); -int bch_compress_init(struct cache_set *); +void bch_fs_compress_exit(struct cache_set *); +int bch_fs_compress_init(struct cache_set *); #endif /* _BCACHE_COMPRESS_H */ diff --git a/libbcache/debug.c b/libbcache/debug.c index d25c32ae..16cc72b9 100644 --- a/libbcache/debug.c +++ b/libbcache/debug.c @@ -409,13 +409,13 @@ static const struct file_operations bfloat_failed_debug_ops = { .read = bch_read_bfloat_failed, }; -void bch_debug_exit_cache_set(struct cache_set *c) +void bch_fs_debug_exit(struct cache_set *c) { if (!IS_ERR_OR_NULL(c->debug)) debugfs_remove_recursive(c->debug); } -void bch_debug_init_cache_set(struct cache_set *c) +void bch_fs_debug_init(struct cache_set *c) { struct btree_debug *bd; 
char name[100]; @@ -432,18 +432,18 @@ void bch_debug_init_cache_set(struct cache_set *c) bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); bd++) { bd->id = bd - c->btree_debug; - bd->btree = debugfs_create_file(bch_btree_id_names[bd->id], + bd->btree = debugfs_create_file(bch_btree_ids[bd->id], 0400, c->debug, bd, &btree_debug_ops); snprintf(name, sizeof(name), "%s-formats", - bch_btree_id_names[bd->id]); + bch_btree_ids[bd->id]); bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd, &btree_format_debug_ops); snprintf(name, sizeof(name), "%s-bfloat-failed", - bch_btree_id_names[bd->id]); + bch_btree_ids[bd->id]); bd->failed = debugfs_create_file(name, 0400, c->debug, bd, &bfloat_failed_debug_ops); diff --git a/libbcache/debug.h b/libbcache/debug.h index a3635e60..d34a95a0 100644 --- a/libbcache/debug.h +++ b/libbcache/debug.h @@ -52,11 +52,11 @@ static inline void bch_btree_verify(struct cache_set *c, struct btree *b) } #ifdef CONFIG_DEBUG_FS -void bch_debug_exit_cache_set(struct cache_set *); -void bch_debug_init_cache_set(struct cache_set *); +void bch_fs_debug_exit(struct cache_set *); +void bch_fs_debug_init(struct cache_set *); #else -static inline void bch_debug_exit_cache_set(struct cache_set *c) {} -static inline void bch_debug_init_cache_set(struct cache_set *c) {} +static inline void bch_fs_debug_exit(struct cache_set *c) {} +static inline void bch_fs_debug_init(struct cache_set *c) {} #endif void bch_debug_exit(void); diff --git a/libbcache/error.c b/libbcache/error.c index 9f39be1b..f4109da6 100644 --- a/libbcache/error.c +++ b/libbcache/error.c @@ -14,7 +14,7 @@ void bch_inconsistent_error(struct cache_set *c) case BCH_ON_ERROR_RO: if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { /* XXX do something better here? */ - bch_fs_stop(c); + bch_fs_stop_async(c); return; } @@ -120,7 +120,7 @@ void bch_nonfatal_io_error_work(struct work_struct *work) } else { bch_notify_dev_error(ca, true); - mutex_lock(&bch_register_lock); + mutex_lock(&c->state_lock); dev = bch_dev_may_remove(ca); if (dev ? bch_dev_read_only(ca) @@ -129,7 +129,7 @@ void bch_nonfatal_io_error_work(struct work_struct *work) "too many IO errors on %s, setting %s RO", bdevname(ca->disk_sb.bdev, buf), dev ? 
"device" : "filesystem"); - mutex_unlock(&bch_register_lock); + mutex_unlock(&c->state_lock); } } diff --git a/libbcache/extents.c b/libbcache/extents.c index 523f3f48..c5e0e375 100644 --- a/libbcache/extents.c +++ b/libbcache/extents.c @@ -547,7 +547,7 @@ static void btree_ptr_debugcheck(struct cache_set *c, struct btree *b, do { seq = read_seqcount_begin(&c->gc_pos_lock); bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && - !g->mark.is_metadata; + g->mark.data_type != BUCKET_BTREE; } while (read_seqcount_retry(&c->gc_pos_lock, seq)); err = "inconsistent"; @@ -602,6 +602,7 @@ bch_btree_pick_ptr(struct cache_set *c, const struct btree *b) struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); const union bch_extent_crc *crc; const struct bch_extent_ptr *ptr; + struct extent_pick_ptr pick = { .ca = NULL }; struct cache *ca; rcu_read_lock(); @@ -621,15 +622,19 @@ bch_btree_pick_ptr(struct cache_set *c, const struct btree *b) PTR_BUCKET_NR(ca, ptr))) continue; - percpu_ref_get(&ca->ref); - rcu_read_unlock(); + if (pick.ca && pick.ca->mi.tier < ca->mi.tier) + continue; - return (struct extent_pick_ptr) { .ptr = *ptr, .ca = ca }; + pick.ca = ca; + pick.ptr = *ptr; } + if (pick.ca) + percpu_ref_get(&pick.ca->ref); + rcu_read_unlock(); - return (struct extent_pick_ptr) { .ca = NULL, }; + return pick; } const struct bkey_ops bch_bkey_btree_ops = { @@ -1880,7 +1885,7 @@ static void bch_extent_debugcheck_extent(struct cache_set *c, struct btree *b, if (stale) break; - bad = (mark.is_metadata || + bad = (mark.data_type != BUCKET_DATA || (gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && !mark.owned_by_allocator && !(ptr->cached @@ -2193,17 +2198,21 @@ void bch_extent_pick_ptr_avoiding(struct cache_set *c, struct bkey_s_c k, rcu_read_lock(); ret->ca = NULL; - extent_for_each_online_device_crc(c, e, crc, ptr, ca) - if (!ptr_stale(ca, ptr)) { - *ret = (struct extent_pick_ptr) { - .crc = crc_to_128(e.k, crc), - .ptr = *ptr, - .ca = ca, - }; + extent_for_each_online_device_crc(c, e, crc, ptr, ca) { + if (ptr_stale(ca, ptr)) + continue; - if (ca != avoid) - break; - } + if (ret->ca && + (ca == avoid || + ret->ca->mi.tier < ca->mi.tier)) + continue; + + *ret = (struct extent_pick_ptr) { + .crc = crc_to_128(e.k, crc), + .ptr = *ptr, + .ca = ca, + }; + } if (ret->ca) percpu_ref_get(&ret->ca->ref); diff --git a/libbcache/fs-gc.c b/libbcache/fs-gc.c index e9585fd5..e2f1427f 100644 --- a/libbcache/fs-gc.c +++ b/libbcache/fs-gc.c @@ -545,9 +545,9 @@ struct nlink { u32 dir_count; }; -DECLARE_GENRADIX_TYPE(nlinks, struct nlink); +typedef GENRADIX(struct nlink) nlink_table; -static void inc_link(struct cache_set *c, struct nlinks *links, +static void inc_link(struct cache_set *c, nlink_table *links, u64 range_start, u64 *range_end, u64 inum, bool dir) { @@ -570,7 +570,7 @@ static void inc_link(struct cache_set *c, struct nlinks *links, } noinline_for_stack -static int bch_gc_walk_dirents(struct cache_set *c, struct nlinks *links, +static int bch_gc_walk_dirents(struct cache_set *c, nlink_table *links, u64 range_start, u64 *range_end) { struct btree_iter iter; @@ -776,7 +776,7 @@ fsck_err: noinline_for_stack static int bch_gc_walk_inodes(struct cache_set *c, struct bch_inode_unpacked *lostfound_inode, - struct nlinks *links, + nlink_table *links, u64 range_start, u64 range_end) { struct btree_iter iter; @@ -850,7 +850,7 @@ noinline_for_stack static int check_inode_nlinks(struct cache_set *c, struct bch_inode_unpacked *lostfound_inode) { - struct nlinks links; + nlink_table links; u64 this_iter_range_start, 
next_iter_range_start = 0; int ret = 0; diff --git a/libbcache/fs.c b/libbcache/fs.c index ab0d9728..ec70a3e3 100644 --- a/libbcache/fs.c +++ b/libbcache/fs.c @@ -1257,13 +1257,17 @@ static struct cache_set *bch_open_as_blockdevs(const char *_dev_name, if (!c) goto err_unlock; - if (!test_bit(BCH_FS_RUNNING, &c->flags)) { + mutex_lock(&c->state_lock); + + if (!bch_fs_running(c)) { + mutex_unlock(&c->state_lock); err = "incomplete cache set"; c = NULL; goto err_unlock; } closure_get(&c->cl); + mutex_unlock(&c->state_lock); mutex_unlock(&bch_register_lock); } @@ -1291,22 +1295,19 @@ static int bch_remount(struct super_block *sb, int *flags, char *data) if (ret) return ret; - mutex_lock(&bch_register_lock); - if (opts.read_only >= 0 && opts.read_only != c->opts.read_only) { const char *err = NULL; if (opts.read_only) { - bch_fs_read_only_sync(c); + bch_fs_read_only(c); sb->s_flags |= MS_RDONLY; } else { err = bch_fs_read_write(c); if (err) { bch_err(c, "error going rw: %s", err); - ret = -EINVAL; - goto unlock; + return -EINVAL; } sb->s_flags &= ~MS_RDONLY; @@ -1318,9 +1319,6 @@ static int bch_remount(struct super_block *sb, int *flags, char *data) if (opts.errors >= 0) c->opts.errors = opts.errors; -unlock: - mutex_unlock(&bch_register_lock); - return ret; } @@ -1449,7 +1447,7 @@ static void bch_kill_sb(struct super_block *sb) generic_shutdown_super(sb); if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags)) - bch_fs_stop_sync(c); + bch_fs_stop(c); else closure_put(&c->cl); } @@ -1464,7 +1462,7 @@ static struct file_system_type bcache_fs_type = { MODULE_ALIAS_FS("bcache"); -void bch_fs_exit(void) +void bch_vfs_exit(void) { unregister_filesystem(&bcache_fs_type); if (bch_dio_write_bioset) @@ -1477,7 +1475,7 @@ void bch_fs_exit(void) kmem_cache_destroy(bch_inode_cache); } -int __init bch_fs_init(void) +int __init bch_vfs_init(void) { int ret = -ENOMEM; @@ -1504,6 +1502,6 @@ int __init bch_fs_init(void) return 0; err: - bch_fs_exit(); + bch_vfs_exit(); return ret; } diff --git a/libbcache/fs.h b/libbcache/fs.h index 933fb6de..2a29b132 100644 --- a/libbcache/fs.h +++ b/libbcache/fs.h @@ -52,13 +52,13 @@ int __must_check __bch_write_inode(struct cache_set *, struct bch_inode_info *, int __must_check bch_write_inode(struct cache_set *, struct bch_inode_info *); -void bch_fs_exit(void); -int bch_fs_init(void); +void bch_vfs_exit(void); +int bch_vfs_init(void); #else -static inline void bch_fs_exit(void) {} -static inline int bch_fs_init(void) { return 0; } +static inline void bch_vfs_exit(void) {} +static inline int bch_vfs_init(void) { return 0; } #endif diff --git a/libbcache/io.c b/libbcache/io.c index be99a973..a3df3794 100644 --- a/libbcache/io.c +++ b/libbcache/io.c @@ -722,9 +722,7 @@ void bch_wake_delayed_writes(unsigned long data) spin_lock_irqsave(&c->foreground_write_pd_lock, flags); while ((op = c->write_wait_head)) { - if (!test_bit(BCH_FS_RO, &c->flags) && - !test_bit(BCH_FS_STOPPING, &c->flags) && - time_after(op->expires, jiffies)) { + if (time_after(op->expires, jiffies)) { mod_timer(&c->foreground_write_wakeup, op->expires); break; } @@ -1068,9 +1066,7 @@ static void __bch_read_endio(struct cache_set *c, struct bch_read_bio *rbio) return; } - if (rbio->promote && - !test_bit(BCH_FS_RO, &c->flags) && - !test_bit(BCH_FS_STOPPING, &c->flags)) { + if (rbio->promote) { struct cache_promote_op *promote = rbio->promote; struct closure *cl = &promote->cl; @@ -1133,13 +1129,26 @@ static void bch_read_endio(struct bio *bio) preempt_disable(); d = this_cpu_ptr(c->bio_decompress_worker); 
llist_add(&rbio->list, &d->bio_list); - queue_work(system_unbound_wq, &d->work); + queue_work(system_highpri_wq, &d->work); preempt_enable(); } else { __bch_read_endio(c, rbio); } } +static bool should_promote(struct cache_set *c, + struct extent_pick_ptr *pick, unsigned flags) +{ + if (!(flags & BCH_READ_PROMOTE)) + return false; + + if (percpu_ref_is_dying(&c->writes)) + return false; + + return c->fastest_tier && + c->fastest_tier < c->tiers + pick->ca->mi.tier; +} + void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig, struct bvec_iter iter, struct bkey_s_c k, struct extent_pick_ptr *pick, unsigned flags) @@ -1158,7 +1167,7 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig, * XXX: multiple promotes can race with each other, wastefully. Keep a * list of outstanding promotes? */ - if ((flags & BCH_READ_PROMOTE) && pick->ca->mi.tier) { + if (should_promote(c, pick, flags)) { /* * biovec needs to be big enough to hold decompressed data, if * the bch_write_extent() has to decompress/recompress it: diff --git a/libbcache/journal.c b/libbcache/journal.c index 99dd9f26..b2838376 100644 --- a/libbcache/journal.c +++ b/libbcache/journal.c @@ -545,8 +545,7 @@ static int journal_entry_validate(struct cache_set *c, return BCH_FSCK_UNKNOWN_VERSION; } - if (mustfix_fsck_err_on(bytes > bucket_sectors_left << 9 || - bytes > c->journal.entry_size_max, c, + if (mustfix_fsck_err_on(bytes > bucket_sectors_left << 9, c, "journal entry too big (%zu bytes), sector %lluu", bytes, sector)) { /* XXX: note we might have missing journal entries */ @@ -1406,13 +1405,7 @@ void bch_journal_start(struct cache_set *c) { struct journal *j = &c->journal; struct journal_seq_blacklist *bl; - struct cache *ca; u64 new_seq = 0; - unsigned i; - - for_each_cache(ca, c, i) - if (is_journal_device(ca)) - bch_dev_group_add(&c->journal.devs, ca); list_for_each_entry(bl, &j->seq_blacklist, list) new_seq = max(new_seq, bl->seq); @@ -1534,48 +1527,111 @@ err: return ret; } -static int bch_set_nr_journal_buckets(struct cache *ca, unsigned nr) +static int bch_set_nr_journal_buckets(struct cache_set *c, struct cache *ca, + unsigned nr, bool write_super) { + struct journal *j = &c->journal; struct journal_device *ja = &ca->journal; - struct bch_sb_field_journal *journal_buckets = - bch_sb_get_journal(ca->disk_sb.sb); - struct bch_sb_field *f; - u64 *p; + struct bch_sb_field_journal *journal_buckets; + struct disk_reservation disk_res = { 0, 0 }; + struct closure cl; + u64 *new_bucket_seq = NULL, *new_buckets = NULL; + int ret = 0; - p = krealloc(ja->bucket_seq, nr * sizeof(u64), - GFP_KERNEL|__GFP_ZERO); - if (!p) - return -ENOMEM; + closure_init_stack(&cl); - ja->bucket_seq = p; + mutex_lock(&c->sb_lock); - p = krealloc(ja->buckets, nr * sizeof(u64), - GFP_KERNEL|__GFP_ZERO); - if (!p) - return -ENOMEM; + /* don't handle reducing nr of buckets yet: */ + if (nr <= ja->nr) + goto err; - ja->buckets = p; + /* + * note: journal buckets aren't really counted as _sectors_ used yet, so + * we don't need the disk reservation to avoid the BUG_ON() in buckets.c + * when space used goes up without a reservation - but we do need the + * reservation to ensure we'll actually be able to allocate: + */ - f = bch_dev_sb_field_resize(&ca->disk_sb, &journal_buckets->field, nr + - sizeof(*journal_buckets) / sizeof(u64)); - if (!f) - return -ENOMEM; - f->type = BCH_SB_FIELD_journal; + ret = ENOSPC; + if (bch_disk_reservation_get(c, &disk_res, + (nr - ja->nr) << ca->bucket_bits, 0)) + goto err; - ja->nr = nr; - 
return 0; + ret = -ENOMEM; + new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); + new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); + if (!new_buckets || !new_bucket_seq) + goto err; + + journal_buckets = bch_sb_resize_journal(&ca->disk_sb, + nr + sizeof(*journal_buckets) / sizeof(u64)); + if (!journal_buckets) + goto err; + + spin_lock(&j->lock); + memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); + memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); + swap(new_buckets, ja->buckets); + swap(new_bucket_seq, ja->bucket_seq); + + while (ja->nr < nr) { + /* must happen under journal lock, to avoid racing with gc: */ + u64 b = bch_bucket_alloc(ca, RESERVE_NONE); + if (!b) { + if (!closure_wait(&c->freelist_wait, &cl)) { + spin_unlock(&j->lock); + closure_sync(&cl); + spin_lock(&j->lock); + } + continue; + } + + bch_mark_metadata_bucket(ca, &ca->buckets[b], + BUCKET_JOURNAL, false); + bch_mark_alloc_bucket(ca, &ca->buckets[b], false); + + memmove(ja->buckets + ja->last_idx + 1, + ja->buckets + ja->last_idx, + (ja->nr - ja->last_idx) * sizeof(u64)); + memmove(ja->bucket_seq + ja->last_idx + 1, + ja->bucket_seq + ja->last_idx, + (ja->nr - ja->last_idx) * sizeof(u64)); + memmove(journal_buckets->buckets + ja->last_idx + 1, + journal_buckets->buckets + ja->last_idx, + (ja->nr - ja->last_idx) * sizeof(u64)); + + ja->buckets[ja->last_idx] = b; + journal_buckets->buckets[ja->last_idx] = cpu_to_le64(b); + + if (ja->last_idx < ja->nr) { + if (ja->cur_idx >= ja->last_idx) + ja->cur_idx++; + ja->last_idx++; + } + ja->nr++; + + } + spin_unlock(&j->lock); + + BUG_ON(bch_validate_journal_layout(ca->disk_sb.sb, ca->mi)); + + if (write_super) + bch_write_super(c); + + ret = 0; +err: + mutex_unlock(&c->sb_lock); + + kfree(new_bucket_seq); + kfree(new_buckets); + bch_disk_reservation_put(c, &disk_res); + + return ret; } int bch_dev_journal_alloc(struct cache *ca) { - struct journal_device *ja = &ca->journal; - struct bch_sb_field_journal *journal_buckets; - int ret; - unsigned i; - - if (ca->mi.tier != 0) - return 0; - if (dynamic_fault("bcache:add:journal_alloc")) return -ENOMEM; @@ -1583,26 +1639,12 @@ int bch_dev_journal_alloc(struct cache *ca) * clamp journal size to 1024 buckets or 512MB (in sectors), whichever * is smaller: */ - ret = bch_set_nr_journal_buckets(ca, + return bch_set_nr_journal_buckets(ca->set, ca, clamp_t(unsigned, ca->mi.nbuckets >> 8, BCH_JOURNAL_BUCKETS_MIN, min(1 << 10, - (1 << 20) / ca->mi.bucket_size))); - if (ret) - return ret; - - journal_buckets = bch_sb_get_journal(ca->disk_sb.sb); - - for (i = 0; i < ja->nr; i++) { - u64 bucket = ca->mi.first_bucket + i; - - ja->buckets[i] = bucket; - journal_buckets->buckets[i] = cpu_to_le64(bucket); - - bch_mark_metadata_bucket(ca, &ca->buckets[bucket], true); - } - - return 0; + (1 << 20) / ca->mi.bucket_size)), + false); } /* Journalling */ @@ -1726,14 +1768,12 @@ void bch_journal_pin_add_if_older(struct journal *j, fifo_entry_idx(&j->pin, pin->pin_list))) { if (journal_pin_active(pin)) __journal_pin_drop(j, pin); - __journal_pin_add(j, src_pin->pin_list, - pin, NULL); + __journal_pin_add(j, src_pin->pin_list, pin, flush_fn); } spin_unlock_irq(&j->pin_lock); } - static struct journal_entry_pin * journal_get_next_pin(struct journal *j, u64 seq_to_flush) { @@ -1766,6 +1806,29 @@ journal_get_next_pin(struct journal *j, u64 seq_to_flush) return ret; } +static bool journal_has_pins(struct journal *j) +{ + bool ret; + + spin_lock(&j->lock); + journal_reclaim_fast(j); + ret = fifo_used(&j->pin) > 1 || + 
atomic_read(&fifo_peek_front(&j->pin).count) > 1; + spin_unlock(&j->lock); + + return ret; +} + +void bch_journal_flush_pins(struct journal *j) +{ + struct journal_entry_pin *pin; + + while ((pin = journal_get_next_pin(j, U64_MAX))) + pin->flush(j, pin); + + wait_event(j->wait, !journal_has_pins(j) || bch_journal_error(j)); +} + static bool should_discard_bucket(struct journal *j, struct journal_device *ja) { bool ret; @@ -1895,8 +1958,10 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) struct cache_set *c = container_of(j, struct cache_set, journal); struct bkey_s_extent e = bkey_i_to_s_extent(&j->key); struct bch_extent_ptr *ptr; + struct journal_device *ja; struct cache *ca; - unsigned iter, replicas, replicas_want = + bool swapped; + unsigned i, replicas, replicas_want = READ_ONCE(c->opts.metadata_replicas); spin_lock(&j->lock); @@ -1921,12 +1986,27 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) replicas = bch_extent_nr_ptrs(e.c); + spin_lock(&j->devs.lock); + + /* Sort by tier: */ + do { + swapped = false; + + for (i = 0; i + 1 < j->devs.nr; i++) + if (j->devs.d[i + 0].dev->mi.tier > + j->devs.d[i + 1].dev->mi.tier) { + swap(j->devs.d[i], j->devs.d[i + 1]); + swapped = true; + } + } while (swapped); + /* - * Determine location of the next journal write: - * XXX: sort caches by free journal space + * Pick devices for next journal write: + * XXX: sort devices by free journal space? */ - group_for_each_cache_rcu(ca, &j->devs, iter) { - struct journal_device *ja = &ca->journal; + for (i = 0; i < j->devs.nr; i++) { + ca = j->devs.d[i].dev; + ja = &ca->journal; if (replicas >= replicas_want) break; @@ -1954,7 +2034,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) trace_bcache_journal_next_bucket(ca, ja->cur_idx, ja->last_idx); } - + spin_unlock(&j->devs.lock); rcu_read_unlock(); j->prev_buf_sectors = 0; @@ -2468,50 +2548,6 @@ int bch_journal_flush(struct journal *j) return bch_journal_flush_seq(j, seq); } -void bch_journal_free(struct journal *j) -{ - unsigned order = get_order(j->entry_size_max); - - free_pages((unsigned long) j->buf[1].data, order); - free_pages((unsigned long) j->buf[0].data, order); - free_fifo(&j->pin); -} - -int bch_journal_alloc(struct journal *j, unsigned entry_size_max) -{ - static struct lock_class_key res_key; - unsigned order = get_order(entry_size_max); - - spin_lock_init(&j->lock); - spin_lock_init(&j->pin_lock); - init_waitqueue_head(&j->wait); - INIT_DELAYED_WORK(&j->write_work, journal_write_work); - INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work); - mutex_init(&j->blacklist_lock); - INIT_LIST_HEAD(&j->seq_blacklist); - spin_lock_init(&j->devs.lock); - mutex_init(&j->reclaim_lock); - - lockdep_init_map(&j->res_map, "journal res", &res_key, 0); - - j->entry_size_max = entry_size_max; - j->write_delay_ms = 100; - j->reclaim_delay_ms = 100; - - bkey_extent_init(&j->key); - - atomic64_set(&j->reservations.counter, - ((union journal_res_state) - { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); - - if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || - !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) || - !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order))) - return -ENOMEM; - - return 0; -} - ssize_t bch_journal_print_debug(struct journal *j, char *buf) { union journal_res_state *s = &j->reservations; @@ -2643,13 +2679,31 @@ int bch_journal_move(struct cache *ca) return ret; } -void bch_journal_free_cache(struct cache *ca) +void 
bch_fs_journal_stop(struct journal *j) +{ + if (!test_bit(JOURNAL_STARTED, &j->flags)) + return; + + /* + * Empty out the journal by first flushing everything pinning existing + * journal entries, then force a brand new empty journal entry to be + * written: + */ + bch_journal_flush_pins(j); + bch_journal_flush_async(j, NULL); + bch_journal_meta(j); + + cancel_delayed_work_sync(&j->write_work); + cancel_delayed_work_sync(&j->reclaim_work); +} + +void bch_dev_journal_exit(struct cache *ca) { kfree(ca->journal.buckets); kfree(ca->journal.bucket_seq); } -int bch_journal_init_cache(struct cache *ca) +int bch_dev_journal_init(struct cache *ca) { struct journal_device *ja = &ca->journal; struct bch_sb_field_journal *journal_buckets = @@ -2679,3 +2733,47 @@ int bch_journal_init_cache(struct cache *ca) return 0; } + +void bch_fs_journal_exit(struct journal *j) +{ + unsigned order = get_order(j->entry_size_max); + + free_pages((unsigned long) j->buf[1].data, order); + free_pages((unsigned long) j->buf[0].data, order); + free_fifo(&j->pin); +} + +int bch_fs_journal_init(struct journal *j, unsigned entry_size_max) +{ + static struct lock_class_key res_key; + unsigned order = get_order(entry_size_max); + + spin_lock_init(&j->lock); + spin_lock_init(&j->pin_lock); + init_waitqueue_head(&j->wait); + INIT_DELAYED_WORK(&j->write_work, journal_write_work); + INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work); + mutex_init(&j->blacklist_lock); + INIT_LIST_HEAD(&j->seq_blacklist); + spin_lock_init(&j->devs.lock); + mutex_init(&j->reclaim_lock); + + lockdep_init_map(&j->res_map, "journal res", &res_key, 0); + + j->entry_size_max = entry_size_max; + j->write_delay_ms = 100; + j->reclaim_delay_ms = 100; + + bkey_extent_init(&j->key); + + atomic64_set(&j->reservations.counter, + ((union journal_res_state) + { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); + + if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || + !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) || + !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order))) + return -ENOMEM; + + return 0; +} diff --git a/libbcache/journal.h b/libbcache/journal.h index 02a6e676..d3a1db0c 100644 --- a/libbcache/journal.h +++ b/libbcache/journal.h @@ -111,7 +111,6 @@ #include <linux/hash.h> #include "journal_types.h" -//#include "super-io.h" /* * Only used for holding the journal entries we read in btree_journal_read() @@ -136,6 +135,7 @@ void bch_journal_pin_add_if_older(struct journal *, struct journal_entry_pin *, struct journal_entry_pin *, journal_pin_flush_fn); +void bch_journal_flush_pins(struct journal *); struct closure; struct cache_set; @@ -330,11 +330,6 @@ static inline int bch_journal_error(struct journal *j) ? 
-EIO : 0; } -static inline bool is_journal_device(struct cache *ca) -{ - return ca->mi.state == BCH_MEMBER_STATE_ACTIVE && ca->mi.tier == 0; -} - static inline bool journal_flushes_device(struct cache *ca) { return true; @@ -356,9 +351,6 @@ static inline void bch_journal_set_replay_done(struct journal *j) spin_unlock(&j->lock); } -void bch_journal_free(struct journal *); -int bch_journal_alloc(struct journal *, unsigned); - ssize_t bch_journal_print_debug(struct journal *, char *); int bch_dev_journal_alloc(struct cache *); @@ -372,7 +364,10 @@ static inline unsigned bch_nr_journal_buckets(struct bch_sb_field_journal *j) int bch_journal_move(struct cache *); -void bch_journal_free_cache(struct cache *); -int bch_journal_init_cache(struct cache *); +void bch_fs_journal_stop(struct journal *); +void bch_dev_journal_exit(struct cache *); +int bch_dev_journal_init(struct cache *); +void bch_fs_journal_exit(struct journal *); +int bch_fs_journal_init(struct journal *, unsigned); #endif /* _BCACHE_JOURNAL_H */ diff --git a/libbcache/movinggc.c b/libbcache/movinggc.c index e40dfbca..27f5c63c 100644 --- a/libbcache/movinggc.c +++ b/libbcache/movinggc.c @@ -191,7 +191,7 @@ static void bch_moving_gc(struct cache *ca) } if (g->mark.owned_by_allocator || - g->mark.is_metadata) + g->mark.data_type != BUCKET_DATA) continue; sectors_used = bucket_sectors_used(g); @@ -258,18 +258,21 @@ static int bch_moving_gc_thread(void *arg) return 0; } -void bch_moving_init_cache(struct cache *ca) +void bch_moving_gc_stop(struct cache *ca) { - bch_pd_controller_init(&ca->moving_gc_pd); - ca->moving_gc_pd.d_term = 0; + ca->moving_gc_pd.rate.rate = UINT_MAX; + bch_ratelimit_reset(&ca->moving_gc_pd.rate); + + if (ca->moving_gc_read) + kthread_stop(ca->moving_gc_read); + ca->moving_gc_read = NULL; } -int bch_moving_gc_thread_start(struct cache *ca) +int bch_moving_gc_start(struct cache *ca) { struct task_struct *t; - /* The moving gc read thread must be stopped */ - BUG_ON(ca->moving_gc_read != NULL); + BUG_ON(ca->moving_gc_read); if (ca->set->opts.nochanges) return 0; @@ -287,12 +290,8 @@ int bch_moving_gc_thread_start(struct cache *ca) return 0; } -void bch_moving_gc_stop(struct cache *ca) +void bch_dev_moving_gc_init(struct cache *ca) { - ca->moving_gc_pd.rate.rate = UINT_MAX; - bch_ratelimit_reset(&ca->moving_gc_pd.rate); - - if (ca->moving_gc_read) - kthread_stop(ca->moving_gc_read); - ca->moving_gc_read = NULL; + bch_pd_controller_init(&ca->moving_gc_pd); + ca->moving_gc_pd.d_term = 0; } diff --git a/libbcache/movinggc.h b/libbcache/movinggc.h index 5f153085..e8ae95e5 100644 --- a/libbcache/movinggc.h +++ b/libbcache/movinggc.h @@ -23,8 +23,8 @@ #define COPYGC_SECTORS_PER_ITER(ca) \ ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca)) -void bch_moving_init_cache(struct cache *); void bch_moving_gc_stop(struct cache *); -int bch_moving_gc_thread_start(struct cache *); +int bch_moving_gc_start(struct cache *); +void bch_dev_moving_gc_init(struct cache *); #endif diff --git a/libbcache/opts.h b/libbcache/opts.h index 95184db1..9b10310d 100644 --- a/libbcache/opts.h +++ b/libbcache/opts.h @@ -86,11 +86,17 @@ enum opt_type { BCH_OPT(noreplay, 0444, NO_SB_OPT, \ s8, OPT_BOOL()) \ BCH_OPT(norecovery, 0444, NO_SB_OPT, \ - s8, OPT_BOOL()) + s8, OPT_BOOL()) \ + BCH_OPT(noexcl, 0444, NO_SB_OPT, \ + s8, OPT_BOOL()) \ + BCH_OPT(sb, 0444, NO_SB_OPT, \ + s64, OPT_UINT(0, S64_MAX)) \ #define BCH_OPTS() \ BCH_OPT(read_only, 0444, NO_SB_OPT, \ s8, OPT_BOOL()) \ + BCH_OPT(nostart, 0444, NO_SB_OPT, \ + s8, OPT_BOOL()) \ 
BCH_VISIBLE_OPTS() struct bch_opts { @@ -145,6 +151,8 @@ static inline void bch_opts_apply(struct bch_opts *dst, struct bch_opts src) #undef BCH_OPT } +#define opt_defined(_opt) ((_opt) >= 0) + void bch_opt_set(struct bch_opts *, enum bch_opt_id, u64); struct bch_opts bch_sb_opts(struct bch_sb *); diff --git a/libbcache/super-io.c b/libbcache/super-io.c index be27d3ee..f50a5ee8 100644 --- a/libbcache/super-io.c +++ b/libbcache/super-io.c @@ -10,6 +10,7 @@ #include "vstructs.h" #include <linux/backing-dev.h> +#include <linux/sort.h> static inline void __bch_sb_layout_size_assert(void) { @@ -17,7 +18,7 @@ static inline void __bch_sb_layout_size_assert(void) } struct bch_sb_field *bch_sb_field_get(struct bch_sb *sb, - enum bch_sb_field_types type) + enum bch_sb_field_type type) { struct bch_sb_field *f; @@ -34,7 +35,7 @@ void bch_free_super(struct bcache_superblock *sb) if (sb->bio) bio_put(sb->bio); if (!IS_ERR_OR_NULL(sb->bdev)) - blkdev_put(sb->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + blkdev_put(sb->bdev, sb->mode); free_pages((unsigned long) sb->sb, sb->page_order); memset(sb, 0, sizeof(*sb)); @@ -74,7 +75,7 @@ static int __bch_super_realloc(struct bcache_superblock *sb, unsigned order) return 0; } -int bch_dev_sb_realloc(struct bcache_superblock *sb, unsigned u64s) +static int bch_sb_realloc(struct bcache_superblock *sb, unsigned u64s) { u64 new_bytes = __vstruct_bytes(struct bch_sb, u64s); u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; @@ -140,13 +141,29 @@ static struct bch_sb_field *__bch_sb_field_resize(struct bch_sb *sb, le32_add_cpu(&sb->u64s, u64s - old_u64s); return f; +} +struct bch_sb_field *bch_sb_field_resize(struct bcache_superblock *sb, + enum bch_sb_field_type type, + unsigned u64s) +{ + struct bch_sb_field *f = bch_sb_field_get(sb->sb, type); + ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0; + ssize_t d = -old_u64s + u64s; + + if (bch_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) + return NULL; + + f = __bch_sb_field_resize(sb->sb, f, u64s); + f->type = type; + return f; } struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *c, - struct bch_sb_field *f, + enum bch_sb_field_type type, unsigned u64s) { + struct bch_sb_field *f = bch_sb_field_get(c->disk_sb, type); ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0; ssize_t d = -old_u64s + u64s; struct cache *ca; @@ -160,26 +177,15 @@ struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *c, for_each_cache(ca, c, i) { struct bcache_superblock *sb = &ca->disk_sb; - if (bch_dev_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { + if (bch_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { percpu_ref_put(&ca->ref); return NULL; } } - return __bch_sb_field_resize(c->disk_sb, f, u64s); -} - -struct bch_sb_field *bch_dev_sb_field_resize(struct bcache_superblock *sb, - struct bch_sb_field *f, - unsigned u64s) -{ - ssize_t old_u64s = f ? 
le32_to_cpu(f->u64s) : 0; - ssize_t d = -old_u64s + u64s; - - if (bch_dev_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) - return NULL; - - return __bch_sb_field_resize(sb->sb, f, u64s); + f = __bch_sb_field_resize(c->disk_sb, f, u64s); + f->type = type; + return f; } static const char *validate_sb_layout(struct bch_sb_layout *layout) @@ -203,9 +209,6 @@ static const char *validate_sb_layout(struct bch_sb_layout *layout) prev_offset = le64_to_cpu(layout->sb_offset[0]); - if (prev_offset != BCH_SB_SECTOR) - return "Invalid superblock layout: doesn't have default superblock location"; - for (i = 1; i < layout->nr_superblocks; i++) { offset = le64_to_cpu(layout->sb_offset[i]); @@ -217,16 +220,70 @@ static const char *validate_sb_layout(struct bch_sb_layout *layout) return NULL; } +static int u64_cmp(const void *_l, const void *_r) +{ + u64 l = *((const u64 *) _l), r = *((const u64 *) _r); + + return l < r ? -1 : l > r ? 1 : 0; +} + +const char *bch_validate_journal_layout(struct bch_sb *sb, + struct cache_member_cpu mi) +{ + struct bch_sb_field_journal *journal; + const char *err; + unsigned nr; + unsigned i; + u64 *b; + + journal = bch_sb_get_journal(sb); + if (!journal) + return NULL; + + nr = bch_nr_journal_buckets(journal); + if (!nr) + return NULL; + + b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); + if (!b) + return "cannot allocate memory"; + + for (i = 0; i < nr; i++) + b[i] = le64_to_cpu(journal->buckets[i]); + + sort(b, nr, sizeof(u64), u64_cmp, NULL); + + err = "journal bucket at sector 0"; + if (!b[0]) + goto err; + + err = "journal bucket before first bucket"; + if (b[0] < mi.first_bucket) + goto err; + + err = "journal bucket past end of device"; + if (b[nr - 1] >= mi.nbuckets) + goto err; + + err = "duplicate journal buckets"; + for (i = 0; i + 1 < nr; i++) + if (b[i] == b[i + 1]) + goto err; + + err = NULL; +err: + kfree(b); + return err; +} + const char *bch_validate_cache_super(struct bcache_superblock *disk_sb) { struct bch_sb *sb = disk_sb->sb; struct bch_sb_field *f; struct bch_sb_field_members *sb_mi; - struct bch_sb_field_journal *journal; struct cache_member_cpu mi; const char *err; u16 block_size; - unsigned i; switch (le64_to_cpu(sb->version)) { case BCACHE_SB_VERSION_CDEV_V4: @@ -324,14 +381,6 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb) mi = cache_mi_to_cpu_mi(sb_mi->members + sb->dev_idx); - for (i = 0; i < sb->layout.nr_superblocks; i++) { - u64 offset = le64_to_cpu(sb->layout.sb_offset[i]); - u64 max_size = 1 << sb->layout.sb_max_size_bits; - - if (offset + max_size > mi.first_bucket * mi.bucket_size) - return "Invalid superblock: first bucket comes before end of super"; - } - if (mi.nbuckets > LONG_MAX) return "Too many buckets"; @@ -347,16 +396,9 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb) mi.bucket_size * mi.nbuckets) return "Invalid superblock: device too small"; - /* Validate journal buckets: */ - journal = bch_sb_get_journal(sb); - if (journal) { - for (i = 0; i < bch_nr_journal_buckets(journal); i++) { - u64 b = le64_to_cpu(journal->buckets[i]); - - if (b < mi.first_bucket || b >= mi.nbuckets) - return "bad journal bucket"; - } - } + err = bch_validate_journal_layout(sb, mi); + if (err) + return err; return NULL; } @@ -382,19 +424,19 @@ static bool bch_is_open_cache(struct block_device *bdev) static bool bch_is_open(struct block_device *bdev) { - lockdep_assert_held(&bch_register_lock); + bool ret; - return bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev); + 
mutex_lock(&bch_register_lock); + ret = bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev); + mutex_unlock(&bch_register_lock); + + return ret; } -static const char *bch_blkdev_open(const char *path, void *holder, - struct bch_opts opts, - struct block_device **ret) +static const char *bch_blkdev_open(const char *path, fmode_t mode, + void *holder, struct block_device **ret) { struct block_device *bdev; - fmode_t mode = opts.nochanges > 0 - ? FMODE_READ - : FMODE_READ|FMODE_WRITE|FMODE_EXCL; const char *err; *ret = NULL; @@ -548,7 +590,7 @@ int bch_sb_from_cache_set(struct cache_set *c, struct cache *ca) unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s; int ret; - ret = bch_dev_sb_realloc(&ca->disk_sb, u64s); + ret = bch_sb_realloc(&ca->disk_sb, u64s); if (ret) return ret; @@ -567,7 +609,7 @@ static const char *read_one_super(struct bcache_superblock *sb, u64 offset) reread: bio_reset(sb->bio); sb->bio->bi_bdev = sb->bdev; - sb->bio->bi_iter.bi_sector = BCH_SB_SECTOR; + sb->bio->bi_iter.bi_sector = offset; sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order; bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); bch_bio_map(sb->bio, sb->sb); @@ -610,15 +652,21 @@ const char *bch_read_super(struct bcache_superblock *sb, struct bch_opts opts, const char *path) { + u64 offset = opt_defined(opts.sb) ? opts.sb : BCH_SB_SECTOR; struct bch_sb_layout layout; const char *err; unsigned i; - lockdep_assert_held(&bch_register_lock); - memset(sb, 0, sizeof(*sb)); + sb->mode = FMODE_READ; - err = bch_blkdev_open(path, &sb, opts, &sb->bdev); + if (!(opt_defined(opts.noexcl) && opts.noexcl)) + sb->mode |= FMODE_EXCL; + + if (!(opt_defined(opts.nochanges) && opts.nochanges)) + sb->mode |= FMODE_WRITE; + + err = bch_blkdev_open(path, sb->mode, sb, &sb->bdev); if (err) return err; @@ -630,11 +678,16 @@ const char *bch_read_super(struct bcache_superblock *sb, if (bch_fs_init_fault("read_super")) goto err; - err = read_one_super(sb, BCH_SB_SECTOR); + err = read_one_super(sb, offset); if (!err) goto got_super; - pr_err("error reading default super: %s", err); + if (offset != BCH_SB_SECTOR) { + pr_err("error reading superblock: %s", err); + goto err; + } + + pr_err("error reading default superblock: %s", err); /* * Error reading primary superblock - read location of backup @@ -747,6 +800,9 @@ void bch_write_super(struct cache_set *c) lockdep_assert_held(&c->sb_lock); + if (c->opts.nochanges) + return; + closure_init_stack(cl); le64_add_cpu(&c->disk_sb->seq, 1); diff --git a/libbcache/super-io.h b/libbcache/super-io.h index 665de811..ae1e8b9d 100644 --- a/libbcache/super-io.h +++ b/libbcache/super-io.h @@ -6,16 +6,35 @@ #include <asm/byteorder.h> -struct bch_sb_field *bch_sb_field_get(struct bch_sb *, enum bch_sb_field_types); +struct bch_sb_field *bch_sb_field_get(struct bch_sb *, enum bch_sb_field_type); +struct bch_sb_field *bch_sb_field_resize(struct bcache_superblock *, + enum bch_sb_field_type, unsigned); +struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *, + enum bch_sb_field_type, unsigned); -#define BCH_SB_FIELD_TYPE(_name) \ -static inline struct bch_sb_field_##_name * \ -bch_sb_get_##_name(struct bch_sb *sb) \ -{ \ - struct bch_sb_field *f = \ - bch_sb_field_get(sb, BCH_SB_FIELD_##_name); \ - \ - return container_of_or_null(f, struct bch_sb_field_##_name, field);\ +#define field_to_type(_f, _name) \ + container_of_or_null(_f, struct bch_sb_field_##_name, field) + +#define BCH_SB_FIELD_TYPE(_name) \ +static inline struct bch_sb_field_##_name * \ +bch_sb_get_##_name(struct 
bch_sb *sb) \ +{ \ + return field_to_type(bch_sb_field_get(sb, \ + BCH_SB_FIELD_##_name), _name); \ +} \ + \ +static inline struct bch_sb_field_##_name * \ +bch_sb_resize_##_name(struct bcache_superblock *sb, unsigned u64s) \ +{ \ + return field_to_type(bch_sb_field_resize(sb, \ + BCH_SB_FIELD_##_name, u64s), _name); \ +} \ + \ +static inline struct bch_sb_field_##_name * \ +bch_fs_sb_resize_##_name(struct cache_set *c, unsigned u64s) \ +{ \ + return field_to_type(bch_fs_sb_field_resize(c, \ + BCH_SB_FIELD_##_name, u64s), _name); \ } BCH_SB_FIELD_TYPE(journal); @@ -85,14 +104,11 @@ int bch_fs_mi_update(struct cache_set *, struct bch_member *, unsigned); int bch_sb_to_cache_set(struct cache_set *, struct bch_sb *); int bch_sb_from_cache_set(struct cache_set *, struct cache *); -struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *, - struct bch_sb_field *, unsigned); -struct bch_sb_field *bch_dev_sb_field_resize(struct bcache_superblock *, - struct bch_sb_field *, unsigned); - void bch_free_super(struct bcache_superblock *); int bch_super_realloc(struct bcache_superblock *, unsigned); +const char *bch_validate_journal_layout(struct bch_sb *, + struct cache_member_cpu); const char *bch_validate_cache_super(struct bcache_superblock *); const char *bch_read_super(struct bcache_superblock *, diff --git a/libbcache/super.c b/libbcache/super.c index fab34805..5535639c 100644 --- a/libbcache/super.c +++ b/libbcache/super.c @@ -69,7 +69,7 @@ static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait); struct workqueue_struct *bcache_io_wq; struct crypto_shash *bch_sha256; -static void bch_dev_stop(struct cache *); +static void bch_dev_free(struct cache *); static int bch_dev_online(struct cache *); static int bch_congested_fn(void *data, int bdi_bits) @@ -92,8 +92,11 @@ static int bch_congested_fn(void *data, int bdi_bits) } } } else { - /* Writes only go to tier 0: */ - group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) { + /* Writes prefer fastest tier: */ + struct bch_tier *tier = READ_ONCE(c->fastest_tier); + struct cache_group *grp = tier ? 
&tier->devs : &c->cache_all; + + group_for_each_cache_rcu(ca, grp, i) { bdi = blk_get_backing_dev_info(ca->disk_sb.bdev); if (bdi_congested(bdi, bdi_bits)) { @@ -107,7 +110,7 @@ static int bch_congested_fn(void *data, int bdi_bits) return ret; } -/* Cache set RO/RW: */ +/* Filesystem RO/RW: */ /* * For startup/shutdown of RW stuff, the dependencies are: @@ -129,9 +132,7 @@ static void __bch_fs_read_only(struct cache_set *c) struct cache *ca; unsigned i; - c->tiering_pd.rate.rate = UINT_MAX; - bch_ratelimit_reset(&c->tiering_pd.rate); - bch_tiering_read_stop(c); + bch_tiering_stop(c); for_each_cache(ca, c, i) bch_moving_gc_stop(ca); @@ -143,20 +144,7 @@ static void __bch_fs_read_only(struct cache_set *c) for_each_cache(ca, c, i) bch_dev_allocator_stop(ca); - /* - * Write a journal entry after flushing the btree, so we don't end up - * replaying everything we just flushed: - */ - if (test_bit(JOURNAL_STARTED, &c->journal.flags)) { - int ret; - - bch_journal_flush_async(&c->journal, NULL); - ret = bch_journal_meta(&c->journal); - BUG_ON(ret && !bch_journal_error(&c->journal)); - } - - cancel_delayed_work_sync(&c->journal.write_work); - cancel_delayed_work_sync(&c->journal.reclaim_work); + bch_fs_journal_stop(&c->journal); } static void bch_writes_disabled(struct percpu_ref *writes) @@ -167,67 +155,18 @@ static void bch_writes_disabled(struct percpu_ref *writes) wake_up(&bch_read_only_wait); } -static void bch_fs_read_only_work(struct work_struct *work) +void bch_fs_read_only(struct cache_set *c) { - struct cache_set *c = - container_of(work, struct cache_set, read_only_work); + mutex_lock(&c->state_lock); + if (c->state != BCH_FS_STARTING && + c->state != BCH_FS_RW) + goto out; - percpu_ref_put(&c->writes); - - del_timer(&c->foreground_write_wakeup); - cancel_delayed_work(&c->pd_controllers_update); - - c->foreground_write_pd.rate.rate = UINT_MAX; - bch_wake_delayed_writes((unsigned long) c); - - if (!test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) { - /* - * If we're not doing an emergency shutdown, we want to wait on - * outstanding writes to complete so they don't see spurious - * errors due to shutting down the allocator: - */ - wait_event(bch_read_only_wait, - test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); - - __bch_fs_read_only(c); - - if (!bch_journal_error(&c->journal) && - !test_bit(BCH_FS_ERROR, &c->flags)) { - mutex_lock(&c->sb_lock); - SET_BCH_SB_CLEAN(c->disk_sb, true); - bch_write_super(c); - mutex_unlock(&c->sb_lock); - } - } else { - /* - * If we are doing an emergency shutdown outstanding writes may - * hang until we shutdown the allocator so we don't want to wait - * on outstanding writes before shutting everything down - but - * we do need to wait on them before returning and signalling - * that going RO is complete: - */ - __bch_fs_read_only(c); - - wait_event(bch_read_only_wait, - test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); - } - - bch_notify_fs_read_only(c); - trace_fs_read_only_done(c); - - set_bit(BCH_FS_RO_COMPLETE, &c->flags); - wake_up(&bch_read_only_wait); -} - -bool bch_fs_read_only(struct cache_set *c) -{ - if (test_and_set_bit(BCH_FS_RO, &c->flags)) - return false; + if (test_bit(BCH_FS_ERROR, &c->flags)) + goto out; trace_fs_read_only(c); - percpu_ref_get(&c->writes); - /* * Block new foreground-end write operations from starting - any new * writes will return -EROFS: @@ -238,40 +177,83 @@ bool bch_fs_read_only(struct cache_set *c) */ percpu_ref_kill(&c->writes); - queue_work(system_freezable_wq, &c->read_only_work); - return true; + 
del_timer(&c->foreground_write_wakeup); + cancel_delayed_work(&c->pd_controllers_update); + + c->foreground_write_pd.rate.rate = UINT_MAX; + bch_wake_delayed_writes((unsigned long) c); + + /* + * If we're not doing an emergency shutdown, we want to wait on + * outstanding writes to complete so they don't see spurious errors due + * to shutting down the allocator: + * + * If we are doing an emergency shutdown outstanding writes may + * hang until we shutdown the allocator so we don't want to wait + * on outstanding writes before shutting everything down - but + * we do need to wait on them before returning and signalling + * that going RO is complete: + */ + wait_event(bch_read_only_wait, + test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || + test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); + + __bch_fs_read_only(c); + + wait_event(bch_read_only_wait, + test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); + + clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + + if (!bch_journal_error(&c->journal) && + !test_bit(BCH_FS_ERROR, &c->flags)) { + mutex_lock(&c->sb_lock); + SET_BCH_SB_CLEAN(c->disk_sb, true); + bch_write_super(c); + mutex_unlock(&c->sb_lock); + } + + c->state = BCH_FS_RO; + bch_notify_fs_read_only(c); + trace_fs_read_only_done(c); +out: + mutex_unlock(&c->state_lock); +} + +static void bch_fs_read_only_work(struct work_struct *work) +{ + struct cache_set *c = + container_of(work, struct cache_set, read_only_work); + + bch_fs_read_only(c); +} + +static void bch_fs_read_only_async(struct cache_set *c) +{ + queue_work(system_long_wq, &c->read_only_work); } bool bch_fs_emergency_read_only(struct cache_set *c) { bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); - bch_fs_read_only(c); + bch_fs_read_only_async(c); bch_journal_halt(&c->journal); wake_up(&bch_read_only_wait); return ret; } -void bch_fs_read_only_sync(struct cache_set *c) -{ - /* so we don't race with bch_fs_read_write() */ - lockdep_assert_held(&bch_register_lock); - - bch_fs_read_only(c); - - wait_event(bch_read_only_wait, - test_bit(BCH_FS_RO_COMPLETE, &c->flags) && - test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); -} - -static const char *__bch_fs_read_write(struct cache_set *c) +const char *bch_fs_read_write(struct cache_set *c) { struct cache *ca; - const char *err; + const char *err = NULL; unsigned i; - lockdep_assert_held(&bch_register_lock); + mutex_lock(&c->state_lock); + if (c->state != BCH_FS_STARTING && + c->state != BCH_FS_RO) + goto out; err = "error starting allocator thread"; for_each_cache(ca, c, i) @@ -285,67 +267,43 @@ static const char *__bch_fs_read_write(struct cache_set *c) if (bch_gc_thread_start(c)) goto err; - for_each_cache(ca, c, i) { - if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE) - continue; - - err = "error starting moving GC thread"; - if (bch_moving_gc_thread_start(ca)) { + err = "error starting moving GC thread"; + for_each_cache(ca, c, i) + if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE && + bch_moving_gc_start(ca)) { percpu_ref_put(&ca->ref); goto err; } - } err = "error starting tiering thread"; - if (bch_tiering_read_start(c)) + if (bch_tiering_start(c)) goto err; schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); - return NULL; + if (c->state != BCH_FS_STARTING) + percpu_ref_reinit(&c->writes); + + c->state = BCH_FS_RW; + err = NULL; +out: + mutex_unlock(&c->state_lock); + return err; err: __bch_fs_read_only(c); - return err; + goto out; } -const char *bch_fs_read_write(struct cache_set *c) -{ - const char *err; - - lockdep_assert_held(&bch_register_lock); - - if 
(!test_bit(BCH_FS_RO_COMPLETE, &c->flags)) - return NULL; - - err = __bch_fs_read_write(c); - if (err) - return err; - - percpu_ref_reinit(&c->writes); - - clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); - clear_bit(BCH_FS_EMERGENCY_RO, &c->flags); - clear_bit(BCH_FS_RO_COMPLETE, &c->flags); - clear_bit(BCH_FS_RO, &c->flags); - return NULL; -} - -/* Cache set startup/shutdown: */ +/* Filesystem startup/shutdown: */ static void bch_fs_free(struct cache_set *c) { - del_timer_sync(&c->foreground_write_wakeup); - cancel_delayed_work_sync(&c->pd_controllers_update); - cancel_work_sync(&c->read_only_work); - cancel_work_sync(&c->bio_submit_work); - cancel_work_sync(&c->read_retry_work); - - bch_fs_encryption_free(c); - bch_btree_cache_free(c); - bch_journal_free(&c->journal); + bch_fs_encryption_exit(c); + bch_fs_btree_exit(c); + bch_fs_journal_exit(&c->journal); bch_io_clock_exit(&c->io_clock[WRITE]); bch_io_clock_exit(&c->io_clock[READ]); - bch_compress_free(c); + bch_fs_compress_exit(c); bch_fs_blockdev_exit(c); bdi_destroy(&c->bdi); lg_lock_free(&c->bucket_stats_lock); @@ -372,6 +330,52 @@ static void bch_fs_free(struct cache_set *c) module_put(THIS_MODULE); } +static void bch_fs_exit(struct cache_set *c) +{ + unsigned i; + + del_timer_sync(&c->foreground_write_wakeup); + cancel_delayed_work_sync(&c->pd_controllers_update); + cancel_work_sync(&c->read_only_work); + cancel_work_sync(&c->bio_submit_work); + cancel_work_sync(&c->read_retry_work); + + for (i = 0; i < c->sb.nr_devices; i++) + if (c->cache[i]) + bch_dev_free(c->cache[i]); + + closure_debug_destroy(&c->cl); + kobject_put(&c->kobj); +} + +static void bch_fs_offline(struct cache_set *c) +{ + struct cache *ca; + unsigned i; + + mutex_lock(&bch_register_lock); + list_del(&c->list); + mutex_unlock(&bch_register_lock); + + if (c->kobj.state_in_sysfs) + kobject_del(&c->kobj); + + for_each_cache(ca, c, i) + if (ca->kobj.state_in_sysfs) + kobject_del(&ca->kobj); + + bch_fs_debug_exit(c); + bch_fs_chardev_exit(c); + + bch_cache_accounting_destroy(&c->accounting); + + kobject_put(&c->time_stats); + kobject_put(&c->opts_dir); + kobject_put(&c->internal); + + __bch_fs_read_only(c); +} + /* * should be __bch_fs_stop4 - block devices are closed, now we can finally * free it @@ -379,15 +383,9 @@ static void bch_fs_free(struct cache_set *c) void bch_fs_release(struct kobject *kobj) { struct cache_set *c = container_of(kobj, struct cache_set, kobj); - struct completion *stop_completion = c->stop_completion; bch_notify_fs_stopped(c); - bch_info(c, "stopped"); - bch_fs_free(c); - - if (stop_completion) - complete(stop_completion); } /* @@ -396,18 +394,8 @@ void bch_fs_release(struct kobject *kobj) static void __bch_fs_stop3(struct closure *cl) { struct cache_set *c = container_of(cl, struct cache_set, cl); - struct cache *ca; - unsigned i; - mutex_lock(&bch_register_lock); - for_each_cache(ca, c, i) - bch_dev_stop(ca); - - list_del(&c->list); - mutex_unlock(&bch_register_lock); - - closure_debug_destroy(&c->cl); - kobject_put(&c->kobj); + bch_fs_exit(c); } /* @@ -418,28 +406,14 @@ static void __bch_fs_stop2(struct closure *cl) { struct cache_set *c = container_of(cl, struct cache_set, caching); - bch_debug_exit_cache_set(c); - bch_fs_chardev_exit(c); - - if (c->kobj.state_in_sysfs) - kobject_del(&c->kobj); - - bch_cache_accounting_destroy(&c->accounting); - - kobject_put(&c->time_stats); - kobject_put(&c->opts_dir); - kobject_put(&c->internal); - - mutex_lock(&bch_register_lock); - bch_fs_read_only_sync(c); - mutex_unlock(&bch_register_lock); + 
bch_fs_offline(c); closure_return(cl); } /* - * First phase of the shutdown process that's kicked off by bch_fs_stop(); we - * haven't waited for anything to stop yet, we're just punting to process + * First phase of the shutdown process that's kicked off by bch_fs_stop_async(); + * we haven't waited for anything to stop yet, we're just punting to process * context to shut down block devices: */ static void __bch_fs_stop1(struct closure *cl) @@ -451,29 +425,42 @@ static void __bch_fs_stop1(struct closure *cl) continue_at(cl, __bch_fs_stop2, system_wq); } -void bch_fs_stop(struct cache_set *c) +void bch_fs_stop_async(struct cache_set *c) { - if (!test_and_set_bit(BCH_FS_STOPPING, &c->flags)) + mutex_lock(&c->state_lock); + if (c->state != BCH_FS_STOPPING) { + c->state = BCH_FS_STOPPING; closure_queue(&c->caching); + } + mutex_unlock(&c->state_lock); } -void bch_fs_stop_sync(struct cache_set *c) +void bch_fs_stop(struct cache_set *c) { - DECLARE_COMPLETION_ONSTACK(complete); + mutex_lock(&c->state_lock); + BUG_ON(c->state == BCH_FS_STOPPING); + c->state = BCH_FS_STOPPING; + mutex_unlock(&c->state_lock); + + bch_blockdevs_stop(c); + + closure_sync(&c->caching); + closure_debug_destroy(&c->caching); + + bch_fs_offline(c); - c->stop_completion = &complete; - bch_fs_stop(c); closure_put(&c->cl); + closure_sync(&c->cl); - /* Killable? */ - wait_for_completion(&complete); + bch_fs_exit(c); + kobject_put(&c->kobj); } /* Stop, detaching from backing devices: */ void bch_fs_detach(struct cache_set *c) { if (!test_and_set_bit(BCH_FS_DETACHING, &c->flags)) - bch_fs_stop(c); + bch_fs_stop_async(c); } static unsigned bch_fs_nr_devices(struct cache_set *c) @@ -520,6 +507,7 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->minor = -1; + mutex_init(&c->state_lock); mutex_init(&c->sb_lock); INIT_RADIX_TREE(&c->devices, GFP_KERNEL); mutex_init(&c->btree_cache_lock); @@ -534,8 +522,8 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts) BCH_TIME_STATS() #undef BCH_TIME_STAT - bch_open_buckets_init(c); - bch_tiering_init_cache_set(c); + bch_fs_allocator_init(c); + bch_fs_tiering_init(c); INIT_LIST_HEAD(&c->list); INIT_LIST_HEAD(&c->cached_devs); @@ -636,10 +624,10 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch_fs_blockdev_init(c) || bch_io_clock_init(&c->io_clock[READ]) || bch_io_clock_init(&c->io_clock[WRITE]) || - bch_journal_alloc(&c->journal, journal_entry_bytes) || - bch_btree_cache_alloc(c) || + bch_fs_journal_init(&c->journal, journal_entry_bytes) || + bch_fs_btree_init(c) || bch_fs_encryption_init(c) || - bch_compress_init(c) || + bch_fs_compress_init(c) || bch_check_set_has_compressed_data(c, c->opts.compression)) goto err; @@ -664,6 +652,7 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts) closure_init(&c->caching, &c->cl); set_closure_fn(&c->caching, __bch_fs_stop1, system_wq); + closure_get(&c->cl); continue_at_noreturn(&c->cl, __bch_fs_stop3, system_wq); return c; err: @@ -671,7 +660,20 @@ err: return NULL; } -static int bch_fs_online(struct cache_set *c) +static struct cache_set *bch_fs_lookup(uuid_le uuid) +{ + struct cache_set *c; + + lockdep_assert_held(&bch_register_lock); + + list_for_each_entry(c, &bch_fs_list, list) + if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le))) + return c; + + return NULL; +} + +static const char *__bch_fs_online(struct cache_set *c) { struct cache *ca; unsigned i; @@ -680,31 +682,58 @@ static int bch_fs_online(struct cache_set *c) 
lockdep_assert_held(&bch_register_lock); if (!list_empty(&c->list)) - return 0; + return NULL; - list_add(&c->list, &bch_fs_list); + if (bch_fs_lookup(c->sb.uuid)) + return "filesystem UUID already open"; ret = bch_fs_chardev_init(c); if (ret) - return ret; + return "error creating character device"; + + bch_fs_debug_init(c); if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) || kobject_add(&c->internal, &c->kobj, "internal") || kobject_add(&c->opts_dir, &c->kobj, "options") || kobject_add(&c->time_stats, &c->kobj, "time_stats") || bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj)) - return -1; + return "error creating sysfs objects"; for_each_cache(ca, c, i) if (bch_dev_online(ca)) { percpu_ref_put(&ca->ref); - return -1; + return "error creating sysfs objects"; } + mutex_lock(&c->state_lock); + + if (bch_blockdev_volumes_start(c)) { + mutex_unlock(&c->state_lock); + return "can't bring up blockdev volumes"; + } + + bch_attach_backing_devs(c); + + mutex_unlock(&c->state_lock); + + list_add(&c->list, &bch_fs_list); + return 0; } -static const char *bch_fs_start(struct cache_set *c) +static const char *bch_fs_online(struct cache_set *c) +{ + const char *err; + + mutex_lock(&bch_register_lock); + err = __bch_fs_online(c); + mutex_unlock(&bch_register_lock); + + return err; +} + +static const char *__bch_fs_start(struct cache_set *c) { const char *err = "cannot allocate memory"; struct bch_sb_field_members *mi; @@ -715,11 +744,7 @@ static const char *bch_fs_start(struct cache_set *c) struct jset *j; int ret = -EINVAL; - lockdep_assert_held(&bch_register_lock); - BUG_ON(test_bit(BCH_FS_RUNNING, &c->flags)); - - /* We don't want bch_fatal_error() to free underneath us */ - closure_get(&c->caching); + BUG_ON(c->state != BCH_FS_STARTING); /* * Make sure that each cache object's mi is up to date before @@ -826,22 +851,8 @@ static const char *bch_fs_start(struct cache_set *c) bch_notice(c, "initializing new filesystem"); - err = "unable to allocate journal buckets"; - for_each_cache(ca, c, i) - if (bch_dev_journal_alloc(ca)) { - percpu_ref_put(&ca->ref); - goto err; - } - bch_initial_gc(c, NULL); - /* - * journal_res_get() will crash if called before this has - * set up the journal.pin FIFO and journal.cur pointer: - */ - bch_journal_start(c); - bch_journal_set_replay_done(&c->journal); - err = "error starting allocator thread"; for_each_cache(ca, c, i) if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE && @@ -850,6 +861,20 @@ static const char *bch_fs_start(struct cache_set *c) goto err; } + err = "unable to allocate journal buckets"; + for_each_cache(ca, c, i) + if (bch_dev_journal_alloc(ca)) { + percpu_ref_put(&ca->ref); + goto err; + } + + /* + * journal_res_get() will crash if called before this has + * set up the journal.pin FIFO and journal.cur pointer: + */ + bch_journal_start(c); + bch_journal_set_replay_done(&c->journal); + err = "cannot allocate new btree root"; for (id = 0; id < BTREE_ID_NR; id++) if (bch_btree_root_alloc(c, id, &cl)) { @@ -877,10 +902,14 @@ static const char *bch_fs_start(struct cache_set *c) goto err; } recovery_done: + err = "dynamic fault"; + if (bch_fs_init_fault("fs_start")) + goto err; + if (c->opts.read_only) { - bch_fs_read_only_sync(c); + bch_fs_read_only(c); } else { - err = __bch_fs_read_write(c); + err = bch_fs_read_write(c); if (err) goto err; } @@ -901,27 +930,9 @@ recovery_done: bch_write_super(c); mutex_unlock(&c->sb_lock); - err = "dynamic fault"; - if (bch_fs_init_fault("fs_start")) - goto err; - - err = "error creating kobject"; - if 
(bch_fs_online(c)) - goto err; - - err = "can't bring up blockdev volumes"; - if (bch_blockdev_volumes_start(c)) - goto err; - - bch_debug_init_cache_set(c); - set_bit(BCH_FS_RUNNING, &c->flags); - bch_attach_backing_devs(c); - - bch_notify_fs_read_write(c); err = NULL; out: bch_journal_entries_free(&journal); - closure_put(&c->caching); return err; err: switch (ret) { @@ -955,6 +966,11 @@ err: goto out; } +const char *bch_fs_start(struct cache_set *c) +{ + return __bch_fs_start(c) ?: bch_fs_online(c); +} + static const char *bch_dev_may_add(struct bch_sb *sb, struct cache_set *c) { struct bch_sb_field_members *sb_mi; @@ -999,7 +1015,7 @@ static const char *bch_dev_in_fs(struct bch_sb *sb, struct cache_set *c) return NULL; } -/* Cache device */ +/* Device startup/shutdown, ro/rw: */ bool bch_dev_read_only(struct cache *ca) { @@ -1009,14 +1025,14 @@ bool bch_dev_read_only(struct cache *ca) bdevname(ca->disk_sb.bdev, buf); - lockdep_assert_held(&bch_register_lock); + lockdep_assert_held(&c->state_lock); if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE) return false; if (!bch_dev_may_remove(ca)) { bch_err(c, "required member %s going RO, forcing fs RO", buf); - bch_fs_read_only_sync(c); + bch_fs_read_only(c); } trace_bcache_cache_read_only(ca); @@ -1053,7 +1069,7 @@ bool bch_dev_read_only(struct cache *ca) static const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca) { - lockdep_assert_held(&bch_register_lock); + lockdep_assert_held(&c->state_lock); if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE) return NULL; @@ -1066,12 +1082,11 @@ static const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca) if (bch_dev_allocator_start(ca)) return "error starting allocator thread"; - if (bch_moving_gc_thread_start(ca)) + if (bch_moving_gc_start(ca)) return "error starting moving GC thread"; - bch_dev_group_add(&c->journal.devs, ca); - - wake_up_process(c->tiering_read); + if (bch_tiering_start(c)) + return "error starting tiering thread"; bch_notify_dev_read_write(ca); trace_bcache_cache_read_write_done(ca); @@ -1099,22 +1114,15 @@ const char *bch_dev_read_write(struct cache *ca) return NULL; } -/* - * bch_dev_stop has already returned, so we no longer hold the register - * lock at the point this is called. - */ - void bch_dev_release(struct kobject *kobj) { struct cache *ca = container_of(kobj, struct cache, kobj); - percpu_ref_exit(&ca->ref); kfree(ca); } -static void bch_dev_free_work(struct work_struct *work) +static void bch_dev_free(struct cache *ca) { - struct cache *ca = container_of(work, struct cache, free_work); struct cache_set *c = ca->set; unsigned i; @@ -1131,15 +1139,7 @@ static void bch_dev_free_work(struct work_struct *work) kobject_del(&ca->kobj); bch_free_super(&ca->disk_sb); - - /* - * bch_dev_stop can be called in the middle of initialization - * of the struct cache object. - * As such, not all the sub-structures may be initialized. - * However, they were zeroed when the object was allocated. 
- */ - - bch_journal_free_cache(ca); + bch_dev_journal_exit(ca); free_percpu(ca->sectors_written); bioset_exit(&ca->replica_set); free_percpu(ca->bucket_stats_percpu); @@ -1155,12 +1155,20 @@ static void bch_dev_free_work(struct work_struct *work) for (i = 0; i < RESERVE_NR; i++) free_fifo(&ca->free[i]); + percpu_ref_exit(&ca->ref); kobject_put(&ca->kobj); if (c) kobject_put(&c->kobj); } +static void bch_dev_free_work(struct work_struct *work) +{ + struct cache *ca = container_of(work, struct cache, free_work); + + bch_dev_free(ca); +} + static void bch_dev_percpu_ref_release(struct percpu_ref *ref) { struct cache *ca = container_of(ref, struct cache, ref); @@ -1193,12 +1201,10 @@ static void bch_dev_stop(struct cache *ca) { struct cache_set *c = ca->set; - lockdep_assert_held(&bch_register_lock); + lockdep_assert_held(&c->state_lock); - if (c) { - BUG_ON(rcu_access_pointer(c->cache[ca->dev_idx]) != ca); - rcu_assign_pointer(c->cache[ca->dev_idx], NULL); - } + BUG_ON(rcu_access_pointer(c->cache[ca->dev_idx]) != ca); + rcu_assign_pointer(c->cache[ca->dev_idx], NULL); call_rcu(&ca->free_rcu, bch_dev_free_rcu); } @@ -1281,7 +1287,8 @@ static void bch_dev_remove_work(struct work_struct *work) */ closure_get(&c->cl); - mutex_lock(&bch_register_lock); + mutex_lock(&c->state_lock); + bch_dev_stop(ca); /* @@ -1290,8 +1297,6 @@ static void bch_dev_remove_work(struct work_struct *work) */ synchronize_rcu(); - lockdep_assert_held(&bch_register_lock); - /* * Free this device's slot in the bch_member array - all pointers to * this device must be gone: @@ -1301,23 +1306,20 @@ static void bch_dev_remove_work(struct work_struct *work) memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid)); bch_write_super(c); - mutex_unlock(&c->sb_lock); - mutex_unlock(&bch_register_lock); + mutex_unlock(&c->sb_lock); + mutex_unlock(&c->state_lock); closure_put(&c->cl); } -bool bch_dev_remove(struct cache *ca, bool force) +static bool __bch_dev_remove(struct cache_set *c, struct cache *ca, bool force) { - mutex_lock(&bch_register_lock); - if (test_bit(BCH_DEV_REMOVING, &ca->flags)) return false; if (!bch_dev_may_remove(ca)) { - bch_err(ca->set, "Can't remove last device in tier %u", - ca->mi.tier); + bch_err(ca->set, "Can't remove last RW device"); bch_notify_dev_remove_failed(ca); return false; } @@ -1327,23 +1329,32 @@ bool bch_dev_remove(struct cache *ca, bool force) if (force) set_bit(BCH_DEV_FORCE_REMOVE, &ca->flags); + set_bit(BCH_DEV_REMOVING, &ca->flags); bch_notify_dev_removing(ca); - mutex_unlock(&bch_register_lock); - /* Migrate the data and finish removal asynchronously: */ queue_work(system_long_wq, &ca->remove_work); return true; } +bool bch_dev_remove(struct cache *ca, bool force) +{ + struct cache_set *c = ca->set; + bool ret; + + mutex_lock(&c->state_lock); + ret = __bch_dev_remove(c, ca, force); + mutex_unlock(&c->state_lock); + + return ret; +} + static int bch_dev_online(struct cache *ca) { char buf[12]; - lockdep_assert_held(&bch_register_lock); - sprintf(buf, "cache%u", ca->dev_idx); if (kobject_add(&ca->kobj, @@ -1386,7 +1397,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb, kobject_init(&ca->kobj, &bch_dev_ktype); spin_lock_init(&ca->self.lock); - ca->self.nr_devices = 1; + ca->self.nr = 1; rcu_assign_pointer(ca->self.d[0].dev, ca); ca->dev_idx = sb->sb->dev_idx; @@ -1395,10 +1406,11 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb, spin_lock_init(&ca->freelist_lock); spin_lock_init(&ca->prio_buckets_lock); mutex_init(&ca->heap_lock); - 
bch_moving_init_cache(ca); + bch_dev_moving_gc_init(ca); ca->disk_sb = *sb; - ca->disk_sb.bdev->bd_holder = ca; + if (sb->mode & FMODE_EXCL) + ca->disk_sb.bdev->bd_holder = ca; memset(sb, 0, sizeof(*sb)); INIT_WORK(&ca->io_error_work, bch_nonfatal_io_error_work); @@ -1444,7 +1456,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb, bioset_init(&ca->replica_set, 4, offsetof(struct bch_write_bio, bio)) || !(ca->sectors_written = alloc_percpu(*ca->sectors_written)) || - bch_journal_init_cache(ca)) + bch_dev_journal_init(ca)) goto err; ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca); @@ -1482,7 +1494,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb, err = "error creating kobject"; if (c->kobj.state_in_sysfs && bch_dev_online(ca)) - goto err; + pr_warn("error creating sysfs objects"); if (ret) *ret = ca; @@ -1490,49 +1502,34 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb, kobject_put(&ca->kobj); return NULL; err: - bch_dev_stop(ca); + bch_dev_free(ca); return err; } -static struct cache_set *bch_fs_lookup(uuid_le uuid) -{ - struct cache_set *c; - - lockdep_assert_held(&bch_register_lock); - - list_for_each_entry(c, &bch_fs_list, list) - if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le))) - return c; - - return NULL; -} - int bch_dev_add(struct cache_set *c, const char *path) { struct bcache_superblock sb; const char *err; struct cache *ca; - struct bch_sb_field *f; struct bch_sb_field_members *mi, *dev_mi; struct bch_member saved_mi; unsigned dev_idx, nr_devices, u64s; int ret = -EINVAL; - mutex_lock(&bch_register_lock); - err = bch_read_super(&sb, c->opts, path); if (err) - goto err_unlock_register; + return -EINVAL; err = bch_validate_cache_super(&sb); if (err) - goto err_unlock_register; - - mutex_lock(&c->sb_lock); + return -EINVAL; err = bch_dev_may_add(sb.sb, c); if (err) - goto err_unlock; + return -EINVAL; + + mutex_lock(&c->state_lock); + mutex_lock(&c->sb_lock); /* * Preserve the old cache member information (esp. 
tier) @@ -1571,17 +1568,14 @@ have_slot: sizeof(struct bch_member) * nr_devices) / sizeof(u64); err = "no space in superblock for member info"; - f = bch_fs_sb_field_resize(c, &mi->field, u64s); - if (!f) + mi = bch_fs_sb_resize_members(c, u64s); + if (!mi) goto err_unlock; - mi = container_of(f, struct bch_sb_field_members, field); - - f = bch_dev_sb_field_resize(&sb, &dev_mi->field, u64s); - if (!f) + dev_mi = bch_sb_resize_members(&sb, u64s); + if (!dev_mi) goto err_unlock; - dev_mi = container_of(f, struct bch_sb_field_members, field); memcpy(dev_mi, mi, u64s * sizeof(u64)); dev_mi->members[dev_idx] = saved_mi; @@ -1619,14 +1613,13 @@ have_slot: kobject_put(&ca->kobj); mutex_unlock(&c->sb_lock); - mutex_unlock(&bch_register_lock); + mutex_unlock(&c->state_lock); return 0; err_put: bch_dev_stop(ca); err_unlock: mutex_unlock(&c->sb_lock); -err_unlock_register: - mutex_unlock(&bch_register_lock); + mutex_unlock(&c->state_lock); bch_free_super(&sb); bch_err(c, "Unable to add device: %s", err); @@ -1639,11 +1632,8 @@ const char *bch_fs_open(char * const *devices, unsigned nr_devices, const char *err; struct cache_set *c = NULL; struct bcache_superblock *sb; - uuid_le uuid; unsigned i; - memset(&uuid, 0, sizeof(uuid_le)); - if (!nr_devices) return "need at least one device"; @@ -1655,60 +1645,49 @@ const char *bch_fs_open(char * const *devices, unsigned nr_devices, if (!sb) goto err; - /* - * bch_read_super() needs to happen under register_lock, so that the - * exclusive open is atomic with adding the new cache set to the list of - * cache sets: - */ - mutex_lock(&bch_register_lock); - for (i = 0; i < nr_devices; i++) { err = bch_read_super(&sb[i], opts, devices[i]); if (err) - goto err_unlock; + goto err; err = "attempting to register backing device"; if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version))) - goto err_unlock; + goto err; err = bch_validate_cache_super(&sb[i]); if (err) - goto err_unlock; + goto err; } - err = "cache set already registered"; - if (bch_fs_lookup(sb->sb->uuid)) - goto err_unlock; - err = "cannot allocate memory"; c = bch_fs_alloc(sb[0].sb, opts); if (!c) - goto err_unlock; + goto err; for (i = 0; i < nr_devices; i++) { err = bch_dev_alloc(&sb[i], c, NULL); if (err) - goto err_unlock; + goto err; } err = "insufficient devices"; if (bch_fs_nr_online_devices(c) != bch_fs_nr_devices(c)) - goto err_unlock; + goto err; - err = bch_fs_start(c); - if (err) - goto err_unlock; - - err = "error creating kobject"; - if (bch_fs_online(c)) - goto err_unlock; - - if (ret) { - closure_get(&c->cl); - *ret = c; + if (!c->opts.nostart) { + err = __bch_fs_start(c); + if (err) + goto err; } - mutex_unlock(&bch_register_lock); + err = bch_fs_online(c); + if (err) + goto err; + + if (ret) + *ret = c; + else + closure_put(&c->cl); err = NULL; out: @@ -1717,20 +1696,18 @@ out: if (err) c = NULL; return err; -err_unlock: +err: if (c) bch_fs_stop(c); - mutex_unlock(&bch_register_lock); -err: + for (i = 0; i < nr_devices; i++) bch_free_super(&sb[i]); goto out; } static const char *__bch_fs_open_incremental(struct bcache_superblock *sb, - struct bch_opts opts) + struct bch_opts opts) { - char name[BDEVNAME_SIZE]; const char *err; struct cache_set *c; bool allocated_cache_set = false; @@ -1739,17 +1716,19 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb, if (err) return err; - bdevname(sb->bdev, name); - + mutex_lock(&bch_register_lock); c = bch_fs_lookup(sb->sb->uuid); if (c) { + closure_get(&c->cl); + err = bch_dev_in_fs(sb->sb, c); if (err) - return err; + goto err; 
} else { c = bch_fs_alloc(sb->sb, opts); + err = "cannot allocate memory"; if (!c) - return "cannot allocate memory"; + goto err; allocated_cache_set = true; } @@ -1758,21 +1737,29 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb, if (err) goto err; - if (bch_fs_nr_online_devices(c) == bch_fs_nr_devices(c)) { - err = bch_fs_start(c); + if (bch_fs_nr_online_devices(c) == bch_fs_nr_devices(c) && + !c->opts.nostart) { + err = __bch_fs_start(c); if (err) goto err; - } else { - err = "error creating kobject"; - if (bch_fs_online(c)) - goto err; } - bch_info(c, "started"); + err = __bch_fs_online(c); + if (err) + goto err; + + closure_put(&c->cl); + mutex_unlock(&bch_register_lock); + return NULL; err: + mutex_unlock(&bch_register_lock); + if (allocated_cache_set) bch_fs_stop(c); + else if (c) + closure_put(&c->cl); + return err; } @@ -1782,20 +1769,20 @@ const char *bch_fs_open_incremental(const char *path) struct bch_opts opts = bch_opts_empty(); const char *err; - mutex_lock(&bch_register_lock); - err = bch_read_super(&sb, opts, path); if (err) - goto err; + return err; - if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version))) + if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version))) { + mutex_lock(&bch_register_lock); err = bch_backing_dev_register(&sb); - else + mutex_unlock(&bch_register_lock); + } else { err = __bch_fs_open_incremental(&sb, opts); + } bch_free_super(&sb); -err: - mutex_unlock(&bch_register_lock); + return err; } @@ -1854,10 +1841,10 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) pr_info("Setting all devices read only:"); list_for_each_entry(c, &bch_fs_list, list) - bch_fs_read_only(c); + bch_fs_read_only_async(c); list_for_each_entry(c, &bch_fs_list, list) - bch_fs_read_only_sync(c); + bch_fs_read_only(c); mutex_unlock(&bch_register_lock); } @@ -1882,7 +1869,7 @@ kobj_attribute_write(reboot, reboot_test); static void bcache_exit(void) { bch_debug_exit(); - bch_fs_exit(); + bch_vfs_exit(); bch_blockdev_exit(); bch_chardev_exit(); if (bcache_kset) @@ -1917,7 +1904,7 @@ static int __init bcache_init(void) sysfs_create_files(&bcache_kset->kobj, files) || bch_chardev_init() || bch_blockdev_init() || - bch_fs_init() || + bch_vfs_init() || bch_debug_init()) goto err; diff --git a/libbcache/super.h b/libbcache/super.h index bcf7d983..bafd88e0 100644 --- a/libbcache/super.h +++ b/libbcache/super.h @@ -57,27 +57,11 @@ static inline struct cache *bch_get_next_cache(struct cache_set *c, static inline bool bch_dev_may_remove(struct cache *ca) { struct cache_set *c = ca->set; - struct cache_group *tier = &c->cache_tiers[ca->mi.tier]; + struct cache_group *grp = &c->cache_all; - /* - * Right now, we can't remove the last device from a tier, - * - For tier 0, because all metadata lives in tier 0 and because - * there is no way to have foreground writes go directly to tier 1. - * - For tier 1, because the code doesn't completely support an - * empty tier 1. - */ - - /* - * Turning a device read-only removes it from the cache group, - * so there may only be one read-write device in a tier, and yet - * the device we are removing is in the same tier, so we have - * to check for identity. - * Removing the last RW device from a tier requires turning the - * whole cache set RO. 
- */ - - return tier->nr_devices != 1 || - rcu_access_pointer(tier->d[0].dev) != ca; + /* Can't remove the last RW device: */ + return grp->nr != 1 || + rcu_access_pointer(grp->d[0].dev) != ca; } void bch_dev_release(struct kobject *); @@ -89,15 +73,15 @@ int bch_dev_add(struct cache_set *, const char *); void bch_fs_detach(struct cache_set *); -bool bch_fs_read_only(struct cache_set *); bool bch_fs_emergency_read_only(struct cache_set *); -void bch_fs_read_only_sync(struct cache_set *); +void bch_fs_read_only(struct cache_set *); const char *bch_fs_read_write(struct cache_set *); void bch_fs_release(struct kobject *); +void bch_fs_stop_async(struct cache_set *); void bch_fs_stop(struct cache_set *); -void bch_fs_stop_sync(struct cache_set *); +const char *bch_fs_start(struct cache_set *); const char *bch_fs_open(char * const *, unsigned, struct bch_opts, struct cache_set **); const char *bch_fs_open_incremental(const char *path); diff --git a/libbcache/super_types.h b/libbcache/super_types.h index 41eaf0dd..69c747de 100644 --- a/libbcache/super_types.h +++ b/libbcache/super_types.h @@ -6,6 +6,7 @@ struct bcache_superblock { struct block_device *bdev; struct bio *bio; unsigned page_order; + fmode_t mode; }; #endif /* _BCACHE_SUPER_TYPES_H */ diff --git a/libbcache/sysfs.c b/libbcache/sysfs.c index 9f45a6b0..48f9f1f6 100644 --- a/libbcache/sysfs.c +++ b/libbcache/sysfs.c @@ -22,6 +22,7 @@ #include "opts.h" #include "request.h" #include "super-io.h" +#include "tier.h" #include "writeback.h" #include <linux/blkdev.h> @@ -121,6 +122,8 @@ rw_attribute(cache_replacement_policy); rw_attribute(foreground_write_ratelimit_enabled); rw_attribute(copy_gc_enabled); sysfs_pd_controller_attribute(copy_gc); + +rw_attribute(tier); rw_attribute(tiering_enabled); rw_attribute(tiering_percent); sysfs_pd_controller_attribute(tiering); @@ -134,7 +137,6 @@ rw_attribute(foreground_target_percent); rw_attribute(size); read_attribute(meta_replicas_have); read_attribute(data_replicas_have); -read_attribute(tier); #define BCH_DEBUG_PARAM(name, description) \ rw_attribute(name); @@ -680,7 +682,8 @@ SHOW(bch_fs) sysfs_printf(tiering_enabled, "%i", c->tiering_enabled); sysfs_print(tiering_percent, c->tiering_percent); - sysfs_pd_controller_show(tiering, &c->tiering_pd); + + sysfs_pd_controller_show(tiering, &c->tiers[1].pd); /* XXX */ sysfs_printf(meta_replicas_have, "%u", c->sb.meta_replicas_have); sysfs_printf(data_replicas_have, "%u", c->sb.data_replicas_have); @@ -694,7 +697,7 @@ SHOW(bch_fs) BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM - if (!test_bit(BCH_FS_RUNNING, &c->flags)) + if (!bch_fs_running(c)) return -EPERM; if (attr == &sysfs_bset_tree_stats) @@ -723,7 +726,7 @@ STORE(__bch_fs) } if (attr == &sysfs_stop) { - bch_fs_stop(c); + bch_fs_stop_async(c); return size; } @@ -773,25 +776,18 @@ STORE(__bch_fs) ssize_t ret = strtoul_safe(buf, c->tiering_enabled) ?: (ssize_t) size; - if (c->tiering_read) - wake_up_process(c->tiering_read); + bch_tiering_start(c); /* issue wakeups */ return ret; } sysfs_pd_controller_store(foreground_write, &c->foreground_write_pd); - if (attr == &sysfs_journal_flush) { - bch_journal_meta_async(&c->journal, NULL); - - return size; - } - sysfs_strtoul(pd_controllers_update_seconds, c->pd_controllers_update_seconds); sysfs_strtoul(foreground_target_percent, c->foreground_target_percent); sysfs_strtoul(tiering_percent, c->tiering_percent); - sysfs_pd_controller_store(tiering, &c->tiering_pd); + sysfs_pd_controller_store(tiering, &c->tiers[1].pd); /* XXX */ /* Debugging: */ @@ -799,11 
+795,14 @@ STORE(__bch_fs) BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM - if (!test_bit(BCH_FS_RUNNING, &c->flags)) + if (!bch_fs_running(c)) return -EPERM; - if (test_bit(BCH_FS_STOPPING, &c->flags)) - return -EINTR; + if (attr == &sysfs_journal_flush) { + bch_journal_meta_async(&c->journal, NULL); + + return size; + } if (attr == &sysfs_blockdev_volume_create) { u64 v = strtoi_h_or_return(buf); @@ -836,9 +835,9 @@ STORE(bch_fs) { struct cache_set *c = container_of(kobj, struct cache_set, kobj); - mutex_lock(&bch_register_lock); + mutex_lock(&c->state_lock); size = __bch_fs_store(kobj, attr, buf, size); - mutex_unlock(&bch_register_lock); + mutex_unlock(&c->state_lock); if (attr == &sysfs_add_device) { char *path = kstrdup(buf, GFP_KERNEL); @@ -1273,6 +1272,31 @@ STORE(__bch_dev) mutex_unlock(&c->sb_lock); } + if (attr == &sysfs_tier) { + unsigned prev_tier; + unsigned v = strtoul_restrict_or_return(buf, + 0, BCH_TIER_MAX - 1); + + mutex_lock(&c->sb_lock); + prev_tier = ca->mi.tier; + + if (v == ca->mi.tier) { + mutex_unlock(&c->sb_lock); + return size; + } + + mi = &bch_sb_get_members(c->disk_sb)->members[ca->dev_idx]; + SET_BCH_MEMBER_TIER(mi, v); + bch_write_super(c); + + bch_dev_group_remove(&c->tiers[prev_tier].devs, ca); + bch_dev_group_add(&c->tiers[ca->mi.tier].devs, ca); + mutex_unlock(&c->sb_lock); + + bch_recalc_capacity(c); + bch_tiering_start(c); + } + if (attr == &sysfs_state_rw) { char name[BDEVNAME_SIZE]; const char *err = NULL; diff --git a/libbcache/tier.c b/libbcache/tier.c index 46864594..0ab17708 100644 --- a/libbcache/tier.c +++ b/libbcache/tier.c @@ -16,8 +16,7 @@ #include <trace/events/bcache.h> struct tiering_state { - struct cache_group *tier; - unsigned tier_idx; + struct bch_tier *tier; unsigned sectors; unsigned stripe_size; unsigned dev_idx; @@ -42,7 +41,7 @@ static bool tiering_pred(struct cache_set *c, mi = cache_member_info_get(c); extent_for_each_ptr(e, ptr) if (ptr->dev < mi->nr_devices && - mi->m[ptr->dev].tier >= s->tier_idx) + mi->m[ptr->dev].tier >= s->tier->idx) replicas++; cache_member_info_put(); @@ -69,15 +68,15 @@ static void tier_next_device(struct cache_set *c, struct tiering_state *s) s->sectors = 0; s->dev_idx++; - spin_lock(&s->tier->lock); - if (s->dev_idx >= s->tier->nr_devices) + spin_lock(&s->tier->devs.lock); + if (s->dev_idx >= s->tier->devs.nr) s->dev_idx = 0; - if (s->tier->nr_devices) { - s->ca = s->tier->d[s->dev_idx].dev; + if (s->tier->devs.nr) { + s->ca = s->tier->devs.d[s->dev_idx].dev; percpu_ref_get(&s->ca->ref); } - spin_unlock(&s->tier->lock); + spin_unlock(&s->tier->devs.lock); } } @@ -103,13 +102,13 @@ static int issue_tiering_move(struct cache_set *c, * tiering_next_cache - issue a move to write an extent to the next cache * device in round robin order */ -static s64 read_tiering(struct cache_set *c, struct cache_group *tier) +static s64 read_tiering(struct cache_set *c, struct bch_tier *tier) { struct moving_context ctxt; struct tiering_state s; struct btree_iter iter; struct bkey_s_c k; - unsigned nr_devices = READ_ONCE(tier->nr_devices); + unsigned nr_devices = READ_ONCE(tier->devs.nr); int ret; if (!nr_devices) @@ -119,10 +118,9 @@ static s64 read_tiering(struct cache_set *c, struct cache_group *tier) memset(&s, 0, sizeof(s)); s.tier = tier; - s.tier_idx = tier - c->cache_tiers; s.stripe_size = 2048; /* 1 mb for now */ - bch_move_ctxt_init(&ctxt, &c->tiering_pd.rate, + bch_move_ctxt_init(&ctxt, &tier->pd.rate, nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE); bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); @@ 
-164,8 +162,8 @@ next: static int bch_tiering_thread(void *arg) { - struct cache_set *c = arg; - struct cache_group *tier = &c->cache_tiers[1]; + struct bch_tier *tier = arg; + struct cache_set *c = container_of(tier, struct cache_set, tiers[tier->idx]); struct io_clock *clock = &c->io_clock[WRITE]; struct cache *ca; u64 tier_capacity, available_sectors; @@ -176,20 +174,20 @@ static int bch_tiering_thread(void *arg) while (!kthread_should_stop()) { if (kthread_wait_freezable(c->tiering_enabled && - tier->nr_devices)) + tier->devs.nr)) break; while (1) { - struct cache_group *faster_tier; + struct bch_tier *faster_tier; last = atomic_long_read(&clock->now); tier_capacity = available_sectors = 0; rcu_read_lock(); - for (faster_tier = c->cache_tiers; + for (faster_tier = c->tiers; faster_tier != tier; faster_tier++) { - group_for_each_cache_rcu(ca, faster_tier, i) { + group_for_each_cache_rcu(ca, &faster_tier->devs, i) { tier_capacity += (ca->mi.nbuckets - ca->mi.first_bucket) << ca->bucket_bits; @@ -216,32 +214,73 @@ static int bch_tiering_thread(void *arg) return 0; } -void bch_tiering_init_cache_set(struct cache_set *c) +static void __bch_tiering_stop(struct bch_tier *tier) { - bch_pd_controller_init(&c->tiering_pd); + tier->pd.rate.rate = UINT_MAX; + bch_ratelimit_reset(&tier->pd.rate); + + if (tier->migrate) + kthread_stop(tier->migrate); + + tier->migrate = NULL; } -int bch_tiering_read_start(struct cache_set *c) +void bch_tiering_stop(struct cache_set *c) { - struct task_struct *t; + struct bch_tier *tier; + + for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) + __bch_tiering_stop(tier); +} + +static int __bch_tiering_start(struct bch_tier *tier) +{ + if (!tier->migrate) { + struct task_struct *p = + kthread_create(bch_tiering_thread, tier, + "bch_tier[%u]", tier->idx); + if (IS_ERR(p)) + return PTR_ERR(p); + + tier->migrate = p; + } + + wake_up_process(tier->migrate); + return 0; +} + +int bch_tiering_start(struct cache_set *c) +{ + struct bch_tier *tier; + bool have_faster_tier = false; if (c->opts.nochanges) return 0; - t = kthread_create(bch_tiering_thread, c, "bch_tier_read"); - if (IS_ERR(t)) - return PTR_ERR(t); + for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) { + if (!tier->devs.nr) + continue; - c->tiering_read = t; - wake_up_process(c->tiering_read); + if (have_faster_tier) { + int ret = __bch_tiering_start(tier); + if (ret) + return ret; + } else { + __bch_tiering_stop(tier); + } + + have_faster_tier = true; + } return 0; } -void bch_tiering_read_stop(struct cache_set *c) +void bch_fs_tiering_init(struct cache_set *c) { - if (!IS_ERR_OR_NULL(c->tiering_read)) { - kthread_stop(c->tiering_read); - c->tiering_read = NULL; + unsigned i; + + for (i = 0; i < ARRAY_SIZE(c->tiers); i++) { + c->tiers[i].idx = i; + bch_pd_controller_init(&c->tiers[i].pd); } } diff --git a/libbcache/tier.h b/libbcache/tier.h index 89c2bffd..b53e83d9 100644 --- a/libbcache/tier.h +++ b/libbcache/tier.h @@ -1,8 +1,8 @@ #ifndef _BCACHE_TIER_H #define _BCACHE_TIER_H -void bch_tiering_init_cache_set(struct cache_set *); -int bch_tiering_read_start(struct cache_set *); -void bch_tiering_read_stop(struct cache_set *); +void bch_tiering_stop(struct cache_set *); +int bch_tiering_start(struct cache_set *); +void bch_fs_tiering_init(struct cache_set *); #endif diff --git a/linux/blkdev.c b/linux/blkdev.c index 0bae9b0d..93459d0b 100644 --- a/linux/blkdev.c +++ b/linux/blkdev.c @@ -20,8 +20,14 @@ int submit_bio_wait(struct bio *bio) ssize_t ret; unsigned i; - if 
(bio->bi_opf & REQ_PREFLUSH) - fdatasync(bio->bi_bdev->bd_fd); + if (bio->bi_opf & REQ_PREFLUSH) { + ret = fdatasync(bio->bi_bdev->bd_fd); + if (ret) { + fprintf(stderr, "fsync error: %s\n", + strerror(errno)); + return -EIO; + } + } i = 0; bio_for_each_segment(bv, bio, iter) @@ -49,10 +55,22 @@ int submit_bio_wait(struct bio *bio) BUG(); } - if (bio->bi_opf & REQ_FUA) - fdatasync(bio->bi_bdev->bd_fd); + if (ret != bio->bi_iter.bi_size) { + fprintf(stderr, "IO error: %li (%s)\n", + ret, strerror(errno)); + return -EIO; + } - return ret == bio->bi_iter.bi_size ? 0 : -EIO; + if (bio->bi_opf & REQ_FUA) { + ret = fdatasync(bio->bi_bdev->bd_fd); + if (ret) { + fprintf(stderr, "fsync error: %s\n", + strerror(errno)); + return -EIO; + } + } + + return 0; } void generic_make_request(struct bio *bio) diff --git a/qcow2.c b/qcow2.c index cbc8d4c4..b7aa8c26 100644 --- a/qcow2.c +++ b/qcow2.c @@ -2,7 +2,6 @@ #include <errno.h> #include <sys/types.h> #include <unistd.h> -#include <linux/sort.h> #include "qcow2.h" #include "tools-util.h" @@ -69,18 +68,7 @@ static void add_l2(struct qcow2_image *img, u64 src_blk, u64 dst_offset) img->l2_table[l2_index] = cpu_to_be64(dst_offset|QCOW_OFLAG_COPIED); } -static int range_cmp(const void *_l, const void *_r) -{ - const struct range *l = _l, *r = _r; - - if (l->start < r->start) - return -1; - if (l->start > r->start) - return 1; - return 0; -} - -void qcow2_write_image(int infd, int outfd, sparse_data *data, +void qcow2_write_image(int infd, int outfd, ranges *data, unsigned block_size) { u64 image_size = get_size(NULL, infd); @@ -98,30 +86,11 @@ void qcow2_write_image(int infd, int outfd, sparse_data *data, struct range *r; char *buf = xmalloc(block_size); u64 src_offset, dst_offset; - sparse_data m; assert(is_power_of_2(block_size)); - sort(&darray_item(*data, 0), - darray_size(*data), - sizeof(darray_item(*data, 0)), - range_cmp, NULL); - - /* Round to blocksize, merge contiguous ranges: */ - darray_init(m); - darray_foreach(r, *data) { - struct range *l = m.size ? 
&m.item[m.size - 1] : NULL; - - r->start = round_down(r->start, block_size); - r->end = round_up(r->end, block_size); - - if (l && l->end >= r->start) - l->end = max(l->end, r->end); - else - darray_append(m, *r); - } - darray_free(*data); - *data = m; + ranges_roundup(data, block_size); + ranges_sort_merge(data); /* Write data: */ darray_foreach(r, *data) diff --git a/qcow2.h b/qcow2.h index c6f0b6ba..0943d55c 100644 --- a/qcow2.h +++ b/qcow2.h @@ -2,23 +2,8 @@ #define _QCOW2_H #include <linux/types.h> -#include "ccan/darray/darray.h" +#include "tools-util.h" -struct range { - u64 start; - u64 end; -}; - -typedef darray(struct range) sparse_data; - -static inline void data_add(sparse_data *data, u64 offset, u64 size) -{ - darray_append(*data, (struct range) { - .start = offset, - .end = offset + size - }); -} - -void qcow2_write_image(int, int, sparse_data *, unsigned); +void qcow2_write_image(int, int, ranges *, unsigned); #endif /* _QCOW2_H */ diff --git a/tools-util.c b/tools-util.c index 0a95fbe9..07fb82d1 100644 --- a/tools-util.c +++ b/tools-util.c @@ -1,4 +1,3 @@ -#include <alloca.h> #include <assert.h> #include <ctype.h> #include <errno.h> @@ -19,6 +18,7 @@ #include "ccan/crc/crc.h" #include "linux/bcache-ioctl.h" +#include "linux/sort.h" #include "tools-util.h" #include "util.h" @@ -59,20 +59,12 @@ struct units_buf __pr_units(u64 v, enum units units) char *read_file_str(int dirfd, const char *path) { - int fd = openat(dirfd, path, O_RDONLY); + int fd = xopenat(dirfd, path, O_RDONLY); + size_t len = xfstat(fd).st_size; - if (fd < 0) - die("Unable to open %s\n", path); + char *buf = malloc(len + 1); - struct stat statbuf; - if (fstat(fd, &statbuf) < 0) - die("fstat error\n"); - - char *buf = malloc(statbuf.st_size + 1); - - int len = read(fd, buf, statbuf.st_size); - if (len < 0) - die("read error while reading from file %s\n", path); + xpread(fd, buf, len, 0); buf[len] = '\0'; if (len && buf[len - 1] == '\n') @@ -107,48 +99,33 @@ ssize_t read_string_list_or_die(const char *opt, const char * const list[], /* Returns size of file or block device: */ u64 get_size(const char *path, int fd) { - struct stat statbuf; - u64 ret; - - if (fstat(fd, &statbuf)) - die("Error statting %s: %s", path, strerror(errno)); + struct stat statbuf = xfstat(fd); if (!S_ISBLK(statbuf.st_mode)) return statbuf.st_size; - if (ioctl(fd, BLKGETSIZE64, &ret)) - die("Error getting block device size on %s: %s\n", - path, strerror(errno)); - + u64 ret; + xioctl(fd, BLKGETSIZE64, &ret); return ret; } /* Returns blocksize in units of 512 byte sectors: */ unsigned get_blocksize(const char *path, int fd) { - struct stat statbuf; - if (fstat(fd, &statbuf)) - die("Error statting %s: %s", path, strerror(errno)); + struct stat statbuf = xfstat(fd); if (!S_ISBLK(statbuf.st_mode)) return statbuf.st_blksize >> 9; unsigned ret; - if (ioctl(fd, BLKPBSZGET, &ret)) - die("Error getting blocksize on %s: %s\n", - path, strerror(errno)); - + xioctl(fd, BLKPBSZGET, &ret); return ret >> 9; } /* Global control device: */ int bcachectl_open(void) { - int fd = open("/dev/bcache-ctl", O_RDWR); - if (fd < 0) - die("Can't open bcache device: %s", strerror(errno)); - - return fd; + return xopen("/dev/bcache-ctl", O_RDWR); } /* Filesystem handles (ioctl, sysfs dir): */ @@ -162,47 +139,29 @@ struct bcache_handle bcache_fs_open(const char *path) if (!uuid_parse(path, tmp)) { /* It's a UUID, look it up in sysfs: */ - - char *sysfs = alloca(strlen(SYSFS_BASE) + strlen(path) + 1); - sprintf(sysfs, "%s%s", SYSFS_BASE, path); - - ret.sysfs_fd = 
open(sysfs, O_RDONLY); - if (!ret.sysfs_fd) - die("Unable to open %s\n", path); + char *sysfs = mprintf("%s%s", SYSFS_BASE, path); + ret.sysfs_fd = xopen(sysfs, O_RDONLY); char *minor = read_file_str(ret.sysfs_fd, "minor"); - char *ctl = alloca(20 + strlen(minor)); + char *ctl = mprintf("/dev/bcache%s-ctl", minor); + ret.ioctl_fd = xopen(ctl, O_RDWR); - sprintf(ctl, "/dev/bcache%s-ctl", minor); + free(sysfs); free(minor); - - ret.ioctl_fd = open(ctl, O_RDWR); - if (ret.ioctl_fd < 0) - die("Error opening control device: %s\n", - strerror(errno)); + free(ctl); } else { /* It's a path: */ - - ret.ioctl_fd = open(path, O_RDONLY); - if (ret.ioctl_fd < 0) - die("Error opening %s: %s\n", - path, strerror(errno)); + ret.ioctl_fd = xopen(path, O_RDONLY); struct bch_ioctl_query_uuid uuid; - if (ioctl(ret.ioctl_fd, BCH_IOCTL_QUERY_UUID, &uuid)) - die("ioctl error (not a bcache fs?): %s\n", - strerror(errno)); + xioctl(ret.ioctl_fd, BCH_IOCTL_QUERY_UUID, &uuid); char uuid_str[40]; uuid_unparse(uuid.uuid.b, uuid_str); - char *sysfs = alloca(strlen(SYSFS_BASE) + strlen(uuid_str) + 1); - sprintf(sysfs, "%s%s", SYSFS_BASE, uuid_str); - - ret.sysfs_fd = open(sysfs, O_RDONLY); - if (ret.sysfs_fd < 0) - die("Unable to open sysfs dir %s: %s\n", - sysfs, strerror(errno)); + char *sysfs = mprintf("%s%s", SYSFS_BASE, uuid_str); + ret.sysfs_fd = xopen(sysfs, O_RDONLY); + free(sysfs); } return ret; @@ -225,3 +184,89 @@ bool ask_yn(void) free(buf); return ret; } + +static int range_cmp(const void *_l, const void *_r) +{ + const struct range *l = _l, *r = _r; + + if (l->start < r->start) + return -1; + if (l->start > r->start) + return 1; + return 0; +} + +void ranges_sort_merge(ranges *r) +{ + struct range *t, *i; + ranges tmp = { NULL }; + + sort(&darray_item(*r, 0), darray_size(*r), + sizeof(darray_item(*r, 0)), range_cmp, NULL); + + /* Merge contiguous ranges: */ + darray_foreach(i, *r) { + t = tmp.size ? &tmp.item[tmp.size - 1] : NULL; + + if (t && t->end >= i->start) + t->end = max(t->end, i->end); + else + darray_append(tmp, *i); + } + + darray_free(*r); + *r = tmp; +} + +void ranges_roundup(ranges *r, unsigned block_size) +{ + struct range *i; + + darray_foreach(i, *r) { + i->start = round_down(i->start, block_size); + i->end = round_up(i->end, block_size); + } +} + +void ranges_rounddown(ranges *r, unsigned block_size) +{ + struct range *i; + + darray_foreach(i, *r) { + i->start = round_up(i->start, block_size); + i->end = round_down(i->end, block_size); + i->end = max(i->end, i->start); + } +} + +struct fiemap_extent fiemap_iter_next(struct fiemap_iter *iter) +{ + struct fiemap_extent e; + + BUG_ON(iter->idx > iter->f.fm_mapped_extents); + + if (iter->idx == iter->f.fm_mapped_extents) { + xioctl(iter->fd, FS_IOC_FIEMAP, &iter->f); + + if (!iter->f.fm_mapped_extents) + return (struct fiemap_extent) { .fe_length = 0 }; + + iter->idx = 0; + } + + e = iter->f.fm_extents[iter->idx++]; + BUG_ON(!e.fe_length); + + iter->f.fm_start = e.fe_logical + e.fe_length; + + return e; +} + +const char *strcmp_prefix(const char *a, const char *a_prefix) +{ + while (*a_prefix && *a == *a_prefix) { + a++; + a_prefix++; + } + return *a_prefix ? 
NULL : a; +} diff --git a/tools-util.h b/tools-util.h index 09f00efe..1aac56ae 100644 --- a/tools-util.h +++ b/tools-util.h @@ -5,21 +5,31 @@ #include <stdbool.h> #include <stdio.h> #include <stdlib.h> +#include <sys/stat.h> #include <sys/types.h> #include <unistd.h> +#include <linux/bug.h> #include <linux/byteorder.h> #include <linux/kernel.h> #include <linux/log2.h> #include <linux/string.h> #include <linux/types.h> +#include "ccan/darray/darray.h" -#define die(arg, ...) \ -do { \ - fprintf(stderr, arg "\n", ##__VA_ARGS__); \ - exit(EXIT_FAILURE); \ +#define die(arg, ...) \ +do { \ + fprintf(stderr, arg "\n", ##__VA_ARGS__); \ + exit(EXIT_FAILURE); \ } while (0) +#define mprintf(...) \ +({ \ + char *_str; \ + asprintf(&_str, __VA_ARGS__); \ + _str; \ +}) + static inline void *xcalloc(size_t count, size_t size) { void *p = calloc(count, size); @@ -57,6 +67,38 @@ static inline void xpwrite(int fd, const void *buf, size_t count, off_t offset) die("write error (ret %zi err %s)", r, strerror(errno)); } +#define xopenat(_dirfd, _path, ...) \ +({ \ + int _fd = openat((_dirfd), (_path), __VA_ARGS__); \ + if (_fd < 0) \ + die("Error opening %s: %s", (_path), strerror(errno)); \ + _fd; \ +}) + +#define xopen(...) xopenat(AT_FDCWD, __VA_ARGS__) + +static inline struct stat xfstatat(int dirfd, const char *path, int flags) +{ + struct stat stat; + if (fstatat(dirfd, path, &stat, flags)) + die("stat error: %s", strerror(errno)); + return stat; +} + +static inline struct stat xfstat(int fd) +{ + struct stat stat; + if (fstat(fd, &stat)) + die("stat error: %s", strerror(errno)); + return stat; +} + +#define xioctl(_fd, _nr, ...) \ +do { \ + if (ioctl((_fd), (_nr), ##__VA_ARGS__)) \ + die(#_nr " ioctl error: %s", strerror(errno)); \ +} while (0) + enum units { BYTES, SECTORS, @@ -91,4 +133,74 @@ struct bcache_handle bcache_fs_open(const char *); bool ask_yn(void); +struct range { + u64 start; + u64 end; +}; + +typedef darray(struct range) ranges; + +static inline void range_add(ranges *data, u64 offset, u64 size) +{ + darray_append(*data, (struct range) { + .start = offset, + .end = offset + size + }); +} + +void ranges_sort_merge(ranges *); +void ranges_roundup(ranges *, unsigned); +void ranges_rounddown(ranges *, unsigned); + +struct hole_iter { + ranges r; + size_t idx; + u64 end; +}; + +static inline struct range hole_iter_next(struct hole_iter *iter) +{ + struct range r = { + .start = iter->idx ? iter->r.item[iter->idx - 1].end : 0, + .end = iter->idx < iter->r.size + ? iter->r.item[iter->idx].start : iter->end, + }; + + BUG_ON(r.start > r.end); + + iter->idx++; + return r; +} + +#define for_each_hole(_iter, _ranges, _end, _i) \ + for (_iter = (struct hole_iter) { .r = _ranges, .end = _end }; \ + (_iter.idx <= _iter.r.size && \ + (_i = hole_iter_next(&_iter), true));) + +#include <linux/fiemap.h> + +struct fiemap_iter { + struct fiemap f; + struct fiemap_extent fe[1024]; + unsigned idx; + int fd; +}; + +static inline void fiemap_iter_init(struct fiemap_iter *iter, int fd) +{ + memset(iter, 0, sizeof(*iter)); + + iter->f.fm_extent_count = ARRAY_SIZE(iter->fe); + iter->f.fm_length = FIEMAP_MAX_OFFSET; + iter->fd = fd; +} + +struct fiemap_extent fiemap_iter_next(struct fiemap_iter *); + +#define fiemap_for_each(fd, iter, extent) \ + for (fiemap_iter_init(&iter, fd); \ + (extent = fiemap_iter_next(&iter)).fe_length;) + +const char *strcmp_prefix(const char *, const char *); + #endif /* _TOOLS_UTIL_H */
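
Illustrative sketch (not part of the patch above): the ranges/hole-iterator and fiemap helpers that this patch adds to tools-util.h compose roughly as follows. The function name print_holes, the variable names, and the standalone framing are invented for the example; the helpers it calls (xopen, get_size, darray_init/darray_free, range_add, ranges_sort_merge, fiemap_for_each, for_each_hole) are the ones defined in the patch.

/*
 * Hypothetical example, not added by this patch: list a file's
 * unallocated ranges using the new tools-util.h helpers.
 */
#include <fcntl.h>
#include <stdio.h>

#include "tools-util.h"

static void print_holes(const char *path)
{
	int fd = xopen(path, O_RDONLY);
	u64 size = get_size(path, fd);

	ranges extents;
	darray_init(extents);

	struct fiemap_iter fiter;
	struct fiemap_extent e;

	/* Collect the file's mapped extents (requires FIEMAP support): */
	fiemap_for_each(fd, fiter, e)
		range_add(&extents, e.fe_logical, e.fe_length);

	/* Sort by start offset, merge contiguous/overlapping extents: */
	ranges_sort_merge(&extents);

	/* Everything between merged extents, and past the last one, is a hole: */
	struct hole_iter hiter;
	struct range hole;

	for_each_hole(hiter, extents, size, hole)
		if (hole.end > hole.start)
			printf("hole: %llu-%llu\n",
			       (unsigned long long) hole.start,
			       (unsigned long long) hole.end);

	darray_free(extents);
	close(fd);
}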