commit a17f7bcec7 (parent 171ee48e57)

    cmd_migrate

Changed files:

    .bcache_revision, Makefile, bcache.c, cmd_debug.c, cmd_device.c,
    cmd_format.c, cmd_fsck.c, cmd_key.c, cmd_migrate.c (new), cmd_run.c,
    cmds.h, crypto.c, crypto.h, include/linux/, libbcache.c, libbcache.h,
    libbcache/ (alloc.c, alloc.h, alloc_types.h, bcache.h, blockdev.c,
    btree_cache.c, btree_cache.h, btree_gc.c, buckets.c, buckets.h,
    buckets_types.h, chardev.c, checksum.c, checksum.h, compress.c,
    compress.h, debug.c, debug.h, error.c, extents.c, fs-gc.c, fs.c, fs.h,
    io.c, journal.c, journal.h, movinggc.c, movinggc.h, opts.h, super-io.c,
    super-io.h, super.c, super.h, super_types.h, sysfs.c, tier.c, tier.h),
    linux/, qcow2.c, qcow2.h, tools-util.c, tools-util.h

.bcache_revision

@@ -1 +1 @@
-BCACHE_REVISION=aa4471ac314a1f117957f9fc59c1bfbdf965a28c
+BCACHE_REVISION=c1f1a9e1d9b9664db9c9c03cbac455c2750335bc

Makefile (1 line changed)

@@ -56,6 +56,7 @@ OBJS=bcache.o \
 	cmd_fsck.o \
 	cmd_format.o \
 	cmd_key.o \
+	cmd_migrate.o \
 	cmd_run.o \
 	crypto.o \
 	libbcache.o \

bcache.c (12 lines changed)

@@ -50,7 +50,12 @@ static void usage(void)
 	     "\n"
 	     "Debug:\n"
 	     "  bcache dump                  Dump filesystem metadata to a qcow2 image\n"
-	     "  bcache list                  List filesystem metadata in textual form\n");
+	     "  bcache list                  List filesystem metadata in textual form\n"
+	     "\n"
+	     "Migrate:\n"
+	     "  bcache migrate               Migrate an existing filesystem to bcachefs, in place\n"
+	     "  bcache migrate_superblock\n"
+	     "                               Add default superblock, after bcache migrate\n");
 }

 int main(int argc, char *argv[])

@@ -104,6 +109,11 @@ int main(int argc, char *argv[])
 	if (!strcmp(cmd, "list"))
 		return cmd_list(argc, argv);

+	if (!strcmp(cmd, "migrate"))
+		return cmd_migrate(argc, argv);
+	if (!strcmp(cmd, "migrate_superblock"))
+		return cmd_migrate_superblock(argc, argv);
+
 	usage();
 	return 0;
 }

cmd_debug.c (75 lines changed)

@@ -30,35 +30,35 @@ static void dump_usage(void)
 static void dump_one_device(struct cache_set *c, struct cache *ca, int fd)
 {
 	struct bch_sb *sb = ca->disk_sb.sb;
-	sparse_data data;
+	ranges data;
 	unsigned i;

 	darray_init(data);

 	/* Superblock: */
-	data_add(&data, BCH_SB_LAYOUT_SECTOR << 9,
+	range_add(&data, BCH_SB_LAYOUT_SECTOR << 9,
 		 sizeof(struct bch_sb_layout));

 	for (i = 0; i < sb->layout.nr_superblocks; i++)
-		data_add(&data,
+		range_add(&data,
 			 le64_to_cpu(sb->layout.sb_offset[i]) << 9,
 			 vstruct_bytes(sb));

 	/* Journal: */
 	for (i = 0; i < ca->journal.nr; i++)
 		if (ca->journal.bucket_seq[i] >= c->journal.last_seq_ondisk) {
 			u64 bucket = ca->journal.buckets[i];

-			data_add(&data,
+			range_add(&data,
 				 bucket_bytes(ca) * bucket,
 				 bucket_bytes(ca));
 		}

 	/* Prios/gens: */
 	for (i = 0; i < prio_buckets(ca); i++)
-		data_add(&data,
+		range_add(&data,
 			 bucket_bytes(ca) * ca->prio_last_buckets[i],
 			 bucket_bytes(ca));

 	/* Btree: */
 	for (i = 0; i < BTREE_ID_NR; i++) {

@@ -71,9 +71,9 @@ static void dump_one_device(struct cache_set *c, struct cache *ca, int fd)

 		extent_for_each_ptr(e, ptr)
 			if (ptr->dev == ca->dev_idx)
-				data_add(&data,
+				range_add(&data,
 					 ptr->offset << 9,
 					 b->written << 9);
 	}
 	bch_btree_iter_unlock(&iter);
 }

@@ -87,7 +87,7 @@ int cmd_dump(int argc, char *argv[])
 	struct bch_opts opts = bch_opts_empty();
 	struct cache_set *c = NULL;
 	const char *err;
-	char *out = NULL, *buf;
+	char *out = NULL;
 	unsigned i, nr_devices = 0;
 	bool force = false;
 	int fd, opt;

@@ -116,9 +116,6 @@ int cmd_dump(int argc, char *argv[])
 	if (!out)
 		die("Please supply output filename");

-	buf = alloca(strlen(out) + 10);
-	strcpy(buf, out);
-
 	err = bch_fs_open(argv + optind, argc - optind, opts, &c);
 	if (err)
 		die("error opening %s: %s", argv[optind], err);

@@ -140,12 +137,11 @@ int cmd_dump(int argc, char *argv[])
 		if (!c->cache[i])
 			continue;

-		if (nr_devices > 1)
-			sprintf(buf, "%s.%u", out, i);
-
-		fd = open(buf, mode, 0600);
-		if (fd < 0)
-			die("error opening %s: %s", buf, strerror(errno));
+		char *path = nr_devices > 1
+			? mprintf("%s.%u", out, i)
+			: strdup(out);
+		fd = xopen(path, mode, 0600);
+		free(path);

 		dump_one_device(c, c->cache[i], fd);
 		close(fd);

@@ -153,7 +149,7 @@ int cmd_dump(int argc, char *argv[])

 	up_read(&c->gc_lock);

-	bch_fs_stop_sync(c);
+	bch_fs_stop(c);
 	return 0;
 }

@@ -213,14 +209,20 @@ static void list_keys_usage(void)
 	     "Usage: bcache list_keys [OPTION]... <devices>\n"
 	     "\n"
 	     "Options:\n"
-	     "  -b btree_id                           Integer btree id to list\n"
-	     "  -s start                              Start pos (as inode:offset)\n"
-	     "  -e end                                End pos\n"
-	     "  -m mode                               Mode for listing\n"
+	     "  -b (extents|inodes|dirents|xattrs)    Btree to list from\n"
+	     "  -s inode:offset                       Start position to list from\n"
+	     "  -e inode:offset                       End position\n"
+	     "  -m (keys|formats)                     List mode\n"
 	     "  -h                                    Display this help and exit\n"
 	     "Report bugs to <linux-bcache@vger.kernel.org>");
 }

+static const char * const list_modes[] = {
+	"keys",
+	"formats",
+	NULL
+};
+
 int cmd_list(int argc, char *argv[])
 {
 	struct bch_opts opts = bch_opts_empty();

@@ -229,7 +231,6 @@ int cmd_list(int argc, char *argv[])
 	struct bpos start = POS_MIN, end = POS_MAX;
 	const char *err;
 	int mode = 0, opt;
-	u64 v;

 	opts.nochanges = true;
 	opts.norecovery = true;

@@ -239,10 +240,8 @@ int cmd_list(int argc, char *argv[])
 	while ((opt = getopt(argc, argv, "b:s:e:m:h")) != -1)
 		switch (opt) {
 		case 'b':
-			if (kstrtoull(optarg, 10, &v) ||
-			    v >= BTREE_ID_NR)
-				die("invalid btree id");
-			btree_id = v;
+			btree_id = read_string_list_or_die(optarg,
+						bch_btree_ids, "btree id");
 			break;
 		case 's':
 			start = parse_pos(optarg);

@@ -251,6 +250,8 @@ int cmd_list(int argc, char *argv[])
 			end = parse_pos(optarg);
 			break;
 		case 'm':
+			mode = read_string_list_or_die(optarg,
+						list_modes, "list mode");
 			break;
 		case 'h':
 			list_keys_usage();

@@ -275,6 +276,6 @@ int cmd_list(int argc, char *argv[])
 		die("Invalid mode");
 	}

-	bch_fs_stop_sync(c);
+	bch_fs_stop(c);
 	return 0;
 }

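The new -b and -m cases above parse their argument by looking it up in a NULL-terminated table of names (bch_btree_ids, list_modes), so the accepted strings and the resulting indices cannot drift apart. A minimal standalone sketch of how a read_string_list_or_die()-style helper can work — an illustration of the idiom with a simplified signature, not the tool's actual implementation in tools-util.c:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Return the index of s in the NULL-terminated list, or exit with msg. */
    static int read_string_list_or_die(const char *s, const char * const list[],
                                       const char *msg)
    {
        for (int i = 0; list[i]; i++)
            if (!strcmp(s, list[i]))
                return i;

        fprintf(stderr, "Bad %s: %s\n", msg, s);
        exit(EXIT_FAILURE);
    }

    int main(void)
    {
        static const char * const list_modes[] = { "keys", "formats", NULL };

        /* prints 1; an unrecognized string would exit with "Bad list mode" */
        printf("%d\n", read_string_list_or_die("formats", list_modes, "list mode"));
        return 0;
    }
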
cmd_device.c

@@ -121,10 +121,7 @@ int cmd_device_show(int argc, char *argv[])

 		char *dev_name = basename(dirname(link));

-		int fd = openat(dirfd(fs.sysfs), entry->d_name, O_RDONLY);
-		if (fd < 0)
-			die("couldn't open device %s: %s\n",
-			    entry->d_name, strerror(errno));
+		int fd = xopenat(dirfd(fs.sysfs), entry->d_name, O_RDONLY);

 		devices[nr_devices] = fill_dev(strdup(dev_name), nr, fd);
 		tiers[devices[nr_devices].tier]++;

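This is one of several call sites the commit converts from open()/openat() plus hand-rolled error handling to xopen()/xopenat() helpers that die() on failure, and from fixed sprintf() buffers to a malloc'ing mprintf(). A sketch of what such helpers typically look like — assumed shapes for illustration; the real ones live in tools-util.c and may differ:

    #include <errno.h>
    #include <fcntl.h>
    #include <stdarg.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/stat.h>
    #include <sys/types.h>

    static void die(const char *fmt, ...)
    {
        va_list args;
        va_start(args, fmt);
        vfprintf(stderr, fmt, args);
        va_end(args);
        fputc('\n', stderr);
        exit(EXIT_FAILURE);
    }

    /* malloc'd sprintf: size the buffer with a first vsnprintf pass */
    static char *mprintf(const char *fmt, ...)
    {
        va_list args;
        va_start(args, fmt);
        int len = vsnprintf(NULL, 0, fmt, args);
        va_end(args);

        char *buf = malloc(len + 1);
        va_start(args, fmt);
        vsnprintf(buf, len + 1, fmt, args);
        va_end(args);
        return buf;
    }

    /* open() that cannot fail from the caller's point of view */
    static int xopen(const char *path, int flags, mode_t mode)
    {
        int fd = open(path, flags, mode);
        if (fd < 0)
            die("error opening %s: %s", path, strerror(errno));
        return fd;
    }

Centralizing the die-on-error check keeps every call site to a single line, which is why the hunks above mostly delete code.
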
cmd_format.c (191 lines changed)

@@ -34,10 +34,8 @@ static int open_for_format(const char *dev, bool force)
 	blkid_probe pr;
 	const char *fs_type = NULL, *fs_label = NULL;
 	size_t fs_type_len, fs_label_len;
-	int fd;

-	if ((fd = open(dev, O_RDWR|O_EXCL)) == -1)
-		die("Can't open dev %s: %s\n", dev, strerror(errno));
+	int fd = xopen(dev, O_RDWR|O_EXCL);

 	if (force)
 		return fd;

@@ -70,8 +68,41 @@ static int open_for_format(const char *dev, bool force)
 	return fd;
 }

+#define OPTS \
+t("bcache format - create a new bcache filesystem on one or more devices") \
+t("Usage: bcache format [OPTION]... <devices>") \
+t("") \
+x('b',	block_size,		"size",			NULL) \
+x(0,	btree_node_size,	"size",			"Default 256k") \
+x(0,	metadata_checksum_type,	"(none|crc32c|crc64)",	NULL) \
+x(0,	data_checksum_type,	"(none|crc32c|crc64)",	NULL) \
+x(0,	compression_type,	"(none|lz4|gzip)",	NULL) \
+x(0,	encrypted,		NULL,			"Enable whole filesystem encryption (chacha20/poly1305)") \
+x(0,	no_passphrase,		NULL,			"Don't encrypt master encryption key") \
+x('e',	error_action,		"(continue|readonly|panic)", NULL) \
+x(0,	max_journal_entry_size,	"size",			NULL) \
+x('L',	label,			"label",		NULL) \
+x('U',	uuid,			"uuid",			NULL) \
+x('f',	force,			NULL,			NULL) \
+t("") \
+t("Device specific options:") \
+x(0,	fs_size,		"size",			"Size of filesystem on device") \
+x(0,	bucket_size,		"size",			"Bucket size") \
+x('t',	tier,			"#",			"Higher tier indicates slower devices") \
+x(0,	discard,		NULL,			NULL) \
+t("Device specific options must come before corresponding devices, e.g.") \
+t("  bcache format --tier 0 /dev/sdb --tier 1 /dev/sdc") \
+t("") \
+x('h',	help,			NULL,			"display this help and exit")
+
 static void usage(void)
 {
+#define t(text)				puts(text "\n")
+#define x(shortopt, longopt, arg, help)	do { \
+	OPTS
+#undef x
+#undef t
+
 	puts("bcache format - create a new bcache filesystem on one or more devices\n"
 	     "Usage: bcache format [OPTION]... <devices>\n"
 	     "\n"

@@ -81,7 +112,8 @@ static void usage(void)
 	     "      --metadata_checksum_type=(none|crc32c|crc64)\n"
 	     "      --data_checksum_type=(none|crc32c|crc64)\n"
 	     "      --compression_type=(none|lz4|gzip)\n"
-	     "      --encrypted\n"
+	     "      --encrypted             Enable whole filesystem encryption (chacha20/poly1305)\n"
+	     "      --no_passphrase         Don't encrypt master encryption key\n"
 	     "      --error_action=(continue|readonly|panic)\n"
 	     "                              Action to take on filesystem error\n"
 	     "      --max_journal_entry_size=size\n"

@@ -103,37 +135,26 @@ static void usage(void)
 	     "Report bugs to <linux-bcache@vger.kernel.org>");
 }

-#define OPTS \
-	OPT('b',	block_size,		required_argument) \
-	OPT(0,		btree_node_size,	required_argument) \
-	OPT(0,		metadata_checksum_type,	required_argument) \
-	OPT(0,		data_checksum_type,	required_argument) \
-	OPT(0,		compression_type,	required_argument) \
-	OPT(0,		encrypted,		no_argument) \
-	OPT('e',	error_action,		required_argument) \
-	OPT(0,		max_journal_entry_size,	required_argument) \
-	OPT('L',	label,			required_argument) \
-	OPT('U',	uuid,			required_argument) \
-	OPT('f',	force,			no_argument) \
-	OPT(0,		fs_size,		required_argument) \
-	OPT(0,		bucket_size,		required_argument) \
-	OPT('t',	tier,			required_argument) \
-	OPT(0,		discard,		no_argument) \
-	OPT('h',	help,			no_argument)
-
 enum {
 	Opt_no_opt = 1,
-#define OPT(shortopt, longopt, has_arg)	Opt_##longopt,
+#define t(text)
+#define x(shortopt, longopt, arg, help)	Opt_##longopt,
 	OPTS
-#undef OPT
+#undef x
+#undef t
 };

 static const struct option format_opts[] = {
-#define OPT(shortopt, longopt, has_arg)	{ \
-	#longopt, has_arg, NULL, Opt_##longopt \
-},
+#define t(text)
+#define x(shortopt, longopt, arg, help)	{ \
+	.name		= #longopt, \
+	.has_arg	= arg ? required_argument : no_argument, \
+	.flag		= NULL, \
+	.val		= Opt_##longopt, \
+},
 	OPTS
-#undef OPT
+#undef x
+#undef t
 	{ NULL }
 };

@@ -161,29 +182,12 @@ static unsigned hatoi_validate(const char *s, const char *msg)
 int cmd_format(int argc, char *argv[])
 {
 	darray(struct dev_opts) devices;
-	struct dev_opts *dev;
-	unsigned block_size = 0;
-	unsigned btree_node_size = 0;
-	unsigned meta_csum_type = BCH_CSUM_CRC32C;
-	unsigned data_csum_type = BCH_CSUM_CRC32C;
-	unsigned compression_type = BCH_COMPRESSION_NONE;
-	bool encrypted = false;
-	unsigned on_error_action = BCH_ON_ERROR_RO;
-	char *label = NULL;
-	uuid_le uuid;
-	bool force = false;
-
-	/* Device specific options: */
-	u64 filesystem_size = 0;
-	unsigned bucket_size = 0;
-	unsigned tier = 0;
-	bool discard = false;
-	unsigned max_journal_entry_size = 0;
-	char *passphrase = NULL;
+	struct format_opts opts = format_opts_default();
+	struct dev_opts dev_opts = { 0 }, *dev;
+	bool force = false, no_passphrase = false;
 	int opt;

 	darray_init(devices);
-	uuid_clear(uuid.b);

 	while ((opt = getopt_long(argc, argv,
 				  "-b:e:L:U:ft:h",

@@ -192,45 +196,52 @@ int cmd_format(int argc, char *argv[])
 		switch (opt) {
 		case Opt_block_size:
 		case 'b':
-			block_size = hatoi_validate(optarg,
-						"block size");
+			opts.block_size =
+				hatoi_validate(optarg, "block size");
 			break;
 		case Opt_btree_node_size:
-			btree_node_size = hatoi_validate(optarg,
-						"btree node size");
+			opts.btree_node_size =
+				hatoi_validate(optarg, "btree node size");
 			break;
 		case Opt_metadata_checksum_type:
-			meta_csum_type = read_string_list_or_die(optarg,
+			opts.meta_csum_type =
+				read_string_list_or_die(optarg,
 						bch_csum_types, "checksum type");
 			break;
 		case Opt_data_checksum_type:
-			data_csum_type = read_string_list_or_die(optarg,
+			opts.data_csum_type =
+				read_string_list_or_die(optarg,
 						bch_csum_types, "checksum type");
 			break;
 		case Opt_compression_type:
-			compression_type = read_string_list_or_die(optarg,
+			opts.compression_type =
+				read_string_list_or_die(optarg,
 						bch_compression_types,
 						"compression type");
 			break;
 		case Opt_encrypted:
-			encrypted = true;
+			opts.encrypted = true;
+			break;
+		case Opt_no_passphrase:
+			no_passphrase = true;
 			break;
 		case Opt_error_action:
 		case 'e':
-			on_error_action = read_string_list_or_die(optarg,
+			opts.on_error_action =
+				read_string_list_or_die(optarg,
 						bch_error_actions, "error action");
 			break;
 		case Opt_max_journal_entry_size:
-			max_journal_entry_size = hatoi_validate(optarg,
-						"journal entry size");
+			opts.max_journal_entry_size =
+				hatoi_validate(optarg, "journal entry size");
 			break;
 		case Opt_label:
 		case 'L':
-			label = strdup(optarg);
+			opts.label = strdup(optarg);
 			break;
 		case Opt_uuid:
 		case 'U':
-			if (uuid_parse(optarg, uuid.b))
+			if (uuid_parse(optarg, opts.uuid.b))
 				die("Bad uuid");
 			break;
 		case Opt_force:

@@ -238,31 +249,28 @@ int cmd_format(int argc, char *argv[])
 			force = true;
 			break;
 		case Opt_fs_size:
-			if (bch_strtoull_h(optarg, &filesystem_size))
+			if (bch_strtoull_h(optarg, &dev_opts.size))
 				die("invalid filesystem size");

-			filesystem_size >>= 9;
+			dev_opts.size >>= 9;
 			break;
 		case Opt_bucket_size:
-			bucket_size = hatoi_validate(optarg, "bucket size");
+			dev_opts.bucket_size =
+				hatoi_validate(optarg, "bucket size");
 			break;
 		case Opt_tier:
 		case 't':
-			if (kstrtouint(optarg, 10, &tier) ||
-			    tier >= BCH_TIER_MAX)
+			if (kstrtouint(optarg, 10, &dev_opts.tier) ||
+			    dev_opts.tier >= BCH_TIER_MAX)
 				die("invalid tier");
 			break;
 		case Opt_discard:
-			discard = true;
+			dev_opts.discard = true;
 			break;
 		case Opt_no_opt:
-			darray_append(devices, (struct dev_opts) {
-				.path = strdup(optarg),
-				.size = filesystem_size,
-				.bucket_size = bucket_size,
-				.tier = tier,
-				.discard = discard,
-			});
+			dev_opts.path = strdup(optarg);
+			darray_append(devices, dev_opts);
+			dev_opts.size = 0;
 			break;
 		case Opt_help:
 		case 'h':

@@ -274,18 +282,16 @@ int cmd_format(int argc, char *argv[])
 	if (!darray_size(devices))
 		die("Please supply a device");

-	if (uuid_is_null(uuid.b))
-		uuid_generate(uuid.b);
-
-	if (encrypted) {
-		passphrase = read_passphrase("Enter passphrase: ");
+	if (opts.encrypted && !no_passphrase) {
+		opts.passphrase = read_passphrase("Enter passphrase: ");

 		if (isatty(STDIN_FILENO)) {
 			char *pass2 =
 				read_passphrase("Enter same passphrase again: ");

-			if (strcmp(passphrase, pass2)) {
-				memzero_explicit(passphrase, strlen(passphrase));
+			if (strcmp(opts.passphrase, pass2)) {
+				memzero_explicit(opts.passphrase,
+						 strlen(opts.passphrase));
 				memzero_explicit(pass2, strlen(pass2));
 				die("Passphrases do not match");
 			}

@@ -298,23 +304,14 @@ int cmd_format(int argc, char *argv[])
 	darray_foreach(dev, devices)
 		dev->fd = open_for_format(dev->path, force);

-	bcache_format(devices.item, darray_size(devices),
-		      block_size,
-		      btree_node_size,
-		      meta_csum_type,
-		      data_csum_type,
-		      compression_type,
-		      passphrase,
-		      1,
-		      1,
-		      on_error_action,
-		      max_journal_entry_size,
-		      label,
-		      uuid);
+	struct bch_sb *sb =
+		bcache_format(opts, devices.item, darray_size(devices));
+	bcache_super_print(sb, HUMAN_READABLE);
+	free(sb);

-	if (passphrase) {
-		memzero_explicit(passphrase, strlen(passphrase));
-		free(passphrase);
+	if (opts.passphrase) {
+		memzero_explicit(opts.passphrase, strlen(opts.passphrase));
+		free(opts.passphrase);
 	}

 	return 0;

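The OPTS table added above is an x-macro: every option is declared exactly once, then OPTS is expanded under different definitions of x() and t() to generate both the Opt_* enum and the struct option array for getopt_long(), so the two can never disagree. A standalone sketch of the same pattern with hypothetical option names:

    #include <getopt.h>
    #include <stdio.h>

    #define OPTS \
        x('b', block_size, "size") \
        x('f', force,      NULL)   \
        x('h', help,       NULL)

    enum {
        Opt_no_opt = 1,
    #define x(shortopt, longopt, arg)	Opt_##longopt,
        OPTS
    #undef x
    };

    static const struct option my_opts[] = {
    #define x(shortopt, longopt, arg) { \
        .name    = #longopt, \
        .has_arg = arg ? required_argument : no_argument, \
        .flag    = NULL, \
        .val     = Opt_##longopt, \
    },
        OPTS
    #undef x
        { NULL }
    };

    int main(void)
    {
        /* one table yields names and ids that always stay in sync */
        for (const struct option *o = my_opts; o->name; o++)
            printf("--%-12s val=%d has_arg=%d\n", o->name, o->val, o->has_arg);
        return 0;
    }
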
cmd_fsck.c

@@ -56,6 +56,6 @@ int cmd_fsck(int argc, char *argv[])
 	if (err)
 		die("error opening %s: %s", argv[optind], err);

-	bch_fs_stop_sync(c);
+	bch_fs_stop(c);
 	return 0;
 }

cmd_key.c (37 lines changed)

@@ -1,6 +1,5 @@
 #include <errno.h>
 #include <unistd.h>
-#include <keyutils.h>
 #include <uuid/uuid.h>

 #include "cmds.h"

@@ -10,52 +9,18 @@

 int cmd_unlock(int argc, char *argv[])
 {
-	struct bch_encrypted_key sb_key;
-	struct bch_key passphrase_key;
 	struct bch_sb *sb;
-	struct bch_sb_field_crypt *crypt;
 	char *passphrase;
-	char uuid[40];
-	char description[60];

 	if (argc != 2)
 		die("please supply a single device");

 	sb = bcache_super_read(argv[1]);

-	crypt = bch_sb_get_crypt(sb);
-	if (!crypt)
-		die("filesystem is not encrypted");
-
-	sb_key = crypt->key;
-
-	if (!bch_key_is_encrypted(&sb_key))
-		die("filesystem does not have encryption key");
-
 	passphrase = read_passphrase("Enter passphrase: ");
-	derive_passphrase(crypt, &passphrase_key, passphrase);

-	/* Check if the user supplied the correct passphrase: */
-	if (bch_chacha_encrypt_key(&passphrase_key, __bch_sb_key_nonce(sb),
-				   &sb_key, sizeof(sb_key)))
-		die("error encrypting key");
-
-	if (bch_key_is_encrypted(&sb_key))
-		die("incorrect passphrase");
-
-	uuid_unparse_lower(sb->user_uuid.b, uuid);
-	sprintf(description, "bcache:%s", uuid);
-
-	if (add_key("logon", description,
-		    &passphrase_key, sizeof(passphrase_key),
-		    KEY_SPEC_USER_KEYRING) < 0 ||
-	    add_key("user", description,
-		    &passphrase_key, sizeof(passphrase_key),
-		    KEY_SPEC_USER_KEYRING) < 0)
-		die("add_key error: %s", strerror(errno));
-
-	memzero_explicit(&sb_key, sizeof(sb_key));
-	memzero_explicit(&passphrase_key, sizeof(passphrase_key));
+	add_bcache_key(sb, passphrase);
+
 	memzero_explicit(passphrase, strlen(passphrase));
 	free(passphrase);
 	return 0;

cmd_migrate.c (new file, 835 lines)

@@ -0,0 +1,835 @@
+#include </usr/include/dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/vfs.h>
+#include <unistd.h>
+#include <attr/xattr.h>
+
+#include <linux/fiemap.h>
+#include <linux/fs.h>
+#include <linux/stat.h>
+
+#include <uuid/uuid.h>
+
+#include "cmds.h"
+#include "crypto.h"
+#include "libbcache.h"
+#include "linux/bcache.h"
+
+#include <linux/dcache.h>
+#include <linux/generic-radix-tree.h>
+#include <linux/xattr.h>
+#include "btree_update.h"
+#include "buckets.h"
+#include "dirent.h"
+#include "fs.h"
+#include "inode.h"
+#include "io.h"
+#include "str_hash.h"
+#include "super.h"
+#include "xattr.h"
+
+static char *dev_t_to_path(dev_t dev)
+{
+	char link[PATH_MAX], *p;
+	int ret;
+
+	char *sysfs_dev = mprintf("/sys/dev/block/%u:%u",
+				  major(dev), minor(dev));
+	ret = readlink(sysfs_dev, link, sizeof(link));
+	free(sysfs_dev);
+
+	if (ret < 0 || ret >= sizeof(link))
+		die("readlink error while looking up block device: %s", strerror(errno));
+
+	link[ret] = '\0';
+
+	p = strrchr(link, '/');
+	if (!p)
+		die("error looking up device name");
+	p++;
+
+	return mprintf("/dev/%s", p);
+}
+
+static bool path_is_fs_root(char *path)
+{
+	char *line = NULL, *p, *mount;
+	size_t n = 0;
+	FILE *f;
+	bool ret = true;
+
+	f = fopen("/proc/self/mountinfo", "r");
+	if (!f)
+		die("Error getting mount information");
+
+	while (getline(&line, &n, f) != -1) {
+		p = line;
+
+		strsep(&p, " "); /* mount id */
+		strsep(&p, " "); /* parent id */
+		strsep(&p, " "); /* dev */
+		strsep(&p, " "); /* root */
+		mount = strsep(&p, " ");
+		strsep(&p, " ");
+
+		if (mount && !strcmp(path, mount))
+			goto found;
+	}
+
+	ret = false;
+found:
+	fclose(f);
+	free(line);
+	return ret;
+}
+
+static void mark_unreserved_space(struct cache_set *c, ranges extents)
+{
+	struct cache *ca = c->cache[0];
+	struct hole_iter iter;
+	struct range i;
+
+	for_each_hole(iter, extents, bucket_to_sector(ca, ca->mi.nbuckets) << 9, i) {
+		struct bucket_mark new;
+		u64 b;
+
+		if (i.start == i.end)
+			return;
+
+		b = sector_to_bucket(ca, i.start >> 9);
+		do {
+			bucket_cmpxchg(&ca->buckets[b], new, new.nouse = 1);
+			b++;
+		} while (bucket_to_sector(ca, b) << 9 < i.end);
+	}
+}
+
+static void update_inode(struct cache_set *c,
+			 struct bch_inode_unpacked *inode)
+{
+	struct bkey_inode_buf packed;
+	int ret;
+
+	bch_inode_pack(&packed, inode);
+	ret = bch_btree_update(c, BTREE_ID_INODES, &packed.inode.k_i, NULL);
+	if (ret)
+		die("error creating file: %s", strerror(-ret));
+}
+
+static void create_dirent(struct cache_set *c,
+			  struct bch_inode_unpacked *parent,
+			  const char *name, u64 inum, mode_t mode)
+{
+	struct bch_hash_info parent_hash_info = bch_hash_info_init(parent);
+	struct qstr qname = { { { .len = strlen(name), } }, .name = name };
+
+	int ret = bch_dirent_create(c, parent->inum, &parent_hash_info,
+				    mode_to_type(mode), &qname,
+				    inum, NULL, BCH_HASH_SET_MUST_CREATE);
+	if (ret)
+		die("error creating file: %s", strerror(-ret));
+
+	if (S_ISDIR(mode))
+		parent->i_nlink++;
+}
+
+static void create_link(struct cache_set *c,
+			struct bch_inode_unpacked *parent,
+			const char *name, u64 inum, mode_t mode)
+{
+	struct bch_inode_unpacked inode;
+	int ret = bch_inode_find_by_inum(c, inum, &inode);
+	if (ret)
+		die("error looking up hardlink: %s", strerror(-ret));
+
+	inode.i_nlink++;
+	update_inode(c, &inode);
+
+	create_dirent(c, parent, name, inum, mode);
+}
+
+static struct bch_inode_unpacked create_file(struct cache_set *c,
+					     struct bch_inode_unpacked *parent,
+					     const char *name,
+					     uid_t uid, gid_t gid,
+					     mode_t mode, dev_t rdev)
+{
+	struct bch_inode_unpacked new_inode;
+	struct bkey_inode_buf packed;
+	int ret;
+
+	bch_inode_init(c, &new_inode, uid, gid, mode, rdev);
+	bch_inode_pack(&packed, &new_inode);
+
+	ret = bch_inode_create(c, &packed.inode.k_i, BLOCKDEV_INODE_MAX, 0,
+			       &c->unused_inode_hint);
+	if (ret)
+		die("error creating file: %s", strerror(-ret));
+
+	new_inode.inum = packed.inode.k.p.inode;
+	create_dirent(c, parent, name, new_inode.inum, mode);
+
+	return new_inode;
+}
+
+#define for_each_xattr_handler(handlers, handler) \
+	if (handlers) \
+		for ((handler) = *(handlers)++; \
+		     (handler) != NULL; \
+		     (handler) = *(handlers)++)
+
+static const struct xattr_handler *xattr_resolve_name(const char **name)
+{
+	const struct xattr_handler **handlers = bch_xattr_handlers;
+	const struct xattr_handler *handler;
+
+	for_each_xattr_handler(handlers, handler) {
+		const char *n;
+
+		n = strcmp_prefix(*name, xattr_prefix(handler));
+		if (n) {
+			if (!handler->prefix ^ !*n) {
+				if (*n)
+					continue;
+				return ERR_PTR(-EINVAL);
+			}
+			*name = n;
+			return handler;
+		}
+	}
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static void copy_times(struct cache_set *c, struct bch_inode_unpacked *dst,
+		       struct stat *src)
+{
+	dst->i_atime = timespec_to_bch_time(c, src->st_atim);
+	dst->i_mtime = timespec_to_bch_time(c, src->st_mtim);
+	dst->i_ctime = timespec_to_bch_time(c, src->st_ctim);
+}
+
+static void copy_xattrs(struct cache_set *c, struct bch_inode_unpacked *dst,
+			char *src)
+{
+	struct bch_hash_info hash_info = bch_hash_info_init(dst);
+	ssize_t size = llistxattr(src, NULL, 0);
+	if (size < 0)
+		die("listxattr error: %s", strerror(errno));
+
+	if (!size)
+		return;
+
+	char *buf = malloc(size);
+	size = llistxattr(src, buf, size);
+	if (size < 0)
+		die("listxattr error: %s", strerror(errno));
+
+	for (const char *next, *attr = buf;
+	     attr <= buf + size;
+	     attr = next) {
+		next = attr + strlen(attr) + 1;
+
+		/* max possible xattr val: */
+		static char val[64 << 10];
+		ssize_t val_size = lgetxattr(src, attr, val, sizeof(val));
+
+		if (val_size < 0)
+			die("error getting xattr val: %s", strerror(errno));
+
+		const struct xattr_handler *h = xattr_resolve_name(&attr);
+
+		int ret = __bch_xattr_set(c, dst->inum, &hash_info, attr,
+					  val, val_size, 0, h->flags, NULL);
+		if (ret < 0)
+			die("error creating xattr: %s", strerror(-ret));
+	}
+
+	free(buf);
+}
+
+static void write_data(struct cache_set *c,
+		       struct bch_inode_unpacked *dst_inode,
+		       u64 dst_offset, void *buf, size_t len)
+{
+	struct disk_reservation res;
+	struct bch_write_op op;
+	struct bch_write_bio bio;
+	struct bio_vec bv;
+	struct closure cl;
+
+	BUG_ON(dst_offset & (block_bytes(c) - 1));
+	BUG_ON(len & (block_bytes(c) - 1));
+
+	closure_init_stack(&cl);
+
+	bio_init(&bio.bio);
+	bio.bio.bi_max_vecs = 1;
+	bio.bio.bi_io_vec = &bv;
+	bio.bio.bi_iter.bi_size = len;
+	bch_bio_map(&bio.bio, buf);
+
+	int ret = bch_disk_reservation_get(c, &res, len >> 9, 0);
+	if (ret)
+		die("error reserving space in new filesystem: %s", strerror(-ret));
+
+	bch_write_op_init(&op, c, &bio, res, c->write_points,
+			  POS(dst_inode->inum, dst_offset >> 9), NULL, 0);
+	closure_call(&op.cl, bch_write, NULL, &cl);
+	closure_sync(&cl);
+
+	dst_inode->i_sectors += len >> 9;
+}
+
+static char buf[1 << 20] __aligned(PAGE_SIZE);
+
+static void copy_data(struct cache_set *c,
+		      struct bch_inode_unpacked *dst_inode,
+		      int src_fd, u64 start, u64 end)
+{
+	while (start < end) {
+		unsigned len = min_t(u64, end - start, sizeof(buf));
+
+		xpread(src_fd, buf, len, start);
+		write_data(c, dst_inode, start, buf, len);
+		start += len;
+	}
+}
+
+static void link_data(struct cache_set *c, struct bch_inode_unpacked *dst,
+		      u64 logical, u64 physical, u64 length)
+{
+	struct cache *ca = c->cache[0];
+
+	BUG_ON(logical & (block_bytes(c) - 1));
+	BUG_ON(physical & (block_bytes(c) - 1));
+	BUG_ON(length & (block_bytes(c) - 1));
+
+	logical >>= 9;
+	physical >>= 9;
+	length >>= 9;
+
+	BUG_ON(physical + length > bucket_to_sector(ca, ca->mi.nbuckets));
+
+	while (length) {
+		struct bkey_i_extent *e;
+		BKEY_PADDED(k) k;
+		u64 b = sector_to_bucket(ca, physical >> 9);
+		struct disk_reservation res;
+		unsigned sectors;
+		int ret;
+
+		sectors = min(ca->mi.bucket_size -
+			      (physical & (ca->mi.bucket_size - 1)),
+			      length);
+
+		e = bkey_extent_init(&k.k);
+		e->k.p.inode = dst->inum;
+		e->k.p.offset = logical + sectors;
+		e->k.size = sectors;
+		extent_ptr_append(e, (struct bch_extent_ptr) {
+					.offset = physical,
+					.dev = 0,
+					.gen = ca->buckets[b].mark.gen,
+				});
+
+		ret = bch_disk_reservation_get(c, &res, sectors,
+					       BCH_DISK_RESERVATION_NOFAIL);
+		if (ret)
+			die("error reserving space in new filesystem: %s",
+			    strerror(-ret));
+
+		ret = bch_btree_insert(c, BTREE_ID_EXTENTS, &e->k_i,
+				       &res, NULL, NULL, 0);
+		if (ret)
+			die("btree insert error %s", strerror(-ret));
+
+		bch_disk_reservation_put(c, &res);
+
+		dst->i_sectors += sectors;
+		logical += sectors;
+		physical += sectors;
+		length -= sectors;
+	}
+}
+
+static void copy_link(struct cache_set *c, struct bch_inode_unpacked *dst,
+		      char *src)
+{
+	ssize_t ret = readlink(src, buf, sizeof(buf));
+	if (ret < 0)
+		die("readlink error: %s", strerror(errno));
+
+	write_data(c, dst, 0, buf, round_up(ret, block_bytes(c)));
+}
+
+static void copy_file(struct cache_set *c, struct bch_inode_unpacked *dst,
+		      int src, char *src_path, ranges *extents)
+{
+	struct fiemap_iter iter;
+	struct fiemap_extent e;
+
+	fiemap_for_each(src, iter, e)
+		if (e.fe_flags & FIEMAP_EXTENT_UNKNOWN) {
+			fsync(src);
+			break;
+		}
+
+	fiemap_for_each(src, iter, e) {
+		if ((e.fe_logical & (block_bytes(c) - 1)) ||
+		    (e.fe_length & (block_bytes(c) - 1)))
+			die("Unaligned extent in %s - can't handle", src_path);
+
+		if (e.fe_flags & (FIEMAP_EXTENT_UNKNOWN|
+				  FIEMAP_EXTENT_ENCODED|
+				  FIEMAP_EXTENT_NOT_ALIGNED|
+				  FIEMAP_EXTENT_DATA_INLINE)) {
+			copy_data(c, dst,
+				  src,
+				  round_down(e.fe_logical, block_bytes(c)),
+				  round_up(e.fe_logical + e.fe_length,
+					   block_bytes(c)));
+			continue;
+		}
+
+		if ((e.fe_physical & (block_bytes(c) - 1)))
+			die("Unaligned extent in %s - can't handle", src_path);
+
+		range_add(extents, e.fe_physical, e.fe_length);
+		link_data(c, dst, e.fe_logical, e.fe_physical, e.fe_length);
+	}
+}
+
+struct copy_fs_state {
+	u64		bcachefs_inum;
+	dev_t		dev;
+
+	GENRADIX(u64)	hardlinks;
+	ranges		extents;
+};
+
+static void copy_dir(struct copy_fs_state *s,
+		     struct cache_set *c,
+		     struct bch_inode_unpacked *dst,
+		     int src_fd, const char *src_path)
+{
+	DIR *dir = fdopendir(src_fd);
+	struct dirent *d;
+
+	while ((errno = 0), (d = readdir(dir))) {
+		struct bch_inode_unpacked inode;
+		int fd;
+
+		if (fchdir(src_fd))
+			die("chdir error: %s", strerror(errno));
+
+		struct stat stat =
+			xfstatat(src_fd, d->d_name, AT_SYMLINK_NOFOLLOW);
+
+		if (!strcmp(d->d_name, ".") ||
+		    !strcmp(d->d_name, "..") ||
+		    stat.st_ino == s->bcachefs_inum)
+			continue;
+
+		char *child_path = mprintf("%s/%s", src_path, d->d_name);
+
+		if (stat.st_dev != s->dev)
+			die("%s does not have correct st_dev!", child_path);
+
+		u64 *dst_inum = S_ISREG(stat.st_mode)
+			? genradix_ptr_alloc(&s->hardlinks, stat.st_ino, GFP_KERNEL)
+			: NULL;
+
+		if (dst_inum && *dst_inum) {
+			create_link(c, dst, d->d_name, *dst_inum, S_IFREG);
+			goto next;
+		}
+
+		inode = create_file(c, dst, d->d_name,
+				    stat.st_uid, stat.st_gid,
+				    stat.st_mode, stat.st_rdev);
+
+		if (dst_inum)
+			*dst_inum = inode.inum;
+
+		copy_times(c, &inode, &stat);
+		copy_xattrs(c, &inode, d->d_name);
+
+		/* copy xattrs */
+
+		switch (mode_to_type(stat.st_mode)) {
+		case DT_DIR:
+			fd = xopen(d->d_name, O_RDONLY|O_NOATIME);
+			copy_dir(s, c, &inode, fd, child_path);
+			close(fd);
+			break;
+		case DT_REG:
+			inode.i_size = stat.st_size;
+
+			fd = xopen(d->d_name, O_RDONLY|O_NOATIME);
+			copy_file(c, &inode, fd, child_path, &s->extents);
+			close(fd);
+			break;
+		case DT_LNK:
+			inode.i_size = stat.st_size;
+
+			copy_link(c, &inode, d->d_name);
+			break;
+		case DT_FIFO:
+		case DT_CHR:
+		case DT_BLK:
+		case DT_SOCK:
+		case DT_WHT:
+			/* nothing else to copy for these: */
+			break;
+		default:
+			BUG();
+		}
+
+		update_inode(c, &inode);
+next:
+		free(child_path);
+	}
+
+	if (errno)
+		die("readdir error: %s", strerror(errno));
+}
+
+static ranges reserve_new_fs_space(const char *file_path, unsigned block_size,
+				   u64 size, u64 *bcachefs_inum, dev_t dev)
+{
+	int fd = open(file_path, O_RDWR|O_CREAT|O_EXCL, 0600);
+	if (fd < 0)
+		die("Error creating %s for bcachefs metadata: %s",
+		    file_path, strerror(errno));
+
+	struct stat statbuf = xfstat(fd);
+
+	if (statbuf.st_dev != dev)
+		die("bcachefs file has incorrect device");
+
+	*bcachefs_inum = statbuf.st_ino;
+
+	if (fallocate(fd, 0, 0, size))
+		die("Error reserving space for bcachefs metadata: %s",
+		    strerror(errno));
+
+	fsync(fd);
+
+	struct fiemap_iter iter;
+	struct fiemap_extent e;
+	ranges extents = { NULL };
+
+	fiemap_for_each(fd, iter, e) {
+		if (e.fe_flags & (FIEMAP_EXTENT_UNKNOWN|
+				  FIEMAP_EXTENT_ENCODED|
+				  FIEMAP_EXTENT_NOT_ALIGNED|
+				  FIEMAP_EXTENT_DATA_INLINE))
+			die("Unable to continue: metadata file not fully mapped");
+
+		if ((e.fe_physical & (block_size - 1)) ||
+		    (e.fe_length & (block_size - 1)))
+			die("Unable to continue: unaligned extents in metadata file");
+
+		range_add(&extents, e.fe_physical, e.fe_length);
+	}
+	close(fd);
+
+	ranges_sort_merge(&extents);
+	return extents;
+}
+
+static void reserve_old_fs_space(struct cache_set *c,
+				 struct bch_inode_unpacked *root_inode,
+				 ranges *extents)
+{
+	struct cache *ca = c->cache[0];
+	struct bch_inode_unpacked dst;
+	struct hole_iter iter;
+	struct range i;
+
+	dst = create_file(c, root_inode, "old_migrated_filesystem",
+			  0, 0, S_IFREG|0400, 0);
+	dst.i_size = bucket_to_sector(ca, ca->mi.nbuckets) << 9;
+
+	ranges_sort_merge(extents);
+
+	for_each_hole(iter, *extents, bucket_to_sector(ca, ca->mi.nbuckets) << 9, i)
+		link_data(c, &dst, i.start, i.start, i.end - i.start);
+
+	update_inode(c, &dst);
+}
+
+static void copy_fs(struct cache_set *c, int src_fd, const char *src_path,
+		    u64 bcachefs_inum, ranges *extents)
+{
+	syncfs(src_fd);
+
+	struct bch_inode_unpacked root_inode;
+	int ret = bch_inode_find_by_inum(c, BCACHE_ROOT_INO, &root_inode);
+	if (ret)
+		die("error looking up root directory: %s", strerror(-ret));
+
+	if (fchdir(src_fd))
+		die("chdir error: %s", strerror(errno));
+
+	struct stat stat = xfstat(src_fd);
+	copy_times(c, &root_inode, &stat);
+	copy_xattrs(c, &root_inode, ".");
+
+	struct copy_fs_state s = {
+		.bcachefs_inum	= bcachefs_inum,
+		.dev		= stat.st_dev,
+		.extents	= *extents,
+	};
+
+	/* now, copy: */
+	copy_dir(&s, c, &root_inode, src_fd, src_path);
+
+	reserve_old_fs_space(c, &root_inode, &s.extents);
+
+	update_inode(c, &root_inode);
+
+	darray_free(s.extents);
+	genradix_free(&s.hardlinks);
+}
+
+static void find_superblock_space(ranges extents, struct dev_opts *dev)
+{
+	struct range *i;
+	darray_foreach(i, extents) {
+		u64 offset = max(256ULL << 10, i->start);
+
+		if (offset + (128 << 10) <= i->end) {
+			dev->sb_offset	= offset >> 9;
+			dev->sb_end	= dev->sb_offset + 256;
+			return;
+		}
+	}
+
+	die("Couldn't find a valid location for superblock");
+}
+
+static void migrate_usage(void)
+{
+	puts("bcache migrate - migrate an existing filesystem to bcachefs\n"
+	     "Usage: bcache migrate [OPTION]...\n"
+	     "\n"
+	     "Options:\n"
+	     "  -f fs                  Root of filesystem to migrate(s)\n"
+	     "      --encrypted        Enable whole filesystem encryption (chacha20/poly1305)\n"
+	     "      --no_passphrase    Don't encrypt master encryption key\n"
+	     "  -h                     Display this help and exit\n"
+	     "Report bugs to <linux-bcache@vger.kernel.org>");
+}
+
+static const struct option migrate_opts[] = {
+	{ "encrypted",		no_argument, NULL, 'e' },
+	{ "no_passphrase",	no_argument, NULL, 'p' },
+	{ NULL }
+};
+
+int cmd_migrate(int argc, char *argv[])
+{
+	struct format_opts format_opts = format_opts_default();
+	char *fs_path = NULL;
+	unsigned block_size;
+	bool no_passphrase = false;
+	int opt;
+
+	while ((opt = getopt_long(argc, argv, "f:h",
+				  migrate_opts, NULL)) != -1)
+		switch (opt) {
+		case 'f':
+			fs_path = optarg;
+			break;
+		case 'e':
+			format_opts.encrypted = true;
+			break;
+		case 'p':
+			no_passphrase = true;
+			break;
+		case 'h':
+			migrate_usage();
+			exit(EXIT_SUCCESS);
+		}
+
+	if (!fs_path)
+		die("Please specify a filesytem to migrate");
+
+	if (!path_is_fs_root(fs_path))
+		die("%s is not a filysestem root", fs_path);
+
+	int fs_fd = xopen(fs_path, O_RDONLY|O_NOATIME);
+	struct stat stat = xfstat(fs_fd);
+
+	if (!S_ISDIR(stat.st_mode))
+		die("%s is not a directory", fs_path);
+
+	struct dev_opts dev = { 0 };
+
+	dev.path = dev_t_to_path(stat.st_dev);
+	dev.fd = xopen(dev.path, O_RDWR);
+
+	block_size = min_t(unsigned, stat.st_blksize,
+			   get_blocksize(dev.path, dev.fd) << 9);
+
+	BUG_ON(!is_power_of_2(block_size) || block_size < 512);
+	format_opts.block_size = block_size >> 9;
+
+	u64 bcachefs_inum;
+	char *file_path = mprintf("%s/bcachefs", fs_path);
+
+	ranges extents = reserve_new_fs_space(file_path,
+				block_size, get_size(dev.path, dev.fd) / 5,
+				&bcachefs_inum, stat.st_dev);
+
+	find_superblock_space(extents, &dev);
+
+	if (format_opts.encrypted && !no_passphrase) {
+		format_opts.passphrase = read_passphrase("Enter passphrase: ");
+
+		if (isatty(STDIN_FILENO)) {
+			char *pass2 =
+				read_passphrase("Enter same passphrase again: ");
+
+			if (strcmp(format_opts.passphrase, pass2)) {
+				memzero_explicit(format_opts.passphrase,
+						 strlen(format_opts.passphrase));
+				memzero_explicit(pass2, strlen(pass2));
+				die("Passphrases do not match");
+			}
+
+			memzero_explicit(pass2, strlen(pass2));
+			free(pass2);
+		}
+	}
+
+	struct bch_sb *sb = bcache_format(format_opts, &dev, 1);
+	u64 sb_offset = le64_to_cpu(sb->layout.sb_offset[0]);
+
+	if (format_opts.passphrase)
+		add_bcache_key(sb, format_opts.passphrase);
+
+	free(sb);
+
+	printf("Creating new filesystem on %s in space reserved at %s\n"
+	       "To mount, run\n"
+	       "  mount -t bcache -o sb=%llu %s dir\n"
+	       "\n"
+	       "After verifying that the new filesystem is correct, to create a\n"
+	       "superblock at the default offset and finish the migration run\n"
+	       "  bcache migrate_superblock -d %s -o %llu\n"
+	       "\n"
+	       "The new filesystem will have a file at /old_migrated_filestem\n"
+	       "referencing all disk space that might be used by the existing\n"
+	       "filesystem. That file can be deleted once the old filesystem is\n"
+	       "no longer needed (and should be deleted prior to running\n"
+	       "bcache migrate_superblock)\n",
+	       dev.path, file_path, sb_offset, dev.path,
+	       dev.path, sb_offset);
+
+	struct bch_opts opts = bch_opts_empty();
+	struct cache_set *c = NULL;
+	char *path[1] = { dev.path };
+	const char *err;
+
+	opts.sb		= sb_offset;
+	opts.nostart	= true;
+	opts.noexcl	= true;
+
+	err = bch_fs_open(path, 1, opts, &c);
+	if (err)
+		die("Error opening new filesystem: %s", err);
+
+	mark_unreserved_space(c, extents);
+
+	err = bch_fs_start(c);
+	if (err)
+		die("Error starting new filesystem: %s", err);
+
+	copy_fs(c, fs_fd, fs_path, bcachefs_inum, &extents);
+
+	bch_fs_stop(c);
+
+	printf("Migrate complete, running fsck:\n");
+	opts.nostart	= false;
+	opts.nochanges	= true;
+	fsck_err_opt	= FSCK_ERR_NO;
+
+	err = bch_fs_open(path, 1, opts, &c);
+	if (err)
+		die("Error opening new filesystem: %s", err);
+
+	bch_fs_stop(c);
+	printf("fsck complete\n");
+	return 0;
+}
+
+static void migrate_superblock_usage(void)
+{
+	puts("bcache migrate_superblock - create default superblock after migrating\n"
+	     "Usage: bcache migrate_superblock [OPTION]...\n"
+	     "\n"
+	     "Options:\n"
+	     "  -d device     Device to create superblock for\n"
+	     "  -o offset     Offset of existing superblock\n"
+	     "  -h            Display this help and exit\n"
+	     "Report bugs to <linux-bcache@vger.kernel.org>");
+}
+
+int cmd_migrate_superblock(int argc, char *argv[])
+{
+	char *dev = NULL;
+	u64 offset = 0;
+	int opt, ret;
+
+	while ((opt = getopt(argc, argv, "d:o:h")) != -1)
+		switch (opt) {
+		case 'd':
+			dev = optarg;
+			break;
+		case 'o':
+			ret = kstrtou64(optarg, 10, &offset);
+			if (ret)
+				die("Invalid offset");
+			break;
+		case 'h':
+			migrate_superblock_usage();
+			exit(EXIT_SUCCESS);
+		}
+
+	if (!dev)
+		die("Please specify a device");
+
+	if (!offset)
+		die("Please specify offset of existing superblock");
+
+	int fd = xopen(dev, O_RDWR);
+	struct bch_sb *sb = __bcache_super_read(fd, offset);
+
+	if (sb->layout.nr_superblocks >= ARRAY_SIZE(sb->layout.sb_offset))
+		die("Can't add superblock: no space left in superblock layout");
+
+	for (unsigned i = 0; i < sb->layout.nr_superblocks; i++)
+		if (le64_to_cpu(sb->layout.sb_offset[i]) == BCH_SB_SECTOR)
+			die("Superblock layout already has default superblock");
+
+	memmove(&sb->layout.sb_offset[1],
+		&sb->layout.sb_offset[0],
+		sb->layout.nr_superblocks * sizeof(u64));
+	sb->layout.nr_superblocks++;
+
+	sb->layout.sb_offset[0] = cpu_to_le64(BCH_SB_SECTOR);
+
+	bcache_super_write(fd, sb);
+	close(fd);
+
+	return 0;
+}

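The heart of the in-place migration above is FIEMAP: copy_file() queries each source file's physical extents and, for ordinary mapped extents, inserts bcachefs extent keys that point at the very same disk blocks (link_data()), falling back to copying only for unknown/encoded/inline/unaligned extents. A minimal sketch of the raw FS_IOC_FIEMAP ioctl that an iterator like fiemap_for_each() presumably wraps (illustration only; it handles just the first 32 extents):

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/fiemap.h>
    #include <linux/fs.h>

    /* Print the physical layout of one file. */
    int main(int argc, char *argv[])
    {
        if (argc != 2)
            return 1;

        int fd = open(argv[1], O_RDONLY);
        if (fd < 0)
            return 1;

        struct fiemap *fm = calloc(1, sizeof(*fm) +
                                   32 * sizeof(struct fiemap_extent));
        fm->fm_start        = 0;
        fm->fm_length       = FIEMAP_MAX_OFFSET;
        fm->fm_flags        = FIEMAP_FLAG_SYNC; /* flush delalloc first */
        fm->fm_extent_count = 32;

        if (ioctl(fd, FS_IOC_FIEMAP, fm))
            return 1;

        for (unsigned i = 0; i < fm->fm_mapped_extents; i++) {
            struct fiemap_extent *e = &fm->fm_extents[i];

            printf("logical %llu physical %llu len %llu flags %x\n",
                   (unsigned long long) e->fe_logical,
                   (unsigned long long) e->fe_physical,
                   (unsigned long long) e->fe_length,
                   e->fe_flags);
        }

        free(fm);
        close(fd);
        return 0;
    }

A real iterator keeps reissuing the ioctl with fm_start advanced past the last returned extent until it sees one flagged FIEMAP_EXTENT_LAST.
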
cmd_run.c

@@ -25,9 +25,6 @@ int cmd_stop(int argc, char *argv[])
 		die("Please supply a filesystem");

 	struct bcache_handle fs = bcache_fs_open(argv[1]);
-
-	if (ioctl(fs.ioctl_fd, BCH_IOCTL_STOP))
-		die("BCH_IOCTL_STOP error: %s", strerror(errno));
-
+	xioctl(fs.ioctl_fd, BCH_IOCTL_STOP);
 	return 0;
 }

cmds.h (3 lines changed)

@@ -29,4 +29,7 @@ int cmd_fsck(int argc, char *argv[]);
 int cmd_dump(int argc, char *argv[]);
 int cmd_list(int argc, char *argv[]);

+int cmd_migrate(int argc, char *argv[]);
+int cmd_migrate_superblock(int argc, char *argv[]);
+
 #endif /* _CMDS_H */

74	crypto.c
@@ -10,8 +10,10 @@
 #include <time.h>
 #include <unistd.h>
 
+#include <keyutils.h>
 #include <linux/random.h>
 #include <libscrypt.h>
+#include <uuid/uuid.h>
 
 #include "checksum.h"
 #include "crypto.h"
@@ -75,29 +77,71 @@ void derive_passphrase(struct bch_sb_field_crypt *crypt,
 	}
 }
 
+void add_bcache_key(struct bch_sb *sb, const char *passphrase)
+{
+	struct bch_sb_field_crypt *crypt = bch_sb_get_crypt(sb);
+	if (!crypt)
+		die("filesystem is not encrypted");
+
+	struct bch_encrypted_key sb_key = crypt->key;
+	if (!bch_key_is_encrypted(&sb_key))
+		die("filesystem does not have encryption key");
+
+	struct bch_key passphrase_key;
+	derive_passphrase(crypt, &passphrase_key, passphrase);
+
+	/* Check if the user supplied the correct passphrase: */
+	if (bch_chacha_encrypt_key(&passphrase_key, __bch_sb_key_nonce(sb),
+				   &sb_key, sizeof(sb_key)))
+		die("error encrypting key");
+
+	if (bch_key_is_encrypted(&sb_key))
+		die("incorrect passphrase");
+
+	char uuid[40];
+	uuid_unparse_lower(sb->user_uuid.b, uuid);
+
+	char *description = mprintf("bcache:%s", uuid);
+
+	if (add_key("logon", description,
+		    &passphrase_key, sizeof(passphrase_key),
+		    KEY_SPEC_USER_KEYRING) < 0 ||
+	    add_key("user", description,
+		    &passphrase_key, sizeof(passphrase_key),
+		    KEY_SPEC_USER_KEYRING) < 0)
+		die("add_key error: %s", strerror(errno));
+
+	memzero_explicit(description, strlen(description));
+	free(description);
+	memzero_explicit(&passphrase_key, sizeof(passphrase_key));
+	memzero_explicit(&sb_key, sizeof(sb_key));
+}
+
 void bch_sb_crypt_init(struct bch_sb *sb,
 		       struct bch_sb_field_crypt *crypt,
 		       const char *passphrase)
 {
-	struct bch_key passphrase_key;
-
-	SET_BCH_CRYPT_KDF_TYPE(crypt, BCH_KDF_SCRYPT);
-	SET_BCH_KDF_SCRYPT_N(crypt, ilog2(SCRYPT_N));
-	SET_BCH_KDF_SCRYPT_R(crypt, ilog2(SCRYPT_r));
-	SET_BCH_KDF_SCRYPT_P(crypt, ilog2(SCRYPT_p));
-
-	derive_passphrase(crypt, &passphrase_key, passphrase);
-
 	crypt->key.magic = BCH_KEY_MAGIC;
 	get_random_bytes(&crypt->key.key, sizeof(crypt->key.key));
 
-	assert(!bch_key_is_encrypted(&crypt->key));
+	if (passphrase) {
+		struct bch_key passphrase_key;
 
-	if (bch_chacha_encrypt_key(&passphrase_key, __bch_sb_key_nonce(sb),
-				   &crypt->key, sizeof(crypt->key)))
-		die("error encrypting key");
+		SET_BCH_CRYPT_KDF_TYPE(crypt, BCH_KDF_SCRYPT);
+		SET_BCH_KDF_SCRYPT_N(crypt, ilog2(SCRYPT_N));
+		SET_BCH_KDF_SCRYPT_R(crypt, ilog2(SCRYPT_r));
+		SET_BCH_KDF_SCRYPT_P(crypt, ilog2(SCRYPT_p));
 
-	assert(bch_key_is_encrypted(&crypt->key));
+		derive_passphrase(crypt, &passphrase_key, passphrase);
 
-	memzero_explicit(&passphrase_key, sizeof(passphrase_key));
+		assert(!bch_key_is_encrypted(&crypt->key));
+
+		if (bch_chacha_encrypt_key(&passphrase_key, __bch_sb_key_nonce(sb),
+					   &crypt->key, sizeof(crypt->key)))
+			die("error encrypting key");
+
+		assert(bch_key_is_encrypted(&crypt->key));
+
+		memzero_explicit(&passphrase_key, sizeof(passphrase_key));
+	}
 }
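add_bcache_key() verifies the passphrase by running bch_chacha_encrypt_key() over the already-encrypted superblock key: since ChaCha is a stream cipher, applying the same key and nonce twice is the identity, so the right passphrase decrypts the key in place and bch_key_is_encrypted() (which presumably checks the magic) returns false. A toy illustration of that involution, with a stand-in XOR keystream in place of ChaCha:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    /* Stand-in for a stream cipher: XOR with a keystream derived from `key`.
     * Applying it twice with the same key restores the input. */
    static void toy_stream_crypt(uint8_t key, uint8_t *buf, size_t len)
    {
            for (size_t i = 0; i < len; i++)
                    buf[i] ^= (uint8_t) (key + i);  /* fake keystream */
    }

    int main(void)
    {
            uint8_t secret[8] = "bcache!", buf[8];

            memcpy(buf, secret, sizeof(buf));
            toy_stream_crypt(42, buf, sizeof(buf)); /* "encrypt" */
            toy_stream_crypt(42, buf, sizeof(buf)); /* same key: decrypts */
            printf("round trip ok: %d\n", !memcmp(buf, secret, sizeof(buf)));

            toy_stream_crypt(42, buf, sizeof(buf));
            toy_stream_crypt(43, buf, sizeof(buf)); /* wrong key: garbage */
            printf("wrong key ok:  %d\n", !!memcmp(buf, secret, sizeof(buf)));
            return 0;
    }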
6	crypto.h
@@ -1,12 +1,16 @@
 #ifndef _CRYPTO_H
 #define _CRYPTO_H
 
-#include "super-io.h"
 #include "tools-util.h"
 
+struct bch_sb;
+struct bch_sb_field_crypt;
+struct bch_key;
+
 char *read_passphrase(const char *);
 void derive_passphrase(struct bch_sb_field_crypt *,
 		       struct bch_key *, const char *);
+void add_bcache_key(struct bch_sb *, const char *);
 void bch_sb_crypt_init(struct bch_sb *sb, struct bch_sb_field_crypt *,
 		       const char *);
 
@@ -821,7 +821,7 @@ struct bch_sb_field {
 	__le32			type;
 };
 
-enum bch_sb_field_types {
+enum bch_sb_field_type {
 	BCH_SB_FIELD_journal	= 0,
 	BCH_SB_FIELD_members	= 1,
 	BCH_SB_FIELD_crypt	= 2,
@@ -110,6 +110,7 @@ struct super_block {
  * NOTE! These match bits 12..15 of stat.st_mode
  * (ie "(i_mode >> 12) & 15").
  */
+#ifndef DT_UNKNOWN
 #define DT_UNKNOWN	0
 #define DT_FIFO		1
 #define DT_CHR		2
@@ -119,6 +120,7 @@ struct super_block {
 #define DT_LNK		10
 #define DT_SOCK		12
 #define DT_WHT		14
+#endif
 
 /*
  * This is the "filldir" function type, used by readdir() to let
@@ -8,7 +8,6 @@
  * interior nodes.
  */
 
-#include <linux/page.h>
 #include <linux/bug.h>
 #include <linux/kernel.h>
 #include <linux/log2.h>
@@ -41,20 +40,14 @@ struct __genradix {
  * genradix.
  */
 
-#define DECLARE_GENRADIX_TYPE(_name, _type)			\
-struct _name {							\
+#define GENRADIX(_type)						\
+struct {							\
 	struct __genradix	tree;				\
 	_type			type[0] __aligned(1);		\
 }
 
-#define DECLARE_GENRADIX(_name, _type)				\
-struct {							\
-	struct __genradix	tree;				\
-	_type			type[0] __aligned(1);		\
-} _name
-
 #define DEFINE_GENRADIX(_name, _type)				\
-	DECLARE_GENRADIX(_name, _type) = __GENRADIX_INITIALIZER
+	GENRADIX(_type) _name = __GENRADIX_INITIALIZER
 
 #define genradix_init(_radix)					\
 do {								\
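With the rename, a radix type no longer needs its own declared name: GENRADIX(_type) expands to an anonymous struct, usable directly in field and variable declarations. A usage sketch (the struct names here are illustrative, not from the tree):

    struct stats {
            unsigned long   sectors_written;
    };

    struct my_fs {
            GENRADIX(struct stats)  stats_table;    /* anonymous struct type */
    };

    /* previously this took two steps:
     *      DECLARE_GENRADIX_TYPE(stats_radix, struct stats);
     *      ...
     *      struct stats_radix      stats_table;
     */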
@@ -180,4 +180,9 @@ static inline bool percpu_ref_is_zero(struct percpu_ref *ref)
 	return !atomic_long_read(&ref->count);
 }
 
+static inline bool percpu_ref_is_dying(struct percpu_ref *ref)
+{
+	return percpu_ref_is_zero(ref);
+}
+
 #endif /* __TOOLS_LINUX_PERCPU_REFCOUNT_H */
209	libbcache.c
@@ -23,66 +23,82 @@
 
 #define BCH_MIN_NR_NBUCKETS	(1 << 10)
 
-/* first bucket should start 1 mb in, in sectors: */
-#define FIRST_BUCKET_OFFSET	(1 << 11)
-
 /* minimum size filesystem we can create, given a bucket size: */
 static u64 min_size(unsigned bucket_size)
 {
-	return (DIV_ROUND_UP(FIRST_BUCKET_OFFSET, bucket_size) +
-		BCH_MIN_NR_NBUCKETS) * bucket_size;
+	return BCH_MIN_NR_NBUCKETS * bucket_size;
 }
 
-static void init_layout(struct bch_sb_layout *l)
+static void init_layout(struct bch_sb_layout *l, unsigned block_size,
+			u64 start, u64 end)
 {
+	unsigned sb_size;
+	u64 backup; /* offset of 2nd sb */
+
 	memset(l, 0, sizeof(*l));
 
+	if (start != BCH_SB_SECTOR)
+		start = round_up(start, block_size);
+	end = round_down(end, block_size);
+
+	if (start >= end)
+		die("insufficient space for superblocks");
+
+	/*
+	 * Create two superblocks in the allowed range: reserve a maximum of 64k
+	 */
+	sb_size = min_t(u64, 128, end - start / 2);
+
+	backup = start + sb_size;
+	backup = round_up(backup, block_size);
+
+	backup = min(backup, end);
+
+	sb_size = min(end - backup, backup - start);
+	sb_size = rounddown_pow_of_two(sb_size);
+
+	if (sb_size < 8)
+		die("insufficient space for superblocks");
+
 	l->magic = BCACHE_MAGIC;
 	l->layout_type = 0;
 	l->nr_superblocks = 2;
-	l->sb_max_size_bits = 7;
-	l->sb_offset[0] = cpu_to_le64(BCH_SB_SECTOR);
-	l->sb_offset[1] = cpu_to_le64(BCH_SB_SECTOR +
-				      (1 << l->sb_max_size_bits));
+	l->sb_max_size_bits = ilog2(sb_size);
+	l->sb_offset[0] = cpu_to_le64(start);
+	l->sb_offset[1] = cpu_to_le64(backup);
 }
 
-void bcache_format(struct dev_opts *devs, size_t nr_devs,
-		   unsigned block_size,
-		   unsigned btree_node_size,
-		   unsigned meta_csum_type,
-		   unsigned data_csum_type,
-		   unsigned compression_type,
-		   const char *passphrase,
-		   unsigned meta_replicas,
-		   unsigned data_replicas,
-		   unsigned on_error_action,
-		   unsigned max_journal_entry_size,
-		   char *label,
-		   uuid_le uuid)
+struct bch_sb *bcache_format(struct format_opts opts,
+			     struct dev_opts *devs, size_t nr_devs)
 {
 	struct bch_sb *sb;
 	struct dev_opts *i;
 	struct bch_sb_field_members *mi;
-	unsigned u64s, j;
+	unsigned u64s;
 
 	/* calculate block size: */
-	if (!block_size)
+	if (!opts.block_size)
 		for (i = devs; i < devs + nr_devs; i++)
-			block_size = max(block_size,
+			opts.block_size = max(opts.block_size,
 					 get_blocksize(i->path, i->fd));
 
 	/* calculate bucket sizes: */
 	for (i = devs; i < devs + nr_devs; i++) {
+		if (!i->sb_offset) {
+			i->sb_offset	= BCH_SB_SECTOR;
+			i->sb_end	= BCH_SB_SECTOR + 256;
+		}
+
 		if (!i->size)
 			i->size = get_size(i->path, i->fd) >> 9;
 
 		if (!i->bucket_size) {
-			if (i->size < min_size(block_size))
+			if (i->size < min_size(opts.block_size))
 				die("cannot format %s, too small (%llu sectors, min %llu)",
-				    i->path, i->size, min_size(block_size));
+				    i->path, i->size, min_size(opts.block_size));
 
 			/* Want a bucket size of at least 128k, if possible: */
-			i->bucket_size = max(block_size, 256U);
+			i->bucket_size = max(opts.block_size, 256U);
 
 			if (i->size >= min_size(i->bucket_size)) {
 				unsigned scale = max(1,
@@ -99,34 +115,36 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
 			}
 		}
 
-		/* first bucket: 1 mb in */
-		i->first_bucket	= DIV_ROUND_UP(FIRST_BUCKET_OFFSET, i->bucket_size);
 		i->nbuckets	= i->size / i->bucket_size;
 
-		if (i->bucket_size < block_size)
+		if (i->bucket_size < opts.block_size)
 			die("Bucket size cannot be smaller than block size");
 
-		if (i->nbuckets - i->first_bucket < BCH_MIN_NR_NBUCKETS)
+		if (i->nbuckets < BCH_MIN_NR_NBUCKETS)
 			die("Not enough buckets: %llu, need %u (bucket size %u)",
-			    i->nbuckets - i->first_bucket, BCH_MIN_NR_NBUCKETS,
-			    i->bucket_size);
+			    i->nbuckets, BCH_MIN_NR_NBUCKETS, i->bucket_size);
 	}
 
 	/* calculate btree node size: */
-	if (!btree_node_size) {
+	if (!opts.btree_node_size) {
 		/* 256k default btree node size */
-		btree_node_size = 512;
+		opts.btree_node_size = 512;
 
 		for (i = devs; i < devs + nr_devs; i++)
-			btree_node_size = min(btree_node_size, i->bucket_size);
+			opts.btree_node_size =
+				min(opts.btree_node_size, i->bucket_size);
 	}
 
-	if (!max_journal_entry_size) {
+	if (!opts.max_journal_entry_size) {
 		/* 2 MB default: */
-		max_journal_entry_size = 4096;
+		opts.max_journal_entry_size = 4096;
 	}
 
-	max_journal_entry_size = roundup_pow_of_two(max_journal_entry_size);
+	opts.max_journal_entry_size =
+		roundup_pow_of_two(opts.max_journal_entry_size);
+
+	if (uuid_is_null(opts.uuid.b))
+		uuid_generate(opts.uuid.b);
 
 	sb = calloc(1, sizeof(*sb) +
 		    sizeof(struct bch_sb_field_members) +
@@ -135,35 +153,29 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
 
 	sb->version	= cpu_to_le64(BCACHE_SB_VERSION_CDEV_V4);
 	sb->magic	= BCACHE_MAGIC;
-	sb->block_size	= cpu_to_le16(block_size);
-	sb->user_uuid	= uuid;
+	sb->block_size	= cpu_to_le16(opts.block_size);
+	sb->user_uuid	= opts.uuid;
 	sb->nr_devices	= nr_devs;
 
-	init_layout(&sb->layout);
-
 	uuid_generate(sb->uuid.b);
 
-	if (label)
-		strncpy((char *) sb->label, label, sizeof(sb->label));
+	if (opts.label)
+		strncpy((char *) sb->label, opts.label, sizeof(sb->label));
 
-	/*
-	 * don't have a userspace crc32c implementation handy, just always use
-	 * crc64
-	 */
-	SET_BCH_SB_CSUM_TYPE(sb,	BCH_CSUM_CRC64);
-	SET_BCH_SB_META_CSUM_TYPE(sb,	meta_csum_type);
-	SET_BCH_SB_DATA_CSUM_TYPE(sb,	data_csum_type);
-	SET_BCH_SB_COMPRESSION_TYPE(sb,	compression_type);
+	SET_BCH_SB_CSUM_TYPE(sb,	opts.meta_csum_type);
+	SET_BCH_SB_META_CSUM_TYPE(sb,	opts.meta_csum_type);
+	SET_BCH_SB_DATA_CSUM_TYPE(sb,	opts.data_csum_type);
+	SET_BCH_SB_COMPRESSION_TYPE(sb,	opts.compression_type);
 
-	SET_BCH_SB_BTREE_NODE_SIZE(sb,	btree_node_size);
+	SET_BCH_SB_BTREE_NODE_SIZE(sb,	opts.btree_node_size);
 	SET_BCH_SB_GC_RESERVE(sb,	8);
-	SET_BCH_SB_META_REPLICAS_WANT(sb,	meta_replicas);
-	SET_BCH_SB_META_REPLICAS_HAVE(sb,	meta_replicas);
-	SET_BCH_SB_DATA_REPLICAS_WANT(sb,	data_replicas);
-	SET_BCH_SB_DATA_REPLICAS_HAVE(sb,	data_replicas);
-	SET_BCH_SB_ERROR_ACTION(sb,	on_error_action);
+	SET_BCH_SB_META_REPLICAS_WANT(sb,	opts.meta_replicas);
+	SET_BCH_SB_META_REPLICAS_HAVE(sb,	opts.meta_replicas);
+	SET_BCH_SB_DATA_REPLICAS_WANT(sb,	opts.data_replicas);
+	SET_BCH_SB_DATA_REPLICAS_HAVE(sb,	opts.data_replicas);
+	SET_BCH_SB_ERROR_ACTION(sb,	opts.on_error_action);
 	SET_BCH_SB_STR_HASH_TYPE(sb,	BCH_STR_HASH_SIPHASH);
-	SET_BCH_SB_JOURNAL_ENTRY_SIZE(sb, ilog2(max_journal_entry_size));
+	SET_BCH_SB_JOURNAL_ENTRY_SIZE(sb, ilog2(opts.max_journal_entry_size));
 
 	struct timespec now;
 	if (clock_gettime(CLOCK_REALTIME, &now))
@@ -172,7 +184,7 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
 	sb->time_base_lo	= cpu_to_le64(now.tv_sec * NSEC_PER_SEC + now.tv_nsec);
 	sb->time_precision	= cpu_to_le32(1);
 
-	if (passphrase) {
+	if (opts.encrypted) {
 		struct bch_sb_field_crypt *crypt = vstruct_end(sb);
 
 		u64s = sizeof(struct bch_sb_field_crypt) / sizeof(u64);
@@ -181,7 +193,7 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
 		crypt->field.u64s = cpu_to_le32(u64s);
 		crypt->field.type = BCH_SB_FIELD_crypt;
 
-		bch_sb_crypt_init(sb, crypt, passphrase);
+		bch_sb_crypt_init(sb, crypt, opts.passphrase);
 		SET_BCH_SB_ENCRYPTION_TYPE(sb, 1);
 	}
 
@@ -198,7 +210,7 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
 
 		uuid_generate(m->uuid.b);
 		m->nbuckets	= cpu_to_le64(i->nbuckets);
-		m->first_bucket	= cpu_to_le16(i->first_bucket);
+		m->first_bucket	= 0;
 		m->bucket_size	= cpu_to_le16(i->bucket_size);
 
 		SET_BCH_MEMBER_TIER(m,		i->tier);
@@ -209,42 +221,49 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
 	for (i = devs; i < devs + nr_devs; i++) {
 		sb->dev_idx = i - devs;
 
-		static const char zeroes[BCH_SB_SECTOR << 9];
-		struct nonce nonce = { 0 };
+		init_layout(&sb->layout, opts.block_size,
+			    i->sb_offset, i->sb_end);
 
-		/* Zero start of disk */
-		xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0);
+		if (i->sb_offset == BCH_SB_SECTOR) {
+			/* Zero start of disk */
+			static const char zeroes[BCH_SB_SECTOR << 9];
 
-		xpwrite(i->fd, &sb->layout, sizeof(sb->layout),
-			BCH_SB_LAYOUT_SECTOR << 9);
-
-		for (j = 0; j < sb->layout.nr_superblocks; j++) {
-			sb->offset = sb->layout.sb_offset[j];
-
-			sb->csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb),
-						nonce, sb);
-			xpwrite(i->fd, sb, vstruct_bytes(sb),
-				le64_to_cpu(sb->offset) << 9);
+			xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0);
 		}
 
-		fsync(i->fd);
+		bcache_super_write(i->fd, sb);
 		close(i->fd);
 	}
 
-	bcache_super_print(sb, HUMAN_READABLE);
-
-	free(sb);
+	return sb;
 }
 
-struct bch_sb *bcache_super_read(const char *path)
+void bcache_super_write(int fd, struct bch_sb *sb)
+{
+	struct nonce nonce = { 0 };
+
+	for (unsigned i = 0; i < sb->layout.nr_superblocks; i++) {
+		sb->offset = sb->layout.sb_offset[i];
+
+		if (sb->offset == BCH_SB_SECTOR) {
+			/* Write backup layout */
+			xpwrite(fd, &sb->layout, sizeof(sb->layout),
+				BCH_SB_LAYOUT_SECTOR << 9);
+		}
+
+		sb->csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb), nonce, sb);
+		xpwrite(fd, sb, vstruct_bytes(sb),
+			le64_to_cpu(sb->offset) << 9);
+	}
+
+	fsync(fd);
+}
+
+struct bch_sb *__bcache_super_read(int fd, u64 sector)
 {
 	struct bch_sb sb, *ret;
 
-	int fd = open(path, O_RDONLY);
-	if (fd < 0)
-		die("couldn't open %s", path);
-
-	xpread(fd, &sb, sizeof(sb), BCH_SB_SECTOR << 9);
+	xpread(fd, &sb, sizeof(sb), sector << 9);
 
 	if (memcmp(&sb.magic, &BCACHE_MAGIC, sizeof(sb.magic)))
 		die("not a bcache superblock");
@@ -253,11 +272,19 @@ struct bch_sb *bcache_super_read(const char *path)
 
 	ret = malloc(bytes);
 
-	xpread(fd, ret, bytes, BCH_SB_SECTOR << 9);
+	xpread(fd, ret, bytes, sector << 9);
 
 	return ret;
 }
 
+struct bch_sb *bcache_super_read(const char *path)
+{
+	int fd = xopen(path, O_RDONLY);
+	struct bch_sb *sb = __bcache_super_read(fd, BCH_SB_SECTOR);
+	close(fd);
+	return sb;
+}
+
 void bcache_super_print(struct bch_sb *sb, int units)
 {
 	struct bch_sb_field_members *mi;
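The new init_layout() places the backup superblock just past the primary, block-aligned, and sizes both as the largest power of two that fits the remaining window. A worked re-derivation in isolation, using the default window set up above (sb_offset = BCH_SB_SECTOR, sb_end = BCH_SB_SECTOR + 256, with BCH_SB_SECTOR assumed to be 8); note the hunk as extracted reads `end - start / 2`, which binds as `end - (start / 2)` — this sketch assumes `(end - start) / 2` was intended:

    #include <stdio.h>
    #include <stdint.h>

    /* All quantities are in 512-byte sectors; rounding helpers are spelled
     * out (the real code uses round_up/round_down/rounddown_pow_of_two). */
    static void place_superblocks(uint64_t start, uint64_t end,
                                  unsigned block_size)
    {
            uint64_t sb_size = 128;         /* 64k cap = 128 sectors */

            if ((end - start) / 2 < sb_size)
                    sb_size = (end - start) / 2;

            uint64_t backup = start + sb_size;
            backup += block_size - 1;
            backup -= backup % block_size;  /* round_up to block size */
            if (backup > end)
                    backup = end;

            sb_size = end - backup < backup - start
                    ? end - backup : backup - start;
            while (sb_size & (sb_size - 1))
                    sb_size &= sb_size - 1; /* rounddown_pow_of_two */

            printf("sb0 at %llu, backup at %llu, max size %llu sectors\n",
                   (unsigned long long) start,
                   (unsigned long long) backup,
                   (unsigned long long) sb_size);
    }

    int main(void)
    {
            place_superblocks(8, 8 + 256, 8);       /* prints 8, 136, 128 */
            return 0;
    }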
55	libbcache.h
@@ -1,6 +1,7 @@
 #ifndef _LIBBCACHE_H
 #define _LIBBCACHE_H
 
+#include <linux/bcache.h>
 #include <linux/uuid.h>
 #include "tools-util.h"
 #include "vstructs.h"
@@ -18,32 +19,56 @@ enum fsck_err_opts {
 
 extern enum fsck_err_opts fsck_err_opt;
 
+struct format_opts {
+	char		*label;
+	uuid_le		uuid;
+
+	unsigned	on_error_action;
+	unsigned	max_journal_entry_size; /* will be removed */
+
+	unsigned	block_size;
+	unsigned	btree_node_size;
+
+	unsigned	meta_replicas;
+	unsigned	data_replicas;
+
+	unsigned	meta_csum_type;
+	unsigned	data_csum_type;
+	unsigned	compression_type;
+
+	bool		encrypted;
+	char		*passphrase;
+};
+
+static inline struct format_opts format_opts_default()
+{
+	return (struct format_opts) {
+		.on_error_action	= BCH_ON_ERROR_RO,
+		.meta_csum_type		= BCH_CSUM_CRC32C,
+		.data_csum_type		= BCH_CSUM_CRC32C,
+		.meta_replicas		= 1,
+		.data_replicas		= 1,
+	};
+}
+
 struct dev_opts {
 	int		fd;
-	const char	*path;
+	char		*path;
 	u64		size; /* 512 byte sectors */
 	unsigned	bucket_size;
 	unsigned	tier;
 	bool		discard;
 
-	u64		first_bucket;
 	u64		nbuckets;
 
+	u64		sb_offset;
+	u64		sb_end;
 };
 
-void bcache_format(struct dev_opts *devs, size_t nr_devs,
-		   unsigned block_size,
-		   unsigned btree_node_size,
-		   unsigned meta_csum_type,
-		   unsigned data_csum_type,
-		   unsigned compression_type,
-		   const char *passphrase,
-		   unsigned meta_replicas,
-		   unsigned data_replicas,
-		   unsigned on_error_action,
-		   unsigned max_journal_entry_size,
-		   char *label,
-		   uuid_le uuid);
+struct bch_sb *bcache_format(struct format_opts, struct dev_opts *, size_t);
 
+void bcache_super_write(int, struct bch_sb *);
+struct bch_sb *__bcache_super_read(int, u64);
 struct bch_sb *bcache_super_read(const char *);
 
 void bcache_super_print(struct bch_sb *, int);
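The twelve positional arguments collapse into struct format_opts, with format_opts_default() supplying sane defaults. A caller sketch under the new API (device path and option values are illustrative, error handling elided):

    #include <fcntl.h>
    #include <stdlib.h>

    int format_example(void)
    {
            struct format_opts opts = format_opts_default();
            struct dev_opts dev     = { .path = "/dev/sdb" }; /* illustrative */

            opts.label              = "scratch";
            opts.data_replicas      = 2;

            dev.fd = xopen(dev.path, O_RDWR | O_EXCL);

            /* bcache_format() now returns the superblock instead of
             * printing and freeing it itself: */
            struct bch_sb *sb = bcache_format(opts, &dev, 1);
            bcache_super_print(sb, HUMAN_READABLE);
            free(sb);
            return 0;
    }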
@@ -73,7 +73,6 @@
 #include <linux/rcupdate.h>
 #include <trace/events/bcache.h>
 
-static size_t bch_bucket_alloc(struct cache *, enum alloc_reserve);
 static void __bch_bucket_free(struct cache *, struct bucket *);
 
 /* Allocation groups: */
@@ -84,12 +83,12 @@ void bch_dev_group_remove(struct cache_group *grp, struct cache *ca)
 
 	spin_lock(&grp->lock);
 
-	for (i = 0; i < grp->nr_devices; i++)
+	for (i = 0; i < grp->nr; i++)
 		if (rcu_access_pointer(grp->d[i].dev) == ca) {
-			grp->nr_devices--;
+			grp->nr--;
 			memmove(&grp->d[i],
 				&grp->d[i + 1],
-				(grp->nr_devices - i) * sizeof(grp->d[0]));
+				(grp->nr - i) * sizeof(grp->d[0]));
 			break;
 		}
 
@@ -101,13 +100,13 @@ void bch_dev_group_add(struct cache_group *grp, struct cache *ca)
 	unsigned i;
 
 	spin_lock(&grp->lock);
-	for (i = 0; i < grp->nr_devices; i++)
+	for (i = 0; i < grp->nr; i++)
 		if (rcu_access_pointer(grp->d[i].dev) == ca)
 			goto out;
 
-	BUG_ON(grp->nr_devices >= BCH_SB_MEMBERS_MAX);
+	BUG_ON(grp->nr >= BCH_SB_MEMBERS_MAX);
 
-	rcu_assign_pointer(grp->d[grp->nr_devices++].dev, ca);
+	rcu_assign_pointer(grp->d[grp->nr++].dev, ca);
 out:
 	spin_unlock(&grp->lock);
 }
@@ -120,25 +119,32 @@ static void pd_controllers_update(struct work_struct *work)
 					   struct cache_set,
 					   pd_controllers_update);
 	struct cache *ca;
-	unsigned iter;
-	int i;
+	unsigned i, iter;
 
 	/* All units are in bytes */
-	u64 tier_size[BCH_TIER_MAX];
-	u64 tier_free[BCH_TIER_MAX];
-	u64 tier_dirty[BCH_TIER_MAX];
-	u64 tier0_can_free = 0;
+	u64 faster_tiers_size	= 0;
+	u64 faster_tiers_dirty	= 0;
 
-	memset(tier_size, 0, sizeof(tier_size));
-	memset(tier_free, 0, sizeof(tier_free));
-	memset(tier_dirty, 0, sizeof(tier_dirty));
+	u64 fastest_tier_size	= 0;
+	u64 fastest_tier_free	= 0;
+	u64 copygc_can_free	= 0;
 
 	rcu_read_lock();
-	for (i = BCH_TIER_MAX - 1; i >= 0; --i)
-		group_for_each_cache_rcu(ca, &c->cache_tiers[i], iter) {
+	for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
+		bch_pd_controller_update(&c->tiers[i].pd,
+					 div_u64(faster_tiers_size *
+						 c->tiering_percent, 100),
+					 faster_tiers_dirty,
+					 -1);
+
+		group_for_each_cache_rcu(ca, &c->tiers[i].devs, iter) {
 			struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca);
 			unsigned bucket_bits = ca->bucket_bits + 9;
+
+			u64 size = (ca->mi.nbuckets -
+				    ca->mi.first_bucket) << bucket_bits;
+			u64 dirty = stats.buckets_dirty << bucket_bits;
+			u64 free = __buckets_free_cache(ca, stats) << bucket_bits;
 			/*
 			 * Bytes of internal fragmentation, which can be
 			 * reclaimed by copy GC
@@ -149,41 +155,30 @@
 					((stats.sectors_dirty +
 					  stats.sectors_cached) << 9);
 
-			u64 dev_size = (ca->mi.nbuckets -
-					ca->mi.first_bucket) << bucket_bits;
-
-			u64 free = __buckets_free_cache(ca, stats) << bucket_bits;
-
 			if (fragmented < 0)
 				fragmented = 0;
 
 			bch_pd_controller_update(&ca->moving_gc_pd,
 						 free, fragmented, -1);
 
-			if (i == 0)
-				tier0_can_free += fragmented;
+			faster_tiers_size	+= size;
+			faster_tiers_dirty	+= dirty;
 
-			tier_size[i] += dev_size;
-			tier_free[i] += free;
-			tier_dirty[i] += stats.buckets_dirty << bucket_bits;
+			if (!c->fastest_tier ||
+			    c->fastest_tier == &c->tiers[i]) {
+				fastest_tier_size	+= size;
+				fastest_tier_free	+= free;
+			}
+
+			copygc_can_free += fragmented;
 		}
-	rcu_read_unlock();
-
-	if (tier_size[1]) {
-		u64 target = div_u64(tier_size[0] * c->tiering_percent, 100);
-
-		tier0_can_free = max_t(s64, 0, tier_dirty[0] - target);
-
-		bch_pd_controller_update(&c->tiering_pd,
-					 target,
-					 tier_dirty[0],
-					 -1);
 	}
 
+	rcu_read_unlock();
+
 	/*
 	 * Throttle foreground writes if tier 0 is running out of free buckets,
-	 * and either tiering or copygc can free up space (but don't take both
-	 * into account).
+	 * and either tiering or copygc can free up space.
 	 *
 	 * Target will be small if there isn't any work to do - we don't want to
 	 * throttle foreground writes if we currently have all the free space
@@ -192,12 +187,15 @@ static void pd_controllers_update(struct work_struct *work)
 	 * Otherwise, if there's work to do, try to keep 20% of tier0 available
 	 * for foreground writes.
 	 */
+	if (c->fastest_tier)
+		copygc_can_free = U64_MAX;
+
 	bch_pd_controller_update(&c->foreground_write_pd,
-				 min(tier0_can_free,
-				     div_u64(tier_size[0] *
+				 min(copygc_can_free,
+				     div_u64(fastest_tier_size *
 					     c->foreground_target_percent,
 					     100)),
-				 tier_free[0],
+				 fastest_tier_free,
 				 -1);
 
 	schedule_delayed_work(&c->pd_controllers_update,
@@ -301,7 +299,8 @@ static int bch_prio_write(struct cache *ca)
 		 * it getting gc'd from under us
 		 */
 		ca->prio_buckets[i] = r;
-		bch_mark_metadata_bucket(ca, ca->buckets + r, false);
+		bch_mark_metadata_bucket(ca, ca->buckets + r,
+					 BUCKET_PRIOS, false);
 		spin_unlock(&ca->prio_buckets_lock);
 
 		SET_PSET_CSUM_TYPE(p, bch_meta_checksum_type(c));
@@ -334,6 +333,9 @@ static int bch_prio_write(struct cache *ca)
 	do {
 		unsigned u64s = jset_u64s(0);
 
+		if (!test_bit(JOURNAL_STARTED, &c->journal.flags))
+			break;
+
 		ret = bch_journal_res_get(j, &res, u64s, u64s);
 		if (ret)
 			return ret;
@@ -815,8 +817,7 @@ static void bch_find_empty_buckets(struct cache_set *c, struct cache *ca)
 		if (is_available_bucket(m) &&
 		    !m.cached_sectors &&
 		    !m.had_metadata &&
-		    (!m.wait_on_journal ||
-		     ((s16) last_seq_ondisk - (s16) m.journal_seq >= 0))) {
+		    !bucket_needs_journal_commit(m, last_seq_ondisk)) {
 			spin_lock(&ca->freelist_lock);
 
 			bch_mark_alloc_bucket(ca, g, true);
@@ -850,6 +851,8 @@ static int bch_allocator_thread(void *arg)
 
 	set_freezable();
 
+	bch_find_empty_buckets(c, ca);
+
 	while (1) {
 		/*
 		 * First, we pull buckets off of the free_inc list, possibly
@@ -894,7 +897,7 @@
 		 * See if we have buckets we can reuse without invalidating them
 		 * or forcing a journal commit:
 		 */
-		bch_find_empty_buckets(c, ca);
+		//bch_find_empty_buckets(c, ca);
 
 		if (fifo_used(&ca->free_inc) * 2 > ca->free_inc.size) {
 			up_read(&c->gc_lock);
@@ -967,7 +970,7 @@ out:
  *
  * Returns index of bucket on success, 0 on failure
  * */
-static size_t bch_bucket_alloc(struct cache *ca, enum alloc_reserve reserve)
+size_t bch_bucket_alloc(struct cache *ca, enum alloc_reserve reserve)
 {
 	struct bucket *g;
 	long r;
@@ -1018,21 +1021,21 @@ static void recalc_alloc_group_weights(struct cache_set *c,
 	u64 available_buckets = 1; /* avoid a divide by zero... */
 	unsigned i;
 
-	for (i = 0; i < devs->nr_devices; i++) {
+	for (i = 0; i < devs->nr; i++) {
 		ca = devs->d[i].dev;
 
 		devs->d[i].weight = buckets_free_cache(ca);
 		available_buckets += devs->d[i].weight;
 	}
 
-	for (i = 0; i < devs->nr_devices; i++) {
+	for (i = 0; i < devs->nr; i++) {
 		const unsigned min_weight = U32_MAX >> 4;
 		const unsigned max_weight = U32_MAX;
 
 		devs->d[i].weight =
 			min_weight +
 			div64_u64(devs->d[i].weight *
-				  devs->nr_devices *
+				  devs->nr *
 				  (max_weight - min_weight),
 				  available_buckets);
 		devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight);
@@ -1058,7 +1061,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c,
 	rcu_read_lock();
 	spin_lock(&devs->lock);
 
-	for (i = 0; i < devs->nr_devices; i++)
+	for (i = 0; i < devs->nr; i++)
 		available += !test_bit(devs->d[i].dev->dev_idx,
 				       caches_used);
 
@@ -1076,7 +1079,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c,
 		}
 
 		i++;
-		i %= devs->nr_devices;
+		i %= devs->nr;
 
 		ret = FREELIST_EMPTY;
 		if (i == fail_idx)
@@ -1136,20 +1139,25 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
 						    enum alloc_reserve reserve,
 						    long *caches_used)
 {
+	struct bch_tier *tier;
 	/*
 	 * this should implement policy - for a given type of allocation, decide
 	 * which devices to allocate from:
 	 *
 	 * XXX: switch off wp->type and do something more intelligent here
 	 */
+	if (wp->group)
+		return bch_bucket_alloc_group(c, ob, reserve, nr_replicas,
+					      wp->group, caches_used);
 
-	/* foreground writes: prefer tier 0: */
-	if (wp->group == &c->cache_all)
+	/* foreground writes: prefer fastest tier: */
+	tier = READ_ONCE(c->fastest_tier);
+	if (tier)
 		bch_bucket_alloc_group(c, ob, reserve, nr_replicas,
-				       &c->cache_tiers[0], caches_used);
+				       &tier->devs, caches_used);
 
 	return bch_bucket_alloc_group(c, ob, reserve, nr_replicas,
-				      wp->group, caches_used);
+				      &c->cache_all, caches_used);
 }
 
 static int bch_bucket_alloc_set(struct cache_set *c, struct write_point *wp,
@@ -1413,7 +1421,6 @@ struct open_bucket *bch_alloc_sectors_start(struct cache_set *c,
 		? 0 : BTREE_NODE_RESERVE;
 	int ret;
 
-	BUG_ON(!wp->group);
 	BUG_ON(!reserve);
 	BUG_ON(!nr_replicas);
 retry:
@@ -1481,7 +1488,7 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e,
 				   unsigned nr_replicas, struct open_bucket *ob,
 				   unsigned sectors)
 {
-	struct bch_extent_ptr tmp, *ptr;
+	struct bch_extent_ptr tmp;
 	struct cache *ca;
 	bool has_data = false;
 	unsigned i;
@@ -1501,6 +1508,8 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e,
 	if (nr_replicas < ob->nr_ptrs)
 		has_data = true;
 
+	rcu_read_lock();
+
 	for (i = 0; i < nr_replicas; i++) {
 		EBUG_ON(bch_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev));
 
@@ -1510,10 +1519,12 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e,
 		extent_ptr_append(e, tmp);
 
 		ob->ptr_offset[i] += sectors;
+
+		if ((ca = PTR_CACHE(c, &ob->ptrs[i])))
+			this_cpu_add(*ca->sectors_written, sectors);
 	}
 
-	open_bucket_for_each_online_device(c, ob, ptr, ca)
-		this_cpu_add(*ca->sectors_written, sectors);
+	rcu_read_unlock();
 }
 
 /*
@@ -1586,9 +1597,9 @@ struct open_bucket *bch_alloc_sectors(struct cache_set *c,
 
 /* Startup/shutdown (ro/rw): */
 
-static void bch_recalc_capacity(struct cache_set *c)
+void bch_recalc_capacity(struct cache_set *c)
 {
-	struct cache_group *tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers);
+	struct bch_tier *fastest_tier = NULL, *slowest_tier = NULL, *tier;
 	struct cache *ca;
 	u64 total_capacity, capacity = 0, reserved_sectors = 0;
 	unsigned long ra_pages = 0;
@@ -1604,16 +1615,29 @@ static void bch_recalc_capacity(struct cache_set *c)
 
 	c->bdi.ra_pages = ra_pages;
 
+	/* Find fastest, slowest tiers with devices: */
+
+	for (tier = c->tiers;
+	     tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) {
+		if (!tier->devs.nr)
+			continue;
+		if (!fastest_tier)
+			fastest_tier = tier;
+		slowest_tier = tier;
+	}
+
+	c->fastest_tier = fastest_tier != slowest_tier ? fastest_tier : NULL;
+
+	c->promote_write_point.group = &fastest_tier->devs;
+
+	if (!fastest_tier)
+		goto set_capacity;
+
 	/*
 	 * Capacity of the cache set is the capacity of all the devices in the
 	 * slowest (highest) tier - we don't include lower tier devices.
 	 */
-	for (tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers) - 1;
-	     tier > c->cache_tiers && !tier->nr_devices;
-	     --tier)
-		;
-
-	group_for_each_cache_rcu(ca, tier, i) {
+	group_for_each_cache_rcu(ca, &slowest_tier->devs, i) {
 		size_t reserve = 0;
 
 		/*
@@ -1649,8 +1673,8 @@ static void bch_recalc_capacity(struct cache_set *c)
 			     ca->mi.first_bucket) <<
 			ca->bucket_bits;
 	}
-
+set_capacity:
 	rcu_read_unlock();
 
 	total_capacity = capacity;
 
 	capacity *= (100 - c->opts.gc_reserve_percent);
@@ -1727,7 +1751,7 @@ static bool bch_dev_has_open_write_point(struct cache *ca)
 void bch_dev_allocator_stop(struct cache *ca)
 {
 	struct cache_set *c = ca->set;
-	struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
+	struct cache_group *tier = &c->tiers[ca->mi.tier].devs;
 	struct task_struct *p;
 	struct closure cl;
 	unsigned i;
@@ -1808,7 +1832,7 @@ void bch_dev_allocator_stop(struct cache *ca)
 int bch_dev_allocator_start(struct cache *ca)
 {
 	struct cache_set *c = ca->set;
-	struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
+	struct cache_group *tier = &c->tiers[ca->mi.tier].devs;
 	struct task_struct *k;
 
 	/*
@@ -1826,6 +1850,7 @@ int bch_dev_allocator_start(struct cache *ca)
 
 	bch_dev_group_add(tier, ca);
 	bch_dev_group_add(&c->cache_all, ca);
+	bch_dev_group_add(&c->journal.devs, ca);
 
 	bch_recalc_capacity(c);
 
@@ -1838,7 +1863,7 @@ int bch_dev_allocator_start(struct cache *ca)
 	return 0;
 }
 
-void bch_open_buckets_init(struct cache_set *c)
+void bch_fs_allocator_init(struct cache_set *c)
 {
 	unsigned i;
 
@@ -1860,19 +1885,11 @@ void bch_open_buckets_init(struct cache_set *c)
 
 	spin_lock_init(&c->cache_all.lock);
 
-	for (i = 0; i < ARRAY_SIZE(c->write_points); i++) {
+	for (i = 0; i < ARRAY_SIZE(c->tiers); i++)
+		spin_lock_init(&c->tiers[i].devs.lock);
+
+	for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
 		c->write_points[i].throttle = true;
-		c->write_points[i].group = &c->cache_tiers[0];
-	}
-
-	for (i = 0; i < ARRAY_SIZE(c->cache_tiers); i++)
-		spin_lock_init(&c->cache_tiers[i].lock);
-
-	c->promote_write_point.group = &c->cache_tiers[0];
-
-	c->migration_write_point.group = &c->cache_all;
-
-	c->btree_write_point.group = &c->cache_all;
-
 	c->pd_controllers_update_seconds = 5;
 	INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
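The foreground throttle reduces to a single target once the loop above has summed per-tier stats: with a slower tier present, copygc_can_free is forced to U64_MAX, so the target is purely a percentage slice of the fastest tier. A numeric sketch (sizes and percentage illustrative):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t fastest_tier_size = 100ull << 30;  /* 100 GiB */
            unsigned foreground_target_percent = 20;    /* illustrative */

            /* copygc_can_free == U64_MAX when c->fastest_tier is set, so: */
            uint64_t target = fastest_tier_size * foreground_target_percent / 100;

            printf("throttle foreground writes when free < %llu GiB\n",
                   (unsigned long long) (target >> 30));
            return 0;
    }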
@@ -27,6 +27,8 @@ int bch_prio_read(struct cache *);
 
 void bch_recalc_min_prio(struct cache *, int);
 
+size_t bch_bucket_alloc(struct cache *, enum alloc_reserve);
+
 void bch_open_bucket_put(struct cache_set *, struct open_bucket *);
 
 struct open_bucket *bch_alloc_sectors_start(struct cache_set *,
@@ -58,7 +60,7 @@ static inline struct cache *cache_group_next_rcu(struct cache_group *devs,
 {
 	struct cache *ret = NULL;
 
-	while (*iter < devs->nr_devices &&
+	while (*iter < devs->nr &&
 	       !(ret = rcu_dereference(devs->d[*iter].dev)))
 		(*iter)++;
 
@@ -103,8 +105,9 @@ static inline struct cache *cache_group_next(struct cache_group *devs,
 	     ((_ca) = __open_bucket_next_online_device(_c, _ob, _ptr, _ca));\
 	     (_ptr)++)
 
+void bch_recalc_capacity(struct cache_set *);
 void bch_dev_allocator_stop(struct cache *);
 int bch_dev_allocator_start(struct cache *);
-void bch_open_buckets_init(struct cache_set *);
+void bch_fs_allocator_init(struct cache_set *);
 
 #endif /* _BCACHE_ALLOC_H */
@@ -51,7 +51,7 @@ static inline bool allocation_is_metadata(enum alloc_reserve id)
 
 struct cache_group {
 	spinlock_t		lock;
-	unsigned		nr_devices;
+	unsigned		nr;
 	unsigned		cur_device;
 	struct {
 		u64		weight;
@@ -464,24 +464,10 @@ struct cache {
  * BCH_FS_UNREGISTERING means we're not just shutting down, we're detaching
  * all the backing devices first (their cached data gets invalidated, and they
  * won't automatically reattach).
- *
- * BCH_FS_STOPPING always gets set first when we're closing down a cache set;
- * we'll continue to run normally for awhile with BCH_FS_STOPPING set (i.e.
- * flushing dirty data).
- *
- * BCH_FS_RUNNING means all cache devices have been registered and journal
- * replay is complete.
  */
 enum {
-	/* Startup: */
 	BCH_FS_INITIAL_GC_DONE,
-	BCH_FS_RUNNING,
-
-	/* Shutdown: */
 	BCH_FS_DETACHING,
-	BCH_FS_STOPPING,
-	BCH_FS_RO,
-	BCH_FS_RO_COMPLETE,
 	BCH_FS_EMERGENCY_RO,
 	BCH_FS_WRITE_DISABLE_COMPLETE,
 	BCH_FS_GC_STOPPING,
@@ -498,6 +484,21 @@ struct btree_debug {
 	struct dentry		*failed;
 };
 
+struct bch_tier {
+	unsigned		idx;
+	struct task_struct	*migrate;
+	struct bch_pd_controller pd;
+
+	struct cache_group	devs;
+};
+
+enum bch_fs_state {
+	BCH_FS_STARTING		= 0,
+	BCH_FS_STOPPING,
+	BCH_FS_RO,
+	BCH_FS_RW,
+};
+
 struct cache_set {
 	struct closure		cl;
 
@@ -506,7 +507,6 @@ struct cache_set {
 	struct kobject		internal;
 	struct kobject		opts_dir;
 	struct kobject		time_stats;
-	struct completion	*stop_completion;
 	unsigned long		flags;
 
 	int			minor;
@@ -514,6 +514,10 @@ struct cache_set {
 	struct super_block	*vfs_sb;
 	char			name[40];
 
+	/* ro/rw, add/remove devices: */
+	struct mutex		state_lock;
+	enum bch_fs_state	state;
+
 	/* Counts outstanding writes, for clean transition to read-only */
 	struct percpu_ref	writes;
 	struct work_struct	read_only_work;
@@ -640,7 +644,9 @@ struct cache_set {
 	 * allocate from:
 	 */
 	struct cache_group	cache_all;
-	struct cache_group	cache_tiers[BCH_TIER_MAX];
+	struct bch_tier		tiers[BCH_TIER_MAX];
+	/* NULL if we only have devices in one tier: */
+	struct bch_tier		*fastest_tier;
 
 	u64			capacity; /* sectors */
 
@@ -753,10 +759,6 @@ struct cache_set {
 	unsigned		writeback_pages_max;
 	atomic_long_t		nr_inodes;
 
-	/* TIERING */
-	struct task_struct	*tiering_read;
-	struct bch_pd_controller tiering_pd;
-
 	/* NOTIFICATIONS */
 	struct mutex		uevent_lock;
 	struct kobj_uevent_env	uevent_env;
@@ -828,6 +830,11 @@ struct cache_set {
 #undef BCH_TIME_STAT
 };
 
+static inline bool bch_fs_running(struct cache_set *c)
+{
+	return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
+}
+
 static inline unsigned bucket_pages(const struct cache *ca)
 {
 	return ca->mi.bucket_size / PAGE_SECTORS;
@@ -375,6 +375,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
 	bool found;
 	int ret;
 
+	lockdep_assert_held(&c->state_lock);
+
 	bdevname(dc->disk_sb.bdev, buf);
 
 	if (memcmp(&dc->disk_sb.sb->set_uuid,
@@ -387,11 +389,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
 		return -EINVAL;
 	}
 
-	if (!test_bit(BCH_FS_RUNNING, &c->flags))
-		return 0;
-
-	if (test_bit(BCH_FS_STOPPING, &c->flags)) {
-		pr_err("Can't attach %s: shutting down", buf);
+	if (!bch_fs_running(c)) {
+		pr_err("Can't attach %s: not running", buf);
 		return -EINVAL;
 	}
 
@@ -497,6 +496,7 @@ void bch_attach_backing_devs(struct cache_set *c)
 	struct cached_dev *dc, *t;
 
 	lockdep_assert_held(&bch_register_lock);
+	lockdep_assert_held(&c->state_lock);
 
 	list_for_each_entry_safe(dc, t, &uncached_devices, list)
 		bch_cached_dev_attach(dc, c);
@@ -742,7 +742,7 @@ int bch_blockdev_volumes_start(struct cache_set *c)
 	struct bkey_s_c_inode_blockdev inode;
 	int ret = 0;
 
-	if (test_bit(BCH_FS_STOPPING, &c->flags))
+	if (!bch_fs_running(c))
 		return -EINVAL;
 
 	for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, k) {
@@ -11,8 +11,9 @@
 
 #define DEF_BTREE_ID(kwd, val, name) name,
 
-const char *bch_btree_id_names[BTREE_ID_NR] = {
+const char * const bch_btree_ids[] = {
 	DEFINE_BCH_BTREE_IDS()
+	NULL
 };
 
 #undef DEF_BTREE_ID
@@ -311,7 +312,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink,
 	return mca_can_free(c) * btree_pages(c);
 }
 
-void bch_btree_cache_free(struct cache_set *c)
+void bch_fs_btree_exit(struct cache_set *c)
 {
 	struct btree *b;
 	unsigned i;
@@ -358,7 +359,7 @@ void bch_btree_cache_free(struct cache_set *c)
 	rhashtable_destroy(&c->btree_cache_table);
 }
 
-int bch_btree_cache_alloc(struct cache_set *c)
+int bch_fs_btree_init(struct cache_set *c)
 {
 	unsigned i;
 	int ret;
@@ -6,7 +6,7 @@
 
 struct btree_iter;
 
-extern const char *bch_btree_id_names[BTREE_ID_NR];
+extern const char * const bch_btree_ids[];
 
 void bch_recalc_btree_reserve(struct cache_set *);
 
@@ -22,8 +22,8 @@ struct btree *mca_alloc(struct cache_set *);
 struct btree *bch_btree_node_get(struct btree_iter *, const struct bkey_i *,
 				 unsigned, enum six_lock_type);
 
-void bch_btree_cache_free(struct cache_set *);
-int bch_btree_cache_alloc(struct cache_set *);
+void bch_fs_btree_exit(struct cache_set *);
+int bch_fs_btree_init(struct cache_set *);
 
 #define for_each_cached_btree(_b, _c, _tbl, _iter, _pos)		\
 	for ((_tbl) = rht_dereference_rcu((_c)->btree_cache_table.tbl,	\
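The renamed table gains a NULL sentinel, so callers can walk it without knowing BTREE_ID_NR — handy when matching a user-supplied btree name. A sketch (parse_btree_id and arg are hypothetical, for illustration only):

    #include <string.h>

    /* `arg` is a user-supplied name; returns its btree id or dies: */
    static unsigned parse_btree_id(const char *arg)
    {
            for (unsigned i = 0; bch_btree_ids[i]; i++)
                    if (!strcmp(arg, bch_btree_ids[i]))
                            return i;
            die("unknown btree %s", arg);
    }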
@@ -262,30 +262,72 @@ static void bch_mark_allocator_buckets(struct cache_set *c)
 	}
 }
 
+static void mark_metadata_sectors(struct cache *ca, u64 start, u64 end,
+				  enum bucket_data_type type)
+{
+	u64 b = start >> ca->bucket_bits;
+
+	do {
+		bch_mark_metadata_bucket(ca, ca->buckets + b, type, true);
+		b++;
+	} while (b < end >> ca->bucket_bits);
+}
+
 /*
  * Mark non btree metadata - prios, journal
  */
+static void bch_mark_dev_metadata(struct cache_set *c, struct cache *ca)
+{
+	struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+	unsigned i;
+	u64 b;
+
+	/* Mark superblocks: */
+	for (i = 0; i < layout->nr_superblocks; i++) {
+		if (layout->sb_offset[i] == BCH_SB_SECTOR)
+			mark_metadata_sectors(ca, 0, BCH_SB_SECTOR,
+					      BUCKET_SB);
+
+		mark_metadata_sectors(ca,
+				      layout->sb_offset[i],
+				      layout->sb_offset[i] +
+				      (1 << layout->sb_max_size_bits),
+				      BUCKET_SB);
+	}
+
+	spin_lock(&c->journal.lock);
+
+	for (i = 0; i < ca->journal.nr; i++) {
+		b = ca->journal.buckets[i];
+		bch_mark_metadata_bucket(ca, ca->buckets + b,
+					 BUCKET_JOURNAL, true);
+	}
+
+	spin_unlock(&c->journal.lock);
+
+	spin_lock(&ca->prio_buckets_lock);
+
+	for (i = 0; i < prio_buckets(ca) * 2; i++) {
+		b = ca->prio_buckets[i];
+		if (b)
+			bch_mark_metadata_bucket(ca, ca->buckets + b,
+						 BUCKET_PRIOS, true);
+	}
+
+	spin_unlock(&ca->prio_buckets_lock);
+}
+
 static void bch_mark_metadata(struct cache_set *c)
 {
 	struct cache *ca;
-	unsigned i, j;
-	u64 b;
+	unsigned i;
+
+	mutex_lock(&c->sb_lock);
 
-	for_each_cache(ca, c, i) {
-		for (j = 0; j < ca->journal.nr; j++) {
-			b = ca->journal.buckets[j];
-			bch_mark_metadata_bucket(ca, ca->buckets + b, true);
-		}
-
-		spin_lock(&ca->prio_buckets_lock);
+	for_each_cache(ca, c, i)
+		bch_mark_dev_metadata(c, ca);
 
-		for (j = 0; j < prio_buckets(ca) * 2; j++) {
-			b = ca->prio_buckets[j];
-			bch_mark_metadata_bucket(ca, ca->buckets + b, true);
-		}
-
-		spin_unlock(&ca->prio_buckets_lock);
-	}
+	mutex_unlock(&c->sb_lock);
 }
 
 /* Also see bch_pending_btree_node_free_insert_done() */

@@ -389,7 +431,7 @@ void bch_gc(struct cache_set *c)
 		for_each_bucket(g, ca) {
 			bucket_cmpxchg(g, new, ({
 				new.owned_by_allocator = 0;
-				new.is_metadata = 0;
+				new.data_type = 0;
 				new.cached_sectors = 0;
 				new.dirty_sectors = 0;
 			}));

@@ -750,9 +792,6 @@ void bch_coalesce(struct cache_set *c)
 	u64 start_time;
 	enum btree_id id;
 
-	if (btree_gc_coalesce_disabled(c))
-		return;
-
 	if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
 		return;
 

@@ -811,7 +850,8 @@ static int bch_gc_thread(void *arg)
 		last_kick = atomic_read(&c->kick_gc);
 
 		bch_gc(c);
-		bch_coalesce(c);
+		if (!btree_gc_coalesce_disabled(c))
+			bch_coalesce(c);
 
 		debug_check_no_locks_held();
 	}

@@ -823,18 +863,24 @@ void bch_gc_thread_stop(struct cache_set *c)
 {
 	set_bit(BCH_FS_GC_STOPPING, &c->flags);
 
-	if (!IS_ERR_OR_NULL(c->gc_thread))
+	if (c->gc_thread)
 		kthread_stop(c->gc_thread);
 
+	c->gc_thread = NULL;
+	clear_bit(BCH_FS_GC_STOPPING, &c->flags);
 }
 
 int bch_gc_thread_start(struct cache_set *c)
 {
-	clear_bit(BCH_FS_GC_STOPPING, &c->flags);
+	struct task_struct *p;
 
-	c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc");
-	if (IS_ERR(c->gc_thread))
-		return PTR_ERR(c->gc_thread);
+	BUG_ON(c->gc_thread);
 
+	p = kthread_create(bch_gc_thread, c, "bcache_gc");
+	if (IS_ERR(p))
+		return PTR_ERR(p);
+
+	c->gc_thread = p;
 	wake_up_process(c->gc_thread);
 	return 0;
 }

@@ -883,12 +929,13 @@ int bch_initial_gc(struct cache_set *c, struct list_head *journal)
 {
 	enum btree_id id;
 
-	if (journal) {
-		for (id = 0; id < BTREE_ID_NR; id++)
-			bch_initial_gc_btree(c, id);
+	bch_mark_metadata(c);
 
+	for (id = 0; id < BTREE_ID_NR; id++)
+		bch_initial_gc_btree(c, id);
+
+	if (journal)
 		bch_journal_mark(c, journal);
-	}
 
 	/*
 	 * Skip past versions that might have possibly been used (as nonces),

@@ -897,8 +944,6 @@ int bch_initial_gc(struct cache_set *c, struct list_head *journal)
 	if (c->sb.encryption_type)
 		atomic64_add(1 << 16, &c->key_version);
 
-	bch_mark_metadata(c);
-
 	gc_pos_set(c, gc_phase(GC_PHASE_DONE));
 	set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
 
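A note on the bucket arithmetic in the new mark_metadata_sectors() above (my reading, not part of the commit): ca->bucket_bits is log2 of the bucket size in sectors, so a sector range maps onto a bucket range by shifting, and the do/while guarantees the bucket containing the start sector is marked even for a range smaller than one bucket. A minimal standalone sketch with hypothetical sizes:

	/* Sketch of the sector-range -> bucket-range walk; bucket_bits is
	 * assumed to be log2(bucket size in sectors). */
	#include <stdint.h>
	#include <stdio.h>

	static void mark_range(uint64_t start, uint64_t end, unsigned bucket_bits)
	{
		uint64_t b = start >> bucket_bits;

		do {
			printf("mark bucket %llu\n", (unsigned long long) b);
			b++;
		} while (b < end >> bucket_bits);
	}

	int main(void)
	{
		/* hypothetical: superblock in sectors [8, 16), 4 sectors
		 * per bucket (bucket_bits = 2) -> marks buckets 2 and 3 */
		mark_range(8, 16, 2);
		return 0;
	}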
@@ -66,6 +66,7 @@
 #include "alloc.h"
 #include "btree_gc.h"
 #include "buckets.h"
+#include "error.h"
 
 #include <linux/preempt.h>
 #include <trace/events/bcache.h>

@@ -102,6 +103,10 @@ static void bch_fs_stats_verify(struct cache_set *c) {}
 
 #endif
 
+/*
+ * Clear journal_seq_valid for buckets for which it's not needed, to prevent
+ * wraparound:
+ */
 void bch_bucket_seq_cleanup(struct cache_set *c)
 {
 	u16 last_seq_ondisk = c->journal.last_seq_ondisk;

@@ -113,12 +118,11 @@ void bch_bucket_seq_cleanup(struct cache_set *c)
 	for_each_cache(ca, c, i)
 		for_each_bucket(g, ca) {
 			bucket_cmpxchg(g, m, ({
-				if (!m.wait_on_journal ||
-				    ((s16) last_seq_ondisk -
-				     (s16) m.journal_seq < 0))
+				if (!m.journal_seq_valid ||
+				    bucket_needs_journal_commit(m, last_seq_ondisk))
 					break;
 
-				m.wait_on_journal = 0;
+				m.journal_seq_valid = 0;
 			}));
 		}
 }

@@ -186,17 +190,18 @@ bch_bucket_stats_read_cache_set(struct cache_set *c)
 
 static inline int is_meta_bucket(struct bucket_mark m)
 {
-	return !m.owned_by_allocator && m.is_metadata;
+	return m.data_type != BUCKET_DATA;
 }
 
 static inline int is_dirty_bucket(struct bucket_mark m)
 {
-	return !m.owned_by_allocator && !m.is_metadata && !!m.dirty_sectors;
+	return m.data_type == BUCKET_DATA && !!m.dirty_sectors;
 }
 
 static inline int is_cached_bucket(struct bucket_mark m)
 {
-	return !m.owned_by_allocator && !m.dirty_sectors && !!m.cached_sectors;
+	return m.data_type == BUCKET_DATA &&
+		!m.dirty_sectors && !!m.cached_sectors;
 }
 
 void bch_fs_stats_apply(struct cache_set *c,

@@ -236,29 +241,37 @@ void bch_fs_stats_apply(struct cache_set *c,
 	memset(stats, 0, sizeof(*stats));
 }
 
+static bool bucket_became_unavailable(struct cache_set *c,
+				      struct bucket_mark old,
+				      struct bucket_mark new)
+{
+	return is_available_bucket(old) &&
+	       !is_available_bucket(new) &&
+	       c->gc_pos.phase == GC_PHASE_DONE;
+}
+
 static void bucket_stats_update(struct cache *ca,
 			struct bucket_mark old, struct bucket_mark new,
-			bool may_make_unavailable,
 			struct bucket_stats_cache_set *bch_alloc_stats)
 {
 	struct cache_set *c = ca->set;
 	struct bucket_stats_cache *cache_stats;
 
-	BUG_ON(!may_make_unavailable &&
-	       is_available_bucket(old) &&
-	       !is_available_bucket(new) &&
-	       c->gc_pos.phase == GC_PHASE_DONE);
+	bch_fs_inconsistent_on(old.data_type && new.data_type &&
+			old.data_type != new.data_type, c,
+			"different types of metadata in same bucket: %u, %u",
+			old.data_type, new.data_type);
 
 	if (bch_alloc_stats) {
 		bch_alloc_stats->s[S_COMPRESSED][S_CACHED] +=
 			(int) new.cached_sectors - (int) old.cached_sectors;
 
 		bch_alloc_stats->s[S_COMPRESSED]
-			[old.is_metadata ? S_META : S_DIRTY] -=
+			[is_meta_bucket(old) ? S_META : S_DIRTY] -=
 			old.dirty_sectors;
 
 		bch_alloc_stats->s[S_COMPRESSED]
-			[new.is_metadata ? S_META : S_DIRTY] +=
+			[is_meta_bucket(new) ? S_META : S_DIRTY] +=
 			new.dirty_sectors;
 	}
 

@@ -268,12 +281,12 @@ static void bucket_stats_update(struct cache *ca,
 	cache_stats->sectors_cached +=
 		(int) new.cached_sectors - (int) old.cached_sectors;
 
-	if (old.is_metadata)
+	if (is_meta_bucket(old))
 		cache_stats->sectors_meta -= old.dirty_sectors;
 	else
 		cache_stats->sectors_dirty -= old.dirty_sectors;
 
-	if (new.is_metadata)
+	if (is_meta_bucket(new))
 		cache_stats->sectors_meta += new.dirty_sectors;
 	else
 		cache_stats->sectors_dirty += new.dirty_sectors;

@@ -290,6 +303,15 @@ static void bucket_stats_update(struct cache *ca,
 		bch_wake_allocator(ca);
 }
 
+#define bucket_data_cmpxchg(ca, g, new, expr)				\
+({									\
+	struct bucket_stats_cache_set _stats = { 0 };			\
+	struct bucket_mark _old = bucket_cmpxchg(g, new, expr);		\
+									\
+	bucket_stats_update(ca, _old, new, &_stats);			\
+	_old;								\
+})
+
 void bch_invalidate_bucket(struct cache *ca, struct bucket *g)
 {
 	struct bucket_stats_cache_set stats = { 0 };
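An aside on the pattern the new bucket_data_cmpxchg() macro wraps (a sketch under my reading of the diff, not part of the commit): bucket_cmpxchg() retries a whole-word compare-and-swap on the bucket_mark union until the update applies atomically, and the macro then funnels every successful old-to-new transition through bucket_stats_update(). In isolation the loop looks roughly like:

	/* Simplified model of the cmpxchg-loop bucket-mark update;
	 * "counter" stands in for the union over the mark's bitfields. */
	#include <stdatomic.h>
	#include <stdint.h>

	struct mark { uint64_t counter; };

	static struct mark update_mark(_Atomic uint64_t *v, uint64_t set_bits)
	{
		struct mark old, new;

		old.counter = atomic_load(v);
		do {
			new = old;
			new.counter |= set_bits;	/* the "expr" of the macro */
		} while (!atomic_compare_exchange_weak(v, &old.counter,
						       new.counter));

		/* bucket_stats_update(ca, old, new, &stats) runs here,
		 * exactly once per successful transition */
		return old;
	}

On failure atomic_compare_exchange_weak() reloads the current value into old, so new is always recomputed from a fresh snapshot.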
@@ -297,16 +319,17 @@ void bch_invalidate_bucket(struct cache *ca, struct bucket *g)
 
 	old = bucket_cmpxchg(g, new, ({
 		new.owned_by_allocator = 1;
-		new.is_metadata = 0;
+		new.had_metadata = 0;
+		new.data_type = 0;
 		new.cached_sectors = 0;
 		new.dirty_sectors = 0;
 		new.copygc = 0;
 		new.gen++;
 	}));
 
-	BUG_ON(old.dirty_sectors);
+	bucket_stats_update(ca, old, new, &stats);
 
-	bucket_stats_update(ca, old, new, true, &stats);
+	BUG_ON(old.dirty_sectors);
 
 	/*
 	 * Ick:

@@ -329,45 +352,45 @@ void bch_invalidate_bucket(struct cache *ca, struct bucket *g)
 
 void bch_mark_free_bucket(struct cache *ca, struct bucket *g)
 {
-	struct bucket_stats_cache_set stats = { 0 };
 	struct bucket_mark old, new;
 
-	old = bucket_cmpxchg(g, new, ({
+	old = bucket_data_cmpxchg(ca, g, new, ({
 		new.owned_by_allocator = 0;
-		new.is_metadata = 0;
+		new.data_type = 0;
 		new.cached_sectors = 0;
 		new.dirty_sectors = 0;
 	}));
 
-	bucket_stats_update(ca, old, new, false, &stats);
+	BUG_ON(bucket_became_unavailable(ca->set, old, new));
 }
 
 void bch_mark_alloc_bucket(struct cache *ca, struct bucket *g,
 			   bool owned_by_allocator)
 {
-	struct bucket_stats_cache_set stats = { 0 };
-	struct bucket_mark old, new;
+	struct bucket_mark new;
 
-	old = bucket_cmpxchg(g, new, new.owned_by_allocator = owned_by_allocator);
-
-	bucket_stats_update(ca, old, new, true, &stats);
+	bucket_data_cmpxchg(ca, g, new, ({
+		new.owned_by_allocator = owned_by_allocator;
+	}));
 }
 
 void bch_mark_metadata_bucket(struct cache *ca, struct bucket *g,
+			      enum bucket_data_type type,
 			      bool may_make_unavailable)
 {
-	struct bucket_stats_cache_set stats = { 0 };
 	struct bucket_mark old, new;
 
-	old = bucket_cmpxchg(g, new, ({
-		new.is_metadata = 1;
+	BUG_ON(!type);
+
+	old = bucket_data_cmpxchg(ca, g, new, ({
+		new.data_type = type;
 		new.had_metadata = 1;
 	}));
 
 	BUG_ON(old.cached_sectors);
 	BUG_ON(old.dirty_sectors);
 
-	bucket_stats_update(ca, old, new, may_make_unavailable, &stats);
+	BUG_ON(!may_make_unavailable &&
+	       bucket_became_unavailable(ca->set, old, new));
 }
 
 #define saturated_add(ca, dst, src, max)			\

@@ -487,22 +510,26 @@ static void bch_mark_pointer(struct cache_set *c,
 
 		if (!new.dirty_sectors &&
 		    !new.cached_sectors) {
-			new.is_metadata = false;
+			new.data_type = 0;
 
 			if (journal_seq) {
-				new.wait_on_journal = true;
+				new.journal_seq_valid = 1;
 				new.journal_seq = journal_seq;
 			}
 		} else {
-			new.is_metadata = (type == S_META);
+			new.data_type = type == S_META
+				? BUCKET_BTREE : BUCKET_DATA;
 		}
 
-		new.had_metadata |= new.is_metadata;
+		new.had_metadata |= is_meta_bucket(new);
 	} while ((v = cmpxchg(&g->_mark.counter,
 			      old.counter,
 			      new.counter)) != old.counter);
 
-	bucket_stats_update(ca, old, new, may_make_unavailable, NULL);
+	bucket_stats_update(ca, old, new, NULL);
+
+	BUG_ON(!may_make_unavailable &&
+	       bucket_became_unavailable(c, old, new));
 
 	if (saturated &&
 	    atomic_long_add_return(saturated,
@@ -235,8 +235,16 @@ static inline u64 sectors_available(struct cache_set *c)
 static inline bool is_available_bucket(struct bucket_mark mark)
 {
 	return (!mark.owned_by_allocator &&
-		!mark.is_metadata &&
-		!mark.dirty_sectors);
+		mark.data_type == BUCKET_DATA &&
+		!mark.dirty_sectors &&
+		!mark.nouse);
+}
+
+static inline bool bucket_needs_journal_commit(struct bucket_mark m,
+					       u16 last_seq_ondisk)
+{
+	return m.journal_seq_valid &&
+		((s16) m.journal_seq - (s16) last_seq_ondisk > 0);
 }
 
 void bch_bucket_seq_cleanup(struct cache_set *);

@@ -244,7 +252,8 @@ void bch_bucket_seq_cleanup(struct cache_set *);
 void bch_invalidate_bucket(struct cache *, struct bucket *);
 void bch_mark_free_bucket(struct cache *, struct bucket *);
 void bch_mark_alloc_bucket(struct cache *, struct bucket *, bool);
-void bch_mark_metadata_bucket(struct cache *, struct bucket *, bool);
+void bch_mark_metadata_bucket(struct cache *, struct bucket *,
+			      enum bucket_data_type, bool);
 
 void __bch_gc_mark_key(struct cache_set *, struct bkey_s_c, s64, bool,
 		       struct bucket_stats_cache_set *);

@@ -1,6 +1,14 @@
 #ifndef _BUCKETS_TYPES_H
 #define _BUCKETS_TYPES_H
 
+enum bucket_data_type {
+	BUCKET_DATA = 0,
+	BUCKET_BTREE,
+	BUCKET_PRIOS,
+	BUCKET_JOURNAL,
+	BUCKET_SB,
+};
+
 struct bucket_mark {
 	union {
 	struct {

@@ -12,23 +20,30 @@ struct bucket_mark {
 
 		/* generation copygc is going to move this bucket into */
 		unsigned copygc:1;
-		unsigned wait_on_journal:1;
+		unsigned journal_seq_valid:1;
 
 		/*
-		 * If this bucket ever had metadata in it, the allocator must
-		 * increment its gen before we reuse it:
+		 * If this bucket had metadata while at the current generation
+		 * number, the allocator must increment its gen before we reuse
+		 * it:
 		 */
 		unsigned had_metadata:1;
 
 		unsigned owned_by_allocator:1;
-		unsigned is_metadata:1;
 
-		u16 cached_sectors;
+		unsigned data_type:3;
+
+		unsigned nouse:1;
+
 		u16 dirty_sectors;
+		u16 cached_sectors;
 
 		/*
 		 * low bits of journal sequence number when this bucket was most
-		 * recently modified:
+		 * recently modified: if journal_seq_valid is set, this bucket
+		 * can't be reused until the journal sequence number written to
+		 * disk is >= the bucket's journal sequence number:
 		 */
 		u16 journal_seq;
 	};
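Since journal_seq stores only the low 16 bits of the journal sequence number, bucket_needs_journal_commit() above compares via signed 16-bit arithmetic: the result is correct as long as the two sequence numbers are within 2^15 of each other, which is exactly the ambiguity bch_bucket_seq_cleanup() exists to prevent. A small demonstration of the comparison (mine, not part of the diff):

	#include <assert.h>
	#include <stdint.h>

	/* equivalent to the comparison in bucket_needs_journal_commit() */
	static int seq_after(uint16_t seq, uint16_t last_ondisk)
	{
		return (int16_t) (seq - last_ondisk) > 0;
	}

	int main(void)
	{
		assert(seq_after(5, 3));	/* simple case */
		assert(!seq_after(3, 5));
		assert(seq_after(2, 65530));	/* still correct across wraparound */
		return 0;
	}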
@@ -107,7 +107,7 @@ static long bch_global_ioctl(unsigned cmd, void __user *arg)
 
 static long bch_ioctl_stop(struct cache_set *c)
 {
-	bch_fs_stop(c);
+	bch_fs_stop_async(c);
 	return 0;
 }
 

@@ -539,15 +539,12 @@ int bch_enable_encryption(struct cache_set *c, bool keyed)
 	if (ret)
 		goto err;
 
-	crypt = container_of_or_null(bch_fs_sb_field_resize(c, NULL,
-					sizeof(*crypt) / sizeof(u64)),
-				     struct bch_sb_field_crypt, field);
+	crypt = bch_fs_sb_resize_crypt(c, sizeof(*crypt) / sizeof(u64));
 	if (!crypt) {
 		ret = -ENOMEM; /* XXX this technically could be -ENOSPC */
 		goto err;
 	}
 
-	crypt->field.type = BCH_SB_FIELD_crypt;
 	crypt->key = key;
 
 	/* write superblock */

@@ -560,7 +557,7 @@ err:
 	return ret;
 }
 
-void bch_fs_encryption_free(struct cache_set *c)
+void bch_fs_encryption_exit(struct cache_set *c)
 {
 	if (!IS_ERR_OR_NULL(c->poly1305))
 		crypto_free_shash(c->poly1305);

@@ -43,7 +43,7 @@ void bch_encrypt_bio(struct cache_set *, unsigned,
 int bch_disable_encryption(struct cache_set *);
 int bch_enable_encryption(struct cache_set *, bool);
 
-void bch_fs_encryption_free(struct cache_set *);
+void bch_fs_encryption_exit(struct cache_set *);
 int bch_fs_encryption_init(struct cache_set *);
 
 static inline unsigned bch_data_checksum_type(struct cache_set *c)

@@ -434,10 +434,10 @@ int bch_check_set_has_compressed_data(struct cache_set *c,
 		break;
 	}
 
-	return bch_compress_init(c);
+	return bch_fs_compress_init(c);
 }
 
-void bch_compress_free(struct cache_set *c)
+void bch_fs_compress_exit(struct cache_set *c)
 {
 	vfree(c->zlib_workspace);
 	mempool_exit(&c->lz4_workspace_pool);

@@ -450,15 +450,11 @@ void bch_compress_free(struct cache_set *c)
 	max_t(size_t, zlib_inflate_workspacesize(),			\
 	      zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL))
 
-int bch_compress_init(struct cache_set *c)
+int bch_fs_compress_init(struct cache_set *c)
 {
 	unsigned order = get_order(BCH_ENCODED_EXTENT_MAX << 9);
 	int ret, cpu;
 
-	if (!bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) &&
-	    !bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
-		return 0;
-
 	if (!c->bio_decompress_worker) {
 		c->bio_decompress_worker = alloc_percpu(*c->bio_decompress_worker);
 		if (!c->bio_decompress_worker)

@@ -474,6 +470,10 @@ int bch_compress_init(struct cache_set *c)
 		}
 	}
 
+	if (!bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) &&
+	    !bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
+		return 0;
+
 	if (!mempool_initialized(&c->compression_bounce[READ])) {
 		ret = mempool_init_page_pool(&c->compression_bounce[READ],
 					     1, order);

@@ -9,7 +9,7 @@ void bch_bio_compress(struct cache_set *, struct bio *, size_t *,
 		      struct bio *, size_t *, unsigned *);
 
 int bch_check_set_has_compressed_data(struct cache_set *, unsigned);
-void bch_compress_free(struct cache_set *);
-int bch_compress_init(struct cache_set *);
+void bch_fs_compress_exit(struct cache_set *);
+int bch_fs_compress_init(struct cache_set *);
 
 #endif /* _BCACHE_COMPRESS_H */

@@ -409,13 +409,13 @@ static const struct file_operations bfloat_failed_debug_ops = {
 	.read = bch_read_bfloat_failed,
 };
 
-void bch_debug_exit_cache_set(struct cache_set *c)
+void bch_fs_debug_exit(struct cache_set *c)
 {
 	if (!IS_ERR_OR_NULL(c->debug))
 		debugfs_remove_recursive(c->debug);
 }
 
-void bch_debug_init_cache_set(struct cache_set *c)
+void bch_fs_debug_init(struct cache_set *c)
 {
 	struct btree_debug *bd;
 	char name[100];

@@ -432,18 +432,18 @@ void bch_debug_init_cache_set(struct cache_set *c)
 	     bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
 	     bd++) {
 		bd->id = bd - c->btree_debug;
-		bd->btree = debugfs_create_file(bch_btree_id_names[bd->id],
+		bd->btree = debugfs_create_file(bch_btree_ids[bd->id],
						0400, c->debug, bd,
						&btree_debug_ops);
 
 		snprintf(name, sizeof(name), "%s-formats",
-			 bch_btree_id_names[bd->id]);
+			 bch_btree_ids[bd->id]);
 
 		bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd,
						       &btree_format_debug_ops);
 
 		snprintf(name, sizeof(name), "%s-bfloat-failed",
-			 bch_btree_id_names[bd->id]);
+			 bch_btree_ids[bd->id]);
 
 		bd->failed = debugfs_create_file(name, 0400, c->debug, bd,
						 &bfloat_failed_debug_ops);

@@ -52,11 +52,11 @@ static inline void bch_btree_verify(struct cache_set *c, struct btree *b)
 }
 
 #ifdef CONFIG_DEBUG_FS
-void bch_debug_exit_cache_set(struct cache_set *);
-void bch_debug_init_cache_set(struct cache_set *);
+void bch_fs_debug_exit(struct cache_set *);
+void bch_fs_debug_init(struct cache_set *);
 #else
-static inline void bch_debug_exit_cache_set(struct cache_set *c) {}
-static inline void bch_debug_init_cache_set(struct cache_set *c) {}
+static inline void bch_fs_debug_exit(struct cache_set *c) {}
+static inline void bch_fs_debug_init(struct cache_set *c) {}
 #endif
 
 void bch_debug_exit(void);

@@ -14,7 +14,7 @@ void bch_inconsistent_error(struct cache_set *c)
 	case BCH_ON_ERROR_RO:
 		if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {
 			/* XXX do something better here? */
-			bch_fs_stop(c);
+			bch_fs_stop_async(c);
 			return;
 		}
 

@@ -120,7 +120,7 @@ void bch_nonfatal_io_error_work(struct work_struct *work)
 	} else {
 		bch_notify_dev_error(ca, true);
 
-		mutex_lock(&bch_register_lock);
+		mutex_lock(&c->state_lock);
 		dev = bch_dev_may_remove(ca);
 		if (dev
 		    ? bch_dev_read_only(ca)

@@ -129,7 +129,7 @@ void bch_nonfatal_io_error_work(struct work_struct *work)
 			"too many IO errors on %s, setting %s RO",
 			bdevname(ca->disk_sb.bdev, buf),
 			dev ? "device" : "filesystem");
-		mutex_unlock(&bch_register_lock);
+		mutex_unlock(&c->state_lock);
 	}
 }
 
@@ -547,7 +547,7 @@ static void btree_ptr_debugcheck(struct cache_set *c, struct btree *b,
 		do {
 			seq = read_seqcount_begin(&c->gc_pos_lock);
 			bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
-				!g->mark.is_metadata;
+				g->mark.data_type != BUCKET_BTREE;
 		} while (read_seqcount_retry(&c->gc_pos_lock, seq));
 
 		err = "inconsistent";

@@ -602,6 +602,7 @@ bch_btree_pick_ptr(struct cache_set *c, const struct btree *b)
 	struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
 	const union bch_extent_crc *crc;
 	const struct bch_extent_ptr *ptr;
+	struct extent_pick_ptr pick = { .ca = NULL };
 	struct cache *ca;
 
 	rcu_read_lock();

@@ -621,15 +622,19 @@ bch_btree_pick_ptr(struct cache_set *c, const struct btree *b)
 			     PTR_BUCKET_NR(ca, ptr)))
 			continue;
 
-		percpu_ref_get(&ca->ref);
-		rcu_read_unlock();
+		if (pick.ca && pick.ca->mi.tier < ca->mi.tier)
+			continue;
 
-		return (struct extent_pick_ptr) { .ptr = *ptr, .ca = ca };
+		pick.ca = ca;
+		pick.ptr = *ptr;
 	}
 
+	if (pick.ca)
+		percpu_ref_get(&pick.ca->ref);
+
 	rcu_read_unlock();
 
-	return (struct extent_pick_ptr) { .ca = NULL, };
+	return pick;
 }
 
 const struct bkey_ops bch_bkey_btree_ops = {

@@ -1880,7 +1885,7 @@ static void bch_extent_debugcheck_extent(struct cache_set *c, struct btree *b,
 		if (stale)
 			break;
 
-		bad = (mark.is_metadata ||
+		bad = (mark.data_type != BUCKET_DATA ||
 		       (gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
 			!mark.owned_by_allocator &&
 			!(ptr->cached

@@ -2193,17 +2198,21 @@ void bch_extent_pick_ptr_avoiding(struct cache_set *c, struct bkey_s_c k,
 	rcu_read_lock();
 	ret->ca = NULL;
 
-	extent_for_each_online_device_crc(c, e, crc, ptr, ca)
-		if (!ptr_stale(ca, ptr)) {
-			*ret = (struct extent_pick_ptr) {
-				.crc = crc_to_128(e.k, crc),
-				.ptr = *ptr,
-				.ca = ca,
-			};
+	extent_for_each_online_device_crc(c, e, crc, ptr, ca) {
+		if (ptr_stale(ca, ptr))
+			continue;
 
-			if (ca != avoid)
-				break;
-		}
+		if (ret->ca &&
+		    (ca == avoid ||
+		     ret->ca->mi.tier < ca->mi.tier))
+			continue;
+
+		*ret = (struct extent_pick_ptr) {
+			.crc = crc_to_128(e.k, crc),
+			.ptr = *ptr,
+			.ca = ca,
+		};
+	}
 
 	if (ret->ca)
 		percpu_ref_get(&ret->ca->ref);
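Both pointer-pick loops above change from returning the first usable pointer to scanning every pointer and keeping the best one, where a lower mi.tier (a faster device) wins and the device to avoid is passed over only when an alternative exists. The shape of that selection in isolation (a sketch under my reading of the diff, with a hypothetical struct dev):

	/* Keep the replica on the lowest-numbered (fastest) tier. */
	struct dev { unsigned tier; };

	static const struct dev *pick_fastest(const struct dev *devs, unsigned nr,
					      const struct dev *avoid)
	{
		const struct dev *pick = 0;
		unsigned i;

		for (i = 0; i < nr; i++) {
			/* 'avoid' is only skipped if we already have a pick,
			 * so it can still be used as a last resort: */
			if (pick &&
			    (&devs[i] == avoid || pick->tier < devs[i].tier))
				continue;
			pick = &devs[i];
		}
		return pick;
	}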
@@ -545,9 +545,9 @@ struct nlink {
 	u32 dir_count;
 };
 
-DECLARE_GENRADIX_TYPE(nlinks, struct nlink);
+typedef GENRADIX(struct nlink) nlink_table;
 
-static void inc_link(struct cache_set *c, struct nlinks *links,
+static void inc_link(struct cache_set *c, nlink_table *links,
 		     u64 range_start, u64 *range_end,
 		     u64 inum, bool dir)
 {

@@ -570,7 +570,7 @@ static void inc_link(struct cache_set *c, struct nlinks *links,
 }
 
 noinline_for_stack
-static int bch_gc_walk_dirents(struct cache_set *c, struct nlinks *links,
+static int bch_gc_walk_dirents(struct cache_set *c, nlink_table *links,
 			       u64 range_start, u64 *range_end)
 {
 	struct btree_iter iter;

@@ -776,7 +776,7 @@ fsck_err:
 noinline_for_stack
 static int bch_gc_walk_inodes(struct cache_set *c,
 			      struct bch_inode_unpacked *lostfound_inode,
-			      struct nlinks *links,
+			      nlink_table *links,
 			      u64 range_start, u64 range_end)
 {
 	struct btree_iter iter;

@@ -850,7 +850,7 @@ noinline_for_stack
 static int check_inode_nlinks(struct cache_set *c,
 			      struct bch_inode_unpacked *lostfound_inode)
 {
-	struct nlinks links;
+	nlink_table links;
 	u64 this_iter_range_start, next_iter_range_start = 0;
 	int ret = 0;
 

@@ -1257,13 +1257,17 @@ static struct cache_set *bch_open_as_blockdevs(const char *_dev_name,
 		if (!c)
 			goto err_unlock;
 
-		if (!test_bit(BCH_FS_RUNNING, &c->flags)) {
+		mutex_lock(&c->state_lock);
+
+		if (!bch_fs_running(c)) {
+			mutex_unlock(&c->state_lock);
 			err = "incomplete cache set";
 			c = NULL;
 			goto err_unlock;
 		}
 
 		closure_get(&c->cl);
+		mutex_unlock(&c->state_lock);
 		mutex_unlock(&bch_register_lock);
 	}
 

@@ -1291,22 +1295,19 @@ static int bch_remount(struct super_block *sb, int *flags, char *data)
 	if (ret)
 		return ret;
 
-	mutex_lock(&bch_register_lock);
-
 	if (opts.read_only >= 0 &&
 	    opts.read_only != c->opts.read_only) {
 		const char *err = NULL;
 
 		if (opts.read_only) {
-			bch_fs_read_only_sync(c);
+			bch_fs_read_only(c);
 
 			sb->s_flags |= MS_RDONLY;
 		} else {
 			err = bch_fs_read_write(c);
 			if (err) {
 				bch_err(c, "error going rw: %s", err);
-				ret = -EINVAL;
-				goto unlock;
+				return -EINVAL;
 			}
 
 			sb->s_flags &= ~MS_RDONLY;

@@ -1318,9 +1319,6 @@ static int bch_remount(struct super_block *sb, int *flags, char *data)
 	if (opts.errors >= 0)
 		c->opts.errors = opts.errors;
 
-unlock:
-	mutex_unlock(&bch_register_lock);
-
 	return ret;
 }
 

@@ -1449,7 +1447,7 @@ static void bch_kill_sb(struct super_block *sb)
 	generic_shutdown_super(sb);
 
 	if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags))
-		bch_fs_stop_sync(c);
+		bch_fs_stop(c);
 	else
 		closure_put(&c->cl);
 }

@@ -1464,7 +1462,7 @@ static struct file_system_type bcache_fs_type = {
 
 MODULE_ALIAS_FS("bcache");
 
-void bch_fs_exit(void)
+void bch_vfs_exit(void)
 {
 	unregister_filesystem(&bcache_fs_type);
 	if (bch_dio_write_bioset)

@@ -1477,7 +1475,7 @@ void bch_fs_exit(void)
 	kmem_cache_destroy(bch_inode_cache);
 }
 
-int __init bch_fs_init(void)
+int __init bch_vfs_init(void)
 {
 	int ret = -ENOMEM;
 

@@ -1504,6 +1502,6 @@ int __init bch_fs_init(void)
 
 	return 0;
 err:
-	bch_fs_exit();
+	bch_vfs_exit();
 	return ret;
 }

@@ -52,13 +52,13 @@ int __must_check __bch_write_inode(struct cache_set *, struct bch_inode_info *,
 int __must_check bch_write_inode(struct cache_set *,
 				 struct bch_inode_info *);
 
-void bch_fs_exit(void);
-int bch_fs_init(void);
+void bch_vfs_exit(void);
+int bch_vfs_init(void);
 
 #else
 
-static inline void bch_fs_exit(void) {}
-static inline int bch_fs_init(void) { return 0; }
+static inline void bch_vfs_exit(void) {}
+static inline int bch_vfs_init(void) { return 0; }
 
 #endif
 
@@ -722,9 +722,7 @@ void bch_wake_delayed_writes(unsigned long data)
 	spin_lock_irqsave(&c->foreground_write_pd_lock, flags);
 
 	while ((op = c->write_wait_head)) {
-		if (!test_bit(BCH_FS_RO, &c->flags) &&
-		    !test_bit(BCH_FS_STOPPING, &c->flags) &&
-		    time_after(op->expires, jiffies)) {
+		if (time_after(op->expires, jiffies)) {
 			mod_timer(&c->foreground_write_wakeup, op->expires);
 			break;
 		}

@@ -1068,9 +1066,7 @@ static void __bch_read_endio(struct cache_set *c, struct bch_read_bio *rbio)
 		return;
 	}
 
-	if (rbio->promote &&
-	    !test_bit(BCH_FS_RO, &c->flags) &&
-	    !test_bit(BCH_FS_STOPPING, &c->flags)) {
+	if (rbio->promote) {
 		struct cache_promote_op *promote = rbio->promote;
 		struct closure *cl = &promote->cl;
 

@@ -1133,13 +1129,26 @@ static void bch_read_endio(struct bio *bio)
 		preempt_disable();
 		d = this_cpu_ptr(c->bio_decompress_worker);
 		llist_add(&rbio->list, &d->bio_list);
-		queue_work(system_unbound_wq, &d->work);
+		queue_work(system_highpri_wq, &d->work);
 		preempt_enable();
 	} else {
 		__bch_read_endio(c, rbio);
 	}
 }
 
+static bool should_promote(struct cache_set *c,
+			   struct extent_pick_ptr *pick, unsigned flags)
+{
+	if (!(flags & BCH_READ_PROMOTE))
+		return false;
+
+	if (percpu_ref_is_dying(&c->writes))
+		return false;
+
+	return c->fastest_tier &&
+		c->fastest_tier < c->tiers + pick->ca->mi.tier;
+}
+
 void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig,
 			  struct bvec_iter iter, struct bkey_s_c k,
 			  struct extent_pick_ptr *pick, unsigned flags)

@@ -1158,7 +1167,7 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig,
 	 * XXX: multiple promotes can race with each other, wastefully. Keep a
 	 * list of outstanding promotes?
	 */
-	if ((flags & BCH_READ_PROMOTE) && pick->ca->mi.tier) {
+	if (should_promote(c, pick, flags)) {
 		/*
 		 * biovec needs to be big enough to hold decompressed data, if
 		 * the bch_write_extent() has to decompress/recompress it:
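The new should_promote() replaces direct checks of the RO/STOPPING flags with percpu_ref_is_dying(&c->writes), so a promote is attempted only while the filesystem still accepts writes, and only when the extent was read from a tier slower than the fastest one. Note that the tier test works on pointers into the c->tiers array, so c->fastest_tier < c->tiers + tier_idx holds exactly when the device's tier index exceeds the fastest tier's index. A small model of that pointer comparison (mine, not part of the commit):

	#include <assert.h>

	struct tier { int dummy; };

	int main(void)
	{
		struct tier tiers[4];
		struct tier *fastest = &tiers[0];	/* assumed fastest tier */
		unsigned dev_tier = 2;			/* device we read from */

		/* same shape as: c->fastest_tier < c->tiers + pick->ca->mi.tier */
		assert(fastest < tiers + dev_tier);

		dev_tier = 0;				/* already on fastest tier */
		assert(!(fastest < tiers + dev_tier));
		return 0;
	}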
@@ -545,8 +545,7 @@ static int journal_entry_validate(struct cache_set *c,
 		return BCH_FSCK_UNKNOWN_VERSION;
 	}
 
-	if (mustfix_fsck_err_on(bytes > bucket_sectors_left << 9 ||
-				bytes > c->journal.entry_size_max, c,
+	if (mustfix_fsck_err_on(bytes > bucket_sectors_left << 9, c,
 				"journal entry too big (%zu bytes), sector %lluu",
 				bytes, sector)) {
 		/* XXX: note we might have missing journal entries */

@@ -1406,13 +1405,7 @@ void bch_journal_start(struct cache_set *c)
 {
 	struct journal *j = &c->journal;
 	struct journal_seq_blacklist *bl;
-	struct cache *ca;
 	u64 new_seq = 0;
-	unsigned i;
-
-	for_each_cache(ca, c, i)
-		if (is_journal_device(ca))
-			bch_dev_group_add(&c->journal.devs, ca);
 
 	list_for_each_entry(bl, &j->seq_blacklist, list)
 		new_seq = max(new_seq, bl->seq);

@@ -1534,48 +1527,111 @@ err:
 	return ret;
 }
 
-static int bch_set_nr_journal_buckets(struct cache *ca, unsigned nr)
+static int bch_set_nr_journal_buckets(struct cache_set *c, struct cache *ca,
+				      unsigned nr, bool write_super)
 {
+	struct journal *j = &c->journal;
 	struct journal_device *ja = &ca->journal;
-	struct bch_sb_field_journal *journal_buckets =
-		bch_sb_get_journal(ca->disk_sb.sb);
-	struct bch_sb_field *f;
-	u64 *p;
+	struct bch_sb_field_journal *journal_buckets;
+	struct disk_reservation disk_res = { 0, 0 };
+	struct closure cl;
+	u64 *new_bucket_seq = NULL, *new_buckets = NULL;
+	int ret = 0;
 
-	p = krealloc(ja->bucket_seq, nr * sizeof(u64),
-		     GFP_KERNEL|__GFP_ZERO);
-	if (!p)
-		return -ENOMEM;
+	closure_init_stack(&cl);
 
-	ja->bucket_seq = p;
+	mutex_lock(&c->sb_lock);
 
-	p = krealloc(ja->buckets, nr * sizeof(u64),
-		     GFP_KERNEL|__GFP_ZERO);
-	if (!p)
-		return -ENOMEM;
+	/* don't handle reducing nr of buckets yet: */
+	if (nr <= ja->nr)
+		goto err;
 
-	ja->buckets = p;
+	/*
+	 * note: journal buckets aren't really counted as _sectors_ used yet, so
+	 * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
+	 * when space used goes up without a reservation - but we do need the
+	 * reservation to ensure we'll actually be able to allocate:
+	 */
 
-	f = bch_dev_sb_field_resize(&ca->disk_sb, &journal_buckets->field, nr +
-				    sizeof(*journal_buckets) / sizeof(u64));
-	if (!f)
-		return -ENOMEM;
-	f->type = BCH_SB_FIELD_journal;
+	ret = ENOSPC;
+	if (bch_disk_reservation_get(c, &disk_res,
+				     (nr - ja->nr) << ca->bucket_bits, 0))
+		goto err;
 
-	ja->nr = nr;
-	return 0;
+	ret = -ENOMEM;
+	new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL);
+	new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL);
+	if (!new_buckets || !new_bucket_seq)
+		goto err;
+
+	journal_buckets = bch_sb_resize_journal(&ca->disk_sb,
+				nr + sizeof(*journal_buckets) / sizeof(u64));
+	if (!journal_buckets)
+		goto err;
+
+	spin_lock(&j->lock);
+	memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
+	memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
+	swap(new_buckets, ja->buckets);
+	swap(new_bucket_seq, ja->bucket_seq);
+
+	while (ja->nr < nr) {
+		/* must happen under journal lock, to avoid racing with gc: */
+		u64 b = bch_bucket_alloc(ca, RESERVE_NONE);
+		if (!b) {
+			if (!closure_wait(&c->freelist_wait, &cl)) {
+				spin_unlock(&j->lock);
+				closure_sync(&cl);
+				spin_lock(&j->lock);
+			}
+			continue;
+		}
+
+		bch_mark_metadata_bucket(ca, &ca->buckets[b],
+					 BUCKET_JOURNAL, false);
+		bch_mark_alloc_bucket(ca, &ca->buckets[b], false);
+
+		memmove(ja->buckets + ja->last_idx + 1,
+			ja->buckets + ja->last_idx,
+			(ja->nr - ja->last_idx) * sizeof(u64));
+		memmove(ja->bucket_seq + ja->last_idx + 1,
+			ja->bucket_seq + ja->last_idx,
+			(ja->nr - ja->last_idx) * sizeof(u64));
+		memmove(journal_buckets->buckets + ja->last_idx + 1,
+			journal_buckets->buckets + ja->last_idx,
+			(ja->nr - ja->last_idx) * sizeof(u64));
+
+		ja->buckets[ja->last_idx] = b;
+		journal_buckets->buckets[ja->last_idx] = cpu_to_le64(b);
+
+		if (ja->last_idx < ja->nr) {
+			if (ja->cur_idx >= ja->last_idx)
+				ja->cur_idx++;
+			ja->last_idx++;
+		}
+		ja->nr++;
+	}
+	spin_unlock(&j->lock);
+
+	BUG_ON(bch_validate_journal_layout(ca->disk_sb.sb, ca->mi));
+
+	if (write_super)
+		bch_write_super(c);
+
+	ret = 0;
+err:
+	mutex_unlock(&c->sb_lock);
+
+	kfree(new_bucket_seq);
+	kfree(new_buckets);
+	bch_disk_reservation_put(c, &disk_res);
+
+	return ret;
 }
 
 int bch_dev_journal_alloc(struct cache *ca)
 {
-	struct journal_device *ja = &ca->journal;
-	struct bch_sb_field_journal *journal_buckets;
-	int ret;
-	unsigned i;
-
-	if (ca->mi.tier != 0)
-		return 0;
-
 	if (dynamic_fault("bcache:add:journal_alloc"))
 		return -ENOMEM;
 

@@ -1583,26 +1639,12 @@ int bch_dev_journal_alloc(struct cache *ca)
 	 * clamp journal size to 1024 buckets or 512MB (in sectors), whichever
 	 * is smaller:
	 */
-	ret = bch_set_nr_journal_buckets(ca,
+	return bch_set_nr_journal_buckets(ca->set, ca,
 			clamp_t(unsigned, ca->mi.nbuckets >> 8,
 				BCH_JOURNAL_BUCKETS_MIN,
 				min(1 << 10,
-				    (1 << 20) / ca->mi.bucket_size)));
-	if (ret)
-		return ret;
-
-	journal_buckets = bch_sb_get_journal(ca->disk_sb.sb);
-
-	for (i = 0; i < ja->nr; i++) {
-		u64 bucket = ca->mi.first_bucket + i;
-
-		ja->buckets[i] = bucket;
-		journal_buckets->buckets[i] = cpu_to_le64(bucket);
-
-		bch_mark_metadata_bucket(ca, &ca->buckets[bucket], true);
-	}
-
-	return 0;
+				    (1 << 20) / ca->mi.bucket_size)),
+			false);
 }
 
 /* Journalling */
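The rewritten bch_set_nr_journal_buckets() grows the journal online: it takes a disk reservation, reallocates the bucket arrays, then allocates buckets one at a time under the journal lock, inserting each at last_idx by shifting the tail of the ring with memmove() so that the cur_idx/last_idx ordering is preserved. A toy model of that insertion step, with hypothetical sizes (mine, not part of the commit):

	/* Toy model of inserting a new journal bucket at last_idx in a ring. */
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		uint64_t buckets[8] = { 10, 11, 12 };
		unsigned nr = 3, cur_idx = 2, last_idx = 1;
		uint64_t b = 99;	/* newly allocated bucket */

		/* shift [last_idx, nr) right by one, as the memmove()s above do */
		memmove(buckets + last_idx + 1,
			buckets + last_idx,
			(nr - last_idx) * sizeof(uint64_t));
		buckets[last_idx] = b;

		if (last_idx < nr) {
			if (cur_idx >= last_idx)
				cur_idx++;
			last_idx++;
		}
		nr++;

		/* prints: nr=4 cur=3 last=2: 10 99 11 12 */
		printf("nr=%u cur=%u last=%u: %llu %llu %llu %llu\n",
		       nr, cur_idx, last_idx,
		       (unsigned long long) buckets[0],
		       (unsigned long long) buckets[1],
		       (unsigned long long) buckets[2],
		       (unsigned long long) buckets[3]);
		return 0;
	}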
@@ -1726,14 +1768,12 @@ void bch_journal_pin_add_if_older(struct journal *j,
 			  fifo_entry_idx(&j->pin, pin->pin_list))) {
 		if (journal_pin_active(pin))
 			__journal_pin_drop(j, pin);
-		__journal_pin_add(j, src_pin->pin_list,
-				  pin, NULL);
+		__journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
 	}
 
 	spin_unlock_irq(&j->pin_lock);
 }
 
 static struct journal_entry_pin *
 journal_get_next_pin(struct journal *j, u64 seq_to_flush)
 {

@@ -1766,6 +1806,29 @@ journal_get_next_pin(struct journal *j, u64 seq_to_flush)
 	return ret;
 }
 
+static bool journal_has_pins(struct journal *j)
+{
+	bool ret;
+
+	spin_lock(&j->lock);
+	journal_reclaim_fast(j);
+	ret = fifo_used(&j->pin) > 1 ||
+		atomic_read(&fifo_peek_front(&j->pin).count) > 1;
+	spin_unlock(&j->lock);
+
+	return ret;
+}
+
+void bch_journal_flush_pins(struct journal *j)
+{
+	struct journal_entry_pin *pin;
+
+	while ((pin = journal_get_next_pin(j, U64_MAX)))
+		pin->flush(j, pin);
+
+	wait_event(j->wait, !journal_has_pins(j) || bch_journal_error(j));
+}
+
 static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
 {
 	bool ret;
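The new bch_journal_flush_pins() drains the journal by repeatedly pulling the oldest pin and invoking its flush callback; because flush callbacks can themselves re-pin entries, it then wait_event()s until journal_has_pins() reports only the single open entry left. The control flow, reduced to its shape (a sketch, not the real API):

	/* Shape of the drain loop in bch_journal_flush_pins(). */
	struct pin {
		struct pin *next;
		void (*flush)(struct pin *);
	};

	static struct pin *get_next_pin(struct pin **head)
	{
		struct pin *p = *head;

		if (p)
			*head = p->next;
		return p;
	}

	static void flush_all_pins(struct pin **head)
	{
		struct pin *p;

		while ((p = get_next_pin(head)))
			p->flush(p);
		/* the real code then waits until the pin FIFO is empty,
		 * since flush callbacks may add new pins */
	}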
@@ -1895,8 +1958,10 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
 	struct cache_set *c = container_of(j, struct cache_set, journal);
 	struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
 	struct bch_extent_ptr *ptr;
+	struct journal_device *ja;
 	struct cache *ca;
-	unsigned iter, replicas, replicas_want =
+	bool swapped;
+	unsigned i, replicas, replicas_want =
 		READ_ONCE(c->opts.metadata_replicas);
 
 	spin_lock(&j->lock);

@@ -1921,12 +1986,27 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
 
 	replicas = bch_extent_nr_ptrs(e.c);
 
+	spin_lock(&j->devs.lock);
+
+	/* Sort by tier: */
+	do {
+		swapped = false;
+
+		for (i = 0; i + 1 < j->devs.nr; i++)
+			if (j->devs.d[i + 0].dev->mi.tier >
+			    j->devs.d[i + 1].dev->mi.tier) {
+				swap(j->devs.d[i], j->devs.d[i + 1]);
+				swapped = true;
+			}
+	} while (swapped);
+
 	/*
-	 * Determine location of the next journal write:
-	 * XXX: sort caches by free journal space
+	 * Pick devices for next journal write:
+	 * XXX: sort devices by free journal space?
	 */
-	group_for_each_cache_rcu(ca, &j->devs, iter) {
-		struct journal_device *ja = &ca->journal;
+	for (i = 0; i < j->devs.nr; i++) {
+		ca = j->devs.d[i].dev;
+		ja = &ca->journal;
 
 		if (replicas >= replicas_want)
 			break;

@@ -1954,7 +2034,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
 
 		trace_bcache_journal_next_bucket(ca, ja->cur_idx, ja->last_idx);
 	}
+	spin_unlock(&j->devs.lock);
 	rcu_read_unlock();
 
 	j->prev_buf_sectors = 0;
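journal_write_alloc() now sorts the device group by tier with an in-place bubble sort before filling replica slots, so faster (lower-tier) devices are picked first; the array is small, so the O(n^2) pass is cheap. The same pass in isolation (my sketch, with a hypothetical struct dev):

	/* In-place bubble sort by tier, as in journal_write_alloc(). */
	#include <assert.h>
	#include <stdbool.h>

	struct dev { unsigned tier; };

	static void sort_by_tier(struct dev *d, unsigned nr)
	{
		bool swapped;
		unsigned i;

		do {
			swapped = false;
			for (i = 0; i + 1 < nr; i++)
				if (d[i].tier > d[i + 1].tier) {
					struct dev tmp = d[i];

					d[i] = d[i + 1];
					d[i + 1] = tmp;
					swapped = true;
				}
		} while (swapped);
	}

	int main(void)
	{
		struct dev d[] = { {1}, {0}, {2}, {0} };

		sort_by_tier(d, 4);
		assert(d[0].tier == 0 && d[1].tier == 0 &&
		       d[2].tier == 1 && d[3].tier == 2);
		return 0;
	}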
@ -2468,50 +2548,6 @@ int bch_journal_flush(struct journal *j)
|
|||||||
return bch_journal_flush_seq(j, seq);
|
return bch_journal_flush_seq(j, seq);
|
||||||
}
|
}
|
||||||
|
|
||||||
void bch_journal_free(struct journal *j)
|
|
||||||
{
|
|
||||||
unsigned order = get_order(j->entry_size_max);
|
|
||||||
|
|
||||||
free_pages((unsigned long) j->buf[1].data, order);
|
|
||||||
free_pages((unsigned long) j->buf[0].data, order);
|
|
||||||
free_fifo(&j->pin);
|
|
||||||
}
|
|
||||||
|
|
||||||
int bch_journal_alloc(struct journal *j, unsigned entry_size_max)
|
|
||||||
{
|
|
||||||
static struct lock_class_key res_key;
|
|
||||||
unsigned order = get_order(entry_size_max);
|
|
||||||
|
|
||||||
spin_lock_init(&j->lock);
|
|
||||||
spin_lock_init(&j->pin_lock);
|
|
||||||
init_waitqueue_head(&j->wait);
|
|
||||||
INIT_DELAYED_WORK(&j->write_work, journal_write_work);
|
|
||||||
INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work);
|
|
||||||
mutex_init(&j->blacklist_lock);
|
|
||||||
INIT_LIST_HEAD(&j->seq_blacklist);
|
|
||||||
spin_lock_init(&j->devs.lock);
|
|
||||||
mutex_init(&j->reclaim_lock);
|
|
||||||
|
|
||||||
lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
|
|
||||||
|
|
||||||
j->entry_size_max = entry_size_max;
|
|
||||||
j->write_delay_ms = 100;
|
|
||||||
j->reclaim_delay_ms = 100;
|
|
||||||
|
|
||||||
bkey_extent_init(&j->key);
|
|
||||||
|
|
||||||
atomic64_set(&j->reservations.counter,
|
|
||||||
((union journal_res_state)
|
|
||||||
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
|
|
||||||
|
|
||||||
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
|
|
||||||
!(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) ||
|
|
||||||
!(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order)))
|
|
||||||
return -ENOMEM;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
ssize_t bch_journal_print_debug(struct journal *j, char *buf)
|
ssize_t bch_journal_print_debug(struct journal *j, char *buf)
|
||||||
{
|
{
|
||||||
union journal_res_state *s = &j->reservations;
|
union journal_res_state *s = &j->reservations;
|
||||||
@@ -2643,13 +2679,31 @@ int bch_journal_move(struct cache *ca)
 	return ret;
 }
 
-void bch_journal_free_cache(struct cache *ca)
+void bch_fs_journal_stop(struct journal *j)
+{
+	if (!test_bit(JOURNAL_STARTED, &j->flags))
+		return;
+
+	/*
+	 * Empty out the journal by first flushing everything pinning existing
+	 * journal entries, then force a brand new empty journal entry to be
+	 * written:
+	 */
+	bch_journal_flush_pins(j);
+	bch_journal_flush_async(j, NULL);
+	bch_journal_meta(j);
+
+	cancel_delayed_work_sync(&j->write_work);
+	cancel_delayed_work_sync(&j->reclaim_work);
+}
+
+void bch_dev_journal_exit(struct cache *ca)
 {
 	kfree(ca->journal.buckets);
 	kfree(ca->journal.bucket_seq);
 }
 
-int bch_journal_init_cache(struct cache *ca)
+int bch_dev_journal_init(struct cache *ca)
 {
 	struct journal_device *ja = &ca->journal;
 	struct bch_sb_field_journal *journal_buckets =
@@ -2679,3 +2733,47 @@ int bch_journal_init_cache(struct cache *ca)
 
 	return 0;
 }
+
+void bch_fs_journal_exit(struct journal *j)
+{
+	unsigned order = get_order(j->entry_size_max);
+
+	free_pages((unsigned long) j->buf[1].data, order);
+	free_pages((unsigned long) j->buf[0].data, order);
+	free_fifo(&j->pin);
+}
+
+int bch_fs_journal_init(struct journal *j, unsigned entry_size_max)
+{
+	static struct lock_class_key res_key;
+	unsigned order = get_order(entry_size_max);
+
+	spin_lock_init(&j->lock);
+	spin_lock_init(&j->pin_lock);
+	init_waitqueue_head(&j->wait);
+	INIT_DELAYED_WORK(&j->write_work, journal_write_work);
+	INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work);
+	mutex_init(&j->blacklist_lock);
+	INIT_LIST_HEAD(&j->seq_blacklist);
+	spin_lock_init(&j->devs.lock);
+	mutex_init(&j->reclaim_lock);
+
+	lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
+
+	j->entry_size_max	= entry_size_max;
+	j->write_delay_ms	= 100;
+	j->reclaim_delay_ms	= 100;
+
+	bkey_extent_init(&j->key);
+
+	atomic64_set(&j->reservations.counter,
+		     ((union journal_res_state)
+		      { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
+
+	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+	    !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) ||
+	    !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order)))
+		return -ENOMEM;
+
+	return 0;
+}
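Note: this hunk completes the journal rename: allocation/teardown become bch_fs_journal_init()/bch_fs_journal_exit(), and shutdown gets an explicit bch_fs_journal_stop(). A minimal sketch of the intended pairing (function names come from this diff; the calling context is assumed):

    static int example_journal_lifecycle(struct journal *j, unsigned entry_size_max)
    {
    	int ret = bch_fs_journal_init(j, entry_size_max); /* locks, pin fifo, buffers */
    	if (ret)
    		return ret;

    	/* ... filesystem runs ... */

    	bch_fs_journal_stop(j);	/* flush pins, write a final empty entry */
    	bch_fs_journal_exit(j);	/* free the buffers and the pin fifo */
    	return 0;
    }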
libbcache/journal.h

@@ -111,7 +111,6 @@
 #include <linux/hash.h>
 
 #include "journal_types.h"
-//#include "super-io.h"
 
 /*
  * Only used for holding the journal entries we read in btree_journal_read()
@@ -136,6 +135,7 @@ void bch_journal_pin_add_if_older(struct journal *,
 				  struct journal_entry_pin *,
 				  struct journal_entry_pin *,
 				  journal_pin_flush_fn);
+void bch_journal_flush_pins(struct journal *);
 
 struct closure;
 struct cache_set;
@@ -330,11 +330,6 @@ static inline int bch_journal_error(struct journal *j)
 		? -EIO : 0;
 }
 
-static inline bool is_journal_device(struct cache *ca)
-{
-	return ca->mi.state == BCH_MEMBER_STATE_ACTIVE && ca->mi.tier == 0;
-}
-
 static inline bool journal_flushes_device(struct cache *ca)
 {
 	return true;
@@ -356,9 +351,6 @@ static inline void bch_journal_set_replay_done(struct journal *j)
 	spin_unlock(&j->lock);
 }
 
-void bch_journal_free(struct journal *);
-int bch_journal_alloc(struct journal *, unsigned);
-
 ssize_t bch_journal_print_debug(struct journal *, char *);
 
 int bch_dev_journal_alloc(struct cache *);
@@ -372,7 +364,10 @@ static inline unsigned bch_nr_journal_buckets(struct bch_sb_field_journal *j)
 
 int bch_journal_move(struct cache *);
 
-void bch_journal_free_cache(struct cache *);
-int bch_journal_init_cache(struct cache *);
+void bch_fs_journal_stop(struct journal *);
+void bch_dev_journal_exit(struct cache *);
+int bch_dev_journal_init(struct cache *);
+void bch_fs_journal_exit(struct journal *);
+int bch_fs_journal_init(struct journal *, unsigned);
 
 #endif /* _BCACHE_JOURNAL_H */
libbcache/movinggc.c

@@ -191,7 +191,7 @@ static void bch_moving_gc(struct cache *ca)
 		}
 
 		if (g->mark.owned_by_allocator ||
-		    g->mark.is_metadata)
+		    g->mark.data_type != BUCKET_DATA)
 			continue;
 
 		sectors_used = bucket_sectors_used(g);
@@ -258,18 +258,21 @@ static int bch_moving_gc_thread(void *arg)
 	return 0;
 }
 
-void bch_moving_init_cache(struct cache *ca)
+void bch_moving_gc_stop(struct cache *ca)
 {
-	bch_pd_controller_init(&ca->moving_gc_pd);
-	ca->moving_gc_pd.d_term = 0;
+	ca->moving_gc_pd.rate.rate = UINT_MAX;
+	bch_ratelimit_reset(&ca->moving_gc_pd.rate);
+
+	if (ca->moving_gc_read)
+		kthread_stop(ca->moving_gc_read);
+	ca->moving_gc_read = NULL;
 }
 
-int bch_moving_gc_thread_start(struct cache *ca)
+int bch_moving_gc_start(struct cache *ca)
 {
 	struct task_struct *t;
 
-	/* The moving gc read thread must be stopped */
-	BUG_ON(ca->moving_gc_read != NULL);
+	BUG_ON(ca->moving_gc_read);
 
 	if (ca->set->opts.nochanges)
 		return 0;
@@ -287,12 +290,8 @@ int bch_moving_gc_thread_start(struct cache *ca)
 	return 0;
 }
 
-void bch_moving_gc_stop(struct cache *ca)
+void bch_dev_moving_gc_init(struct cache *ca)
 {
-	ca->moving_gc_pd.rate.rate = UINT_MAX;
-	bch_ratelimit_reset(&ca->moving_gc_pd.rate);
-
-	if (ca->moving_gc_read)
-		kthread_stop(ca->moving_gc_read);
-	ca->moving_gc_read = NULL;
+	bch_pd_controller_init(&ca->moving_gc_pd);
+	ca->moving_gc_pd.d_term = 0;
 }
libbcache/movinggc.h

@@ -23,8 +23,8 @@
 #define COPYGC_SECTORS_PER_ITER(ca)					\
 	((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca))
 
-void bch_moving_init_cache(struct cache *);
 void bch_moving_gc_stop(struct cache *);
-int bch_moving_gc_thread_start(struct cache *);
+int bch_moving_gc_start(struct cache *);
+void bch_dev_moving_gc_init(struct cache *);
 
 #endif
libbcache/opts.h

@@ -86,11 +86,17 @@ enum opt_type {
 	BCH_OPT(noreplay,		0444,	NO_SB_OPT,		\
 		s8,  OPT_BOOL())					\
 	BCH_OPT(norecovery,		0444,	NO_SB_OPT,		\
-		s8,  OPT_BOOL())
+		s8,  OPT_BOOL())					\
+	BCH_OPT(noexcl,			0444,	NO_SB_OPT,		\
+		s8,  OPT_BOOL())					\
+	BCH_OPT(sb,			0444,	NO_SB_OPT,		\
+		s64, OPT_UINT(0, S64_MAX))				\
 
 #define BCH_OPTS()							\
 	BCH_OPT(read_only,		0444,	NO_SB_OPT,		\
 		s8,  OPT_BOOL())					\
+	BCH_OPT(nostart,		0444,	NO_SB_OPT,		\
+		s8,  OPT_BOOL())					\
 	BCH_VISIBLE_OPTS()
 
 struct bch_opts {
@@ -145,6 +151,8 @@ static inline void bch_opts_apply(struct bch_opts *dst, struct bch_opts src)
 #undef BCH_OPT
 }
 
+#define opt_defined(_opt)	((_opt) >= 0)
+
 void bch_opt_set(struct bch_opts *, enum bch_opt_id, u64);
 struct bch_opts bch_sb_opts(struct bch_sb *);
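Note: the option fields are signed so that a negative value (presumably -1, per the fields' initialization elsewhere in the tree) can mean "unset"; that is why the new opt_defined() tests >= 0 and why an explicit 0 still counts as defined. A standalone sketch — the struct and default value are assumptions, only opt_defined() comes from this diff:

    #define opt_defined(_opt)	((_opt) >= 0)

    struct opts_sketch { signed char noexcl; long long sb; };

    static long long sb_offset(struct opts_sketch o, long long dflt)
    {
    	/* mirrors bch_read_super(): use opts.sb when given, else the default */
    	return opt_defined(o.sb) ? o.sb : dflt;
    }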
libbcache/super-io.c

@@ -10,6 +10,7 @@
 #include "vstructs.h"
 
 #include <linux/backing-dev.h>
+#include <linux/sort.h>
 
 static inline void __bch_sb_layout_size_assert(void)
 {
@@ -17,7 +18,7 @@ static inline void __bch_sb_layout_size_assert(void)
 }
 
 struct bch_sb_field *bch_sb_field_get(struct bch_sb *sb,
-				      enum bch_sb_field_types type)
+				      enum bch_sb_field_type type)
 {
 	struct bch_sb_field *f;
 
@@ -34,7 +35,7 @@ void bch_free_super(struct bcache_superblock *sb)
 	if (sb->bio)
 		bio_put(sb->bio);
 	if (!IS_ERR_OR_NULL(sb->bdev))
-		blkdev_put(sb->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+		blkdev_put(sb->bdev, sb->mode);
 
 	free_pages((unsigned long) sb->sb, sb->page_order);
 	memset(sb, 0, sizeof(*sb));
@@ -74,7 +75,7 @@ static int __bch_super_realloc(struct bcache_superblock *sb, unsigned order)
 	return 0;
 }
 
-int bch_dev_sb_realloc(struct bcache_superblock *sb, unsigned u64s)
+static int bch_sb_realloc(struct bcache_superblock *sb, unsigned u64s)
 {
 	u64 new_bytes = __vstruct_bytes(struct bch_sb, u64s);
 	u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
@@ -140,13 +141,29 @@ static struct bch_sb_field *__bch_sb_field_resize(struct bch_sb *sb,
 	le32_add_cpu(&sb->u64s, u64s - old_u64s);
 
 	return f;
+}
+
+struct bch_sb_field *bch_sb_field_resize(struct bcache_superblock *sb,
+					 enum bch_sb_field_type type,
+					 unsigned u64s)
+{
+	struct bch_sb_field *f = bch_sb_field_get(sb->sb, type);
+	ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+	ssize_t d = -old_u64s + u64s;
+
+	if (bch_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
+		return NULL;
+
+	f = __bch_sb_field_resize(sb->sb, f, u64s);
+	f->type = type;
+	return f;
 }
 
 struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *c,
-					    struct bch_sb_field *f,
+					    enum bch_sb_field_type type,
 					    unsigned u64s)
 {
+	struct bch_sb_field *f = bch_sb_field_get(c->disk_sb, type);
 	ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
 	ssize_t d = -old_u64s + u64s;
 	struct cache *ca;
@@ -160,26 +177,15 @@ struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *c,
 	for_each_cache(ca, c, i) {
 		struct bcache_superblock *sb = &ca->disk_sb;
 
-		if (bch_dev_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
+		if (bch_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
 			percpu_ref_put(&ca->ref);
 			return NULL;
 		}
 	}
 
-	return __bch_sb_field_resize(c->disk_sb, f, u64s);
-}
-
-struct bch_sb_field *bch_dev_sb_field_resize(struct bcache_superblock *sb,
-					     struct bch_sb_field *f,
-					     unsigned u64s)
-{
-	ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
-	ssize_t d = -old_u64s + u64s;
-
-	if (bch_dev_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
-		return NULL;
-
-	return __bch_sb_field_resize(sb->sb, f, u64s);
+	f = __bch_sb_field_resize(c->disk_sb, f, u64s);
+	f->type = type;
+	return f;
 }
 
 static const char *validate_sb_layout(struct bch_sb_layout *layout)
@@ -203,9 +209,6 @@ static const char *validate_sb_layout(struct bch_sb_layout *layout)
 
 	prev_offset = le64_to_cpu(layout->sb_offset[0]);
 
-	if (prev_offset != BCH_SB_SECTOR)
-		return "Invalid superblock layout: doesn't have default superblock location";
-
 	for (i = 1; i < layout->nr_superblocks; i++) {
 		offset = le64_to_cpu(layout->sb_offset[i]);
 
@@ -217,16 +220,70 @@ static const char *validate_sb_layout(struct bch_sb_layout *layout)
 	return NULL;
 }
 
+static int u64_cmp(const void *_l, const void *_r)
+{
+	u64 l = *((const u64 *) _l), r = *((const u64 *) _r);
+
+	return l < r ? -1 : l > r ? 1 : 0;
+}
+
+const char *bch_validate_journal_layout(struct bch_sb *sb,
+					struct cache_member_cpu mi)
+{
+	struct bch_sb_field_journal *journal;
+	const char *err;
+	unsigned nr;
+	unsigned i;
+	u64 *b;
+
+	journal = bch_sb_get_journal(sb);
+	if (!journal)
+		return NULL;
+
+	nr = bch_nr_journal_buckets(journal);
+	if (!nr)
+		return NULL;
+
+	b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL);
+	if (!b)
+		return "cannot allocate memory";
+
+	for (i = 0; i < nr; i++)
+		b[i] = le64_to_cpu(journal->buckets[i]);
+
+	sort(b, nr, sizeof(u64), u64_cmp, NULL);
+
+	err = "journal bucket at sector 0";
+	if (!b[0])
+		goto err;
+
+	err = "journal bucket before first bucket";
+	if (b[0] < mi.first_bucket)
+		goto err;
+
+	err = "journal bucket past end of device";
+	if (b[nr - 1] >= mi.nbuckets)
+		goto err;
+
+	err = "duplicate journal buckets";
+	for (i = 0; i + 1 < nr; i++)
+		if (b[i] == b[i + 1])
+			goto err;
+
+	err = NULL;
+err:
+	kfree(b);
+	return err;
+}
+
 const char *bch_validate_cache_super(struct bcache_superblock *disk_sb)
 {
 	struct bch_sb *sb = disk_sb->sb;
 	struct bch_sb_field *f;
 	struct bch_sb_field_members *sb_mi;
-	struct bch_sb_field_journal *journal;
 	struct cache_member_cpu mi;
 	const char *err;
 	u16 block_size;
-	unsigned i;
 
 	switch (le64_to_cpu(sb->version)) {
 	case BCACHE_SB_VERSION_CDEV_V4:
@@ -324,14 +381,6 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb)
 
 	mi = cache_mi_to_cpu_mi(sb_mi->members + sb->dev_idx);
 
-	for (i = 0; i < sb->layout.nr_superblocks; i++) {
-		u64 offset = le64_to_cpu(sb->layout.sb_offset[i]);
-		u64 max_size = 1 << sb->layout.sb_max_size_bits;
-
-		if (offset + max_size > mi.first_bucket * mi.bucket_size)
-			return "Invalid superblock: first bucket comes before end of super";
-	}
-
 	if (mi.nbuckets > LONG_MAX)
 		return "Too many buckets";
 
@@ -347,16 +396,9 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb)
 	    mi.bucket_size * mi.nbuckets)
 		return "Invalid superblock: device too small";
 
-	/* Validate journal buckets: */
-	journal = bch_sb_get_journal(sb);
-	if (journal) {
-		for (i = 0; i < bch_nr_journal_buckets(journal); i++) {
-			u64 b = le64_to_cpu(journal->buckets[i]);
-
-			if (b < mi.first_bucket || b >= mi.nbuckets)
-				return "bad journal bucket";
-		}
-	}
+	err = bch_validate_journal_layout(sb, mi);
+	if (err)
+		return err;
 
 	return NULL;
 }
@@ -382,19 +424,19 @@ static bool bch_is_open_cache(struct block_device *bdev)
 
 static bool bch_is_open(struct block_device *bdev)
 {
-	lockdep_assert_held(&bch_register_lock);
+	bool ret;
 
-	return bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev);
+	mutex_lock(&bch_register_lock);
+	ret = bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev);
+	mutex_unlock(&bch_register_lock);
+
+	return ret;
 }
 
-static const char *bch_blkdev_open(const char *path, void *holder,
-				   struct bch_opts opts,
-				   struct block_device **ret)
+static const char *bch_blkdev_open(const char *path, fmode_t mode,
+				   void *holder, struct block_device **ret)
 {
 	struct block_device *bdev;
-	fmode_t mode = opts.nochanges > 0
-		? FMODE_READ
-		: FMODE_READ|FMODE_WRITE|FMODE_EXCL;
 	const char *err;
 
 	*ret = NULL;
@@ -548,7 +590,7 @@ int bch_sb_from_cache_set(struct cache_set *c, struct cache *ca)
 	unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s;
 	int ret;
 
-	ret = bch_dev_sb_realloc(&ca->disk_sb, u64s);
+	ret = bch_sb_realloc(&ca->disk_sb, u64s);
 	if (ret)
 		return ret;
 
@@ -567,7 +609,7 @@ static const char *read_one_super(struct bcache_superblock *sb, u64 offset)
 reread:
 	bio_reset(sb->bio);
 	sb->bio->bi_bdev = sb->bdev;
-	sb->bio->bi_iter.bi_sector = BCH_SB_SECTOR;
+	sb->bio->bi_iter.bi_sector = offset;
 	sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order;
 	bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
 	bch_bio_map(sb->bio, sb->sb);
@@ -610,15 +652,21 @@ const char *bch_read_super(struct bcache_superblock *sb,
 			   struct bch_opts opts,
 			   const char *path)
 {
+	u64 offset = opt_defined(opts.sb) ? opts.sb : BCH_SB_SECTOR;
 	struct bch_sb_layout layout;
 	const char *err;
 	unsigned i;
 
-	lockdep_assert_held(&bch_register_lock);
-
 	memset(sb, 0, sizeof(*sb));
+	sb->mode = FMODE_READ;
+
+	if (!(opt_defined(opts.noexcl) && opts.noexcl))
+		sb->mode |= FMODE_EXCL;
+
+	if (!(opt_defined(opts.nochanges) && opts.nochanges))
+		sb->mode |= FMODE_WRITE;
 
-	err = bch_blkdev_open(path, &sb, opts, &sb->bdev);
+	err = bch_blkdev_open(path, sb->mode, sb, &sb->bdev);
 	if (err)
 		return err;
 
@@ -630,11 +678,16 @@ const char *bch_read_super(struct bcache_superblock *sb,
 	if (bch_fs_init_fault("read_super"))
 		goto err;
 
-	err = read_one_super(sb, BCH_SB_SECTOR);
+	err = read_one_super(sb, offset);
 	if (!err)
 		goto got_super;
 
-	pr_err("error reading default super: %s", err);
+	if (offset != BCH_SB_SECTOR) {
+		pr_err("error reading superblock: %s", err);
+		goto err;
+	}
+
+	pr_err("error reading default superblock: %s", err);
 
 	/*
 	 * Error reading primary superblock - read location of backup
@@ -747,6 +800,9 @@ void bch_write_super(struct cache_set *c)
 
 	lockdep_assert_held(&c->sb_lock);
 
+	if (c->opts.nochanges)
+		return;
+
 	closure_init_stack(cl);
 
 	le64_add_cpu(&c->disk_sb->seq, 1);
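Note: the open mode is now derived from the options once and remembered in sb->mode, so bch_free_super() can hand the same mode back to blkdev_put(). A sketch of the derivation, with bool parameters standing in for the opt_defined() checks above:

    static fmode_t read_super_mode(bool noexcl, bool nochanges)
    {
    	fmode_t mode = FMODE_READ;

    	if (!noexcl)
    		mode |= FMODE_EXCL;	/* exclusive open by default */
    	if (!nochanges)
    		mode |= FMODE_WRITE;	/* read-only under nochanges */
    	return mode;
    }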
libbcache/super-io.h

@@ -6,16 +6,35 @@
 
 #include <asm/byteorder.h>
 
-struct bch_sb_field *bch_sb_field_get(struct bch_sb *, enum bch_sb_field_types);
+struct bch_sb_field *bch_sb_field_get(struct bch_sb *, enum bch_sb_field_type);
+struct bch_sb_field *bch_sb_field_resize(struct bcache_superblock *,
+					 enum bch_sb_field_type, unsigned);
+struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *,
+					    enum bch_sb_field_type, unsigned);
 
-#define BCH_SB_FIELD_TYPE(_name)					\
-static inline struct bch_sb_field_##_name *				\
-bch_sb_get_##_name(struct bch_sb *sb)					\
-{									\
-	struct bch_sb_field *f =					\
-		bch_sb_field_get(sb, BCH_SB_FIELD_##_name);		\
-									\
-	return container_of_or_null(f, struct bch_sb_field_##_name, field);\
+#define field_to_type(_f, _name)					\
+	container_of_or_null(_f, struct bch_sb_field_##_name, field)
+
+#define BCH_SB_FIELD_TYPE(_name)					\
+static inline struct bch_sb_field_##_name *				\
+bch_sb_get_##_name(struct bch_sb *sb)					\
+{									\
+	return field_to_type(bch_sb_field_get(sb,			\
+				BCH_SB_FIELD_##_name), _name);		\
+}									\
+									\
+static inline struct bch_sb_field_##_name *				\
+bch_sb_resize_##_name(struct bcache_superblock *sb, unsigned u64s)	\
+{									\
+	return field_to_type(bch_sb_field_resize(sb,			\
+				BCH_SB_FIELD_##_name, u64s), _name);	\
+}									\
+									\
+static inline struct bch_sb_field_##_name *				\
+bch_fs_sb_resize_##_name(struct cache_set *c, unsigned u64s)		\
+{									\
+	return field_to_type(bch_fs_sb_field_resize(c,			\
+				BCH_SB_FIELD_##_name, u64s), _name);	\
 }
 
 BCH_SB_FIELD_TYPE(journal);
@@ -85,14 +104,11 @@ int bch_fs_mi_update(struct cache_set *, struct bch_member *, unsigned);
 int bch_sb_to_cache_set(struct cache_set *, struct bch_sb *);
 int bch_sb_from_cache_set(struct cache_set *, struct cache *);
 
-struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *,
-					    struct bch_sb_field *, unsigned);
-struct bch_sb_field *bch_dev_sb_field_resize(struct bcache_superblock *,
-					     struct bch_sb_field *, unsigned);
-
 void bch_free_super(struct bcache_superblock *);
 int bch_super_realloc(struct bcache_superblock *, unsigned);
 
+const char *bch_validate_journal_layout(struct bch_sb *,
+					struct cache_member_cpu);
 const char *bch_validate_cache_super(struct bcache_superblock *);
 
 const char *bch_read_super(struct bcache_superblock *,
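Note: BCH_SB_FIELD_TYPE(journal) now emits typed getter and resize helpers instead of a single getter. Written out by hand, the expansion is roughly (a sketch; the field_to_type() indirection and whitespace simplified):

    static inline struct bch_sb_field_journal *
    bch_sb_get_journal(struct bch_sb *sb)
    {
    	return field_to_type(bch_sb_field_get(sb, BCH_SB_FIELD_journal),
    			     journal);
    }

    static inline struct bch_sb_field_journal *
    bch_sb_resize_journal(struct bcache_superblock *sb, unsigned u64s)
    {
    	return field_to_type(bch_sb_field_resize(sb, BCH_SB_FIELD_journal,
    						 u64s), journal);
    }

Callers pass a field type and size rather than a raw struct bch_sb_field pointer, and the f->type tagging now happens in one place.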
libbcache/super.c (diff suppressed because it is too large)
libbcache/super.h

@@ -57,27 +57,11 @@ static inline struct cache *bch_get_next_cache(struct cache_set *c,
 static inline bool bch_dev_may_remove(struct cache *ca)
 {
 	struct cache_set *c = ca->set;
-	struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
+	struct cache_group *grp = &c->cache_all;
 
-	/*
-	 * Right now, we can't remove the last device from a tier,
-	 * - For tier 0, because all metadata lives in tier 0 and because
-	 *   there is no way to have foreground writes go directly to tier 1.
-	 * - For tier 1, because the code doesn't completely support an
-	 *   empty tier 1.
-	 */
-
-	/*
-	 * Turning a device read-only removes it from the cache group,
-	 * so there may only be one read-write device in a tier, and yet
-	 * the device we are removing is in the same tier, so we have
-	 * to check for identity.
-	 * Removing the last RW device from a tier requires turning the
-	 * whole cache set RO.
-	 */
-
-	return tier->nr_devices != 1 ||
-		rcu_access_pointer(tier->d[0].dev) != ca;
+	/* Can't remove the last RW device: */
+	return grp->nr != 1 ||
+		rcu_access_pointer(grp->d[0].dev) != ca;
 }
 
 void bch_dev_release(struct kobject *);
@@ -89,15 +73,15 @@ int bch_dev_add(struct cache_set *, const char *);
 
 void bch_fs_detach(struct cache_set *);
 
-bool bch_fs_read_only(struct cache_set *);
 bool bch_fs_emergency_read_only(struct cache_set *);
-void bch_fs_read_only_sync(struct cache_set *);
+void bch_fs_read_only(struct cache_set *);
 const char *bch_fs_read_write(struct cache_set *);
 
 void bch_fs_release(struct kobject *);
+void bch_fs_stop_async(struct cache_set *);
 void bch_fs_stop(struct cache_set *);
-void bch_fs_stop_sync(struct cache_set *);
 
+const char *bch_fs_start(struct cache_set *);
 const char *bch_fs_open(char * const *, unsigned, struct bch_opts,
 			struct cache_set **);
 const char *bch_fs_open_incremental(const char *path);
libbcache/super_types.h

@@ -6,6 +6,7 @@ struct bcache_superblock {
 	struct block_device	*bdev;
 	struct bio		*bio;
 	unsigned		page_order;
+	fmode_t			mode;
 };
 
 #endif /* _BCACHE_SUPER_TYPES_H */
libbcache/sysfs.c

@@ -22,6 +22,7 @@
 #include "opts.h"
 #include "request.h"
 #include "super-io.h"
+#include "tier.h"
 #include "writeback.h"
 
 #include <linux/blkdev.h>
@@ -121,6 +122,8 @@ rw_attribute(cache_replacement_policy);
 rw_attribute(foreground_write_ratelimit_enabled);
 rw_attribute(copy_gc_enabled);
 sysfs_pd_controller_attribute(copy_gc);
 
+rw_attribute(tier);
 rw_attribute(tiering_enabled);
 rw_attribute(tiering_percent);
 sysfs_pd_controller_attribute(tiering);
@@ -134,7 +137,6 @@ rw_attribute(foreground_target_percent);
 rw_attribute(size);
 read_attribute(meta_replicas_have);
 read_attribute(data_replicas_have);
-read_attribute(tier);
 
 #define BCH_DEBUG_PARAM(name, description)				\
 	rw_attribute(name);
@@ -680,7 +682,8 @@ SHOW(bch_fs)
 
 	sysfs_printf(tiering_enabled,	"%i", c->tiering_enabled);
 	sysfs_print(tiering_percent,	c->tiering_percent);
-	sysfs_pd_controller_show(tiering, &c->tiering_pd);
+
+	sysfs_pd_controller_show(tiering, &c->tiers[1].pd); /* XXX */
 
 	sysfs_printf(meta_replicas_have, "%u",	c->sb.meta_replicas_have);
 	sysfs_printf(data_replicas_have, "%u",	c->sb.data_replicas_have);
@@ -694,7 +697,7 @@ SHOW(bch_fs)
 	BCH_DEBUG_PARAMS()
 #undef BCH_DEBUG_PARAM
 
-	if (!test_bit(BCH_FS_RUNNING, &c->flags))
+	if (!bch_fs_running(c))
 		return -EPERM;
 
 	if (attr == &sysfs_bset_tree_stats)
@@ -723,7 +726,7 @@ STORE(__bch_fs)
 	}
 
 	if (attr == &sysfs_stop) {
-		bch_fs_stop(c);
+		bch_fs_stop_async(c);
 		return size;
 	}
 
@@ -773,25 +776,18 @@ STORE(__bch_fs)
 		ssize_t ret = strtoul_safe(buf, c->tiering_enabled)
 			?: (ssize_t) size;
 
-		if (c->tiering_read)
-			wake_up_process(c->tiering_read);
+		bch_tiering_start(c); /* issue wakeups */
 		return ret;
 	}
 
 	sysfs_pd_controller_store(foreground_write, &c->foreground_write_pd);
 
-	if (attr == &sysfs_journal_flush) {
-		bch_journal_meta_async(&c->journal, NULL);
-
-		return size;
-	}
-
 	sysfs_strtoul(pd_controllers_update_seconds,
 		      c->pd_controllers_update_seconds);
 	sysfs_strtoul(foreground_target_percent, c->foreground_target_percent);
 
 	sysfs_strtoul(tiering_percent,		c->tiering_percent);
-	sysfs_pd_controller_store(tiering,	&c->tiering_pd);
+	sysfs_pd_controller_store(tiering,	&c->tiers[1].pd); /* XXX */
 
 	/* Debugging: */
 
@@ -799,11 +795,14 @@ STORE(__bch_fs)
 	BCH_DEBUG_PARAMS()
 #undef BCH_DEBUG_PARAM
 
-	if (!test_bit(BCH_FS_RUNNING, &c->flags))
+	if (!bch_fs_running(c))
 		return -EPERM;
 
-	if (test_bit(BCH_FS_STOPPING, &c->flags))
-		return -EINTR;
+	if (attr == &sysfs_journal_flush) {
+		bch_journal_meta_async(&c->journal, NULL);
+
+		return size;
+	}
 
 	if (attr == &sysfs_blockdev_volume_create) {
 		u64 v = strtoi_h_or_return(buf);
@@ -836,9 +835,9 @@ STORE(bch_fs)
 {
 	struct cache_set *c = container_of(kobj, struct cache_set, kobj);
 
-	mutex_lock(&bch_register_lock);
+	mutex_lock(&c->state_lock);
 	size = __bch_fs_store(kobj, attr, buf, size);
-	mutex_unlock(&bch_register_lock);
+	mutex_unlock(&c->state_lock);
 
 	if (attr == &sysfs_add_device) {
 		char *path = kstrdup(buf, GFP_KERNEL);
@@ -1273,6 +1272,31 @@ STORE(__bch_dev)
 		mutex_unlock(&c->sb_lock);
 	}
 
+	if (attr == &sysfs_tier) {
+		unsigned prev_tier;
+		unsigned v = strtoul_restrict_or_return(buf,
+					0, BCH_TIER_MAX - 1);
+
+		mutex_lock(&c->sb_lock);
+		prev_tier = ca->mi.tier;
+
+		if (v == ca->mi.tier) {
+			mutex_unlock(&c->sb_lock);
+			return size;
+		}
+
+		mi = &bch_sb_get_members(c->disk_sb)->members[ca->dev_idx];
+		SET_BCH_MEMBER_TIER(mi, v);
+		bch_write_super(c);
+
+		bch_dev_group_remove(&c->tiers[prev_tier].devs, ca);
+		bch_dev_group_add(&c->tiers[ca->mi.tier].devs, ca);
+		mutex_unlock(&c->sb_lock);
+
+		bch_recalc_capacity(c);
+		bch_tiering_start(c);
+	}
+
 	if (attr == &sysfs_state_rw) {
 		char name[BDEVNAME_SIZE];
 		const char *err = NULL;
libbcache/tier.c (101 lines changed)
@@ -16,8 +16,7 @@
 #include <trace/events/bcache.h>
 
 struct tiering_state {
-	struct cache_group	*tier;
-	unsigned		tier_idx;
+	struct bch_tier		*tier;
 	unsigned		sectors;
 	unsigned		stripe_size;
 	unsigned		dev_idx;
@@ -42,7 +41,7 @@ static bool tiering_pred(struct cache_set *c,
 	mi = cache_member_info_get(c);
 	extent_for_each_ptr(e, ptr)
 		if (ptr->dev < mi->nr_devices &&
-		    mi->m[ptr->dev].tier >= s->tier_idx)
+		    mi->m[ptr->dev].tier >= s->tier->idx)
 			replicas++;
 	cache_member_info_put();
 
@@ -69,15 +68,15 @@ static void tier_next_device(struct cache_set *c, struct tiering_state *s)
 		s->sectors = 0;
 		s->dev_idx++;
 
-		spin_lock(&s->tier->lock);
-		if (s->dev_idx >= s->tier->nr_devices)
+		spin_lock(&s->tier->devs.lock);
+		if (s->dev_idx >= s->tier->devs.nr)
 			s->dev_idx = 0;
 
-		if (s->tier->nr_devices) {
-			s->ca = s->tier->d[s->dev_idx].dev;
+		if (s->tier->devs.nr) {
+			s->ca = s->tier->devs.d[s->dev_idx].dev;
 			percpu_ref_get(&s->ca->ref);
 		}
-		spin_unlock(&s->tier->lock);
+		spin_unlock(&s->tier->devs.lock);
 	}
 }
 
@@ -103,13 +102,13 @@ static int issue_tiering_move(struct cache_set *c,
  * tiering_next_cache - issue a move to write an extent to the next cache
  * device in round robin order
  */
-static s64 read_tiering(struct cache_set *c, struct cache_group *tier)
+static s64 read_tiering(struct cache_set *c, struct bch_tier *tier)
 {
 	struct moving_context ctxt;
 	struct tiering_state s;
 	struct btree_iter iter;
 	struct bkey_s_c k;
-	unsigned nr_devices = READ_ONCE(tier->nr_devices);
+	unsigned nr_devices = READ_ONCE(tier->devs.nr);
 	int ret;
 
 	if (!nr_devices)
@@ -119,10 +118,9 @@ static s64 read_tiering(struct cache_set *c, struct cache_group *tier)
 
 	memset(&s, 0, sizeof(s));
 	s.tier		= tier;
-	s.tier_idx	= tier - c->cache_tiers;
 	s.stripe_size	= 2048; /* 1 mb for now */
 
-	bch_move_ctxt_init(&ctxt, &c->tiering_pd.rate,
+	bch_move_ctxt_init(&ctxt, &tier->pd.rate,
 			   nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE);
 	bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
 
@@ -164,8 +162,8 @@ next:
 
 static int bch_tiering_thread(void *arg)
 {
-	struct cache_set *c = arg;
-	struct cache_group *tier = &c->cache_tiers[1];
+	struct bch_tier *tier = arg;
+	struct cache_set *c = container_of(tier, struct cache_set, tiers[tier->idx]);
 	struct io_clock *clock = &c->io_clock[WRITE];
 	struct cache *ca;
 	u64 tier_capacity, available_sectors;
@@ -176,20 +174,20 @@ static int bch_tiering_thread(void *arg)
 
 	while (!kthread_should_stop()) {
 		if (kthread_wait_freezable(c->tiering_enabled &&
-					   tier->nr_devices))
+					   tier->devs.nr))
 			break;
 
 		while (1) {
-			struct cache_group *faster_tier;
+			struct bch_tier *faster_tier;
 
 			last = atomic_long_read(&clock->now);
 
 			tier_capacity = available_sectors = 0;
 			rcu_read_lock();
-			for (faster_tier = c->cache_tiers;
+			for (faster_tier = c->tiers;
 			     faster_tier != tier;
 			     faster_tier++) {
-				group_for_each_cache_rcu(ca, faster_tier, i) {
+				group_for_each_cache_rcu(ca, &faster_tier->devs, i) {
 					tier_capacity +=
 						(ca->mi.nbuckets -
 						 ca->mi.first_bucket) << ca->bucket_bits;
@@ -216,32 +214,73 @@ static int bch_tiering_thread(void *arg)
 	return 0;
 }
 
-void bch_tiering_init_cache_set(struct cache_set *c)
+static void __bch_tiering_stop(struct bch_tier *tier)
 {
-	bch_pd_controller_init(&c->tiering_pd);
+	tier->pd.rate.rate = UINT_MAX;
+	bch_ratelimit_reset(&tier->pd.rate);
+
+	if (tier->migrate)
+		kthread_stop(tier->migrate);
+
+	tier->migrate = NULL;
}
 
-int bch_tiering_read_start(struct cache_set *c)
+void bch_tiering_stop(struct cache_set *c)
 {
-	struct task_struct *t;
+	struct bch_tier *tier;
+
+	for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++)
+		__bch_tiering_stop(tier);
+}
+
+static int __bch_tiering_start(struct bch_tier *tier)
+{
+	if (!tier->migrate) {
+		struct task_struct *p =
+			kthread_create(bch_tiering_thread, tier,
+				       "bch_tier[%u]", tier->idx);
+		if (IS_ERR(p))
+			return PTR_ERR(p);
+
+		tier->migrate = p;
+	}
+
+	wake_up_process(tier->migrate);
+	return 0;
+}
+
+int bch_tiering_start(struct cache_set *c)
+{
+	struct bch_tier *tier;
+	bool have_faster_tier = false;
 
 	if (c->opts.nochanges)
 		return 0;
 
-	t = kthread_create(bch_tiering_thread, c, "bch_tier_read");
-	if (IS_ERR(t))
-		return PTR_ERR(t);
+	for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) {
+		if (!tier->devs.nr)
+			continue;
 
-	c->tiering_read = t;
-	wake_up_process(c->tiering_read);
+		if (have_faster_tier) {
+			int ret = __bch_tiering_start(tier);
+			if (ret)
+				return ret;
+		} else {
+			__bch_tiering_stop(tier);
+		}
+
+		have_faster_tier = true;
+	}
 
 	return 0;
 }
 
-void bch_tiering_read_stop(struct cache_set *c)
+void bch_fs_tiering_init(struct cache_set *c)
 {
-	if (!IS_ERR_OR_NULL(c->tiering_read)) {
-		kthread_stop(c->tiering_read);
-		c->tiering_read = NULL;
+	unsigned i;
+
+	for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
+		c->tiers[i].idx = i;
+		bch_pd_controller_init(&c->tiers[i].pd);
 	}
 }
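Note: instead of one "bch_tier_read" thread hardwired to tier 1, each populated tier now owns a migrate thread, and the fastest populated tier is explicitly kept stopped — it has nothing faster to pull from. The per-tier decision in bch_tiering_start() reduces to (a sketch, not the kernel code):

    static bool tier_runs_migrate(unsigned nr_devs, bool have_faster_tier)
    {
    	/* only tiers sitting below another populated tier move data */
    	return nr_devs != 0 && have_faster_tier;
    }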
libbcache/tier.h

@@ -1,8 +1,8 @@
 #ifndef _BCACHE_TIER_H
 #define _BCACHE_TIER_H
 
-void bch_tiering_init_cache_set(struct cache_set *);
-int bch_tiering_read_start(struct cache_set *);
-void bch_tiering_read_stop(struct cache_set *);
+void bch_tiering_stop(struct cache_set *);
+int bch_tiering_start(struct cache_set *);
+void bch_fs_tiering_init(struct cache_set *);
 
 #endif
linux/blkdev.c

@@ -20,8 +20,14 @@ int submit_bio_wait(struct bio *bio)
 	ssize_t ret;
 	unsigned i;
 
-	if (bio->bi_opf & REQ_PREFLUSH)
-		fdatasync(bio->bi_bdev->bd_fd);
+	if (bio->bi_opf & REQ_PREFLUSH) {
+		ret = fdatasync(bio->bi_bdev->bd_fd);
+		if (ret) {
+			fprintf(stderr, "fsync error: %s\n",
+				strerror(errno));
+			return -EIO;
+		}
+	}
 
 	i = 0;
 	bio_for_each_segment(bv, bio, iter)
@@ -49,10 +55,22 @@ int submit_bio_wait(struct bio *bio)
 		BUG();
 	}
 
-	if (bio->bi_opf & REQ_FUA)
-		fdatasync(bio->bi_bdev->bd_fd);
+	if (ret != bio->bi_iter.bi_size) {
+		fprintf(stderr, "IO error: %li (%s)\n",
+			ret, strerror(errno));
+		return -EIO;
+	}
 
-	return ret == bio->bi_iter.bi_size ? 0 : -EIO;
+	if (bio->bi_opf & REQ_FUA) {
+		ret = fdatasync(bio->bi_bdev->bd_fd);
+		if (ret) {
+			fprintf(stderr, "fsync error: %s\n",
+				strerror(errno));
+			return -EIO;
+		}
+	}
+
+	return 0;
 }
 
 void generic_make_request(struct bio *bio)
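Note: the userspace shim now turns short writes and failed fdatasync() calls into -EIO instead of silently succeeding. The caller side is unchanged; a usage sketch (die() comes from tools-util.h, the bio setup is assumed):

    static void write_fua_or_die(struct bio *bio)
    {
    	bio->bi_opf |= REQ_FUA;		/* request a flush after the write */
    	if (submit_bio_wait(bio))	/* -EIO on short write or fsync error */
    		die("write error");
    }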
qcow2.c (37 lines changed)
@@ -2,7 +2,6 @@
 #include <errno.h>
 #include <sys/types.h>
 #include <unistd.h>
-#include <linux/sort.h>
 
 #include "qcow2.h"
 #include "tools-util.h"
@@ -69,18 +68,7 @@ static void add_l2(struct qcow2_image *img, u64 src_blk, u64 dst_offset)
 	img->l2_table[l2_index] = cpu_to_be64(dst_offset|QCOW_OFLAG_COPIED);
 }
 
-static int range_cmp(const void *_l, const void *_r)
-{
-	const struct range *l = _l, *r = _r;
-
-	if (l->start < r->start)
-		return -1;
-	if (l->start > r->start)
-		return 1;
-	return 0;
-}
-
-void qcow2_write_image(int infd, int outfd, sparse_data *data,
+void qcow2_write_image(int infd, int outfd, ranges *data,
 		       unsigned block_size)
 {
 	u64 image_size = get_size(NULL, infd);
@@ -98,30 +86,11 @@ void qcow2_write_image(int infd, int outfd, sparse_data *data,
 	struct range *r;
 	char *buf = xmalloc(block_size);
 	u64 src_offset, dst_offset;
-	sparse_data m;
 
 	assert(is_power_of_2(block_size));
 
-	sort(&darray_item(*data, 0),
-	     darray_size(*data),
-	     sizeof(darray_item(*data, 0)),
-	     range_cmp, NULL);
-
-	/* Round to blocksize, merge contiguous ranges: */
-	darray_init(m);
-	darray_foreach(r, *data) {
-		struct range *l = m.size ? &m.item[m.size - 1] : NULL;
-
-		r->start = round_down(r->start, block_size);
-		r->end = round_up(r->end, block_size);
-
-		if (l && l->end >= r->start)
-			l->end = max(l->end, r->end);
-		else
-			darray_append(m, *r);
-	}
-	darray_free(*data);
-	*data = m;
+	ranges_roundup(data, block_size);
+	ranges_sort_merge(data);
 
 	/* Write data: */
 	darray_foreach(r, *data)
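Note: the sorting and merging qcow2_write_image() used to do inline now lives in the shared ranges helpers, so callers can hand over unsorted, overlapping extents. A usage sketch (file descriptors and sizes are hypothetical):

    static void dump_two_ranges(int infd, int outfd)
    {
    	ranges data;

    	darray_init(data);
    	range_add(&data, 4096, 512);	/* [4096, 4608) */
    	range_add(&data, 0, 4096);	/* out of order is fine */
    	qcow2_write_image(infd, outfd, &data, 4096);	/* rounds up + merges */
    	darray_free(data);
    }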
qcow2.h (19 lines changed)
@@ -2,23 +2,8 @@
 #define _QCOW2_H
 
 #include <linux/types.h>
-#include "ccan/darray/darray.h"
+#include "tools-util.h"
 
-struct range {
-	u64		start;
-	u64		end;
-};
-
-typedef darray(struct range) sparse_data;
-
-static inline void data_add(sparse_data *data, u64 offset, u64 size)
-{
-	darray_append(*data, (struct range) {
-		.start	= offset,
-		.end	= offset + size
-	});
-}
-
-void qcow2_write_image(int, int, sparse_data *, unsigned);
+void qcow2_write_image(int, int, ranges *, unsigned);
 
 #endif /* _QCOW2_H */
tools-util.c (171 lines changed)
@@ -1,4 +1,3 @@
-#include <alloca.h>
 #include <assert.h>
 #include <ctype.h>
 #include <errno.h>
@@ -19,6 +18,7 @@
 #include "ccan/crc/crc.h"
 
 #include "linux/bcache-ioctl.h"
+#include "linux/sort.h"
 #include "tools-util.h"
 #include "util.h"
 
@@ -59,20 +59,12 @@ struct units_buf __pr_units(u64 v, enum units units)
 
 char *read_file_str(int dirfd, const char *path)
 {
-	int fd = openat(dirfd, path, O_RDONLY);
-
-	if (fd < 0)
-		die("Unable to open %s\n", path);
-
-	struct stat statbuf;
-	if (fstat(fd, &statbuf) < 0)
-		die("fstat error\n");
-
-	char *buf = malloc(statbuf.st_size + 1);
-
-	int len = read(fd, buf, statbuf.st_size);
-	if (len < 0)
-		die("read error while reading from file %s\n", path);
+	int fd = xopenat(dirfd, path, O_RDONLY);
+	size_t len = xfstat(fd).st_size;
+
+	char *buf = malloc(len + 1);
+
+	xpread(fd, buf, len, 0);
 
 	buf[len] = '\0';
 	if (len && buf[len - 1] == '\n')
@@ -107,48 +99,33 @@ ssize_t read_string_list_or_die(const char *opt, const char * const list[],
 /* Returns size of file or block device: */
 u64 get_size(const char *path, int fd)
 {
-	struct stat statbuf;
-	u64 ret;
-
-	if (fstat(fd, &statbuf))
-		die("Error statting %s: %s", path, strerror(errno));
+	struct stat statbuf = xfstat(fd);
 
 	if (!S_ISBLK(statbuf.st_mode))
 		return statbuf.st_size;
 
-	if (ioctl(fd, BLKGETSIZE64, &ret))
-		die("Error getting block device size on %s: %s\n",
-		    path, strerror(errno));
-
+	u64 ret;
+	xioctl(fd, BLKGETSIZE64, &ret);
 	return ret;
 }
 
 /* Returns blocksize in units of 512 byte sectors: */
 unsigned get_blocksize(const char *path, int fd)
 {
-	struct stat statbuf;
-	if (fstat(fd, &statbuf))
-		die("Error statting %s: %s", path, strerror(errno));
+	struct stat statbuf = xfstat(fd);
 
 	if (!S_ISBLK(statbuf.st_mode))
 		return statbuf.st_blksize >> 9;
 
 	unsigned ret;
-	if (ioctl(fd, BLKPBSZGET, &ret))
-		die("Error getting blocksize on %s: %s\n",
-		    path, strerror(errno));
-
+	xioctl(fd, BLKPBSZGET, &ret);
 	return ret >> 9;
 }
 
 /* Global control device: */
 int bcachectl_open(void)
 {
-	int fd = open("/dev/bcache-ctl", O_RDWR);
-	if (fd < 0)
-		die("Can't open bcache device: %s", strerror(errno));
-
-	return fd;
+	return xopen("/dev/bcache-ctl", O_RDWR);
 }
 
 /* Filesystem handles (ioctl, sysfs dir): */
@@ -162,47 +139,29 @@ struct bcache_handle bcache_fs_open(const char *path)
 
 	if (!uuid_parse(path, tmp)) {
 		/* It's a UUID, look it up in sysfs: */
-
-		char *sysfs = alloca(strlen(SYSFS_BASE) + strlen(path) + 1);
-		sprintf(sysfs, "%s%s", SYSFS_BASE, path);
-
-		ret.sysfs_fd = open(sysfs, O_RDONLY);
-		if (!ret.sysfs_fd)
-			die("Unable to open %s\n", path);
+		char *sysfs = mprintf("%s%s", SYSFS_BASE, path);
+		ret.sysfs_fd = xopen(sysfs, O_RDONLY);
 
 		char *minor = read_file_str(ret.sysfs_fd, "minor");
-		char *ctl = alloca(20 + strlen(minor));
-
-		sprintf(ctl, "/dev/bcache%s-ctl", minor);
-		free(minor);
-
-		ret.ioctl_fd = open(ctl, O_RDWR);
-		if (ret.ioctl_fd < 0)
-			die("Error opening control device: %s\n",
-			    strerror(errno));
+		char *ctl = mprintf("/dev/bcache%s-ctl", minor);
+		ret.ioctl_fd = xopen(ctl, O_RDWR);
+
+		free(sysfs);
+		free(minor);
+		free(ctl);
 	} else {
 		/* It's a path: */
+		ret.ioctl_fd = xopen(path, O_RDONLY);
 
-		ret.ioctl_fd = open(path, O_RDONLY);
-		if (ret.ioctl_fd < 0)
-			die("Error opening %s: %s\n",
-			    path, strerror(errno));
-
 		struct bch_ioctl_query_uuid uuid;
-		if (ioctl(ret.ioctl_fd, BCH_IOCTL_QUERY_UUID, &uuid))
-			die("ioctl error (not a bcache fs?): %s\n",
-			    strerror(errno));
+		xioctl(ret.ioctl_fd, BCH_IOCTL_QUERY_UUID, &uuid);
 
 		char uuid_str[40];
 		uuid_unparse(uuid.uuid.b, uuid_str);
 
-		char *sysfs = alloca(strlen(SYSFS_BASE) + strlen(uuid_str) + 1);
-		sprintf(sysfs, "%s%s", SYSFS_BASE, uuid_str);
-
-		ret.sysfs_fd = open(sysfs, O_RDONLY);
-		if (ret.sysfs_fd < 0)
-			die("Unable to open sysfs dir %s: %s\n",
-			    sysfs, strerror(errno));
+		char *sysfs = mprintf("%s%s", SYSFS_BASE, uuid_str);
+		ret.sysfs_fd = xopen(sysfs, O_RDONLY);
+		free(sysfs);
 	}
 
 	return ret;
@@ -225,3 +184,89 @@ bool ask_yn(void)
 	free(buf);
 	return ret;
 }
+
+static int range_cmp(const void *_l, const void *_r)
+{
+	const struct range *l = _l, *r = _r;
+
+	if (l->start < r->start)
+		return -1;
+	if (l->start > r->start)
+		return 1;
+	return 0;
+}
+
+void ranges_sort_merge(ranges *r)
+{
+	struct range *t, *i;
+	ranges tmp = { NULL };
+
+	sort(&darray_item(*r, 0), darray_size(*r),
+	     sizeof(darray_item(*r, 0)), range_cmp, NULL);
+
+	/* Merge contiguous ranges: */
+	darray_foreach(i, *r) {
+		t = tmp.size ? &tmp.item[tmp.size - 1] : NULL;
+
+		if (t && t->end >= i->start)
+			t->end = max(t->end, i->end);
+		else
+			darray_append(tmp, *i);
+	}
+
+	darray_free(*r);
+	*r = tmp;
+}
+
+void ranges_roundup(ranges *r, unsigned block_size)
+{
+	struct range *i;
+
+	darray_foreach(i, *r) {
+		i->start = round_down(i->start, block_size);
+		i->end	= round_up(i->end, block_size);
+	}
+}
+
+void ranges_rounddown(ranges *r, unsigned block_size)
+{
+	struct range *i;
+
+	darray_foreach(i, *r) {
+		i->start = round_up(i->start, block_size);
+		i->end	= round_down(i->end, block_size);
+		i->end	= max(i->end, i->start);
+	}
+}
+
+struct fiemap_extent fiemap_iter_next(struct fiemap_iter *iter)
+{
+	struct fiemap_extent e;
+
+	BUG_ON(iter->idx > iter->f.fm_mapped_extents);
+
+	if (iter->idx == iter->f.fm_mapped_extents) {
+		xioctl(iter->fd, FS_IOC_FIEMAP, &iter->f);
+
+		if (!iter->f.fm_mapped_extents)
+			return (struct fiemap_extent) { .fe_length = 0 };
+
+		iter->idx = 0;
+	}
+
+	e = iter->f.fm_extents[iter->idx++];
+	BUG_ON(!e.fe_length);
+
+	iter->f.fm_start = e.fe_logical + e.fe_length;
+
+	return e;
+}
+
+const char *strcmp_prefix(const char *a, const char *a_prefix)
+{
+	while (*a_prefix && *a == *a_prefix) {
+		a++;
+		a_prefix++;
+	}
+	return *a_prefix ? NULL : a;
+}
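Note: with mprintf() and the x-prefixed wrappers, the "format a path, open it, die loudly on failure" pattern used throughout bcache_fs_open() shrinks to a few lines. A sketch — the sysfs path and the helper itself are hypothetical, only mprintf()/xopen()/xfstat() come from this diff:

    static off_t sysfs_attr_size(const char *uuid, const char *attr)
    {
    	char *path = mprintf("/sys/fs/bcache/%s/%s", uuid, attr);
    	int fd = xopen(path, O_RDONLY);	/* dies with a message on error */
    	struct stat st = xfstat(fd);	/* dies on fstat() failure */

    	close(fd);
    	free(path);			/* mprintf allocates via asprintf */
    	return st.st_size;
    }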
tools-util.h (120 lines changed)
@ -5,21 +5,31 @@
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
 
+#include <linux/bug.h>
 #include <linux/byteorder.h>
 #include <linux/kernel.h>
 #include <linux/log2.h>
 #include <linux/string.h>
 #include <linux/types.h>
+#include "ccan/darray/darray.h"
 
 #define die(arg, ...) \
 do { \
 	fprintf(stderr, arg "\n", ##__VA_ARGS__); \
 	exit(EXIT_FAILURE); \
 } while (0)
+
+#define mprintf(...) \
+({ \
+	char *_str; \
+	asprintf(&_str, __VA_ARGS__); \
+	_str; \
+})
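mprintf() is a GNU statement expression wrapping asprintf(3): it evaluates to a freshly heap-allocated formatted string that the caller must free(), as the bcache_fs_open() change earlier in this commit does. Note that the asprintf() return value is not checked here, so _str is returned unchecked even if allocation failed.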
 
 static inline void *xcalloc(size_t count, size_t size)
 {
 	void *p = calloc(count, size);
@ -57,6 +67,38 @@ static inline void xpwrite(int fd, const void *buf, size_t count, off_t offset)
 		die("write error (ret %zi err %s)", r, strerror(errno));
 }
+
+#define xopenat(_dirfd, _path, ...) \
+({ \
+	int _fd = openat((_dirfd), (_path), __VA_ARGS__); \
+	if (_fd < 0) \
+		die("Error opening %s: %s", (_path), strerror(errno)); \
+	_fd; \
+})
+
+#define xopen(...) xopenat(AT_FDCWD, __VA_ARGS__)
+
+static inline struct stat xfstatat(int dirfd, const char *path, int flags)
+{
+	struct stat stat;
+
+	if (fstatat(dirfd, path, &stat, flags))
+		die("stat error: %s", strerror(errno));
+	return stat;
+}
+
+static inline struct stat xfstat(int fd)
+{
+	struct stat stat;
+
+	if (fstat(fd, &stat))
+		die("stat error: %s", strerror(errno));
+	return stat;
+}
+
+#define xioctl(_fd, _nr, ...) \
+do { \
+	if (ioctl((_fd), (_nr), ##__VA_ARGS__)) \
+		die(#_nr " ioctl error: %s", strerror(errno)); \
+} while (0)
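These helpers extend the x-prefix convention already used by xcalloc() and xpwrite() above: do the operation, and die() with a descriptive message on any failure, so call sites need no error-handling boilerplate. xioctl() additionally stringifies the request constant (#_nr) into its error message, which is what lets the earlier hunks write bare xioctl(ret.ioctl_fd, BCH_IOCTL_QUERY_UUID, &uuid) and xioctl(iter->fd, FS_IOC_FIEMAP, &iter->f) calls.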
 
 enum units {
 	BYTES,
 	SECTORS,
@ -91,4 +133,74 @@ struct bcache_handle bcache_fs_open(const char *);
 
 bool ask_yn(void);
+
+struct range {
+	u64		start;
+	u64		end;
+};
+
+typedef darray(struct range) ranges;
+
+static inline void range_add(ranges *data, u64 offset, u64 size)
+{
+	darray_append(*data, (struct range) {
+		.start	= offset,
+		.end	= offset + size
+	});
+}
+
+void ranges_sort_merge(ranges *);
+void ranges_roundup(ranges *, unsigned);
+void ranges_rounddown(ranges *, unsigned);
+
+struct hole_iter {
+	ranges		r;
+	size_t		idx;
+	u64		end;
+};
+
+static inline struct range hole_iter_next(struct hole_iter *iter)
+{
+	struct range r = {
+		.start	= iter->idx ? iter->r.item[iter->idx - 1].end : 0,
+		.end	= iter->idx < iter->r.size
+			? iter->r.item[iter->idx].start : iter->end,
+	};
+
+	BUG_ON(r.start > r.end);
+
+	iter->idx++;
+	return r;
+}
+
+#define for_each_hole(_iter, _ranges, _end, _i) \
+	for (_iter = (struct hole_iter) { .r = _ranges, .end = _end }; \
+	     (_iter.idx <= _iter.r.size && \
+	      (_i = hole_iter_next(&_iter), true));)
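hole_iter_next() above yields the gaps between consecutive sorted, merged ranges, including the leading gap before the first range and the trailing gap up to end, so for_each_hole() runs size + 1 iterations and may produce empty holes. For example, over the ranges [0, 10) and [20, 30) with end 40, it yields [0, 0), [10, 20), and [30, 40).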
+
+#include <linux/fiemap.h>
+
+struct fiemap_iter {
+	struct fiemap		f;
+	struct fiemap_extent	fe[1024];
+	unsigned		idx;
+	int			fd;
+};
+
+static inline void fiemap_iter_init(struct fiemap_iter *iter, int fd)
+{
+	memset(iter, 0, sizeof(*iter));
+
+	iter->f.fm_extent_count	= ARRAY_SIZE(iter->fe);
+	iter->f.fm_length	= FIEMAP_MAX_OFFSET;
+	iter->fd		= fd;
+}
+
+struct fiemap_extent fiemap_iter_next(struct fiemap_iter *);
+
+#define fiemap_for_each(fd, iter, extent) \
+	for (fiemap_iter_init(&iter, fd); \
+	     (extent = fiemap_iter_next(&iter)).fe_length;)
+
+const char *strcmp_prefix(const char *, const char *);
+
 #endif /* _TOOLS_UTIL_H */