cmd_device_fail

Add a command for setting a device as failed, update bcache sources
Kent Overstreet 2017-03-09 08:27:30 -09:00
parent a17f7bcec7
commit ac1b32acb4
32 changed files with 567 additions and 476 deletions
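
For orientation, the new subcommand is invoked roughly as follows (the filesystem path and device are illustrative, not taken from the commit):

    bcache device_fail /mnt/bcache /dev/sdb

It opens the filesystem's control fd and issues BCH_IOCTL_DISK_SET_STATE with new_state = BCH_MEMBER_STATE_FAILED for each device given; the -f option additionally sets BCH_FORCE_IF_DEGRADED.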

View File

@ -1 +1 @@
BCACHE_REVISION=c1f1a9e1d9b9664db9c9c03cbac455c2750335bc
BCACHE_REVISION=206668e86912eea889b3f2aaeaac7433da6f9245

View File

@ -43,6 +43,7 @@ static void usage(void)
"Commands for managing a specific device in a filesystem:\n"
" device_show Show information about a formatted device\n"
" device_add Add a device to an existing (running) filesystem\n"
" device_fail Mark a device as failed\n"
" device_remove Remove a device from an existing (running) filesystem\n"
"\n"
"Repair:\n"
@ -95,6 +96,8 @@ int main(int argc, char *argv[])
return cmd_device_show(argc, argv);
if (!strcmp(cmd, "device_add"))
return cmd_device_add(argc, argv);
if (!strcmp(cmd, "device_fail"))
return cmd_device_fail(argc, argv);
if (!strcmp(cmd, "device_remove"))
return cmd_device_remove(argc, argv);

View File

@ -15,6 +15,7 @@
#include "cmds.h"
#include "libbcache.h"
#include "linux/bcache-ioctl.h"
#include "tools-util.h"
/* This code belongs under show_fs */
#if 0
@ -188,14 +189,72 @@ int cmd_device_add(int argc, char *argv[])
.dev = (__u64) argv[i],
};
if (ioctl(fs.ioctl_fd, BCH_IOCTL_DISK_ADD, &ia))
die("BCH_IOCTL_DISK_ADD error: %s", strerror(errno));
xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_ADD, &ia);
}
return 0;
}
static void usage(void)
static void device_fail_usage(void)
{
puts("bcache device_fail - mark a device as failed\n"
"Usage: bcache device_fail filesystem [devices]\n"
"\n"
"Options:\n"
" -f, --force Force removal, even if some data\n"
" couldn't be migrated\n"
" --force-metadata Force removal, even if some metadata\n"
" couldn't be migrated\n"
" -h, --help display this help and exit\n"
"Report bugs to <linux-bcache@vger.kernel.org>");
exit(EXIT_SUCCESS);
}
int cmd_device_fail(int argc, char *argv[])
{
static const struct option longopts[] = {
{ "force-degraded", 0, NULL, 'f' },
//{ "force-data-lost", 0, NULL, 'F' },
//{ "force-metadata-lost", 0, NULL, 'F' },
{ "help", 0, NULL, 'h' },
{ NULL }
};
int opt, force_degraded = 0, force_data = 0, force_metadata = 0;
while ((opt = getopt_long(argc, argv, "fh", longopts, NULL)) != -1)
switch (opt) {
case 'f':
force_degraded = 1;
break;
case 'h':
device_fail_usage();
}
if (argc - optind < 2)
die("Please supply a filesystem and at least one device to fail");
struct bcache_handle fs = bcache_fs_open(argv[optind]);
for (unsigned i = optind + 1; i < argc; i++) {
struct bch_ioctl_disk_set_state ir = {
.dev = (__u64) argv[i],
.new_state = BCH_MEMBER_STATE_FAILED,
};
if (force_degraded)
ir.flags |= BCH_FORCE_IF_DEGRADED;
if (force_data)
ir.flags |= BCH_FORCE_IF_DATA_LOST;
if (force_metadata)
ir.flags |= BCH_FORCE_IF_METADATA_LOST;
xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_SET_STATE, &ir);
}
return 0;
}
static void device_remove_usage(void)
{
puts("bcache device_remove - remove one or more devices from a filesystem\n"
"Usage: bcache device_remove filesystem [devices]\n"
@ -229,26 +288,25 @@ int cmd_device_remove(int argc, char *argv[])
force_metadata = 1;
break;
case 'h':
usage();
device_remove_usage();
}
if (argc < 3)
die("Please supply a filesystem and at least one device to add");
if (argc - optind < 2)
die("Please supply a filesystem and at least one device to remove");
struct bcache_handle fs = bcache_fs_open(argv[1]);
struct bcache_handle fs = bcache_fs_open(argv[optind]);
for (unsigned i = 2; i < argc; i++) {
for (unsigned i = optind + 1; i < argc; i++) {
struct bch_ioctl_disk_remove ir = {
.dev = (__u64) argv[i],
};
if (force_data)
ir.flags |= BCH_FORCE_IF_DATA_MISSING;
ir.flags |= BCH_FORCE_IF_DATA_LOST;
if (force_metadata)
ir.flags |= BCH_FORCE_IF_METADATA_MISSING;
ir.flags |= BCH_FORCE_IF_METADATA_LOST;
if (ioctl(fs.ioctl_fd, BCH_IOCTL_DISK_REMOVE, &ir))
die("BCH_IOCTL_DISK_REMOVE error: %s\n", strerror(errno));
xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_REMOVE, &ir);
}
return 0;

View File

@ -77,6 +77,8 @@ x(0, btree_node_size, "size", "Default 256k") \
x(0, metadata_checksum_type, "(none|crc32c|crc64)", NULL) \
x(0, data_checksum_type, "(none|crc32c|crc64)", NULL) \
x(0, compression_type, "(none|lz4|gzip)", NULL) \
x(0, data_replicas, "#", NULL) \
x(0, metadata_replicas, "#", NULL) \
x(0, encrypted, NULL, "Enable whole filesystem encryption (chacha20/poly1305)")\
x(0, no_passphrase, NULL, "Don't encrypt master encryption key")\
x('e', error_action, "(continue|readonly|panic)", NULL) \
@ -112,6 +114,8 @@ static void usage(void)
" --metadata_checksum_type=(none|crc32c|crc64)\n"
" --data_checksum_type=(none|crc32c|crc64)\n"
" --compression_type=(none|lz4|gzip)\n"
" --data_replicas=# Number of data replicas\n"
" --metadata_replicas=# Number of metadata replicas\n"
" --encrypted Enable whole filesystem encryption (chacha20/poly1305)\n"
" --no_passphrase Don't encrypt master encryption key\n"
" --error_action=(continue|readonly|panic)\n"
@ -136,9 +140,9 @@ static void usage(void)
}
enum {
Opt_no_opt = 1,
O_no_opt = 1,
#define t(text)
#define x(shortopt, longopt, arg, help) Opt_##longopt,
#define x(shortopt, longopt, arg, help) O_##longopt,
OPTS
#undef x
#undef t
@ -150,7 +154,7 @@ static const struct option format_opts[] = {
.name = #longopt, \
.has_arg = arg ? required_argument : no_argument, \
.flag = NULL, \
.val = Opt_##longopt, \
.val = O_##longopt, \
},
OPTS
#undef x
@ -194,85 +198,95 @@ int cmd_format(int argc, char *argv[])
format_opts,
NULL)) != -1)
switch (opt) {
case Opt_block_size:
case O_block_size:
case 'b':
opts.block_size =
hatoi_validate(optarg, "block size");
break;
case Opt_btree_node_size:
case O_btree_node_size:
opts.btree_node_size =
hatoi_validate(optarg, "btree node size");
break;
case Opt_metadata_checksum_type:
case O_metadata_checksum_type:
opts.meta_csum_type =
read_string_list_or_die(optarg,
bch_csum_types, "checksum type");
break;
case Opt_data_checksum_type:
case O_data_checksum_type:
opts.data_csum_type =
read_string_list_or_die(optarg,
bch_csum_types, "checksum type");
break;
case Opt_compression_type:
case O_compression_type:
opts.compression_type =
read_string_list_or_die(optarg,
bch_compression_types,
"compression type");
break;
case Opt_encrypted:
case O_data_replicas:
if (kstrtouint(optarg, 10, &opts.data_replicas) ||
opts.data_replicas >= BCH_REPLICAS_MAX)
die("invalid replicas");
break;
case O_metadata_replicas:
if (kstrtouint(optarg, 10, &opts.meta_replicas) ||
opts.meta_replicas >= BCH_REPLICAS_MAX)
die("invalid replicas");
break;
case O_encrypted:
opts.encrypted = true;
break;
case Opt_no_passphrase:
case O_no_passphrase:
no_passphrase = true;
break;
case Opt_error_action:
case O_error_action:
case 'e':
opts.on_error_action =
read_string_list_or_die(optarg,
bch_error_actions, "error action");
break;
case Opt_max_journal_entry_size:
case O_max_journal_entry_size:
opts.max_journal_entry_size =
hatoi_validate(optarg, "journal entry size");
break;
case Opt_label:
case O_label:
case 'L':
opts.label = strdup(optarg);
break;
case Opt_uuid:
case O_uuid:
case 'U':
if (uuid_parse(optarg, opts.uuid.b))
die("Bad uuid");
break;
case Opt_force:
case O_force:
case 'f':
force = true;
break;
case Opt_fs_size:
case O_fs_size:
if (bch_strtoull_h(optarg, &dev_opts.size))
die("invalid filesystem size");
dev_opts.size >>= 9;
break;
case Opt_bucket_size:
case O_bucket_size:
dev_opts.bucket_size =
hatoi_validate(optarg, "bucket size");
break;
case Opt_tier:
case O_tier:
case 't':
if (kstrtouint(optarg, 10, &dev_opts.tier) ||
dev_opts.tier >= BCH_TIER_MAX)
die("invalid tier");
break;
case Opt_discard:
case O_discard:
dev_opts.discard = true;
break;
case Opt_no_opt:
case O_no_opt:
dev_opts.path = strdup(optarg);
darray_append(devices, dev_opts);
dev_opts.size = 0;
break;
case Opt_help:
case O_help:
case 'h':
usage();
exit(EXIT_SUCCESS);

cmds.h
View File

@ -22,6 +22,7 @@ int cmd_fs_set(int argc, char *argv[]);
int cmd_device_show(int argc, char *argv[]);
int cmd_device_add(int argc, char *argv[]);
int cmd_device_fail(int argc, char *argv[]);
int cmd_device_remove(int argc, char *argv[]);
int cmd_fsck(int argc, char *argv[]);

View File

@ -10,8 +10,14 @@ extern "C" {
/* global control dev: */
#define BCH_FORCE_IF_DATA_MISSING (1 << 0)
#define BCH_FORCE_IF_METADATA_MISSING (1 << 1)
#define BCH_FORCE_IF_DATA_LOST (1 << 0)
#define BCH_FORCE_IF_METADATA_LOST (1 << 1)
#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2)
#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3)
#define BCH_FORCE_IF_DEGRADED \
(BCH_FORCE_IF_DATA_DEGRADED| \
BCH_FORCE_IF_METADATA_DEGRADED)
#define BCH_IOCTL_ASSEMBLE _IOW('r', 1, struct bch_ioctl_assemble)
#define BCH_IOCTL_INCREMENTAL _IOW('r', 1, struct bch_ioctl_incremental)
@ -23,7 +29,7 @@ extern "C" {
#define BCH_IOCTL_DISK_ADD _IOW('r', 4, struct bch_ioctl_disk_add)
#define BCH_IOCTL_DISK_REMOVE _IOW('r', 5, struct bch_ioctl_disk_remove)
#define BCH_IOCTL_DISK_FAIL _IOW('r', 6, struct bch_ioctl_disk_fail)
#define BCH_IOCTL_DISK_SET_STATE _IOW('r', 6, struct bch_ioctl_disk_set_state)
#define BCH_IOCTL_DISK_REMOVE_BY_UUID \
_IOW('r', 5, struct bch_ioctl_disk_remove_by_uuid)
@ -57,9 +63,10 @@ struct bch_ioctl_disk_remove {
__u64 dev;
};
struct bch_ioctl_disk_fail {
struct bch_ioctl_disk_set_state {
__u32 flags;
__u32 pad;
__u8 new_state;
__u8 pad[3];
__u64 dev;
};
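
A minimal userspace sketch of driving the new ioctl directly; the helper name and include list are illustrative (BCH_MEMBER_STATE_FAILED comes from the on-disk format header, and in the tools the fd is obtained via bcache_fs_open()):

#include <sys/ioctl.h>
#include "linux/bcache-ioctl.h"

/* Mark one member device failed; allow going degraded, but not losing data. */
static int dev_set_failed(int ioctl_fd, const char *dev_path)
{
	struct bch_ioctl_disk_set_state arg = {
		.flags		= BCH_FORCE_IF_DEGRADED,
		.new_state	= BCH_MEMBER_STATE_FAILED,
		/* as in the tool code, the device is passed as a path cast to __u64 */
		.dev		= (__u64) dev_path,
	};

	return ioctl(ioctl_fd, BCH_IOCTL_DISK_SET_STATE, &arg);
}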

View File

@ -969,6 +969,9 @@ LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10);
LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14);
LE64_BITMASK(BCH_SB_JOURNAL_ENTRY_SIZE, struct bch_sb, flags[1], 14, 20);
LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24);
LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28);
/* Features: */
enum bch_sb_features {
BCH_FEATURE_LZ4 = 0,

View File

@ -171,8 +171,10 @@ struct bch_sb *bcache_format(struct format_opts opts,
SET_BCH_SB_GC_RESERVE(sb, 8);
SET_BCH_SB_META_REPLICAS_WANT(sb, opts.meta_replicas);
SET_BCH_SB_META_REPLICAS_HAVE(sb, opts.meta_replicas);
SET_BCH_SB_META_REPLICAS_REQ(sb, opts.meta_replicas_required);
SET_BCH_SB_DATA_REPLICAS_WANT(sb, opts.data_replicas);
SET_BCH_SB_DATA_REPLICAS_HAVE(sb, opts.data_replicas);
SET_BCH_SB_DATA_REPLICAS_REQ(sb, opts.data_replicas_required);
SET_BCH_SB_ERROR_ACTION(sb, opts.on_error_action);
SET_BCH_SB_STR_HASH_TYPE(sb, BCH_STR_HASH_SIPHASH);
SET_BCH_SB_JOURNAL_ENTRY_SIZE(sb, ilog2(opts.max_journal_entry_size));

View File

@ -32,6 +32,9 @@ struct format_opts {
unsigned meta_replicas;
unsigned data_replicas;
unsigned meta_replicas_required;
unsigned data_replicas_required;
unsigned meta_csum_type;
unsigned data_csum_type;
unsigned compression_type;
@ -48,6 +51,8 @@ static inline struct format_opts format_opts_default()
.data_csum_type = BCH_CSUM_CRC32C,
.meta_replicas = 1,
.data_replicas = 1,
.meta_replicas_required = 1,
.data_replicas_required = 1,
};
}

View File

@ -138,7 +138,7 @@ static void pd_controllers_update(struct work_struct *work)
-1);
group_for_each_cache_rcu(ca, &c->tiers[i].devs, iter) {
struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca);
struct bch_dev_usage stats = bch_dev_usage_read(ca);
unsigned bucket_bits = ca->bucket_bits + 9;
u64 size = (ca->mi.nbuckets -
@ -1304,9 +1304,7 @@ static unsigned open_bucket_sectors_free(struct cache_set *c,
struct cache_member_rcu *mi = cache_member_info_get(c);
unsigned i, sectors_free = UINT_MAX;
BUG_ON(nr_replicas > ob->nr_ptrs);
for (i = 0; i < nr_replicas; i++)
for (i = 0; i < min(nr_replicas, ob->nr_ptrs); i++)
sectors_free = min(sectors_free,
ob_ptr_sectors_free(ob, mi, &ob->ptrs[i]));
@ -1369,11 +1367,13 @@ static int open_bucket_add_buckets(struct cache_set *c,
struct write_point *wp,
struct open_bucket *ob,
unsigned nr_replicas,
unsigned nr_replicas_required,
enum alloc_reserve reserve,
struct closure *cl)
{
long caches_used[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
int i, dst;
unsigned i;
int ret;
/*
* We might be allocating pointers to add to an existing extent
@ -1388,23 +1388,17 @@ static int open_bucket_add_buckets(struct cache_set *c,
memset(caches_used, 0, sizeof(caches_used));
/*
* Shuffle pointers to devices we already have to the end:
* bch_bucket_alloc_set() will add new pointers to the start of @b, and
* bch_alloc_sectors_done() will add the first nr_replicas ptrs to @e:
*/
for (i = dst = ob->nr_ptrs - 1; i >= 0; --i)
if (__test_and_set_bit(ob->ptrs[i].dev, caches_used)) {
if (i != dst) {
swap(ob->ptrs[i], ob->ptrs[dst]);
swap(ob->ptr_offset[i], ob->ptr_offset[dst]);
}
--dst;
nr_replicas++;
}
for (i = 0; i < ob->nr_ptrs; i++)
__set_bit(ob->ptrs[i].dev, caches_used);
return bch_bucket_alloc_set(c, wp, ob, nr_replicas,
reserve, caches_used, cl);
ret = bch_bucket_alloc_set(c, wp, ob, nr_replicas,
reserve, caches_used, cl);
if (ret == -EROFS &&
ob->nr_ptrs >= nr_replicas_required)
ret = 0;
return ret;
}
/*
@ -1413,6 +1407,7 @@ static int open_bucket_add_buckets(struct cache_set *c,
struct open_bucket *bch_alloc_sectors_start(struct cache_set *c,
struct write_point *wp,
unsigned nr_replicas,
unsigned nr_replicas_required,
enum alloc_reserve reserve,
struct closure *cl)
{
@ -1466,6 +1461,7 @@ retry:
}
ret = open_bucket_add_buckets(c, wp, ob, nr_replicas,
nr_replicas_required,
reserve, cl);
if (ret) {
mutex_unlock(&ob->lock);
@ -1498,10 +1494,6 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e,
* __bch_write() will only write to the pointers we add here:
*/
/*
* XXX: don't add pointers to devices @e already has
*/
BUG_ON(nr_replicas > ob->nr_ptrs);
BUG_ON(sectors > ob->sectors_free);
/* didn't use all the ptrs: */
@ -1510,7 +1502,7 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e,
rcu_read_lock();
for (i = 0; i < nr_replicas; i++) {
for (i = 0; i < min(ob->nr_ptrs, nr_replicas); i++) {
EBUG_ON(bch_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev));
tmp = ob->ptrs[i];
@ -1576,12 +1568,15 @@ struct open_bucket *bch_alloc_sectors(struct cache_set *c,
struct write_point *wp,
struct bkey_i_extent *e,
unsigned nr_replicas,
unsigned nr_replicas_required,
enum alloc_reserve reserve,
struct closure *cl)
{
struct open_bucket *ob;
ob = bch_alloc_sectors_start(c, wp, nr_replicas, reserve, cl);
ob = bch_alloc_sectors_start(c, wp, nr_replicas,
nr_replicas_required,
reserve, cl);
if (IS_ERR_OR_NULL(ob))
return ob;

View File

@ -33,7 +33,8 @@ void bch_open_bucket_put(struct cache_set *, struct open_bucket *);
struct open_bucket *bch_alloc_sectors_start(struct cache_set *,
struct write_point *,
unsigned, enum alloc_reserve,
unsigned, unsigned,
enum alloc_reserve,
struct closure *);
void bch_alloc_sectors_append_ptrs(struct cache_set *, struct bkey_i_extent *,
@ -42,7 +43,7 @@ void bch_alloc_sectors_done(struct cache_set *, struct write_point *,
struct open_bucket *);
struct open_bucket *bch_alloc_sectors(struct cache_set *, struct write_point *,
struct bkey_i_extent *, unsigned,
struct bkey_i_extent *, unsigned, unsigned,
enum alloc_reserve, struct closure *);
static inline void bch_wake_allocator(struct cache *ca)

View File

@ -347,18 +347,10 @@ struct cache_member_rcu {
struct cache_member_cpu m[];
};
/* cache->flags: */
enum {
BCH_DEV_REMOVING,
BCH_DEV_FORCE_REMOVE,
};
struct cache {
struct percpu_ref ref;
struct rcu_head free_rcu;
struct work_struct free_work;
struct work_struct remove_work;
unsigned long flags;
struct cache_set *set;
@ -424,8 +416,8 @@ struct cache {
* second contains a saved copy of the stats from the beginning
* of GC.
*/
struct bucket_stats_cache __percpu *bucket_stats_percpu;
struct bucket_stats_cache bucket_stats_cached;
struct bch_dev_usage __percpu *bucket_stats_percpu;
struct bch_dev_usage bucket_stats_cached;
atomic_long_t saturated_count;
size_t inc_gen_needs_gc;
@ -659,8 +651,8 @@ struct cache_set {
atomic64_t sectors_available;
struct bucket_stats_cache_set __percpu *bucket_stats_percpu;
struct bucket_stats_cache_set bucket_stats_cached;
struct bch_fs_usage __percpu *bucket_stats_percpu;
struct bch_fs_usage bucket_stats_cached;
struct lglock bucket_stats_lock;
struct mutex bucket_lock;

View File

@ -333,7 +333,7 @@ static void bch_mark_metadata(struct cache_set *c)
/* Also see bch_pending_btree_node_free_insert_done() */
static void bch_mark_pending_btree_node_frees(struct cache_set *c)
{
struct bucket_stats_cache_set stats = { 0 };
struct bch_fs_usage stats = { 0 };
struct btree_interior_update *as;
struct pending_btree_node_free *d;
@ -407,17 +407,17 @@ void bch_gc(struct cache_set *c)
/* Save a copy of the existing bucket stats while we recompute them: */
for_each_cache(ca, c, i) {
ca->bucket_stats_cached = __bch_bucket_stats_read_cache(ca);
ca->bucket_stats_cached = __bch_dev_usage_read(ca);
for_each_possible_cpu(cpu) {
struct bucket_stats_cache *p =
struct bch_dev_usage *p =
per_cpu_ptr(ca->bucket_stats_percpu, cpu);
memset(p, 0, sizeof(*p));
}
}
c->bucket_stats_cached = __bch_bucket_stats_read_cache_set(c);
c->bucket_stats_cached = __bch_fs_usage_read(c);
for_each_possible_cpu(cpu) {
struct bucket_stats_cache_set *p =
struct bch_fs_usage *p =
per_cpu_ptr(c->bucket_stats_percpu, cpu);
memset(p->s, 0, sizeof(p->s));

View File

@ -272,7 +272,6 @@ struct btree_root {
*/
struct btree_iter;
struct bucket_stats_cache_set;
struct btree_node_iter;
enum extent_insert_hook_ret {

View File

@ -94,7 +94,7 @@ bool bch_btree_node_format_fits(struct cache_set *c, struct btree *b,
*/
static void bch_btree_node_free_index(struct cache_set *c, struct btree *b,
enum btree_id id, struct bkey_s_c k,
struct bucket_stats_cache_set *stats)
struct bch_fs_usage *stats)
{
struct btree_interior_update *as;
struct pending_btree_node_free *d;
@ -140,7 +140,7 @@ found:
* moving this reference from, hence one comparison here:
*/
if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
struct bucket_stats_cache_set tmp = { 0 };
struct bch_fs_usage tmp = { 0 };
bch_mark_key(c, bkey_i_to_s_c(&d->key),
-c->sb.btree_node_size, true, b
@ -208,7 +208,7 @@ void bch_btree_node_free_inmem(struct btree_iter *iter, struct btree *b)
static void bch_btree_node_free_ondisk(struct cache_set *c,
struct pending_btree_node_free *pending)
{
struct bucket_stats_cache_set stats = { 0 };
struct bch_fs_usage stats = { 0 };
BUG_ON(!pending->index_update_done);
@ -258,6 +258,7 @@ retry:
ob = bch_alloc_sectors(c, &c->btree_write_point,
bkey_i_to_extent(&tmp.k),
res->nr_replicas,
c->opts.metadata_replicas_required,
use_reserve ? RESERVE_BTREE : RESERVE_NONE,
cl);
if (IS_ERR(ob))
@ -373,7 +374,7 @@ static void bch_btree_set_root_inmem(struct cache_set *c, struct btree *b,
* bch_btree_root_read()) - do marking while holding
* btree_root_lock:
*/
struct bucket_stats_cache_set stats = { 0 };
struct bch_fs_usage stats = { 0 };
bch_mark_key(c, bkey_i_to_s_c(&b->key),
c->sb.btree_node_size, true,
@ -632,7 +633,7 @@ static void bch_insert_fixup_btree_ptr(struct btree_iter *iter,
struct disk_reservation *disk_res)
{
struct cache_set *c = iter->c;
struct bucket_stats_cache_set stats = { 0 };
struct bch_fs_usage stats = { 0 };
struct bkey_packed *k;
struct bkey tmp;

View File

@ -78,8 +78,8 @@
static void bch_fs_stats_verify(struct cache_set *c)
{
struct bucket_stats_cache_set stats =
__bch_bucket_stats_read_cache_set(c);
struct bch_fs_usage stats =
__bch_fs_usage_read(c);
if ((s64) stats.sectors_dirty < 0)
panic("sectors_dirty underflow: %lli\n", stats.sectors_dirty);
@ -162,26 +162,26 @@ do { \
_ret; \
})
struct bucket_stats_cache __bch_bucket_stats_read_cache(struct cache *ca)
struct bch_dev_usage __bch_dev_usage_read(struct cache *ca)
{
return bucket_stats_read_raw(ca->bucket_stats_percpu);
}
struct bucket_stats_cache bch_bucket_stats_read_cache(struct cache *ca)
struct bch_dev_usage bch_dev_usage_read(struct cache *ca)
{
return bucket_stats_read_cached(ca->set,
ca->bucket_stats_cached,
ca->bucket_stats_percpu);
}
struct bucket_stats_cache_set
__bch_bucket_stats_read_cache_set(struct cache_set *c)
struct bch_fs_usage
__bch_fs_usage_read(struct cache_set *c)
{
return bucket_stats_read_raw(c->bucket_stats_percpu);
}
struct bucket_stats_cache_set
bch_bucket_stats_read_cache_set(struct cache_set *c)
struct bch_fs_usage
bch_fs_usage_read(struct cache_set *c)
{
return bucket_stats_read_cached(c,
c->bucket_stats_cached,
@ -205,7 +205,7 @@ static inline int is_cached_bucket(struct bucket_mark m)
}
void bch_fs_stats_apply(struct cache_set *c,
struct bucket_stats_cache_set *stats,
struct bch_fs_usage *stats,
struct disk_reservation *disk_res,
struct gc_pos gc_pos)
{
@ -251,11 +251,11 @@ static bool bucket_became_unavailable(struct cache_set *c,
}
static void bucket_stats_update(struct cache *ca,
struct bucket_mark old, struct bucket_mark new,
struct bucket_stats_cache_set *bch_alloc_stats)
struct bucket_mark old, struct bucket_mark new,
struct bch_fs_usage *bch_alloc_stats)
{
struct cache_set *c = ca->set;
struct bucket_stats_cache *cache_stats;
struct bch_dev_usage *cache_stats;
bch_fs_inconsistent_on(old.data_type && new.data_type &&
old.data_type != new.data_type, c,
@ -305,7 +305,7 @@ static void bucket_stats_update(struct cache *ca,
#define bucket_data_cmpxchg(ca, g, new, expr) \
({ \
struct bucket_stats_cache_set _stats = { 0 }; \
struct bch_fs_usage _stats = { 0 }; \
struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
\
bucket_stats_update(ca, _old, new, &_stats); \
@ -314,7 +314,7 @@ static void bucket_stats_update(struct cache *ca,
void bch_invalidate_bucket(struct cache *ca, struct bucket *g)
{
struct bucket_stats_cache_set stats = { 0 };
struct bch_fs_usage stats = { 0 };
struct bucket_mark old, new;
old = bucket_cmpxchg(g, new, ({
@ -441,18 +441,18 @@ static unsigned __compressed_sectors(const union bch_extent_crc *crc, unsigned s
*/
static void bch_mark_pointer(struct cache_set *c,
struct bkey_s_c_extent e,
struct cache *ca,
const union bch_extent_crc *crc,
const struct bch_extent_ptr *ptr,
s64 sectors, enum s_alloc type,
bool may_make_unavailable,
struct bucket_stats_cache_set *stats,
struct bch_fs_usage *stats,
bool gc_will_visit, u64 journal_seq)
{
struct bucket_mark old, new;
unsigned saturated;
struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr);
u64 v = READ_ONCE(g->_mark.counter);
struct cache *ca;
struct bucket *g;
u64 v;
unsigned old_sectors, new_sectors;
int disk_sectors, compressed_sectors;
@ -469,6 +469,12 @@ static void bch_mark_pointer(struct cache_set *c,
compressed_sectors = -__compressed_sectors(crc, old_sectors)
+ __compressed_sectors(crc, new_sectors);
ca = PTR_CACHE(c, ptr);
if (!ca)
goto out;
g = ca->buckets + PTR_BUCKET_NR(ca, ptr);
if (gc_will_visit) {
if (journal_seq)
bucket_cmpxchg(g, new, new.journal_seq = journal_seq);
@ -476,6 +482,7 @@ static void bch_mark_pointer(struct cache_set *c,
goto out;
}
v = READ_ONCE(g->_mark.counter);
do {
new.counter = old.counter = v;
saturated = 0;
@ -548,33 +555,29 @@ out:
static void bch_mark_extent(struct cache_set *c, struct bkey_s_c_extent e,
s64 sectors, bool metadata,
bool may_make_unavailable,
struct bucket_stats_cache_set *stats,
struct bch_fs_usage *stats,
bool gc_will_visit, u64 journal_seq)
{
const struct bch_extent_ptr *ptr;
const union bch_extent_crc *crc;
struct cache *ca;
enum s_alloc type = metadata ? S_META : S_DIRTY;
BUG_ON(metadata && bkey_extent_is_cached(e.k));
BUG_ON(!sectors);
rcu_read_lock();
extent_for_each_online_device_crc(c, e, crc, ptr, ca) {
trace_bcache_mark_bucket(ca, e.k, ptr, sectors, !ptr->cached);
bch_mark_pointer(c, e, ca, crc, ptr, sectors,
extent_for_each_ptr_crc(e, ptr, crc)
bch_mark_pointer(c, e, crc, ptr, sectors,
ptr->cached ? S_CACHED : type,
may_make_unavailable,
stats, gc_will_visit, journal_seq);
}
rcu_read_unlock();
}
static void __bch_mark_key(struct cache_set *c, struct bkey_s_c k,
s64 sectors, bool metadata,
bool may_make_unavailable,
struct bucket_stats_cache_set *stats,
struct bch_fs_usage *stats,
bool gc_will_visit, u64 journal_seq)
{
switch (k.k->type) {
@ -595,7 +598,7 @@ static void __bch_mark_key(struct cache_set *c, struct bkey_s_c k,
void __bch_gc_mark_key(struct cache_set *c, struct bkey_s_c k,
s64 sectors, bool metadata,
struct bucket_stats_cache_set *stats)
struct bch_fs_usage *stats)
{
__bch_mark_key(c, k, sectors, metadata, true, stats, false, 0);
}
@ -603,7 +606,7 @@ void __bch_gc_mark_key(struct cache_set *c, struct bkey_s_c k,
void bch_gc_mark_key(struct cache_set *c, struct bkey_s_c k,
s64 sectors, bool metadata)
{
struct bucket_stats_cache_set stats = { 0 };
struct bch_fs_usage stats = { 0 };
__bch_gc_mark_key(c, k, sectors, metadata, &stats);
@ -614,7 +617,7 @@ void bch_gc_mark_key(struct cache_set *c, struct bkey_s_c k,
void bch_mark_key(struct cache_set *c, struct bkey_s_c k,
s64 sectors, bool metadata, struct gc_pos gc_pos,
struct bucket_stats_cache_set *stats, u64 journal_seq)
struct bch_fs_usage *stats, u64 journal_seq)
{
/*
* synchronization w.r.t. GC:
@ -693,7 +696,7 @@ int bch_disk_reservation_add(struct cache_set *c,
struct disk_reservation *res,
unsigned sectors, int flags)
{
struct bucket_stats_cache_set *stats;
struct bch_fs_usage *stats;
u64 old, new, v;
s64 sectors_available;
int ret;

View File

@ -157,11 +157,11 @@ static inline unsigned bucket_sectors_used(struct bucket *g)
/* Per device stats: */
struct bucket_stats_cache __bch_bucket_stats_read_cache(struct cache *);
struct bucket_stats_cache bch_bucket_stats_read_cache(struct cache *);
struct bch_dev_usage __bch_dev_usage_read(struct cache *);
struct bch_dev_usage bch_dev_usage_read(struct cache *);
static inline u64 __buckets_available_cache(struct cache *ca,
struct bucket_stats_cache stats)
struct bch_dev_usage stats)
{
return max_t(s64, 0,
ca->mi.nbuckets - ca->mi.first_bucket -
@ -175,11 +175,11 @@ static inline u64 __buckets_available_cache(struct cache *ca,
*/
static inline u64 buckets_available_cache(struct cache *ca)
{
return __buckets_available_cache(ca, bch_bucket_stats_read_cache(ca));
return __buckets_available_cache(ca, bch_dev_usage_read(ca));
}
static inline u64 __buckets_free_cache(struct cache *ca,
struct bucket_stats_cache stats)
struct bch_dev_usage stats)
{
return __buckets_available_cache(ca, stats) +
fifo_used(&ca->free[RESERVE_NONE]) +
@ -188,21 +188,19 @@ static inline u64 __buckets_free_cache(struct cache *ca,
static inline u64 buckets_free_cache(struct cache *ca)
{
return __buckets_free_cache(ca, bch_bucket_stats_read_cache(ca));
return __buckets_free_cache(ca, bch_dev_usage_read(ca));
}
/* Cache set stats: */
struct bucket_stats_cache_set __bch_bucket_stats_read_cache_set(struct cache_set *);
struct bucket_stats_cache_set bch_bucket_stats_read_cache_set(struct cache_set *);
void bch_fs_stats_apply(struct cache_set *,
struct bucket_stats_cache_set *,
struct disk_reservation *,
struct gc_pos);
struct bch_fs_usage __bch_fs_usage_read(struct cache_set *);
struct bch_fs_usage bch_fs_usage_read(struct cache_set *);
void bch_fs_stats_apply(struct cache_set *, struct bch_fs_usage *,
struct disk_reservation *, struct gc_pos);
static inline u64 __bch_fs_sectors_used(struct cache_set *c)
{
struct bucket_stats_cache_set stats = __bch_bucket_stats_read_cache_set(c);
struct bch_fs_usage stats = __bch_fs_usage_read(c);
u64 reserved = stats.persistent_reserved +
stats.online_reserved;
@ -256,10 +254,10 @@ void bch_mark_metadata_bucket(struct cache *, struct bucket *,
enum bucket_data_type, bool);
void __bch_gc_mark_key(struct cache_set *, struct bkey_s_c, s64, bool,
struct bucket_stats_cache_set *);
struct bch_fs_usage *);
void bch_gc_mark_key(struct cache_set *, struct bkey_s_c, s64, bool);
void bch_mark_key(struct cache_set *, struct bkey_s_c, s64, bool,
struct gc_pos, struct bucket_stats_cache_set *, u64);
struct gc_pos, struct bch_fs_usage *, u64);
void bch_recalc_sectors_available(struct cache_set *);

View File

@ -65,7 +65,7 @@ struct bucket {
};
};
struct bucket_stats_cache {
struct bch_dev_usage {
u64 buckets_dirty;
u64 buckets_cached;
u64 buckets_meta;
@ -89,7 +89,7 @@ enum s_compressed {
S_COMPRESSED_NR,
};
struct bucket_stats_cache_set {
struct bch_fs_usage {
/* all fields are in units of 512 byte sectors: */
u64 s[S_COMPRESSED_NR][S_ALLOC_NR];
u64 persistent_reserved;

View File

@ -173,17 +173,16 @@ static long bch_ioctl_disk_remove(struct cache_set *c,
if (IS_ERR(ca))
return PTR_ERR(ca);
ret = bch_dev_remove(ca, arg.flags & BCH_FORCE_IF_DATA_MISSING)
? 0 : -EBUSY;
ret = bch_dev_remove(c, ca, arg.flags);
percpu_ref_put(&ca->ref);
return ret;
}
static long bch_ioctl_disk_fail(struct cache_set *c,
struct bch_ioctl_disk_fail __user *user_arg)
static long bch_ioctl_disk_set_state(struct cache_set *c,
struct bch_ioctl_disk_set_state __user *user_arg)
{
struct bch_ioctl_disk_fail arg;
struct bch_ioctl_disk_set_state arg;
struct cache *ca;
int ret;
@ -194,8 +193,7 @@ static long bch_ioctl_disk_fail(struct cache_set *c,
if (IS_ERR(ca))
return PTR_ERR(ca);
/* XXX: failed not actually implemented yet */
ret = bch_dev_remove(ca, true);
ret = bch_dev_set_state(c, ca, arg.new_state, arg.flags);
percpu_ref_put(&ca->ref);
return ret;
@ -288,8 +286,8 @@ long bch_fs_ioctl(struct cache_set *c, unsigned cmd, void __user *arg)
return bch_ioctl_disk_add(c, arg);
case BCH_IOCTL_DISK_REMOVE:
return bch_ioctl_disk_remove(c, arg);
case BCH_IOCTL_DISK_FAIL:
return bch_ioctl_disk_fail(c, arg);
case BCH_IOCTL_DISK_SET_STATE:
return bch_ioctl_disk_set_state(c, arg);
case BCH_IOCTL_DISK_REMOVE_BY_UUID:
return bch_ioctl_disk_remove_by_uuid(c, arg);

View File

@ -121,9 +121,11 @@ void bch_nonfatal_io_error_work(struct work_struct *work)
bch_notify_dev_error(ca, true);
mutex_lock(&c->state_lock);
dev = bch_dev_may_remove(ca);
dev = bch_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO,
BCH_FORCE_IF_DEGRADED);
if (dev
? bch_dev_read_only(ca)
? __bch_dev_set_state(c, ca, BCH_MEMBER_STATE_RO,
BCH_FORCE_IF_DEGRADED)
: bch_fs_emergency_read_only(c))
bch_err(c,
"too many IO errors on %s, setting %s RO",

View File

@ -622,6 +622,9 @@ bch_btree_pick_ptr(struct cache_set *c, const struct btree *b)
PTR_BUCKET_NR(ca, ptr)))
continue;
if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
continue;
if (pick.ca && pick.ca->mi.tier < ca->mi.tier)
continue;
@ -938,7 +941,7 @@ struct extent_insert_state {
struct btree_insert *trans;
struct btree_insert_entry *insert;
struct bpos committed;
struct bucket_stats_cache_set stats;
struct bch_fs_usage stats;
/* for deleting: */
struct bkey_i whiteout;
@ -2202,6 +2205,9 @@ void bch_extent_pick_ptr_avoiding(struct cache_set *c, struct bkey_s_c k,
if (ptr_stale(ca, ptr))
continue;
if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
continue;
if (ret->ca &&
(ca == avoid ||
ret->ca->mi.tier < ca->mi.tier))

View File

@ -974,7 +974,9 @@ do_io:
new.reserved = 0;
});
w->io->op.op.res.sectors += PAGE_SECTORS * (old.reserved - new.reserved);
w->io->op.op.res.sectors += PAGE_SECTORS *
(old.reserved - new.reserved) *
old.nr_replicas;
out:
BUG_ON(PageWriteback(page));
set_page_writeback(page);

View File

@ -625,7 +625,9 @@ static void __bch_write(struct closure *cl)
BKEY_EXTENT_U64s_MAX))
continue_at(cl, bch_write_index, index_update_wq(op));
b = bch_alloc_sectors_start(c, op->wp, op->nr_replicas,
b = bch_alloc_sectors_start(c, op->wp,
op->nr_replicas,
c->opts.data_replicas_required,
op->alloc_reserve,
(op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
EBUG_ON(!b);

View File

@ -1319,10 +1319,10 @@ static int journal_entry_sectors(struct journal *j)
}
rcu_read_unlock();
if (nr_online < c->opts.metadata_replicas)
if (nr_online < c->opts.metadata_replicas_required)
return -EROFS;
if (nr_devs < c->opts.metadata_replicas)
if (nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas))
return 0;
return sectors_available;
@ -1540,11 +1540,9 @@ static int bch_set_nr_journal_buckets(struct cache_set *c, struct cache *ca,
closure_init_stack(&cl);
mutex_lock(&c->sb_lock);
/* don't handle reducing nr of buckets yet: */
if (nr <= ja->nr)
goto err;
return 0;
/*
* note: journal buckets aren't really counted as _sectors_ used yet, so
@ -1553,10 +1551,11 @@ static int bch_set_nr_journal_buckets(struct cache_set *c, struct cache *ca,
* reservation to ensure we'll actually be able to allocate:
*/
ret = ENOSPC;
if (bch_disk_reservation_get(c, &disk_res,
(nr - ja->nr) << ca->bucket_bits, 0))
goto err;
return -ENOSPC;
mutex_lock(&c->sb_lock);
ret = -ENOMEM;
new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL);
@ -2040,9 +2039,11 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
j->prev_buf_sectors = 0;
spin_unlock(&j->lock);
if (replicas < replicas_want)
if (replicas < c->opts.metadata_replicas_required)
return -EROFS;
BUG_ON(!replicas);
return 0;
}

View File

@ -11,6 +11,7 @@
#include "keylist.h"
#include "migrate.h"
#include "move.h"
#include "super-io.h"
static int issue_migration_move(struct cache *ca,
struct moving_context *ctxt,
@ -58,12 +59,16 @@ int bch_move_data_off_device(struct cache *ca)
{
struct moving_context ctxt;
struct cache_set *c = ca->set;
struct bch_sb_field_members *mi;
unsigned pass = 0;
u64 seen_key_count;
int ret = 0;
BUG_ON(ca->mi.state == BCH_MEMBER_STATE_ACTIVE);
if (!ca->mi.has_data)
return 0;
bch_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE);
ctxt.avoid = ca;
@ -136,6 +141,13 @@ next:
return -1;
}
mutex_lock(&c->sb_lock);
mi = bch_sb_get_members(c->disk_sb);
SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false);
bch_write_super(c);
mutex_unlock(&c->sb_lock);
return 0;
}
@ -240,11 +252,18 @@ retry:
* is written.
*/
int bch_move_meta_data_off_device(struct cache *ca)
int bch_move_metadata_off_device(struct cache *ca)
{
struct cache_set *c = ca->set;
struct bch_sb_field_members *mi;
unsigned i;
int ret;
BUG_ON(ca->mi.state == BCH_MEMBER_STATE_ACTIVE);
if (!ca->mi.has_metadata)
return 0;
/* 1st, Move the btree nodes off the device */
for (i = 0; i < BTREE_ID_NR; i++) {
@ -261,6 +280,13 @@ int bch_move_meta_data_off_device(struct cache *ca)
if (ret)
return ret;
mutex_lock(&c->sb_lock);
mi = bch_sb_get_members(c->disk_sb);
SET_BCH_MEMBER_HAS_METADATA(&mi->members[ca->dev_idx], false);
bch_write_super(c);
mutex_unlock(&c->sb_lock);
return 0;
}
@ -303,11 +329,11 @@ static int bch_flag_key_bad(struct btree_iter *iter,
* and don't have other valid pointers. If there are valid pointers,
* the necessary pointers to the removed device are replaced with
* bad pointers instead.
*
* This is only called if bch_move_data_off_device above failed, meaning
* that we've already tried to move the data MAX_DATA_OFF_ITER times and
* are not likely to succeed if we try again.
*/
int bch_flag_data_bad(struct cache *ca)
{
int ret = 0;

View File

@ -2,7 +2,7 @@
#define _BCACHE_MIGRATE_H
int bch_move_data_off_device(struct cache *);
int bch_move_meta_data_off_device(struct cache *);
int bch_move_metadata_off_device(struct cache *);
int bch_flag_data_bad(struct cache *);
#endif /* _BCACHE_MIGRATE_H */

View File

@ -52,9 +52,13 @@ enum opt_type {
BCH_OPT(errors, 0644, BCH_SB_ERROR_ACTION, \
s8, OPT_STR(bch_error_actions)) \
BCH_OPT(metadata_replicas, 0444, BCH_SB_META_REPLICAS_WANT,\
s8, OPT_UINT(0, BCH_REPLICAS_MAX)) \
s8, OPT_UINT(1, BCH_REPLICAS_MAX)) \
BCH_OPT(data_replicas, 0444, BCH_SB_DATA_REPLICAS_WANT,\
s8, OPT_UINT(0, BCH_REPLICAS_MAX)) \
s8, OPT_UINT(1, BCH_REPLICAS_MAX)) \
BCH_OPT(metadata_replicas_required, 0444, BCH_SB_META_REPLICAS_REQ,\
s8, OPT_UINT(1, BCH_REPLICAS_MAX)) \
BCH_OPT(data_replicas_required, 0444, BCH_SB_DATA_REPLICAS_REQ,\
s8, OPT_UINT(1, BCH_REPLICAS_MAX)) \
BCH_OPT(metadata_checksum, 0644, BCH_SB_META_CSUM_TYPE, \
s8, OPT_STR(bch_csum_types)) \
BCH_OPT(data_checksum, 0644, BCH_SB_DATA_CSUM_TYPE, \

View File

@ -317,6 +317,10 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb)
BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
return "Invalid number of metadata replicas";
if (!BCH_SB_META_REPLICAS_REQ(sb) ||
BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
return "Invalid number of metadata replicas";
if (!BCH_SB_META_REPLICAS_HAVE(sb) ||
BCH_SB_META_REPLICAS_HAVE(sb) >
BCH_SB_META_REPLICAS_WANT(sb))
@ -326,6 +330,10 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb)
BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
return "Invalid number of data replicas";
if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
return "Invalid number of metadata replicas";
if (!BCH_SB_DATA_REPLICAS_HAVE(sb) ||
BCH_SB_DATA_REPLICAS_HAVE(sb) >
BCH_SB_DATA_REPLICAS_WANT(sb))
@ -831,6 +839,7 @@ void bch_check_mark_super_slowpath(struct cache_set *c, const struct bkey_i *k,
struct bch_member *mi;
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
const struct bch_extent_ptr *ptr;
unsigned nr_replicas = 0;
mutex_lock(&c->sb_lock);
@ -843,10 +852,20 @@ void bch_check_mark_super_slowpath(struct cache_set *c, const struct bkey_i *k,
mi = bch_sb_get_members(c->disk_sb)->members;
extent_for_each_ptr(e, ptr)
if (!ptr->cached)
if (!ptr->cached) {
(meta
? SET_BCH_MEMBER_HAS_METADATA
: SET_BCH_MEMBER_HAS_DATA)(mi + ptr->dev, true);
nr_replicas++;
}
nr_replicas = min_t(unsigned, nr_replicas,
(meta
? BCH_SB_META_REPLICAS_HAVE
: BCH_SB_DATA_REPLICAS_HAVE)(c->disk_sb));
(meta
? SET_BCH_SB_META_REPLICAS_HAVE
: SET_BCH_SB_DATA_REPLICAS_HAVE)(c->disk_sb, nr_replicas);
bch_write_super(c);
mutex_unlock(&c->sb_lock);

View File

@ -129,17 +129,27 @@ static inline bool bch_check_super_marked(struct cache_set *c,
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
const struct bch_extent_ptr *ptr;
struct cache_member_cpu *mi = cache_member_info_get(c)->m;
unsigned nr_replicas = 0;
bool ret = true;
extent_for_each_ptr(e, ptr)
if (!ptr->cached &&
!(meta
extent_for_each_ptr(e, ptr) {
if (ptr->cached)
continue;
if (!(meta
? mi[ptr->dev].has_metadata
: mi[ptr->dev].has_data)) {
ret = false;
break;
}
nr_replicas++;
}
if (nr_replicas <
(meta ? c->sb.meta_replicas_have : c->sb.data_replicas_have))
ret = false;
cache_member_info_put();
return ret;

View File

@ -616,7 +616,7 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->sb.btree_node_size,
BCH_ENCODED_EXTENT_MAX) /
PAGE_SECTORS, 0) ||
!(c->bucket_stats_percpu = alloc_percpu(struct bucket_stats_cache_set)) ||
!(c->bucket_stats_percpu = alloc_percpu(struct bch_fs_usage)) ||
lg_lock_init(&c->bucket_stats_lock) ||
mempool_init_page_pool(&c->btree_bounce_pool, 1,
ilog2(btree_pages(c))) ||
@ -1015,104 +1015,7 @@ static const char *bch_dev_in_fs(struct bch_sb *sb, struct cache_set *c)
return NULL;
}
/* Device startup/shutdown, ro/rw: */
bool bch_dev_read_only(struct cache *ca)
{
struct cache_set *c = ca->set;
struct bch_sb_field_members *mi;
char buf[BDEVNAME_SIZE];
bdevname(ca->disk_sb.bdev, buf);
lockdep_assert_held(&c->state_lock);
if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE)
return false;
if (!bch_dev_may_remove(ca)) {
bch_err(c, "required member %s going RO, forcing fs RO", buf);
bch_fs_read_only(c);
}
trace_bcache_cache_read_only(ca);
bch_moving_gc_stop(ca);
/*
* This stops new data writes (e.g. to existing open data
* buckets) and then waits for all existing writes to
* complete.
*/
bch_dev_allocator_stop(ca);
bch_dev_group_remove(&c->journal.devs, ca);
/*
* Device data write barrier -- no non-meta-data writes should
* occur after this point. However, writes to btree buckets,
* journal buckets, and the superblock can still occur.
*/
trace_bcache_cache_read_only_done(ca);
bch_notice(c, "%s read only", bdevname(ca->disk_sb.bdev, buf));
bch_notify_dev_read_only(ca);
mutex_lock(&c->sb_lock);
mi = bch_sb_get_members(c->disk_sb);
SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx],
BCH_MEMBER_STATE_RO);
bch_write_super(c);
mutex_unlock(&c->sb_lock);
return true;
}
static const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca)
{
lockdep_assert_held(&c->state_lock);
if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE)
return NULL;
if (test_bit(BCH_DEV_REMOVING, &ca->flags))
return "removing";
trace_bcache_cache_read_write(ca);
if (bch_dev_allocator_start(ca))
return "error starting allocator thread";
if (bch_moving_gc_start(ca))
return "error starting moving GC thread";
if (bch_tiering_start(c))
return "error starting tiering thread";
bch_notify_dev_read_write(ca);
trace_bcache_cache_read_write_done(ca);
return NULL;
}
const char *bch_dev_read_write(struct cache *ca)
{
struct cache_set *c = ca->set;
struct bch_sb_field_members *mi;
const char *err;
err = __bch_dev_read_write(c, ca);
if (err)
return err;
mutex_lock(&c->sb_lock);
mi = bch_sb_get_members(c->disk_sb);
SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx],
BCH_MEMBER_STATE_ACTIVE);
bch_write_super(c);
mutex_unlock(&c->sb_lock);
return NULL;
}
/* Device startup/shutdown: */
void bch_dev_release(struct kobject *kobj)
{
@ -1209,148 +1112,6 @@ static void bch_dev_stop(struct cache *ca)
call_rcu(&ca->free_rcu, bch_dev_free_rcu);
}
static void bch_dev_remove_work(struct work_struct *work)
{
struct cache *ca = container_of(work, struct cache, remove_work);
struct bch_sb_field_members *mi;
struct cache_set *c = ca->set;
char name[BDEVNAME_SIZE];
bool force = test_bit(BCH_DEV_FORCE_REMOVE, &ca->flags);
unsigned dev_idx = ca->dev_idx;
bdevname(ca->disk_sb.bdev, name);
/*
* Device should already be RO, now migrate data off:
*
* XXX: locking is sketchy, bch_dev_read_write() has to check
* BCH_DEV_REMOVING bit
*/
if (!ca->mi.has_data) {
/* Nothing to do: */
} else if (!bch_move_data_off_device(ca)) {
mutex_lock(&c->sb_lock);
mi = bch_sb_get_members(c->disk_sb);
SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false);
bch_write_super(c);
mutex_unlock(&c->sb_lock);
} else if (force) {
bch_flag_data_bad(ca);
mutex_lock(&c->sb_lock);
mi = bch_sb_get_members(c->disk_sb);
SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false);
bch_write_super(c);
mutex_unlock(&c->sb_lock);
} else {
bch_err(c, "Remove of %s failed, unable to migrate data off",
name);
clear_bit(BCH_DEV_REMOVING, &ca->flags);
return;
}
/* Now metadata: */
if (!ca->mi.has_metadata) {
/* Nothing to do: */
} else if (!bch_move_meta_data_off_device(ca)) {
mutex_lock(&c->sb_lock);
mi = bch_sb_get_members(c->disk_sb);
SET_BCH_MEMBER_HAS_METADATA(&mi->members[ca->dev_idx], false);
bch_write_super(c);
mutex_unlock(&c->sb_lock);
} else {
bch_err(c, "Remove of %s failed, unable to migrate metadata off",
name);
clear_bit(BCH_DEV_REMOVING, &ca->flags);
return;
}
/*
* Ok, really doing the remove:
* Drop device's prio pointer before removing it from superblock:
*/
bch_notify_dev_removed(ca);
spin_lock(&c->journal.lock);
c->journal.prio_buckets[dev_idx] = 0;
spin_unlock(&c->journal.lock);
bch_journal_meta(&c->journal);
/*
* Stop device before removing it from the cache set's list of devices -
* and get our own ref on cache set since ca is going away:
*/
closure_get(&c->cl);
mutex_lock(&c->state_lock);
bch_dev_stop(ca);
/*
* RCU barrier between dropping from c->cache and dropping from
* member info:
*/
synchronize_rcu();
/*
* Free this device's slot in the bch_member array - all pointers to
* this device must be gone:
*/
mutex_lock(&c->sb_lock);
mi = bch_sb_get_members(c->disk_sb);
memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
bch_write_super(c);
mutex_unlock(&c->sb_lock);
mutex_unlock(&c->state_lock);
closure_put(&c->cl);
}
static bool __bch_dev_remove(struct cache_set *c, struct cache *ca, bool force)
{
if (test_bit(BCH_DEV_REMOVING, &ca->flags))
return false;
if (!bch_dev_may_remove(ca)) {
bch_err(ca->set, "Can't remove last RW device");
bch_notify_dev_remove_failed(ca);
return false;
}
/* First, go RO before we try to migrate data off: */
bch_dev_read_only(ca);
if (force)
set_bit(BCH_DEV_FORCE_REMOVE, &ca->flags);
set_bit(BCH_DEV_REMOVING, &ca->flags);
bch_notify_dev_removing(ca);
/* Migrate the data and finish removal asynchronously: */
queue_work(system_long_wq, &ca->remove_work);
return true;
}
bool bch_dev_remove(struct cache *ca, bool force)
{
struct cache_set *c = ca->set;
bool ret;
mutex_lock(&c->state_lock);
ret = __bch_dev_remove(c, ca, force);
mutex_unlock(&c->state_lock);
return ret;
}
static int bch_dev_online(struct cache *ca)
{
char buf[12];
@ -1402,7 +1163,6 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
ca->dev_idx = sb->sb->dev_idx;
INIT_WORK(&ca->free_work, bch_dev_free_work);
INIT_WORK(&ca->remove_work, bch_dev_remove_work);
spin_lock_init(&ca->freelist_lock);
spin_lock_init(&ca->prio_buckets_lock);
mutex_init(&ca->heap_lock);
@ -1451,7 +1211,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
!(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
2, GFP_KERNEL)) ||
!(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) ||
!(ca->bucket_stats_percpu = alloc_percpu(struct bucket_stats_cache)) ||
!(ca->bucket_stats_percpu = alloc_percpu(struct bch_dev_usage)) ||
!(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) ||
bioset_init(&ca->replica_set, 4,
offsetof(struct bch_write_bio, bio)) ||
@ -1506,6 +1266,232 @@ err:
return err;
}
/* Device management: */
static void __bch_dev_read_only(struct cache_set *c, struct cache *ca)
{
bch_moving_gc_stop(ca);
/*
* This stops new data writes (e.g. to existing open data
* buckets) and then waits for all existing writes to
* complete.
*/
bch_dev_allocator_stop(ca);
bch_dev_group_remove(&c->journal.devs, ca);
}
static const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca)
{
lockdep_assert_held(&c->state_lock);
if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE)
return NULL;
trace_bcache_cache_read_write(ca);
if (bch_dev_allocator_start(ca))
return "error starting allocator thread";
if (bch_moving_gc_start(ca))
return "error starting moving GC thread";
if (bch_tiering_start(c))
return "error starting tiering thread";
bch_notify_dev_read_write(ca);
trace_bcache_cache_read_write_done(ca);
return NULL;
}
bool bch_dev_state_allowed(struct cache_set *c, struct cache *ca,
enum bch_member_state new_state, int flags)
{
lockdep_assert_held(&c->state_lock);
if (new_state == BCH_MEMBER_STATE_ACTIVE)
return true;
if (ca->mi.has_data &&
!(flags & BCH_FORCE_IF_DATA_DEGRADED))
return false;
if (ca->mi.has_data &&
c->sb.data_replicas_have <= 1 &&
!(flags & BCH_FORCE_IF_DATA_LOST))
return false;
if (ca->mi.has_metadata &&
!(flags & BCH_FORCE_IF_METADATA_DEGRADED))
return false;
if (ca->mi.has_metadata &&
c->sb.meta_replicas_have <= 1 &&
!(flags & BCH_FORCE_IF_METADATA_LOST))
return false;
return true;
}
int __bch_dev_set_state(struct cache_set *c, struct cache *ca,
enum bch_member_state new_state, int flags)
{
struct bch_sb_field_members *mi;
char buf[BDEVNAME_SIZE];
if (ca->mi.state == new_state)
return 0;
if (!bch_dev_state_allowed(c, ca, new_state, flags))
return -EINVAL;
if (new_state == BCH_MEMBER_STATE_ACTIVE) {
if (__bch_dev_read_write(c, ca))
return -ENOMEM;
} else {
__bch_dev_read_only(c, ca);
}
bch_notice(c, "%s %s",
bdevname(ca->disk_sb.bdev, buf),
bch_dev_state[new_state]);
mutex_lock(&c->sb_lock);
mi = bch_sb_get_members(c->disk_sb);
SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state);
bch_write_super(c);
mutex_unlock(&c->sb_lock);
return 0;
}
int bch_dev_set_state(struct cache_set *c, struct cache *ca,
enum bch_member_state new_state, int flags)
{
int ret;
mutex_lock(&c->state_lock);
ret = __bch_dev_set_state(c, ca, new_state, flags);
mutex_unlock(&c->state_lock);
return ret;
}
#if 0
int bch_dev_migrate_from(struct cache_set *c, struct cache *ca)
{
/* First, go RO before we try to migrate data off: */
ret = bch_dev_set_state(c, ca, BCH_MEMBER_STATE_RO, flags);
if (ret)
return ret;
bch_notify_dev_removing(ca);
/* Migrate data, metadata off device: */
ret = bch_move_data_off_device(ca);
if (ret && !(flags & BCH_FORCE_IF_DATA_LOST)) {
bch_err(c, "Remove of %s failed, unable to migrate data off",
name);
return ret;
}
if (ret)
ret = bch_flag_data_bad(ca);
if (ret) {
bch_err(c, "Remove of %s failed, unable to migrate data off",
name);
return ret;
}
ret = bch_move_metadata_off_device(ca);
if (ret)
return ret;
}
#endif
/* Device add/removal: */
static int __bch_dev_remove(struct cache_set *c, struct cache *ca, int flags)
{
struct bch_sb_field_members *mi;
char name[BDEVNAME_SIZE];
unsigned dev_idx = ca->dev_idx;
int ret;
bdevname(ca->disk_sb.bdev, name);
if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE) {
bch_err(ca->set, "Cannot remove RW device");
bch_notify_dev_remove_failed(ca);
return -EINVAL;
}
if (!bch_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
bch_err(ca->set, "Cannot remove %s without losing data", name);
bch_notify_dev_remove_failed(ca);
return -EINVAL;
}
/*
* XXX: verify that dev_idx is really not in use anymore, anywhere
*
* flag_data_bad() does not check btree pointers
*/
ret = bch_flag_data_bad(ca);
if (ret) {
bch_err(c, "Remove of %s failed", name);
return ret;
}
/*
* Ok, really doing the remove:
* Drop device's prio pointer before removing it from superblock:
*/
bch_notify_dev_removed(ca);
spin_lock(&c->journal.lock);
c->journal.prio_buckets[dev_idx] = 0;
spin_unlock(&c->journal.lock);
bch_journal_meta(&c->journal);
bch_dev_stop(ca);
/*
* RCU barrier between dropping from c->cache and dropping from
* member info:
*/
synchronize_rcu();
/*
* Free this device's slot in the bch_member array - all pointers to
* this device must be gone:
*/
mutex_lock(&c->sb_lock);
mi = bch_sb_get_members(c->disk_sb);
memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
bch_write_super(c);
mutex_unlock(&c->sb_lock);
return 0;
}
int bch_dev_remove(struct cache_set *c, struct cache *ca, int flags)
{
int ret;
mutex_lock(&c->state_lock);
ret = __bch_dev_remove(c, ca, flags);
mutex_unlock(&c->state_lock);
return ret;
}
int bch_dev_add(struct cache_set *c, const char *path)
{
struct bcache_superblock sb;
@ -1626,6 +1612,8 @@ err_unlock:
return ret ?: -EINVAL;
}
/* Filesystem open: */
const char *bch_fs_open(char * const *devices, unsigned nr_devices,
struct bch_opts opts, struct cache_set **ret)
{

View File

@ -3,6 +3,8 @@
#include "extents.h"
#include <linux/bcache-ioctl.h>
static inline size_t sector_to_bucket(const struct cache *ca, sector_t s)
{
return s >> ca->bucket_bits;
@ -54,21 +56,17 @@ static inline struct cache *bch_get_next_cache(struct cache_set *c,
(ca = bch_get_next_cache(c, &(iter))); \
percpu_ref_put(&ca->ref), (iter)++)
static inline bool bch_dev_may_remove(struct cache *ca)
{
struct cache_set *c = ca->set;
struct cache_group *grp = &c->cache_all;
/* Can't remove the last RW device: */
return grp->nr != 1 ||
rcu_access_pointer(grp->d[0].dev) != ca;
}
void bch_dev_release(struct kobject *);
bool bch_dev_read_only(struct cache *);
const char *bch_dev_read_write(struct cache *);
bool bch_dev_remove(struct cache *, bool force);
bool bch_dev_state_allowed(struct cache_set *, struct cache *,
enum bch_member_state, int);
int __bch_dev_set_state(struct cache_set *, struct cache *,
enum bch_member_state, int);
int bch_dev_set_state(struct cache_set *, struct cache *,
enum bch_member_state, int);
int bch_dev_fail(struct cache *, int);
int bch_dev_remove(struct cache_set *, struct cache *, int);
int bch_dev_add(struct cache_set *, const char *);
void bch_fs_detach(struct cache_set *);

View File

@ -159,7 +159,7 @@ read_attribute(data_replicas_have);
static struct attribute sysfs_state_rw = {
.name = "state",
.mode = S_IRUGO|S_IWUSR
.mode = S_IRUGO
};
SHOW(bch_cached_dev)
@ -552,7 +552,7 @@ static unsigned bch_average_key_size(struct cache_set *c)
static ssize_t show_fs_alloc_debug(struct cache_set *c, char *buf)
{
struct bucket_stats_cache_set stats = bch_bucket_stats_read_cache_set(c);
struct bch_fs_usage stats = bch_fs_usage_read(c);
return scnprintf(buf, PAGE_SIZE,
"capacity:\t\t%llu\n"
@ -1127,7 +1127,7 @@ static ssize_t show_reserve_stats(struct cache *ca, char *buf)
static ssize_t show_dev_alloc_debug(struct cache *ca, char *buf)
{
struct cache_set *c = ca->set;
struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca);
struct bch_dev_usage stats = bch_dev_usage_read(ca);
return scnprintf(buf, PAGE_SIZE,
"free_inc: %zu/%zu\n"
@ -1171,7 +1171,7 @@ SHOW(bch_dev)
{
struct cache *ca = container_of(kobj, struct cache, kobj);
struct cache_set *c = ca->set;
struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca);
struct bch_dev_usage stats = bch_dev_usage_read(ca);
sysfs_printf(uuid, "%pU\n", ca->uuid.b);
@ -1297,52 +1297,6 @@ STORE(__bch_dev)
bch_tiering_start(c);
}
if (attr == &sysfs_state_rw) {
char name[BDEVNAME_SIZE];
const char *err = NULL;
ssize_t v = bch_read_string_list(buf, bch_dev_state);
if (v < 0)
return v;
if (v == ca->mi.state)
return size;
switch (v) {
case BCH_MEMBER_STATE_ACTIVE:
err = bch_dev_read_write(ca);
break;
case BCH_MEMBER_STATE_RO:
bch_dev_read_only(ca);
break;
case BCH_MEMBER_STATE_FAILED:
case BCH_MEMBER_STATE_SPARE:
/*
* XXX: need to migrate data off and set correct state
*/
pr_err("can't set %s %s: not supported",
bdevname(ca->disk_sb.bdev, name),
bch_dev_state[v]);
return -EINVAL;
}
if (err) {
pr_err("can't set %s %s: %s",
bdevname(ca->disk_sb.bdev, name),
bch_dev_state[v], err);
return -EINVAL;
}
}
if (attr == &sysfs_unregister) {
bool force = false;
if (!strncmp(buf, "force", 5) &&
(buf[5] == '\0' || buf[5] == '\n'))
force = true;
bch_dev_remove(ca, force);
}
if (attr == &sysfs_clear_stats) {
int cpu;
@ -1361,7 +1315,6 @@ STORE_LOCKED(bch_dev)
static struct attribute *bch_dev_files[] = {
&sysfs_uuid,
&sysfs_unregister,
&sysfs_bucket_size,
&sysfs_bucket_size_bytes,
&sysfs_block_size,