diff --git a/.bcache_revision b/.bcache_revision index 58bdf2da..b86381a1 100644 --- a/.bcache_revision +++ b/.bcache_revision @@ -1 +1 @@ -BCACHE_REVISION=aa4471ac314a1f117957f9fc59c1bfbdf965a28c +BCACHE_REVISION=c1f1a9e1d9b9664db9c9c03cbac455c2750335bc diff --git a/Makefile b/Makefile index 2defed04..682bf8e7 100644 --- a/Makefile +++ b/Makefile @@ -56,6 +56,7 @@ OBJS=bcache.o \ cmd_fsck.o \ cmd_format.o \ cmd_key.o \ + cmd_migrate.o \ cmd_run.o \ crypto.o \ libbcache.o \ diff --git a/bcache.c b/bcache.c index ac9eb07e..a0fa860f 100644 --- a/bcache.c +++ b/bcache.c @@ -50,7 +50,12 @@ static void usage(void) "\n" "Debug:\n" " bcache dump Dump filesystem metadata to a qcow2 image\n" - " bcache list List filesystem metadata in textual form\n"); + " bcache list List filesystem metadata in textual form\n" + "\n" + "Migrate:\n" + " bcache migrate Migrate an existing filesystem to bcachefs, in place\n" + " bcache migrate_superblock\n" + " Add default superblock, after bcache migrate\n"); } int main(int argc, char *argv[]) @@ -104,6 +109,11 @@ int main(int argc, char *argv[]) if (!strcmp(cmd, "list")) return cmd_list(argc, argv); + if (!strcmp(cmd, "migrate")) + return cmd_migrate(argc, argv); + if (!strcmp(cmd, "migrate_superblock")) + return cmd_migrate_superblock(argc, argv); + usage(); return 0; } diff --git a/cmd_debug.c b/cmd_debug.c index 4f2586d4..ca0f4530 100644 --- a/cmd_debug.c +++ b/cmd_debug.c @@ -30,35 +30,35 @@ static void dump_usage(void) static void dump_one_device(struct cache_set *c, struct cache *ca, int fd) { struct bch_sb *sb = ca->disk_sb.sb; - sparse_data data; + ranges data; unsigned i; darray_init(data); /* Superblock: */ - data_add(&data, BCH_SB_LAYOUT_SECTOR << 9, - sizeof(struct bch_sb_layout)); + range_add(&data, BCH_SB_LAYOUT_SECTOR << 9, + sizeof(struct bch_sb_layout)); for (i = 0; i < sb->layout.nr_superblocks; i++) - data_add(&data, - le64_to_cpu(sb->layout.sb_offset[i]) << 9, - vstruct_bytes(sb)); + range_add(&data, + le64_to_cpu(sb->layout.sb_offset[i]) << 9, + vstruct_bytes(sb)); /* Journal: */ for (i = 0; i < ca->journal.nr; i++) if (ca->journal.bucket_seq[i] >= c->journal.last_seq_ondisk) { u64 bucket = ca->journal.buckets[i]; - data_add(&data, - bucket_bytes(ca) * bucket, - bucket_bytes(ca)); + range_add(&data, + bucket_bytes(ca) * bucket, + bucket_bytes(ca)); } /* Prios/gens: */ for (i = 0; i < prio_buckets(ca); i++) - data_add(&data, - bucket_bytes(ca) * ca->prio_last_buckets[i], - bucket_bytes(ca)); + range_add(&data, + bucket_bytes(ca) * ca->prio_last_buckets[i], + bucket_bytes(ca)); /* Btree: */ for (i = 0; i < BTREE_ID_NR; i++) { @@ -71,9 +71,9 @@ static void dump_one_device(struct cache_set *c, struct cache *ca, int fd) extent_for_each_ptr(e, ptr) if (ptr->dev == ca->dev_idx) - data_add(&data, - ptr->offset << 9, - b->written << 9); + range_add(&data, + ptr->offset << 9, + b->written << 9); } bch_btree_iter_unlock(&iter); } @@ -87,7 +87,7 @@ int cmd_dump(int argc, char *argv[]) struct bch_opts opts = bch_opts_empty(); struct cache_set *c = NULL; const char *err; - char *out = NULL, *buf; + char *out = NULL; unsigned i, nr_devices = 0; bool force = false; int fd, opt; @@ -116,9 +116,6 @@ int cmd_dump(int argc, char *argv[]) if (!out) die("Please supply output filename"); - buf = alloca(strlen(out) + 10); - strcpy(buf, out); - err = bch_fs_open(argv + optind, argc - optind, opts, &c); if (err) die("error opening %s: %s", argv[optind], err); @@ -140,12 +137,11 @@ int cmd_dump(int argc, char *argv[]) if (!c->cache[i]) continue; - if (nr_devices > 1) - 
sprintf(buf, "%s.%u", out, i); - - fd = open(buf, mode, 0600); - if (fd < 0) - die("error opening %s: %s", buf, strerror(errno)); + char *path = nr_devices > 1 + ? mprintf("%s.%u", out, i) + : strdup(out); + fd = xopen(path, mode, 0600); + free(path); dump_one_device(c, c->cache[i], fd); close(fd); @@ -153,7 +149,7 @@ int cmd_dump(int argc, char *argv[]) up_read(&c->gc_lock); - bch_fs_stop_sync(c); + bch_fs_stop(c); return 0; } @@ -213,14 +209,20 @@ static void list_keys_usage(void) "Usage: bcache list_keys [OPTION]... <devices>\n" "\n" "Options:\n" - " -b btree_id Integer btree id to list\n" - " -s start Start pos (as inode:offset)\n" - " -e end End pos\n" - " -m mode Mode for listing\n" - " -h Display this help and exit\n" + " -b (extents|inodes|dirents|xattrs) Btree to list from\n" + " -s inode:offset Start position to list from\n" + " -e inode:offset End position\n" + " -m (keys|formats) List mode\n" + " -h Display this help and exit\n" "Report bugs to <linux-bcache@vger.kernel.org>"); } +static const char * const list_modes[] = { + "keys", + "formats", + NULL +}; + int cmd_list(int argc, char *argv[]) { struct bch_opts opts = bch_opts_empty(); @@ -229,7 +231,6 @@ int cmd_list(int argc, char *argv[]) struct bpos start = POS_MIN, end = POS_MAX; const char *err; int mode = 0, opt; - u64 v; opts.nochanges = true; opts.norecovery = true; @@ -239,10 +240,8 @@ int cmd_list(int argc, char *argv[]) while ((opt = getopt(argc, argv, "b:s:e:m:h")) != -1) switch (opt) { case 'b': - if (kstrtoull(optarg, 10, &v) || - v >= BTREE_ID_NR) - die("invalid btree id"); - btree_id = v; + btree_id = read_string_list_or_die(optarg, + bch_btree_ids, "btree id"); break; case 's': start = parse_pos(optarg); @@ -251,6 +250,8 @@ int cmd_list(int argc, char *argv[]) end = parse_pos(optarg); break; case 'm': + mode = read_string_list_or_die(optarg, + list_modes, "list mode"); break; case 'h': list_keys_usage(); @@ -275,6 +276,6 @@ int cmd_list(int argc, char *argv[]) die("Invalid mode"); } - bch_fs_stop_sync(c); + bch_fs_stop(c); return 0; } diff --git a/cmd_device.c b/cmd_device.c index 1c5208af..505fedc4 100644 --- a/cmd_device.c +++ b/cmd_device.c @@ -121,10 +121,7 @@ int cmd_device_show(int argc, char *argv[]) char *dev_name = basename(dirname(link)); - int fd = openat(dirfd(fs.sysfs), entry->d_name, O_RDONLY); - if (fd < 0) - die("couldn't open device %s: %s\n", - entry->d_name, strerror(errno)); + int fd = xopenat(dirfd(fs.sysfs), entry->d_name, O_RDONLY); devices[nr_devices] = fill_dev(strdup(dev_name), nr, fd); tiers[devices[nr_devices].tier]++; diff --git a/cmd_format.c b/cmd_format.c index 2b1453ee..f222a8b7 100644 --- a/cmd_format.c +++ b/cmd_format.c @@ -34,10 +34,8 @@ static int open_for_format(const char *dev, bool force) blkid_probe pr; const char *fs_type = NULL, *fs_label = NULL; size_t fs_type_len, fs_label_len; - int fd; - if ((fd = open(dev, O_RDWR|O_EXCL)) == -1) - die("Can't open dev %s: %s\n", dev, strerror(errno)); + int fd = xopen(dev, O_RDWR|O_EXCL); if (force) return fd; @@ -70,8 +68,41 @@ static int open_for_format(const char *dev, bool force) return fd; } +#define OPTS \ +t("bcache format - create a new bcache filesystem on one or more devices") \ +t("Usage: bcache format [OPTION]... 
<devices>") \ +t("") \ +x('b', block_size, "size", NULL) \ +x(0, btree_node_size, "size", "Default 256k") \ +x(0, metadata_checksum_type, "(none|crc32c|crc64)", NULL) \ +x(0, data_checksum_type, "(none|crc32c|crc64)", NULL) \ +x(0, compression_type, "(none|lz4|gzip)", NULL) \ +x(0, encrypted, NULL, "Enable whole filesystem encryption (chacha20/poly1305)")\ +x(0, no_passphrase, NULL, "Don't encrypt master encryption key")\ +x('e', error_action, "(continue|readonly|panic)", NULL) \ +x(0, max_journal_entry_size, "size", NULL) \ +x('L', label, "label", NULL) \ +x('U', uuid, "uuid", NULL) \ +x('f', force, NULL, NULL) \ +t("") \ +t("Device specific options:") \ +x(0, fs_size, "size", "Size of filesystem on device")\ +x(0, bucket_size, "size", "Bucket size") \ +x('t', tier, "#", "Higher tier indicates slower devices")\ +x(0, discard, NULL, NULL) \ +t("Device specific options must come before corresponding devices, e.g.") \ +t(" bcache format --tier 0 /dev/sdb --tier 1 /dev/sdc") \ +t("") \ +x('h', help, NULL, "display this help and exit") + static void usage(void) { +#define t(text) puts(text "\n") +#define x(shortopt, longopt, arg, help) do { \ + OPTS +#undef x +#undef t + puts("bcache format - create a new bcache filesystem on one or more devices\n" "Usage: bcache format [OPTION]... <devices>\n" "\n" @@ -81,7 +112,8 @@ static void usage(void) " --metadata_checksum_type=(none|crc32c|crc64)\n" " --data_checksum_type=(none|crc32c|crc64)\n" " --compression_type=(none|lz4|gzip)\n" - " --encrypted\n" + " --encrypted Enable whole filesystem encryption (chacha20/poly1305)\n" + " --no_passphrase Don't encrypt master encryption key\n" " --error_action=(continue|readonly|panic)\n" " Action to take on filesystem error\n" " --max_journal_entry_size=size\n" @@ -103,37 +135,26 @@ static void usage(void) "Report bugs to <linux-bcache@vger.kernel.org>"); } -#define OPTS \ - OPT('b', block_size, required_argument) \ - OPT(0, btree_node_size, required_argument) \ - OPT(0, metadata_checksum_type, required_argument) \ - OPT(0, data_checksum_type, required_argument) \ - OPT(0, compression_type, required_argument) \ - OPT(0, encrypted, no_argument) \ - OPT('e', error_action, required_argument) \ - OPT(0, max_journal_entry_size, required_argument) \ - OPT('L', label, required_argument) \ - OPT('U', uuid, required_argument) \ - OPT('f', force, no_argument) \ - OPT(0, fs_size, required_argument) \ - OPT(0, bucket_size, required_argument) \ - OPT('t', tier, required_argument) \ - OPT(0, discard, no_argument) \ - OPT('h', help, no_argument) - enum { Opt_no_opt = 1, -#define OPT(shortopt, longopt, has_arg) Opt_##longopt, +#define t(text) +#define x(shortopt, longopt, arg, help) Opt_##longopt, OPTS -#undef OPT +#undef x +#undef t }; static const struct option format_opts[] = { -#define OPT(shortopt, longopt, has_arg) { \ - #longopt, has_arg, NULL, Opt_##longopt \ - }, +#define t(text) +#define x(shortopt, longopt, arg, help) { \ + .name = #longopt, \ + .has_arg = arg ? 
required_argument : no_argument, \ + .flag = NULL, \ + .val = Opt_##longopt, \ +}, OPTS -#undef OPT +#undef x +#undef t { NULL } }; @@ -161,29 +182,12 @@ static unsigned hatoi_validate(const char *s, const char *msg) int cmd_format(int argc, char *argv[]) { darray(struct dev_opts) devices; - struct dev_opts *dev; - unsigned block_size = 0; - unsigned btree_node_size = 0; - unsigned meta_csum_type = BCH_CSUM_CRC32C; - unsigned data_csum_type = BCH_CSUM_CRC32C; - unsigned compression_type = BCH_COMPRESSION_NONE; - bool encrypted = false; - unsigned on_error_action = BCH_ON_ERROR_RO; - char *label = NULL; - uuid_le uuid; - bool force = false; - - /* Device specific options: */ - u64 filesystem_size = 0; - unsigned bucket_size = 0; - unsigned tier = 0; - bool discard = false; - unsigned max_journal_entry_size = 0; - char *passphrase = NULL; + struct format_opts opts = format_opts_default(); + struct dev_opts dev_opts = { 0 }, *dev; + bool force = false, no_passphrase = false; int opt; darray_init(devices); - uuid_clear(uuid.b); while ((opt = getopt_long(argc, argv, "-b:e:L:U:ft:h", @@ -192,45 +196,52 @@ int cmd_format(int argc, char *argv[]) switch (opt) { case Opt_block_size: case 'b': - block_size = hatoi_validate(optarg, - "block size"); + opts.block_size = + hatoi_validate(optarg, "block size"); break; case Opt_btree_node_size: - btree_node_size = hatoi_validate(optarg, - "btree node size"); + opts.btree_node_size = + hatoi_validate(optarg, "btree node size"); break; case Opt_metadata_checksum_type: - meta_csum_type = read_string_list_or_die(optarg, + opts.meta_csum_type = + read_string_list_or_die(optarg, bch_csum_types, "checksum type"); break; case Opt_data_checksum_type: - data_csum_type = read_string_list_or_die(optarg, + opts.data_csum_type = + read_string_list_or_die(optarg, bch_csum_types, "checksum type"); break; case Opt_compression_type: - compression_type = read_string_list_or_die(optarg, + opts.compression_type = + read_string_list_or_die(optarg, bch_compression_types, "compression type"); break; case Opt_encrypted: - encrypted = true; + opts.encrypted = true; + break; + case Opt_no_passphrase: + no_passphrase = true; break; case Opt_error_action: case 'e': - on_error_action = read_string_list_or_die(optarg, + opts.on_error_action = + read_string_list_or_die(optarg, bch_error_actions, "error action"); break; case Opt_max_journal_entry_size: - max_journal_entry_size = hatoi_validate(optarg, - "journal entry size"); + opts.max_journal_entry_size = + hatoi_validate(optarg, "journal entry size"); break; case Opt_label: case 'L': - label = strdup(optarg); + opts.label = strdup(optarg); break; case Opt_uuid: case 'U': - if (uuid_parse(optarg, uuid.b)) + if (uuid_parse(optarg, opts.uuid.b)) die("Bad uuid"); break; case Opt_force: @@ -238,31 +249,28 @@ int cmd_format(int argc, char *argv[]) force = true; break; case Opt_fs_size: - if (bch_strtoull_h(optarg, &filesystem_size)) + if (bch_strtoull_h(optarg, &dev_opts.size)) die("invalid filesystem size"); - filesystem_size >>= 9; + dev_opts.size >>= 9; break; case Opt_bucket_size: - bucket_size = hatoi_validate(optarg, "bucket size"); + dev_opts.bucket_size = + hatoi_validate(optarg, "bucket size"); break; case Opt_tier: case 't': - if (kstrtouint(optarg, 10, &tier) || - tier >= BCH_TIER_MAX) + if (kstrtouint(optarg, 10, &dev_opts.tier) || + dev_opts.tier >= BCH_TIER_MAX) die("invalid tier"); break; case Opt_discard: - discard = true; + dev_opts.discard = true; break; case Opt_no_opt: - darray_append(devices, (struct dev_opts) { - .path 
= strdup(optarg), - .size = filesystem_size, - .bucket_size = bucket_size, - .tier = tier, - .discard = discard, - }); + dev_opts.path = strdup(optarg); + darray_append(devices, dev_opts); + dev_opts.size = 0; break; case Opt_help: case 'h': @@ -274,18 +282,16 @@ int cmd_format(int argc, char *argv[]) if (!darray_size(devices)) die("Please supply a device"); - if (uuid_is_null(uuid.b)) - uuid_generate(uuid.b); - - if (encrypted) { - passphrase = read_passphrase("Enter passphrase: "); + if (opts.encrypted && !no_passphrase) { + opts.passphrase = read_passphrase("Enter passphrase: "); if (isatty(STDIN_FILENO)) { char *pass2 = read_passphrase("Enter same passphrase again: "); - if (strcmp(passphrase, pass2)) { - memzero_explicit(passphrase, strlen(passphrase)); + if (strcmp(opts.passphrase, pass2)) { + memzero_explicit(opts.passphrase, + strlen(opts.passphrase)); memzero_explicit(pass2, strlen(pass2)); die("Passphrases do not match"); } @@ -298,23 +304,14 @@ int cmd_format(int argc, char *argv[]) darray_foreach(dev, devices) dev->fd = open_for_format(dev->path, force); - bcache_format(devices.item, darray_size(devices), - block_size, - btree_node_size, - meta_csum_type, - data_csum_type, - compression_type, - passphrase, - 1, - 1, - on_error_action, - max_journal_entry_size, - label, - uuid); + struct bch_sb *sb = + bcache_format(opts, devices.item, darray_size(devices)); + bcache_super_print(sb, HUMAN_READABLE); + free(sb); - if (passphrase) { - memzero_explicit(passphrase, strlen(passphrase)); - free(passphrase); + if (opts.passphrase) { + memzero_explicit(opts.passphrase, strlen(opts.passphrase)); + free(opts.passphrase); } return 0; diff --git a/cmd_fsck.c b/cmd_fsck.c index a8c8dc53..6af56692 100644 --- a/cmd_fsck.c +++ b/cmd_fsck.c @@ -56,6 +56,6 @@ int cmd_fsck(int argc, char *argv[]) if (err) die("error opening %s: %s", argv[optind], err); - bch_fs_stop_sync(c); + bch_fs_stop(c); return 0; } diff --git a/cmd_key.c b/cmd_key.c index 587ecbe3..654ad774 100644 --- a/cmd_key.c +++ b/cmd_key.c @@ -1,6 +1,5 @@ #include <errno.h> #include <unistd.h> -#include <keyutils.h> #include <uuid/uuid.h> #include "cmds.h" @@ -10,52 +9,18 @@ int cmd_unlock(int argc, char *argv[]) { - struct bch_encrypted_key sb_key; - struct bch_key passphrase_key; struct bch_sb *sb; - struct bch_sb_field_crypt *crypt; char *passphrase; - char uuid[40]; - char description[60]; if (argc != 2) die("please supply a single device"); sb = bcache_super_read(argv[1]); - crypt = bch_sb_get_crypt(sb); - if (!crypt) - die("filesystem is not encrypted"); - - sb_key = crypt->key; - - if (!bch_key_is_encrypted(&sb_key)) - die("filesystem does not have encryption key"); - passphrase = read_passphrase("Enter passphrase: "); - derive_passphrase(crypt, &passphrase_key, passphrase); - /* Check if the user supplied the correct passphrase: */ - if (bch_chacha_encrypt_key(&passphrase_key, __bch_sb_key_nonce(sb), - &sb_key, sizeof(sb_key))) - die("error encrypting key"); + add_bcache_key(sb, passphrase); - if (bch_key_is_encrypted(&sb_key)) - die("incorrect passphrase"); - - uuid_unparse_lower(sb->user_uuid.b, uuid); - sprintf(description, "bcache:%s", uuid); - - if (add_key("logon", description, - &passphrase_key, sizeof(passphrase_key), - KEY_SPEC_USER_KEYRING) < 0 || - add_key("user", description, - &passphrase_key, sizeof(passphrase_key), - KEY_SPEC_USER_KEYRING) < 0) - die("add_key error: %s", strerror(errno)); - - memzero_explicit(&sb_key, sizeof(sb_key)); - memzero_explicit(&passphrase_key, sizeof(passphrase_key)); 
memzero_explicit(passphrase, strlen(passphrase)); free(passphrase); return 0; diff --git a/cmd_migrate.c b/cmd_migrate.c new file mode 100644 index 00000000..9a02cb9f --- /dev/null +++ b/cmd_migrate.c @@ -0,0 +1,835 @@ +#include </usr/include/dirent.h> +#include <errno.h> +#include <fcntl.h> +#include <getopt.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <sys/sysmacros.h> +#include <sys/types.h> +#include <sys/vfs.h> +#include <unistd.h> +#include <attr/xattr.h> + +#include <linux/fiemap.h> +#include <linux/fs.h> +#include <linux/stat.h> + +#include <uuid/uuid.h> + +#include "cmds.h" +#include "crypto.h" +#include "libbcache.h" +#include "linux/bcache.h" + +#include <linux/dcache.h> +#include <linux/generic-radix-tree.h> +#include <linux/xattr.h> +#include "btree_update.h" +#include "buckets.h" +#include "dirent.h" +#include "fs.h" +#include "inode.h" +#include "io.h" +#include "str_hash.h" +#include "super.h" +#include "xattr.h" + +static char *dev_t_to_path(dev_t dev) +{ + char link[PATH_MAX], *p; + int ret; + + char *sysfs_dev = mprintf("/sys/dev/block/%u:%u", + major(dev), minor(dev)); + ret = readlink(sysfs_dev, link, sizeof(link)); + free(sysfs_dev); + + if (ret < 0 || ret >= sizeof(link)) + die("readlink error while looking up block device: %s", strerror(errno)); + + link[ret] = '\0'; + + p = strrchr(link, '/'); + if (!p) + die("error looking up device name"); + p++; + + return mprintf("/dev/%s", p); +} + +static bool path_is_fs_root(char *path) +{ + char *line = NULL, *p, *mount; + size_t n = 0; + FILE *f; + bool ret = true; + + f = fopen("/proc/self/mountinfo", "r"); + if (!f) + die("Error getting mount information"); + + while (getline(&line, &n, f) != -1) { + p = line; + + strsep(&p, " "); /* mount id */ + strsep(&p, " "); /* parent id */ + strsep(&p, " "); /* dev */ + strsep(&p, " "); /* root */ + mount = strsep(&p, " "); + strsep(&p, " "); + + if (mount && !strcmp(path, mount)) + goto found; + } + + ret = false; +found: + fclose(f); + free(line); + return ret; +} + +static void mark_unreserved_space(struct cache_set *c, ranges extents) +{ + struct cache *ca = c->cache[0]; + struct hole_iter iter; + struct range i; + + for_each_hole(iter, extents, bucket_to_sector(ca, ca->mi.nbuckets) << 9, i) { + struct bucket_mark new; + u64 b; + + if (i.start == i.end) + return; + + b = sector_to_bucket(ca, i.start >> 9); + do { + bucket_cmpxchg(&ca->buckets[b], new, new.nouse = 1); + b++; + } while (bucket_to_sector(ca, b) << 9 < i.end); + } +} + +static void update_inode(struct cache_set *c, + struct bch_inode_unpacked *inode) +{ + struct bkey_inode_buf packed; + int ret; + + bch_inode_pack(&packed, inode); + ret = bch_btree_update(c, BTREE_ID_INODES, &packed.inode.k_i, NULL); + if (ret) + die("error creating file: %s", strerror(-ret)); +} + +static void create_dirent(struct cache_set *c, + struct bch_inode_unpacked *parent, + const char *name, u64 inum, mode_t mode) +{ + struct bch_hash_info parent_hash_info = bch_hash_info_init(parent); + struct qstr qname = { { { .len = strlen(name), } }, .name = name }; + + int ret = bch_dirent_create(c, parent->inum, &parent_hash_info, + mode_to_type(mode), &qname, + inum, NULL, BCH_HASH_SET_MUST_CREATE); + if (ret) + die("error creating file: %s", strerror(-ret)); + + if (S_ISDIR(mode)) + parent->i_nlink++; +} + +static void create_link(struct cache_set *c, + struct bch_inode_unpacked *parent, + const char *name, u64 inum, mode_t mode) +{ + struct bch_inode_unpacked inode; + int ret = bch_inode_find_by_inum(c, inum, 
				       &inode);
+	if (ret)
+		die("error looking up hardlink: %s", strerror(-ret));
+
+	inode.i_nlink++;
+	update_inode(c, &inode);
+
+	create_dirent(c, parent, name, inum, mode);
+}
+
+static struct bch_inode_unpacked create_file(struct cache_set *c,
+					     struct bch_inode_unpacked *parent,
+					     const char *name,
+					     uid_t uid, gid_t gid,
+					     mode_t mode, dev_t rdev)
+{
+	struct bch_inode_unpacked new_inode;
+	struct bkey_inode_buf packed;
+	int ret;
+
+	bch_inode_init(c, &new_inode, uid, gid, mode, rdev);
+	bch_inode_pack(&packed, &new_inode);
+
+	ret = bch_inode_create(c, &packed.inode.k_i, BLOCKDEV_INODE_MAX, 0,
+			       &c->unused_inode_hint);
+	if (ret)
+		die("error creating file: %s", strerror(-ret));
+
+	new_inode.inum = packed.inode.k.p.inode;
+	create_dirent(c, parent, name, new_inode.inum, mode);
+
+	return new_inode;
+}
+
+#define for_each_xattr_handler(handlers, handler)		\
+	if (handlers)						\
+		for ((handler) = *(handlers)++;			\
+		     (handler) != NULL;				\
+		     (handler) = *(handlers)++)
+
+static const struct xattr_handler *xattr_resolve_name(const char **name)
+{
+	const struct xattr_handler **handlers = bch_xattr_handlers;
+	const struct xattr_handler *handler;
+
+	for_each_xattr_handler(handlers, handler) {
+		const char *n;
+
+		n = strcmp_prefix(*name, xattr_prefix(handler));
+		if (n) {
+			if (!handler->prefix ^ !*n) {
+				if (*n)
+					continue;
+				return ERR_PTR(-EINVAL);
+			}
+			*name = n;
+			return handler;
+		}
+	}
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static void copy_times(struct cache_set *c, struct bch_inode_unpacked *dst,
+		       struct stat *src)
+{
+	dst->i_atime = timespec_to_bch_time(c, src->st_atim);
+	dst->i_mtime = timespec_to_bch_time(c, src->st_mtim);
+	dst->i_ctime = timespec_to_bch_time(c, src->st_ctim);
+}
+
+static void copy_xattrs(struct cache_set *c, struct bch_inode_unpacked *dst,
+			char *src)
+{
+	struct bch_hash_info hash_info = bch_hash_info_init(dst);
+	ssize_t size = llistxattr(src, NULL, 0);
+	if (size < 0)
+		die("listxattr error: %s", strerror(errno));
+
+	if (!size)
+		return;
+
+	char *buf = malloc(size);
+	size = llistxattr(src, buf, size);
+	if (size < 0)
+		die("listxattr error: %s", strerror(errno));
+
+	for (const char *next, *attr = buf;
+	     attr < buf + size;
+	     attr = next) {
+		next = attr + strlen(attr) + 1;
+
+		/* max possible xattr val: */
+		static char val[64 << 10];
+		ssize_t val_size = lgetxattr(src, attr, val, sizeof(val));
+
+		if (val_size < 0)
+			die("error getting xattr val: %s", strerror(errno));
+
+		const struct xattr_handler *h = xattr_resolve_name(&attr);
+
+		int ret = __bch_xattr_set(c, dst->inum, &hash_info, attr,
+					  val, val_size, 0, h->flags, NULL);
+		if (ret < 0)
+			die("error creating xattr: %s", strerror(-ret));
+	}
+
+	free(buf);
+}
+
+static void write_data(struct cache_set *c,
+		       struct bch_inode_unpacked *dst_inode,
+		       u64 dst_offset, void *buf, size_t len)
+{
+	struct disk_reservation res;
+	struct bch_write_op op;
+	struct bch_write_bio bio;
+	struct bio_vec bv;
+	struct closure cl;
+
+	BUG_ON(dst_offset & (block_bytes(c) - 1));
+	BUG_ON(len & (block_bytes(c) - 1));
+
+	closure_init_stack(&cl);
+
+	bio_init(&bio.bio);
+	bio.bio.bi_max_vecs = 1;
+	bio.bio.bi_io_vec = &bv;
+	bio.bio.bi_iter.bi_size = len;
+	bch_bio_map(&bio.bio, buf);
+
+	int ret = bch_disk_reservation_get(c, &res, len >> 9, 0);
+	if (ret)
+		die("error reserving space in new filesystem: %s", strerror(-ret));
+
+	bch_write_op_init(&op, c, &bio, res, c->write_points,
+			  POS(dst_inode->inum, dst_offset >> 9), NULL, 0);
+	closure_call(&op.cl, bch_write, NULL, &cl);
+	closure_sync(&cl);
+
dst_inode->i_sectors += len >> 9; +} + +static char buf[1 << 20] __aligned(PAGE_SIZE); + +static void copy_data(struct cache_set *c, + struct bch_inode_unpacked *dst_inode, + int src_fd, u64 start, u64 end) +{ + while (start < end) { + unsigned len = min_t(u64, end - start, sizeof(buf)); + + xpread(src_fd, buf, len, start); + write_data(c, dst_inode, start, buf, len); + start += len; + } +} + +static void link_data(struct cache_set *c, struct bch_inode_unpacked *dst, + u64 logical, u64 physical, u64 length) +{ + struct cache *ca = c->cache[0]; + + BUG_ON(logical & (block_bytes(c) - 1)); + BUG_ON(physical & (block_bytes(c) - 1)); + BUG_ON(length & (block_bytes(c) - 1)); + + logical >>= 9; + physical >>= 9; + length >>= 9; + + BUG_ON(physical + length > bucket_to_sector(ca, ca->mi.nbuckets)); + + while (length) { + struct bkey_i_extent *e; + BKEY_PADDED(k) k; + u64 b = sector_to_bucket(ca, physical >> 9); + struct disk_reservation res; + unsigned sectors; + int ret; + + sectors = min(ca->mi.bucket_size - + (physical & (ca->mi.bucket_size - 1)), + length); + + e = bkey_extent_init(&k.k); + e->k.p.inode = dst->inum; + e->k.p.offset = logical + sectors; + e->k.size = sectors; + extent_ptr_append(e, (struct bch_extent_ptr) { + .offset = physical, + .dev = 0, + .gen = ca->buckets[b].mark.gen, + }); + + ret = bch_disk_reservation_get(c, &res, sectors, + BCH_DISK_RESERVATION_NOFAIL); + if (ret) + die("error reserving space in new filesystem: %s", + strerror(-ret)); + + ret = bch_btree_insert(c, BTREE_ID_EXTENTS, &e->k_i, + &res, NULL, NULL, 0); + if (ret) + die("btree insert error %s", strerror(-ret)); + + bch_disk_reservation_put(c, &res); + + dst->i_sectors += sectors; + logical += sectors; + physical += sectors; + length -= sectors; + } +} + +static void copy_link(struct cache_set *c, struct bch_inode_unpacked *dst, + char *src) +{ + ssize_t ret = readlink(src, buf, sizeof(buf)); + if (ret < 0) + die("readlink error: %s", strerror(errno)); + + write_data(c, dst, 0, buf, round_up(ret, block_bytes(c))); +} + +static void copy_file(struct cache_set *c, struct bch_inode_unpacked *dst, + int src, char *src_path, ranges *extents) +{ + struct fiemap_iter iter; + struct fiemap_extent e; + + fiemap_for_each(src, iter, e) + if (e.fe_flags & FIEMAP_EXTENT_UNKNOWN) { + fsync(src); + break; + } + + fiemap_for_each(src, iter, e) { + if ((e.fe_logical & (block_bytes(c) - 1)) || + (e.fe_length & (block_bytes(c) - 1))) + die("Unaligned extent in %s - can't handle", src_path); + + if (e.fe_flags & (FIEMAP_EXTENT_UNKNOWN| + FIEMAP_EXTENT_ENCODED| + FIEMAP_EXTENT_NOT_ALIGNED| + FIEMAP_EXTENT_DATA_INLINE)) { + copy_data(c, dst, + src, + round_down(e.fe_logical, block_bytes(c)), + round_up(e.fe_logical + e.fe_length, + block_bytes(c))); + continue; + } + + if ((e.fe_physical & (block_bytes(c) - 1))) + die("Unaligned extent in %s - can't handle", src_path); + + range_add(extents, e.fe_physical, e.fe_length); + link_data(c, dst, e.fe_logical, e.fe_physical, e.fe_length); + } +} + +struct copy_fs_state { + u64 bcachefs_inum; + dev_t dev; + + GENRADIX(u64) hardlinks; + ranges extents; +}; + +static void copy_dir(struct copy_fs_state *s, + struct cache_set *c, + struct bch_inode_unpacked *dst, + int src_fd, const char *src_path) +{ + DIR *dir = fdopendir(src_fd); + struct dirent *d; + + while ((errno = 0), (d = readdir(dir))) { + struct bch_inode_unpacked inode; + int fd; + + if (fchdir(src_fd)) + die("chdir error: %s", strerror(errno)); + + struct stat stat = + xfstatat(src_fd, d->d_name, AT_SYMLINK_NOFOLLOW); + + if 
(!strcmp(d->d_name, ".") || + !strcmp(d->d_name, "..") || + stat.st_ino == s->bcachefs_inum) + continue; + + char *child_path = mprintf("%s/%s", src_path, d->d_name); + + if (stat.st_dev != s->dev) + die("%s does not have correct st_dev!", child_path); + + u64 *dst_inum = S_ISREG(stat.st_mode) + ? genradix_ptr_alloc(&s->hardlinks, stat.st_ino, GFP_KERNEL) + : NULL; + + if (dst_inum && *dst_inum) { + create_link(c, dst, d->d_name, *dst_inum, S_IFREG); + goto next; + } + + inode = create_file(c, dst, d->d_name, + stat.st_uid, stat.st_gid, + stat.st_mode, stat.st_rdev); + + if (dst_inum) + *dst_inum = inode.inum; + + copy_times(c, &inode, &stat); + copy_xattrs(c, &inode, d->d_name); + + /* copy xattrs */ + + switch (mode_to_type(stat.st_mode)) { + case DT_DIR: + fd = xopen(d->d_name, O_RDONLY|O_NOATIME); + copy_dir(s, c, &inode, fd, child_path); + close(fd); + break; + case DT_REG: + inode.i_size = stat.st_size; + + fd = xopen(d->d_name, O_RDONLY|O_NOATIME); + copy_file(c, &inode, fd, child_path, &s->extents); + close(fd); + break; + case DT_LNK: + inode.i_size = stat.st_size; + + copy_link(c, &inode, d->d_name); + break; + case DT_FIFO: + case DT_CHR: + case DT_BLK: + case DT_SOCK: + case DT_WHT: + /* nothing else to copy for these: */ + break; + default: + BUG(); + } + + update_inode(c, &inode); +next: + free(child_path); + } + + if (errno) + die("readdir error: %s", strerror(errno)); +} + +static ranges reserve_new_fs_space(const char *file_path, unsigned block_size, + u64 size, u64 *bcachefs_inum, dev_t dev) +{ + int fd = open(file_path, O_RDWR|O_CREAT|O_EXCL, 0600); + if (fd < 0) + die("Error creating %s for bcachefs metadata: %s", + file_path, strerror(errno)); + + struct stat statbuf = xfstat(fd); + + if (statbuf.st_dev != dev) + die("bcachefs file has incorrect device"); + + *bcachefs_inum = statbuf.st_ino; + + if (fallocate(fd, 0, 0, size)) + die("Error reserving space for bcachefs metadata: %s", + strerror(errno)); + + fsync(fd); + + struct fiemap_iter iter; + struct fiemap_extent e; + ranges extents = { NULL }; + + fiemap_for_each(fd, iter, e) { + if (e.fe_flags & (FIEMAP_EXTENT_UNKNOWN| + FIEMAP_EXTENT_ENCODED| + FIEMAP_EXTENT_NOT_ALIGNED| + FIEMAP_EXTENT_DATA_INLINE)) + die("Unable to continue: metadata file not fully mapped"); + + if ((e.fe_physical & (block_size - 1)) || + (e.fe_length & (block_size - 1))) + die("Unable to continue: unaligned extents in metadata file"); + + range_add(&extents, e.fe_physical, e.fe_length); + } + close(fd); + + ranges_sort_merge(&extents); + return extents; +} + +static void reserve_old_fs_space(struct cache_set *c, + struct bch_inode_unpacked *root_inode, + ranges *extents) +{ + struct cache *ca = c->cache[0]; + struct bch_inode_unpacked dst; + struct hole_iter iter; + struct range i; + + dst = create_file(c, root_inode, "old_migrated_filesystem", + 0, 0, S_IFREG|0400, 0); + dst.i_size = bucket_to_sector(ca, ca->mi.nbuckets) << 9; + + ranges_sort_merge(extents); + + for_each_hole(iter, *extents, bucket_to_sector(ca, ca->mi.nbuckets) << 9, i) + link_data(c, &dst, i.start, i.start, i.end - i.start); + + update_inode(c, &dst); +} + +static void copy_fs(struct cache_set *c, int src_fd, const char *src_path, + u64 bcachefs_inum, ranges *extents) +{ + syncfs(src_fd); + + struct bch_inode_unpacked root_inode; + int ret = bch_inode_find_by_inum(c, BCACHE_ROOT_INO, &root_inode); + if (ret) + die("error looking up root directory: %s", strerror(-ret)); + + if (fchdir(src_fd)) + die("chdir error: %s", strerror(errno)); + + struct stat stat = xfstat(src_fd); 
+	copy_times(c, &root_inode, &stat);
+	copy_xattrs(c, &root_inode, ".");
+
+	struct copy_fs_state s = {
+		.bcachefs_inum	= bcachefs_inum,
+		.dev		= stat.st_dev,
+		.extents	= *extents,
+	};
+
+	/* now, copy: */
+	copy_dir(&s, c, &root_inode, src_fd, src_path);
+
+	reserve_old_fs_space(c, &root_inode, &s.extents);
+
+	update_inode(c, &root_inode);
+
+	darray_free(s.extents);
+	genradix_free(&s.hardlinks);
+}
+
+static void find_superblock_space(ranges extents, struct dev_opts *dev)
+{
+	struct range *i;
+	darray_foreach(i, extents) {
+		u64 offset = max(256ULL << 10, i->start);
+
+		if (offset + (128 << 10) <= i->end) {
+			dev->sb_offset	= offset >> 9;
+			dev->sb_end	= dev->sb_offset + 256;
+			return;
+		}
+	}
+
+	die("Couldn't find a valid location for superblock");
+}
+
+static void migrate_usage(void)
+{
+	puts("bcache migrate - migrate an existing filesystem to bcachefs\n"
+	     "Usage: bcache migrate [OPTION]...\n"
+	     "\n"
+	     "Options:\n"
+	     "  -f fs            Root of filesystem to migrate\n"
+	     "      --encrypted      Enable whole filesystem encryption (chacha20/poly1305)\n"
+	     "      --no_passphrase  Don't encrypt master encryption key\n"
+	     "  -h               Display this help and exit\n"
+	     "Report bugs to <linux-bcache@vger.kernel.org>");
+}
+
+static const struct option migrate_opts[] = {
+	{ "encrypted",		no_argument, NULL, 'e' },
+	{ "no_passphrase",	no_argument, NULL, 'p' },
+	{ NULL }
+};
+
+int cmd_migrate(int argc, char *argv[])
+{
+	struct format_opts format_opts = format_opts_default();
+	char *fs_path = NULL;
+	unsigned block_size;
+	bool no_passphrase = false;
+	int opt;
+
+	while ((opt = getopt_long(argc, argv, "f:h",
+				  migrate_opts, NULL)) != -1)
+		switch (opt) {
+		case 'f':
+			fs_path = optarg;
+			break;
+		case 'e':
+			format_opts.encrypted = true;
+			break;
+		case 'p':
+			no_passphrase = true;
+			break;
+		case 'h':
+			migrate_usage();
+			exit(EXIT_SUCCESS);
+		}
+
+	if (!fs_path)
+		die("Please specify a filesystem to migrate");
+
+	if (!path_is_fs_root(fs_path))
+		die("%s is not a filesystem root", fs_path);
+
+	int fs_fd = xopen(fs_path, O_RDONLY|O_NOATIME);
+	struct stat stat = xfstat(fs_fd);
+
+	if (!S_ISDIR(stat.st_mode))
+		die("%s is not a directory", fs_path);
+
+	struct dev_opts dev = { 0 };
+
+	dev.path = dev_t_to_path(stat.st_dev);
+	dev.fd = xopen(dev.path, O_RDWR);
+
+	block_size = min_t(unsigned, stat.st_blksize,
+			   get_blocksize(dev.path, dev.fd) << 9);
+
+	BUG_ON(!is_power_of_2(block_size) || block_size < 512);
+	format_opts.block_size = block_size >> 9;
+
+	u64 bcachefs_inum;
+	char *file_path = mprintf("%s/bcachefs", fs_path);
+
+	ranges extents = reserve_new_fs_space(file_path,
+				block_size, get_size(dev.path, dev.fd) / 5,
+				&bcachefs_inum, stat.st_dev);
+
+	find_superblock_space(extents, &dev);
+
+	if (format_opts.encrypted && !no_passphrase) {
+		format_opts.passphrase = read_passphrase("Enter passphrase: ");
+
+		if (isatty(STDIN_FILENO)) {
+			char *pass2 =
+				read_passphrase("Enter same passphrase again: ");
+
+			if (strcmp(format_opts.passphrase, pass2)) {
+				memzero_explicit(format_opts.passphrase,
+						 strlen(format_opts.passphrase));
+				memzero_explicit(pass2, strlen(pass2));
+				die("Passphrases do not match");
+			}
+
+			memzero_explicit(pass2, strlen(pass2));
+			free(pass2);
+		}
+	}
+
+	struct bch_sb *sb = bcache_format(format_opts, &dev, 1);
+	u64 sb_offset = le64_to_cpu(sb->layout.sb_offset[0]);
+
+	if (format_opts.passphrase)
+		add_bcache_key(sb, format_opts.passphrase);
+
+	free(sb);
+
+	printf("Creating new filesystem on %s in space reserved at %s\n"
+	       "To mount, run\n"
+	       "  mount -t bcache -o sb=%llu %s dir\n"
+	       "\n"
+	       "After verifying that the new filesystem is correct, to create a\n"
+	       "superblock at the default offset and finish the migration run\n"
+	       "  bcache migrate_superblock -d %s -o %llu\n"
+	       "\n"
+	       "The new filesystem will have a file at /old_migrated_filesystem\n"
+	       "referencing all disk space that might be used by the existing\n"
+	       "filesystem. That file can be deleted once the old filesystem is\n"
+	       "no longer needed (and should be deleted prior to running\n"
+	       "bcache migrate_superblock)\n",
+	       dev.path, file_path, sb_offset, dev.path,
+	       dev.path, sb_offset);
+
+	struct bch_opts opts = bch_opts_empty();
+	struct cache_set *c = NULL;
+	char *path[1] = { dev.path };
+	const char *err;
+
+	opts.sb = sb_offset;
+	opts.nostart = true;
+	opts.noexcl = true;
+
+	err = bch_fs_open(path, 1, opts, &c);
+	if (err)
+		die("Error opening new filesystem: %s", err);
+
+	mark_unreserved_space(c, extents);
+
+	err = bch_fs_start(c);
+	if (err)
+		die("Error starting new filesystem: %s", err);
+
+	copy_fs(c, fs_fd, fs_path, bcachefs_inum, &extents);
+
+	bch_fs_stop(c);
+
+	printf("Migrate complete, running fsck:\n");
+	opts.nostart = false;
+	opts.nochanges = true;
+	fsck_err_opt = FSCK_ERR_NO;
+
+	err = bch_fs_open(path, 1, opts, &c);
+	if (err)
+		die("Error opening new filesystem: %s", err);
+
+	bch_fs_stop(c);
+	printf("fsck complete\n");
+	return 0;
+}
+
+static void migrate_superblock_usage(void)
+{
+	puts("bcache migrate_superblock - create default superblock after migrating\n"
+	     "Usage: bcache migrate_superblock [OPTION]...\n"
+	     "\n"
+	     "Options:\n"
+	     "  -d device  Device to create superblock for\n"
+	     "  -o offset  Offset of existing superblock\n"
+	     "  -h         Display this help and exit\n"
+	     "Report bugs to <linux-bcache@vger.kernel.org>");
+}
+
+int cmd_migrate_superblock(int argc, char *argv[])
+{
+	char *dev = NULL;
+	u64 offset = 0;
+	int opt, ret;
+
+	while ((opt = getopt(argc, argv, "d:o:h")) != -1)
+		switch (opt) {
+		case 'd':
+			dev = optarg;
+			break;
+		case 'o':
+			ret = kstrtou64(optarg, 10, &offset);
+			if (ret)
+				die("Invalid offset");
+			break;
+		case 'h':
+			migrate_superblock_usage();
+			exit(EXIT_SUCCESS);
+		}
+
+	if (!dev)
+		die("Please specify a device");
+
+	if (!offset)
+		die("Please specify offset of existing superblock");
+
+	int fd = xopen(dev, O_RDWR);
+	struct bch_sb *sb = __bcache_super_read(fd, offset);
+
+	if (sb->layout.nr_superblocks >= ARRAY_SIZE(sb->layout.sb_offset))
+		die("Can't add superblock: no space left in superblock layout");
+
+	for (unsigned i = 0; i < sb->layout.nr_superblocks; i++)
+		if (le64_to_cpu(sb->layout.sb_offset[i]) == BCH_SB_SECTOR)
+			die("Superblock layout already has default superblock");
+
+	memmove(&sb->layout.sb_offset[1],
+		&sb->layout.sb_offset[0],
+		sb->layout.nr_superblocks * sizeof(u64));
+	sb->layout.nr_superblocks++;
+
+	sb->layout.sb_offset[0] = cpu_to_le64(BCH_SB_SECTOR);
+
+	bcache_super_write(fd, sb);
+	close(fd);
+
+	return 0;
+}
diff --git a/cmd_run.c b/cmd_run.c
index 74f32480..6fb1c4f9 100644
--- a/cmd_run.c
+++ b/cmd_run.c
@@ -25,9 +25,6 @@ int cmd_stop(int argc, char *argv[])
 		die("Please supply a filesystem");
 
 	struct bcache_handle fs = bcache_fs_open(argv[1]);
-
-	if (ioctl(fs.ioctl_fd, BCH_IOCTL_STOP))
-		die("BCH_IOCTL_STOP error: %s", strerror(errno));
-
+	xioctl(fs.ioctl_fd, BCH_IOCTL_STOP);
 	return 0;
 }
diff --git a/cmds.h b/cmds.h
index 946acfda..120e83f9 100644
--- a/cmds.h
+++ b/cmds.h
@@ -29,4 +29,7 @@ int cmd_fsck(int argc, char *argv[]);
 int cmd_dump(int argc, char *argv[]);
 int cmd_list(int argc, char *argv[]);
+int cmd_migrate(int argc, 
char *argv[]); +int cmd_migrate_superblock(int argc, char *argv[]); + #endif /* _CMDS_H */ diff --git a/crypto.c b/crypto.c index 86da70a1..f38a359d 100644 --- a/crypto.c +++ b/crypto.c @@ -10,8 +10,10 @@ #include <time.h> #include <unistd.h> +#include <keyutils.h> #include <linux/random.h> #include <libscrypt.h> +#include <uuid/uuid.h> #include "checksum.h" #include "crypto.h" @@ -75,29 +77,71 @@ void derive_passphrase(struct bch_sb_field_crypt *crypt, } } +void add_bcache_key(struct bch_sb *sb, const char *passphrase) +{ + struct bch_sb_field_crypt *crypt = bch_sb_get_crypt(sb); + if (!crypt) + die("filesystem is not encrypted"); + + struct bch_encrypted_key sb_key = crypt->key; + if (!bch_key_is_encrypted(&sb_key)) + die("filesystem does not have encryption key"); + + struct bch_key passphrase_key; + derive_passphrase(crypt, &passphrase_key, passphrase); + + /* Check if the user supplied the correct passphrase: */ + if (bch_chacha_encrypt_key(&passphrase_key, __bch_sb_key_nonce(sb), + &sb_key, sizeof(sb_key))) + die("error encrypting key"); + + if (bch_key_is_encrypted(&sb_key)) + die("incorrect passphrase"); + + char uuid[40]; + uuid_unparse_lower(sb->user_uuid.b, uuid); + + char *description = mprintf("bcache:%s", uuid); + + if (add_key("logon", description, + &passphrase_key, sizeof(passphrase_key), + KEY_SPEC_USER_KEYRING) < 0 || + add_key("user", description, + &passphrase_key, sizeof(passphrase_key), + KEY_SPEC_USER_KEYRING) < 0) + die("add_key error: %s", strerror(errno)); + + memzero_explicit(description, strlen(description)); + free(description); + memzero_explicit(&passphrase_key, sizeof(passphrase_key)); + memzero_explicit(&sb_key, sizeof(sb_key)); +} + void bch_sb_crypt_init(struct bch_sb *sb, struct bch_sb_field_crypt *crypt, const char *passphrase) { - struct bch_key passphrase_key; - - SET_BCH_CRYPT_KDF_TYPE(crypt, BCH_KDF_SCRYPT); - SET_BCH_KDF_SCRYPT_N(crypt, ilog2(SCRYPT_N)); - SET_BCH_KDF_SCRYPT_R(crypt, ilog2(SCRYPT_r)); - SET_BCH_KDF_SCRYPT_P(crypt, ilog2(SCRYPT_p)); - - derive_passphrase(crypt, &passphrase_key, passphrase); - crypt->key.magic = BCH_KEY_MAGIC; get_random_bytes(&crypt->key.key, sizeof(crypt->key.key)); - assert(!bch_key_is_encrypted(&crypt->key)); + if (passphrase) { + struct bch_key passphrase_key; - if (bch_chacha_encrypt_key(&passphrase_key, __bch_sb_key_nonce(sb), - &crypt->key, sizeof(crypt->key))) - die("error encrypting key"); + SET_BCH_CRYPT_KDF_TYPE(crypt, BCH_KDF_SCRYPT); + SET_BCH_KDF_SCRYPT_N(crypt, ilog2(SCRYPT_N)); + SET_BCH_KDF_SCRYPT_R(crypt, ilog2(SCRYPT_r)); + SET_BCH_KDF_SCRYPT_P(crypt, ilog2(SCRYPT_p)); - assert(bch_key_is_encrypted(&crypt->key)); + derive_passphrase(crypt, &passphrase_key, passphrase); - memzero_explicit(&passphrase_key, sizeof(passphrase_key)); + assert(!bch_key_is_encrypted(&crypt->key)); + + if (bch_chacha_encrypt_key(&passphrase_key, __bch_sb_key_nonce(sb), + &crypt->key, sizeof(crypt->key))) + die("error encrypting key"); + + assert(bch_key_is_encrypted(&crypt->key)); + + memzero_explicit(&passphrase_key, sizeof(passphrase_key)); + } } diff --git a/crypto.h b/crypto.h index 643073eb..91a8b9fc 100644 --- a/crypto.h +++ b/crypto.h @@ -1,12 +1,16 @@ #ifndef _CRYPTO_H #define _CRYPTO_H -#include "super-io.h" #include "tools-util.h" +struct bch_sb; +struct bch_sb_field_crypt; +struct bch_key; + char *read_passphrase(const char *); void derive_passphrase(struct bch_sb_field_crypt *, struct bch_key *, const char *); +void add_bcache_key(struct bch_sb *, const char *); void bch_sb_crypt_init(struct bch_sb *sb, 
		       struct bch_sb_field_crypt *, const char *);
diff --git a/include/linux/bcache.h b/include/linux/bcache.h
index dbb02742..d70e2e32 100644
--- a/include/linux/bcache.h
+++ b/include/linux/bcache.h
@@ -821,7 +821,7 @@ struct bch_sb_field {
 	__le32			type;
 };
 
-enum bch_sb_field_types {
+enum bch_sb_field_type {
 	BCH_SB_FIELD_journal	= 0,
 	BCH_SB_FIELD_members	= 1,
 	BCH_SB_FIELD_crypt	= 2,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3c185945..217ff094 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -110,6 +110,7 @@ struct super_block {
  * NOTE! These match bits 12..15 of stat.st_mode
  * (ie "(i_mode >> 12) & 15").
  */
+#ifndef DT_UNKNOWN
 #define DT_UNKNOWN	0
 #define DT_FIFO		1
 #define DT_CHR		2
@@ -119,6 +120,7 @@ struct super_block {
 #define DT_LNK		10
 #define DT_SOCK		12
 #define DT_WHT		14
+#endif
 
 /*
  * This is the "filldir" function type, used by readdir() to let
diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
index 1a951e97..6ea2deb2 100644
--- a/include/linux/generic-radix-tree.h
+++ b/include/linux/generic-radix-tree.h
@@ -8,7 +8,6 @@
  * interior nodes.
  */
 
-#include <linux/page.h>
 #include <linux/bug.h>
 #include <linux/kernel.h>
 #include <linux/log2.h>
@@ -41,20 +40,14 @@ struct __genradix {
  * genradix.
  */
 
-#define DECLARE_GENRADIX_TYPE(_name, _type)		\
-struct _name {						\
+#define GENRADIX(_type)					\
+struct {						\
 	struct __genradix	tree;			\
 	_type			type[0] __aligned(1);	\
 }
 
-#define DECLARE_GENRADIX(_name, _type)			\
-struct {						\
-	struct __genradix	tree;			\
-	_type			type[0] __aligned(1);	\
-} _name
-
 #define DEFINE_GENRADIX(_name, _type)			\
-	DECLARE_GENRADIX(_name, _type) = __GENRADIX_INITIALIZER
+	GENRADIX(_type) _name = __GENRADIX_INITIALIZER
 
 #define genradix_init(_radix)				\
 do {							\
diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 5a986188..2bbd0979 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -180,4 +180,9 @@ static inline bool percpu_ref_is_zero(struct percpu_ref *ref)
 	return !atomic_long_read(&ref->count);
 }
 
+static inline bool percpu_ref_is_dying(struct percpu_ref *ref)
+{
+	return percpu_ref_is_zero(ref);
+}
+
 #endif /* __TOOLS_LINUX_PERCPU_REFCOUNT_H */
diff --git a/libbcache.c b/libbcache.c
index 6908ead9..0cfafbbc 100644
--- a/libbcache.c
+++ b/libbcache.c
@@ -23,66 +23,82 @@
 
 #define BCH_MIN_NR_NBUCKETS	(1 << 10)
 
-/* first bucket should start 1 mb in, in sectors: */
-#define FIRST_BUCKET_OFFSET	(1 << 11)
-
 /* minimum size filesystem we can create, given a bucket size: */
 static u64 min_size(unsigned bucket_size)
 {
-	return (DIV_ROUND_UP(FIRST_BUCKET_OFFSET, bucket_size) +
-		BCH_MIN_NR_NBUCKETS) * bucket_size;
+	return BCH_MIN_NR_NBUCKETS * bucket_size;
 }
 
-static void init_layout(struct bch_sb_layout *l)
+static void init_layout(struct bch_sb_layout *l, unsigned block_size,
+			u64 start, u64 end)
 {
+	unsigned sb_size;
+	u64 backup; /* offset of 2nd sb */
+
 	memset(l, 0, sizeof(*l));
 
+	if (start != BCH_SB_SECTOR)
+		start = round_up(start, block_size);
+	end = round_down(end, block_size);
+
+	if (start >= end)
+		die("insufficient space for superblocks");
+
+	/*
+	 * Create two superblocks in the allowed range: reserve a maximum of 64k
+	 */
+	sb_size = min_t(u64, 128, (end - start) / 2);
+
+	backup = start + sb_size;
+	backup = round_up(backup, block_size);
+
+	backup = min(backup, end);
+
+	sb_size = min(end - backup, backup - start);
+	sb_size = rounddown_pow_of_two(sb_size);
+
+	if (sb_size < 8)
+		die("insufficient space for superblocks");
+
 	l->magic = 
BCACHE_MAGIC; l->layout_type = 0; l->nr_superblocks = 2; - l->sb_max_size_bits = 7; - l->sb_offset[0] = cpu_to_le64(BCH_SB_SECTOR); - l->sb_offset[1] = cpu_to_le64(BCH_SB_SECTOR + - (1 << l->sb_max_size_bits)); + l->sb_max_size_bits = ilog2(sb_size); + l->sb_offset[0] = cpu_to_le64(start); + l->sb_offset[1] = cpu_to_le64(backup); } -void bcache_format(struct dev_opts *devs, size_t nr_devs, - unsigned block_size, - unsigned btree_node_size, - unsigned meta_csum_type, - unsigned data_csum_type, - unsigned compression_type, - const char *passphrase, - unsigned meta_replicas, - unsigned data_replicas, - unsigned on_error_action, - unsigned max_journal_entry_size, - char *label, - uuid_le uuid) +struct bch_sb *bcache_format(struct format_opts opts, + struct dev_opts *devs, size_t nr_devs) { struct bch_sb *sb; struct dev_opts *i; struct bch_sb_field_members *mi; - unsigned u64s, j; + unsigned u64s; /* calculate block size: */ - if (!block_size) + if (!opts.block_size) for (i = devs; i < devs + nr_devs; i++) - block_size = max(block_size, - get_blocksize(i->path, i->fd)); + opts.block_size = max(opts.block_size, + get_blocksize(i->path, i->fd)); /* calculate bucket sizes: */ for (i = devs; i < devs + nr_devs; i++) { + if (!i->sb_offset) { + i->sb_offset = BCH_SB_SECTOR; + i->sb_end = BCH_SB_SECTOR + 256; + } + if (!i->size) i->size = get_size(i->path, i->fd) >> 9; if (!i->bucket_size) { - if (i->size < min_size(block_size)) + if (i->size < min_size(opts.block_size)) die("cannot format %s, too small (%llu sectors, min %llu)", - i->path, i->size, min_size(block_size)); + i->path, i->size, min_size(opts.block_size)); /* Want a bucket size of at least 128k, if possible: */ - i->bucket_size = max(block_size, 256U); + i->bucket_size = max(opts.block_size, 256U); if (i->size >= min_size(i->bucket_size)) { unsigned scale = max(1, @@ -99,34 +115,36 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs, } } - /* first bucket: 1 mb in */ - i->first_bucket = DIV_ROUND_UP(FIRST_BUCKET_OFFSET, i->bucket_size); i->nbuckets = i->size / i->bucket_size; - if (i->bucket_size < block_size) + if (i->bucket_size < opts.block_size) die("Bucket size cannot be smaller than block size"); - if (i->nbuckets - i->first_bucket < BCH_MIN_NR_NBUCKETS) + if (i->nbuckets < BCH_MIN_NR_NBUCKETS) die("Not enough buckets: %llu, need %u (bucket size %u)", - i->nbuckets - i->first_bucket, BCH_MIN_NR_NBUCKETS, - i->bucket_size); + i->nbuckets, BCH_MIN_NR_NBUCKETS, i->bucket_size); } /* calculate btree node size: */ - if (!btree_node_size) { + if (!opts.btree_node_size) { /* 256k default btree node size */ - btree_node_size = 512; + opts.btree_node_size = 512; for (i = devs; i < devs + nr_devs; i++) - btree_node_size = min(btree_node_size, i->bucket_size); + opts.btree_node_size = + min(opts.btree_node_size, i->bucket_size); } - if (!max_journal_entry_size) { + if (!opts.max_journal_entry_size) { /* 2 MB default: */ - max_journal_entry_size = 4096; + opts.max_journal_entry_size = 4096; } - max_journal_entry_size = roundup_pow_of_two(max_journal_entry_size); + opts.max_journal_entry_size = + roundup_pow_of_two(opts.max_journal_entry_size); + + if (uuid_is_null(opts.uuid.b)) + uuid_generate(opts.uuid.b); sb = calloc(1, sizeof(*sb) + sizeof(struct bch_sb_field_members) + @@ -135,35 +153,29 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs, sb->version = cpu_to_le64(BCACHE_SB_VERSION_CDEV_V4); sb->magic = BCACHE_MAGIC; - sb->block_size = cpu_to_le16(block_size); - sb->user_uuid = uuid; + sb->block_size = 
cpu_to_le16(opts.block_size); + sb->user_uuid = opts.uuid; sb->nr_devices = nr_devs; - init_layout(&sb->layout); - uuid_generate(sb->uuid.b); - if (label) - strncpy((char *) sb->label, label, sizeof(sb->label)); + if (opts.label) + strncpy((char *) sb->label, opts.label, sizeof(sb->label)); - /* - * don't have a userspace crc32c implementation handy, just always use - * crc64 - */ - SET_BCH_SB_CSUM_TYPE(sb, BCH_CSUM_CRC64); - SET_BCH_SB_META_CSUM_TYPE(sb, meta_csum_type); - SET_BCH_SB_DATA_CSUM_TYPE(sb, data_csum_type); - SET_BCH_SB_COMPRESSION_TYPE(sb, compression_type); + SET_BCH_SB_CSUM_TYPE(sb, opts.meta_csum_type); + SET_BCH_SB_META_CSUM_TYPE(sb, opts.meta_csum_type); + SET_BCH_SB_DATA_CSUM_TYPE(sb, opts.data_csum_type); + SET_BCH_SB_COMPRESSION_TYPE(sb, opts.compression_type); - SET_BCH_SB_BTREE_NODE_SIZE(sb, btree_node_size); + SET_BCH_SB_BTREE_NODE_SIZE(sb, opts.btree_node_size); SET_BCH_SB_GC_RESERVE(sb, 8); - SET_BCH_SB_META_REPLICAS_WANT(sb, meta_replicas); - SET_BCH_SB_META_REPLICAS_HAVE(sb, meta_replicas); - SET_BCH_SB_DATA_REPLICAS_WANT(sb, data_replicas); - SET_BCH_SB_DATA_REPLICAS_HAVE(sb, data_replicas); - SET_BCH_SB_ERROR_ACTION(sb, on_error_action); + SET_BCH_SB_META_REPLICAS_WANT(sb, opts.meta_replicas); + SET_BCH_SB_META_REPLICAS_HAVE(sb, opts.meta_replicas); + SET_BCH_SB_DATA_REPLICAS_WANT(sb, opts.data_replicas); + SET_BCH_SB_DATA_REPLICAS_HAVE(sb, opts.data_replicas); + SET_BCH_SB_ERROR_ACTION(sb, opts.on_error_action); SET_BCH_SB_STR_HASH_TYPE(sb, BCH_STR_HASH_SIPHASH); - SET_BCH_SB_JOURNAL_ENTRY_SIZE(sb, ilog2(max_journal_entry_size)); + SET_BCH_SB_JOURNAL_ENTRY_SIZE(sb, ilog2(opts.max_journal_entry_size)); struct timespec now; if (clock_gettime(CLOCK_REALTIME, &now)) @@ -172,7 +184,7 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs, sb->time_base_lo = cpu_to_le64(now.tv_sec * NSEC_PER_SEC + now.tv_nsec); sb->time_precision = cpu_to_le32(1); - if (passphrase) { + if (opts.encrypted) { struct bch_sb_field_crypt *crypt = vstruct_end(sb); u64s = sizeof(struct bch_sb_field_crypt) / sizeof(u64); @@ -181,7 +193,7 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs, crypt->field.u64s = cpu_to_le32(u64s); crypt->field.type = BCH_SB_FIELD_crypt; - bch_sb_crypt_init(sb, crypt, passphrase); + bch_sb_crypt_init(sb, crypt, opts.passphrase); SET_BCH_SB_ENCRYPTION_TYPE(sb, 1); } @@ -198,7 +210,7 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs, uuid_generate(m->uuid.b); m->nbuckets = cpu_to_le64(i->nbuckets); - m->first_bucket = cpu_to_le16(i->first_bucket); + m->first_bucket = 0; m->bucket_size = cpu_to_le16(i->bucket_size); SET_BCH_MEMBER_TIER(m, i->tier); @@ -209,42 +221,49 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs, for (i = devs; i < devs + nr_devs; i++) { sb->dev_idx = i - devs; - static const char zeroes[BCH_SB_SECTOR << 9]; - struct nonce nonce = { 0 }; + init_layout(&sb->layout, opts.block_size, + i->sb_offset, i->sb_end); - /* Zero start of disk */ - xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0); + if (i->sb_offset == BCH_SB_SECTOR) { + /* Zero start of disk */ + static const char zeroes[BCH_SB_SECTOR << 9]; - xpwrite(i->fd, &sb->layout, sizeof(sb->layout), - BCH_SB_LAYOUT_SECTOR << 9); - - for (j = 0; j < sb->layout.nr_superblocks; j++) { - sb->offset = sb->layout.sb_offset[j]; - - sb->csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb), - nonce, sb); - xpwrite(i->fd, sb, vstruct_bytes(sb), - le64_to_cpu(sb->offset) << 9); + xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0); } - fsync(i->fd); + bcache_super_write(i->fd, sb); 
close(i->fd); } - bcache_super_print(sb, HUMAN_READABLE); - - free(sb); + return sb; } -struct bch_sb *bcache_super_read(const char *path) +void bcache_super_write(int fd, struct bch_sb *sb) +{ + struct nonce nonce = { 0 }; + + for (unsigned i = 0; i < sb->layout.nr_superblocks; i++) { + sb->offset = sb->layout.sb_offset[i]; + + if (sb->offset == BCH_SB_SECTOR) { + /* Write backup layout */ + xpwrite(fd, &sb->layout, sizeof(sb->layout), + BCH_SB_LAYOUT_SECTOR << 9); + } + + sb->csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb), nonce, sb); + xpwrite(fd, sb, vstruct_bytes(sb), + le64_to_cpu(sb->offset) << 9); + } + + fsync(fd); +} + +struct bch_sb *__bcache_super_read(int fd, u64 sector) { struct bch_sb sb, *ret; - int fd = open(path, O_RDONLY); - if (fd < 0) - die("couldn't open %s", path); - - xpread(fd, &sb, sizeof(sb), BCH_SB_SECTOR << 9); + xpread(fd, &sb, sizeof(sb), sector << 9); if (memcmp(&sb.magic, &BCACHE_MAGIC, sizeof(sb.magic))) die("not a bcache superblock"); @@ -253,11 +272,19 @@ struct bch_sb *bcache_super_read(const char *path) ret = malloc(bytes); - xpread(fd, ret, bytes, BCH_SB_SECTOR << 9); + xpread(fd, ret, bytes, sector << 9); return ret; } +struct bch_sb *bcache_super_read(const char *path) +{ + int fd = xopen(path, O_RDONLY); + struct bch_sb *sb = __bcache_super_read(fd, BCH_SB_SECTOR); + close(fd); + return sb; +} + void bcache_super_print(struct bch_sb *sb, int units) { struct bch_sb_field_members *mi; diff --git a/libbcache.h b/libbcache.h index 6ec3f42d..779b4708 100644 --- a/libbcache.h +++ b/libbcache.h @@ -1,6 +1,7 @@ #ifndef _LIBBCACHE_H #define _LIBBCACHE_H +#include <linux/bcache.h> #include <linux/uuid.h> #include "tools-util.h" #include "vstructs.h" @@ -18,32 +19,56 @@ enum fsck_err_opts { extern enum fsck_err_opts fsck_err_opt; +struct format_opts { + char *label; + uuid_le uuid; + + unsigned on_error_action; + unsigned max_journal_entry_size; /* will be removed */ + + unsigned block_size; + unsigned btree_node_size; + + unsigned meta_replicas; + unsigned data_replicas; + + unsigned meta_csum_type; + unsigned data_csum_type; + unsigned compression_type; + + bool encrypted; + char *passphrase; +}; + +static inline struct format_opts format_opts_default() +{ + return (struct format_opts) { + .on_error_action = BCH_ON_ERROR_RO, + .meta_csum_type = BCH_CSUM_CRC32C, + .data_csum_type = BCH_CSUM_CRC32C, + .meta_replicas = 1, + .data_replicas = 1, + }; +} + struct dev_opts { int fd; - const char *path; + char *path; u64 size; /* 512 byte sectors */ unsigned bucket_size; unsigned tier; bool discard; - u64 first_bucket; u64 nbuckets; + + u64 sb_offset; + u64 sb_end; }; -void bcache_format(struct dev_opts *devs, size_t nr_devs, - unsigned block_size, - unsigned btree_node_size, - unsigned meta_csum_type, - unsigned data_csum_type, - unsigned compression_type, - const char *passphrase, - unsigned meta_replicas, - unsigned data_replicas, - unsigned on_error_action, - unsigned max_journal_entry_size, - char *label, - uuid_le uuid); +struct bch_sb *bcache_format(struct format_opts, struct dev_opts *, size_t); +void bcache_super_write(int, struct bch_sb *); +struct bch_sb *__bcache_super_read(int, u64); struct bch_sb *bcache_super_read(const char *); void bcache_super_print(struct bch_sb *, int); diff --git a/libbcache/alloc.c b/libbcache/alloc.c index 8cb31944..93f0c2f1 100644 --- a/libbcache/alloc.c +++ b/libbcache/alloc.c @@ -73,7 +73,6 @@ #include <linux/rcupdate.h> #include <trace/events/bcache.h> -static size_t bch_bucket_alloc(struct cache *, enum alloc_reserve); 
static void __bch_bucket_free(struct cache *, struct bucket *); /* Allocation groups: */ @@ -84,12 +83,12 @@ void bch_dev_group_remove(struct cache_group *grp, struct cache *ca) spin_lock(&grp->lock); - for (i = 0; i < grp->nr_devices; i++) + for (i = 0; i < grp->nr; i++) if (rcu_access_pointer(grp->d[i].dev) == ca) { - grp->nr_devices--; + grp->nr--; memmove(&grp->d[i], &grp->d[i + 1], - (grp->nr_devices - i) * sizeof(grp->d[0])); + (grp->nr- i) * sizeof(grp->d[0])); break; } @@ -101,13 +100,13 @@ void bch_dev_group_add(struct cache_group *grp, struct cache *ca) unsigned i; spin_lock(&grp->lock); - for (i = 0; i < grp->nr_devices; i++) + for (i = 0; i < grp->nr; i++) if (rcu_access_pointer(grp->d[i].dev) == ca) goto out; - BUG_ON(grp->nr_devices >= BCH_SB_MEMBERS_MAX); + BUG_ON(grp->nr>= BCH_SB_MEMBERS_MAX); - rcu_assign_pointer(grp->d[grp->nr_devices++].dev, ca); + rcu_assign_pointer(grp->d[grp->nr++].dev, ca); out: spin_unlock(&grp->lock); } @@ -120,25 +119,32 @@ static void pd_controllers_update(struct work_struct *work) struct cache_set, pd_controllers_update); struct cache *ca; - unsigned iter; - int i; + unsigned i, iter; /* All units are in bytes */ - u64 tier_size[BCH_TIER_MAX]; - u64 tier_free[BCH_TIER_MAX]; - u64 tier_dirty[BCH_TIER_MAX]; - u64 tier0_can_free = 0; + u64 faster_tiers_size = 0; + u64 faster_tiers_dirty = 0; - memset(tier_size, 0, sizeof(tier_size)); - memset(tier_free, 0, sizeof(tier_free)); - memset(tier_dirty, 0, sizeof(tier_dirty)); + u64 fastest_tier_size = 0; + u64 fastest_tier_free = 0; + u64 copygc_can_free = 0; rcu_read_lock(); - for (i = BCH_TIER_MAX - 1; i >= 0; --i) - group_for_each_cache_rcu(ca, &c->cache_tiers[i], iter) { + for (i = 0; i < ARRAY_SIZE(c->tiers); i++) { + bch_pd_controller_update(&c->tiers[i].pd, + div_u64(faster_tiers_size * + c->tiering_percent, 100), + faster_tiers_dirty, + -1); + + group_for_each_cache_rcu(ca, &c->tiers[i].devs, iter) { struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca); unsigned bucket_bits = ca->bucket_bits + 9; + u64 size = (ca->mi.nbuckets - + ca->mi.first_bucket) << bucket_bits; + u64 dirty = stats.buckets_dirty << bucket_bits; + u64 free = __buckets_free_cache(ca, stats) << bucket_bits; /* * Bytes of internal fragmentation, which can be * reclaimed by copy GC @@ -149,41 +155,30 @@ static void pd_controllers_update(struct work_struct *work) ((stats.sectors_dirty + stats.sectors_cached) << 9); - u64 dev_size = (ca->mi.nbuckets - - ca->mi.first_bucket) << bucket_bits; - - u64 free = __buckets_free_cache(ca, stats) << bucket_bits; - if (fragmented < 0) fragmented = 0; bch_pd_controller_update(&ca->moving_gc_pd, free, fragmented, -1); - if (i == 0) - tier0_can_free += fragmented; + faster_tiers_size += size; + faster_tiers_dirty += dirty; - tier_size[i] += dev_size; - tier_free[i] += free; - tier_dirty[i] += stats.buckets_dirty << bucket_bits; + if (!c->fastest_tier || + c->fastest_tier == &c->tiers[i]) { + fastest_tier_size += size; + fastest_tier_free += free; + } + + copygc_can_free += fragmented; } - rcu_read_unlock(); - - if (tier_size[1]) { - u64 target = div_u64(tier_size[0] * c->tiering_percent, 100); - - tier0_can_free = max_t(s64, 0, tier_dirty[0] - target); - - bch_pd_controller_update(&c->tiering_pd, - target, - tier_dirty[0], - -1); } + rcu_read_unlock(); + /* * Throttle foreground writes if tier 0 is running out of free buckets, - * and either tiering or copygc can free up space (but don't take both - * into account). + * and either tiering or copygc can free up space. 
* * Target will be small if there isn't any work to do - we don't want to * throttle foreground writes if we currently have all the free space @@ -192,12 +187,15 @@ static void pd_controllers_update(struct work_struct *work) * Otherwise, if there's work to do, try to keep 20% of tier0 available * for foreground writes. */ + if (c->fastest_tier) + copygc_can_free = U64_MAX; + bch_pd_controller_update(&c->foreground_write_pd, - min(tier0_can_free, - div_u64(tier_size[0] * + min(copygc_can_free, + div_u64(fastest_tier_size * c->foreground_target_percent, 100)), - tier_free[0], + fastest_tier_free, -1); schedule_delayed_work(&c->pd_controllers_update, @@ -301,7 +299,8 @@ static int bch_prio_write(struct cache *ca) * it getting gc'd from under us */ ca->prio_buckets[i] = r; - bch_mark_metadata_bucket(ca, ca->buckets + r, false); + bch_mark_metadata_bucket(ca, ca->buckets + r, + BUCKET_PRIOS, false); spin_unlock(&ca->prio_buckets_lock); SET_PSET_CSUM_TYPE(p, bch_meta_checksum_type(c)); @@ -334,6 +333,9 @@ static int bch_prio_write(struct cache *ca) do { unsigned u64s = jset_u64s(0); + if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) + break; + ret = bch_journal_res_get(j, &res, u64s, u64s); if (ret) return ret; @@ -815,8 +817,7 @@ static void bch_find_empty_buckets(struct cache_set *c, struct cache *ca) if (is_available_bucket(m) && !m.cached_sectors && !m.had_metadata && - (!m.wait_on_journal || - ((s16) last_seq_ondisk - (s16) m.journal_seq >= 0))) { + !bucket_needs_journal_commit(m, last_seq_ondisk)) { spin_lock(&ca->freelist_lock); bch_mark_alloc_bucket(ca, g, true); @@ -850,6 +851,8 @@ static int bch_allocator_thread(void *arg) set_freezable(); + bch_find_empty_buckets(c, ca); + while (1) { /* * First, we pull buckets off of the free_inc list, possibly @@ -894,7 +897,7 @@ static int bch_allocator_thread(void *arg) * See if we have buckets we can reuse without invalidating them * or forcing a journal commit: */ - bch_find_empty_buckets(c, ca); + //bch_find_empty_buckets(c, ca); if (fifo_used(&ca->free_inc) * 2 > ca->free_inc.size) { up_read(&c->gc_lock); @@ -967,7 +970,7 @@ out: * * Returns index of bucket on success, 0 on failure * */ -static size_t bch_bucket_alloc(struct cache *ca, enum alloc_reserve reserve) +size_t bch_bucket_alloc(struct cache *ca, enum alloc_reserve reserve) { struct bucket *g; long r; @@ -1018,21 +1021,21 @@ static void recalc_alloc_group_weights(struct cache_set *c, u64 available_buckets = 1; /* avoid a divide by zero... 
*/ unsigned i; - for (i = 0; i < devs->nr_devices; i++) { + for (i = 0; i < devs->nr; i++) { ca = devs->d[i].dev; devs->d[i].weight = buckets_free_cache(ca); available_buckets += devs->d[i].weight; } - for (i = 0; i < devs->nr_devices; i++) { + for (i = 0; i < devs->nr; i++) { const unsigned min_weight = U32_MAX >> 4; const unsigned max_weight = U32_MAX; devs->d[i].weight = min_weight + div64_u64(devs->d[i].weight * - devs->nr_devices * + devs->nr * (max_weight - min_weight), available_buckets); devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight); @@ -1058,7 +1061,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c, rcu_read_lock(); spin_lock(&devs->lock); - for (i = 0; i < devs->nr_devices; i++) + for (i = 0; i < devs->nr; i++) available += !test_bit(devs->d[i].dev->dev_idx, caches_used); @@ -1076,7 +1079,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c, } i++; - i %= devs->nr_devices; + i %= devs->nr; ret = FREELIST_EMPTY; if (i == fail_idx) @@ -1136,20 +1139,25 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c, enum alloc_reserve reserve, long *caches_used) { + struct bch_tier *tier; /* * this should implement policy - for a given type of allocation, decide * which devices to allocate from: * * XXX: switch off wp->type and do something more intelligent here */ + if (wp->group) + return bch_bucket_alloc_group(c, ob, reserve, nr_replicas, + wp->group, caches_used); - /* foreground writes: prefer tier 0: */ - if (wp->group == &c->cache_all) + /* foreground writes: prefer fastest tier: */ + tier = READ_ONCE(c->fastest_tier); + if (tier) bch_bucket_alloc_group(c, ob, reserve, nr_replicas, - &c->cache_tiers[0], caches_used); + &tier->devs, caches_used); return bch_bucket_alloc_group(c, ob, reserve, nr_replicas, - wp->group, caches_used); + &c->cache_all, caches_used); } static int bch_bucket_alloc_set(struct cache_set *c, struct write_point *wp, @@ -1413,7 +1421,6 @@ struct open_bucket *bch_alloc_sectors_start(struct cache_set *c, ? 
0 : BTREE_NODE_RESERVE; int ret; - BUG_ON(!wp->group); BUG_ON(!reserve); BUG_ON(!nr_replicas); retry: @@ -1481,7 +1488,7 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e, unsigned nr_replicas, struct open_bucket *ob, unsigned sectors) { - struct bch_extent_ptr tmp, *ptr; + struct bch_extent_ptr tmp; struct cache *ca; bool has_data = false; unsigned i; @@ -1501,6 +1508,8 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e, if (nr_replicas < ob->nr_ptrs) has_data = true; + rcu_read_lock(); + for (i = 0; i < nr_replicas; i++) { EBUG_ON(bch_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev)); @@ -1510,10 +1519,12 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e, extent_ptr_append(e, tmp); ob->ptr_offset[i] += sectors; + + if ((ca = PTR_CACHE(c, &ob->ptrs[i]))) + this_cpu_add(*ca->sectors_written, sectors); } - open_bucket_for_each_online_device(c, ob, ptr, ca) - this_cpu_add(*ca->sectors_written, sectors); + rcu_read_unlock(); } /* @@ -1586,9 +1597,9 @@ struct open_bucket *bch_alloc_sectors(struct cache_set *c, /* Startup/shutdown (ro/rw): */ -static void bch_recalc_capacity(struct cache_set *c) +void bch_recalc_capacity(struct cache_set *c) { - struct cache_group *tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers); + struct bch_tier *fastest_tier = NULL, *slowest_tier = NULL, *tier; struct cache *ca; u64 total_capacity, capacity = 0, reserved_sectors = 0; unsigned long ra_pages = 0; @@ -1604,16 +1615,29 @@ static void bch_recalc_capacity(struct cache_set *c) c->bdi.ra_pages = ra_pages; + /* Find fastest, slowest tiers with devices: */ + + for (tier = c->tiers; + tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) { + if (!tier->devs.nr) + continue; + if (!fastest_tier) + fastest_tier = tier; + slowest_tier = tier; + } + + c->fastest_tier = fastest_tier != slowest_tier ? fastest_tier : NULL; + + c->promote_write_point.group = &fastest_tier->devs; + + if (!fastest_tier) + goto set_capacity; + /* * Capacity of the cache set is the capacity of all the devices in the * slowest (highest) tier - we don't include lower tier devices. 
*/ - for (tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers) - 1; - tier > c->cache_tiers && !tier->nr_devices; - --tier) - ; - - group_for_each_cache_rcu(ca, tier, i) { + group_for_each_cache_rcu(ca, &slowest_tier->devs, i) { size_t reserve = 0; /* @@ -1649,8 +1673,8 @@ static void bch_recalc_capacity(struct cache_set *c) ca->mi.first_bucket) << ca->bucket_bits; } +set_capacity: rcu_read_unlock(); - total_capacity = capacity; capacity *= (100 - c->opts.gc_reserve_percent); @@ -1727,7 +1751,7 @@ static bool bch_dev_has_open_write_point(struct cache *ca) void bch_dev_allocator_stop(struct cache *ca) { struct cache_set *c = ca->set; - struct cache_group *tier = &c->cache_tiers[ca->mi.tier]; + struct cache_group *tier = &c->tiers[ca->mi.tier].devs; struct task_struct *p; struct closure cl; unsigned i; @@ -1808,7 +1832,7 @@ void bch_dev_allocator_stop(struct cache *ca) int bch_dev_allocator_start(struct cache *ca) { struct cache_set *c = ca->set; - struct cache_group *tier = &c->cache_tiers[ca->mi.tier]; + struct cache_group *tier = &c->tiers[ca->mi.tier].devs; struct task_struct *k; /* @@ -1826,6 +1850,7 @@ int bch_dev_allocator_start(struct cache *ca) bch_dev_group_add(tier, ca); bch_dev_group_add(&c->cache_all, ca); + bch_dev_group_add(&c->journal.devs, ca); bch_recalc_capacity(c); @@ -1838,7 +1863,7 @@ int bch_dev_allocator_start(struct cache *ca) return 0; } -void bch_open_buckets_init(struct cache_set *c) +void bch_fs_allocator_init(struct cache_set *c) { unsigned i; @@ -1860,19 +1885,11 @@ void bch_open_buckets_init(struct cache_set *c) spin_lock_init(&c->cache_all.lock); - for (i = 0; i < ARRAY_SIZE(c->write_points); i++) { + for (i = 0; i < ARRAY_SIZE(c->tiers); i++) + spin_lock_init(&c->tiers[i].devs.lock); + + for (i = 0; i < ARRAY_SIZE(c->write_points); i++) c->write_points[i].throttle = true; - c->write_points[i].group = &c->cache_tiers[0]; - } - - for (i = 0; i < ARRAY_SIZE(c->cache_tiers); i++) - spin_lock_init(&c->cache_tiers[i].lock); - - c->promote_write_point.group = &c->cache_tiers[0]; - - c->migration_write_point.group = &c->cache_all; - - c->btree_write_point.group = &c->cache_all; c->pd_controllers_update_seconds = 5; INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); diff --git a/libbcache/alloc.h b/libbcache/alloc.h index 09139a59..9573dd2c 100644 --- a/libbcache/alloc.h +++ b/libbcache/alloc.h @@ -27,6 +27,8 @@ int bch_prio_read(struct cache *); void bch_recalc_min_prio(struct cache *, int); +size_t bch_bucket_alloc(struct cache *, enum alloc_reserve); + void bch_open_bucket_put(struct cache_set *, struct open_bucket *); struct open_bucket *bch_alloc_sectors_start(struct cache_set *, @@ -58,7 +60,7 @@ static inline struct cache *cache_group_next_rcu(struct cache_group *devs, { struct cache *ret = NULL; - while (*iter < devs->nr_devices && + while (*iter < devs->nr && !(ret = rcu_dereference(devs->d[*iter].dev))) (*iter)++; @@ -103,8 +105,9 @@ static inline struct cache *cache_group_next(struct cache_group *devs, ((_ca) = __open_bucket_next_online_device(_c, _ob, _ptr, _ca));\ (_ptr)++) +void bch_recalc_capacity(struct cache_set *); void bch_dev_allocator_stop(struct cache *); int bch_dev_allocator_start(struct cache *); -void bch_open_buckets_init(struct cache_set *); +void bch_fs_allocator_init(struct cache_set *); #endif /* _BCACHE_ALLOC_H */ diff --git a/libbcache/alloc_types.h b/libbcache/alloc_types.h index fbe8b75c..f408bd97 100644 --- a/libbcache/alloc_types.h +++ b/libbcache/alloc_types.h @@ -51,7 +51,7 @@ static inline bool 
allocation_is_metadata(enum alloc_reserve id) struct cache_group { spinlock_t lock; - unsigned nr_devices; + unsigned nr; unsigned cur_device; struct { u64 weight; diff --git a/libbcache/bcache.h b/libbcache/bcache.h index babc08db..5b668c71 100644 --- a/libbcache/bcache.h +++ b/libbcache/bcache.h @@ -464,24 +464,10 @@ struct cache { * BCH_FS_UNREGISTERING means we're not just shutting down, we're detaching * all the backing devices first (their cached data gets invalidated, and they * won't automatically reattach). - * - * BCH_FS_STOPPING always gets set first when we're closing down a cache set; - * we'll continue to run normally for awhile with BCH_FS_STOPPING set (i.e. - * flushing dirty data). - * - * BCH_FS_RUNNING means all cache devices have been registered and journal - * replay is complete. */ enum { - /* Startup: */ BCH_FS_INITIAL_GC_DONE, - BCH_FS_RUNNING, - - /* Shutdown: */ BCH_FS_DETACHING, - BCH_FS_STOPPING, - BCH_FS_RO, - BCH_FS_RO_COMPLETE, BCH_FS_EMERGENCY_RO, BCH_FS_WRITE_DISABLE_COMPLETE, BCH_FS_GC_STOPPING, @@ -498,6 +484,21 @@ struct btree_debug { struct dentry *failed; }; +struct bch_tier { + unsigned idx; + struct task_struct *migrate; + struct bch_pd_controller pd; + + struct cache_group devs; +}; + +enum bch_fs_state { + BCH_FS_STARTING = 0, + BCH_FS_STOPPING, + BCH_FS_RO, + BCH_FS_RW, +}; + struct cache_set { struct closure cl; @@ -506,7 +507,6 @@ struct cache_set { struct kobject internal; struct kobject opts_dir; struct kobject time_stats; - struct completion *stop_completion; unsigned long flags; int minor; @@ -514,6 +514,10 @@ struct cache_set { struct super_block *vfs_sb; char name[40]; + /* ro/rw, add/remove devices: */ + struct mutex state_lock; + enum bch_fs_state state; + /* Counts outstanding writes, for clean transition to read-only */ struct percpu_ref writes; struct work_struct read_only_work; @@ -640,7 +644,9 @@ struct cache_set { * allocate from: */ struct cache_group cache_all; - struct cache_group cache_tiers[BCH_TIER_MAX]; + struct bch_tier tiers[BCH_TIER_MAX]; + /* NULL if we only have devices in one tier: */ + struct bch_tier *fastest_tier; u64 capacity; /* sectors */ @@ -753,10 +759,6 @@ struct cache_set { unsigned writeback_pages_max; atomic_long_t nr_inodes; - /* TIERING */ - struct task_struct *tiering_read; - struct bch_pd_controller tiering_pd; - /* NOTIFICATIONS */ struct mutex uevent_lock; struct kobj_uevent_env uevent_env; @@ -828,6 +830,11 @@ struct cache_set { #undef BCH_TIME_STAT }; +static inline bool bch_fs_running(struct cache_set *c) +{ + return c->state == BCH_FS_RO || c->state == BCH_FS_RW; +} + static inline unsigned bucket_pages(const struct cache *ca) { return ca->mi.bucket_size / PAGE_SECTORS; diff --git a/libbcache/blockdev.c b/libbcache/blockdev.c index 82b07f59..ba2e9a8c 100644 --- a/libbcache/blockdev.c +++ b/libbcache/blockdev.c @@ -375,6 +375,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) bool found; int ret; + lockdep_assert_held(&c->state_lock); + bdevname(dc->disk_sb.bdev, buf); if (memcmp(&dc->disk_sb.sb->set_uuid, @@ -387,11 +389,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) return -EINVAL; } - if (!test_bit(BCH_FS_RUNNING, &c->flags)) - return 0; - - if (test_bit(BCH_FS_STOPPING, &c->flags)) { - pr_err("Can't attach %s: shutting down", buf); + if (!bch_fs_running(c)) { + pr_err("Can't attach %s: not running", buf); return -EINVAL; } @@ -497,6 +496,7 @@ void bch_attach_backing_devs(struct cache_set *c) struct cached_dev *dc, *t; 
lockdep_assert_held(&bch_register_lock); + lockdep_assert_held(&c->state_lock); list_for_each_entry_safe(dc, t, &uncached_devices, list) bch_cached_dev_attach(dc, c); @@ -742,7 +742,7 @@ int bch_blockdev_volumes_start(struct cache_set *c) struct bkey_s_c_inode_blockdev inode; int ret = 0; - if (test_bit(BCH_FS_STOPPING, &c->flags)) + if (!bch_fs_running(c)) return -EINVAL; for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, k) { diff --git a/libbcache/btree_cache.c b/libbcache/btree_cache.c index 4d5efdbd..4d0c6d4d 100644 --- a/libbcache/btree_cache.c +++ b/libbcache/btree_cache.c @@ -11,8 +11,9 @@ #define DEF_BTREE_ID(kwd, val, name) name, -const char *bch_btree_id_names[BTREE_ID_NR] = { +const char * const bch_btree_ids[] = { DEFINE_BCH_BTREE_IDS() + NULL }; #undef DEF_BTREE_ID @@ -311,7 +312,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink, return mca_can_free(c) * btree_pages(c); } -void bch_btree_cache_free(struct cache_set *c) +void bch_fs_btree_exit(struct cache_set *c) { struct btree *b; unsigned i; @@ -358,7 +359,7 @@ void bch_btree_cache_free(struct cache_set *c) rhashtable_destroy(&c->btree_cache_table); } -int bch_btree_cache_alloc(struct cache_set *c) +int bch_fs_btree_init(struct cache_set *c) { unsigned i; int ret; diff --git a/libbcache/btree_cache.h b/libbcache/btree_cache.h index c26489d1..4d67704b 100644 --- a/libbcache/btree_cache.h +++ b/libbcache/btree_cache.h @@ -6,7 +6,7 @@ struct btree_iter; -extern const char *bch_btree_id_names[BTREE_ID_NR]; +extern const char * const bch_btree_ids[]; void bch_recalc_btree_reserve(struct cache_set *); @@ -22,8 +22,8 @@ struct btree *mca_alloc(struct cache_set *); struct btree *bch_btree_node_get(struct btree_iter *, const struct bkey_i *, unsigned, enum six_lock_type); -void bch_btree_cache_free(struct cache_set *); -int bch_btree_cache_alloc(struct cache_set *); +void bch_fs_btree_exit(struct cache_set *); +int bch_fs_btree_init(struct cache_set *); #define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ for ((_tbl) = rht_dereference_rcu((_c)->btree_cache_table.tbl, \ diff --git a/libbcache/btree_gc.c b/libbcache/btree_gc.c index 0eb7290c..b90807f7 100644 --- a/libbcache/btree_gc.c +++ b/libbcache/btree_gc.c @@ -262,30 +262,72 @@ static void bch_mark_allocator_buckets(struct cache_set *c) } } +static void mark_metadata_sectors(struct cache *ca, u64 start, u64 end, + enum bucket_data_type type) +{ + u64 b = start >> ca->bucket_bits; + + do { + bch_mark_metadata_bucket(ca, ca->buckets + b, type, true); + b++; + } while (b < end >> ca->bucket_bits); +} + /* * Mark non btree metadata - prios, journal */ +static void bch_mark_dev_metadata(struct cache_set *c, struct cache *ca) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + unsigned i; + u64 b; + + /* Mark superblocks: */ + for (i = 0; i < layout->nr_superblocks; i++) { + if (layout->sb_offset[i] == BCH_SB_SECTOR) + mark_metadata_sectors(ca, 0, BCH_SB_SECTOR, + BUCKET_SB); + + mark_metadata_sectors(ca, + layout->sb_offset[i], + layout->sb_offset[i] + + (1 << layout->sb_max_size_bits), + BUCKET_SB); + } + + spin_lock(&c->journal.lock); + + for (i = 0; i < ca->journal.nr; i++) { + b = ca->journal.buckets[i]; + bch_mark_metadata_bucket(ca, ca->buckets + b, + BUCKET_JOURNAL, true); + } + + spin_unlock(&c->journal.lock); + + spin_lock(&ca->prio_buckets_lock); + + for (i = 0; i < prio_buckets(ca) * 2; i++) { + b = ca->prio_buckets[i]; + if (b) + bch_mark_metadata_bucket(ca, ca->buckets + b, + BUCKET_PRIOS, true); + } + + 
spin_unlock(&ca->prio_buckets_lock); +} + static void bch_mark_metadata(struct cache_set *c) { struct cache *ca; - unsigned i, j; - u64 b; + unsigned i; - for_each_cache(ca, c, i) { - for (j = 0; j < ca->journal.nr; j++) { - b = ca->journal.buckets[j]; - bch_mark_metadata_bucket(ca, ca->buckets + b, true); - } + mutex_lock(&c->sb_lock); - spin_lock(&ca->prio_buckets_lock); + for_each_cache(ca, c, i) + bch_mark_dev_metadata(c, ca); - for (j = 0; j < prio_buckets(ca) * 2; j++) { - b = ca->prio_buckets[j]; - bch_mark_metadata_bucket(ca, ca->buckets + b, true); - } - - spin_unlock(&ca->prio_buckets_lock); - } + mutex_unlock(&c->sb_lock); } /* Also see bch_pending_btree_node_free_insert_done() */ @@ -389,7 +431,7 @@ void bch_gc(struct cache_set *c) for_each_bucket(g, ca) { bucket_cmpxchg(g, new, ({ new.owned_by_allocator = 0; - new.is_metadata = 0; + new.data_type = 0; new.cached_sectors = 0; new.dirty_sectors = 0; })); @@ -750,9 +792,6 @@ void bch_coalesce(struct cache_set *c) u64 start_time; enum btree_id id; - if (btree_gc_coalesce_disabled(c)) - return; - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) return; @@ -811,7 +850,8 @@ static int bch_gc_thread(void *arg) last_kick = atomic_read(&c->kick_gc); bch_gc(c); - bch_coalesce(c); + if (!btree_gc_coalesce_disabled(c)) + bch_coalesce(c); debug_check_no_locks_held(); } @@ -823,18 +863,24 @@ void bch_gc_thread_stop(struct cache_set *c) { set_bit(BCH_FS_GC_STOPPING, &c->flags); - if (!IS_ERR_OR_NULL(c->gc_thread)) + if (c->gc_thread) kthread_stop(c->gc_thread); + + c->gc_thread = NULL; + clear_bit(BCH_FS_GC_STOPPING, &c->flags); } int bch_gc_thread_start(struct cache_set *c) { - clear_bit(BCH_FS_GC_STOPPING, &c->flags); + struct task_struct *p; - c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc"); - if (IS_ERR(c->gc_thread)) - return PTR_ERR(c->gc_thread); + BUG_ON(c->gc_thread); + p = kthread_create(bch_gc_thread, c, "bcache_gc"); + if (IS_ERR(p)) + return PTR_ERR(p); + + c->gc_thread = p; wake_up_process(c->gc_thread); return 0; } @@ -883,12 +929,13 @@ int bch_initial_gc(struct cache_set *c, struct list_head *journal) { enum btree_id id; - if (journal) { - for (id = 0; id < BTREE_ID_NR; id++) - bch_initial_gc_btree(c, id); + bch_mark_metadata(c); + for (id = 0; id < BTREE_ID_NR; id++) + bch_initial_gc_btree(c, id); + + if (journal) bch_journal_mark(c, journal); - } /* * Skip past versions that might have possibly been used (as nonces), @@ -897,8 +944,6 @@ int bch_initial_gc(struct cache_set *c, struct list_head *journal) if (c->sb.encryption_type) atomic64_add(1 << 16, &c->key_version); - bch_mark_metadata(c); - gc_pos_set(c, gc_phase(GC_PHASE_DONE)); set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); diff --git a/libbcache/buckets.c b/libbcache/buckets.c index 315cfbec..ec4ee54a 100644 --- a/libbcache/buckets.c +++ b/libbcache/buckets.c @@ -66,6 +66,7 @@ #include "alloc.h" #include "btree_gc.h" #include "buckets.h" +#include "error.h" #include <linux/preempt.h> #include <trace/events/bcache.h> @@ -102,6 +103,10 @@ static void bch_fs_stats_verify(struct cache_set *c) {} #endif +/* + * Clear journal_seq_valid for buckets for which it's not needed, to prevent + * wraparound: + */ void bch_bucket_seq_cleanup(struct cache_set *c) { u16 last_seq_ondisk = c->journal.last_seq_ondisk; @@ -113,12 +118,11 @@ void bch_bucket_seq_cleanup(struct cache_set *c) for_each_cache(ca, c, i) for_each_bucket(g, ca) { bucket_cmpxchg(g, m, ({ - if (!m.wait_on_journal || - ((s16) last_seq_ondisk - - (s16) m.journal_seq < 0)) + if (!m.journal_seq_valid || + 
bucket_needs_journal_commit(m, last_seq_ondisk)) break; - m.wait_on_journal = 0; + m.journal_seq_valid = 0; })); } } @@ -186,17 +190,18 @@ bch_bucket_stats_read_cache_set(struct cache_set *c) static inline int is_meta_bucket(struct bucket_mark m) { - return !m.owned_by_allocator && m.is_metadata; + return m.data_type != BUCKET_DATA; } static inline int is_dirty_bucket(struct bucket_mark m) { - return !m.owned_by_allocator && !m.is_metadata && !!m.dirty_sectors; + return m.data_type == BUCKET_DATA && !!m.dirty_sectors; } static inline int is_cached_bucket(struct bucket_mark m) { - return !m.owned_by_allocator && !m.dirty_sectors && !!m.cached_sectors; + return m.data_type == BUCKET_DATA && + !m.dirty_sectors && !!m.cached_sectors; } void bch_fs_stats_apply(struct cache_set *c, @@ -236,29 +241,37 @@ void bch_fs_stats_apply(struct cache_set *c, memset(stats, 0, sizeof(*stats)); } +static bool bucket_became_unavailable(struct cache_set *c, + struct bucket_mark old, + struct bucket_mark new) +{ + return is_available_bucket(old) && + !is_available_bucket(new) && + c->gc_pos.phase == GC_PHASE_DONE; +} + static void bucket_stats_update(struct cache *ca, struct bucket_mark old, struct bucket_mark new, - bool may_make_unavailable, struct bucket_stats_cache_set *bch_alloc_stats) { struct cache_set *c = ca->set; struct bucket_stats_cache *cache_stats; - BUG_ON(!may_make_unavailable && - is_available_bucket(old) && - !is_available_bucket(new) && - c->gc_pos.phase == GC_PHASE_DONE); + bch_fs_inconsistent_on(old.data_type && new.data_type && + old.data_type != new.data_type, c, + "different types of metadata in same bucket: %u, %u", + old.data_type, new.data_type); if (bch_alloc_stats) { bch_alloc_stats->s[S_COMPRESSED][S_CACHED] += (int) new.cached_sectors - (int) old.cached_sectors; bch_alloc_stats->s[S_COMPRESSED] - [old.is_metadata ? S_META : S_DIRTY] -= + [is_meta_bucket(old) ? S_META : S_DIRTY] -= old.dirty_sectors; bch_alloc_stats->s[S_COMPRESSED] - [new.is_metadata ? S_META : S_DIRTY] += + [is_meta_bucket(new) ? 
S_META : S_DIRTY] += new.dirty_sectors; } @@ -268,12 +281,12 @@ static void bucket_stats_update(struct cache *ca, cache_stats->sectors_cached += (int) new.cached_sectors - (int) old.cached_sectors; - if (old.is_metadata) + if (is_meta_bucket(old)) cache_stats->sectors_meta -= old.dirty_sectors; else cache_stats->sectors_dirty -= old.dirty_sectors; - if (new.is_metadata) + if (is_meta_bucket(new)) cache_stats->sectors_meta += new.dirty_sectors; else cache_stats->sectors_dirty += new.dirty_sectors; @@ -290,6 +303,15 @@ static void bucket_stats_update(struct cache *ca, bch_wake_allocator(ca); } +#define bucket_data_cmpxchg(ca, g, new, expr) \ +({ \ + struct bucket_stats_cache_set _stats = { 0 }; \ + struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \ + \ + bucket_stats_update(ca, _old, new, &_stats); \ + _old; \ +}) + void bch_invalidate_bucket(struct cache *ca, struct bucket *g) { struct bucket_stats_cache_set stats = { 0 }; @@ -297,16 +319,17 @@ void bch_invalidate_bucket(struct cache *ca, struct bucket *g) old = bucket_cmpxchg(g, new, ({ new.owned_by_allocator = 1; - new.is_metadata = 0; + new.had_metadata = 0; + new.data_type = 0; new.cached_sectors = 0; new.dirty_sectors = 0; new.copygc = 0; new.gen++; })); - BUG_ON(old.dirty_sectors); + bucket_stats_update(ca, old, new, &stats); - bucket_stats_update(ca, old, new, true, &stats); + BUG_ON(old.dirty_sectors); /* * Ick: @@ -329,45 +352,45 @@ void bch_invalidate_bucket(struct cache *ca, struct bucket *g) void bch_mark_free_bucket(struct cache *ca, struct bucket *g) { - struct bucket_stats_cache_set stats = { 0 }; struct bucket_mark old, new; - old = bucket_cmpxchg(g, new, ({ + old = bucket_data_cmpxchg(ca, g, new, ({ new.owned_by_allocator = 0; - new.is_metadata = 0; + new.data_type = 0; new.cached_sectors = 0; new.dirty_sectors = 0; })); - bucket_stats_update(ca, old, new, false, &stats); + BUG_ON(bucket_became_unavailable(ca->set, old, new)); } void bch_mark_alloc_bucket(struct cache *ca, struct bucket *g, bool owned_by_allocator) { - struct bucket_stats_cache_set stats = { 0 }; - struct bucket_mark old, new; + struct bucket_mark new; - old = bucket_cmpxchg(g, new, new.owned_by_allocator = owned_by_allocator); - - bucket_stats_update(ca, old, new, true, &stats); + bucket_data_cmpxchg(ca, g, new, ({ + new.owned_by_allocator = owned_by_allocator; + })); } void bch_mark_metadata_bucket(struct cache *ca, struct bucket *g, + enum bucket_data_type type, bool may_make_unavailable) { - struct bucket_stats_cache_set stats = { 0 }; struct bucket_mark old, new; - old = bucket_cmpxchg(g, new, ({ - new.is_metadata = 1; + BUG_ON(!type); + + old = bucket_data_cmpxchg(ca, g, new, ({ + new.data_type = type; new.had_metadata = 1; })); BUG_ON(old.cached_sectors); BUG_ON(old.dirty_sectors); - - bucket_stats_update(ca, old, new, may_make_unavailable, &stats); + BUG_ON(!may_make_unavailable && + bucket_became_unavailable(ca->set, old, new)); } #define saturated_add(ca, dst, src, max) \ @@ -487,22 +510,26 @@ static void bch_mark_pointer(struct cache_set *c, if (!new.dirty_sectors && !new.cached_sectors) { - new.is_metadata = false; + new.data_type = 0; if (journal_seq) { - new.wait_on_journal = true; + new.journal_seq_valid = 1; new.journal_seq = journal_seq; } } else { - new.is_metadata = (type == S_META); + new.data_type = type == S_META + ? 
BUCKET_BTREE : BUCKET_DATA; } - new.had_metadata |= new.is_metadata; + new.had_metadata |= is_meta_bucket(new); } while ((v = cmpxchg(&g->_mark.counter, old.counter, new.counter)) != old.counter); - bucket_stats_update(ca, old, new, may_make_unavailable, NULL); + bucket_stats_update(ca, old, new, NULL); + + BUG_ON(!may_make_unavailable && + bucket_became_unavailable(c, old, new)); if (saturated && atomic_long_add_return(saturated, diff --git a/libbcache/buckets.h b/libbcache/buckets.h index 9c6e4385..6d70103e 100644 --- a/libbcache/buckets.h +++ b/libbcache/buckets.h @@ -235,8 +235,16 @@ static inline u64 sectors_available(struct cache_set *c) static inline bool is_available_bucket(struct bucket_mark mark) { return (!mark.owned_by_allocator && - !mark.is_metadata && - !mark.dirty_sectors); + mark.data_type == BUCKET_DATA && + !mark.dirty_sectors && + !mark.nouse); +} + +static inline bool bucket_needs_journal_commit(struct bucket_mark m, + u16 last_seq_ondisk) +{ + return m.journal_seq_valid && + ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); } void bch_bucket_seq_cleanup(struct cache_set *); @@ -244,7 +252,8 @@ void bch_bucket_seq_cleanup(struct cache_set *); void bch_invalidate_bucket(struct cache *, struct bucket *); void bch_mark_free_bucket(struct cache *, struct bucket *); void bch_mark_alloc_bucket(struct cache *, struct bucket *, bool); -void bch_mark_metadata_bucket(struct cache *, struct bucket *, bool); +void bch_mark_metadata_bucket(struct cache *, struct bucket *, + enum bucket_data_type, bool); void __bch_gc_mark_key(struct cache_set *, struct bkey_s_c, s64, bool, struct bucket_stats_cache_set *); diff --git a/libbcache/buckets_types.h b/libbcache/buckets_types.h index 6bbdcd26..f42e09d8 100644 --- a/libbcache/buckets_types.h +++ b/libbcache/buckets_types.h @@ -1,6 +1,14 @@ #ifndef _BUCKETS_TYPES_H #define _BUCKETS_TYPES_H +enum bucket_data_type { + BUCKET_DATA = 0, + BUCKET_BTREE, + BUCKET_PRIOS, + BUCKET_JOURNAL, + BUCKET_SB, +}; + struct bucket_mark { union { struct { @@ -12,23 +20,30 @@ struct bucket_mark { /* generation copygc is going to move this bucket into */ unsigned copygc:1; - unsigned wait_on_journal:1; + + unsigned journal_seq_valid:1; /* - * If this bucket ever had metadata in it, the allocator must - * increment its gen before we reuse it: + * If this bucket had metadata while at the current generation + * number, the allocator must increment its gen before we reuse + * it: */ unsigned had_metadata:1; unsigned owned_by_allocator:1; - unsigned is_metadata:1; - u16 cached_sectors; + unsigned data_type:3; + + unsigned nouse:1; + u16 dirty_sectors; + u16 cached_sectors; /* * low bits of journal sequence number when this bucket was most - * recently modified: + * recently modified: if journal_seq_valid is set, this bucket + * can't be reused until the journal sequence number written to + * disk is >= the bucket's journal sequence number: */ u16 journal_seq; }; diff --git a/libbcache/chardev.c b/libbcache/chardev.c index b142d7b2..049aa910 100644 --- a/libbcache/chardev.c +++ b/libbcache/chardev.c @@ -107,7 +107,7 @@ static long bch_global_ioctl(unsigned cmd, void __user *arg) static long bch_ioctl_stop(struct cache_set *c) { - bch_fs_stop(c); + bch_fs_stop_async(c); return 0; } diff --git a/libbcache/checksum.c b/libbcache/checksum.c index dae52d49..92036db4 100644 --- a/libbcache/checksum.c +++ b/libbcache/checksum.c @@ -539,15 +539,12 @@ int bch_enable_encryption(struct cache_set *c, bool keyed) if (ret) goto err; - crypt = 
container_of_or_null(bch_fs_sb_field_resize(c, NULL, - sizeof(*crypt) / sizeof(u64)), - struct bch_sb_field_crypt, field); + crypt = bch_fs_sb_resize_crypt(c, sizeof(*crypt) / sizeof(u64)); if (!crypt) { ret = -ENOMEM; /* XXX this technically could be -ENOSPC */ goto err; } - crypt->field.type = BCH_SB_FIELD_crypt; crypt->key = key; /* write superblock */ @@ -560,7 +557,7 @@ err: return ret; } -void bch_fs_encryption_free(struct cache_set *c) +void bch_fs_encryption_exit(struct cache_set *c) { if (!IS_ERR_OR_NULL(c->poly1305)) crypto_free_shash(c->poly1305); diff --git a/libbcache/checksum.h b/libbcache/checksum.h index 137c9155..9d4da08d 100644 --- a/libbcache/checksum.h +++ b/libbcache/checksum.h @@ -43,7 +43,7 @@ void bch_encrypt_bio(struct cache_set *, unsigned, int bch_disable_encryption(struct cache_set *); int bch_enable_encryption(struct cache_set *, bool); -void bch_fs_encryption_free(struct cache_set *); +void bch_fs_encryption_exit(struct cache_set *); int bch_fs_encryption_init(struct cache_set *); static inline unsigned bch_data_checksum_type(struct cache_set *c) diff --git a/libbcache/compress.c b/libbcache/compress.c index f81a8143..89da31e5 100644 --- a/libbcache/compress.c +++ b/libbcache/compress.c @@ -434,10 +434,10 @@ int bch_check_set_has_compressed_data(struct cache_set *c, break; } - return bch_compress_init(c); + return bch_fs_compress_init(c); } -void bch_compress_free(struct cache_set *c) +void bch_fs_compress_exit(struct cache_set *c) { vfree(c->zlib_workspace); mempool_exit(&c->lz4_workspace_pool); @@ -450,15 +450,11 @@ void bch_compress_free(struct cache_set *c) max_t(size_t, zlib_inflate_workspacesize(), \ zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL)) -int bch_compress_init(struct cache_set *c) +int bch_fs_compress_init(struct cache_set *c) { unsigned order = get_order(BCH_ENCODED_EXTENT_MAX << 9); int ret, cpu; - if (!bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) && - !bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) - return 0; - if (!c->bio_decompress_worker) { c->bio_decompress_worker = alloc_percpu(*c->bio_decompress_worker); if (!c->bio_decompress_worker) @@ -474,6 +470,10 @@ int bch_compress_init(struct cache_set *c) } } + if (!bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) && + !bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) + return 0; + if (!mempool_initialized(&c->compression_bounce[READ])) { ret = mempool_init_page_pool(&c->compression_bounce[READ], 1, order); diff --git a/libbcache/compress.h b/libbcache/compress.h index 485acd95..4604b065 100644 --- a/libbcache/compress.h +++ b/libbcache/compress.h @@ -9,7 +9,7 @@ void bch_bio_compress(struct cache_set *, struct bio *, size_t *, struct bio *, size_t *, unsigned *); int bch_check_set_has_compressed_data(struct cache_set *, unsigned); -void bch_compress_free(struct cache_set *); -int bch_compress_init(struct cache_set *); +void bch_fs_compress_exit(struct cache_set *); +int bch_fs_compress_init(struct cache_set *); #endif /* _BCACHE_COMPRESS_H */ diff --git a/libbcache/debug.c b/libbcache/debug.c index d25c32ae..16cc72b9 100644 --- a/libbcache/debug.c +++ b/libbcache/debug.c @@ -409,13 +409,13 @@ static const struct file_operations bfloat_failed_debug_ops = { .read = bch_read_bfloat_failed, }; -void bch_debug_exit_cache_set(struct cache_set *c) +void bch_fs_debug_exit(struct cache_set *c) { if (!IS_ERR_OR_NULL(c->debug)) debugfs_remove_recursive(c->debug); } -void bch_debug_init_cache_set(struct cache_set *c) +void bch_fs_debug_init(struct cache_set *c) { struct btree_debug *bd; 
char name[100]; @@ -432,18 +432,18 @@ void bch_debug_init_cache_set(struct cache_set *c) bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); bd++) { bd->id = bd - c->btree_debug; - bd->btree = debugfs_create_file(bch_btree_id_names[bd->id], + bd->btree = debugfs_create_file(bch_btree_ids[bd->id], 0400, c->debug, bd, &btree_debug_ops); snprintf(name, sizeof(name), "%s-formats", - bch_btree_id_names[bd->id]); + bch_btree_ids[bd->id]); bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd, &btree_format_debug_ops); snprintf(name, sizeof(name), "%s-bfloat-failed", - bch_btree_id_names[bd->id]); + bch_btree_ids[bd->id]); bd->failed = debugfs_create_file(name, 0400, c->debug, bd, &bfloat_failed_debug_ops); diff --git a/libbcache/debug.h b/libbcache/debug.h index a3635e60..d34a95a0 100644 --- a/libbcache/debug.h +++ b/libbcache/debug.h @@ -52,11 +52,11 @@ static inline void bch_btree_verify(struct cache_set *c, struct btree *b) } #ifdef CONFIG_DEBUG_FS -void bch_debug_exit_cache_set(struct cache_set *); -void bch_debug_init_cache_set(struct cache_set *); +void bch_fs_debug_exit(struct cache_set *); +void bch_fs_debug_init(struct cache_set *); #else -static inline void bch_debug_exit_cache_set(struct cache_set *c) {} -static inline void bch_debug_init_cache_set(struct cache_set *c) {} +static inline void bch_fs_debug_exit(struct cache_set *c) {} +static inline void bch_fs_debug_init(struct cache_set *c) {} #endif void bch_debug_exit(void); diff --git a/libbcache/error.c b/libbcache/error.c index 9f39be1b..f4109da6 100644 --- a/libbcache/error.c +++ b/libbcache/error.c @@ -14,7 +14,7 @@ void bch_inconsistent_error(struct cache_set *c) case BCH_ON_ERROR_RO: if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { /* XXX do something better here? */ - bch_fs_stop(c); + bch_fs_stop_async(c); return; } @@ -120,7 +120,7 @@ void bch_nonfatal_io_error_work(struct work_struct *work) } else { bch_notify_dev_error(ca, true); - mutex_lock(&bch_register_lock); + mutex_lock(&c->state_lock); dev = bch_dev_may_remove(ca); if (dev ? bch_dev_read_only(ca) @@ -129,7 +129,7 @@ void bch_nonfatal_io_error_work(struct work_struct *work) "too many IO errors on %s, setting %s RO", bdevname(ca->disk_sb.bdev, buf), dev ? 
"device" : "filesystem"); - mutex_unlock(&bch_register_lock); + mutex_unlock(&c->state_lock); } } diff --git a/libbcache/extents.c b/libbcache/extents.c index 523f3f48..c5e0e375 100644 --- a/libbcache/extents.c +++ b/libbcache/extents.c @@ -547,7 +547,7 @@ static void btree_ptr_debugcheck(struct cache_set *c, struct btree *b, do { seq = read_seqcount_begin(&c->gc_pos_lock); bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && - !g->mark.is_metadata; + g->mark.data_type != BUCKET_BTREE; } while (read_seqcount_retry(&c->gc_pos_lock, seq)); err = "inconsistent"; @@ -602,6 +602,7 @@ bch_btree_pick_ptr(struct cache_set *c, const struct btree *b) struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); const union bch_extent_crc *crc; const struct bch_extent_ptr *ptr; + struct extent_pick_ptr pick = { .ca = NULL }; struct cache *ca; rcu_read_lock(); @@ -621,15 +622,19 @@ bch_btree_pick_ptr(struct cache_set *c, const struct btree *b) PTR_BUCKET_NR(ca, ptr))) continue; - percpu_ref_get(&ca->ref); - rcu_read_unlock(); + if (pick.ca && pick.ca->mi.tier < ca->mi.tier) + continue; - return (struct extent_pick_ptr) { .ptr = *ptr, .ca = ca }; + pick.ca = ca; + pick.ptr = *ptr; } + if (pick.ca) + percpu_ref_get(&pick.ca->ref); + rcu_read_unlock(); - return (struct extent_pick_ptr) { .ca = NULL, }; + return pick; } const struct bkey_ops bch_bkey_btree_ops = { @@ -1880,7 +1885,7 @@ static void bch_extent_debugcheck_extent(struct cache_set *c, struct btree *b, if (stale) break; - bad = (mark.is_metadata || + bad = (mark.data_type != BUCKET_DATA || (gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && !mark.owned_by_allocator && !(ptr->cached @@ -2193,17 +2198,21 @@ void bch_extent_pick_ptr_avoiding(struct cache_set *c, struct bkey_s_c k, rcu_read_lock(); ret->ca = NULL; - extent_for_each_online_device_crc(c, e, crc, ptr, ca) - if (!ptr_stale(ca, ptr)) { - *ret = (struct extent_pick_ptr) { - .crc = crc_to_128(e.k, crc), - .ptr = *ptr, - .ca = ca, - }; + extent_for_each_online_device_crc(c, e, crc, ptr, ca) { + if (ptr_stale(ca, ptr)) + continue; - if (ca != avoid) - break; - } + if (ret->ca && + (ca == avoid || + ret->ca->mi.tier < ca->mi.tier)) + continue; + + *ret = (struct extent_pick_ptr) { + .crc = crc_to_128(e.k, crc), + .ptr = *ptr, + .ca = ca, + }; + } if (ret->ca) percpu_ref_get(&ret->ca->ref); diff --git a/libbcache/fs-gc.c b/libbcache/fs-gc.c index e9585fd5..e2f1427f 100644 --- a/libbcache/fs-gc.c +++ b/libbcache/fs-gc.c @@ -545,9 +545,9 @@ struct nlink { u32 dir_count; }; -DECLARE_GENRADIX_TYPE(nlinks, struct nlink); +typedef GENRADIX(struct nlink) nlink_table; -static void inc_link(struct cache_set *c, struct nlinks *links, +static void inc_link(struct cache_set *c, nlink_table *links, u64 range_start, u64 *range_end, u64 inum, bool dir) { @@ -570,7 +570,7 @@ static void inc_link(struct cache_set *c, struct nlinks *links, } noinline_for_stack -static int bch_gc_walk_dirents(struct cache_set *c, struct nlinks *links, +static int bch_gc_walk_dirents(struct cache_set *c, nlink_table *links, u64 range_start, u64 *range_end) { struct btree_iter iter; @@ -776,7 +776,7 @@ fsck_err: noinline_for_stack static int bch_gc_walk_inodes(struct cache_set *c, struct bch_inode_unpacked *lostfound_inode, - struct nlinks *links, + nlink_table *links, u64 range_start, u64 range_end) { struct btree_iter iter; @@ -850,7 +850,7 @@ noinline_for_stack static int check_inode_nlinks(struct cache_set *c, struct bch_inode_unpacked *lostfound_inode) { - struct nlinks links; + nlink_table links; u64 this_iter_range_start, 
next_iter_range_start = 0; int ret = 0; diff --git a/libbcache/fs.c b/libbcache/fs.c index ab0d9728..ec70a3e3 100644 --- a/libbcache/fs.c +++ b/libbcache/fs.c @@ -1257,13 +1257,17 @@ static struct cache_set *bch_open_as_blockdevs(const char *_dev_name, if (!c) goto err_unlock; - if (!test_bit(BCH_FS_RUNNING, &c->flags)) { + mutex_lock(&c->state_lock); + + if (!bch_fs_running(c)) { + mutex_unlock(&c->state_lock); err = "incomplete cache set"; c = NULL; goto err_unlock; } closure_get(&c->cl); + mutex_unlock(&c->state_lock); mutex_unlock(&bch_register_lock); } @@ -1291,22 +1295,19 @@ static int bch_remount(struct super_block *sb, int *flags, char *data) if (ret) return ret; - mutex_lock(&bch_register_lock); - if (opts.read_only >= 0 && opts.read_only != c->opts.read_only) { const char *err = NULL; if (opts.read_only) { - bch_fs_read_only_sync(c); + bch_fs_read_only(c); sb->s_flags |= MS_RDONLY; } else { err = bch_fs_read_write(c); if (err) { bch_err(c, "error going rw: %s", err); - ret = -EINVAL; - goto unlock; + return -EINVAL; } sb->s_flags &= ~MS_RDONLY; @@ -1318,9 +1319,6 @@ static int bch_remount(struct super_block *sb, int *flags, char *data) if (opts.errors >= 0) c->opts.errors = opts.errors; -unlock: - mutex_unlock(&bch_register_lock); - return ret; } @@ -1449,7 +1447,7 @@ static void bch_kill_sb(struct super_block *sb) generic_shutdown_super(sb); if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags)) - bch_fs_stop_sync(c); + bch_fs_stop(c); else closure_put(&c->cl); } @@ -1464,7 +1462,7 @@ static struct file_system_type bcache_fs_type = { MODULE_ALIAS_FS("bcache"); -void bch_fs_exit(void) +void bch_vfs_exit(void) { unregister_filesystem(&bcache_fs_type); if (bch_dio_write_bioset) @@ -1477,7 +1475,7 @@ void bch_fs_exit(void) kmem_cache_destroy(bch_inode_cache); } -int __init bch_fs_init(void) +int __init bch_vfs_init(void) { int ret = -ENOMEM; @@ -1504,6 +1502,6 @@ int __init bch_fs_init(void) return 0; err: - bch_fs_exit(); + bch_vfs_exit(); return ret; } diff --git a/libbcache/fs.h b/libbcache/fs.h index 933fb6de..2a29b132 100644 --- a/libbcache/fs.h +++ b/libbcache/fs.h @@ -52,13 +52,13 @@ int __must_check __bch_write_inode(struct cache_set *, struct bch_inode_info *, int __must_check bch_write_inode(struct cache_set *, struct bch_inode_info *); -void bch_fs_exit(void); -int bch_fs_init(void); +void bch_vfs_exit(void); +int bch_vfs_init(void); #else -static inline void bch_fs_exit(void) {} -static inline int bch_fs_init(void) { return 0; } +static inline void bch_vfs_exit(void) {} +static inline int bch_vfs_init(void) { return 0; } #endif diff --git a/libbcache/io.c b/libbcache/io.c index be99a973..a3df3794 100644 --- a/libbcache/io.c +++ b/libbcache/io.c @@ -722,9 +722,7 @@ void bch_wake_delayed_writes(unsigned long data) spin_lock_irqsave(&c->foreground_write_pd_lock, flags); while ((op = c->write_wait_head)) { - if (!test_bit(BCH_FS_RO, &c->flags) && - !test_bit(BCH_FS_STOPPING, &c->flags) && - time_after(op->expires, jiffies)) { + if (time_after(op->expires, jiffies)) { mod_timer(&c->foreground_write_wakeup, op->expires); break; } @@ -1068,9 +1066,7 @@ static void __bch_read_endio(struct cache_set *c, struct bch_read_bio *rbio) return; } - if (rbio->promote && - !test_bit(BCH_FS_RO, &c->flags) && - !test_bit(BCH_FS_STOPPING, &c->flags)) { + if (rbio->promote) { struct cache_promote_op *promote = rbio->promote; struct closure *cl = &promote->cl; @@ -1133,13 +1129,26 @@ static void bch_read_endio(struct bio *bio) preempt_disable(); d = this_cpu_ptr(c->bio_decompress_worker); 
llist_add(&rbio->list, &d->bio_list); - queue_work(system_unbound_wq, &d->work); + queue_work(system_highpri_wq, &d->work); preempt_enable(); } else { __bch_read_endio(c, rbio); } } +static bool should_promote(struct cache_set *c, + struct extent_pick_ptr *pick, unsigned flags) +{ + if (!(flags & BCH_READ_PROMOTE)) + return false; + + if (percpu_ref_is_dying(&c->writes)) + return false; + + return c->fastest_tier && + c->fastest_tier < c->tiers + pick->ca->mi.tier; +} + void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig, struct bvec_iter iter, struct bkey_s_c k, struct extent_pick_ptr *pick, unsigned flags) @@ -1158,7 +1167,7 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig, * XXX: multiple promotes can race with each other, wastefully. Keep a * list of outstanding promotes? */ - if ((flags & BCH_READ_PROMOTE) && pick->ca->mi.tier) { + if (should_promote(c, pick, flags)) { /* * biovec needs to be big enough to hold decompressed data, if * the bch_write_extent() has to decompress/recompress it: diff --git a/libbcache/journal.c b/libbcache/journal.c index 99dd9f26..b2838376 100644 --- a/libbcache/journal.c +++ b/libbcache/journal.c @@ -545,8 +545,7 @@ static int journal_entry_validate(struct cache_set *c, return BCH_FSCK_UNKNOWN_VERSION; } - if (mustfix_fsck_err_on(bytes > bucket_sectors_left << 9 || - bytes > c->journal.entry_size_max, c, + if (mustfix_fsck_err_on(bytes > bucket_sectors_left << 9, c, "journal entry too big (%zu bytes), sector %lluu", bytes, sector)) { /* XXX: note we might have missing journal entries */ @@ -1406,13 +1405,7 @@ void bch_journal_start(struct cache_set *c) { struct journal *j = &c->journal; struct journal_seq_blacklist *bl; - struct cache *ca; u64 new_seq = 0; - unsigned i; - - for_each_cache(ca, c, i) - if (is_journal_device(ca)) - bch_dev_group_add(&c->journal.devs, ca); list_for_each_entry(bl, &j->seq_blacklist, list) new_seq = max(new_seq, bl->seq); @@ -1534,48 +1527,111 @@ err: return ret; } -static int bch_set_nr_journal_buckets(struct cache *ca, unsigned nr) +static int bch_set_nr_journal_buckets(struct cache_set *c, struct cache *ca, + unsigned nr, bool write_super) { + struct journal *j = &c->journal; struct journal_device *ja = &ca->journal; - struct bch_sb_field_journal *journal_buckets = - bch_sb_get_journal(ca->disk_sb.sb); - struct bch_sb_field *f; - u64 *p; + struct bch_sb_field_journal *journal_buckets; + struct disk_reservation disk_res = { 0, 0 }; + struct closure cl; + u64 *new_bucket_seq = NULL, *new_buckets = NULL; + int ret = 0; - p = krealloc(ja->bucket_seq, nr * sizeof(u64), - GFP_KERNEL|__GFP_ZERO); - if (!p) - return -ENOMEM; + closure_init_stack(&cl); - ja->bucket_seq = p; + mutex_lock(&c->sb_lock); - p = krealloc(ja->buckets, nr * sizeof(u64), - GFP_KERNEL|__GFP_ZERO); - if (!p) - return -ENOMEM; + /* don't handle reducing nr of buckets yet: */ + if (nr <= ja->nr) + goto err; - ja->buckets = p; + /* + * note: journal buckets aren't really counted as _sectors_ used yet, so + * we don't need the disk reservation to avoid the BUG_ON() in buckets.c + * when space used goes up without a reservation - but we do need the + * reservation to ensure we'll actually be able to allocate: + */ - f = bch_dev_sb_field_resize(&ca->disk_sb, &journal_buckets->field, nr + - sizeof(*journal_buckets) / sizeof(u64)); - if (!f) - return -ENOMEM; - f->type = BCH_SB_FIELD_journal; + ret = ENOSPC; + if (bch_disk_reservation_get(c, &disk_res, + (nr - ja->nr) << ca->bucket_bits, 0)) + goto err; - ja->nr = nr; - 
return 0; + ret = -ENOMEM; + new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); + new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); + if (!new_buckets || !new_bucket_seq) + goto err; + + journal_buckets = bch_sb_resize_journal(&ca->disk_sb, + nr + sizeof(*journal_buckets) / sizeof(u64)); + if (!journal_buckets) + goto err; + + spin_lock(&j->lock); + memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); + memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); + swap(new_buckets, ja->buckets); + swap(new_bucket_seq, ja->bucket_seq); + + while (ja->nr < nr) { + /* must happen under journal lock, to avoid racing with gc: */ + u64 b = bch_bucket_alloc(ca, RESERVE_NONE); + if (!b) { + if (!closure_wait(&c->freelist_wait, &cl)) { + spin_unlock(&j->lock); + closure_sync(&cl); + spin_lock(&j->lock); + } + continue; + } + + bch_mark_metadata_bucket(ca, &ca->buckets[b], + BUCKET_JOURNAL, false); + bch_mark_alloc_bucket(ca, &ca->buckets[b], false); + + memmove(ja->buckets + ja->last_idx + 1, + ja->buckets + ja->last_idx, + (ja->nr - ja->last_idx) * sizeof(u64)); + memmove(ja->bucket_seq + ja->last_idx + 1, + ja->bucket_seq + ja->last_idx, + (ja->nr - ja->last_idx) * sizeof(u64)); + memmove(journal_buckets->buckets + ja->last_idx + 1, + journal_buckets->buckets + ja->last_idx, + (ja->nr - ja->last_idx) * sizeof(u64)); + + ja->buckets[ja->last_idx] = b; + journal_buckets->buckets[ja->last_idx] = cpu_to_le64(b); + + if (ja->last_idx < ja->nr) { + if (ja->cur_idx >= ja->last_idx) + ja->cur_idx++; + ja->last_idx++; + } + ja->nr++; + + } + spin_unlock(&j->lock); + + BUG_ON(bch_validate_journal_layout(ca->disk_sb.sb, ca->mi)); + + if (write_super) + bch_write_super(c); + + ret = 0; +err: + mutex_unlock(&c->sb_lock); + + kfree(new_bucket_seq); + kfree(new_buckets); + bch_disk_reservation_put(c, &disk_res); + + return ret; } int bch_dev_journal_alloc(struct cache *ca) { - struct journal_device *ja = &ca->journal; - struct bch_sb_field_journal *journal_buckets; - int ret; - unsigned i; - - if (ca->mi.tier != 0) - return 0; - if (dynamic_fault("bcache:add:journal_alloc")) return -ENOMEM; @@ -1583,26 +1639,12 @@ int bch_dev_journal_alloc(struct cache *ca) * clamp journal size to 1024 buckets or 512MB (in sectors), whichever * is smaller: */ - ret = bch_set_nr_journal_buckets(ca, + return bch_set_nr_journal_buckets(ca->set, ca, clamp_t(unsigned, ca->mi.nbuckets >> 8, BCH_JOURNAL_BUCKETS_MIN, min(1 << 10, - (1 << 20) / ca->mi.bucket_size))); - if (ret) - return ret; - - journal_buckets = bch_sb_get_journal(ca->disk_sb.sb); - - for (i = 0; i < ja->nr; i++) { - u64 bucket = ca->mi.first_bucket + i; - - ja->buckets[i] = bucket; - journal_buckets->buckets[i] = cpu_to_le64(bucket); - - bch_mark_metadata_bucket(ca, &ca->buckets[bucket], true); - } - - return 0; + (1 << 20) / ca->mi.bucket_size)), + false); } /* Journalling */ @@ -1726,14 +1768,12 @@ void bch_journal_pin_add_if_older(struct journal *j, fifo_entry_idx(&j->pin, pin->pin_list))) { if (journal_pin_active(pin)) __journal_pin_drop(j, pin); - __journal_pin_add(j, src_pin->pin_list, - pin, NULL); + __journal_pin_add(j, src_pin->pin_list, pin, flush_fn); } spin_unlock_irq(&j->pin_lock); } - static struct journal_entry_pin * journal_get_next_pin(struct journal *j, u64 seq_to_flush) { @@ -1766,6 +1806,29 @@ journal_get_next_pin(struct journal *j, u64 seq_to_flush) return ret; } +static bool journal_has_pins(struct journal *j) +{ + bool ret; + + spin_lock(&j->lock); + journal_reclaim_fast(j); + ret = fifo_used(&j->pin) > 1 || + 
atomic_read(&fifo_peek_front(&j->pin).count) > 1; + spin_unlock(&j->lock); + + return ret; +} + +void bch_journal_flush_pins(struct journal *j) +{ + struct journal_entry_pin *pin; + + while ((pin = journal_get_next_pin(j, U64_MAX))) + pin->flush(j, pin); + + wait_event(j->wait, !journal_has_pins(j) || bch_journal_error(j)); +} + static bool should_discard_bucket(struct journal *j, struct journal_device *ja) { bool ret; @@ -1895,8 +1958,10 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) struct cache_set *c = container_of(j, struct cache_set, journal); struct bkey_s_extent e = bkey_i_to_s_extent(&j->key); struct bch_extent_ptr *ptr; + struct journal_device *ja; struct cache *ca; - unsigned iter, replicas, replicas_want = + bool swapped; + unsigned i, replicas, replicas_want = READ_ONCE(c->opts.metadata_replicas); spin_lock(&j->lock); @@ -1921,12 +1986,27 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) replicas = bch_extent_nr_ptrs(e.c); + spin_lock(&j->devs.lock); + + /* Sort by tier: */ + do { + swapped = false; + + for (i = 0; i + 1 < j->devs.nr; i++) + if (j->devs.d[i + 0].dev->mi.tier > + j->devs.d[i + 1].dev->mi.tier) { + swap(j->devs.d[i], j->devs.d[i + 1]); + swapped = true; + } + } while (swapped); + /* - * Determine location of the next journal write: - * XXX: sort caches by free journal space + * Pick devices for next journal write: + * XXX: sort devices by free journal space? */ - group_for_each_cache_rcu(ca, &j->devs, iter) { - struct journal_device *ja = &ca->journal; + for (i = 0; i < j->devs.nr; i++) { + ca = j->devs.d[i].dev; + ja = &ca->journal; if (replicas >= replicas_want) break; @@ -1954,7 +2034,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) trace_bcache_journal_next_bucket(ca, ja->cur_idx, ja->last_idx); } - + spin_unlock(&j->devs.lock); rcu_read_unlock(); j->prev_buf_sectors = 0; @@ -2468,50 +2548,6 @@ int bch_journal_flush(struct journal *j) return bch_journal_flush_seq(j, seq); } -void bch_journal_free(struct journal *j) -{ - unsigned order = get_order(j->entry_size_max); - - free_pages((unsigned long) j->buf[1].data, order); - free_pages((unsigned long) j->buf[0].data, order); - free_fifo(&j->pin); -} - -int bch_journal_alloc(struct journal *j, unsigned entry_size_max) -{ - static struct lock_class_key res_key; - unsigned order = get_order(entry_size_max); - - spin_lock_init(&j->lock); - spin_lock_init(&j->pin_lock); - init_waitqueue_head(&j->wait); - INIT_DELAYED_WORK(&j->write_work, journal_write_work); - INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work); - mutex_init(&j->blacklist_lock); - INIT_LIST_HEAD(&j->seq_blacklist); - spin_lock_init(&j->devs.lock); - mutex_init(&j->reclaim_lock); - - lockdep_init_map(&j->res_map, "journal res", &res_key, 0); - - j->entry_size_max = entry_size_max; - j->write_delay_ms = 100; - j->reclaim_delay_ms = 100; - - bkey_extent_init(&j->key); - - atomic64_set(&j->reservations.counter, - ((union journal_res_state) - { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); - - if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || - !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) || - !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order))) - return -ENOMEM; - - return 0; -} - ssize_t bch_journal_print_debug(struct journal *j, char *buf) { union journal_res_state *s = &j->reservations; @@ -2643,13 +2679,31 @@ int bch_journal_move(struct cache *ca) return ret; } -void bch_journal_free_cache(struct cache *ca) +void 
bch_fs_journal_stop(struct journal *j) +{ + if (!test_bit(JOURNAL_STARTED, &j->flags)) + return; + + /* + * Empty out the journal by first flushing everything pinning existing + * journal entries, then force a brand new empty journal entry to be + * written: + */ + bch_journal_flush_pins(j); + bch_journal_flush_async(j, NULL); + bch_journal_meta(j); + + cancel_delayed_work_sync(&j->write_work); + cancel_delayed_work_sync(&j->reclaim_work); +} + +void bch_dev_journal_exit(struct cache *ca) { kfree(ca->journal.buckets); kfree(ca->journal.bucket_seq); } -int bch_journal_init_cache(struct cache *ca) +int bch_dev_journal_init(struct cache *ca) { struct journal_device *ja = &ca->journal; struct bch_sb_field_journal *journal_buckets = @@ -2679,3 +2733,47 @@ int bch_journal_init_cache(struct cache *ca) return 0; } + +void bch_fs_journal_exit(struct journal *j) +{ + unsigned order = get_order(j->entry_size_max); + + free_pages((unsigned long) j->buf[1].data, order); + free_pages((unsigned long) j->buf[0].data, order); + free_fifo(&j->pin); +} + +int bch_fs_journal_init(struct journal *j, unsigned entry_size_max) +{ + static struct lock_class_key res_key; + unsigned order = get_order(entry_size_max); + + spin_lock_init(&j->lock); + spin_lock_init(&j->pin_lock); + init_waitqueue_head(&j->wait); + INIT_DELAYED_WORK(&j->write_work, journal_write_work); + INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work); + mutex_init(&j->blacklist_lock); + INIT_LIST_HEAD(&j->seq_blacklist); + spin_lock_init(&j->devs.lock); + mutex_init(&j->reclaim_lock); + + lockdep_init_map(&j->res_map, "journal res", &res_key, 0); + + j->entry_size_max = entry_size_max; + j->write_delay_ms = 100; + j->reclaim_delay_ms = 100; + + bkey_extent_init(&j->key); + + atomic64_set(&j->reservations.counter, + ((union journal_res_state) + { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); + + if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || + !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) || + !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order))) + return -ENOMEM; + + return 0; +} diff --git a/libbcache/journal.h b/libbcache/journal.h index 02a6e676..d3a1db0c 100644 --- a/libbcache/journal.h +++ b/libbcache/journal.h @@ -111,7 +111,6 @@ #include <linux/hash.h> #include "journal_types.h" -//#include "super-io.h" /* * Only used for holding the journal entries we read in btree_journal_read() @@ -136,6 +135,7 @@ void bch_journal_pin_add_if_older(struct journal *, struct journal_entry_pin *, struct journal_entry_pin *, journal_pin_flush_fn); +void bch_journal_flush_pins(struct journal *); struct closure; struct cache_set; @@ -330,11 +330,6 @@ static inline int bch_journal_error(struct journal *j) ? 
-EIO : 0; } -static inline bool is_journal_device(struct cache *ca) -{ - return ca->mi.state == BCH_MEMBER_STATE_ACTIVE && ca->mi.tier == 0; -} - static inline bool journal_flushes_device(struct cache *ca) { return true; @@ -356,9 +351,6 @@ static inline void bch_journal_set_replay_done(struct journal *j) spin_unlock(&j->lock); } -void bch_journal_free(struct journal *); -int bch_journal_alloc(struct journal *, unsigned); - ssize_t bch_journal_print_debug(struct journal *, char *); int bch_dev_journal_alloc(struct cache *); @@ -372,7 +364,10 @@ static inline unsigned bch_nr_journal_buckets(struct bch_sb_field_journal *j) int bch_journal_move(struct cache *); -void bch_journal_free_cache(struct cache *); -int bch_journal_init_cache(struct cache *); +void bch_fs_journal_stop(struct journal *); +void bch_dev_journal_exit(struct cache *); +int bch_dev_journal_init(struct cache *); +void bch_fs_journal_exit(struct journal *); +int bch_fs_journal_init(struct journal *, unsigned); #endif /* _BCACHE_JOURNAL_H */ diff --git a/libbcache/movinggc.c b/libbcache/movinggc.c index e40dfbca..27f5c63c 100644 --- a/libbcache/movinggc.c +++ b/libbcache/movinggc.c @@ -191,7 +191,7 @@ static void bch_moving_gc(struct cache *ca) } if (g->mark.owned_by_allocator || - g->mark.is_metadata) + g->mark.data_type != BUCKET_DATA) continue; sectors_used = bucket_sectors_used(g); @@ -258,18 +258,21 @@ static int bch_moving_gc_thread(void *arg) return 0; } -void bch_moving_init_cache(struct cache *ca) +void bch_moving_gc_stop(struct cache *ca) { - bch_pd_controller_init(&ca->moving_gc_pd); - ca->moving_gc_pd.d_term = 0; + ca->moving_gc_pd.rate.rate = UINT_MAX; + bch_ratelimit_reset(&ca->moving_gc_pd.rate); + + if (ca->moving_gc_read) + kthread_stop(ca->moving_gc_read); + ca->moving_gc_read = NULL; } -int bch_moving_gc_thread_start(struct cache *ca) +int bch_moving_gc_start(struct cache *ca) { struct task_struct *t; - /* The moving gc read thread must be stopped */ - BUG_ON(ca->moving_gc_read != NULL); + BUG_ON(ca->moving_gc_read); if (ca->set->opts.nochanges) return 0; @@ -287,12 +290,8 @@ int bch_moving_gc_thread_start(struct cache *ca) return 0; } -void bch_moving_gc_stop(struct cache *ca) +void bch_dev_moving_gc_init(struct cache *ca) { - ca->moving_gc_pd.rate.rate = UINT_MAX; - bch_ratelimit_reset(&ca->moving_gc_pd.rate); - - if (ca->moving_gc_read) - kthread_stop(ca->moving_gc_read); - ca->moving_gc_read = NULL; + bch_pd_controller_init(&ca->moving_gc_pd); + ca->moving_gc_pd.d_term = 0; } diff --git a/libbcache/movinggc.h b/libbcache/movinggc.h index 5f153085..e8ae95e5 100644 --- a/libbcache/movinggc.h +++ b/libbcache/movinggc.h @@ -23,8 +23,8 @@ #define COPYGC_SECTORS_PER_ITER(ca) \ ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca)) -void bch_moving_init_cache(struct cache *); void bch_moving_gc_stop(struct cache *); -int bch_moving_gc_thread_start(struct cache *); +int bch_moving_gc_start(struct cache *); +void bch_dev_moving_gc_init(struct cache *); #endif diff --git a/libbcache/opts.h b/libbcache/opts.h index 95184db1..9b10310d 100644 --- a/libbcache/opts.h +++ b/libbcache/opts.h @@ -86,11 +86,17 @@ enum opt_type { BCH_OPT(noreplay, 0444, NO_SB_OPT, \ s8, OPT_BOOL()) \ BCH_OPT(norecovery, 0444, NO_SB_OPT, \ - s8, OPT_BOOL()) + s8, OPT_BOOL()) \ + BCH_OPT(noexcl, 0444, NO_SB_OPT, \ + s8, OPT_BOOL()) \ + BCH_OPT(sb, 0444, NO_SB_OPT, \ + s64, OPT_UINT(0, S64_MAX)) \ #define BCH_OPTS() \ BCH_OPT(read_only, 0444, NO_SB_OPT, \ s8, OPT_BOOL()) \ + BCH_OPT(nostart, 0444, NO_SB_OPT, \ + s8, OPT_BOOL()) \ 
BCH_VISIBLE_OPTS() struct bch_opts { @@ -145,6 +151,8 @@ static inline void bch_opts_apply(struct bch_opts *dst, struct bch_opts src) #undef BCH_OPT } +#define opt_defined(_opt) ((_opt) >= 0) + void bch_opt_set(struct bch_opts *, enum bch_opt_id, u64); struct bch_opts bch_sb_opts(struct bch_sb *); diff --git a/libbcache/super-io.c b/libbcache/super-io.c index be27d3ee..f50a5ee8 100644 --- a/libbcache/super-io.c +++ b/libbcache/super-io.c @@ -10,6 +10,7 @@ #include "vstructs.h" #include <linux/backing-dev.h> +#include <linux/sort.h> static inline void __bch_sb_layout_size_assert(void) { @@ -17,7 +18,7 @@ static inline void __bch_sb_layout_size_assert(void) } struct bch_sb_field *bch_sb_field_get(struct bch_sb *sb, - enum bch_sb_field_types type) + enum bch_sb_field_type type) { struct bch_sb_field *f; @@ -34,7 +35,7 @@ void bch_free_super(struct bcache_superblock *sb) if (sb->bio) bio_put(sb->bio); if (!IS_ERR_OR_NULL(sb->bdev)) - blkdev_put(sb->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + blkdev_put(sb->bdev, sb->mode); free_pages((unsigned long) sb->sb, sb->page_order); memset(sb, 0, sizeof(*sb)); @@ -74,7 +75,7 @@ static int __bch_super_realloc(struct bcache_superblock *sb, unsigned order) return 0; } -int bch_dev_sb_realloc(struct bcache_superblock *sb, unsigned u64s) +static int bch_sb_realloc(struct bcache_superblock *sb, unsigned u64s) { u64 new_bytes = __vstruct_bytes(struct bch_sb, u64s); u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; @@ -140,13 +141,29 @@ static struct bch_sb_field *__bch_sb_field_resize(struct bch_sb *sb, le32_add_cpu(&sb->u64s, u64s - old_u64s); return f; +} +struct bch_sb_field *bch_sb_field_resize(struct bcache_superblock *sb, + enum bch_sb_field_type type, + unsigned u64s) +{ + struct bch_sb_field *f = bch_sb_field_get(sb->sb, type); + ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0; + ssize_t d = -old_u64s + u64s; + + if (bch_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) + return NULL; + + f = __bch_sb_field_resize(sb->sb, f, u64s); + f->type = type; + return f; } struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *c, - struct bch_sb_field *f, + enum bch_sb_field_type type, unsigned u64s) { + struct bch_sb_field *f = bch_sb_field_get(c->disk_sb, type); ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0; ssize_t d = -old_u64s + u64s; struct cache *ca; @@ -160,26 +177,15 @@ struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *c, for_each_cache(ca, c, i) { struct bcache_superblock *sb = &ca->disk_sb; - if (bch_dev_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { + if (bch_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { percpu_ref_put(&ca->ref); return NULL; } } - return __bch_sb_field_resize(c->disk_sb, f, u64s); -} - -struct bch_sb_field *bch_dev_sb_field_resize(struct bcache_superblock *sb, - struct bch_sb_field *f, - unsigned u64s) -{ - ssize_t old_u64s = f ? 
le32_to_cpu(f->u64s) : 0; - ssize_t d = -old_u64s + u64s; - - if (bch_dev_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) - return NULL; - - return __bch_sb_field_resize(sb->sb, f, u64s); + f = __bch_sb_field_resize(c->disk_sb, f, u64s); + f->type = type; + return f; } static const char *validate_sb_layout(struct bch_sb_layout *layout) @@ -203,9 +209,6 @@ static const char *validate_sb_layout(struct bch_sb_layout *layout) prev_offset = le64_to_cpu(layout->sb_offset[0]); - if (prev_offset != BCH_SB_SECTOR) - return "Invalid superblock layout: doesn't have default superblock location"; - for (i = 1; i < layout->nr_superblocks; i++) { offset = le64_to_cpu(layout->sb_offset[i]); @@ -217,16 +220,70 @@ static const char *validate_sb_layout(struct bch_sb_layout *layout) return NULL; } +static int u64_cmp(const void *_l, const void *_r) +{ + u64 l = *((const u64 *) _l), r = *((const u64 *) _r); + + return l < r ? -1 : l > r ? 1 : 0; +} + +const char *bch_validate_journal_layout(struct bch_sb *sb, + struct cache_member_cpu mi) +{ + struct bch_sb_field_journal *journal; + const char *err; + unsigned nr; + unsigned i; + u64 *b; + + journal = bch_sb_get_journal(sb); + if (!journal) + return NULL; + + nr = bch_nr_journal_buckets(journal); + if (!nr) + return NULL; + + b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); + if (!b) + return "cannot allocate memory"; + + for (i = 0; i < nr; i++) + b[i] = le64_to_cpu(journal->buckets[i]); + + sort(b, nr, sizeof(u64), u64_cmp, NULL); + + err = "journal bucket at sector 0"; + if (!b[0]) + goto err; + + err = "journal bucket before first bucket"; + if (b[0] < mi.first_bucket) + goto err; + + err = "journal bucket past end of device"; + if (b[nr - 1] >= mi.nbuckets) + goto err; + + err = "duplicate journal buckets"; + for (i = 0; i + 1 < nr; i++) + if (b[i] == b[i + 1]) + goto err; + + err = NULL; +err: + kfree(b); + return err; +} + const char *bch_validate_cache_super(struct bcache_superblock *disk_sb) { struct bch_sb *sb = disk_sb->sb; struct bch_sb_field *f; struct bch_sb_field_members *sb_mi; - struct bch_sb_field_journal *journal; struct cache_member_cpu mi; const char *err; u16 block_size; - unsigned i; switch (le64_to_cpu(sb->version)) { case BCACHE_SB_VERSION_CDEV_V4: @@ -324,14 +381,6 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb) mi = cache_mi_to_cpu_mi(sb_mi->members + sb->dev_idx); - for (i = 0; i < sb->layout.nr_superblocks; i++) { - u64 offset = le64_to_cpu(sb->layout.sb_offset[i]); - u64 max_size = 1 << sb->layout.sb_max_size_bits; - - if (offset + max_size > mi.first_bucket * mi.bucket_size) - return "Invalid superblock: first bucket comes before end of super"; - } - if (mi.nbuckets > LONG_MAX) return "Too many buckets"; @@ -347,16 +396,9 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb) mi.bucket_size * mi.nbuckets) return "Invalid superblock: device too small"; - /* Validate journal buckets: */ - journal = bch_sb_get_journal(sb); - if (journal) { - for (i = 0; i < bch_nr_journal_buckets(journal); i++) { - u64 b = le64_to_cpu(journal->buckets[i]); - - if (b < mi.first_bucket || b >= mi.nbuckets) - return "bad journal bucket"; - } - } + err = bch_validate_journal_layout(sb, mi); + if (err) + return err; return NULL; } @@ -382,19 +424,19 @@ static bool bch_is_open_cache(struct block_device *bdev) static bool bch_is_open(struct block_device *bdev) { - lockdep_assert_held(&bch_register_lock); + bool ret; - return bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev); + 
mutex_lock(&bch_register_lock); + ret = bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev); + mutex_unlock(&bch_register_lock); + + return ret; } -static const char *bch_blkdev_open(const char *path, void *holder, - struct bch_opts opts, - struct block_device **ret) +static const char *bch_blkdev_open(const char *path, fmode_t mode, + void *holder, struct block_device **ret) { struct block_device *bdev; - fmode_t mode = opts.nochanges > 0 - ? FMODE_READ - : FMODE_READ|FMODE_WRITE|FMODE_EXCL; const char *err; *ret = NULL; @@ -548,7 +590,7 @@ int bch_sb_from_cache_set(struct cache_set *c, struct cache *ca) unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s; int ret; - ret = bch_dev_sb_realloc(&ca->disk_sb, u64s); + ret = bch_sb_realloc(&ca->disk_sb, u64s); if (ret) return ret; @@ -567,7 +609,7 @@ static const char *read_one_super(struct bcache_superblock *sb, u64 offset) reread: bio_reset(sb->bio); sb->bio->bi_bdev = sb->bdev; - sb->bio->bi_iter.bi_sector = BCH_SB_SECTOR; + sb->bio->bi_iter.bi_sector = offset; sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order; bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); bch_bio_map(sb->bio, sb->sb); @@ -610,15 +652,21 @@ const char *bch_read_super(struct bcache_superblock *sb, struct bch_opts opts, const char *path) { + u64 offset = opt_defined(opts.sb) ? opts.sb : BCH_SB_SECTOR; struct bch_sb_layout layout; const char *err; unsigned i; - lockdep_assert_held(&bch_register_lock); - memset(sb, 0, sizeof(*sb)); + sb->mode = FMODE_READ; - err = bch_blkdev_open(path, &sb, opts, &sb->bdev); + if (!(opt_defined(opts.noexcl) && opts.noexcl)) + sb->mode |= FMODE_EXCL; + + if (!(opt_defined(opts.nochanges) && opts.nochanges)) + sb->mode |= FMODE_WRITE; + + err = bch_blkdev_open(path, sb->mode, sb, &sb->bdev); if (err) return err; @@ -630,11 +678,16 @@ const char *bch_read_super(struct bcache_superblock *sb, if (bch_fs_init_fault("read_super")) goto err; - err = read_one_super(sb, BCH_SB_SECTOR); + err = read_one_super(sb, offset); if (!err) goto got_super; - pr_err("error reading default super: %s", err); + if (offset != BCH_SB_SECTOR) { + pr_err("error reading superblock: %s", err); + goto err; + } + + pr_err("error reading default superblock: %s", err); /* * Error reading primary superblock - read location of backup @@ -747,6 +800,9 @@ void bch_write_super(struct cache_set *c) lockdep_assert_held(&c->sb_lock); + if (c->opts.nochanges) + return; + closure_init_stack(cl); le64_add_cpu(&c->disk_sb->seq, 1); diff --git a/libbcache/super-io.h b/libbcache/super-io.h index 665de811..ae1e8b9d 100644 --- a/libbcache/super-io.h +++ b/libbcache/super-io.h @@ -6,16 +6,35 @@ #include <asm/byteorder.h> -struct bch_sb_field *bch_sb_field_get(struct bch_sb *, enum bch_sb_field_types); +struct bch_sb_field *bch_sb_field_get(struct bch_sb *, enum bch_sb_field_type); +struct bch_sb_field *bch_sb_field_resize(struct bcache_superblock *, + enum bch_sb_field_type, unsigned); +struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *, + enum bch_sb_field_type, unsigned); -#define BCH_SB_FIELD_TYPE(_name) \ -static inline struct bch_sb_field_##_name * \ -bch_sb_get_##_name(struct bch_sb *sb) \ -{ \ - struct bch_sb_field *f = \ - bch_sb_field_get(sb, BCH_SB_FIELD_##_name); \ - \ - return container_of_or_null(f, struct bch_sb_field_##_name, field);\ +#define field_to_type(_f, _name) \ + container_of_or_null(_f, struct bch_sb_field_##_name, field) + +#define BCH_SB_FIELD_TYPE(_name) \ +static inline struct bch_sb_field_##_name * \ +bch_sb_get_##_name(struct 
bch_sb *sb) \ +{ \ + return field_to_type(bch_sb_field_get(sb, \ + BCH_SB_FIELD_##_name), _name); \ +} \ + \ +static inline struct bch_sb_field_##_name * \ +bch_sb_resize_##_name(struct bcache_superblock *sb, unsigned u64s) \ +{ \ + return field_to_type(bch_sb_field_resize(sb, \ + BCH_SB_FIELD_##_name, u64s), _name); \ +} \ + \ +static inline struct bch_sb_field_##_name * \ +bch_fs_sb_resize_##_name(struct cache_set *c, unsigned u64s) \ +{ \ + return field_to_type(bch_fs_sb_field_resize(c, \ + BCH_SB_FIELD_##_name, u64s), _name); \ } BCH_SB_FIELD_TYPE(journal); @@ -85,14 +104,11 @@ int bch_fs_mi_update(struct cache_set *, struct bch_member *, unsigned); int bch_sb_to_cache_set(struct cache_set *, struct bch_sb *); int bch_sb_from_cache_set(struct cache_set *, struct cache *); -struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *, - struct bch_sb_field *, unsigned); -struct bch_sb_field *bch_dev_sb_field_resize(struct bcache_superblock *, - struct bch_sb_field *, unsigned); - void bch_free_super(struct bcache_superblock *); int bch_super_realloc(struct bcache_superblock *, unsigned); +const char *bch_validate_journal_layout(struct bch_sb *, + struct cache_member_cpu); const char *bch_validate_cache_super(struct bcache_superblock *); const char *bch_read_super(struct bcache_superblock *, diff --git a/libbcache/super.c b/libbcache/super.c index fab34805..5535639c 100644 --- a/libbcache/super.c +++ b/libbcache/super.c @@ -69,7 +69,7 @@ static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait); struct workqueue_struct *bcache_io_wq; struct crypto_shash *bch_sha256; -static void bch_dev_stop(struct cache *); +static void bch_dev_free(struct cache *); static int bch_dev_online(struct cache *); static int bch_congested_fn(void *data, int bdi_bits) @@ -92,8 +92,11 @@ static int bch_congested_fn(void *data, int bdi_bits) } } } else { - /* Writes only go to tier 0: */ - group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) { + /* Writes prefer fastest tier: */ + struct bch_tier *tier = READ_ONCE(c->fastest_tier); + struct cache_group *grp = tier ? 
&tier->devs : &c->cache_all; + + group_for_each_cache_rcu(ca, grp, i) { bdi = blk_get_backing_dev_info(ca->disk_sb.bdev); if (bdi_congested(bdi, bdi_bits)) { @@ -107,7 +110,7 @@ static int bch_congested_fn(void *data, int bdi_bits) return ret; } -/* Cache set RO/RW: */ +/* Filesystem RO/RW: */ /* * For startup/shutdown of RW stuff, the dependencies are: @@ -129,9 +132,7 @@ static void __bch_fs_read_only(struct cache_set *c) struct cache *ca; unsigned i; - c->tiering_pd.rate.rate = UINT_MAX; - bch_ratelimit_reset(&c->tiering_pd.rate); - bch_tiering_read_stop(c); + bch_tiering_stop(c); for_each_cache(ca, c, i) bch_moving_gc_stop(ca); @@ -143,20 +144,7 @@ static void __bch_fs_read_only(struct cache_set *c) for_each_cache(ca, c, i) bch_dev_allocator_stop(ca); - /* - * Write a journal entry after flushing the btree, so we don't end up - * replaying everything we just flushed: - */ - if (test_bit(JOURNAL_STARTED, &c->journal.flags)) { - int ret; - - bch_journal_flush_async(&c->journal, NULL); - ret = bch_journal_meta(&c->journal); - BUG_ON(ret && !bch_journal_error(&c->journal)); - } - - cancel_delayed_work_sync(&c->journal.write_work); - cancel_delayed_work_sync(&c->journal.reclaim_work); + bch_fs_journal_stop(&c->journal); } static void bch_writes_disabled(struct percpu_ref *writes) @@ -167,67 +155,18 @@ static void bch_writes_disabled(struct percpu_ref *writes) wake_up(&bch_read_only_wait); } -static void bch_fs_read_only_work(struct work_struct *work) +void bch_fs_read_only(struct cache_set *c) { - struct cache_set *c = - container_of(work, struct cache_set, read_only_work); + mutex_lock(&c->state_lock); + if (c->state != BCH_FS_STARTING && + c->state != BCH_FS_RW) + goto out; - percpu_ref_put(&c->writes); - - del_timer(&c->foreground_write_wakeup); - cancel_delayed_work(&c->pd_controllers_update); - - c->foreground_write_pd.rate.rate = UINT_MAX; - bch_wake_delayed_writes((unsigned long) c); - - if (!test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) { - /* - * If we're not doing an emergency shutdown, we want to wait on - * outstanding writes to complete so they don't see spurious - * errors due to shutting down the allocator: - */ - wait_event(bch_read_only_wait, - test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); - - __bch_fs_read_only(c); - - if (!bch_journal_error(&c->journal) && - !test_bit(BCH_FS_ERROR, &c->flags)) { - mutex_lock(&c->sb_lock); - SET_BCH_SB_CLEAN(c->disk_sb, true); - bch_write_super(c); - mutex_unlock(&c->sb_lock); - } - } else { - /* - * If we are doing an emergency shutdown outstanding writes may - * hang until we shutdown the allocator so we don't want to wait - * on outstanding writes before shutting everything down - but - * we do need to wait on them before returning and signalling - * that going RO is complete: - */ - __bch_fs_read_only(c); - - wait_event(bch_read_only_wait, - test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); - } - - bch_notify_fs_read_only(c); - trace_fs_read_only_done(c); - - set_bit(BCH_FS_RO_COMPLETE, &c->flags); - wake_up(&bch_read_only_wait); -} - -bool bch_fs_read_only(struct cache_set *c) -{ - if (test_and_set_bit(BCH_FS_RO, &c->flags)) - return false; + if (test_bit(BCH_FS_ERROR, &c->flags)) + goto out; trace_fs_read_only(c); - percpu_ref_get(&c->writes); - /* * Block new foreground-end write operations from starting - any new * writes will return -EROFS: @@ -238,40 +177,83 @@ bool bch_fs_read_only(struct cache_set *c) */ percpu_ref_kill(&c->writes); - queue_work(system_freezable_wq, &c->read_only_work); - return true; + 
del_timer(&c->foreground_write_wakeup); + cancel_delayed_work(&c->pd_controllers_update); + + c->foreground_write_pd.rate.rate = UINT_MAX; + bch_wake_delayed_writes((unsigned long) c); + + /* + * If we're not doing an emergency shutdown, we want to wait on + * outstanding writes to complete so they don't see spurious errors due + * to shutting down the allocator: + * + * If we are doing an emergency shutdown outstanding writes may + * hang until we shutdown the allocator so we don't want to wait + * on outstanding writes before shutting everything down - but + * we do need to wait on them before returning and signalling + * that going RO is complete: + */ + wait_event(bch_read_only_wait, + test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || + test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); + + __bch_fs_read_only(c); + + wait_event(bch_read_only_wait, + test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); + + clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + + if (!bch_journal_error(&c->journal) && + !test_bit(BCH_FS_ERROR, &c->flags)) { + mutex_lock(&c->sb_lock); + SET_BCH_SB_CLEAN(c->disk_sb, true); + bch_write_super(c); + mutex_unlock(&c->sb_lock); + } + + c->state = BCH_FS_RO; + bch_notify_fs_read_only(c); + trace_fs_read_only_done(c); +out: + mutex_unlock(&c->state_lock); +} + +static void bch_fs_read_only_work(struct work_struct *work) +{ + struct cache_set *c = + container_of(work, struct cache_set, read_only_work); + + bch_fs_read_only(c); +} + +static void bch_fs_read_only_async(struct cache_set *c) +{ + queue_work(system_long_wq, &c->read_only_work); } bool bch_fs_emergency_read_only(struct cache_set *c) { bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); - bch_fs_read_only(c); + bch_fs_read_only_async(c); bch_journal_halt(&c->journal); wake_up(&bch_read_only_wait); return ret; } -void bch_fs_read_only_sync(struct cache_set *c) -{ - /* so we don't race with bch_fs_read_write() */ - lockdep_assert_held(&bch_register_lock); - - bch_fs_read_only(c); - - wait_event(bch_read_only_wait, - test_bit(BCH_FS_RO_COMPLETE, &c->flags) && - test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); -} - -static const char *__bch_fs_read_write(struct cache_set *c) +const char *bch_fs_read_write(struct cache_set *c) { struct cache *ca; - const char *err; + const char *err = NULL; unsigned i; - lockdep_assert_held(&bch_register_lock); + mutex_lock(&c->state_lock); + if (c->state != BCH_FS_STARTING && + c->state != BCH_FS_RO) + goto out; err = "error starting allocator thread"; for_each_cache(ca, c, i) @@ -285,67 +267,43 @@ static const char *__bch_fs_read_write(struct cache_set *c) if (bch_gc_thread_start(c)) goto err; - for_each_cache(ca, c, i) { - if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE) - continue; - - err = "error starting moving GC thread"; - if (bch_moving_gc_thread_start(ca)) { + err = "error starting moving GC thread"; + for_each_cache(ca, c, i) + if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE && + bch_moving_gc_start(ca)) { percpu_ref_put(&ca->ref); goto err; } - } err = "error starting tiering thread"; - if (bch_tiering_read_start(c)) + if (bch_tiering_start(c)) goto err; schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); - return NULL; + if (c->state != BCH_FS_STARTING) + percpu_ref_reinit(&c->writes); + + c->state = BCH_FS_RW; + err = NULL; +out: + mutex_unlock(&c->state_lock); + return err; err: __bch_fs_read_only(c); - return err; + goto out; } -const char *bch_fs_read_write(struct cache_set *c) -{ - const char *err; - - lockdep_assert_held(&bch_register_lock); - - if 
(!test_bit(BCH_FS_RO_COMPLETE, &c->flags)) - return NULL; - - err = __bch_fs_read_write(c); - if (err) - return err; - - percpu_ref_reinit(&c->writes); - - clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); - clear_bit(BCH_FS_EMERGENCY_RO, &c->flags); - clear_bit(BCH_FS_RO_COMPLETE, &c->flags); - clear_bit(BCH_FS_RO, &c->flags); - return NULL; -} - -/* Cache set startup/shutdown: */ +/* Filesystem startup/shutdown: */ static void bch_fs_free(struct cache_set *c) { - del_timer_sync(&c->foreground_write_wakeup); - cancel_delayed_work_sync(&c->pd_controllers_update); - cancel_work_sync(&c->read_only_work); - cancel_work_sync(&c->bio_submit_work); - cancel_work_sync(&c->read_retry_work); - - bch_fs_encryption_free(c); - bch_btree_cache_free(c); - bch_journal_free(&c->journal); + bch_fs_encryption_exit(c); + bch_fs_btree_exit(c); + bch_fs_journal_exit(&c->journal); bch_io_clock_exit(&c->io_clock[WRITE]); bch_io_clock_exit(&c->io_clock[READ]); - bch_compress_free(c); + bch_fs_compress_exit(c); bch_fs_blockdev_exit(c); bdi_destroy(&c->bdi); lg_lock_free(&c->bucket_stats_lock); @@ -372,6 +330,52 @@ static void bch_fs_free(struct cache_set *c) module_put(THIS_MODULE); } +static void bch_fs_exit(struct cache_set *c) +{ + unsigned i; + + del_timer_sync(&c->foreground_write_wakeup); + cancel_delayed_work_sync(&c->pd_controllers_update); + cancel_work_sync(&c->read_only_work); + cancel_work_sync(&c->bio_submit_work); + cancel_work_sync(&c->read_retry_work); + + for (i = 0; i < c->sb.nr_devices; i++) + if (c->cache[i]) + bch_dev_free(c->cache[i]); + + closure_debug_destroy(&c->cl); + kobject_put(&c->kobj); +} + +static void bch_fs_offline(struct cache_set *c) +{ + struct cache *ca; + unsigned i; + + mutex_lock(&bch_register_lock); + list_del(&c->list); + mutex_unlock(&bch_register_lock); + + if (c->kobj.state_in_sysfs) + kobject_del(&c->kobj); + + for_each_cache(ca, c, i) + if (ca->kobj.state_in_sysfs) + kobject_del(&ca->kobj); + + bch_fs_debug_exit(c); + bch_fs_chardev_exit(c); + + bch_cache_accounting_destroy(&c->accounting); + + kobject_put(&c->time_stats); + kobject_put(&c->opts_dir); + kobject_put(&c->internal); + + __bch_fs_read_only(c); +} + /* * should be __bch_fs_stop4 - block devices are closed, now we can finally * free it @@ -379,15 +383,9 @@ static void bch_fs_free(struct cache_set *c) void bch_fs_release(struct kobject *kobj) { struct cache_set *c = container_of(kobj, struct cache_set, kobj); - struct completion *stop_completion = c->stop_completion; bch_notify_fs_stopped(c); - bch_info(c, "stopped"); - bch_fs_free(c); - - if (stop_completion) - complete(stop_completion); } /* @@ -396,18 +394,8 @@ void bch_fs_release(struct kobject *kobj) static void __bch_fs_stop3(struct closure *cl) { struct cache_set *c = container_of(cl, struct cache_set, cl); - struct cache *ca; - unsigned i; - mutex_lock(&bch_register_lock); - for_each_cache(ca, c, i) - bch_dev_stop(ca); - - list_del(&c->list); - mutex_unlock(&bch_register_lock); - - closure_debug_destroy(&c->cl); - kobject_put(&c->kobj); + bch_fs_exit(c); } /* @@ -418,28 +406,14 @@ static void __bch_fs_stop2(struct closure *cl) { struct cache_set *c = container_of(cl, struct cache_set, caching); - bch_debug_exit_cache_set(c); - bch_fs_chardev_exit(c); - - if (c->kobj.state_in_sysfs) - kobject_del(&c->kobj); - - bch_cache_accounting_destroy(&c->accounting); - - kobject_put(&c->time_stats); - kobject_put(&c->opts_dir); - kobject_put(&c->internal); - - mutex_lock(&bch_register_lock); - bch_fs_read_only_sync(c); - mutex_unlock(&bch_register_lock); + 
bch_fs_offline(c); closure_return(cl); } /* - * First phase of the shutdown process that's kicked off by bch_fs_stop(); we - * haven't waited for anything to stop yet, we're just punting to process + * First phase of the shutdown process that's kicked off by bch_fs_stop_async(); + * we haven't waited for anything to stop yet, we're just punting to process * context to shut down block devices: */ static void __bch_fs_stop1(struct closure *cl) @@ -451,29 +425,42 @@ static void __bch_fs_stop1(struct closure *cl) continue_at(cl, __bch_fs_stop2, system_wq); } -void bch_fs_stop(struct cache_set *c) +void bch_fs_stop_async(struct cache_set *c) { - if (!test_and_set_bit(BCH_FS_STOPPING, &c->flags)) + mutex_lock(&c->state_lock); + if (c->state != BCH_FS_STOPPING) { + c->state = BCH_FS_STOPPING; closure_queue(&c->caching); + } + mutex_unlock(&c->state_lock); } -void bch_fs_stop_sync(struct cache_set *c) +void bch_fs_stop(struct cache_set *c) { - DECLARE_COMPLETION_ONSTACK(complete); + mutex_lock(&c->state_lock); + BUG_ON(c->state == BCH_FS_STOPPING); + c->state = BCH_FS_STOPPING; + mutex_unlock(&c->state_lock); + + bch_blockdevs_stop(c); + + closure_sync(&c->caching); + closure_debug_destroy(&c->caching); + + bch_fs_offline(c); - c->stop_completion = &complete; - bch_fs_stop(c); closure_put(&c->cl); + closure_sync(&c->cl); - /* Killable? */ - wait_for_completion(&complete); + bch_fs_exit(c); + kobject_put(&c->kobj); } /* Stop, detaching from backing devices: */ void bch_fs_detach(struct cache_set *c) { if (!test_and_set_bit(BCH_FS_DETACHING, &c->flags)) - bch_fs_stop(c); + bch_fs_stop_async(c); } static unsigned bch_fs_nr_devices(struct cache_set *c) @@ -520,6 +507,7 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->minor = -1; + mutex_init(&c->state_lock); mutex_init(&c->sb_lock); INIT_RADIX_TREE(&c->devices, GFP_KERNEL); mutex_init(&c->btree_cache_lock); @@ -534,8 +522,8 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts) BCH_TIME_STATS() #undef BCH_TIME_STAT - bch_open_buckets_init(c); - bch_tiering_init_cache_set(c); + bch_fs_allocator_init(c); + bch_fs_tiering_init(c); INIT_LIST_HEAD(&c->list); INIT_LIST_HEAD(&c->cached_devs); @@ -636,10 +624,10 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch_fs_blockdev_init(c) || bch_io_clock_init(&c->io_clock[READ]) || bch_io_clock_init(&c->io_clock[WRITE]) || - bch_journal_alloc(&c->journal, journal_entry_bytes) || - bch_btree_cache_alloc(c) || + bch_fs_journal_init(&c->journal, journal_entry_bytes) || + bch_fs_btree_init(c) || bch_fs_encryption_init(c) || - bch_compress_init(c) || + bch_fs_compress_init(c) || bch_check_set_has_compressed_data(c, c->opts.compression)) goto err; @@ -664,6 +652,7 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts) closure_init(&c->caching, &c->cl); set_closure_fn(&c->caching, __bch_fs_stop1, system_wq); + closure_get(&c->cl); continue_at_noreturn(&c->cl, __bch_fs_stop3, system_wq); return c; err: @@ -671,7 +660,20 @@ err: return NULL; } -static int bch_fs_online(struct cache_set *c) +static struct cache_set *bch_fs_lookup(uuid_le uuid) +{ + struct cache_set *c; + + lockdep_assert_held(&bch_register_lock); + + list_for_each_entry(c, &bch_fs_list, list) + if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le))) + return c; + + return NULL; +} + +static const char *__bch_fs_online(struct cache_set *c) { struct cache *ca; unsigned i; @@ -680,31 +682,58 @@ static int bch_fs_online(struct cache_set *c) 
lockdep_assert_held(&bch_register_lock); if (!list_empty(&c->list)) - return 0; + return NULL; - list_add(&c->list, &bch_fs_list); + if (bch_fs_lookup(c->sb.uuid)) + return "filesystem UUID already open"; ret = bch_fs_chardev_init(c); if (ret) - return ret; + return "error creating character device"; + + bch_fs_debug_init(c); if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) || kobject_add(&c->internal, &c->kobj, "internal") || kobject_add(&c->opts_dir, &c->kobj, "options") || kobject_add(&c->time_stats, &c->kobj, "time_stats") || bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj)) - return -1; + return "error creating sysfs objects"; for_each_cache(ca, c, i) if (bch_dev_online(ca)) { percpu_ref_put(&ca->ref); - return -1; + return "error creating sysfs objects"; } + mutex_lock(&c->state_lock); + + if (bch_blockdev_volumes_start(c)) { + mutex_unlock(&c->state_lock); + return "can't bring up blockdev volumes"; + } + + bch_attach_backing_devs(c); + + mutex_unlock(&c->state_lock); + + list_add(&c->list, &bch_fs_list); + return 0; } -static const char *bch_fs_start(struct cache_set *c) +static const char *bch_fs_online(struct cache_set *c) +{ + const char *err; + + mutex_lock(&bch_register_lock); + err = __bch_fs_online(c); + mutex_unlock(&bch_register_lock); + + return err; +} + +static const char *__bch_fs_start(struct cache_set *c) { const char *err = "cannot allocate memory"; struct bch_sb_field_members *mi; @@ -715,11 +744,7 @@ static const char *bch_fs_start(struct cache_set *c) struct jset *j; int ret = -EINVAL; - lockdep_assert_held(&bch_register_lock); - BUG_ON(test_bit(BCH_FS_RUNNING, &c->flags)); - - /* We don't want bch_fatal_error() to free underneath us */ - closure_get(&c->caching); + BUG_ON(c->state != BCH_FS_STARTING); /* * Make sure that each cache object's mi is up to date before @@ -826,22 +851,8 @@ static const char *bch_fs_start(struct cache_set *c) bch_notice(c, "initializing new filesystem"); - err = "unable to allocate journal buckets"; - for_each_cache(ca, c, i) - if (bch_dev_journal_alloc(ca)) { - percpu_ref_put(&ca->ref); - goto err; - } - bch_initial_gc(c, NULL); - /* - * journal_res_get() will crash if called before this has - * set up the journal.pin FIFO and journal.cur pointer: - */ - bch_journal_start(c); - bch_journal_set_replay_done(&c->journal); - err = "error starting allocator thread"; for_each_cache(ca, c, i) if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE && @@ -850,6 +861,20 @@ static const char *bch_fs_start(struct cache_set *c) goto err; } + err = "unable to allocate journal buckets"; + for_each_cache(ca, c, i) + if (bch_dev_journal_alloc(ca)) { + percpu_ref_put(&ca->ref); + goto err; + } + + /* + * journal_res_get() will crash if called before this has + * set up the journal.pin FIFO and journal.cur pointer: + */ + bch_journal_start(c); + bch_journal_set_replay_done(&c->journal); + err = "cannot allocate new btree root"; for (id = 0; id < BTREE_ID_NR; id++) if (bch_btree_root_alloc(c, id, &cl)) { @@ -877,10 +902,14 @@ static const char *bch_fs_start(struct cache_set *c) goto err; } recovery_done: + err = "dynamic fault"; + if (bch_fs_init_fault("fs_start")) + goto err; + if (c->opts.read_only) { - bch_fs_read_only_sync(c); + bch_fs_read_only(c); } else { - err = __bch_fs_read_write(c); + err = bch_fs_read_write(c); if (err) goto err; } @@ -901,27 +930,9 @@ recovery_done: bch_write_super(c); mutex_unlock(&c->sb_lock); - err = "dynamic fault"; - if (bch_fs_init_fault("fs_start")) - goto err; - - err = "error creating kobject"; - if 
(bch_fs_online(c)) - goto err; - - err = "can't bring up blockdev volumes"; - if (bch_blockdev_volumes_start(c)) - goto err; - - bch_debug_init_cache_set(c); - set_bit(BCH_FS_RUNNING, &c->flags); - bch_attach_backing_devs(c); - - bch_notify_fs_read_write(c); err = NULL; out: bch_journal_entries_free(&journal); - closure_put(&c->caching); return err; err: switch (ret) { @@ -955,6 +966,11 @@ err: goto out; } +const char *bch_fs_start(struct cache_set *c) +{ + return __bch_fs_start(c) ?: bch_fs_online(c); +} + static const char *bch_dev_may_add(struct bch_sb *sb, struct cache_set *c) { struct bch_sb_field_members *sb_mi; @@ -999,7 +1015,7 @@ static const char *bch_dev_in_fs(struct bch_sb *sb, struct cache_set *c) return NULL; } -/* Cache device */ +/* Device startup/shutdown, ro/rw: */ bool bch_dev_read_only(struct cache *ca) { @@ -1009,14 +1025,14 @@ bool bch_dev_read_only(struct cache *ca) bdevname(ca->disk_sb.bdev, buf); - lockdep_assert_held(&bch_register_lock); + lockdep_assert_held(&c->state_lock); if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE) return false; if (!bch_dev_may_remove(ca)) { bch_err(c, "required member %s going RO, forcing fs RO", buf); - bch_fs_read_only_sync(c); + bch_fs_read_only(c); } trace_bcache_cache_read_only(ca); @@ -1053,7 +1069,7 @@ bool bch_dev_read_only(struct cache *ca) static const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca) { - lockdep_assert_held(&bch_register_lock); + lockdep_assert_held(&c->state_lock); if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE) return NULL; @@ -1066,12 +1082,11 @@ static const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca) if (bch_dev_allocator_start(ca)) return "error starting allocator thread"; - if (bch_moving_gc_thread_start(ca)) + if (bch_moving_gc_start(ca)) return "error starting moving GC thread"; - bch_dev_group_add(&c->journal.devs, ca); - - wake_up_process(c->tiering_read); + if (bch_tiering_start(c)) + return "error starting tiering thread"; bch_notify_dev_read_write(ca); trace_bcache_cache_read_write_done(ca); @@ -1099,22 +1114,15 @@ const char *bch_dev_read_write(struct cache *ca) return NULL; } -/* - * bch_dev_stop has already returned, so we no longer hold the register - * lock at the point this is called. - */ - void bch_dev_release(struct kobject *kobj) { struct cache *ca = container_of(kobj, struct cache, kobj); - percpu_ref_exit(&ca->ref); kfree(ca); } -static void bch_dev_free_work(struct work_struct *work) +static void bch_dev_free(struct cache *ca) { - struct cache *ca = container_of(work, struct cache, free_work); struct cache_set *c = ca->set; unsigned i; @@ -1131,15 +1139,7 @@ static void bch_dev_free_work(struct work_struct *work) kobject_del(&ca->kobj); bch_free_super(&ca->disk_sb); - - /* - * bch_dev_stop can be called in the middle of initialization - * of the struct cache object. - * As such, not all the sub-structures may be initialized. - * However, they were zeroed when the object was allocated. 
- */ - - bch_journal_free_cache(ca); + bch_dev_journal_exit(ca); free_percpu(ca->sectors_written); bioset_exit(&ca->replica_set); free_percpu(ca->bucket_stats_percpu); @@ -1155,12 +1155,20 @@ static void bch_dev_free_work(struct work_struct *work) for (i = 0; i < RESERVE_NR; i++) free_fifo(&ca->free[i]); + percpu_ref_exit(&ca->ref); kobject_put(&ca->kobj); if (c) kobject_put(&c->kobj); } +static void bch_dev_free_work(struct work_struct *work) +{ + struct cache *ca = container_of(work, struct cache, free_work); + + bch_dev_free(ca); +} + static void bch_dev_percpu_ref_release(struct percpu_ref *ref) { struct cache *ca = container_of(ref, struct cache, ref); @@ -1193,12 +1201,10 @@ static void bch_dev_stop(struct cache *ca) { struct cache_set *c = ca->set; - lockdep_assert_held(&bch_register_lock); + lockdep_assert_held(&c->state_lock); - if (c) { - BUG_ON(rcu_access_pointer(c->cache[ca->dev_idx]) != ca); - rcu_assign_pointer(c->cache[ca->dev_idx], NULL); - } + BUG_ON(rcu_access_pointer(c->cache[ca->dev_idx]) != ca); + rcu_assign_pointer(c->cache[ca->dev_idx], NULL); call_rcu(&ca->free_rcu, bch_dev_free_rcu); } @@ -1281,7 +1287,8 @@ static void bch_dev_remove_work(struct work_struct *work) */ closure_get(&c->cl); - mutex_lock(&bch_register_lock); + mutex_lock(&c->state_lock); + bch_dev_stop(ca); /* @@ -1290,8 +1297,6 @@ static void bch_dev_remove_work(struct work_struct *work) */ synchronize_rcu(); - lockdep_assert_held(&bch_register_lock); - /* * Free this device's slot in the bch_member array - all pointers to * this device must be gone: @@ -1301,23 +1306,20 @@ static void bch_dev_remove_work(struct work_struct *work) memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid)); bch_write_super(c); - mutex_unlock(&c->sb_lock); - mutex_unlock(&bch_register_lock); + mutex_unlock(&c->sb_lock); + mutex_unlock(&c->state_lock); closure_put(&c->cl); } -bool bch_dev_remove(struct cache *ca, bool force) +static bool __bch_dev_remove(struct cache_set *c, struct cache *ca, bool force) { - mutex_lock(&bch_register_lock); - if (test_bit(BCH_DEV_REMOVING, &ca->flags)) return false; if (!bch_dev_may_remove(ca)) { - bch_err(ca->set, "Can't remove last device in tier %u", - ca->mi.tier); + bch_err(ca->set, "Can't remove last RW device"); bch_notify_dev_remove_failed(ca); return false; } @@ -1327,23 +1329,32 @@ bool bch_dev_remove(struct cache *ca, bool force) if (force) set_bit(BCH_DEV_FORCE_REMOVE, &ca->flags); + set_bit(BCH_DEV_REMOVING, &ca->flags); bch_notify_dev_removing(ca); - mutex_unlock(&bch_register_lock); - /* Migrate the data and finish removal asynchronously: */ queue_work(system_long_wq, &ca->remove_work); return true; } +bool bch_dev_remove(struct cache *ca, bool force) +{ + struct cache_set *c = ca->set; + bool ret; + + mutex_lock(&c->state_lock); + ret = __bch_dev_remove(c, ca, force); + mutex_unlock(&c->state_lock); + + return ret; +} + static int bch_dev_online(struct cache *ca) { char buf[12]; - lockdep_assert_held(&bch_register_lock); - sprintf(buf, "cache%u", ca->dev_idx); if (kobject_add(&ca->kobj, @@ -1386,7 +1397,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb, kobject_init(&ca->kobj, &bch_dev_ktype); spin_lock_init(&ca->self.lock); - ca->self.nr_devices = 1; + ca->self.nr = 1; rcu_assign_pointer(ca->self.d[0].dev, ca); ca->dev_idx = sb->sb->dev_idx; @@ -1395,10 +1406,11 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb, spin_lock_init(&ca->freelist_lock); spin_lock_init(&ca->prio_buckets_lock); mutex_init(&ca->heap_lock); - 
bch_moving_init_cache(ca); + bch_dev_moving_gc_init(ca); ca->disk_sb = *sb; - ca->disk_sb.bdev->bd_holder = ca; + if (sb->mode & FMODE_EXCL) + ca->disk_sb.bdev->bd_holder = ca; memset(sb, 0, sizeof(*sb)); INIT_WORK(&ca->io_error_work, bch_nonfatal_io_error_work); @@ -1444,7 +1456,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb, bioset_init(&ca->replica_set, 4, offsetof(struct bch_write_bio, bio)) || !(ca->sectors_written = alloc_percpu(*ca->sectors_written)) || - bch_journal_init_cache(ca)) + bch_dev_journal_init(ca)) goto err; ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca); @@ -1482,7 +1494,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb, err = "error creating kobject"; if (c->kobj.state_in_sysfs && bch_dev_online(ca)) - goto err; + pr_warn("error creating sysfs objects"); if (ret) *ret = ca; @@ -1490,49 +1502,34 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb, kobject_put(&ca->kobj); return NULL; err: - bch_dev_stop(ca); + bch_dev_free(ca); return err; } -static struct cache_set *bch_fs_lookup(uuid_le uuid) -{ - struct cache_set *c; - - lockdep_assert_held(&bch_register_lock); - - list_for_each_entry(c, &bch_fs_list, list) - if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le))) - return c; - - return NULL; -} - int bch_dev_add(struct cache_set *c, const char *path) { struct bcache_superblock sb; const char *err; struct cache *ca; - struct bch_sb_field *f; struct bch_sb_field_members *mi, *dev_mi; struct bch_member saved_mi; unsigned dev_idx, nr_devices, u64s; int ret = -EINVAL; - mutex_lock(&bch_register_lock); - err = bch_read_super(&sb, c->opts, path); if (err) - goto err_unlock_register; + return -EINVAL; err = bch_validate_cache_super(&sb); if (err) - goto err_unlock_register; - - mutex_lock(&c->sb_lock); + return -EINVAL; err = bch_dev_may_add(sb.sb, c); if (err) - goto err_unlock; + return -EINVAL; + + mutex_lock(&c->state_lock); + mutex_lock(&c->sb_lock); /* * Preserve the old cache member information (esp. 
tier) @@ -1571,17 +1568,14 @@ have_slot: sizeof(struct bch_member) * nr_devices) / sizeof(u64); err = "no space in superblock for member info"; - f = bch_fs_sb_field_resize(c, &mi->field, u64s); - if (!f) + mi = bch_fs_sb_resize_members(c, u64s); + if (!mi) goto err_unlock; - mi = container_of(f, struct bch_sb_field_members, field); - - f = bch_dev_sb_field_resize(&sb, &dev_mi->field, u64s); - if (!f) + dev_mi = bch_sb_resize_members(&sb, u64s); + if (!dev_mi) goto err_unlock; - dev_mi = container_of(f, struct bch_sb_field_members, field); memcpy(dev_mi, mi, u64s * sizeof(u64)); dev_mi->members[dev_idx] = saved_mi; @@ -1619,14 +1613,13 @@ have_slot: kobject_put(&ca->kobj); mutex_unlock(&c->sb_lock); - mutex_unlock(&bch_register_lock); + mutex_unlock(&c->state_lock); return 0; err_put: bch_dev_stop(ca); err_unlock: mutex_unlock(&c->sb_lock); -err_unlock_register: - mutex_unlock(&bch_register_lock); + mutex_unlock(&c->state_lock); bch_free_super(&sb); bch_err(c, "Unable to add device: %s", err); @@ -1639,11 +1632,8 @@ const char *bch_fs_open(char * const *devices, unsigned nr_devices, const char *err; struct cache_set *c = NULL; struct bcache_superblock *sb; - uuid_le uuid; unsigned i; - memset(&uuid, 0, sizeof(uuid_le)); - if (!nr_devices) return "need at least one device"; @@ -1655,60 +1645,49 @@ const char *bch_fs_open(char * const *devices, unsigned nr_devices, if (!sb) goto err; - /* - * bch_read_super() needs to happen under register_lock, so that the - * exclusive open is atomic with adding the new cache set to the list of - * cache sets: - */ - mutex_lock(&bch_register_lock); - for (i = 0; i < nr_devices; i++) { err = bch_read_super(&sb[i], opts, devices[i]); if (err) - goto err_unlock; + goto err; err = "attempting to register backing device"; if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version))) - goto err_unlock; + goto err; err = bch_validate_cache_super(&sb[i]); if (err) - goto err_unlock; + goto err; } - err = "cache set already registered"; - if (bch_fs_lookup(sb->sb->uuid)) - goto err_unlock; - err = "cannot allocate memory"; c = bch_fs_alloc(sb[0].sb, opts); if (!c) - goto err_unlock; + goto err; for (i = 0; i < nr_devices; i++) { err = bch_dev_alloc(&sb[i], c, NULL); if (err) - goto err_unlock; + goto err; } err = "insufficient devices"; if (bch_fs_nr_online_devices(c) != bch_fs_nr_devices(c)) - goto err_unlock; + goto err; - err = bch_fs_start(c); - if (err) - goto err_unlock; - - err = "error creating kobject"; - if (bch_fs_online(c)) - goto err_unlock; - - if (ret) { - closure_get(&c->cl); - *ret = c; + if (!c->opts.nostart) { + err = __bch_fs_start(c); + if (err) + goto err; } - mutex_unlock(&bch_register_lock); + err = bch_fs_online(c); + if (err) + goto err; + + if (ret) + *ret = c; + else + closure_put(&c->cl); err = NULL; out: @@ -1717,20 +1696,18 @@ out: if (err) c = NULL; return err; -err_unlock: +err: if (c) bch_fs_stop(c); - mutex_unlock(&bch_register_lock); -err: + for (i = 0; i < nr_devices; i++) bch_free_super(&sb[i]); goto out; } static const char *__bch_fs_open_incremental(struct bcache_superblock *sb, - struct bch_opts opts) + struct bch_opts opts) { - char name[BDEVNAME_SIZE]; const char *err; struct cache_set *c; bool allocated_cache_set = false; @@ -1739,17 +1716,19 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb, if (err) return err; - bdevname(sb->bdev, name); - + mutex_lock(&bch_register_lock); c = bch_fs_lookup(sb->sb->uuid); if (c) { + closure_get(&c->cl); + err = bch_dev_in_fs(sb->sb, c); if (err) - return err; + goto err; 
} else { c = bch_fs_alloc(sb->sb, opts); + err = "cannot allocate memory"; if (!c) - return "cannot allocate memory"; + goto err; allocated_cache_set = true; } @@ -1758,21 +1737,29 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb, if (err) goto err; - if (bch_fs_nr_online_devices(c) == bch_fs_nr_devices(c)) { - err = bch_fs_start(c); + if (bch_fs_nr_online_devices(c) == bch_fs_nr_devices(c) && + !c->opts.nostart) { + err = __bch_fs_start(c); if (err) goto err; - } else { - err = "error creating kobject"; - if (bch_fs_online(c)) - goto err; } - bch_info(c, "started"); + err = __bch_fs_online(c); + if (err) + goto err; + + closure_put(&c->cl); + mutex_unlock(&bch_register_lock); + return NULL; err: + mutex_unlock(&bch_register_lock); + if (allocated_cache_set) bch_fs_stop(c); + else if (c) + closure_put(&c->cl); + return err; } @@ -1782,20 +1769,20 @@ const char *bch_fs_open_incremental(const char *path) struct bch_opts opts = bch_opts_empty(); const char *err; - mutex_lock(&bch_register_lock); - err = bch_read_super(&sb, opts, path); if (err) - goto err; + return err; - if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version))) + if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version))) { + mutex_lock(&bch_register_lock); err = bch_backing_dev_register(&sb); - else + mutex_unlock(&bch_register_lock); + } else { err = __bch_fs_open_incremental(&sb, opts); + } bch_free_super(&sb); -err: - mutex_unlock(&bch_register_lock); + return err; } @@ -1854,10 +1841,10 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) pr_info("Setting all devices read only:"); list_for_each_entry(c, &bch_fs_list, list) - bch_fs_read_only(c); + bch_fs_read_only_async(c); list_for_each_entry(c, &bch_fs_list, list) - bch_fs_read_only_sync(c); + bch_fs_read_only(c); mutex_unlock(&bch_register_lock); } @@ -1882,7 +1869,7 @@ kobj_attribute_write(reboot, reboot_test); static void bcache_exit(void) { bch_debug_exit(); - bch_fs_exit(); + bch_vfs_exit(); bch_blockdev_exit(); bch_chardev_exit(); if (bcache_kset) @@ -1917,7 +1904,7 @@ static int __init bcache_init(void) sysfs_create_files(&bcache_kset->kobj, files) || bch_chardev_init() || bch_blockdev_init() || - bch_fs_init() || + bch_vfs_init() || bch_debug_init()) goto err; diff --git a/libbcache/super.h b/libbcache/super.h index bcf7d983..bafd88e0 100644 --- a/libbcache/super.h +++ b/libbcache/super.h @@ -57,27 +57,11 @@ static inline struct cache *bch_get_next_cache(struct cache_set *c, static inline bool bch_dev_may_remove(struct cache *ca) { struct cache_set *c = ca->set; - struct cache_group *tier = &c->cache_tiers[ca->mi.tier]; + struct cache_group *grp = &c->cache_all; - /* - * Right now, we can't remove the last device from a tier, - * - For tier 0, because all metadata lives in tier 0 and because - * there is no way to have foreground writes go directly to tier 1. - * - For tier 1, because the code doesn't completely support an - * empty tier 1. - */ - - /* - * Turning a device read-only removes it from the cache group, - * so there may only be one read-write device in a tier, and yet - * the device we are removing is in the same tier, so we have - * to check for identity. - * Removing the last RW device from a tier requires turning the - * whole cache set RO. 
- */ - - return tier->nr_devices != 1 || - rcu_access_pointer(tier->d[0].dev) != ca; + /* Can't remove the last RW device: */ + return grp->nr != 1 || + rcu_access_pointer(grp->d[0].dev) != ca; } void bch_dev_release(struct kobject *); @@ -89,15 +73,15 @@ int bch_dev_add(struct cache_set *, const char *); void bch_fs_detach(struct cache_set *); -bool bch_fs_read_only(struct cache_set *); bool bch_fs_emergency_read_only(struct cache_set *); -void bch_fs_read_only_sync(struct cache_set *); +void bch_fs_read_only(struct cache_set *); const char *bch_fs_read_write(struct cache_set *); void bch_fs_release(struct kobject *); +void bch_fs_stop_async(struct cache_set *); void bch_fs_stop(struct cache_set *); -void bch_fs_stop_sync(struct cache_set *); +const char *bch_fs_start(struct cache_set *); const char *bch_fs_open(char * const *, unsigned, struct bch_opts, struct cache_set **); const char *bch_fs_open_incremental(const char *path); diff --git a/libbcache/super_types.h b/libbcache/super_types.h index 41eaf0dd..69c747de 100644 --- a/libbcache/super_types.h +++ b/libbcache/super_types.h @@ -6,6 +6,7 @@ struct bcache_superblock { struct block_device *bdev; struct bio *bio; unsigned page_order; + fmode_t mode; }; #endif /* _BCACHE_SUPER_TYPES_H */ diff --git a/libbcache/sysfs.c b/libbcache/sysfs.c index 9f45a6b0..48f9f1f6 100644 --- a/libbcache/sysfs.c +++ b/libbcache/sysfs.c @@ -22,6 +22,7 @@ #include "opts.h" #include "request.h" #include "super-io.h" +#include "tier.h" #include "writeback.h" #include <linux/blkdev.h> @@ -121,6 +122,8 @@ rw_attribute(cache_replacement_policy); rw_attribute(foreground_write_ratelimit_enabled); rw_attribute(copy_gc_enabled); sysfs_pd_controller_attribute(copy_gc); + +rw_attribute(tier); rw_attribute(tiering_enabled); rw_attribute(tiering_percent); sysfs_pd_controller_attribute(tiering); @@ -134,7 +137,6 @@ rw_attribute(foreground_target_percent); rw_attribute(size); read_attribute(meta_replicas_have); read_attribute(data_replicas_have); -read_attribute(tier); #define BCH_DEBUG_PARAM(name, description) \ rw_attribute(name); @@ -680,7 +682,8 @@ SHOW(bch_fs) sysfs_printf(tiering_enabled, "%i", c->tiering_enabled); sysfs_print(tiering_percent, c->tiering_percent); - sysfs_pd_controller_show(tiering, &c->tiering_pd); + + sysfs_pd_controller_show(tiering, &c->tiers[1].pd); /* XXX */ sysfs_printf(meta_replicas_have, "%u", c->sb.meta_replicas_have); sysfs_printf(data_replicas_have, "%u", c->sb.data_replicas_have); @@ -694,7 +697,7 @@ SHOW(bch_fs) BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM - if (!test_bit(BCH_FS_RUNNING, &c->flags)) + if (!bch_fs_running(c)) return -EPERM; if (attr == &sysfs_bset_tree_stats) @@ -723,7 +726,7 @@ STORE(__bch_fs) } if (attr == &sysfs_stop) { - bch_fs_stop(c); + bch_fs_stop_async(c); return size; } @@ -773,25 +776,18 @@ STORE(__bch_fs) ssize_t ret = strtoul_safe(buf, c->tiering_enabled) ?: (ssize_t) size; - if (c->tiering_read) - wake_up_process(c->tiering_read); + bch_tiering_start(c); /* issue wakeups */ return ret; } sysfs_pd_controller_store(foreground_write, &c->foreground_write_pd); - if (attr == &sysfs_journal_flush) { - bch_journal_meta_async(&c->journal, NULL); - - return size; - } - sysfs_strtoul(pd_controllers_update_seconds, c->pd_controllers_update_seconds); sysfs_strtoul(foreground_target_percent, c->foreground_target_percent); sysfs_strtoul(tiering_percent, c->tiering_percent); - sysfs_pd_controller_store(tiering, &c->tiering_pd); + sysfs_pd_controller_store(tiering, &c->tiers[1].pd); /* XXX */ /* Debugging: */ @@ -799,11 
+795,14 @@ STORE(__bch_fs) BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM - if (!test_bit(BCH_FS_RUNNING, &c->flags)) + if (!bch_fs_running(c)) return -EPERM; - if (test_bit(BCH_FS_STOPPING, &c->flags)) - return -EINTR; + if (attr == &sysfs_journal_flush) { + bch_journal_meta_async(&c->journal, NULL); + + return size; + } if (attr == &sysfs_blockdev_volume_create) { u64 v = strtoi_h_or_return(buf); @@ -836,9 +835,9 @@ STORE(bch_fs) { struct cache_set *c = container_of(kobj, struct cache_set, kobj); - mutex_lock(&bch_register_lock); + mutex_lock(&c->state_lock); size = __bch_fs_store(kobj, attr, buf, size); - mutex_unlock(&bch_register_lock); + mutex_unlock(&c->state_lock); if (attr == &sysfs_add_device) { char *path = kstrdup(buf, GFP_KERNEL); @@ -1273,6 +1272,31 @@ STORE(__bch_dev) mutex_unlock(&c->sb_lock); } + if (attr == &sysfs_tier) { + unsigned prev_tier; + unsigned v = strtoul_restrict_or_return(buf, + 0, BCH_TIER_MAX - 1); + + mutex_lock(&c->sb_lock); + prev_tier = ca->mi.tier; + + if (v == ca->mi.tier) { + mutex_unlock(&c->sb_lock); + return size; + } + + mi = &bch_sb_get_members(c->disk_sb)->members[ca->dev_idx]; + SET_BCH_MEMBER_TIER(mi, v); + bch_write_super(c); + + bch_dev_group_remove(&c->tiers[prev_tier].devs, ca); + bch_dev_group_add(&c->tiers[ca->mi.tier].devs, ca); + mutex_unlock(&c->sb_lock); + + bch_recalc_capacity(c); + bch_tiering_start(c); + } + if (attr == &sysfs_state_rw) { char name[BDEVNAME_SIZE]; const char *err = NULL; diff --git a/libbcache/tier.c b/libbcache/tier.c index 46864594..0ab17708 100644 --- a/libbcache/tier.c +++ b/libbcache/tier.c @@ -16,8 +16,7 @@ #include <trace/events/bcache.h> struct tiering_state { - struct cache_group *tier; - unsigned tier_idx; + struct bch_tier *tier; unsigned sectors; unsigned stripe_size; unsigned dev_idx; @@ -42,7 +41,7 @@ static bool tiering_pred(struct cache_set *c, mi = cache_member_info_get(c); extent_for_each_ptr(e, ptr) if (ptr->dev < mi->nr_devices && - mi->m[ptr->dev].tier >= s->tier_idx) + mi->m[ptr->dev].tier >= s->tier->idx) replicas++; cache_member_info_put(); @@ -69,15 +68,15 @@ static void tier_next_device(struct cache_set *c, struct tiering_state *s) s->sectors = 0; s->dev_idx++; - spin_lock(&s->tier->lock); - if (s->dev_idx >= s->tier->nr_devices) + spin_lock(&s->tier->devs.lock); + if (s->dev_idx >= s->tier->devs.nr) s->dev_idx = 0; - if (s->tier->nr_devices) { - s->ca = s->tier->d[s->dev_idx].dev; + if (s->tier->devs.nr) { + s->ca = s->tier->devs.d[s->dev_idx].dev; percpu_ref_get(&s->ca->ref); } - spin_unlock(&s->tier->lock); + spin_unlock(&s->tier->devs.lock); } } @@ -103,13 +102,13 @@ static int issue_tiering_move(struct cache_set *c, * tiering_next_cache - issue a move to write an extent to the next cache * device in round robin order */ -static s64 read_tiering(struct cache_set *c, struct cache_group *tier) +static s64 read_tiering(struct cache_set *c, struct bch_tier *tier) { struct moving_context ctxt; struct tiering_state s; struct btree_iter iter; struct bkey_s_c k; - unsigned nr_devices = READ_ONCE(tier->nr_devices); + unsigned nr_devices = READ_ONCE(tier->devs.nr); int ret; if (!nr_devices) @@ -119,10 +118,9 @@ static s64 read_tiering(struct cache_set *c, struct cache_group *tier) memset(&s, 0, sizeof(s)); s.tier = tier; - s.tier_idx = tier - c->cache_tiers; s.stripe_size = 2048; /* 1 mb for now */ - bch_move_ctxt_init(&ctxt, &c->tiering_pd.rate, + bch_move_ctxt_init(&ctxt, &tier->pd.rate, nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE); bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); @@ 
-164,8 +162,8 @@ next: static int bch_tiering_thread(void *arg) { - struct cache_set *c = arg; - struct cache_group *tier = &c->cache_tiers[1]; + struct bch_tier *tier = arg; + struct cache_set *c = container_of(tier, struct cache_set, tiers[tier->idx]); struct io_clock *clock = &c->io_clock[WRITE]; struct cache *ca; u64 tier_capacity, available_sectors; @@ -176,20 +174,20 @@ static int bch_tiering_thread(void *arg) while (!kthread_should_stop()) { if (kthread_wait_freezable(c->tiering_enabled && - tier->nr_devices)) + tier->devs.nr)) break; while (1) { - struct cache_group *faster_tier; + struct bch_tier *faster_tier; last = atomic_long_read(&clock->now); tier_capacity = available_sectors = 0; rcu_read_lock(); - for (faster_tier = c->cache_tiers; + for (faster_tier = c->tiers; faster_tier != tier; faster_tier++) { - group_for_each_cache_rcu(ca, faster_tier, i) { + group_for_each_cache_rcu(ca, &faster_tier->devs, i) { tier_capacity += (ca->mi.nbuckets - ca->mi.first_bucket) << ca->bucket_bits; @@ -216,32 +214,73 @@ static int bch_tiering_thread(void *arg) return 0; } -void bch_tiering_init_cache_set(struct cache_set *c) +static void __bch_tiering_stop(struct bch_tier *tier) { - bch_pd_controller_init(&c->tiering_pd); + tier->pd.rate.rate = UINT_MAX; + bch_ratelimit_reset(&tier->pd.rate); + + if (tier->migrate) + kthread_stop(tier->migrate); + + tier->migrate = NULL; } -int bch_tiering_read_start(struct cache_set *c) +void bch_tiering_stop(struct cache_set *c) { - struct task_struct *t; + struct bch_tier *tier; + + for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) + __bch_tiering_stop(tier); +} + +static int __bch_tiering_start(struct bch_tier *tier) +{ + if (!tier->migrate) { + struct task_struct *p = + kthread_create(bch_tiering_thread, tier, + "bch_tier[%u]", tier->idx); + if (IS_ERR(p)) + return PTR_ERR(p); + + tier->migrate = p; + } + + wake_up_process(tier->migrate); + return 0; +} + +int bch_tiering_start(struct cache_set *c) +{ + struct bch_tier *tier; + bool have_faster_tier = false; if (c->opts.nochanges) return 0; - t = kthread_create(bch_tiering_thread, c, "bch_tier_read"); - if (IS_ERR(t)) - return PTR_ERR(t); + for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) { + if (!tier->devs.nr) + continue; - c->tiering_read = t; - wake_up_process(c->tiering_read); + if (have_faster_tier) { + int ret = __bch_tiering_start(tier); + if (ret) + return ret; + } else { + __bch_tiering_stop(tier); + } + + have_faster_tier = true; + } return 0; } -void bch_tiering_read_stop(struct cache_set *c) +void bch_fs_tiering_init(struct cache_set *c) { - if (!IS_ERR_OR_NULL(c->tiering_read)) { - kthread_stop(c->tiering_read); - c->tiering_read = NULL; + unsigned i; + + for (i = 0; i < ARRAY_SIZE(c->tiers); i++) { + c->tiers[i].idx = i; + bch_pd_controller_init(&c->tiers[i].pd); } } diff --git a/libbcache/tier.h b/libbcache/tier.h index 89c2bffd..b53e83d9 100644 --- a/libbcache/tier.h +++ b/libbcache/tier.h @@ -1,8 +1,8 @@ #ifndef _BCACHE_TIER_H #define _BCACHE_TIER_H -void bch_tiering_init_cache_set(struct cache_set *); -int bch_tiering_read_start(struct cache_set *); -void bch_tiering_read_stop(struct cache_set *); +void bch_tiering_stop(struct cache_set *); +int bch_tiering_start(struct cache_set *); +void bch_fs_tiering_init(struct cache_set *); #endif diff --git a/linux/blkdev.c b/linux/blkdev.c index 0bae9b0d..93459d0b 100644 --- a/linux/blkdev.c +++ b/linux/blkdev.c @@ -20,8 +20,14 @@ int submit_bio_wait(struct bio *bio) ssize_t ret; unsigned i; - if 
(bio->bi_opf & REQ_PREFLUSH) - fdatasync(bio->bi_bdev->bd_fd); + if (bio->bi_opf & REQ_PREFLUSH) { + ret = fdatasync(bio->bi_bdev->bd_fd); + if (ret) { + fprintf(stderr, "fsync error: %s\n", + strerror(errno)); + return -EIO; + } + } i = 0; bio_for_each_segment(bv, bio, iter) @@ -49,10 +55,22 @@ int submit_bio_wait(struct bio *bio) BUG(); } - if (bio->bi_opf & REQ_FUA) - fdatasync(bio->bi_bdev->bd_fd); + if (ret != bio->bi_iter.bi_size) { + fprintf(stderr, "IO error: %li (%s)\n", + ret, strerror(errno)); + return -EIO; + } - return ret == bio->bi_iter.bi_size ? 0 : -EIO; + if (bio->bi_opf & REQ_FUA) { + ret = fdatasync(bio->bi_bdev->bd_fd); + if (ret) { + fprintf(stderr, "fsync error: %s\n", + strerror(errno)); + return -EIO; + } + } + + return 0; } void generic_make_request(struct bio *bio) diff --git a/qcow2.c b/qcow2.c index cbc8d4c4..b7aa8c26 100644 --- a/qcow2.c +++ b/qcow2.c @@ -2,7 +2,6 @@ #include <errno.h> #include <sys/types.h> #include <unistd.h> -#include <linux/sort.h> #include "qcow2.h" #include "tools-util.h" @@ -69,18 +68,7 @@ static void add_l2(struct qcow2_image *img, u64 src_blk, u64 dst_offset) img->l2_table[l2_index] = cpu_to_be64(dst_offset|QCOW_OFLAG_COPIED); } -static int range_cmp(const void *_l, const void *_r) -{ - const struct range *l = _l, *r = _r; - - if (l->start < r->start) - return -1; - if (l->start > r->start) - return 1; - return 0; -} - -void qcow2_write_image(int infd, int outfd, sparse_data *data, +void qcow2_write_image(int infd, int outfd, ranges *data, unsigned block_size) { u64 image_size = get_size(NULL, infd); @@ -98,30 +86,11 @@ void qcow2_write_image(int infd, int outfd, sparse_data *data, struct range *r; char *buf = xmalloc(block_size); u64 src_offset, dst_offset; - sparse_data m; assert(is_power_of_2(block_size)); - sort(&darray_item(*data, 0), - darray_size(*data), - sizeof(darray_item(*data, 0)), - range_cmp, NULL); - - /* Round to blocksize, merge contiguous ranges: */ - darray_init(m); - darray_foreach(r, *data) { - struct range *l = m.size ? 
&m.item[m.size - 1] : NULL; - - r->start = round_down(r->start, block_size); - r->end = round_up(r->end, block_size); - - if (l && l->end >= r->start) - l->end = max(l->end, r->end); - else - darray_append(m, *r); - } - darray_free(*data); - *data = m; + ranges_roundup(data, block_size); + ranges_sort_merge(data); /* Write data: */ darray_foreach(r, *data) diff --git a/qcow2.h b/qcow2.h index c6f0b6ba..0943d55c 100644 --- a/qcow2.h +++ b/qcow2.h @@ -2,23 +2,8 @@ #define _QCOW2_H #include <linux/types.h> -#include "ccan/darray/darray.h" +#include "tools-util.h" -struct range { - u64 start; - u64 end; -}; - -typedef darray(struct range) sparse_data; - -static inline void data_add(sparse_data *data, u64 offset, u64 size) -{ - darray_append(*data, (struct range) { - .start = offset, - .end = offset + size - }); -} - -void qcow2_write_image(int, int, sparse_data *, unsigned); +void qcow2_write_image(int, int, ranges *, unsigned); #endif /* _QCOW2_H */ diff --git a/tools-util.c b/tools-util.c index 0a95fbe9..07fb82d1 100644 --- a/tools-util.c +++ b/tools-util.c @@ -1,4 +1,3 @@ -#include <alloca.h> #include <assert.h> #include <ctype.h> #include <errno.h> @@ -19,6 +18,7 @@ #include "ccan/crc/crc.h" #include "linux/bcache-ioctl.h" +#include "linux/sort.h" #include "tools-util.h" #include "util.h" @@ -59,20 +59,12 @@ struct units_buf __pr_units(u64 v, enum units units) char *read_file_str(int dirfd, const char *path) { - int fd = openat(dirfd, path, O_RDONLY); + int fd = xopenat(dirfd, path, O_RDONLY); + size_t len = xfstat(fd).st_size; - if (fd < 0) - die("Unable to open %s\n", path); + char *buf = malloc(len + 1); - struct stat statbuf; - if (fstat(fd, &statbuf) < 0) - die("fstat error\n"); - - char *buf = malloc(statbuf.st_size + 1); - - int len = read(fd, buf, statbuf.st_size); - if (len < 0) - die("read error while reading from file %s\n", path); + xpread(fd, buf, len, 0); buf[len] = '\0'; if (len && buf[len - 1] == '\n') @@ -107,48 +99,33 @@ ssize_t read_string_list_or_die(const char *opt, const char * const list[], /* Returns size of file or block device: */ u64 get_size(const char *path, int fd) { - struct stat statbuf; - u64 ret; - - if (fstat(fd, &statbuf)) - die("Error statting %s: %s", path, strerror(errno)); + struct stat statbuf = xfstat(fd); if (!S_ISBLK(statbuf.st_mode)) return statbuf.st_size; - if (ioctl(fd, BLKGETSIZE64, &ret)) - die("Error getting block device size on %s: %s\n", - path, strerror(errno)); - + u64 ret; + xioctl(fd, BLKGETSIZE64, &ret); return ret; } /* Returns blocksize in units of 512 byte sectors: */ unsigned get_blocksize(const char *path, int fd) { - struct stat statbuf; - if (fstat(fd, &statbuf)) - die("Error statting %s: %s", path, strerror(errno)); + struct stat statbuf = xfstat(fd); if (!S_ISBLK(statbuf.st_mode)) return statbuf.st_blksize >> 9; unsigned ret; - if (ioctl(fd, BLKPBSZGET, &ret)) - die("Error getting blocksize on %s: %s\n", - path, strerror(errno)); - + xioctl(fd, BLKPBSZGET, &ret); return ret >> 9; } /* Global control device: */ int bcachectl_open(void) { - int fd = open("/dev/bcache-ctl", O_RDWR); - if (fd < 0) - die("Can't open bcache device: %s", strerror(errno)); - - return fd; + return xopen("/dev/bcache-ctl", O_RDWR); } /* Filesystem handles (ioctl, sysfs dir): */ @@ -162,47 +139,29 @@ struct bcache_handle bcache_fs_open(const char *path) if (!uuid_parse(path, tmp)) { /* It's a UUID, look it up in sysfs: */ - - char *sysfs = alloca(strlen(SYSFS_BASE) + strlen(path) + 1); - sprintf(sysfs, "%s%s", SYSFS_BASE, path); - - ret.sysfs_fd = 
open(sysfs, O_RDONLY); - if (!ret.sysfs_fd) - die("Unable to open %s\n", path); + char *sysfs = mprintf("%s%s", SYSFS_BASE, path); + ret.sysfs_fd = xopen(sysfs, O_RDONLY); char *minor = read_file_str(ret.sysfs_fd, "minor"); - char *ctl = alloca(20 + strlen(minor)); + char *ctl = mprintf("/dev/bcache%s-ctl", minor); + ret.ioctl_fd = xopen(ctl, O_RDWR); - sprintf(ctl, "/dev/bcache%s-ctl", minor); + free(sysfs); free(minor); - - ret.ioctl_fd = open(ctl, O_RDWR); - if (ret.ioctl_fd < 0) - die("Error opening control device: %s\n", - strerror(errno)); + free(ctl); } else { /* It's a path: */ - - ret.ioctl_fd = open(path, O_RDONLY); - if (ret.ioctl_fd < 0) - die("Error opening %s: %s\n", - path, strerror(errno)); + ret.ioctl_fd = xopen(path, O_RDONLY); struct bch_ioctl_query_uuid uuid; - if (ioctl(ret.ioctl_fd, BCH_IOCTL_QUERY_UUID, &uuid)) - die("ioctl error (not a bcache fs?): %s\n", - strerror(errno)); + xioctl(ret.ioctl_fd, BCH_IOCTL_QUERY_UUID, &uuid); char uuid_str[40]; uuid_unparse(uuid.uuid.b, uuid_str); - char *sysfs = alloca(strlen(SYSFS_BASE) + strlen(uuid_str) + 1); - sprintf(sysfs, "%s%s", SYSFS_BASE, uuid_str); - - ret.sysfs_fd = open(sysfs, O_RDONLY); - if (ret.sysfs_fd < 0) - die("Unable to open sysfs dir %s: %s\n", - sysfs, strerror(errno)); + char *sysfs = mprintf("%s%s", SYSFS_BASE, uuid_str); + ret.sysfs_fd = xopen(sysfs, O_RDONLY); + free(sysfs); } return ret; @@ -225,3 +184,89 @@ bool ask_yn(void) free(buf); return ret; } + +static int range_cmp(const void *_l, const void *_r) +{ + const struct range *l = _l, *r = _r; + + if (l->start < r->start) + return -1; + if (l->start > r->start) + return 1; + return 0; +} + +void ranges_sort_merge(ranges *r) +{ + struct range *t, *i; + ranges tmp = { NULL }; + + sort(&darray_item(*r, 0), darray_size(*r), + sizeof(darray_item(*r, 0)), range_cmp, NULL); + + /* Merge contiguous ranges: */ + darray_foreach(i, *r) { + t = tmp.size ? &tmp.item[tmp.size - 1] : NULL; + + if (t && t->end >= i->start) + t->end = max(t->end, i->end); + else + darray_append(tmp, *i); + } + + darray_free(*r); + *r = tmp; +} + +void ranges_roundup(ranges *r, unsigned block_size) +{ + struct range *i; + + darray_foreach(i, *r) { + i->start = round_down(i->start, block_size); + i->end = round_up(i->end, block_size); + } +} + +void ranges_rounddown(ranges *r, unsigned block_size) +{ + struct range *i; + + darray_foreach(i, *r) { + i->start = round_up(i->start, block_size); + i->end = round_down(i->end, block_size); + i->end = max(i->end, i->start); + } +} + +struct fiemap_extent fiemap_iter_next(struct fiemap_iter *iter) +{ + struct fiemap_extent e; + + BUG_ON(iter->idx > iter->f.fm_mapped_extents); + + if (iter->idx == iter->f.fm_mapped_extents) { + xioctl(iter->fd, FS_IOC_FIEMAP, &iter->f); + + if (!iter->f.fm_mapped_extents) + return (struct fiemap_extent) { .fe_length = 0 }; + + iter->idx = 0; + } + + e = iter->f.fm_extents[iter->idx++]; + BUG_ON(!e.fe_length); + + iter->f.fm_start = e.fe_logical + e.fe_length; + + return e; +} + +const char *strcmp_prefix(const char *a, const char *a_prefix) +{ + while (*a_prefix && *a == *a_prefix) { + a++; + a_prefix++; + } + return *a_prefix ? 
NULL : a; +} diff --git a/tools-util.h b/tools-util.h index 09f00efe..1aac56ae 100644 --- a/tools-util.h +++ b/tools-util.h @@ -5,21 +5,31 @@ #include <stdbool.h> #include <stdio.h> #include <stdlib.h> +#include <sys/stat.h> #include <sys/types.h> #include <unistd.h> +#include <linux/bug.h> #include <linux/byteorder.h> #include <linux/kernel.h> #include <linux/log2.h> #include <linux/string.h> #include <linux/types.h> +#include "ccan/darray/darray.h" -#define die(arg, ...) \ -do { \ - fprintf(stderr, arg "\n", ##__VA_ARGS__); \ - exit(EXIT_FAILURE); \ +#define die(arg, ...) \ +do { \ + fprintf(stderr, arg "\n", ##__VA_ARGS__); \ + exit(EXIT_FAILURE); \ } while (0) +#define mprintf(...) \ +({ \ + char *_str; \ + asprintf(&_str, __VA_ARGS__); \ + _str; \ +}) + static inline void *xcalloc(size_t count, size_t size) { void *p = calloc(count, size); @@ -57,6 +67,38 @@ static inline void xpwrite(int fd, const void *buf, size_t count, off_t offset) die("write error (ret %zi err %s)", r, strerror(errno)); } +#define xopenat(_dirfd, _path, ...) \ +({ \ + int _fd = openat((_dirfd), (_path), __VA_ARGS__); \ + if (_fd < 0) \ + die("Error opening %s: %s", (_path), strerror(errno)); \ + _fd; \ +}) + +#define xopen(...) xopenat(AT_FDCWD, __VA_ARGS__) + +static inline struct stat xfstatat(int dirfd, const char *path, int flags) +{ + struct stat stat; + if (fstatat(dirfd, path, &stat, flags)) + die("stat error: %s", strerror(errno)); + return stat; +} + +static inline struct stat xfstat(int fd) +{ + struct stat stat; + if (fstat(fd, &stat)) + die("stat error: %s", strerror(errno)); + return stat; +} + +#define xioctl(_fd, _nr, ...) \ +do { \ + if (ioctl((_fd), (_nr), ##__VA_ARGS__)) \ + die(#_nr " ioctl error: %s", strerror(errno)); \ +} while (0) + enum units { BYTES, SECTORS, @@ -91,4 +133,74 @@ struct bcache_handle bcache_fs_open(const char *); bool ask_yn(void); +struct range { + u64 start; + u64 end; +}; + +typedef darray(struct range) ranges; + +static inline void range_add(ranges *data, u64 offset, u64 size) +{ + darray_append(*data, (struct range) { + .start = offset, + .end = offset + size + }); +} + +void ranges_sort_merge(ranges *); +void ranges_roundup(ranges *, unsigned); +void ranges_rounddown(ranges *, unsigned); + +struct hole_iter { + ranges r; + size_t idx; + u64 end; +}; + +static inline struct range hole_iter_next(struct hole_iter *iter) +{ + struct range r = { + .start = iter->idx ? iter->r.item[iter->idx - 1].end : 0, + .end = iter->idx < iter->r.size + ? iter->r.item[iter->idx].start : iter->end, + }; + + BUG_ON(r.start > r.end); + + iter->idx++; + return r; +} + +#define for_each_hole(_iter, _ranges, _end, _i) \ + for (_iter = (struct hole_iter) { .r = _ranges, .end = _end }; \ + (_iter.idx <= _iter.r.size && \ + (_i = hole_iter_next(&_iter), true));) + +#include <linux/fiemap.h> + +struct fiemap_iter { + struct fiemap f; + struct fiemap_extent fe[1024]; + unsigned idx; + int fd; +}; + +static inline void fiemap_iter_init(struct fiemap_iter *iter, int fd) +{ + memset(iter, 0, sizeof(*iter)); + + iter->f.fm_extent_count = ARRAY_SIZE(iter->fe); + iter->f.fm_length = FIEMAP_MAX_OFFSET; + iter->fd = fd; +} + +struct fiemap_extent fiemap_iter_next(struct fiemap_iter *); + +#define fiemap_for_each(fd, iter, extent) \ + for (fiemap_iter_init(&iter, fd); \ + (extent = fiemap_iter_next(&iter)).fe_length;) + +const char *strcmp_prefix(const char *, const char *); + #endif /* _TOOLS_UTIL_H */
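
Illustrative sketch (not part of the patch above): the ranges/hole-iterator and fiemap helpers that this patch adds to tools-util.h compose roughly as follows. The function name print_holes, the variable names, and the standalone framing are invented for the example; the helpers it calls (xopen, get_size, darray_init/darray_free, range_add, ranges_sort_merge, fiemap_for_each, for_each_hole) are the ones defined in the patch.

/*
 * Hypothetical example, not added by this patch: list a file's
 * unallocated ranges using the new tools-util.h helpers.
 */
#include <fcntl.h>
#include <stdio.h>

#include "tools-util.h"

static void print_holes(const char *path)
{
	int fd = xopen(path, O_RDONLY);
	u64 size = get_size(path, fd);

	ranges extents;
	darray_init(extents);

	struct fiemap_iter fiter;
	struct fiemap_extent e;

	/* Collect the file's mapped extents (requires FIEMAP support): */
	fiemap_for_each(fd, fiter, e)
		range_add(&extents, e.fe_logical, e.fe_length);

	/* Sort by start offset, merge contiguous/overlapping extents: */
	ranges_sort_merge(&extents);

	/* Everything between merged extents, and past the last one, is a hole: */
	struct hole_iter hiter;
	struct range hole;

	for_each_hole(hiter, extents, size, hole)
		if (hole.end > hole.start)
			printf("hole: %llu-%llu\n",
			       (unsigned long long) hole.start,
			       (unsigned long long) hole.end);

	darray_free(extents);
	close(fd);
}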