diff --git a/.bcachefs_revision b/.bcachefs_revision
index f35d38b8..274236e3 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-496cbe9474173ec41bf221dc8ab1f5d70a128c3b
+d5e561b3cc023dd247d2b3d08b680709ec21b477
diff --git a/include/linux/bio.h b/include/linux/bio.h
index dcaffedb..1bd21ee3 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -257,6 +257,8 @@ static inline void bio_flush_dcache_pages(struct bio *bi)
 extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
 			       struct bio *src, struct bvec_iter *src_iter);
 extern void bio_copy_data(struct bio *dst, struct bio *src);
+
+void bio_free_pages(struct bio *bio);
 extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);
 
 void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter);
diff --git a/include/linux/completion.h b/include/linux/completion.h
index 1808d21e..d11a8dd0 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -37,4 +37,6 @@ static inline void reinit_completion(struct completion *x)
 void complete(struct completion *);
 void wait_for_completion(struct completion *);
 
+#define wait_for_completion_interruptible(x)	(wait_for_completion(x), 0)
+
 #endif
diff --git a/include/linux/random.h b/include/linux/random.h
index 243c0602..90fe5749 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -29,6 +29,11 @@ static inline void get_random_bytes(void *buf, int nbytes)
 	BUG_ON(getrandom(buf, nbytes, 0) != nbytes);
 }
 
+static inline void prandom_bytes(void *buf, int nbytes)
+{
+	return get_random_bytes(buf, nbytes);
+}
+
 #define get_random_type(type)				\
 static inline type get_random_##type(void)		\
 {							\
diff --git a/libbcachefs.c b/libbcachefs.c
index 3632e30d..238cca99 100644
--- a/libbcachefs.c
+++ b/libbcachefs.c
@@ -459,6 +459,11 @@ static void bch2_sb_print_quota(struct bch_sb *sb, struct bch_sb_field *f,
 {
 }
 
+static void bch2_sb_print_disk_groups(struct bch_sb *sb, struct bch_sb_field *f,
+				      enum units units)
+{
+}
+
 typedef void (*sb_field_print_fn)(struct bch_sb *, struct bch_sb_field *, enum units);
 
 struct bch_sb_field_ops {
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 298f26d4..cb9906c5 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -496,6 +496,8 @@ struct bch_fs {
 	struct bch_replicas_cpu __rcu *replicas_gc;
 	struct mutex		replicas_gc_lock;
 
+	struct bch_disk_groups_cpu __rcu *disk_groups;
+
 	struct bch_opts		opts;
 
 	/* Updated by bch2_sb_update():*/
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index cb9e450b..85f728f2 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -782,7 +782,8 @@ struct bch_sb_field {
 	x(members,	1)	\
 	x(crypt,	2)	\
 	x(replicas,	3)	\
-	x(quota,	4)
+	x(quota,	4)	\
+	x(disk_groups,	5)
 
 enum bch_sb_field_type {
 #define x(f, nr)	BCH_SB_FIELD_##f = nr,
@@ -815,8 +816,9 @@ LE64_BITMASK(BCH_MEMBER_STATE,		struct bch_member, flags[0],  0,  4)
 LE64_BITMASK(BCH_MEMBER_TIER,		struct bch_member, flags[0],  4,  8)
 /* 8-10 unused, was HAS_(META)DATA */
 LE64_BITMASK(BCH_MEMBER_REPLACEMENT,	struct bch_member, flags[0], 10, 14)
-LE64_BITMASK(BCH_MEMBER_DISCARD,	struct bch_member, flags[0], 14, 15);
-LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED,	struct bch_member, flags[0], 15, 20);
+LE64_BITMASK(BCH_MEMBER_DISCARD,	struct bch_member, flags[0], 14, 15)
+LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED,	struct bch_member, flags[0], 15, 20)
+LE64_BITMASK(BCH_MEMBER_GROUP,		struct bch_member, flags[0], 20, 28)
 
 #if 0
 LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS,	struct bch_member, flags[1], 0, 20);
@@ -933,6 +935,23 @@ struct bch_sb_field_quota {
 	struct bch_sb_quota_type q[QTYP_NR];
 } __attribute__((packed, aligned(8)));
 
+/* BCH_SB_FIELD_disk_groups: */
+
+#define BCH_SB_LABEL_SIZE		32
+
+struct bch_disk_group {
+	__u8			label[BCH_SB_LABEL_SIZE];
+	__le64			flags[2];
+};
+
+LE64_BITMASK(BCH_GROUP_DELETED,		struct bch_disk_group, flags[0], 0, 1)
+LE64_BITMASK(BCH_GROUP_DATA_ALLOWED,	struct bch_disk_group, flags[0], 1, 6)
+
+struct bch_sb_field_disk_groups {
+	struct bch_sb_field	field;
+	struct bch_disk_group	entries[0];
+};
+
 /* Superblock: */
 
 /*
@@ -947,7 +966,6 @@ struct bch_sb_field_quota {
 #define BCH_SB_VERSION_MAX		9
 
 #define BCH_SB_SECTOR			8
-#define BCH_SB_LABEL_SIZE		32
 #define BCH_SB_MEMBERS_MAX		64 /* XXX kill */
 
 struct bch_sb_layout {
@@ -1069,20 +1087,6 @@ enum bch_sb_features {
 
 #define BCH_REPLICAS_MAX		4U
 
-#if 0
-#define BCH_ERROR_ACTIONS()					\
-	x(BCH_ON_ERROR_CONTINUE,	0, "continue")		\
-	x(BCH_ON_ERROR_RO,		1, "remount-ro")	\
-	x(BCH_ON_ERROR_PANIC,		2, "panic")		\
-	x(BCH_NR_ERROR_ACTIONS,		3, NULL)
-
-enum bch_error_actions {
-#define x(_opt, _nr, _str)	_opt = _nr,
-	BCH_ERROR_ACTIONS()
-#undef x
-};
-#endif
-
 enum bch_error_actions {
 	BCH_ON_ERROR_CONTINUE		= 0,
 	BCH_ON_ERROR_RO			= 1,
diff --git a/libbcachefs/bcachefs_ioctl.h b/libbcachefs/bcachefs_ioctl.h
index aa2a2050..6578847b 100644
--- a/libbcachefs/bcachefs_ioctl.h
+++ b/libbcachefs/bcachefs_ioctl.h
@@ -46,7 +46,6 @@ struct bch_ioctl_incremental {
 #define BCH_IOCTL_DISK_ONLINE	_IOW(0xbc, 6,  struct bch_ioctl_disk)
 #define BCH_IOCTL_DISK_OFFLINE	_IOW(0xbc, 7,  struct bch_ioctl_disk)
 #define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8,  struct bch_ioctl_disk_set_state)
-#define BCH_IOCTL_DISK_EVACUATE	_IOW(0xbc, 9, struct bch_ioctl_disk)
 #define BCH_IOCTL_DATA		_IOW(0xbc, 10, struct bch_ioctl_data)
 #define BCH_IOCTL_USAGE		_IOWR(0xbc, 11, struct bch_ioctl_usage)
 #define BCH_IOCTL_READ_SUPER	_IOW(0xbc, 12, struct bch_ioctl_read_super)
@@ -75,30 +74,37 @@ struct bch_ioctl_disk_set_state {
 	__u64			dev;
 };
 
-#define BCH_REWRITE_INCREASE_REPLICAS	(1 << 0)
-#define BCH_REWRITE_DECREASE_REPLICAS	(1 << 1)
-
-#define BCH_REWRITE_RECOMPRESS		(1 << 0)
-#define BCH_REWRITE_DECREASE_REPLICAS	(1 << 1)
-
 enum bch_data_ops {
-	BCH_DATA_SCRUB,
-};
-
-struct bch_data_op {
-	__u8			type;
+	BCH_DATA_OP_SCRUB	= 0,
+	BCH_DATA_OP_REREPLICATE	= 1,
+	BCH_DATA_OP_MIGRATE	= 2,
+	BCH_DATA_OP_NR		= 3,
 };
 
 struct bch_ioctl_data {
+	__u32			op;
 	__u32			flags;
-	__u32			pad;
 
-	__u64			start_inode;
-	__u64			start_offset;
+	struct bpos		start;
+	struct bpos		end;
 
-	__u64			end_inode;
-	__u64			end_offset;
-};
+	union {
+	struct {
+		__u32		dev;
+		__u32		pad;
+	}			migrate;
+	};
+} __attribute__((packed, aligned(8)));
+
+struct bch_ioctl_data_progress {
+	__u8			data_type;
+	__u8			btree_id;
+	__u8			pad[2];
+	struct bpos		pos;
+
+	__u64			sectors_done;
+	__u64			sectors_total;
+} __attribute__((packed, aligned(8)));
 
 struct bch_ioctl_dev_usage {
 	__u8			state;
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index 1498832b..5ff90cc0 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -1,20 +1,25 @@
 #ifndef NO_BCACHEFS_CHARDEV
 
 #include "bcachefs.h"
+#include "alloc.h"
 #include "bcachefs_ioctl.h"
 #include "buckets.h"
 #include "chardev.h"
+#include "move.h"
 #include "super.h"
 #include "super-io.h"
 
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/major.h>
+#include <linux/anon_inodes.h>
 #include <linux/cdev.h>
 #include <linux/device.h>
+#include <linux/file.h>
+#include <linux/fs.h>
 #include <linux/ioctl.h>
-#include <linux/uaccess.h>
+#include <linux/kthread.h>
+#include <linux/major.h>
+#include <linux/sched/task.h>
 #include <linux/slab.h>
+#include <linux/uaccess.h>
 
 /* returns with ref on ca->ref */
 static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
@@ -266,23 +271,108 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c,
 	return ret;
 }
 
-static long bch2_ioctl_disk_evacuate(struct bch_fs *c,
-				     struct bch_ioctl_disk arg)
-{
-	struct bch_dev *ca;
-	int ret;
+struct bch_data_ctx {
+	struct bch_fs			*c;
+	struct bch_ioctl_data		arg;
+	struct bch_move_stats		stats;
 
-	if ((arg.flags & ~BCH_BY_INDEX) ||
-	    arg.pad)
+	int				ret;
+
+	struct task_struct		*thread;
+};
+
+static int bch2_data_thread(void *arg)
+{
+	struct bch_data_ctx *ctx = arg;
+
+	ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
+
+	ctx->stats.data_type = U8_MAX;
+	return 0;
+}
+
+static int bch2_data_job_release(struct inode *inode, struct file *file)
+{
+	struct bch_data_ctx *ctx = file->private_data;
+
+	kthread_stop(ctx->thread);
+	put_task_struct(ctx->thread);
+	kfree(ctx);
+	return 0;
+}
+
+static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
+				  size_t len, loff_t *ppos)
+{
+	struct bch_data_ctx *ctx = file->private_data;
+	struct bch_fs *c = ctx->c;
+	struct bch_ioctl_data_progress p = {
+		.data_type	= ctx->stats.data_type,
+		.btree_id	= ctx->stats.iter.btree_id,
+		.pos		= ctx->stats.iter.pos,
+		.sectors_done	= atomic64_read(&ctx->stats.sectors_seen),
+		.sectors_total	= bch2_fs_sectors_used(c, bch2_fs_usage_read(c)),
+	};
+
+	if (len != sizeof(p))
 		return -EINVAL;
 
-	ca = bch2_device_lookup(c, arg.dev, arg.flags);
-	if (IS_ERR(ca))
-		return PTR_ERR(ca);
+	return copy_to_user(buf, &p, sizeof(p)) ?: sizeof(p);
+}
 
-	ret = bch2_dev_evacuate(c, ca);
+static const struct file_operations bcachefs_data_ops = {
+	.release	= bch2_data_job_release,
+	.read		= bch2_data_job_read,
+	.llseek		= no_llseek,
+};
 
-	percpu_ref_put(&ca->ref);
+static long bch2_ioctl_data(struct bch_fs *c,
+			    struct bch_ioctl_data arg)
+{
+	struct bch_data_ctx *ctx = NULL;
+	struct file *file = NULL;
+	unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK;
+	int ret, fd = -1;
+
+	if (arg.op >= BCH_DATA_OP_NR || arg.flags)
+		return -EINVAL;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->c = c;
+	ctx->arg = arg;
+
+	ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]");
+	if (IS_ERR(ctx->thread)) {
+		ret = PTR_ERR(ctx->thread);
+		goto err;
+	}
+
+	ret = get_unused_fd_flags(flags);
+	if (ret < 0)
+		goto err;
+	fd = ret;
+
+	file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags);
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		goto err;
+	}
+
+	fd_install(fd, file);
+
+	get_task_struct(ctx->thread);
+	wake_up_process(ctx->thread);
+
+	return fd;
+err:
+	if (fd >= 0)
+		put_unused_fd(fd);
+	if (!IS_ERR_OR_NULL(ctx->thread))
+		kthread_stop(ctx->thread);
+	kfree(ctx);
 
 	return ret;
 }
@@ -474,8 +564,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
 		BCH_IOCTL(disk_offline, struct bch_ioctl_disk);
 	case BCH_IOCTL_DISK_SET_STATE:
 		BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state);
-	case BCH_IOCTL_DISK_EVACUATE:
-		BCH_IOCTL(disk_evacuate, struct bch_ioctl_disk);
+	case BCH_IOCTL_DATA:
+		BCH_IOCTL(data, struct bch_ioctl_data);
 	case BCH_IOCTL_READ_SUPER:
 		BCH_IOCTL(read_super, struct bch_ioctl_read_super);
 	case BCH_IOCTL_DISK_GET_IDX:
@@ -488,9 +578,12 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
 	}
 }
 
+static DEFINE_IDR(bch_chardev_minor);
+
 static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v)
 {
-	struct bch_fs *c = filp->private_data;
+	unsigned minor = iminor(file_inode(filp));
+	struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL;
 	void __user *arg = (void __user *) v;
 
 	return c
@@ -507,7 +600,6 @@ static const struct file_operations bch_chardev_fops = {
 static int bch_chardev_major;
 static struct class *bch_chardev_class;
 static struct device *bch_chardev;
-static DEFINE_IDR(bch_chardev_minor);
 
 void bch2_fs_chardev_exit(struct bch_fs *c)
 {
@@ -524,7 +616,7 @@ int bch2_fs_chardev_init(struct bch_fs *c)
 		return c->minor;
 
 	c->chardev = device_create(bch_chardev_class, NULL,
-				   MKDEV(bch_chardev_major, c->minor), NULL,
+				   MKDEV(bch_chardev_major, c->minor), c,
 				   "bcachefs%u-ctl", c->minor);
 	if (IS_ERR(c->chardev))
 		return PTR_ERR(c->chardev);
@@ -536,7 +628,7 @@ void bch2_chardev_exit(void)
 {
 	if (!IS_ERR_OR_NULL(bch_chardev_class))
 		device_destroy(bch_chardev_class,
-			       MKDEV(bch_chardev_major, 255));
+			       MKDEV(bch_chardev_major, U8_MAX));
 	if (!IS_ERR_OR_NULL(bch_chardev_class))
 		class_destroy(bch_chardev_class);
 	if (bch_chardev_major > 0)
@@ -554,7 +646,7 @@ int __init bch2_chardev_init(void)
 		return PTR_ERR(bch_chardev_class);
 
 	bch_chardev = device_create(bch_chardev_class, NULL,
-				    MKDEV(bch_chardev_major, 255),
+				    MKDEV(bch_chardev_major, U8_MAX),
 				    NULL, "bcachefs-ctl");
 	if (IS_ERR(bch_chardev))
 		return PTR_ERR(bch_chardev);
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index c2469167..f5dccfad 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -139,6 +139,34 @@ bool bch2_extent_drop_device(struct bkey_s_extent e, unsigned dev)
 	return dropped;
 }
 
+const struct bch_extent_ptr *
+bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group)
+{
+	const struct bch_extent_ptr *ptr;
+
+	extent_for_each_ptr(e, ptr) {
+		struct bch_dev *ca = c->devs[ptr->dev];
+
+		if (ca->mi.group &&
+		    ca->mi.group == group)
+			return ptr;
+	}
+
+	return NULL;
+}
+
+const struct bch_extent_ptr *
+bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target)
+{
+	const struct bch_extent_ptr *ptr;
+
+	extent_for_each_ptr(e, ptr)
+		if (dev_in_target(c->devs[ptr->dev], target))
+			return ptr;
+
+	return NULL;
+}
+
 unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent e)
 {
 	const struct bch_extent_ptr *ptr;
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index eda34381..e8f54f2e 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -43,6 +43,10 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent);
 const struct bch_extent_ptr *
 bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
 bool bch2_extent_drop_device(struct bkey_s_extent, unsigned);
+const struct bch_extent_ptr *
+bch2_extent_has_group(struct bch_fs *, struct bkey_s_c_extent, unsigned);
+const struct bch_extent_ptr *
+bch2_extent_has_target(struct bch_fs *, struct bkey_s_c_extent, unsigned);
 
 unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
 unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 1870534d..a1e45625 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -54,6 +54,13 @@ static inline u64 journal_last_seq(struct journal *j)
 	return j->pin.front;
 }
 
+static inline u64 journal_cur_seq(struct journal *j)
+{
+	BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
+
+	return j->pin.back - 1;
+}
+
 static inline u64 journal_pin_seq(struct journal *j,
 				  struct journal_entry_pin_list *pin_list)
 {
@@ -264,7 +271,9 @@ int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b)
 	if (!seq)
 		return 0;
 
-	journal_seq = atomic64_read(&j->seq);
+	spin_lock(&j->lock);
+	journal_seq = journal_cur_seq(j);
+	spin_unlock(&j->lock);
 
 	/* Interier updates aren't journalled: */
 	BUG_ON(b->level);
@@ -989,6 +998,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 	u64 cur_seq, end_seq, seq;
 	unsigned iter, keys = 0, entries = 0;
 	size_t nr;
+	bool degraded = false;
 	int ret = 0;
 
 	closure_init_stack(&jlist.cl);
@@ -996,12 +1006,19 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 	jlist.head = list;
 	jlist.ret = 0;
 
-	for_each_readable_member(ca, c, iter) {
-		percpu_ref_get(&ca->io_ref);
-		closure_call(&ca->journal.read,
-			     bch2_journal_read_device,
-			     system_unbound_wq,
-			     &jlist.cl);
+	for_each_member_device(ca, c, iter) {
+		if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL)))
+			continue;
+
+		if ((ca->mi.state == BCH_MEMBER_STATE_RW ||
+		     ca->mi.state == BCH_MEMBER_STATE_RO) &&
+		    percpu_ref_tryget(&ca->io_ref))
+			closure_call(&ca->journal.read,
+				     bch2_journal_read_device,
+				     system_unbound_wq,
+				     &jlist.cl);
+		else
+			degraded = true;
 	}
 
 	closure_sync(&jlist.cl);
@@ -1022,11 +1039,17 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 		if (ret)
 			goto fsck_err;
 
-		if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
-		    fsck_err_on(!bch2_sb_has_replicas(c, BCH_DATA_JOURNAL,
-						      i->devs), c,
-				"superblock not marked as containing replicas (type %u)",
-				BCH_DATA_JOURNAL)) {
+		/*
+		 * If we're mounting in degraded mode - if we didn't read all
+		 * the devices - this is wrong:
+		 */
+
+		if (!degraded &&
+		    (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
+		     fsck_err_on(!bch2_sb_has_replicas(c, BCH_DATA_JOURNAL,
+						       i->devs), c,
+				 "superblock not marked as containing replicas (type %u)",
+				 BCH_DATA_JOURNAL))) {
 			ret = bch2_check_mark_super(c, BCH_DATA_JOURNAL,
 						    i->devs);
 			if (ret)
@@ -1111,7 +1134,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 	}
 
 	bch_info(c, "journal read done, %i keys in %i entries, seq %llu",
-		 keys, entries, (u64) atomic64_read(&j->seq));
+		 keys, entries, journal_cur_seq(j));
 fsck_err:
 	return ret;
 }
@@ -1174,9 +1197,6 @@ static void journal_pin_new_entry(struct journal *j, int count)
 	atomic64_inc(&j->seq);
 	p = fifo_push_ref(&j->pin);
 
-	EBUG_ON(journal_seq_pin(j, atomic64_read(&j->seq)) !=
-		&fifo_peek_back(&j->pin));
-
 	INIT_LIST_HEAD(&p->list);
 	INIT_LIST_HEAD(&p->flushed);
 	atomic_set(&p->count, count);
@@ -1190,7 +1210,7 @@ static void bch2_journal_buf_init(struct journal *j)
 	memset(buf->has_inode, 0, sizeof(buf->has_inode));
 	memset(buf->data, 0, sizeof(*buf->data));
 
-	buf->data->seq	= cpu_to_le64(atomic64_read(&j->seq));
+	buf->data->seq	= cpu_to_le64(journal_cur_seq(j));
 	buf->data->u64s	= 0;
 }
@@ -1472,7 +1492,7 @@ void bch2_journal_start(struct bch_fs *c)
 
 	set_bit(JOURNAL_STARTED, &j->flags);
 
-	while (atomic64_read(&j->seq) < new_seq)
+	while (journal_cur_seq(j) < new_seq)
 		journal_pin_new_entry(j, 0);
 
 	/*
@@ -2015,9 +2035,11 @@ static void journal_reclaim_work(struct work_struct *work)
 		mutex_unlock(&j->reclaim_lock);
 
 	/* Also flush if the pin fifo is more than half full */
+	spin_lock(&j->lock);
 	seq_to_flush = max_t(s64, seq_to_flush,
-			     (s64) atomic64_read(&j->seq) -
+			     (s64) journal_cur_seq(j) -
 			     (j->pin.size >> 1));
+	spin_unlock(&j->lock);
 
 	/*
 	 * If it's been longer than j->reclaim_delay_ms since we last flushed,
@@ -2110,7 +2132,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
 		ja->sectors_free = ca->mi.bucket_size - sectors;
 		ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
-		ja->bucket_seq[ja->cur_idx] = atomic64_read(&j->seq);
+		ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
 
 		extent_ptr_append(bkey_i_to_extent(&j->key),
 			(struct bch_extent_ptr) {
@@ -2436,9 +2458,9 @@ u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
 
 	spin_lock(&j->lock);
 	if (test_bit(h, journal_cur_buf(j)->has_inode))
-		seq = atomic64_read(&j->seq);
+		seq = journal_cur_seq(j);
 	else if (test_bit(h, journal_prev_buf(j)->has_inode))
-		seq = atomic64_read(&j->seq) - 1;
+		seq = journal_cur_seq(j) - 1;
 	spin_unlock(&j->lock);
 
 	return seq;
@@ -2547,7 +2569,7 @@ u64 bch2_journal_last_unwritten_seq(struct journal *j)
 	u64 seq;
 
 	spin_lock(&j->lock);
-	seq = atomic64_read(&j->seq);
+	seq = journal_cur_seq(j);
 	if (j->reservations.prev_buf_unwritten)
 		seq--;
 	spin_unlock(&j->lock);
@@ -2560,9 +2582,9 @@ int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *pare
 	int ret;
 
 	spin_lock(&j->lock);
-	BUG_ON(seq > atomic64_read(&j->seq));
+	BUG_ON(seq > journal_cur_seq(j));
 
-	if (seq < atomic64_read(&j->seq) ||
+	if (seq < journal_cur_seq(j) ||
 	    journal_entry_is_open(j)) {
 		spin_unlock(&j->lock);
 		return 1;
@@ -2583,17 +2605,17 @@ void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent
 {
 	spin_lock(&j->lock);
 
-	BUG_ON(seq > atomic64_read(&j->seq));
+	BUG_ON(seq > journal_cur_seq(j));
 
 	if (bch2_journal_error(j)) {
 		spin_unlock(&j->lock);
 		return;
 	}
 
-	if (seq == atomic64_read(&j->seq)) {
+	if (seq == journal_cur_seq(j)) {
 		if (!closure_wait(&journal_cur_buf(j)->wait, parent))
 			BUG();
-	} else if (seq + 1 == atomic64_read(&j->seq) &&
+	} else if (seq + 1 == journal_cur_seq(j) &&
 		   j->reservations.prev_buf_unwritten) {
 		if (!closure_wait(&journal_prev_buf(j)->wait, parent))
 			BUG();
@@ -2615,14 +2637,14 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *pa
 
 	spin_lock(&j->lock);
 
-	BUG_ON(seq > atomic64_read(&j->seq));
+	BUG_ON(seq > journal_cur_seq(j));
 
 	if (bch2_journal_error(j)) {
 		spin_unlock(&j->lock);
 		return;
 	}
 
-	if (seq == atomic64_read(&j->seq)) {
+	if (seq == journal_cur_seq(j)) {
 		bool set_need_write = false;
 
 		buf = journal_cur_buf(j);
@@ -2643,7 +2665,7 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *pa
 		case JOURNAL_ENTRY_CLOSED:
 			/*
 			 * Journal entry hasn't been opened yet, but caller
-			 * claims it has something (seq == j->seq):
+			 * claims it has something
 			 */
 			BUG();
 		case JOURNAL_ENTRY_INUSE:
@@ -2652,7 +2674,7 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *pa
 			return;
 		}
 	} else if (parent &&
-		   seq + 1 == atomic64_read(&j->seq) &&
+		   seq + 1 == journal_cur_seq(j) &&
 		   j->reservations.prev_buf_unwritten) {
 		buf = journal_prev_buf(j);
 
@@ -2676,9 +2698,9 @@ static int journal_seq_flushed(struct journal *j, u64 seq)
 	int ret = 1;
 
 	spin_lock(&j->lock);
-	BUG_ON(seq > atomic64_read(&j->seq));
+	BUG_ON(seq > journal_cur_seq(j));
 
-	if (seq == atomic64_read(&j->seq)) {
+	if (seq == journal_cur_seq(j)) {
 		bool set_need_write = false;
 
 		ret = 0;
@@ -2697,7 +2719,7 @@ static int journal_seq_flushed(struct journal *j, u64 seq)
 		case JOURNAL_ENTRY_CLOSED:
 			/*
 			 * Journal entry hasn't been opened yet, but caller
-			 * claims it has something (seq == j->seq):
+			 * claims it has something
 			 */
 			BUG();
 		case JOURNAL_ENTRY_INUSE:
@@ -2705,7 +2727,7 @@ static int journal_seq_flushed(struct journal *j, u64 seq)
 		case JOURNAL_UNLOCKED:
 			return 0;
 		}
-	} else if (seq + 1 == atomic64_read(&j->seq) &&
+	} else if (seq + 1 == journal_cur_seq(j) &&
 		   j->reservations.prev_buf_unwritten) {
 		ret = bch2_journal_error(j);
 	}
@@ -2762,7 +2784,7 @@ void bch2_journal_flush_async(struct journal *j, struct closure *parent)
 	u64 seq, journal_seq;
 
 	spin_lock(&j->lock);
-	journal_seq = atomic64_read(&j->seq);
+	journal_seq = journal_cur_seq(j);
 
 	if (journal_entry_is_open(j)) {
 		seq = journal_seq;
@@ -2782,7 +2804,7 @@ int bch2_journal_flush(struct journal *j)
 	u64 seq, journal_seq;
 
 	spin_lock(&j->lock);
-	journal_seq = atomic64_read(&j->seq);
+	journal_seq = journal_cur_seq(j);
 
 	if (journal_entry_is_open(j)) {
 		seq = journal_seq;
@@ -2797,7 +2819,7 @@ int bch2_journal_flush(struct journal *j)
 	return bch2_journal_flush_seq(j, seq);
 }
 
-int bch2_journal_flush_device(struct journal *j, unsigned dev_idx)
+int bch2_journal_flush_device(struct journal *j, int dev_idx)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct journal_entry_pin_list *p;
@@ -2807,7 +2829,9 @@ int bch2_journal_flush_device(struct journal *j, int dev_idx)
 
 	spin_lock(&j->lock);
 	fifo_for_each_entry_ptr(p, &j->pin, iter)
-		if (bch2_dev_list_has_dev(p->devs, dev_idx))
+		if (dev_idx >= 0
+		    ? bch2_dev_list_has_dev(p->devs, dev_idx)
+		    : p->devs.nr < c->opts.metadata_replicas)
 			seq = iter;
 	spin_unlock(&j->lock);
 
@@ -2821,7 +2845,7 @@ int bch2_journal_flush_device(struct journal *j, int dev_idx)
 	seq = 0;
 
 	spin_lock(&j->lock);
-	while (!ret && seq < atomic64_read(&j->seq)) {
+	while (!ret && seq < j->pin.back) {
 		seq = max(seq, journal_last_seq(j));
 		devs = journal_seq_pin(j, seq)->devs;
 		seq++;
@@ -2982,7 +3006,7 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
 			 "dirty:\t\t\t%i\n"
 			 "replay done:\t\t%i\n",
 			 fifo_used(&j->pin),
-			 (u64) atomic64_read(&j->seq),
+			 journal_cur_seq(j),
 			 journal_last_seq(j),
 			 j->last_seq_ondisk,
 			 journal_state_count(*s, s->idx),
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index 52d74eec..46ae8f0d 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -368,7 +368,7 @@ void bch2_journal_meta_async(struct journal *, struct closure *);
 int bch2_journal_flush_seq(struct journal *, u64);
 int bch2_journal_flush(struct journal *);
 int bch2_journal_meta(struct journal *);
-int bch2_journal_flush_device(struct journal *, unsigned);
+int bch2_journal_flush_device(struct journal *, int);
 
 void bch2_journal_halt(struct journal *);
 
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index 01c88960..9c2920cf 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -40,12 +40,15 @@ static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca,
 	 * operations
 	 */
 	do {
+		memset(&stats, 0, sizeof(stats));
+
 		ret = bch2_move_data(c, NULL,
 				     SECTORS_IN_FLIGHT_PER_DEVICE,
 				     NULL,
 				     writepoint_hashed((unsigned long) current),
 				     0,
 				     ca->dev_idx,
+				     POS_MIN, POS_MAX,
 				     migrate_pred, ca,
 				     &stats);
 		if (ret) {
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index a67e7a45..e5a46ba6 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -87,7 +87,6 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
 				      m->move_dev)))
 			bch2_extent_drop_ptr(extent_i_to_s(insert), ptr);
-
 		extent_for_each_ptr_crc(extent_i_to_s(new), ptr, crc) {
 			if (bch2_extent_has_device(extent_i_to_s_c(insert),
 						   ptr->dev)) {
 				/*
@@ -194,6 +193,8 @@ static void move_free(struct closure *cl)
 	struct bio_vec *bv;
 	int i;
 
+	bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
+
 	bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
 		if (bv->bv_page)
 			__free_page(bv->bv_page);
@@ -243,20 +244,21 @@ static int bch2_move_extent(struct bch_fs *c,
 			    int btree_insert_flags,
 			    int move_device,
 			    struct bch_io_opts opts,
-			    struct bkey_s_c k)
+			    struct bkey_s_c_extent e)
 {
 	struct extent_pick_ptr pick;
 	struct moving_io *io;
 	const struct bch_extent_ptr *ptr;
 	struct bch_extent_crc_unpacked crc;
-	unsigned sectors = k.k->size, pages;
+	unsigned sectors = e.k->size, pages, nr_good;
+	int ret = -ENOMEM;
 
-	bch2_extent_pick_ptr(c, k, NULL, &pick);
+	bch2_extent_pick_ptr(c, e.s_c, NULL, &pick);
 	if (IS_ERR_OR_NULL(pick.ca))
 		return pick.ca ? PTR_ERR(pick.ca) : 0;
 
 	/* write path might have to decompress data: */
-	extent_for_each_ptr_crc(bkey_s_c_to_extent(k), ptr, crc)
+	extent_for_each_ptr_crc(e, ptr, crc)
 		sectors = max_t(unsigned, sectors, crc.uncompressed_size);
 
 	pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
@@ -266,7 +268,7 @@ static int bch2_move_extent(struct bch_fs *c,
 		goto err;
 
 	io->write.ctxt		= ctxt;
-	io->sectors		= k.k->size;
+	io->sectors		= e.k->size;
 
 	bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages);
 	bio_set_prio(&io->write.op.wbio.bio,
@@ -274,10 +276,8 @@ static int bch2_move_extent(struct bch_fs *c,
 	io->write.op.wbio.bio.bi_iter.bi_size = sectors << 9;
 	bch2_bio_map(&io->write.op.wbio.bio, NULL);
 
-	if (bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL)) {
-		kfree(io);
-		goto err;
-	}
+	if (bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL))
+		goto err_free;
 
 	io->rbio.opts = opts;
 	bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
@@ -285,7 +285,7 @@ static int bch2_move_extent(struct bch_fs *c,
 	io->rbio.bio.bi_iter.bi_size	= sectors << 9;
 
 	bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0);
-	io->rbio.bio.bi_iter.bi_sector	= bkey_start_offset(k.k);
+	io->rbio.bio.bi_iter.bi_sector	= bkey_start_offset(e.k);
 	io->rbio.bio.bi_end_io		= move_read_endio;
 
 	io->write.btree_insert_flags = btree_insert_flags;
@@ -298,10 +298,22 @@ static int bch2_move_extent(struct bch_fs *c,
 	io->write.op.devs	= devs;
 	io->write.op.write_point = wp;
 
-	atomic64_inc(&ctxt->stats->keys_moved);
-	atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
+	if (move_device < 0 &&
+	    ((nr_good = bch2_extent_nr_good_ptrs(c, e)) <
+	     c->opts.data_replicas)) {
+		io->write.op.nr_replicas = c->opts.data_replicas - nr_good;
 
-	trace_move_extent(k.k);
+		ret = bch2_disk_reservation_get(c, &io->write.op.res,
+						e.k->size,
+						io->write.op.nr_replicas, 0);
+		if (ret)
+			goto err_free_pages;
+	}
+
+	atomic64_inc(&ctxt->stats->keys_moved);
+	atomic64_add(e.k->size, &ctxt->stats->sectors_moved);
+
+	trace_move_extent(e.k);
 
 	atomic_add(io->sectors, &ctxt->sectors_in_flight);
 	list_add_tail(&io->list, &ctxt->reads);
@@ -311,12 +323,16 @@ static int bch2_move_extent(struct bch_fs *c,
 	 * ctxt when doing wakeup
 	 */
 	closure_get(&ctxt->cl);
-	bch2_read_extent(c, &io->rbio, bkey_s_c_to_extent(k),
-			 &pick, BCH_READ_NODECODE);
+	bch2_read_extent(c, &io->rbio, e, &pick, BCH_READ_NODECODE);
 
 	return 0;
+err_free_pages:
+	bio_free_pages(&io->write.op.wbio.bio);
+err_free:
+	kfree(io);
 err:
-	trace_move_alloc_fail(k.k);
-	return -ENOMEM;
+	percpu_ref_put(&pick.ca->io_ref);
+	trace_move_alloc_fail(e.k);
+	return ret;
 }
 
 static void do_pending_writes(struct moving_context *ctxt)
@@ -355,6 +371,8 @@ int bch2_move_data(struct bch_fs *c,
 		   struct write_point_specifier wp,
 		   int btree_insert_flags,
 		   int move_device,
+		   struct bpos start,
+		   struct bpos end,
 		   move_pred_fn pred, void *arg,
 		   struct bch_move_stats *stats)
 {
@@ -363,14 +381,16 @@ int bch2_move_data(struct bch_fs *c,
 	struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
 	BKEY_PADDED(k) tmp;
 	struct bkey_s_c k;
+	struct bkey_s_c_extent e;
 	u64 cur_inum = U64_MAX;
 	int ret = 0;
 
-	memset(stats, 0, sizeof(*stats));
 	closure_init_stack(&ctxt.cl);
 	INIT_LIST_HEAD(&ctxt.reads);
 	init_waitqueue_head(&ctxt.wait);
-	bch2_btree_iter_init(&stats->iter, c, BTREE_ID_EXTENTS, POS_MIN,
+
+	stats->data_type = BCH_DATA_USER;
+	bch2_btree_iter_init(&stats->iter, c, BTREE_ID_EXTENTS, start,
 			     BTREE_ITER_PREFETCH);
 
 	if (rate)
@@ -396,10 +416,14 @@ peek:
 		ret = btree_iter_err(k);
 		if (ret)
 			break;
+		if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
+			break;
 
 		if (!bkey_extent_is_data(k.k))
 			goto next_nondata;
 
+		e = bkey_s_c_to_extent(k);
+
 		if (cur_inum != k.k->p.inode) {
 			struct bch_inode_unpacked inode;
 
@@ -413,7 +437,7 @@ peek:
 			goto peek;
 		}
 
-		if (!pred(arg, bkey_s_c_to_extent(k)))
+		if (!pred(arg, e))
 			goto next;
 
 		/* unlock before doing IO: */
@@ -423,7 +447,8 @@ peek:
 
 		if (bch2_move_extent(c, &ctxt, devs, wp,
 				     btree_insert_flags,
-				     move_device, opts, k)) {
+				     move_device, opts,
+				     bkey_s_c_to_extent(k))) {
 			/* memory allocation failure, wait for some IO to finish */
 			bch2_move_ctxt_wait_for_io(&ctxt);
 			continue;
@@ -453,3 +478,157 @@ next_nondata:
 
 	return ret;
 }
+
+static int bch2_gc_data_replicas(struct bch_fs *c)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	mutex_lock(&c->replicas_gc_lock);
+	bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
+
+	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
+			   BTREE_ITER_PREFETCH, k) {
+		ret = bch2_check_mark_super(c, BCH_DATA_USER, bch2_bkey_devs(k));
+		if (ret)
+			break;
+	}
+	ret = bch2_btree_iter_unlock(&iter) ?: ret;
+
+	bch2_replicas_gc_end(c, ret);
+	mutex_unlock(&c->replicas_gc_lock);
+
+	return ret;
+}
+
+static int bch2_gc_btree_replicas(struct bch_fs *c)
+{
+	struct btree_iter iter;
+	struct btree *b;
+	unsigned id;
+	int ret = 0;
+
+	mutex_lock(&c->replicas_gc_lock);
+	bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
+
+	for (id = 0; id < BTREE_ID_NR; id++) {
+		for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
+			ret = bch2_check_mark_super(c, BCH_DATA_BTREE,
+					bch2_bkey_devs(bkey_i_to_s_c(&b->key)));
+
+			bch2_btree_iter_cond_resched(&iter);
+		}
+
+		ret = bch2_btree_iter_unlock(&iter) ?: ret;
+	}
+
+	bch2_replicas_gc_end(c, ret);
+	mutex_unlock(&c->replicas_gc_lock);
+
+	return ret;
+}
+
+static int bch2_move_btree(struct bch_fs *c,
+			   move_pred_fn pred,
+			   void *arg,
+			   struct bch_move_stats *stats)
+{
+	struct btree *b;
+	unsigned id;
+	int ret = 0;
+
+	stats->data_type = BCH_DATA_BTREE;
+
+	for (id = 0; id < BTREE_ID_NR; id++) {
+		for_each_btree_node(&stats->iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
+			if (pred(arg, bkey_i_to_s_c_extent(&b->key)))
+				ret = bch2_btree_node_rewrite(c, &stats->iter,
+						b->data->keys.seq, 0) ?: ret;
+
+			bch2_btree_iter_cond_resched(&stats->iter);
+		}
+
+		ret = bch2_btree_iter_unlock(&stats->iter) ?: ret;
+	}
+
+	return ret;
+}
+
+#if 0
+static bool scrub_data_pred(void *arg, struct bkey_s_c_extent e)
+{
+}
+#endif
+
+static bool rereplicate_metadata_pred(void *arg, struct bkey_s_c_extent e)
+{
+	struct bch_fs *c = arg;
+	unsigned nr_good = bch2_extent_nr_good_ptrs(c, e);
+
+	return nr_good && nr_good < c->opts.metadata_replicas;
+}
+
+static bool rereplicate_data_pred(void *arg, struct bkey_s_c_extent e)
+{
+	struct bch_fs *c = arg;
+	unsigned nr_good = bch2_extent_nr_good_ptrs(c, e);
+
+	return nr_good && nr_good < c->opts.data_replicas;
+}
+
+static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
+{
+	struct bch_ioctl_data *op = arg;
+
+	return bch2_extent_has_device(e, op->migrate.dev);
+}
+
+int bch2_data_job(struct bch_fs *c,
+		  struct bch_move_stats *stats,
+		  struct bch_ioctl_data op)
+{
+	int ret = 0;
+
+	switch (op.op) {
+	case BCH_DATA_OP_REREPLICATE:
+		stats->data_type = BCH_DATA_JOURNAL;
+		ret = bch2_journal_flush_device(&c->journal, -1);
+
+		ret = bch2_move_btree(c, rereplicate_metadata_pred, c, stats) ?: ret;
+		ret = bch2_gc_btree_replicas(c) ?: ret;
+
+		ret = bch2_move_data(c, NULL,
+				     SECTORS_IN_FLIGHT_PER_DEVICE,
+				     NULL,
+				     writepoint_hashed((unsigned long) current),
+				     0, -1,
+				     op.start,
+				     op.end,
+				     rereplicate_data_pred, c, stats) ?: ret;
+		ret = bch2_gc_data_replicas(c) ?: ret;
+		break;
+	case BCH_DATA_OP_MIGRATE:
+		if (op.migrate.dev >= c->sb.nr_devices)
+			return -EINVAL;
+
+		stats->data_type = BCH_DATA_JOURNAL;
+		ret = bch2_journal_flush_device(&c->journal, op.migrate.dev);
+
+		ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret;
+		ret = bch2_gc_btree_replicas(c) ?: ret;
+
+		ret = bch2_move_data(c, NULL,
+				     SECTORS_IN_FLIGHT_PER_DEVICE,
+				     NULL,
+				     writepoint_hashed((unsigned long) current),
+				     0, -1,
+				     op.start,
+				     op.end,
+				     migrate_pred, &op, stats) ?: ret;
+		ret = bch2_gc_data_replicas(c) ?: ret;
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
diff --git a/libbcachefs/move.h b/libbcachefs/move.h
index 24d6ddfa..07aa5669 100644
--- a/libbcachefs/move.h
+++ b/libbcachefs/move.h
@@ -27,6 +27,7 @@ void bch2_migrate_write_init(struct migrate_write *, struct bch_read_bio *);
 typedef bool (*move_pred_fn)(void *, struct bkey_s_c_extent);
 
 struct bch_move_stats {
+	enum bch_data_type	data_type;
 	struct btree_iter	iter;
 
 	atomic64_t		keys_moved;
@@ -38,7 +39,12 @@ struct bch_move_stats {
 int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
 		   unsigned, struct bch_devs_mask *,
 		   struct write_point_specifier,
-		   int, int, move_pred_fn, void *,
+		   int, int, struct bpos, struct bpos,
+		   move_pred_fn, void *,
 		   struct bch_move_stats *);
 
+int bch2_data_job(struct bch_fs *,
+		  struct bch_move_stats *,
+		  struct bch_ioctl_data);
+
 #endif /* _BCACHEFS_MOVE_H */
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index d6f2968e..515d5001 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -106,6 +106,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
 	size_t b;
 	int ret;
 
+	memset(&move_stats, 0, sizeof(move_stats));
 	closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
 
 	/*
@@ -166,6 +167,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
 			writepoint_ptr(&ca->copygc_write_point),
 			BTREE_INSERT_USE_RESERVE,
 			ca->dev_idx,
+			POS_MIN, POS_MAX,
 			copygc_pred, ca,
 			&move_stats);
 
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index 1f266ba3..f333b8fa 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -13,6 +13,7 @@
 static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
 static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
 					    struct bch_replicas_cpu *);
+static int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
 
 /* superblock fields (optional/variable size sections: */
 
@@ -43,6 +44,7 @@ static const struct bch_sb_field_ops bch2_sb_field_ops[] = {
 
 static const char *bch2_sb_field_validate(struct bch_sb *sb,
 					  struct bch_sb_field *f)
+
 {
 	unsigned type = le32_to_cpu(f->type);
 
@@ -297,7 +299,7 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
 	if (!sb->nr_devices ||
 	    sb->nr_devices <= sb->dev_idx ||
 	    sb->nr_devices > BCH_SB_MEMBERS_MAX)
-		return "Bad cache device number in set";
+		return "Bad number of member devices";
 
 	if (!BCH_SB_META_REPLICAS_WANT(sb) ||
 	    BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
@@ -458,6 +460,10 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
 	if (ret)
 		return ret;
 
+	ret = bch2_sb_disk_groups_to_cpu(c);
+	if (ret)
+		return ret;
+
 	bch2_sb_update(c);
 	return 0;
 }
@@ -1557,3 +1563,129 @@ static const char *bch2_sb_validate_quota(struct bch_sb *sb,
 
 	return NULL;
 }
+
+/* Disk groups: */
+
+#if 0
+static size_t trim_nulls(const char *str, size_t len)
+{
+	while (len && !str[len - 1])
+		--len;
+	return len;
+}
+#endif
+
+static const char *bch2_sb_validate_disk_groups(struct bch_sb *sb,
+						struct bch_sb_field *f)
+{
+	struct bch_sb_field_disk_groups *groups =
+		field_to_type(f, disk_groups);
+	struct bch_sb_field_members *mi;
+	struct bch_member *m;
+	struct bch_disk_group *g;
+	unsigned nr_groups;
+
+	mi		= bch2_sb_get_members(sb);
+	groups		= bch2_sb_get_disk_groups(sb);
+	nr_groups	= disk_groups_nr(groups);
+
+	for (m = mi->members;
+	     m < mi->members + sb->nr_devices;
+	     m++) {
+		if (!BCH_MEMBER_GROUP(m))
+			continue;
+
+		if (BCH_MEMBER_GROUP(m) >= nr_groups)
+			return "disk has invalid group";
+
+		g = &groups->entries[BCH_MEMBER_GROUP(m)];
+		if (BCH_GROUP_DELETED(g))
+			return "disk has invalid group";
+	}
+#if 0
+	if (!groups)
+		return NULL;
+
+	char **labels;
+	labels = kcalloc(nr_groups, sizeof(char *), GFP_KERNEL);
+	if (!labels)
+		return "cannot allocate memory";
+
+	for (g = groups->groups;
+	     g < groups->groups + nr_groups;
+	     g++) {
+
+	}
+#endif
+	return NULL;
+}
+
+static int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
+{
+	struct bch_sb_field_members *mi;
+	struct bch_sb_field_disk_groups *groups;
+	struct bch_disk_groups_cpu *cpu_g, *old_g;
+	unsigned i, nr_groups;
+
+	lockdep_assert_held(&c->sb_lock);
+
+	mi		= bch2_sb_get_members(c->disk_sb);
+	groups		= bch2_sb_get_disk_groups(c->disk_sb);
+	nr_groups	= disk_groups_nr(groups);
+
+	if (!groups)
+		return 0;
+
+	cpu_g = kzalloc(sizeof(*cpu_g) +
+			sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL);
+	if (!cpu_g)
+		return -ENOMEM;
+
+	cpu_g->nr = nr_groups;
+
+	for (i = 0; i < nr_groups; i++) {
+		struct bch_disk_group *src	= &groups->entries[i];
+		struct bch_disk_group_cpu *dst	= &cpu_g->entries[i];
+
+		dst->deleted = BCH_GROUP_DELETED(src);
+	}
+
+	for (i = 0; i < c->disk_sb->nr_devices; i++) {
+		struct bch_member *m = mi->members + i;
+		struct bch_disk_group_cpu *dst =
+			&cpu_g->entries[BCH_MEMBER_GROUP(m)];
+
+		if (!bch2_member_exists(m))
+			continue;
+
+		__set_bit(i, dst->devs.d);
+	}
+
+	old_g = c->disk_groups;
+	rcu_assign_pointer(c->disk_groups, cpu_g);
+	if (old_g)
+		kfree_rcu(old_g, rcu);
+
+	return 0;
+}
+
+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
+{
+	struct target t = target_decode(target);
+
+	switch (t.type) {
+	case TARGET_DEV:
+		BUG_ON(t.dev >= c->sb.nr_devices && !c->devs[t.dev]);
+		return &c->devs[t.dev]->self;
+	case TARGET_GROUP: {
+		struct bch_disk_groups_cpu *g =
+			rcu_dereference(c->disk_groups);
+
+		/* XXX: what to do here? */
+		BUG_ON(t.group >= g->nr || g->entries[t.group].deleted);
+		return &g->entries[t.group].devs;
+	}
+	default:
+		BUG();
+	}
+}
diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h
index 59a8b816..eb85410c 100644
--- a/libbcachefs/super-io.h
+++ b/libbcachefs/super-io.h
@@ -127,6 +127,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
 		.nbuckets	= le64_to_cpu(mi->nbuckets),
 		.first_bucket	= le16_to_cpu(mi->first_bucket),
 		.bucket_size	= le16_to_cpu(mi->bucket_size),
+		.group		= BCH_MEMBER_GROUP(mi),
 		.state		= BCH_MEMBER_STATE(mi),
 		.tier		= BCH_MEMBER_TIER(mi),
 		.replacement	= BCH_MEMBER_REPLACEMENT(mi),
@@ -177,4 +178,65 @@ replicas_entry_next(struct bch_replicas_entry *i)
 	     (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
 	     (_i) = replicas_entry_next(_i))
 
+/* disk groups: */
+
+static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
+{
+	return groups
+		? (vstruct_end(&groups->field) -
+		   (void *) &groups->entries[0]) / sizeof(struct bch_disk_group)
+		: 0;
+}
+
+struct target {
+	enum {
+		TARGET_NULL,
+		TARGET_DEV,
+		TARGET_GROUP,
+	}			type;
+	union {
+		unsigned	dev;
+		unsigned	group;
+	};
+};
+
+static inline u16 dev_to_target(unsigned dev)
+{
+	return 1 + dev;
+}
+
+static inline u16 group_to_target(unsigned group)
+{
+	return 1 + U8_MAX + group;
+}
+
+static inline struct target target_decode(unsigned target)
+{
+	if (!target)
+		return (struct target) { .type = TARGET_NULL };
+
+	--target;
+	if (target <= U8_MAX)
+		return (struct target) { .type = TARGET_DEV, .dev = target };
+
+	target -= U8_MAX;
+	return (struct target) { .type = TARGET_GROUP, .group = target };
+}
+
+static inline bool dev_in_target(struct bch_dev *ca, unsigned target)
+{
+	struct target t = target_decode(target);
+
+	switch (t.type) {
+	case TARGET_DEV:
+		return ca->dev_idx == t.dev;
+	case TARGET_GROUP:
+		return ca->mi.group && ca->mi.group == t.group;
+	default:
+		BUG();
+	}
+}
+
+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
+
 #endif /* _BCACHEFS_SUPER_IO_H */
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 8c7a147a..f836c199 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -426,6 +426,7 @@ static void bch2_fs_free(struct bch_fs *c)
 	mempool_exit(&c->fill_iter);
 	percpu_ref_exit(&c->writes);
 	kfree(rcu_dereference_protected(c->replicas, 1));
+	kfree(rcu_dereference_protected(c->disk_groups, 1));
 
 	if (c->copygc_wq)
 		destroy_workqueue(c->copygc_wq);
@@ -1169,6 +1170,12 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
 
 	BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
 
+	if (get_capacity(sb->bdev->bd_disk) <
+	    ca->mi.bucket_size * ca->mi.nbuckets) {
+		bch_err(c, "device too small");
+		return -EINVAL;
+	}
+
 	ret = bch2_dev_journal_init(ca, sb->sb);
 	if (ret)
 		return ret;
@@ -1495,10 +1502,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 	mutex_lock(&c->state_lock);
 	mutex_lock(&c->sb_lock);
 
-	/*
-	 * Preserve the old cache member information (esp. tier)
-	 * before we start bashing the disk stuff.
-	 */
+	/* Grab member info for new disk: */
 	dev_mi = bch2_sb_get_members(sb.sb);
 	saved_mi = dev_mi->members[sb.sb->dev_idx];
 	saved_mi.last_mount = cpu_to_le64(ktime_get_seconds());
@@ -1646,47 +1650,6 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
 	return 0;
 }
 
-int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
-{
-	unsigned data;
-	int ret = 0;
-
-	mutex_lock(&c->state_lock);
-
-	if (ca->mi.state == BCH_MEMBER_STATE_RW &&
-	    bch2_dev_is_online(ca)) {
-		bch_err(ca, "Cannot migrate data off RW device");
-		ret = -EINVAL;
-		goto err;
-	}
-
-	ret = bch2_dev_data_migrate(c, ca, 0);
-	if (ret) {
-		bch_err(ca, "Error migrating data: %i", ret);
-		goto err;
-	}
-
-	ret = bch2_journal_flush_device(&c->journal, ca->dev_idx);
-	if (ret) {
-		bch_err(ca, "Migrate failed: error %i flushing journal", ret);
-		goto err;
-	}
-
-	data = bch2_dev_has_data(c, ca);
-	if (data) {
-		char buf[100];
-
-		bch2_scnprint_flag_list(buf, sizeof(buf),
-					bch2_data_types, data);
-		bch_err(ca, "Migrate failed, still has data (%s)", buf);
-		ret = -EINVAL;
-		goto err;
-	}
-err:
-	mutex_unlock(&c->state_lock);
-	return ret;
-}
-
 int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 {
 	struct bch_member *mi;
diff --git a/libbcachefs/super.h b/libbcachefs/super.h
index a35ee3db..d0a38cf6 100644
--- a/libbcachefs/super.h
+++ b/libbcachefs/super.h
@@ -30,7 +30,7 @@ static inline bool bch2_dev_is_online(struct bch_dev *ca)
 	return ca->disk_sb.bdev != NULL;
 }
 
-static inline unsigned dev_mask_nr(struct bch_devs_mask *devs)
+static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
 {
 	return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
 }
@@ -68,7 +68,7 @@ static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
 }
 
 static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
-					      struct bch_devs_mask *mask)
+					      const struct bch_devs_mask *mask)
 {
 	struct bch_dev *ca = NULL;
 
@@ -188,7 +188,6 @@ int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int);
 int bch2_dev_add(struct bch_fs *, const char *);
 int bch2_dev_online(struct bch_fs *, const char *);
 int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int);
-int bch2_dev_evacuate(struct bch_fs *, struct bch_dev *);
 int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
 
 bool bch2_fs_emergency_read_only(struct bch_fs *);
diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h
index 35c8bebf..966da4af 100644
--- a/libbcachefs/super_types.h
+++ b/libbcachefs/super_types.h
@@ -22,6 +22,7 @@ struct bch_member_cpu {
 	u64			nbuckets;	/* device size */
 	u16			first_bucket;	/* index of first bucket used */
 	u16			bucket_size;	/* sectors */
+	u16			group;
 	u8			state;
 	u8			tier;
 	u8			replacement;
@@ -42,4 +43,15 @@ struct bch_replicas_cpu {
 	struct bch_replicas_cpu_entry entries[];
 };
 
+struct bch_disk_group_cpu {
+	struct bch_devs_mask	devs;
+	bool			deleted;
+};
+
+struct bch_disk_groups_cpu {
+	struct rcu_head		rcu;
+	unsigned		nr;
+	struct bch_disk_group_cpu entries[];
+};
+
 #endif /* _BCACHEFS_SUPER_TYPES_H */
diff --git a/libbcachefs/tier.c b/libbcachefs/tier.c
index 6a581097..c4625c80 100644
--- a/libbcachefs/tier.c
+++ b/libbcachefs/tier.c
@@ -44,6 +44,7 @@ static int bch2_tiering_thread(void *arg)
 	unsigned long last;
 	unsigned i, nr_devices;
 
+	memset(&move_stats, 0, sizeof(move_stats));
 	set_freezable();
 
 	while (!kthread_should_stop()) {
@@ -91,6 +92,7 @@ static int bch2_tiering_thread(void *arg)
 				      writepoint_ptr(&tier->wp),
 				      0,
 				      -1,
+				      POS_MIN, POS_MAX,
 				      tiering_pred, tier,
 				      &move_stats);
 	}
diff --git a/linux/bio.c b/linux/bio.c
index d8256989..79f50dc2 100644
--- a/linux/bio.c
+++ b/linux/bio.c
@@ -163,6 +163,15 @@ struct bio *bio_split(struct bio *bio, int sectors,
 	return split;
 }
 
+void bio_free_pages(struct bio *bio)
+{
+	struct bio_vec *bvec;
+	int i;
+
+	bio_for_each_segment_all(bvec, bio, i)
+		__free_page(bvec->bv_page);
+}
+
 int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
 {
 	int i;
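
Note on the new interface (illustration only, not part of the patch): BCH_IOCTL_DATA no longer runs the job synchronously - bch2_ioctl_data() spawns a kernel thread and returns a file descriptor, and userspace polls progress by read()ing fixed-size struct bch_ioctl_data_progress records until data_type reads back as U8_MAX, which bch2_data_thread() sets on completion. A minimal userspace sketch, assuming the ioctl definitions above are visible to userspace (the header path and the availability of POS_MIN/POS_MAX outside the kernel are assumptions; data_job_rereplicate() is a hypothetical helper):

	/* Hypothetical example: start a rereplicate job on an already-open
	 * /dev/bcachefsN-ctl descriptor and poll its progress. */
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include "libbcachefs/bcachefs_ioctl.h"	/* assumed header path */

	static int data_job_rereplicate(int ctl_fd)
	{
		struct bch_ioctl_data cmd = {
			.op	= BCH_DATA_OP_REREPLICATE,
			.start	= POS_MIN,	/* whole keyspace */
			.end	= POS_MAX,
		};
		struct bch_ioctl_data_progress p;
		int fd = ioctl(ctl_fd, BCH_IOCTL_DATA, &cmd);	/* job fd */

		if (fd < 0)
			return -1;

		/* bch2_data_job_read() only accepts len == sizeof(p);
		 * data_type == U8_MAX (255) signals the job finished: */
		while (read(fd, &p, sizeof(p)) == sizeof(p) &&
		       p.data_type != 255) {
			fprintf(stderr, "\r%llu/%llu sectors",
				(unsigned long long) p.sectors_done,
				(unsigned long long) p.sectors_total);
			sleep(1);
		}

		/* closing the fd stops the kthread and frees the job ctx */
		return close(fd);
	}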