From 12cf9df98b99478d1ebeacab815c8dfdb5bb772c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Nov 2025 17:31:00 -0500 Subject: [PATCH] Update bcachefs sources to 1381a92a5d23 bcachefs: sysfs trigger_check_inconsistent_replicas --- .bcachefs_revision | 2 +- libbcachefs/alloc/accounting.c | 25 +- libbcachefs/alloc/accounting.h | 2 +- libbcachefs/alloc/accounting_format.h | 19 +- libbcachefs/alloc/buckets.c | 7 +- libbcachefs/alloc/disk_groups.c | 17 +- libbcachefs/alloc/replicas.c | 412 +++--- libbcachefs/alloc/replicas.h | 19 +- libbcachefs/alloc/replicas_types.h | 12 +- libbcachefs/bcachefs.h | 3 +- libbcachefs/bcachefs_format.h | 14 +- libbcachefs/btree/check.c | 6 +- libbcachefs/btree/interior.c | 105 +- libbcachefs/btree/interior.h | 1 + libbcachefs/btree/types.h | 2 +- libbcachefs/data/checksum.h | 11 + libbcachefs/data/ec.c | 67 +- libbcachefs/data/ec.h | 9 + libbcachefs/data/extents.c | 87 +- libbcachefs/data/extents.h | 1 + libbcachefs/data/extents_format.h | 18 +- libbcachefs/data/migrate.c | 23 +- libbcachefs/data/move.c | 7 +- libbcachefs/data/rebalance.c | 1650 ++++++++++++++++++++----- libbcachefs/data/rebalance.h | 80 +- libbcachefs/data/rebalance_format.h | 191 ++- libbcachefs/data/update.c | 28 +- libbcachefs/data/write.c | 23 +- libbcachefs/debug/sysfs.c | 55 + libbcachefs/fs/inode.h | 2 +- libbcachefs/init/dev.c | 27 +- libbcachefs/init/fs.c | 6 +- libbcachefs/journal/init.c | 29 +- libbcachefs/journal/journal.c | 5 +- libbcachefs/journal/read.c | 15 +- libbcachefs/journal/reclaim.c | 68 +- libbcachefs/journal/reclaim.h | 2 +- libbcachefs/journal/types.h | 1 + libbcachefs/journal/write.c | 39 +- libbcachefs/opts.c | 31 +- libbcachefs/opts.h | 2 + libbcachefs/sb/clean.c | 13 +- libbcachefs/sb/clean.h | 2 +- libbcachefs/sb/downgrade.c | 66 +- libbcachefs/sb/downgrade.h | 1 + libbcachefs/sb/errors_format.h | 14 +- libbcachefs/sb/io.c | 7 + libbcachefs/vfs/fs.c | 4 +- 48 files changed, 2494 insertions(+), 736 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index 5a168bd8..84402bdc 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -d13053c5782b5c9993cf9cfba52dd19c0732091b +1381a92a5d23df9d9bbf6cae1ecf0f3eb39f4b0b diff --git a/libbcachefs/alloc/accounting.c b/libbcachefs/alloc/accounting.c index a6fb4045..cc1d87a3 100644 --- a/libbcachefs/alloc/accounting.c +++ b/libbcachefs/alloc/accounting.c @@ -250,7 +250,9 @@ fsck_err: return ret; } -void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k) +void bch2_accounting_key_to_text(struct printbuf *out, + struct bch_fs *c, + struct disk_accounting_pos *k) { if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) { prt_printf(out, "unknown type %u", k->type); @@ -283,6 +285,17 @@ void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_po prt_str(out, "btree="); bch2_btree_id_to_text(out, k->btree.id); break; + case BCH_DISK_ACCOUNTING_rebalance_work_v2: + bch2_prt_rebalance_accounting_type(out, k->rebalance_work_v2.type); + break; + case BCH_DISK_ACCOUNTING_dev_leaving: { + struct bch_dev *ca = c ? 
bch2_dev_rcu_noerror(c, k->dev_leaving.dev) : NULL; + if (ca) + prt_printf(out, "%s ", ca->name); + else + prt_printf(out, "%u ", k->dev_leaving.dev); + break; + } } } @@ -292,7 +305,7 @@ void bch2_accounting_to_text(struct printbuf *out, struct bch_fs *c, struct bkey struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, k.k->p); - bch2_accounting_key_to_text(out, &acc_k); + bch2_accounting_key_to_text(out, c, &acc_k); for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++) prt_printf(out, " %lli", acc.v->d[i]); @@ -607,7 +620,7 @@ int bch2_gc_accounting_done(struct bch_fs *c) if (memcmp(dst_v, src_v, nr * sizeof(u64))) { printbuf_reset(&buf); prt_str(&buf, "accounting mismatch for "); - bch2_accounting_key_to_text(&buf, &acc_k); + bch2_accounting_key_to_text(&buf, c, &acc_k); prt_str(&buf, ":\n got"); for (unsigned j = 0; j < nr; j++) @@ -672,7 +685,7 @@ static int disk_accounting_invalid_dev(struct btree_trans *trans, unsigned dev) { CLASS(printbuf, buf)(); - bch2_accounting_key_to_text(&buf, acc); + bch2_accounting_key_to_text(&buf, trans->c, acc); int ret = 0; if (fsck_err(trans, accounting_to_invalid_device, @@ -719,7 +732,7 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, trans, accounting_replicas_not_marked, "accounting not marked in superblock replicas\n%s", (printbuf_reset(&buf), - bch2_accounting_key_to_text(&buf, acc), + bch2_accounting_key_to_text(&buf, c, acc), buf.buf))) try(bch2_mark_replicas(c, &r.e)); break; @@ -849,7 +862,7 @@ static int accounting_read_mem_fixups(struct btree_trans *trans) bch2_log_msg_start(c, &underflow_err); prt_printf(&underflow_err, "Accounting underflow for\n"); } - bch2_accounting_key_to_text(&underflow_err, &k); + bch2_accounting_key_to_text(&underflow_err, c, &k); for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++) prt_printf(&underflow_err, " %lli", v[j]); diff --git a/libbcachefs/alloc/accounting.h b/libbcachefs/alloc/accounting.h index 72aabf4f..317cc9fb 100644 --- a/libbcachefs/alloc/accounting.h +++ b/libbcachefs/alloc/accounting.h @@ -124,7 +124,7 @@ int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); -void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *); +void bch2_accounting_key_to_text(struct printbuf *, struct bch_fs *, struct disk_accounting_pos *); void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_accounting_swab(const struct bch_fs *, struct bkey_s); diff --git a/libbcachefs/alloc/accounting_format.h b/libbcachefs/alloc/accounting_format.h index 730a17ea..0bfa9f4b 100644 --- a/libbcachefs/alloc/accounting_format.h +++ b/libbcachefs/alloc/accounting_format.h @@ -110,7 +110,9 @@ static inline bool data_type_is_hidden(enum bch_data_type type) x(snapshot, 5, 1) \ x(btree, 6, 3) \ x(rebalance_work, 7, 1) \ - x(inum, 8, 3) + x(inum, 8, 3) \ + x(rebalance_work_v2, 9, 1) \ + x(dev_leaving, 10, 1) enum disk_accounting_type { #define x(f, nr, ...) 
BCH_DISK_ACCOUNTING_##f = nr, @@ -210,6 +212,19 @@ struct bch_acct_inum { struct bch_acct_rebalance_work { }; +struct bch_acct_rebalance_work_v2 { + __u8 type; +}; + +struct bch_acct_dev_leaving { + __u32 dev; +}; + +/* + * XXX: need per-device counters for "how much data are we going to move off of + * this device + */ + struct disk_accounting_pos { union { struct { @@ -224,6 +239,8 @@ struct disk_accounting_pos { struct bch_acct_btree btree; struct bch_acct_rebalance_work rebalance_work; struct bch_acct_inum inum; + struct bch_acct_rebalance_work_v2 rebalance_work_v2; + struct bch_acct_dev_leaving dev_leaving; } __packed; } __packed; struct bpos _pad; diff --git a/libbcachefs/alloc/buckets.c b/libbcachefs/alloc/buckets.c index af023a47..f4211b78 100644 --- a/libbcachefs/alloc/buckets.c +++ b/libbcachefs/alloc/buckets.c @@ -317,8 +317,7 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, if (do_update) { struct bkey_i *new = - errptr_try(bch2_trans_kmalloc(trans, bkey_bytes(k.k) + - sizeof(struct bch_extent_rebalance))); + errptr_try(bch2_trans_kmalloc(trans, BKEY_EXTENT_U64s_MAX * sizeof(u64))); bkey_reassemble(new, k); scoped_guard(rcu) @@ -386,7 +385,7 @@ found: struct bch_inode_opts opts; try(bch2_bkey_get_io_opts(trans, NULL, k, &opts)); - try(bch2_bkey_set_needs_rebalance(c, &opts, new, SET_NEEDS_REBALANCE_opt_change, 0)); + try(bch2_bkey_set_needs_rebalance(trans, NULL, &opts, new, SET_NEEDS_REBALANCE_opt_change, 0)); if (!(flags & BTREE_TRIGGER_is_root)) { CLASS(btree_node_iter, iter)(trans, btree, new->k.p, 0, level, @@ -889,7 +888,7 @@ int bch2_trigger_extent(struct btree_trans *trans, try(__trigger_extent(trans, btree, level, new.s_c, flags & ~BTREE_TRIGGER_overwrite)); - try(bch2_trigger_extent_rebalance(trans, old, new.s_c, flags)); + try(bch2_trigger_extent_rebalance(trans, btree, level, old, new, flags)); } return 0; diff --git a/libbcachefs/alloc/disk_groups.c b/libbcachefs/alloc/disk_groups.c index 91aac1a3..17a185bb 100644 --- a/libbcachefs/alloc/disk_groups.c +++ b/libbcachefs/alloc/disk_groups.c @@ -3,6 +3,8 @@ #include "alloc/disk_groups.h" +#include "data/rebalance.h" + #include "init/dev.h" #include "sb/members.h" @@ -469,9 +471,18 @@ int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) { - guard(mutex)(&c->sb_lock); - return __bch2_dev_group_set(c, ca, name) ?: - bch2_write_super(c); + struct rebalance_scan s = { .type = REBALANCE_SCAN_pending }; + + try(bch2_set_rebalance_needs_scan(c, s, false)); + + /* bch2_rebalance_wakeup_pending goes here */ + scoped_guard(mutex,&c->sb_lock) { + try(__bch2_dev_group_set(c, ca, name)); + try(bch2_write_super(c)); + } + + try(bch2_set_rebalance_needs_scan(c, s, true)); + return 0; } int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res, diff --git a/libbcachefs/alloc/replicas.c b/libbcachefs/alloc/replicas.c index f3b8378a..3de48fe9 100644 --- a/libbcachefs/alloc/replicas.c +++ b/libbcachefs/alloc/replicas.c @@ -16,25 +16,40 @@ DEFINE_CLASS(bch_replicas_cpu, struct bch_replicas_cpu, kfree(_T.entries), (struct bch_replicas_cpu) {}, void) -static inline struct bch_replicas_entry_v1 * +static inline struct bch_replicas_entry_cpu * cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) { return (void *) r->entries + r->entry_size * i; } +static inline unsigned __cpu_replicas_entry_bytes(unsigned v1_bytes) +{ + return offsetof(struct bch_replicas_entry_cpu, e) + v1_bytes; +} + +static inline unsigned 
cpu_replicas_entry_bytes(struct bch_replicas_entry_cpu *e) +{ + return __cpu_replicas_entry_bytes(replicas_entry_bytes(&e->e)); +} + #define for_each_cpu_replicas_entry(_r, _i) \ - for (struct bch_replicas_entry_v1 *_i = (_r)->entries; \ + for (struct bch_replicas_entry_cpu *_i = (_r)->entries; \ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size; \ _i = (void *) (_i) + (_r)->entry_size) static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, struct bch_replicas_cpu *); -/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */ -static int bch2_memcmp(const void *l, const void *r, const void *priv) +static int cpu_replicas_entry_cmp(const struct bch_replicas_entry_cpu *l, + const struct bch_replicas_entry_cpu *r, + size_t size) { - size_t size = (size_t) priv; - return memcmp(l, r, size); + return memcmp(&l->e, &r->e, size - offsetof(struct bch_replicas_entry_cpu, e)); +} + +static int cpu_replicas_entry_cmp_r(const void *l, const void *r, const void *priv) +{ + return cpu_replicas_entry_cmp(l, r, (size_t) priv); } /* Replicas tracking - in memory: */ @@ -60,7 +75,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e) static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) { eytzinger0_sort_r(r->entries, r->nr, r->entry_size, - bch2_memcmp, NULL, (void *)(size_t)r->entry_size); + cpu_replicas_entry_cmp_r, NULL, + (void *)(size_t)r->entry_size); } static void bch2_replicas_entry_v0_to_text(struct printbuf *out, @@ -85,6 +101,13 @@ void bch2_replicas_entry_to_text(struct printbuf *out, prt_printf(out, "]"); } +static void bch2_replicas_entry_cpu_to_text(struct printbuf *out, + struct bch_replicas_entry_cpu *e) +{ + prt_printf(out, "ref=%u ", atomic_read(&e->ref)); + bch2_replicas_entry_to_text(out, &e->e); +} + static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r, struct bch_sb *sb, struct printbuf *err) @@ -151,7 +174,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out, prt_printf(out, " "); first = false; - bch2_replicas_entry_to_text(out, i); + bch2_replicas_entry_cpu_to_text(out, i); } } @@ -232,6 +255,44 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e, bch2_replicas_entry_sort(e); } +/* @l is bch_replicas_entry_v1, @r is bch_replicas_entry_cpu */ +static int replicas_entry_search_cmp(const void *_l, const void *_r, const void *priv) +{ + const struct bch_replicas_entry_v1 *l = _l; + const struct bch_replicas_entry_cpu *r = _r; + size_t size = (size_t) priv; + + return memcmp(l, &r->e, size); +} + +static inline struct bch_replicas_entry_cpu * +replicas_entry_search(struct bch_replicas_cpu *r, + struct bch_replicas_entry_v1 *search) +{ + verify_replicas_entry(search); + + size_t entry_size = replicas_entry_bytes(search); + int idx = likely(__cpu_replicas_entry_bytes(entry_size) <= r->entry_size) + ? eytzinger0_find_r(r->entries, r->nr, r->entry_size, + replicas_entry_search_cmp, + (void *) entry_size, search) + : -1; + return idx >= 0 ? 
cpu_replicas_entry(r, idx) : NULL; +} + +bool bch2_replicas_marked_locked(struct bch_fs *c, + struct bch_replicas_entry_v1 *search) +{ + return !search->nr_devs || replicas_entry_search(&c->replicas, search); +} + +bool bch2_replicas_marked(struct bch_fs *c, + struct bch_replicas_entry_v1 *search) +{ + guard(percpu_read)(&c->mark_lock); + return bch2_replicas_marked_locked(c, search); +} + static struct bch_replicas_cpu cpu_replicas_add_entry(struct bch_fs *c, struct bch_replicas_cpu *old, @@ -240,9 +301,12 @@ cpu_replicas_add_entry(struct bch_fs *c, struct bch_replicas_cpu new = { .nr = old->nr + 1, .entry_size = max_t(unsigned, old->entry_size, - replicas_entry_bytes(new_entry)), + __cpu_replicas_entry_bytes(replicas_entry_bytes(new_entry))), }; + /* alignment */ + new.entry_size = round_up(new.entry_size, sizeof(atomic_t)); + new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL); if (!new.entries) return new; @@ -252,7 +316,7 @@ cpu_replicas_add_entry(struct bch_fs *c, cpu_replicas_entry(old, i), old->entry_size); - memcpy(cpu_replicas_entry(&new, old->nr), + memcpy(&cpu_replicas_entry(&new, old->nr)->e, new_entry, replicas_entry_bytes(new_entry)); @@ -260,152 +324,56 @@ cpu_replicas_add_entry(struct bch_fs *c, return new; } -static inline struct bch_replicas_entry_v1 * -replicas_entry_search(struct bch_replicas_cpu *r, - struct bch_replicas_entry_v1 *search) -{ - verify_replicas_entry(search); - - size_t entry_size = replicas_entry_bytes(search); - int idx = likely(entry_size <= r->entry_size) - ? eytzinger0_find_r(r->entries, r->nr, r->entry_size, - bch2_memcmp, (void *) entry_size, search) - : -1; - return idx >= 0 ? cpu_replicas_entry(r, idx) : NULL; -} - -bool bch2_replicas_marked_locked(struct bch_fs *c, - struct bch_replicas_entry_v1 *search) -{ - return !search->nr_devs || - (replicas_entry_search(&c->replicas, search) && - (likely((!c->replicas_gc.entries)) || - replicas_entry_search(&c->replicas_gc, search))); -} - -bool bch2_replicas_marked(struct bch_fs *c, - struct bch_replicas_entry_v1 *search) -{ - guard(percpu_read)(&c->mark_lock); - return bch2_replicas_marked_locked(c, search); -} - noinline static int bch2_mark_replicas_slowpath(struct bch_fs *c, - struct bch_replicas_entry_v1 *new_entry) + struct bch_replicas_entry_v1 *new_entry, + unsigned ref) { verify_replicas_entry(new_entry); - CLASS(bch_replicas_cpu, new_r)(); - CLASS(bch_replicas_cpu, new_gc)(); - guard(mutex)(&c->sb_lock); + bool write_sb = false; - if (c->replicas_gc.entries && - !replicas_entry_search(&c->replicas_gc, new_entry)) { - new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry); - if (!new_gc.entries) - return bch_err_throw(c, ENOMEM_cpu_replicas); - } - - if (!replicas_entry_search(&c->replicas, new_entry)) { - new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry); - if (!new_r.entries) - return bch_err_throw(c, ENOMEM_cpu_replicas); - - try(bch2_cpu_replicas_to_sb_replicas(c, &new_r)); - } - - if (!new_r.entries && - !new_gc.entries) - return 0; - - /* allocations done, now commit: */ - - if (new_r.entries) - bch2_write_super(c); - - /* don't update in memory replicas until changes are persistent */ scoped_guard(percpu_write, &c->mark_lock) { - if (new_r.entries) + if (!replicas_entry_search(&c->replicas, new_entry)) { + CLASS(bch_replicas_cpu, new_r)(); + + new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry); + if (!new_r.entries) + return bch_err_throw(c, ENOMEM_cpu_replicas); + + try(bch2_cpu_replicas_to_sb_replicas(c, &new_r)); + swap(c->replicas, new_r); - if 
(new_gc.entries) - swap(new_gc, c->replicas_gc); + write_sb = true; + } + + atomic_add(ref, &replicas_entry_search(&c->replicas, new_entry)->ref); } + /* After dropping mark_lock */ + if (write_sb) + bch2_write_super(c); + return 0; } int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r) { return likely(bch2_replicas_marked(c, r)) - ? 0 : bch2_mark_replicas_slowpath(c, r); + ? 0 : bch2_mark_replicas_slowpath(c, r, 0); } -/* - * Old replicas_gc mechanism: only used for journal replicas entries now, should - * die at some point: - */ - -int bch2_replicas_gc_end(struct bch_fs *c, int ret) +static void __replicas_entry_kill(struct bch_fs *c, struct bch_replicas_entry_cpu *e) { - lockdep_assert_held(&c->replicas_gc_lock); + struct bch_replicas_cpu *r = &c->replicas; - guard(mutex)(&c->sb_lock); - scoped_guard(percpu_write, &c->mark_lock) { - ret = ret ?: - bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc); - if (!ret) - swap(c->replicas, c->replicas_gc); + memcpy(e, cpu_replicas_entry(r, --r->nr), r->entry_size); + bch2_cpu_replicas_sort(r); - kfree(c->replicas_gc.entries); - c->replicas_gc.entries = NULL; - } - - if (!ret) - bch2_write_super(c); - - return ret; -} - -int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) -{ - lockdep_assert_held(&c->replicas_gc_lock); - - guard(mutex)(&c->sb_lock); - BUG_ON(c->replicas_gc.entries); - - c->replicas_gc.nr = 0; - c->replicas_gc.entry_size = 0; - - for_each_cpu_replicas_entry(&c->replicas, e) { - /* Preserve unknown data types */ - if (e->data_type >= BCH_DATA_NR || - !(BIT(e->data_type) & typemask)) { - c->replicas_gc.nr++; - c->replicas_gc.entry_size = - max_t(unsigned, c->replicas_gc.entry_size, - replicas_entry_bytes(e)); - } - } - - c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, - c->replicas_gc.entry_size, - GFP_KERNEL); - if (!c->replicas_gc.entries) { - bch_err(c, "error allocating c->replicas_gc"); - return bch_err_throw(c, ENOMEM_replicas_gc); - } - - unsigned i = 0; - for_each_cpu_replicas_entry(&c->replicas, e) - if (e->data_type >= BCH_DATA_NR || - !(BIT(e->data_type) & typemask)) - memcpy(cpu_replicas_entry(&c->replicas_gc, i++), - e, c->replicas_gc.entry_size); - - bch2_cpu_replicas_sort(&c->replicas_gc); - return 0; + int ret = bch2_cpu_replicas_to_sb_replicas(c, r); + if (WARN(ret, "bch2_cpu_replicas_to_sb_replicas() error: %s", bch2_err_str(ret))) + return; } void bch2_replicas_entry_kill(struct bch_fs *c, struct bch_replicas_entry_v1 *kill) @@ -413,18 +381,95 @@ void bch2_replicas_entry_kill(struct bch_fs *c, struct bch_replicas_entry_v1 *ki lockdep_assert_held(&c->mark_lock); lockdep_assert_held(&c->sb_lock); - struct bch_replicas_cpu *r = &c->replicas; + struct bch_replicas_entry_cpu *e = replicas_entry_search(&c->replicas, kill); - struct bch_replicas_entry_v1 *e = replicas_entry_search(&c->replicas, kill); if (WARN(!e, "replicas entry not found in sb")) return; - memcpy(e, cpu_replicas_entry(r, --r->nr), r->entry_size); + __replicas_entry_kill(c, e); - bch2_cpu_replicas_sort(r); + /* caller does write_super() after dropping mark_lock */ +} - int ret = bch2_cpu_replicas_to_sb_replicas(c, r); - WARN(ret, "bch2_cpu_replicas_to_sb_replicas() error: %s", bch2_err_str(ret)); +void bch2_replicas_entry_put_many(struct bch_fs *c, struct bch_replicas_entry_v1 *r, unsigned nr) +{ + if (!r->nr_devs) + return; + + BUG_ON(r->data_type != BCH_DATA_journal); + verify_replicas_entry(r); + + scoped_guard(percpu_read, &c->mark_lock) { + struct bch_replicas_entry_cpu *e = replicas_entry_search(&c->replicas, 
r); + + int v = atomic_sub_return(nr, &e->ref); + BUG_ON(v < 0); + if (v) + return; + } + + guard(mutex)(&c->sb_lock); + scoped_guard(percpu_write, &c->mark_lock) { + struct bch_replicas_entry_cpu *e = replicas_entry_search(&c->replicas, r); + if (e && !atomic_read(&e->ref)) + __replicas_entry_kill(c, e); + } + + bch2_write_super(c); +} + +static inline bool bch2_replicas_entry_get_inmem(struct bch_fs *c, struct bch_replicas_entry_v1 *r) +{ + guard(percpu_read)(&c->mark_lock); + struct bch_replicas_entry_cpu *e = replicas_entry_search(&c->replicas, r); + if (e) + atomic_inc(&e->ref); + return e != NULL; +} + +int bch2_replicas_entry_get(struct bch_fs *c, struct bch_replicas_entry_v1 *r) +{ + if (!r->nr_devs) + return 0; + + BUG_ON(r->data_type != BCH_DATA_journal); + verify_replicas_entry(r); + + return bch2_replicas_entry_get_inmem(c, r) + ? 0 + : bch2_mark_replicas_slowpath(c, r, 1); +} + +int bch2_replicas_gc_reffed(struct bch_fs *c) +{ + bool write_sb = false; + + guard(mutex)(&c->sb_lock); + + scoped_guard(percpu_write, &c->mark_lock) { + unsigned dst = 0; + for (unsigned i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry_cpu *e = + cpu_replicas_entry(&c->replicas, i); + + if (e->e.data_type != BCH_DATA_journal || + atomic_read(&e->ref)) + memcpy(cpu_replicas_entry(&c->replicas, dst++), + e, + c->replicas.entry_size); + } + + if (c->replicas.nr != dst) { + c->replicas.nr = dst; + bch2_cpu_replicas_sort(&c->replicas); + + try(bch2_cpu_replicas_to_sb_replicas(c, &c->replicas)); + } + } + + if (write_sb) + bch2_write_super(c); + return 0; } /* Replicas tracking - superblock: */ @@ -441,6 +486,9 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, nr++; } + entry_size = __cpu_replicas_entry_bytes(entry_size); + entry_size = round_up(entry_size, sizeof(atomic_t)); + cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); if (!cpu_r->entries) return -BCH_ERR_ENOMEM_cpu_replicas; @@ -448,10 +496,10 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, cpu_r->nr = nr; cpu_r->entry_size = entry_size; - for_each_replicas_entry(sb_r, e) { - struct bch_replicas_entry_v1 *dst = cpu_replicas_entry(cpu_r, idx++); - memcpy(dst, e, replicas_entry_bytes(e)); - bch2_replicas_entry_sort(dst); + for_each_replicas_entry(sb_r, src) { + struct bch_replicas_entry_cpu *dst = cpu_replicas_entry(cpu_r, idx++); + memcpy(&dst->e, src, replicas_entry_bytes(src)); + bch2_replicas_entry_sort(&dst->e); } return 0; @@ -469,9 +517,13 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, nr++; } + entry_size = __cpu_replicas_entry_bytes(entry_size); + entry_size += sizeof(struct bch_replicas_entry_v1) - sizeof(struct bch_replicas_entry_v0); + entry_size = round_up(entry_size, sizeof(atomic_t)); + cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); if (!cpu_r->entries) return -BCH_ERR_ENOMEM_cpu_replicas; @@ -480,14 +532,14 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, cpu_r->entry_size = entry_size; for_each_replicas_entry(sb_r, src) { - struct bch_replicas_entry_v1 *dst = + struct bch_replicas_entry_cpu *dst = cpu_replicas_entry(cpu_r, idx++); - dst->data_type = src->data_type; - dst->nr_devs = src->nr_devs; - dst->nr_required = 1; - memcpy(dst->devs, src->devs, src->nr_devs); - bch2_replicas_entry_sort(dst); + dst->e.data_type = src->data_type; + dst->e.nr_devs = src->nr_devs; + dst->e.nr_required = 1; + memcpy(dst->e.devs, src->devs, src->nr_devs); + bch2_replicas_entry_sort(&dst->e); } return 0; @@ -495,6 
+547,12 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) { + /* + * If called after fs is started (after journal read), we'll be blowing + * away refcounts + */ + BUG_ON(test_bit(BCH_FS_started, &c->flags)); + struct bch_sb_field_replicas *sb_v1; struct bch_sb_field_replicas_v0 *sb_v0; CLASS(bch_replicas_cpu, new_r)(); @@ -522,7 +580,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, bytes = sizeof(struct bch_sb_field_replicas); for_each_cpu_replicas_entry(r, src) - bytes += replicas_entry_bytes(src) - 1; + bytes += replicas_entry_bytes(&src->e) - 1; sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0, DIV_ROUND_UP(bytes, sizeof(u64))); @@ -538,9 +596,9 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, dst = sb_r->entries; for_each_cpu_replicas_entry(r, src) { - dst->data_type = src->data_type; - dst->nr_devs = src->nr_devs; - memcpy(dst->devs, src->devs, src->nr_devs); + dst->data_type = src->e.data_type; + dst->nr_devs = src->e.nr_devs; + memcpy(dst->devs, src->e.devs, src->e.nr_devs); dst = replicas_entry_next(dst); @@ -561,8 +619,8 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, bytes = sizeof(struct bch_sb_field_replicas); for_each_cpu_replicas_entry(r, src) { - bytes += replicas_entry_bytes(src); - if (src->nr_required != 1) + bytes += replicas_entry_bytes(&src->e); + if (src->e.nr_required != 1) need_v1 = true; } @@ -583,7 +641,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, dst = sb_r->entries; for_each_cpu_replicas_entry(r, src) { - memcpy(dst, src, replicas_entry_bytes(src)); + memcpy(dst, &src->e, replicas_entry_bytes(&src->e)); dst = replicas_entry_next(dst); @@ -602,24 +660,26 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, sort_r(cpu_r->entries, cpu_r->nr, cpu_r->entry_size, - bch2_memcmp, NULL, + cpu_replicas_entry_cmp_r, NULL, (void *)(size_t)cpu_r->entry_size); for (i = 0; i < cpu_r->nr; i++) { - struct bch_replicas_entry_v1 *e = + struct bch_replicas_entry_cpu *e = cpu_replicas_entry(cpu_r, i); - try(bch2_replicas_entry_sb_validate(e, sb, err)); + try(bch2_replicas_entry_sb_validate(&e->e, sb, err)); if (i + 1 < cpu_r->nr) { - struct bch_replicas_entry_v1 *n = + struct bch_replicas_entry_cpu *n = cpu_replicas_entry(cpu_r, i + 1); - BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); + int cmp = cpu_replicas_entry_cmp(e, n, cpu_r->entry_size); - if (!memcmp(e, n, cpu_r->entry_size)) { + BUG_ON(cmp > 0); + + if (!cmp) { prt_printf(err, "duplicate replicas entry "); - bch2_replicas_entry_to_text(err, e); + bch2_replicas_entry_to_text(err, &e->e); return -BCH_ERR_invalid_sb_replicas; } } @@ -702,7 +762,9 @@ bool bch2_can_read_fs_with_devs(struct bch_fs *c, struct bch_devs_mask devs, unsigned flags, struct printbuf *err) { guard(percpu_read)(&c->mark_lock); - for_each_cpu_replicas_entry(&c->replicas, e) { + for_each_cpu_replicas_entry(&c->replicas, i) { + struct bch_replicas_entry_v1 *e = &i->e; + unsigned nr_online = 0, nr_failed = 0, dflags = 0; bool metadata = e->data_type < BCH_DATA_user; @@ -820,6 +882,25 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, return bch2_can_read_fs_with_devs(c, devs, flags, err); } +bool bch2_sb_has_journal(struct bch_sb *sb) +{ + struct bch_sb_field_replicas *replicas = bch2_sb_field_get(sb, replicas); + struct bch_sb_field_replicas_v0 *replicas_v0 = bch2_sb_field_get(sb, replicas_v0); + + if (replicas) { + for_each_replicas_entry(replicas, 
r) + if (r->data_type == BCH_DATA_journal) + return true; + } else if (replicas_v0) { + for_each_replicas_entry(replicas_v0, r) + if (r->data_type == BCH_DATA_journal) + return true; + } + + + return false; +} + unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) { struct bch_sb_field_replicas *replicas; @@ -863,5 +944,4 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) void bch2_fs_replicas_exit(struct bch_fs *c) { kfree(c->replicas.entries); - kfree(c->replicas_gc.entries); } diff --git a/libbcachefs/alloc/replicas.h b/libbcachefs/alloc/replicas.h index cb5ce189..7fd6769b 100644 --- a/libbcachefs/alloc/replicas.h +++ b/libbcachefs/alloc/replicas.h @@ -39,13 +39,22 @@ bool bch2_can_read_fs_with_devs(struct bch_fs *, struct bch_devs_mask, bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask, unsigned, struct printbuf *, bool); +bool bch2_sb_has_journal(struct bch_sb *); unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned); unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); -int bch2_replicas_gc_end(struct bch_fs *, int); -int bch2_replicas_gc_start(struct bch_fs *, unsigned); +void bch2_replicas_entry_put_many(struct bch_fs *, struct bch_replicas_entry_v1 *, unsigned); +static inline void bch2_replicas_entry_put(struct bch_fs *c, struct bch_replicas_entry_v1 *r) +{ + bch2_replicas_entry_put_many(c, r, 1); +} + +int bch2_replicas_entry_get(struct bch_fs *, struct bch_replicas_entry_v1 *); + void bch2_replicas_entry_kill(struct bch_fs *, struct bch_replicas_entry_v1 *); +int bch2_replicas_gc_reffed(struct bch_fs *); + static inline bool bch2_replicas_entry_has_dev(struct bch_replicas_entry_v1 *r, unsigned dev) { for (unsigned i = 0; i < r->nr_devs; i++) @@ -54,6 +63,12 @@ static inline bool bch2_replicas_entry_has_dev(struct bch_replicas_entry_v1 *r, return false; } +static inline bool bch2_replicas_entry_eq(struct bch_replicas_entry_v1 *l, + struct bch_replicas_entry_v1 *r) +{ + return l->nr_devs == r->nr_devs && !memcmp(l, r, replicas_entry_bytes(l)); +} + /* iterate over superblock replicas - used by userspace tools: */ #define replicas_entry_next(_i) \ diff --git a/libbcachefs/alloc/replicas_types.h b/libbcachefs/alloc/replicas_types.h index 418e702e..50d8f87c 100644 --- a/libbcachefs/alloc/replicas_types.h +++ b/libbcachefs/alloc/replicas_types.h @@ -2,10 +2,16 @@ #ifndef _BCACHEFS_REPLICAS_TYPES_H #define _BCACHEFS_REPLICAS_TYPES_H +/* unsized - bch_replicas_entry_v1 is variable length */ +struct bch_replicas_entry_cpu { + atomic_t ref; + struct bch_replicas_entry_v1 e; +}; + struct bch_replicas_cpu { - unsigned nr; - unsigned entry_size; - struct bch_replicas_entry_v1 *entries; + unsigned nr; + unsigned entry_size; + struct bch_replicas_entry_cpu *entries; }; union bch_replicas_padded { diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 9b39b811..2e309739 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -811,8 +811,6 @@ struct bch_fs { struct bch_accounting_mem accounting; struct bch_replicas_cpu replicas; - struct bch_replicas_cpu replicas_gc; - struct mutex replicas_gc_lock; struct journal_entry_res btree_root_journal_res; struct journal_entry_res clock_journal_res; @@ -1075,6 +1073,7 @@ struct bch_fs { GENRADIX(struct gc_stripe) gc_stripes; struct hlist_head ec_stripes_new[32]; + struct hlist_head ec_stripes_new_buckets[64]; spinlock_t ec_stripes_new_lock; /* ERASURE CODING */ diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 9a16dcc5..a5182e6c 100644 --- 
a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -711,7 +711,8 @@ struct bch_sb_field_ext { x(extent_snapshot_whiteouts, BCH_VERSION(1, 29)) \ x(31bit_dirent_offset, BCH_VERSION(1, 30)) \ x(btree_node_accounting, BCH_VERSION(1, 31)) \ - x(sb_field_extent_type_u64s, BCH_VERSION(1, 32)) + x(sb_field_extent_type_u64s, BCH_VERSION(1, 32)) \ + x(rebalance_v2, BCH_VERSION(1, 33)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -1430,6 +1431,17 @@ enum btree_id_flags { BTREE_IS_snapshot_field| \ BTREE_IS_write_buffer, \ BIT_ULL(KEY_TYPE_accounting)) \ + x(rebalance_hipri, 21, \ + BTREE_IS_snapshot_field| \ + BTREE_IS_write_buffer, \ + BIT_ULL(KEY_TYPE_set)) \ + x(rebalance_pending, 22, \ + BTREE_IS_snapshot_field| \ + BTREE_IS_write_buffer, \ + BIT_ULL(KEY_TYPE_set)) \ + x(rebalance_scan, 23, 0, \ + BIT_ULL(KEY_TYPE_cookie)| \ + BIT_ULL(KEY_TYPE_backpointer)) enum btree_id { #define x(name, nr, ...) BTREE_ID_##name = nr, diff --git a/libbcachefs/btree/check.c b/libbcachefs/btree/check.c index 8d15d59f..229fc32f 100644 --- a/libbcachefs/btree/check.c +++ b/libbcachefs/btree/check.c @@ -682,9 +682,11 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, try(bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k), BTREE_TRIGGER_check_repair|flags)); - if (bch2_trans_has_updates(trans)) - return bch2_trans_commit(trans, NULL, NULL, 0) ?: + if (bch2_trans_has_updates(trans)) { + CLASS(disk_reservation, res)(c); + return bch2_trans_commit(trans, &res.r, NULL, BCH_TRANS_COMMIT_no_enospc) ?: -BCH_ERR_transaction_restart_nested; + } try(bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k), BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags)); diff --git a/libbcachefs/btree/interior.c b/libbcachefs/btree/interior.c index 2517bdd5..cf915c9e 100644 --- a/libbcachefs/btree/interior.c +++ b/libbcachefs/btree/interior.c @@ -22,6 +22,7 @@ #include "data/extents.h" #include "data/keylist.h" +#include "data/rebalance.h" #include "data/write.h" #include "init/error.h" @@ -654,6 +655,35 @@ static void btree_update_new_nodes_mark_sb(struct btree_update *as) bch2_write_super(c); } +static void bkey_strip_rebalance(const struct bch_fs *c, struct bkey_s k) +{ + bool dropped; + + do { + dropped = false; + + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry; + bkey_extent_entry_for_each(ptrs, entry) + if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance_v2 || + extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance_bp) { + extent_entry_drop(c, k, entry); + dropped = true; + break; + } + } while (dropped); +} + +static bool bkey_has_rebalance(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + bkey_extent_entry_for_each(ptrs, entry) + if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance_v2) + return true; + return false; +} + /* * The transactional part of an interior btree node update, where we journal the * update we did to the interior node and update alloc info: @@ -661,26 +691,70 @@ static void btree_update_new_nodes_mark_sb(struct btree_update *as) static int btree_update_nodes_written_trans(struct btree_trans *trans, struct btree_update *as) { + struct bch_fs *c = trans->c; + struct bch_inode_opts opts; + bch2_inode_opts_get(as->c, &opts, true); + trans->journal_pin = &as->journal; - darray_for_each(as->old_nodes, i) + darray_for_each(as->old_nodes, i) { try(bch2_key_trigger_old(trans, as->btree_id, 
i->level + 1, bkey_i_to_s_c(&i->key), BTREE_TRIGGER_transactional)); - darray_for_each(as->new_nodes, i) { - try(bch2_key_trigger_new(trans, as->btree_id, i->level + 1, bkey_i_to_s(&i->key), - BTREE_TRIGGER_transactional)); - journal_entry_set(errptr_try(bch2_trans_jset_entry_alloc(trans, jset_u64s(i->key.k.u64s))), - i->root - ? BCH_JSET_ENTRY_btree_root - : BCH_JSET_ENTRY_btree_keys, + BCH_JSET_ENTRY_overwrite, as->btree_id, - i->root ? i->level : i->level + 1, + i->level + 1, &i->key, i->key.k.u64s); } + darray_for_each(as->new_nodes, i) { + i->update_node_key = false; + bkey_strip_rebalance(c, bkey_i_to_s(&i->key)); + + try(bch2_bkey_set_needs_rebalance(trans, NULL, &opts, &i->key, + SET_NEEDS_REBALANCE_foreground, 0)); + + if (bkey_has_rebalance(c, bkey_i_to_s_c(&i->key))) { + CLASS(btree_iter_uninit, iter)(trans); + int ret = bch2_btree_node_get_iter(trans, &iter, i->b); + if (ret && ret != -BCH_ERR_btree_node_dying) + return ret; + if (!ret) + i->update_node_key = true; + else + bkey_strip_rebalance(c, bkey_i_to_s(&i->key)); + } + + try(bch2_key_trigger_new(trans, as->btree_id, i->level + 1, bkey_i_to_s(&i->key), + BTREE_TRIGGER_transactional)); + + if (!i->update_node_key || i->root) { + journal_entry_set(errptr_try(bch2_trans_jset_entry_alloc(trans, + jset_u64s(i->key.k.u64s))), + i->root + ? BCH_JSET_ENTRY_btree_root + : BCH_JSET_ENTRY_btree_keys, + as->btree_id, + i->root ? i->level : i->level + 1, + &i->key, i->key.k.u64s); + } else { + CLASS(btree_node_iter, parent_iter)(trans, + as->btree_id, + i->key.k.p, + 0, + i->level + 1, + BTREE_ITER_intent); + try(bch2_btree_iter_traverse(&parent_iter)); + /* + * XXX: we shouldn't be logging overwrites here, need a + * flag for that + */ + try(bch2_trans_update(trans, &parent_iter, &i->key, BTREE_TRIGGER_norun)); + } + } + return 0; } @@ -760,19 +834,23 @@ static void btree_update_nodes_written(struct btree_update *as) BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_journal_reclaim, btree_update_nodes_written_trans(trans, as)); - bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, - "%s", bch2_err_str(ret)); + bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), + c, "%s", bch2_err_str(ret)); /* * Clear will_make_reachable while we still hold intent locks on * all our new nodes, to avoid racing with * btree_node_update_key(): */ - darray_for_each(as->new_nodes, i) + darray_for_each(as->new_nodes, i) { + if (i->update_node_key) + bkey_copy(&i->b->key, &i->key); + if (i->b) { BUG_ON(i->b->will_make_reachable != (unsigned long) as); i->b->will_make_reachable = 0; clear_btree_node_will_make_reachable(i->b); } + } } /* @@ -2422,7 +2500,8 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, */ } - try(bch2_trans_commit(trans, NULL, NULL, commit_flags)); + CLASS(disk_reservation, res)(c); + try(bch2_trans_commit(trans, &res.r, NULL, commit_flags)); bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c); bkey_copy(&b->key, new_key); diff --git a/libbcachefs/btree/interior.h b/libbcachefs/btree/interior.h index de45d62f..70bfba6b 100644 --- a/libbcachefs/btree/interior.h +++ b/libbcachefs/btree/interior.h @@ -26,6 +26,7 @@ struct btree_update_node { struct btree *b; unsigned level; bool root; + bool update_node_key; __le64 seq; __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); }; diff --git a/libbcachefs/btree/types.h b/libbcachefs/btree/types.h index 29ef9734..78cc748b 100644 --- a/libbcachefs/btree/types.h +++ b/libbcachefs/btree/types.h @@ -560,7 +560,7 @@ struct btree_trans { 
struct bch_fs_usage_base fs_usage_delta; unsigned journal_u64s; - unsigned extra_disk_res; /* XXX kill */ + u64 extra_disk_res; __BKEY_PADDED(btree_path_down, BKEY_BTREE_PTR_VAL_U64s_MAX); diff --git a/libbcachefs/data/checksum.h b/libbcachefs/data/checksum.h index 6f0c888c..a7a998da 100644 --- a/libbcachefs/data/checksum.h +++ b/libbcachefs/data/checksum.h @@ -143,6 +143,17 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, return bch2_csum_opt_to_type(opts.data_checksum, true); } +static inline enum bch_csum_type bch2_data_checksum_type_rb(struct bch_fs *c, + struct bch_extent_rebalance_v2 opts) +{ + if (c->sb.encryption_type) + return c->opts.wide_macs + ? BCH_CSUM_chacha20_poly1305_128 + : BCH_CSUM_chacha20_poly1305_80; + + return bch2_csum_opt_to_type(opts.data_checksum, true); +} + static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) { if (c->sb.encryption_type) diff --git a/libbcachefs/data/ec.c b/libbcachefs/data/ec.c index 7defe513..58390a58 100644 --- a/libbcachefs/data/ec.c +++ b/libbcachefs/data/ec.c @@ -878,8 +878,60 @@ static int ec_stripe_mem_alloc(struct btree_trans *trans, * Hash table of open stripes: * Stripes that are being created or modified are kept in a hash table, so that * stripe deletion can skip them. + * + * Additionally, we have a hash table for buckets that have stripes being + * created, to avoid racing with rebalance: */ +static bool __bch2_bucket_has_new_stripe(struct bch_fs *c, u64 dev_bucket) +{ + unsigned hash = hash_64(dev_bucket, ilog2(ARRAY_SIZE(c->ec_stripes_new_buckets))); + struct ec_stripe_new_bucket *s; + + hlist_for_each_entry(s, &c->ec_stripes_new_buckets[hash], hash) + if (s->dev_bucket == dev_bucket) + return true; + return false; +} + +bool bch2_bucket_has_new_stripe(struct bch_fs *c, u64 dev_bucket) +{ + guard(spinlock)(&c->ec_stripes_new_lock); + return __bch2_bucket_has_new_stripe(c, dev_bucket); +} + +static void stripe_new_bucket_add(struct bch_fs *c, struct ec_stripe_new_bucket *s, u64 dev_bucket) +{ + s->dev_bucket = dev_bucket; + + unsigned hash = hash_64(dev_bucket, ilog2(ARRAY_SIZE(c->ec_stripes_new_buckets))); + hlist_add_head(&s->hash, &c->ec_stripes_new_buckets[hash]); +} + +static void stripe_new_buckets_add(struct bch_fs *c, struct ec_stripe_new *s) +{ + unsigned nr_blocks = s->nr_data + s->nr_parity; + + guard(spinlock)(&c->ec_stripes_new_lock); + for (unsigned i = 0; i < nr_blocks; i++) { + if (!s->blocks[i]) + continue; + + struct open_bucket *ob = c->open_buckets + s->blocks[i]; + struct bpos bucket = POS(ob->dev, ob->bucket); + + stripe_new_bucket_add(c, &s->buckets[i], bucket_to_u64(bucket)); + } +} + +static void stripe_new_buckets_del(struct bch_fs *c, struct ec_stripe_new *s) +{ + struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; + + for (unsigned i = 0; i < v->nr_blocks; i++) + hlist_del_init(&s->buckets[i].hash); +} + static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx) { unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); @@ -920,6 +972,8 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) hlist_del_init(&s->hash); s->idx = 0; + + stripe_new_buckets_del(c, s); } /* stripe deletion */ @@ -1045,6 +1099,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, struct ec_stripe_buf *s, struct bkey_s_c_backpointer bp, struct stripe_update_bucket_stats *stats, + struct disk_reservation *res, struct wb_maybe_flush *last_flushed) { struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; @@ -1110,7 +1165,7 
@@ static int ec_stripe_update_extent(struct btree_trans *trans, .idx = s->key.k.p.offset, }; - struct bkey_i *n = errptr_try(bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr))); + struct bkey_i *n = errptr_try(bch2_trans_kmalloc(trans, BKEY_EXTENT_U64s_MAX * sizeof(u64))); bkey_reassemble(n, k); @@ -1126,10 +1181,9 @@ static int ec_stripe_update_extent(struct btree_trans *trans, struct bch_inode_opts opts; try(bch2_bkey_get_io_opts(trans, NULL, bkey_i_to_s_c(n), &opts)); - try(bch2_bkey_set_needs_rebalance(trans->c, &opts, n, - SET_NEEDS_REBALANCE_other, 0)); + try(bch2_bkey_set_needs_rebalance(trans, NULL, &opts, n, SET_NEEDS_REBALANCE_other, 0)); try(bch2_trans_update(trans, &iter, n, 0)); - try(bch2_trans_commit(trans, NULL, NULL, + try(bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc)); @@ -1159,6 +1213,8 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b struct stripe_update_bucket_stats stats = {}; + CLASS(disk_reservation, res)(c); + try(for_each_btree_key_max(trans, bp_iter, BTREE_ID_backpointers, bucket_pos_to_bp_start(ca, bucket_pos), bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k, ({ @@ -1174,7 +1230,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b wb_maybe_flush_inc(&last_flushed); ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, bp, - &stats, &last_flushed); + &stats, &res.r, &last_flushed); }))); if (trace_stripe_update_bucket_enabled()) { @@ -2026,6 +2082,7 @@ allocate_buf: if (ret) goto err; + stripe_new_buckets_add(c, s); s->allocated = true; allocated: BUG_ON(!s->idx); diff --git a/libbcachefs/data/ec.h b/libbcachefs/data/ec.h index 4ef8d142..4721f197 100644 --- a/libbcachefs/data/ec.h +++ b/libbcachefs/data/ec.h @@ -191,6 +191,11 @@ enum ec_stripe_ref { STRIPE_REF_NR }; +struct ec_stripe_new_bucket { + struct hlist_node hash; + u64 dev_bucket; +}; + struct ec_stripe_new { struct bch_fs *c; struct ec_stripe_head *h; @@ -217,6 +222,8 @@ struct ec_stripe_new { open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX]; struct disk_reservation res; + struct ec_stripe_new_bucket buckets[BCH_BKEY_PTRS_MAX]; + struct ec_stripe_buf new_stripe; struct ec_stripe_buf existing_stripe; }; @@ -248,6 +255,8 @@ struct ec_stripe_head { int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c); +bool bch2_bucket_has_new_stripe(struct bch_fs *, u64); + void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int); diff --git a/libbcachefs/data/extents.c b/libbcachefs/data/extents.c index e3566fca..dd5e645c 100644 --- a/libbcachefs/data/extents.c +++ b/libbcachefs/data/extents.c @@ -796,7 +796,7 @@ void bch2_bkey_propagate_incompressible(const struct bch_fs *c, struct bkey_i *d /* * XXX: if some data actually is compressed, we want - * bch_extent_rebalance.wont_recompress_smaller + * bch_extent_rebalance_v2.wont_recompress_smaller */ bkey_extent_entry_for_each(ptrs, entry) { @@ -884,6 +884,15 @@ static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k) return durability; } +void bch2_bkey_extent_entry_drop_s(const struct bch_fs *c, struct bkey_s k, union bch_extent_entry *entry) +{ + union bch_extent_entry *end = bkey_val_end(k); + union bch_extent_entry *next = extent_entry_next(c, entry); + + memmove_u64s(entry, next, (u64 *) end - (u64 *) next); + k.k->u64s -= extent_entry_u64s(c, entry); +} + void bch2_bkey_extent_entry_drop(const struct 
bch_fs *c, struct bkey_i *k, union bch_extent_entry *entry) { union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); @@ -1378,14 +1387,22 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, "idx %llu block %u", (u64) ec->idx, ec->block); break; } - case BCH_EXTENT_ENTRY_rebalance: - bch2_extent_rebalance_to_text(out, c, &entry->rebalance); + case BCH_EXTENT_ENTRY_rebalance_v1: + bch2_extent_rebalance_v1_to_text(out, c, &entry->rebalance_v1); + break; + + case BCH_EXTENT_ENTRY_rebalance_v2: + bch2_extent_rebalance_v2_to_text(out, c, &entry->rebalance_v2); break; case BCH_EXTENT_ENTRY_flags: prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags); break; + case BCH_EXTENT_ENTRY_rebalance_bp: + prt_printf(out, "idx %llu", (u64) entry->rebalance_bp.idx); + break; + default: prt_printf(out, "(unknown extent entry %.16llx)", *((u64 *) entry)); return; @@ -1439,6 +1456,18 @@ fsck_err: return ret; } +static inline bool btree_ptr_entry_type_allowed(enum bch_extent_entry_type type) +{ + switch (type) { + case BCH_EXTENT_ENTRY_ptr: + case BCH_EXTENT_ENTRY_rebalance_v2: + case BCH_EXTENT_ENTRY_rebalance_bp: + return true; + default: + return false; + }; +} + int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { @@ -1449,23 +1478,27 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, unsigned nonce = UINT_MAX; unsigned nr_ptrs = 0; bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false; + bool have_inval_dev_ptrs = false, have_non_inval_dev_ptrs = false; int ret = 0; if (bkey_is_btree_ptr(k.k)) size_ondisk = btree_sectors(c); bkey_extent_entry_for_each(ptrs, entry) { - bkey_fsck_err_on(extent_entry_type(entry) >= c->extent_types_known, + unsigned type = extent_entry_type(entry); + + bkey_fsck_err_on(type >= c->extent_types_known, c, extent_ptrs_invalid_entry, "invalid extent entry type (got %u, max %u)", - extent_entry_type(entry), c->extent_types_known); + type, c->extent_types_known); bkey_fsck_err_on(bkey_is_btree_ptr(k.k) && - !extent_entry_is_ptr(entry), + type < BCH_EXTENT_ENTRY_MAX && + !btree_ptr_entry_type_allowed(type), c, btree_ptr_has_non_ptr, - "has non ptr field"); + "has non allowed field"); - switch (extent_entry_type(entry)) { + switch (type) { case BCH_EXTENT_ENTRY_ptr: try(extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false)); @@ -1480,6 +1513,12 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, have_ec = false; crc_since_last_ptr = false; + + if (entry->ptr.dev == BCH_SB_MEMBER_INVALID) + have_inval_dev_ptrs = true; + else + have_non_inval_dev_ptrs = true; + nr_ptrs++; break; case BCH_EXTENT_ENTRY_crc32: @@ -1527,30 +1566,18 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, c, ptr_stripe_redundant, "redundant stripe entry"); have_ec = true; + have_non_inval_dev_ptrs = true; break; - case BCH_EXTENT_ENTRY_rebalance: { - /* - * this shouldn't be a fsck error, for forward - * compatibility; the rebalance code should just refetch - * the compression opt if it's unknown - */ -#if 0 - const struct bch_extent_rebalance *r = &entry->rebalance; - - if (!bch2_compression_opt_valid(r->compression)) { - union bch_compression_opt opt = { .value = r->compression }; - prt_printf(err, "invalid compression opt %u:%u", - opt.type, opt.level); - return bch_err_throw(c, invalid_bkey); - } -#endif + case BCH_EXTENT_ENTRY_rebalance_v2: + try(bch2_extent_rebalance_validate(c, k, from, &entry->rebalance_v2)); 
break; - } case BCH_EXTENT_ENTRY_flags: bkey_fsck_err_on(entry != ptrs.start, c, extent_flags_not_at_start, "extent flags entry not at start"); break; + case BCH_EXTENT_ENTRY_rebalance_bp: + break; } } @@ -1572,6 +1599,9 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, bkey_fsck_err_on(have_ec, c, extent_ptrs_redundant_stripe, "redundant stripe entry"); + bkey_fsck_err_on(have_inval_dev_ptrs && !have_non_inval_dev_ptrs, + c, extent_ptrs_all_invalid, + "extent ptrs all to BCH_SB_MEMBER_INVALID"); fsck_err: return ret; } @@ -1608,7 +1638,8 @@ void bch2_ptr_swab(const struct bch_fs *c, struct bkey_s k) break; case BCH_EXTENT_ENTRY_stripe_ptr: break; - case BCH_EXTENT_ENTRY_rebalance: + case BCH_EXTENT_ENTRY_rebalance_v1: + case BCH_EXTENT_ENTRY_rebalance_v2: break; default: /* Bad entry type: will be caught by validate() */ @@ -1682,8 +1713,10 @@ int bch2_cut_front_s(const struct bch_fs *c, struct bpos where, struct bkey_s k) entry->crc128.offset += sub; break; case BCH_EXTENT_ENTRY_stripe_ptr: - case BCH_EXTENT_ENTRY_rebalance: + case BCH_EXTENT_ENTRY_rebalance_v1: + case BCH_EXTENT_ENTRY_rebalance_v2: case BCH_EXTENT_ENTRY_flags: + case BCH_EXTENT_ENTRY_rebalance_bp: break; } diff --git a/libbcachefs/data/extents.h b/libbcachefs/data/extents.h index 88dc9717..adfcd054 100644 --- a/libbcachefs/data/extents.h +++ b/libbcachefs/data/extents.h @@ -596,6 +596,7 @@ bool bch2_bkey_devs_rw(struct bch_fs *, struct bkey_s_c); bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); bool bch2_bkey_in_target(struct bch_fs *, struct bkey_s_c, unsigned); +void bch2_bkey_extent_entry_drop_s(const struct bch_fs *, struct bkey_s, union bch_extent_entry *); void bch2_bkey_extent_entry_drop(const struct bch_fs *, struct bkey_i *, union bch_extent_entry *); static inline void bch2_bkey_append_ptr(const struct bch_fs *c, struct bkey_i *k, struct bch_extent_ptr ptr) diff --git a/libbcachefs/data/extents_format.h b/libbcachefs/data/extents_format.h index 74c0252c..5116f212 100644 --- a/libbcachefs/data/extents_format.h +++ b/libbcachefs/data/extents_format.h @@ -79,9 +79,11 @@ x(crc64, 2) \ x(crc128, 3) \ x(stripe_ptr, 4) \ - x(rebalance, 5) \ - x(flags, 6) -#define BCH_EXTENT_ENTRY_MAX 7 + x(rebalance_v1, 5) \ + x(flags, 6) \ + x(rebalance_v2, 7) \ + x(rebalance_bp, 8) +#define BCH_EXTENT_ENTRY_MAX 9 enum bch_extent_entry_type { #define x(f, n) BCH_EXTENT_ENTRY_##f = n, @@ -221,7 +223,7 @@ struct bch_extent_flags { #endif }; -/* bch_extent_rebalance: */ +/* bch_extent_rebalance_v2: */ #include "rebalance_format.h" union bch_extent_entry { @@ -270,13 +272,13 @@ struct bch_extent { } __packed __aligned(8); /* Maximum size (in u64s) a single pointer could be: */ -#define BKEY_EXTENT_PTR_U64s_MAX\ +#define BKEY_EXTENT_PTR_U64s_MAX \ ((sizeof(struct bch_extent_crc128) + \ sizeof(struct bch_extent_ptr)) / sizeof(__u64)) /* Maximum possible size of an entire extent value: */ #define BKEY_EXTENT_VAL_U64s_MAX \ - (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) + (5 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) /* * Maximum possible size of an entire extent, key + value: */ #define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) @@ -284,7 +286,9 @@ struct bch_extent { /* Btree pointers don't carry around checksums: */ #define BKEY_BTREE_PTR_VAL_U64s_MAX \ ((sizeof(struct bch_btree_ptr_v2) + \ - sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64)) + sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX + \ + sizeof(struct bch_extent_rebalance_v2) + \ + 
sizeof(struct bch_extent_rebalance_bp)) / sizeof(__u64)) #define BKEY_BTREE_PTR_U64s_MAX \ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) diff --git a/libbcachefs/data/migrate.c b/libbcachefs/data/migrate.c index 4c9c9147..51eddf21 100644 --- a/libbcachefs/data/migrate.c +++ b/libbcachefs/data/migrate.c @@ -75,14 +75,15 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, if (!bch2_bkey_has_device_c(c, k, dev_idx)) return 0; - struct bkey_i *n = - errptr_try(bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_internal_snapshot_node)); + /* blah */ + struct bkey_i *n = errptr_try(bch2_trans_kmalloc(trans, BKEY_EXTENT_U64s_MAX * sizeof(u64))); + bkey_reassemble(n, k); try(drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, err, false)); struct bch_inode_opts opts; try(bch2_bkey_get_io_opts(trans, NULL, k, &opts)); - try(bch2_bkey_set_needs_rebalance(c, &opts, n, SET_NEEDS_REBALANCE_opt_change, 0)); + try(bch2_bkey_set_needs_rebalance(trans, NULL, &opts, n, SET_NEEDS_REBALANCE_opt_change, 0)); /* * Since we're not inserting through an extent iterator @@ -92,7 +93,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, */ if (bkey_deleted(&n->k)) n->k.size = 0; - return 0; + return bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node); } static int bch2_dev_btree_drop_key(struct btree_trans *trans, @@ -116,6 +117,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned flags, struct printbuf *err) { CLASS(btree_trans, trans)(c); + CLASS(disk_reservation, res)(c); /* FIXME: this does not handle unknown btrees with data pointers */ for (unsigned id = 0; id < BTREE_ID_NR; id++) { @@ -126,14 +128,13 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, if (id == BTREE_ID_stripes) continue; - int ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, + try(for_each_btree_key_commit(trans, iter, id, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + &res.r, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + bch2_disk_reservation_put(c, &res.r); bch2_progress_update_iter(trans, progress, &iter, "dropping user data") ?: bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags, err); - })); - if (ret) - return ret; + }))); } return 0; @@ -218,6 +219,7 @@ int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsig struct printbuf *err) { CLASS(btree_trans, trans)(c); + CLASS(disk_reservation, res)(c); struct wb_maybe_flush last_flushed __cleanup(wb_maybe_flush_exit); wb_maybe_flush_init(&last_flushed); @@ -226,11 +228,12 @@ int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsig for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers, POS(dev_idx, 0), POS(dev_idx, U64_MAX), 0, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + &res.r, NULL, BCH_TRANS_COMMIT_no_enospc, ({ if (k.k->type != KEY_TYPE_backpointer) continue; wb_maybe_flush_inc(&last_flushed); + bch2_disk_reservation_put(c, &res.r); data_drop_bp(trans, dev_idx, bkey_s_c_to_backpointer(k), &last_flushed, flags, err); diff --git a/libbcachefs/data/move.c b/libbcachefs/data/move.c index fe5b387e..c7dcd20c 100644 --- a/libbcachefs/data/move.c +++ b/libbcachefs/data/move.c @@ -324,8 +324,11 @@ int bch2_move_extent(struct moving_context *ctxt, struct bch_inode_opts opts; try(bch2_bkey_get_io_opts(trans, snapshot_io_opts, k, &opts)); - try(bch2_update_rebalance_opts(trans, &opts, iter, k, SET_NEEDS_REBALANCE_other)); - try(bch2_trans_commit_lazy(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc)); + 
try(bch2_update_rebalance_opts(trans, snapshot_io_opts, &opts, iter, level, k, + SET_NEEDS_REBALANCE_other)); + + CLASS(disk_reservation, res)(c); + try(bch2_trans_commit_lazy(trans, &res.r, NULL, BCH_TRANS_COMMIT_no_enospc)); struct data_update_opts data_opts = {}; int ret = pred(trans, arg, iter->btree_id, k, &opts, &data_opts); diff --git a/libbcachefs/data/rebalance.c b/libbcachefs/data/rebalance.c index 465f79e2..63631c7c 100644 --- a/libbcachefs/data/rebalance.c +++ b/libbcachefs/data/rebalance.c @@ -3,15 +3,18 @@ #include "bcachefs.h" #include "alloc/background.h" +#include "alloc/backpointers.h" #include "alloc/buckets.h" #include "alloc/disk_groups.h" #include "alloc/foreground.h" +#include "btree/interior.h" #include "btree/iter.h" #include "btree/update.h" #include "btree/write_buffer.h" #include "data/compress.h" +#include "data/ec.h" #include "data/move.h" #include "data/rebalance.h" #include "data/write.h" @@ -29,28 +32,61 @@ #include #include -/* bch_extent_rebalance: */ +/* bch_extent_rebalance_v2: */ -static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(const struct bch_fs *c, +int bch2_extent_rebalance_validate(struct bch_fs *c, + struct bkey_s_c k, + struct bkey_validate_context from, + const struct bch_extent_rebalance_v2 *r) +{ + int ret = 0; + + bkey_fsck_err_on(r->pending && + !(r->need_rb & (BIT(BCH_REBALANCE_background_target)| + BIT(BCH_REBALANCE_data_replicas)| + BIT(BCH_REBALANCE_erasure_code))), + c, extent_rebalance_bad_pending, + "pending incorrectly set"); + + bkey_fsck_err_on(r->hipri && !(r->need_rb & BIT(BCH_REBALANCE_data_replicas)), + c, extent_rebalance_bad_hipri, + "hipri incorrectly set"); + + bkey_fsck_err_on(!r->data_replicas, + c, extent_rebalance_bad_replicas, + "bad replicas"); + +fsck_err: + return ret; +} + +static const struct bch_extent_rebalance_v2 *bch2_bkey_ptrs_rebalance_opts(const struct bch_fs *c, struct bkey_ptrs_c ptrs) { const union bch_extent_entry *entry; bkey_extent_entry_for_each(ptrs, entry) - if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance) - return &entry->rebalance; + if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance_v2) + return &entry->rebalance_v2; return NULL; } -static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(const struct bch_fs *c, - struct bkey_s_c k) +const struct bch_extent_rebalance_v2 *bch2_bkey_rebalance_opts(const struct bch_fs *c, + struct bkey_s_c k) { return bch2_bkey_ptrs_rebalance_opts(c, bch2_bkey_ptrs_c(k)); } -void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, - const struct bch_extent_rebalance *r) +static const char * const rebalance_opts[] = { +#define x(n) #n, + BCH_REBALANCE_OPTS() +#undef x + NULL +}; + +void bch2_extent_rebalance_v1_to_text(struct printbuf *out, struct bch_fs *c, + const struct bch_extent_rebalance_v1 *r) { prt_printf(out, "replicas=%u", r->data_replicas); if (r->data_replicas_from_inode) @@ -98,194 +134,735 @@ void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, } } -int bch2_trigger_extent_rebalance(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - enum btree_iter_update_trigger_flags flags) +void bch2_extent_rebalance_v2_to_text(struct printbuf *out, struct bch_fs *c, + const struct bch_extent_rebalance_v2 *r) +{ + prt_str(out, "need_rb="); + prt_bitflags(out, rebalance_opts, r->need_rb); + + if (r->hipri) + prt_str(out, " hipri"); + if (r->pending) + prt_str(out, " pending"); + + prt_printf(out, " replicas=%u", r->data_replicas); + if 
(r->data_replicas_from_inode) + prt_str(out, " (inode)"); + + prt_str(out, " checksum="); + bch2_prt_csum_opt(out, r->data_checksum); + if (r->data_checksum_from_inode) + prt_str(out, " (inode)"); + + if (r->background_compression || r->background_compression_from_inode) { + prt_str(out, " background_compression="); + bch2_compression_opt_to_text(out, r->background_compression); + + if (r->background_compression_from_inode) + prt_str(out, " (inode)"); + } + + if (r->background_target || r->background_target_from_inode) { + prt_str(out, " background_target="); + if (c) + bch2_target_to_text(out, c, r->background_target); + else + prt_printf(out, "%u", r->background_target); + + if (r->background_target_from_inode) + prt_str(out, " (inode)"); + } + + if (r->promote_target || r->promote_target_from_inode) { + prt_str(out, " promote_target="); + if (c) + bch2_target_to_text(out, c, r->promote_target); + else + prt_printf(out, "%u", r->promote_target); + + if (r->promote_target_from_inode) + prt_str(out, " (inode)"); + } + + if (r->erasure_code || r->erasure_code_from_inode) { + prt_printf(out, " ec=%u", r->erasure_code); + if (r->erasure_code_from_inode) + prt_str(out, " (inode)"); + } +} + +/* + * XXX: check in bkey_validate that if r->hipri or r->pending are set, + * r->data_replicas are also set + */ + +static enum btree_id rb_work_btree(const struct bch_extent_rebalance_v2 *r) +{ + if (!r || !r->need_rb) + return 0; + if (r->hipri) + return BTREE_ID_rebalance_hipri; + if (r->pending) + return BTREE_ID_rebalance_pending; + return BTREE_ID_rebalance_work; +} + +static inline unsigned rb_accounting_counters(const struct bch_extent_rebalance_v2 *r) +{ + if (!r) + return 0; + + unsigned ret = r->need_rb; + if (r->hipri) + ret |= BIT(BCH_REBALANCE_ACCOUNTING_high_priority); + if (r->pending) { + ret |= BIT(BCH_REBALANCE_ACCOUNTING_pending); + ret &= ~BIT(BCH_REBALANCE_ACCOUNTING_target); + } + return ret; +} + +static u64 bch2_bkey_get_rebalance_bp(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + bkey_extent_entry_for_each(ptrs, entry) + if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance_bp) + return entry->rebalance_bp.idx; + return 0; +} + +static void bch2_bkey_set_rebalance_bp(const struct bch_fs *c, struct bkey_s k, u64 idx) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry; + bkey_extent_entry_for_each(ptrs, entry) + if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance_bp) { + if (idx) + entry->rebalance_bp.idx = idx; + else + bch2_bkey_extent_entry_drop_s(c, k, entry); + return; + } + + if (!idx) + return; + + struct bch_extent_rebalance_bp r = { + .type = BIT(BCH_EXTENT_ENTRY_rebalance_bp), + .idx = idx, + }; + union bch_extent_entry *end = bkey_val_end(k); + memcpy_u64s(end, &r, sizeof(r) / sizeof(u64)); + k.k->u64s += sizeof(r) / sizeof(u64); +} + +static inline struct bch_backpointer rb_bp(enum btree_id btree, unsigned level, struct bkey_s_c k) +{ + return (struct bch_backpointer) { + .btree_id = btree, + .level = level, + .pos = k.k->p, + }; +} + +static int rebalance_bp_del(struct btree_trans *trans, enum btree_id work_btree, + enum btree_id btree, unsigned level, struct bkey_s_c k, + u64 bp_idx) +{ + CLASS(btree_iter, iter)(trans, BTREE_ID_rebalance_scan, POS(1, bp_idx), + BTREE_ITER_intent| + BTREE_ITER_with_updates); + struct bkey_s_c bp_k = bkey_try(bch2_btree_iter_peek_slot(&iter)); + + struct bch_backpointer bp = rb_bp(btree, level, k); + + if 
(bp_k.k->type != KEY_TYPE_backpointer || memcmp(bp_k.v, &bp, sizeof(bp))) { + int ret = 0; + + CLASS(printbuf, buf)(); + prt_printf(&buf, "btree ptr points to bad/missing rebalance bp\n"); + bch2_bkey_val_to_text(&buf, trans->c, k); + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, trans->c, bp_k); + + fsck_err(trans, btree_ptr_to_bad_rebalance_bp, "%s", buf.buf); +fsck_err: + return ret; + } + + return bch2_btree_delete_at(trans, &iter, 0); +} + +static int rebalance_bp_add(struct btree_trans *trans, enum btree_id work_btree, + enum btree_id btree, unsigned level, struct bkey_s k, + u64 *bp_idx) +{ + CLASS(btree_iter_uninit, iter)(trans); + try(bch2_bkey_get_empty_slot(trans, &iter, BTREE_ID_rebalance_scan, + POS(1, 1), POS(1, U64_MAX))); + + *bp_idx = iter.pos.offset; + + struct bkey_i_backpointer *bp = errptr_try(bch2_bkey_alloc(trans, &iter, 0, backpointer)); + bp->v = rb_bp(btree, level, k.s_c); + return 0; +} + +static struct bkey_s_c rebalance_bp_get_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_backpointer bp) { struct bch_fs *c = trans->c; - int need_rebalance_delta = 0; - s64 need_rebalance_sectors_delta[1] = { 0 }; + int ret = 0; + CLASS(printbuf, buf)(); - s64 s = bch2_bkey_sectors_need_rebalance(c, old); - need_rebalance_delta -= s != 0; - need_rebalance_sectors_delta[0] -= s; + /* don't allow bps to non btree nodes: */ + if (fsck_err_on(!bp.v->level, + trans, rebalance_bp_to_leaf_node_key, + "rebalance bp to leaf node key\n%s", + (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { + ret = bch2_btree_delete(trans, BTREE_ID_rebalance_scan, bp.k->p, 0) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + return ret ? bkey_s_c_err(ret) : bkey_s_c_null; + } - s = bch2_bkey_sectors_need_rebalance(c, new); - need_rebalance_delta += s != 0; - need_rebalance_sectors_delta[0] += s; + bch2_trans_node_iter_init(trans, iter, bp.v->btree_id, bp.v->pos, 0, bp.v->level, 0); + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + if (bkey_err(k)) + return k; - if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) - try(bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, - new.k->p, need_rebalance_delta > 0)); + /* + * peek_slot() doesn't normally return NULL - except when we ask for a + * key at a btree level that doesn't exist. 
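
rebalance_bp_get_key() resolves a rebalance_scan backpointer to the btree node key it describes, and only accepts that key if the key's bch_extent_rebalance_bp idx points back at the same rebalance_scan slot. The standalone sketch below mirrors that two-way check with toy types (scan_entry, node_key and link_consistent are illustrative names, not bcachefs API):

/*
 * Standalone illustration of the rebalance_scan <-> btree-node back-link
 * that rebalance_bp_get_key() verifies; toy types, not bcachefs code.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct scan_entry {		/* like a backpointer in BTREE_ID_rebalance_scan */
	uint32_t btree_id;
	uint32_t level;
	uint64_t pos;
};

struct node_key {		/* like a btree node key carrying rebalance_bp.idx */
	uint32_t btree_id;
	uint32_t level;
	uint64_t pos;
	uint64_t rb_bp_idx;	/* 0 means "no rebalance backpointer" */
};

/* Both directions must agree, otherwise the pair is flagged as inconsistent: */
static bool link_consistent(const struct scan_entry *e, uint64_t e_idx,
			    const struct node_key *k)
{
	return k->rb_bp_idx == e_idx &&
	       k->btree_id  == e->btree_id &&
	       k->level     == e->level &&
	       k->pos       == e->pos;
}

int main(void)
{
	struct node_key   k = { .btree_id = 2, .level = 1, .pos = 4096, .rb_bp_idx = 7 };
	struct scan_entry e = { .btree_id = 2, .level = 1, .pos = 4096 };

	printf("consistent: %d\n", link_consistent(&e, 7, &k));	/* 1 */
	printf("stale idx:  %d\n", link_consistent(&e, 8, &k));	/* 0 */
	return 0;
}
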
+ * + * We may want to revisit this and change peek_slot(): + */ + if (k.k && bch2_bkey_get_rebalance_bp(c, k) == bp.k->p.offset) + return k; - if (need_rebalance_sectors_delta[0]) - try(bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, - need_rebalance_sectors_delta, rebalance_work)); + /* walk down a level, check for btree_node_will_make_reachable(b)) */ + + CLASS(btree_node_iter, iter2)(trans, bp.v->btree_id, bp.v->pos, 0, bp.v->level - 1, 0); + struct btree *b = bch2_btree_iter_peek_node(&iter2); + if (IS_ERR(b)) + return bkey_s_c_err(PTR_ERR(b)); + + if (b) { + if (btree_node_will_make_reachable(b)) + return bkey_s_c_null; + + k = bkey_i_to_s_c(&b->key); + if (bch2_bkey_get_rebalance_bp(c, k) == bp.k->p.offset) + return k; + } + + prt_printf(&buf, "rebalance backpointer to missing/incorrect btree ptr\n"); + bch2_bkey_val_to_text(&buf, c, bp.s_c); + prt_newline(&buf); + if (k.k) + bch2_bkey_val_to_text(&buf, c, k); + else + prt_str(&buf, "(no key)"); + + if (b) { + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + } + + if (fsck_err(trans, rebalance_bp_to_missing_btree_ptr, "%s", buf.buf)) + ret = bch2_btree_delete(trans, BTREE_ID_rebalance_scan, bp.k->p, 0) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); +fsck_err: + return ret ? bkey_s_c_err(ret) : bkey_s_c_null; +} + +static int trigger_dev_counters(struct btree_trans *trans, + struct bkey_s_c k, + const struct bch_extent_rebalance_v2 *r, + enum btree_iter_update_trigger_flags flags) +{ + if (!r || !r->ptrs_moving || r->pending) + return 0; + + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned ptr_bit = 1; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (r->ptrs_moving & ptr_bit) { + u64 v[1] = { p.crc.compressed_size }; + if (flags & BTREE_TRIGGER_overwrite) + v[0] = -v[0]; + + try(bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, v, dev_leaving, p.ptr.dev)); + } + + ptr_bit <<= 1; + } return 0; } -static void bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k, - struct bch_inode_opts *io_opts, - unsigned *move_ptrs, - unsigned *compress_ptrs, - u64 *sectors) +int __bch2_trigger_extent_rebalance(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_s new, + const struct bch_extent_rebalance_v2 *old_r, + const struct bch_extent_rebalance_v2 *new_r, + enum btree_iter_update_trigger_flags flags) { - *move_ptrs = 0; - *compress_ptrs = 0; - *sectors = 0; + enum btree_id old_btree = rb_work_btree(old_r); + enum btree_id new_btree = rb_work_btree(new_r); - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + if (flags & BTREE_TRIGGER_transactional) { + if (!level) { + if (old_btree && old_btree != new_btree) + try(bch2_btree_bit_mod_buffered(trans, old_btree, old.k->p, false)); - const struct bch_extent_rebalance *rb_opts = bch2_bkey_ptrs_rebalance_opts(c, ptrs); - if (!io_opts && !rb_opts) - return; + if (new_btree && old_btree != new_btree) + try(bch2_btree_bit_mod_buffered(trans, new_btree, new.k->p, true)); + } else { + struct bch_fs *c = trans->c; + u64 bp_idx = bch2_bkey_get_rebalance_bp(c, old); - if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) - return; + if (bp_idx && !new_btree) { + try(rebalance_bp_del(trans, old_btree, btree, level, old, bp_idx)); + bp_idx = 0; + } - unsigned compression_type = - bch2_compression_opt_to_type(io_opts - ? 
io_opts->background_compression - : rb_opts->background_compression); - unsigned target = io_opts - ? io_opts->background_target - : rb_opts->background_target; - if (target && !bch2_target_accepts_data(c, BCH_DATA_user, target)) - target = 0; + if (!bp_idx && new_btree) + try(rebalance_bp_add(trans, old_btree, btree, level, new, &bp_idx)); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - bool incompressible = false, unwritten = false; - - unsigned ptr_idx = 1; - - guard(rcu)(); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - incompressible |= p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible; - unwritten |= p.ptr.unwritten; - - if (!p.ptr.cached) { - if (p.crc.compression_type != compression_type) - *compress_ptrs |= ptr_idx; - - if (target && !bch2_dev_in_target(c, p.ptr.dev, target)) - *move_ptrs |= ptr_idx; + bch2_bkey_set_rebalance_bp(c, new, bp_idx); } - - ptr_idx <<= 1; } - if (unwritten) - *compress_ptrs = 0; - if (incompressible) - *compress_ptrs = 0; + unsigned old_a = rb_accounting_counters(old_r); + unsigned new_a = rb_accounting_counters(new_r); + unsigned delta = old.k->size == new.k->size + ? old_a ^ new_a + : old_a | new_a; - unsigned rb_ptrs = *move_ptrs | *compress_ptrs; + while (delta) { + unsigned c = __ffs(delta); + delta ^= BIT(c); - if (!rb_ptrs) - return; + s64 v[1] = { 0 }; + if (old_a & BIT(c)) + v[0] -= (s64) old.k->size; + if (new_a & BIT(c)) + v[0] += (s64) new.k->size; - ptr_idx = 1; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (rb_ptrs & ptr_idx) - *sectors += p.crc.compressed_size; - ptr_idx <<= 1; + try(bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, v, rebalance_work_v2, c)); } + + try(trigger_dev_counters(trans, old, old_r, flags & ~BTREE_TRIGGER_insert)); + try(trigger_dev_counters(trans, new.s_c, new_r, flags & ~BTREE_TRIGGER_overwrite)); + + return 0; } -u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) -{ - unsigned move_ptrs = 0; - unsigned compress_ptrs = 0; - u64 sectors = 0; - - bch2_bkey_needs_rebalance(c, k, NULL, &move_ptrs, &compress_ptrs, §ors); - return sectors; -} - -static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, - struct bch_inode_opts *opts, - struct bkey_s_c k) -{ - unsigned move_ptrs = 0; - unsigned compress_ptrs = 0; - u64 sectors = 0; - - bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, §ors); - return move_ptrs|compress_ptrs; -} - -static inline bool bkey_should_have_rb_opts(struct bch_fs *c, - struct bch_inode_opts *opts, - struct bkey_s_c k) +static inline bool bkey_should_have_rb_opts(struct bkey_s_c k, + struct bch_extent_rebalance_v2 new) { if (k.k->type == KEY_TYPE_reflink_v) { -#define x(n) if (opts->n##_from_inode) return true; +#define x(n) if (new.n##_from_inode) return true; BCH_REBALANCE_OPTS() #undef x } - return bch2_bkey_ptrs_need_rebalance(c, opts, k); + return new.need_rb; } -int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_inode_opts *opts, +static bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k, + struct bch_inode_opts *opts, + int *need_update_invalid_devs, + struct bch_extent_rebalance_v2 *ret) +{ + bool btree = bkey_is_btree_ptr(k.k); + + if (btree && + bch2_request_incompat_feature(c, bcachefs_metadata_version_rebalance_v2)) + return false; + + struct bch_extent_rebalance_v2 r = { + .type = BIT(BCH_EXTENT_ENTRY_rebalance_v2), +#define x(_name) \ + ._name = opts->_name, \ + ._name##_from_inode = opts->_name##_from_inode, + BCH_REBALANCE_OPTS() +#undef x + }; + + 
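
The initializer just above seeds the per-extent rebalance entry straight from the io options via the BCH_REBALANCE_OPTS() x-macro, copying each option together with its _from_inode flag. A minimal standalone sketch of that x-macro pattern, assuming a made-up two-entry option list rather than the real one, looks like:

/*
 * Toy x-macro in the style of BCH_REBALANCE_OPTS(); the option names here
 * are made up for illustration.
 */
#include <stdio.h>

#define TOY_OPTS()		\
	x(data_replicas)	\
	x(background_target)

struct toy_opts {
#define x(_name) unsigned _name; unsigned _name##_from_inode;
	TOY_OPTS()
#undef x
};

struct toy_rebalance {
#define x(_name) unsigned _name; unsigned _name##_from_inode;
	TOY_OPTS()
#undef x
};

/* Copy every option plus its _from_inode flag in one place: */
static struct toy_rebalance opts_to_rebalance(const struct toy_opts *opts)
{
	return (struct toy_rebalance) {
#define x(_name)						\
		._name			= opts->_name,		\
		._name##_from_inode	= opts->_name##_from_inode,
		TOY_OPTS()
#undef x
	};
}

int main(void)
{
	struct toy_opts o = { .data_replicas = 2, .background_target_from_inode = 1 };
	struct toy_rebalance r = opts_to_rebalance(&o);

	printf("%u %u\n", r.data_replicas, r.background_target_from_inode);	/* 2 1 */
	return 0;
}
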
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bool poisoned = bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned); + unsigned compression_type = bch2_compression_opt_to_type(r.background_compression); + unsigned csum_type = bch2_data_checksum_type_rb(c, r); + + bool incompressible = false, unwritten = false, ec = false; + unsigned durability = 0, durability_acct = 0, invalid = 0, min_durability = INT_MAX; + + scoped_guard(rcu) { + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned ptr_bit = 1; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + incompressible |= p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible; + unwritten |= p.ptr.unwritten; + + struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); + if (ca && !p.ptr.cached) { + if (!poisoned && + !btree && + p.crc.csum_type != csum_type) + r.need_rb |= BIT(BCH_REBALANCE_data_checksum); + + if (!poisoned && + p.crc.compression_type != compression_type) + r.need_rb |= BIT(BCH_REBALANCE_background_compression); + + if (!poisoned && + r.background_target && + !bch2_dev_in_target(c, p.ptr.dev, r.background_target)) { + r.need_rb |= BIT(BCH_REBALANCE_background_target); + r.ptrs_moving |= ptr_bit; + } + + if (ca->mi.state == BCH_MEMBER_STATE_failed) { + r.need_rb |= BIT(BCH_REBALANCE_data_replicas); + r.hipri = 1; + r.ptrs_moving |= ptr_bit; + } + + unsigned d = __extent_ptr_durability(ca, &p); + + durability_acct += d; + + if (ca->mi.state == BCH_MEMBER_STATE_failed) + d = 0; + + durability += d; + min_durability = min(min_durability, d); + + ec |= p.has_ec; + } + + invalid += p.ptr.dev == BCH_SB_MEMBER_INVALID; + + ptr_bit <<= 1; + } + } + + if (unwritten || incompressible) + r.need_rb &= ~BIT(BCH_REBALANCE_background_compression); + + if (unwritten) + r.need_rb &= ~BIT(BCH_REBALANCE_data_checksum); + + if (durability < r.data_replicas) { + r.need_rb |= BIT(BCH_REBALANCE_data_replicas); + r.hipri = 1; + } + + if (durability >= r.data_replicas + min_durability) + r.need_rb |= BIT(BCH_REBALANCE_data_replicas); + + if (!unwritten && r.erasure_code != ec) + r.need_rb |= BIT(BCH_REBALANCE_erasure_code); + + *need_update_invalid_devs = + min_t(int, durability_acct + invalid - r.data_replicas, invalid); + + /* Multiple pointers to BCH_SB_MEMBER_INVALID is an incompat feature: */ + if (*need_update_invalid_devs < 0 && + bch2_request_incompat_feature(c, bcachefs_metadata_version_rebalance_v2)) + *need_update_invalid_devs = 0; + + const struct bch_extent_rebalance_v2 *old = bch2_bkey_ptrs_rebalance_opts(c, ptrs); + if (old && !(old->need_rb & ~r.need_rb)) { + r.pending = old->pending; + if (r.hipri && !old->hipri) + r.pending = 0; + } + + bool should_have_rb = bkey_should_have_rb_opts(k, r); + + *ret = r; + + return (*need_update_invalid_devs || + should_have_rb != !!old || + (should_have_rb ? 
memcmp(old, &r, sizeof(r)) : old != NULL)) && + !bch2_request_incompat_feature(c, bcachefs_metadata_version_sb_field_extent_type_u64s); +} + +static int check_rebalance_scan_cookie(struct btree_trans *trans, u64 inum, bool *v) +{ + if (v && *v) + return 1; + + /* + * If opts need to be propagated to the extent, a scan cookie should be + * present: + */ + CLASS(btree_iter, iter)(trans, BTREE_ID_rebalance_scan, POS(0, inum), 0); + struct bkey_s_c k = bkey_try(bch2_btree_iter_peek_slot(&iter)); + + int ret = k.k->type == KEY_TYPE_cookie; + if (v) + *v = ret; + return ret; +} + +static int check_dev_rebalance_scan_cookie(struct btree_trans *trans, struct bkey_s_c k, + struct bch_devs_mask *v) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) + if (v && test_bit(ptr->dev, v->d)) + return 1; + + bkey_for_each_ptr(ptrs, ptr) { + int ret = check_rebalance_scan_cookie(trans, ptr->dev + 1, NULL); + if (ret < 0) + return ret; + if (ret) { + if (v) + __set_bit(ptr->dev, v->d); + return ret; + } + } + + return 0; +} + +static bool bkey_has_ec(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + + bkey_extent_entry_for_each(ptrs, entry) + if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) + return true; + return false; +} + +static int new_needs_rb_allowed(struct btree_trans *trans, + struct per_snapshot_io_opts *s, + struct bkey_s_c k, + enum set_needs_rebalance_ctx ctx, + unsigned opt_change_cookie, + const struct bch_extent_rebalance_v2 *old, + const struct bch_extent_rebalance_v2 *new, + unsigned new_need_rb) +{ + struct bch_fs *c = trans->c; + /* + * New need_rb - pointers that don't match the current io path options - + * are only allowed in certain situations: + * + * Propagating new options: from bch2_set_rebalance_needs_scan + * + * Foreground writes: background_compression and background_target are + * allowed + * + * Foreground writes: we may have raced with an option change: + * opt_change_cookie checks for this + * + * XXX: foreground writes should still match compression, + * foreground_target - figure out how to check for this + */ + if (ctx == SET_NEEDS_REBALANCE_opt_change || + ctx == SET_NEEDS_REBALANCE_opt_change_indirect) + return 0; + + if ((new_need_rb & BIT(BCH_REBALANCE_erasure_code)) && + !bkey_has_ec(c, k)) { + /* Foreground writes are not initially erasure coded - and we + * may crash before a stripe is created + */ + new_need_rb &= ~BIT(BCH_REBALANCE_erasure_code); + } + + if (ctx == SET_NEEDS_REBALANCE_foreground) { + new_need_rb &= ~(BIT(BCH_REBALANCE_background_compression)| + BIT(BCH_REBALANCE_background_target)); + + /* + * Foreground writes might end up degraded when a device is + * getting yanked: + * + * XXX: this is something we need to fix, but adding retries to + * the write path is something we have to do carefully. 
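
This is the core policy of new_needs_rb_allowed(): option-change scans may introduce any need_rb bit, foreground writes are excused from background_compression, background_target and (until degraded writes can be retried) data_replicas, and anything else is only expected when a scan cookie is pending. A standalone sketch of that decision, with the scan-cookie lookup, opt_change_cookie check and fsck reporting omitted and with toy enum names, might look like:

#include <stdbool.h>
#include <stdio.h>

enum rb_opt {
	RB_data_replicas,
	RB_background_compression,
	RB_background_target,
	RB_erasure_code,
};

enum ctx {
	CTX_opt_change,
	CTX_opt_change_indirect,
	CTX_foreground,
	CTX_other,
};

/*
 * Sketch of which newly-appearing need_rb bits are acceptable without a
 * pending scan cookie; mirrors the policy described in the comments above.
 */
static bool new_need_rb_expected(enum ctx ctx, unsigned new_need_rb)
{
	if (ctx == CTX_opt_change || ctx == CTX_opt_change_indirect)
		return true;		/* option scans may set anything */

	if (ctx == CTX_foreground)
		/* background work, and possibly degraded replication, are
		 * allowed to lag behind a foreground write: */
		new_need_rb &= ~((1U << RB_background_compression) |
				 (1U << RB_background_target) |
				 (1U << RB_data_replicas));

	return !new_need_rb;		/* anything left needs a scan cookie */
}

int main(void)
{
	printf("%d\n", new_need_rb_expected(CTX_foreground, 1U << RB_background_target));	/* 1 */
	printf("%d\n", new_need_rb_expected(CTX_other,      1U << RB_background_target));	/* 0 */
	return 0;
}
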
+ */ + new_need_rb &= ~BIT(BCH_REBALANCE_data_replicas); + if (!new_need_rb) + return 0; + + if (opt_change_cookie != atomic_read(&c->opt_change_cookie)) + return 0; + } + + /* + * Either the extent data or the extent io options (from + * bch_extent_rebalance_v2) should match the io_opts from the + * inode/filesystem, unless + * + * - There's a scan pending to propagate new options + * - It's an indirect extent: it may be referenced by inodes + * with inconsistent options + * + * For efficiency (so that we can cache checking for scan + * cookies), only check option consistency when we're called + * with snapshot_io_opts - don't bother when we're called from + * move_data_phys() -> get_io_opts_one() + * + * Note that we can cache the existence of a cookie, but not the + * non-existence, to avoid spurious false positives. + */ + int ret = check_rebalance_scan_cookie(trans, 0, s ? &s->fs_scan_cookie : NULL) ?: + check_rebalance_scan_cookie(trans, k.k->p.inode, s ? &s->inum_scan_cookie : NULL); + if (ret) + return min(ret, 0); + + if (new_need_rb == BIT(BCH_REBALANCE_data_replicas)) { + ret = check_dev_rebalance_scan_cookie(trans, k, s ? &s->dev_cookie : NULL); + if (ret) + return min(ret, 0); + } + + CLASS(printbuf, buf)(); + + prt_printf(&buf, "extent with incorrect/missing rebalance opts:\n"); + bch2_bkey_val_to_text(&buf, c, k); + prt_printf(&buf, "\nnew rebalance: "); + bch2_extent_rebalance_v2_to_text(&buf, c, new); + + const struct bch_extent_rebalance_v2 _old = {}; + if (!old) + old = &_old; + +#define x(_name) \ + if (new_need_rb & BIT(BCH_REBALANCE_##_name)) \ + prt_printf(&buf, "\n" #_name " %u != %u", old->_name, new->_name); + BCH_REBALANCE_OPTS() +#undef x + + fsck_err(trans, extent_io_opts_not_set, "%s", buf.buf); +fsck_err: + return ret; +} + +int bch2_bkey_set_needs_rebalance(struct btree_trans *trans, + struct per_snapshot_io_opts *snapshot_io_opts, + struct bch_inode_opts *opts, struct bkey_i *_k, enum set_needs_rebalance_ctx ctx, - u32 change_cookie) + u32 opt_change_cookie) { if (!bkey_extent_is_direct_data(&_k->k)) return 0; + struct bch_fs *c = trans->c; struct bkey_s k = bkey_i_to_s(_k); - struct bch_extent_rebalance *old = - (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(c, k.s_c); - if (bkey_should_have_rb_opts(c, opts, k.s_c)) { + int need_update_invalid_devs; + struct bch_extent_rebalance_v2 new; + + if (!bch2_bkey_needs_rebalance(c, k.s_c, opts, &need_update_invalid_devs, &new)) + return 0; + + struct bch_extent_rebalance_v2 *old = + (struct bch_extent_rebalance_v2 *) bch2_bkey_rebalance_opts(c, k.s_c); + unsigned new_need_rb = new.need_rb & ~(old ? old->need_rb : 0); + + if (unlikely(new_need_rb)) + try(new_needs_rb_allowed(trans, snapshot_io_opts, k.s_c, ctx, opt_change_cookie, + old, &new, new_need_rb)); + + if (bkey_should_have_rb_opts(k.s_c, new)) { if (!old) { old = bkey_val_end(k); k.k->u64s += sizeof(*old) / sizeof(u64); } - *old = io_opts_to_rebalance_opts(c, opts); - } else { - if (old) - extent_entry_drop(c, k, (union bch_extent_entry *) old); + *old = new; + } else if (old) + extent_entry_drop(c, k, (union bch_extent_entry *) old); + + if (unlikely(need_update_invalid_devs)) { + if (need_update_invalid_devs > 0) { + bch2_bkey_drop_ptrs(k, p, entry, + (p.ptr.dev == BCH_SB_MEMBER_INVALID && + need_update_invalid_devs && + need_update_invalid_devs--)); + } else { + need_update_invalid_devs = -need_update_invalid_devs; + + trans->extra_disk_res += (u64) need_update_invalid_devs * + (bkey_is_btree_ptr(k.k) ? 
btree_sectors(c) : k.k->size); + + while (need_update_invalid_devs--) { + union bch_extent_entry *end = bkey_val_end(k); + + end->ptr = (struct bch_extent_ptr) { + .type = BIT(BCH_EXTENT_ENTRY_ptr), + .dev = BCH_SB_MEMBER_INVALID, + }; + + _k->k.u64s++; + } + } } return 0; } int bch2_update_rebalance_opts(struct btree_trans *trans, - struct bch_inode_opts *io_opts, + struct per_snapshot_io_opts *snapshot_io_opts, + struct bch_inode_opts *opts, struct btree_iter *iter, + unsigned level, struct bkey_s_c k, enum set_needs_rebalance_ctx ctx) { - struct bch_fs *c = trans->c; - BUG_ON(iter->flags & BTREE_ITER_is_extents); BUG_ON(iter->flags & BTREE_ITER_filter_snapshots); if (!bkey_extent_is_direct_data(k.k)) return 0; - if (bkey_is_btree_ptr(k.k)) + struct bch_fs *c = trans->c; + int need_update_invalid_devs; + struct bch_extent_rebalance_v2 new; + + if (!bch2_bkey_needs_rebalance(c, k, opts, &need_update_invalid_devs, &new)) return 0; - const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(c, k); - struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, io_opts); + if (!level) { + struct bkey_i *n = errptr_try(bch2_trans_kmalloc(trans, bkey_bytes(k.k) + + sizeof(struct bch_extent_rebalance_v2) + + sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX)); + bkey_reassemble(n, k); - if (bkey_should_have_rb_opts(c, io_opts, k) - ? old && !memcmp(old, &new, sizeof(new)) - : !old) - return 0; + return bch2_bkey_set_needs_rebalance(trans, snapshot_io_opts, opts, n, ctx, 0) ?: + bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node); + } else { + CLASS(btree_node_iter, iter2)(trans, iter->btree_id, iter->pos, 0, level - 1, 0); + struct btree *b = errptr_try(bch2_btree_iter_peek_node(&iter2)); - struct bkey_i *n = errptr_try(bch2_trans_kmalloc(trans, bkey_bytes(k.k) + - sizeof(struct bch_extent_rebalance))); - bkey_reassemble(n, k); + struct bkey_i *n = + errptr_try(bch2_trans_kmalloc(trans, BKEY_BTREE_PTR_U64s_MAX * sizeof(u64))); + bkey_copy(n, &b->key); - return bch2_bkey_set_needs_rebalance(c, io_opts, n, ctx, 0) ?: - bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node); + return bch2_bkey_set_needs_rebalance(trans, snapshot_io_opts, opts, n, ctx, 0) ?: + bch2_btree_node_update_key(trans, &iter2, b, n, BCH_TRANS_COMMIT_no_enospc, false) ?: + bch_err_throw(c, transaction_restart_commit); + } } int bch2_bkey_get_io_opts(struct btree_trans *trans, @@ -340,7 +917,8 @@ int bch2_bkey_get_io_opts(struct btree_trans *trans, darray_push(&snapshot_opts->d, e); })); - snapshot_opts->cur_inum = k.k->p.inode; + snapshot_opts->cur_inum = k.k->p.inode; + snapshot_opts->inum_scan_cookie = false; return ret ?: bch_err_throw(c, transaction_restart_nested); } @@ -355,7 +933,7 @@ int bch2_bkey_get_io_opts(struct btree_trans *trans, *opts = snapshot_opts->fs_io_opts; } - const struct bch_extent_rebalance *old; + const struct bch_extent_rebalance_v2 *old; if (k.k->type == KEY_TYPE_reflink_v && (old = bch2_bkey_rebalance_opts(c, k))) { #define x(_name) \ @@ -371,8 +949,6 @@ int bch2_bkey_get_io_opts(struct btree_trans *trans, return 0; } -#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) - static const char * const bch2_rebalance_state_strs[] = { #define x(t) #t, BCH_REBALANCE_STATES() @@ -380,15 +956,22 @@ static const char * const bch2_rebalance_state_strs[] = { #undef x }; +#define REBALANCE_SCAN_COOKIE_device 32 +#define REBALANCE_SCAN_COOKIE_pending 2 +#define REBALANCE_SCAN_COOKIE_metadata 1 +#define REBALANCE_SCAN_COOKIE_fs 0 + static u64 rebalance_scan_encode(struct 
rebalance_scan s) { switch (s.type) { case REBALANCE_SCAN_fs: - return 0; + return REBALANCE_SCAN_COOKIE_fs; case REBALANCE_SCAN_metadata: - return 1; + return REBALANCE_SCAN_COOKIE_metadata; + case REBALANCE_SCAN_pending: + return REBALANCE_SCAN_COOKIE_pending; case REBALANCE_SCAN_device: - return s.dev + 32; + return REBALANCE_SCAN_COOKIE_device + s.dev; case REBALANCE_SCAN_inum: return s.inum; default: @@ -396,30 +979,30 @@ static u64 rebalance_scan_encode(struct rebalance_scan s) } } -static struct rebalance_scan rebalance_scan_decode(u64 v) +static struct rebalance_scan rebalance_scan_decode(struct bch_fs *c, u64 v) { - if (v == 0) - return (struct rebalance_scan) { .type = REBALANCE_SCAN_fs }; - if (v == 1) - return (struct rebalance_scan) { .type = REBALANCE_SCAN_metadata }; - if (v < BCACHEFS_ROOT_INO) + if (v >= BCACHEFS_ROOT_INO) + return (struct rebalance_scan) { .type = REBALANCE_SCAN_inum, .inum = v, }; + if (v >= REBALANCE_SCAN_COOKIE_device) return (struct rebalance_scan) { .type = REBALANCE_SCAN_device, - .dev = v - 32, - }; + .dev = v - REBALANCE_SCAN_COOKIE_device, + }; + if (v == REBALANCE_SCAN_COOKIE_pending) + return (struct rebalance_scan) { .type = REBALANCE_SCAN_pending }; + if (v == REBALANCE_SCAN_COOKIE_metadata) + return (struct rebalance_scan) { .type = REBALANCE_SCAN_metadata }; + if (v == REBALANCE_SCAN_COOKIE_fs) + return (struct rebalance_scan) { .type = REBALANCE_SCAN_fs}; - return (struct rebalance_scan) { - .type = REBALANCE_SCAN_inum, - .inum = v, - }; + bch_err(c, "unknown realance scan cookie %llu", v); + return (struct rebalance_scan) { .type = REBALANCE_SCAN_fs}; } int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, struct rebalance_scan s) { - CLASS(btree_iter, iter)(trans, BTREE_ID_rebalance_work, - SPOS(rebalance_scan_encode(s), - REBALANCE_WORK_SCAN_OFFSET, - U32_MAX), + CLASS(btree_iter, iter)(trans, BTREE_ID_rebalance_scan, + POS(0, rebalance_scan_encode(s)), BTREE_ITER_intent); struct bkey_s_c k = bkey_try(bch2_btree_iter_peek_slot(&iter)); @@ -436,41 +1019,59 @@ int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, struct rebala return bch2_trans_update(trans, &iter, &cookie->k_i, 0); } -int bch2_set_rebalance_needs_scan(struct bch_fs *c, struct rebalance_scan s) +int bch2_set_rebalance_needs_scan(struct bch_fs *c, struct rebalance_scan s, bool wakeup) { CLASS(btree_trans, trans)(c); - return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_set_rebalance_needs_scan_trans(trans, s)); + try(commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_set_rebalance_needs_scan_trans(trans, s))); + if (wakeup) + bch2_rebalance_wakeup(c); + return 0; } int bch2_set_fs_needs_rebalance(struct bch_fs *c) { return bch2_set_rebalance_needs_scan(c, - (struct rebalance_scan) { .type = REBALANCE_SCAN_fs }); + (struct rebalance_scan) { .type = REBALANCE_SCAN_fs }, + true); } -static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie) +static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, struct bpos pos, u64 cookie) { - CLASS(btree_iter, iter)(trans, BTREE_ID_rebalance_work, - SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), - BTREE_ITER_intent); - struct bkey_s_c k = bkey_try(bch2_btree_iter_peek_slot(&iter)); + return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + CLASS(btree_iter, iter)(trans, BTREE_ID_rebalance_scan, pos, BTREE_ITER_intent); + struct bkey_s_c k = bkey_try(bch2_btree_iter_peek_slot(&iter)); - u64 v = k.k->type == KEY_TYPE_cookie 
- ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie) - : 0; - - return v == cookie - ? bch2_btree_delete_at(trans, &iter, 0) - : 0; + u64 v = k.k->type == KEY_TYPE_cookie + ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie) + : 0; + v == cookie + ? bch2_btree_delete_at(trans, &iter, 0) + : 0; + })); } #define REBALANCE_WORK_BUF_NR 1024 -DEFINE_DARRAY_NAMED(darray_rebalance_work, struct bkey_i_cookie); +DEFINE_DARRAY_NAMED(darray_rebalance_work, struct bkey_i); -static struct bkey_i *next_rebalance_entry(struct btree_trans *trans, - darray_rebalance_work *buf, struct bpos *work_pos) +static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans, + darray_rebalance_work *buf, + enum btree_id btree, + struct bpos *work_pos) { + if (btree == BTREE_ID_rebalance_scan) { + buf->nr = 0; + + int ret = for_each_btree_key(trans, iter, btree, *work_pos, + BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({ + bkey_reassemble(&darray_top(*buf), k); + return bkey_i_to_s_c(&darray_top(*buf)); + 0; + })); + + return ret ? bkey_s_c_err(ret) : bkey_s_c_null; + } + if (unlikely(!buf->nr)) { /* * Avoid contention with write buffer flush: buffer up rebalance @@ -479,14 +1080,12 @@ static struct bkey_i *next_rebalance_entry(struct btree_trans *trans, BUG_ON(!buf->size);; - bch2_trans_begin(trans); - - for_each_btree_key(trans, iter, BTREE_ID_rebalance_work, *work_pos, + int ret = for_each_btree_key(trans, iter, btree, *work_pos, BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({ - /* we previously used darray_make_room */ BUG_ON(bkey_bytes(k.k) > sizeof(buf->data[0])); - bkey_reassemble(&darray_top(*buf).k_i, k); + /* we previously used darray_make_room */ + bkey_reassemble(&darray_top(*buf), k); buf->nr++; *work_pos = bpos_successor(iter.pos); @@ -494,9 +1093,11 @@ static struct bkey_i *next_rebalance_entry(struct btree_trans *trans, break; 0; })); + if (ret) + return bkey_s_c_err(ret); if (!buf->nr) - return NULL; + return bkey_s_c_null; unsigned l = 0, r = buf->nr - 1; while (l < r) { @@ -506,23 +1107,24 @@ static struct bkey_i *next_rebalance_entry(struct btree_trans *trans, } } - return &(&darray_pop(buf))->k_i; + return bkey_i_to_s_c(&darray_pop(buf)); } -static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) +static int extent_ec_pending(struct btree_trans *trans, struct bkey_ptrs_c ptrs) { struct bch_fs *c = trans->c; - if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(c, k)) - return 0; + guard(rcu)(); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); + if (!ca) + continue; - struct bkey_i *n = errptr_try(bch2_bkey_make_mut(trans, iter, &k, 0)); - - extent_entry_drop(c, bkey_i_to_s(n), - (void *) bch2_bkey_rebalance_opts(c, bkey_i_to_s_c(n))); - return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); + if (bch2_bucket_has_new_stripe(c, bucket_to_u64(bucket))) + return true; + } + return false; } static int rebalance_set_data_opts(struct btree_trans *trans, @@ -532,24 +1134,109 @@ static int rebalance_set_data_opts(struct btree_trans *trans, struct bch_inode_opts *opts, struct data_update_opts *data_opts) { - struct btree_iter *extent_iter = arg; struct bch_fs *c = trans->c; + const struct bch_extent_rebalance_v2 *r = bch2_bkey_rebalance_opts(c, k); + if (!r || !r->need_rb) /* Write buffer race? 
*/ + return 0; memset(data_opts, 0, sizeof(*data_opts)); data_opts->type = BCH_DATA_UPDATE_rebalance; - data_opts->ptrs_rewrite = bch2_bkey_ptrs_need_rebalance(c, opts, k); - data_opts->target = opts->background_target; - data_opts->write_flags |= BCH_WRITE_only_specified_devs; + if (!r->hipri) + data_opts->write_flags |= BCH_WRITE_only_specified_devs; + data_opts->target = r->background_target; - if (!data_opts->ptrs_rewrite) { - /* - * device we would want to write to offline? devices in target - * changed? - * - * We'll now need a full scan before this extent is picked up - * again: - */ - try(bch2_bkey_clear_needs_rebalance(trans, extent_iter, k)); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + unsigned csum_type = bch2_data_checksum_type_rb(c, *r); + unsigned compression_type = bch2_compression_opt_to_type(r->background_compression); + + if (r->need_rb & BIT(BCH_REBALANCE_data_replicas)) { + unsigned durability = bch2_bkey_durability(c, k); + unsigned ptr_bit = 1; + + guard(rcu)(); + if (durability <= r->data_replicas) { + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); + if (ca && !ptr->cached && !ca->mi.durability) + data_opts->ptrs_kill |= ptr_bit; + ptr_bit <<= 1; + } + + data_opts->extra_replicas = r->data_replicas - durability; + } else { + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + unsigned d = bch2_extent_ptr_durability(c, &p); + + if (d && durability - d >= r->data_replicas) { + data_opts->ptrs_kill |= ptr_bit; + durability -= d; + } + + ptr_bit <<= 1; + } + + ptr_bit = 1; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.has_ec && durability - p.ec.redundancy >= r->data_replicas) { + data_opts->ptrs_kill_ec |= ptr_bit; + durability -= p.ec.redundancy; + } + + ptr_bit <<= 1; + } + } + } + + if (r->need_rb & BIT(BCH_REBALANCE_erasure_code)) { + if (r->erasure_code) { + /* XXX: we'll need ratelimiting */ + if (extent_ec_pending(trans, ptrs)) + return false; + + data_opts->extra_replicas = r->data_replicas; + } else { + unsigned ptr_bit = 1; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.has_ec) { + data_opts->ptrs_kill_ec |= ptr_bit; + data_opts->extra_replicas += p.ec.redundancy; + } + + ptr_bit <<= 1; + } + } + } + + scoped_guard(rcu) { + unsigned ptr_bit = 1; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if ((r->need_rb & BIT(BCH_REBALANCE_data_checksum)) && + p.crc.csum_type != csum_type) + data_opts->ptrs_rewrite |= ptr_bit; + + if ((r->need_rb & BIT(BCH_REBALANCE_background_compression)) && + p.crc.compression_type != compression_type) + data_opts->ptrs_rewrite |= ptr_bit; + + if ((r->need_rb & BIT(BCH_REBALANCE_background_target)) && + !bch2_dev_in_target(c, p.ptr.dev, r->background_target)) + data_opts->ptrs_rewrite |= ptr_bit; + + ptr_bit <<= 1; + } + } + + if (!data_opts->ptrs_rewrite && + !data_opts->ptrs_kill && + !data_opts->ptrs_kill_ec && + !data_opts->extra_replicas) { + CLASS(printbuf, buf)(); + prt_printf(&buf, "got extent to rebalance but nothing to do, confused\n "); + bch2_bkey_val_to_text(&buf, c, k); + bch_err(c, "%s", buf.buf); return 0; } @@ -557,9 +1244,24 @@ static int rebalance_set_data_opts(struct btree_trans *trans, return 1; } -static int do_rebalance_extent(struct moving_context *ctxt, - struct per_snapshot_io_opts *snapshot_io_opts, - struct bpos work_pos) +static int bch2_extent_set_rb_pending(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bkey_i *n = 
errptr_try(bch2_bkey_make_mut(trans, iter, &k, 0)); + + struct bch_extent_rebalance_v2 *r = (struct bch_extent_rebalance_v2 *) + bch2_bkey_rebalance_opts(trans->c, bkey_i_to_s_c(n)); + BUG_ON(!r); + + r->pending = true; + + return bch2_trans_commit(trans, NULL, NULL, 0); +} + +static int __do_rebalance_extent(struct moving_context *ctxt, + struct per_snapshot_io_opts *snapshot_io_opts, + struct btree_iter *iter, struct bkey_s_c k) { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; @@ -568,17 +1270,22 @@ static int do_rebalance_extent(struct moving_context *ctxt, ctxt->stats = &c->rebalance.work_stats; c->rebalance.state = BCH_REBALANCE_working; - CLASS(btree_iter, iter)(trans, work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink, - work_pos, BTREE_ITER_all_snapshots); - struct bkey_s_c k = bkey_try(bch2_btree_iter_peek_slot(&iter)); - int ret = bch2_move_extent(ctxt, NULL, snapshot_io_opts, rebalance_set_data_opts, NULL, - &iter, 0, k); + iter, iter->min_depth, k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; if (bch2_err_matches(ret, EROFS)) return ret; + if (bch2_err_matches(ret, BCH_ERR_data_update_fail_no_rw_devs) || + bch2_err_matches(ret, BCH_ERR_insufficient_devices)) { + if (rb_work_btree(bch2_bkey_rebalance_opts(c, k)) != + BTREE_ID_rebalance_pending) + try(bch2_trans_relock(trans) ?: + bch2_extent_set_rb_pending(trans, iter, k)); + + return 0; + } if (ret) { WARN_ONCE(ret != -BCH_ERR_data_update_fail_no_snapshot && ret != -BCH_ERR_data_update_fail_no_rw_devs, @@ -594,8 +1301,64 @@ static int do_rebalance_extent(struct moving_context *ctxt, return 0; } +static int do_rebalance_extent(struct moving_context *ctxt, + struct per_snapshot_io_opts *snapshot_io_opts, + struct bpos work_pos) +{ + struct btree_trans *trans = ctxt->trans; + + CLASS(btree_iter, iter)(trans, + work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink, + work_pos, + BTREE_ITER_all_snapshots); + struct bkey_s_c k = bkey_try(bch2_btree_iter_peek_slot(&iter)); + + return __do_rebalance_extent(ctxt, snapshot_io_opts, &iter, k); +} + +noinline_for_stack +static int do_rebalance_btree(struct moving_context *ctxt, + struct per_snapshot_io_opts *snapshot_io_opts, + struct bkey_s_c_backpointer bp) +{ + struct btree_trans *trans = ctxt->trans; + + CLASS(btree_iter_uninit, iter)(trans); + struct bkey_s_c k = bkey_try(rebalance_bp_get_key(trans, &iter, bp)); + + if (!k.k) + return 0; + + return __do_rebalance_extent(ctxt, snapshot_io_opts, &iter, k); +} + +static int do_rebalance_scan_bp(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, + struct wb_maybe_flush *last_flushed) +{ + struct bch_fs *c = trans->c; + struct bch_fs_rebalance *r = &c->rebalance; + + CLASS(btree_iter_uninit, iter)(trans); + struct bkey_s_c k = bkey_try(bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, + last_flushed)); + if (!k.k) + return 0; + + atomic64_add(!bp.v->level ? 
k.k->size : c->opts.btree_node_size >> 9, + &r->scan_stats.sectors_seen); + + struct bch_inode_opts opts; + try(bch2_bkey_get_io_opts(trans, NULL, k, &opts)); + + return bch2_update_rebalance_opts(trans, NULL, &opts, &iter, bp.v->level, k, + SET_NEEDS_REBALANCE_opt_change); +} + static int do_rebalance_scan_indirect(struct btree_trans *trans, + struct disk_reservation *res, struct bkey_s_c_reflink_p p, + struct per_snapshot_io_opts *snapshot_io_opts, struct bch_inode_opts *opts) { u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad); @@ -606,10 +1369,12 @@ static int do_rebalance_scan_indirect(struct btree_trans *trans, POS(0, idx), BTREE_ITER_intent| BTREE_ITER_not_extents, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ if (bpos_ge(bkey_start_pos(k.k), POS(0, end))) break; - bch2_update_rebalance_opts(trans, opts, &iter, k, + + bch2_disk_reservation_put(trans->c, res); + bch2_update_rebalance_opts(trans, snapshot_io_opts, opts, &iter, 0, k, SET_NEEDS_REBALANCE_opt_change_indirect); }))); @@ -653,7 +1418,8 @@ static int do_rebalance_scan_btree(struct moving_context *ctxt, struct bch_inode_opts opts; ret = bch2_bkey_get_io_opts(trans, snapshot_io_opts, k, &opts) ?: - bch2_update_rebalance_opts(trans, &opts, &iter, k, SET_NEEDS_REBALANCE_opt_change); + bch2_update_rebalance_opts(trans, snapshot_io_opts, &opts, &iter, level, k, + SET_NEEDS_REBALANCE_opt_change); root_err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -669,6 +1435,7 @@ root_err: BTREE_ITER_prefetch| BTREE_ITER_not_extents| BTREE_ITER_all_snapshots); + CLASS(disk_reservation, res)(c); return for_each_btree_key_max_continue(trans, iter, end, 0, k, ({ ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); @@ -676,23 +1443,52 @@ root_err: atomic64_add(!level ? k.k->size : c->opts.btree_node_size >> 9, &r->scan_stats.sectors_seen); - struct bch_inode_opts opts; + bch2_disk_reservation_put(c, &res.r); + struct bch_inode_opts opts; bch2_bkey_get_io_opts(trans, snapshot_io_opts, k, &opts) ?: - bch2_update_rebalance_opts(trans, &opts, &iter, k, SET_NEEDS_REBALANCE_opt_change) ?: + bch2_update_rebalance_opts(trans, snapshot_io_opts, &opts, &iter, level, k, + SET_NEEDS_REBALANCE_opt_change) ?: (start.inode && k.k->type == KEY_TYPE_reflink_p && REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v) - ? do_rebalance_scan_indirect(trans, bkey_s_c_to_reflink_p(k), &opts) + ? 
do_rebalance_scan_indirect(trans, &res.r, bkey_s_c_to_reflink_p(k), + snapshot_io_opts, &opts) : 0) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + bch2_trans_commit(trans, &res.r, NULL, BCH_TRANS_COMMIT_no_enospc); })); } +static int do_rebalance_scan_fs(struct moving_context *ctxt, + struct per_snapshot_io_opts *snapshot_io_opts, + bool metadata) +{ + struct bch_fs *c = ctxt->trans->c; + struct bch_fs_rebalance *r = &c->rebalance; + + r->scan_start = BBPOS_MIN; + r->scan_end = BBPOS_MAX; + + for (enum btree_id btree = 0; btree < btree_id_nr_alive(c); btree++) { + if (!bch2_btree_id_root(c, btree)->b) + continue; + + bool scan_leaves = !metadata && + (btree == BTREE_ID_extents || + btree == BTREE_ID_reflink); + + for (unsigned level = !scan_leaves; level < BTREE_MAX_DEPTH; level++) + try(do_rebalance_scan_btree(ctxt, snapshot_io_opts, btree, level, + POS_MIN, SPOS_MAX)); + } + + return 0; +} + noinline_for_stack static int do_rebalance_scan(struct moving_context *ctxt, struct per_snapshot_io_opts *snapshot_io_opts, - u64 scan_v, u64 cookie, u64 *sectors_scanned) + struct bpos cookie_pos, u64 cookie, u64 *sectors_scanned) { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; @@ -703,19 +1499,34 @@ static int do_rebalance_scan(struct moving_context *ctxt, r->state = BCH_REBALANCE_scanning; - struct rebalance_scan s = rebalance_scan_decode(scan_v); + struct rebalance_scan s = rebalance_scan_decode(c, cookie_pos.offset); if (s.type == REBALANCE_SCAN_fs) { - r->scan_start = BBPOS_MIN; - r->scan_end = BBPOS_MAX; + try(do_rebalance_scan_fs(ctxt, snapshot_io_opts, false)); + } else if (s.type == REBALANCE_SCAN_metadata) { + try(do_rebalance_scan_fs(ctxt, snapshot_io_opts, true)); + } else if (s.type == REBALANCE_SCAN_device) { + r->scan_start = BBPOS(BTREE_ID_backpointers, POS(s.dev, 0)); + r->scan_end = BBPOS(BTREE_ID_backpointers, POS(s.dev, U64_MAX)); - for (enum btree_id btree = 0; btree < btree_id_nr_alive(c); btree++) { - if (btree != BTREE_ID_extents && - btree != BTREE_ID_reflink) + struct wb_maybe_flush last_flushed __cleanup(wb_maybe_flush_exit); + wb_maybe_flush_init(&last_flushed); + + bch2_btree_write_buffer_flush_sync(trans); + + CLASS(disk_reservation, res)(c); + + try(for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers, + POS(s.dev, 0), POS(s.dev, U64_MAX), + BTREE_ITER_prefetch, k, + &res.r, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); + + if (k.k->type != KEY_TYPE_backpointer) continue; - try(do_rebalance_scan_btree(ctxt, snapshot_io_opts, btree, 0, - POS_MIN, SPOS_MAX)); - } + bch2_disk_reservation_put(c, &res.r); + do_rebalance_scan_bp(trans, bkey_s_c_to_backpointer(k), &last_flushed); + }))); } else if (s.type == REBALANCE_SCAN_inum) { r->scan_start = BBPOS(BTREE_ID_extents, POS(s.inum, 0)); r->scan_end = BBPOS(BTREE_ID_extents, POS(s.inum, U64_MAX)); @@ -724,8 +1535,7 @@ static int do_rebalance_scan(struct moving_context *ctxt, r->scan_start.pos, r->scan_end.pos)); } - try(commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_clear_rebalance_needs_scan(trans, scan_v, cookie))); + try(bch2_clear_rebalance_needs_scan(trans, cookie_pos, cookie)); *sectors_scanned += atomic64_read(&r->scan_stats.sectors_seen); /* @@ -777,7 +1587,6 @@ static int do_rebalance(struct moving_context *ctxt) u32 kick = r->kick; int ret = 0; - struct bpos work_pos = POS_MIN; CLASS(darray_rebalance_work, work)(); try(darray_make_room(&work, REBALANCE_WORK_BUF_NR)); @@ -785,6 +1594,18 @@ static 
int do_rebalance(struct moving_context *ctxt) CLASS(per_snapshot_io_opts, snapshot_io_opts)(c); + static enum btree_id scan_btrees[] = { + BTREE_ID_rebalance_scan, + BTREE_ID_rebalance_hipri, + BTREE_ID_rebalance_work, + BTREE_ID_rebalance_pending, + }; + unsigned i = 0; + struct bpos work_pos = POS_MIN; + + struct bkey_i_cookie pending_cookie; + bkey_init(&pending_cookie.k); + while (!bch2_move_ratelimit(ctxt)) { if (!bch2_rebalance_enabled(c)) { bch2_moving_ctxt_flush_all(ctxt); @@ -794,21 +1615,62 @@ static int do_rebalance(struct moving_context *ctxt) break; } - struct bkey_i *k = next_rebalance_entry(trans, &work, &work_pos); - if (!k) - break; + if (kick != r->kick) { + kick = r->kick; + i = 0; + work_pos = POS_MIN; + work.nr = 0; + } - ret = k->k.type == KEY_TYPE_cookie - ? do_rebalance_scan(ctxt, &snapshot_io_opts, - k->k.p.inode, - le64_to_cpu(bkey_i_to_cookie(k)->v.cookie), - §ors_scanned) - : lockrestart_do(trans, - do_rebalance_extent(ctxt, &snapshot_io_opts, k->k.p)); + struct bkey_s_c k = next_rebalance_entry(trans, &work, scan_btrees[i], &work_pos); + ret = bkey_err(k); if (ret) break; + + if (!k.k) { + if (++i == ARRAY_SIZE(scan_btrees)) + break; + + work_pos = POS_MIN; + + if (scan_btrees[i] == BTREE_ID_rebalance_pending && + bkey_deleted(&pending_cookie.k)) + break; + continue; + } + + if (k.k->type == KEY_TYPE_cookie && + rebalance_scan_decode(c, k.k->p.offset).type == REBALANCE_SCAN_pending) + bkey_reassemble(&pending_cookie.k_i, k); + + if (k.k->type == KEY_TYPE_cookie) + ret = do_rebalance_scan(ctxt, &snapshot_io_opts, + k.k->p, + le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie), + §ors_scanned); + else if (k.k->type == KEY_TYPE_backpointer) + ret = do_rebalance_btree(ctxt, &snapshot_io_opts, + bkey_s_c_to_backpointer(k)); + else + ret = lockrestart_do(trans, + do_rebalance_extent(ctxt, &snapshot_io_opts, k.k->p)); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + ret = 0; + continue; + } + + if (ret) + break; + + if (scan_btrees[i] == BTREE_ID_rebalance_scan) + work_pos = bpos_successor(work_pos); } + if (!ret && !bkey_deleted(&pending_cookie.k)) + try(bch2_clear_rebalance_needs_scan(trans, + pending_cookie.k.p, pending_cookie.v.cookie)); + bch2_move_stats_exit(&r->work_stats, c); if (!ret && @@ -910,6 +1772,21 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) } } +void bch2_rebalance_scan_pending_to_text(struct printbuf *out, struct bch_fs *c) +{ + /* not the nicest place for this check */ + if (!test_bit(BCH_FS_btree_running, &c->flags)) + return; + + CLASS(btree_trans, trans)(c); + CLASS(btree_iter, iter)(trans, BTREE_ID_rebalance_scan, POS_MIN, 0); + + struct bkey_s_c k; + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + + prt_printf(out, "%u\n", iter.pos.inode == 0); +} + void bch2_rebalance_stop(struct bch_fs *c) { struct task_struct *p; @@ -986,96 +1863,275 @@ int bch2_fs_rebalance_init(struct bch_fs *c) return 0; } +/* need better helpers for iterating in parallel */ + +static int fix_rebalance_work_btree(struct btree_trans *trans, + enum btree_id want_set_in_btree, + struct bpos pos, + struct btree_iter *rb_iter) +{ + bool should_have_rebalance = rb_iter->btree_id == want_set_in_btree; + bool have_rebalance = rb_iter->k.type == KEY_TYPE_set; + + return should_have_rebalance != have_rebalance + ? 
bch2_btree_bit_mod_buffered(trans, rb_iter->btree_id, pos, should_have_rebalance) + : 0; +} + static int check_rebalance_work_one(struct btree_trans *trans, struct btree_iter *extent_iter, - struct btree_iter *rebalance_iter, - struct wb_maybe_flush *last_flushed) + struct btree_iter *rb_w, + struct btree_iter *rb_h, + struct btree_iter *rb_p, + struct per_snapshot_io_opts *snapshot_io_opts, + struct wb_maybe_flush *last_flushed, + struct bpos *cur_pos) { struct bch_fs *c = trans->c; - CLASS(printbuf, buf)(); int ret = 0; - struct bkey_s_c extent_k = bkey_try(bch2_btree_iter_peek(extent_iter)); - struct bkey_s_c rebalance_k = bkey_try(bch2_btree_iter_peek(rebalance_iter)); + bch2_btree_iter_set_pos(extent_iter, *cur_pos); + bch2_btree_iter_set_pos(rb_w, *cur_pos); + bch2_btree_iter_set_pos(rb_h, *cur_pos); + bch2_btree_iter_set_pos(rb_p, *cur_pos); - if (!extent_k.k && - extent_iter->btree_id == BTREE_ID_reflink && - (!rebalance_k.k || - rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) { - bch2_trans_iter_init(trans, extent_iter, - BTREE_ID_extents, POS_MIN, - BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots); - return bch_err_throw(c, transaction_restart_nested); - } + struct bkey_s_c extent_k = bkey_try(bch2_btree_iter_peek(extent_iter)); + bkey_try(bch2_btree_iter_peek(rb_w)); + bkey_try(bch2_btree_iter_peek(rb_h)); + bkey_try(bch2_btree_iter_peek(rb_p)); - if (!extent_k.k && !rebalance_k.k) - return 1; - - int cmp = bpos_cmp(extent_k.k ? extent_k.k->p : SPOS_MAX, - rebalance_k.k ? rebalance_k.k->p : SPOS_MAX); + struct bpos pos = bpos_min(bpos_min(bpos_min( + extent_iter->pos, rb_w->pos), rb_h->pos), rb_p->pos); struct bkey deleted; bkey_init(&deleted); + deleted.p = pos; - if (cmp < 0) { - deleted.p = extent_k.k->p; - rebalance_k.k = &deleted; - } else if (cmp > 0) { - deleted.p = rebalance_k.k->p; - extent_k.k = &deleted; + if (bpos_lt(pos, extent_iter->pos)) { + extent_k.k = &deleted; + extent_iter->k = deleted; + } + if (bpos_lt(pos, rb_w->pos)) + rb_w->k = deleted; + if (bpos_lt(pos, rb_h->pos)) + rb_h->k = deleted; + if (bpos_lt(pos, rb_p->pos)) + rb_p->k = deleted; + + if (extent_iter->btree_id == BTREE_ID_reflink && pos.inode >= BCACHEFS_ROOT_INO) { + bch2_trans_iter_init(trans, extent_iter, BTREE_ID_extents, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); + return bch_err_throw(c, transaction_restart_nested); } - bool should_have_rebalance = - bch2_bkey_sectors_need_rebalance(c, extent_k) != 0; - bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set; + *cur_pos = pos; + if (bpos_eq(*cur_pos, SPOS_MAX)) + return 0; - if (should_have_rebalance != have_rebalance) { + enum btree_id btree_want_set = rb_work_btree(bch2_bkey_rebalance_opts(c, extent_k)); + + u64 btrees_set = + (rb_w->k.type ? BIT_ULL(rb_w->btree_id) : 0)| + (rb_h->k.type ? BIT_ULL(rb_h->btree_id) : 0)| + (rb_p->k.type ? BIT_ULL(rb_p->btree_id) : 0); + + u64 btree_want_set_mask = btree_want_set ? 
BIT_ULL(btree_want_set) : 0; + if (btrees_set != btree_want_set_mask) { try(bch2_btree_write_buffer_maybe_flush(trans, extent_k, last_flushed)); - bch2_bkey_val_to_text(&buf, c, extent_k); + CLASS(printbuf, buf)(); + prt_str(&buf, "extent should be set in "); + if (btree_want_set) + bch2_btree_id_str(btree_want_set); + else + prt_str(&buf, "(none)"); + prt_printf(&buf, "\nbut set in: "); + bch2_prt_bitflags(&buf, __bch2_btree_ids, btrees_set); + prt_newline(&buf); + + bch2_bkey_val_to_text(&buf, trans->c, extent_k); + + if (fsck_err(trans, rebalance_work_incorrectly_set, "%s", buf.buf)) { + try(fix_rebalance_work_btree(trans, btree_want_set, *cur_pos, rb_w)); + try(fix_rebalance_work_btree(trans, btree_want_set, *cur_pos, rb_h)); + try(fix_rebalance_work_btree(trans, btree_want_set, *cur_pos, rb_p)); + } } - if (fsck_err_on(!should_have_rebalance && have_rebalance, - trans, rebalance_work_incorrectly_set, - "rebalance work incorrectly set\n%s", buf.buf)) - try(bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, extent_k.k->p, false)); + struct bch_inode_opts opts; - if (fsck_err_on(should_have_rebalance && !have_rebalance, - trans, rebalance_work_incorrectly_unset, - "rebalance work incorrectly unset\n%s", buf.buf)) - try(bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, extent_k.k->p, true)); - - try(bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc)); - - if (cmp <= 0) - bch2_btree_iter_advance(extent_iter); - if (cmp >= 0) - bch2_btree_iter_advance(rebalance_iter); + try(bch2_bkey_get_io_opts(trans, snapshot_io_opts, extent_k, &opts)); + try(bch2_update_rebalance_opts(trans, snapshot_io_opts, &opts, extent_iter, 0, extent_k, + SET_NEEDS_REBALANCE_other)); fsck_err: return ret; } +static int check_rebalance_work_btree_key(struct btree_trans *trans, + struct btree_iter *iter, struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + int ret = 0; + + struct bch_inode_opts opts; + try(bch2_bkey_get_io_opts(trans, NULL, k, &opts)); + try(bch2_update_rebalance_opts(trans, NULL, &opts, iter, iter->min_depth, k, + SET_NEEDS_REBALANCE_other)); + + enum btree_id rb_btree = rb_work_btree(bch2_bkey_rebalance_opts(c, k)); + u64 rb_idx = bch2_bkey_get_rebalance_bp(c, k); + + CLASS(printbuf, buf)(); + + if (fsck_err_on(rb_btree && !rb_idx, + trans, btree_ptr_with_no_rebalance_bp, + "btree ptr with no rebalance_bp\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + struct bkey_i *n = errptr_try(bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(struct bch_extent_rebalance_bp))); + + bkey_reassemble(n, k); + + try(rebalance_bp_add(trans, rb_btree, iter->btree_id, iter->min_depth, bkey_i_to_s(n), &rb_idx)); + bch2_bkey_set_rebalance_bp(c, bkey_i_to_s(n), rb_idx); + return 0; + } + + if (fsck_err_on(!rb_btree && rb_idx, + trans, btree_ptr_with_bad_rebalance_bp, + "btree ptr with bad rebalance_bp\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + struct bkey_i *n = errptr_try(bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(struct bch_extent_rebalance_bp))); + + bkey_reassemble(n, k); + bch2_bkey_set_rebalance_bp(c, bkey_i_to_s(n), 0); + + try(bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node)); + try(bch2_btree_delete(trans, BTREE_ID_rebalance_scan, POS(1, rb_idx), 0)); + return 0; + } + + if (rb_idx) { + CLASS(btree_iter, rb_iter)(trans, BTREE_ID_rebalance_scan, POS(1, rb_idx), BTREE_ITER_intent); + struct bkey_s_c bp_k = bkey_try(bch2_btree_iter_peek_slot(&rb_iter)); + + struct bch_backpointer bp = rb_bp(iter->btree_id, iter->min_depth, 
k); + + if (bp_k.k->type != KEY_TYPE_backpointer || memcmp(bp_k.v, &bp, sizeof(bp))) { + CLASS(printbuf, buf)(); + prt_printf(&buf, "btree ptr points to bad/missing rebalance bp\n"); + bch2_bkey_val_to_text(&buf, trans->c, k); + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, trans->c, bp_k); + + fsck_err(trans, btree_ptr_to_bad_rebalance_bp, "%s", buf.buf); + + if (bp_k.k->type != KEY_TYPE_backpointer) { + struct bkey_i_backpointer *new_bp = errptr_try(bch2_bkey_alloc(trans, &rb_iter, 0, backpointer)); + new_bp->v = bp; + } else { + try(bch2_bkey_get_empty_slot(trans, &rb_iter, BTREE_ID_rebalance_scan, + POS(1, 1), POS(1, U64_MAX))); + + struct bkey_i_backpointer *new_bp = errptr_try(bch2_bkey_alloc(trans, &rb_iter, 0, backpointer)); + new_bp->v = bp; + + struct bkey_i *n = errptr_try(bch2_bkey_make_mut(trans, iter, &k, 0)); + bch2_bkey_set_rebalance_bp(c, bkey_i_to_s(n), rb_iter.pos.offset); + } + } + } +fsck_err: + return ret; +} + +static int check_rebalance_work_btrees(struct btree_trans *trans, struct disk_reservation *res) +{ + struct bch_fs *c = trans->c; + + for (enum btree_id btree = 0; btree < btree_id_nr_alive(c); btree++) { + if (!bch2_btree_id_root(c, btree)->b) + continue; + + for (unsigned level = 1; level < BTREE_MAX_DEPTH; level++) { + CLASS(btree_node_iter, iter)(trans, btree, POS_MIN, 0, level, + BTREE_ITER_prefetch| + BTREE_ITER_not_extents| + BTREE_ITER_all_snapshots); + + try(for_each_btree_key_continue(trans, iter, 0, k, ({ + bch2_disk_reservation_put(c, res); + check_rebalance_work_btree_key(trans, &iter, k) ?: + bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc); + }))); + } + } + + return 0; +} + +static int check_rebalance_btree_bp(struct btree_trans *trans, struct bkey_s_c k) +{ + if (k.k->type != KEY_TYPE_backpointer) + return 0; + + CLASS(btree_iter_uninit, iter)(trans); + bkey_try(rebalance_bp_get_key(trans, &iter, bkey_s_c_to_backpointer(k))); + return 0; +} + +static int check_rebalance_btree_bps(struct btree_trans *trans) +{ + return for_each_btree_key_max(trans, iter, BTREE_ID_rebalance_scan, + POS(1, 0), POS(1, U64_MAX), + BTREE_ITER_prefetch, k, + check_rebalance_btree_bp(trans, k)); +} + int bch2_check_rebalance_work(struct bch_fs *c) { CLASS(btree_trans, trans)(c); CLASS(btree_iter, extent_iter)(trans, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_prefetch); - CLASS(btree_iter, rebalance_iter)(trans, BTREE_ID_rebalance_work, POS_MIN, - BTREE_ITER_prefetch); + BTREE_ITER_prefetch|BTREE_ITER_not_extents); + CLASS(btree_iter, rb_w)(trans, BTREE_ID_rebalance_work, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); + CLASS(btree_iter, rb_h)(trans, BTREE_ID_rebalance_hipri, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); + CLASS(btree_iter, rb_p)(trans, BTREE_ID_rebalance_pending, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); + + CLASS(per_snapshot_io_opts, snapshot_io_opts)(c); struct wb_maybe_flush last_flushed __cleanup(wb_maybe_flush_exit); wb_maybe_flush_init(&last_flushed); struct progress_indicator_state progress; - bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_rebalance_work)); + bch2_progress_init(&progress, c, + BIT_ULL(BTREE_ID_extents)| + BIT_ULL(BTREE_ID_reflink)); - int ret = 0; - while (!(ret = lockrestart_do(trans, - progress_update_iter(trans, &progress, &rebalance_iter) ?: - wb_maybe_flush_inc(&last_flushed) ?: - check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed)))) - ; + CLASS(disk_reservation, res)(c); - return min(ret, 0); + struct bpos cur_pos = POS_MIN; + + while 
(true) { + bch2_disk_reservation_put(c, &res.r); + + try(progress_update_iter(trans, &progress, &extent_iter) ?: + wb_maybe_flush_inc(&last_flushed) ?: + commit_do(trans, &res.r, NULL, BCH_TRANS_COMMIT_no_enospc, + check_rebalance_work_one(trans, &extent_iter, &rb_w, &rb_h, &rb_p, + &snapshot_io_opts, &last_flushed, &cur_pos))); + if (bpos_eq(cur_pos, SPOS_MAX)) + break; + + cur_pos = extent_iter.btree_id == BTREE_ID_reflink + ? bpos_nosnap_successor(cur_pos) + : bpos_successor(cur_pos); + } + + /* progress indicator for metadata? */ + try(check_rebalance_work_btrees(trans, &res.r)); + try(check_rebalance_btree_bps(trans)); + + return 0; } diff --git a/libbcachefs/data/rebalance.h b/libbcachefs/data/rebalance.h index 97755d67..1e3cbd84 100644 --- a/libbcachefs/data/rebalance.h +++ b/libbcachefs/data/rebalance.h @@ -6,33 +6,55 @@ #include "alloc/disk_groups.h" #include "rebalance_types.h" -static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c, +int bch2_extent_rebalance_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context, + const struct bch_extent_rebalance_v2 *); + +static inline struct bch_extent_rebalance_v2 io_opts_to_rebalance_opts(struct bch_fs *c, struct bch_inode_opts *opts) { - struct bch_extent_rebalance r = { - .type = BIT(BCH_EXTENT_ENTRY_rebalance), + return (struct bch_extent_rebalance_v2) { + .type = BIT(BCH_EXTENT_ENTRY_rebalance_v2), #define x(_name) \ ._name = opts->_name, \ ._name##_from_inode = opts->_name##_from_inode, BCH_REBALANCE_OPTS() #undef x }; - - if (r.background_target && - !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target)) - r.background_target = 0; - - return r; }; -void bch2_extent_rebalance_to_text(struct printbuf *, struct bch_fs *, - const struct bch_extent_rebalance *); +void bch2_extent_rebalance_v1_to_text(struct printbuf *, struct bch_fs *, + const struct bch_extent_rebalance_v1 *); +void bch2_extent_rebalance_v2_to_text(struct printbuf *, struct bch_fs *, + const struct bch_extent_rebalance_v2 *); -int bch2_trigger_extent_rebalance(struct btree_trans *, - struct bkey_s_c, struct bkey_s_c, - enum btree_iter_update_trigger_flags); +const struct bch_extent_rebalance_v2 *bch2_bkey_rebalance_opts(const struct bch_fs *, struct bkey_s_c); -u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); +int __bch2_trigger_extent_rebalance(struct btree_trans *, + enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, + const struct bch_extent_rebalance_v2 *, + const struct bch_extent_rebalance_v2 *, + enum btree_iter_update_trigger_flags); + +static inline unsigned rb_needs_trigger(const struct bch_extent_rebalance_v2 *r) +{ + return r ? r->need_rb|r->ptrs_moving : 0; +} + +static inline int bch2_trigger_extent_rebalance(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_s new, + enum btree_iter_update_trigger_flags flags) +{ + struct bch_fs *c = trans->c; + const struct bch_extent_rebalance_v2 *old_r = bch2_bkey_rebalance_opts(c, old); + const struct bch_extent_rebalance_v2 *new_r = bch2_bkey_rebalance_opts(c, new.s_c); + + return rb_needs_trigger(old_r) || rb_needs_trigger(new_r) + ? 
__bch2_trigger_extent_rebalance(trans, btree, level, old, new, old_r, new_r, flags) + : 0; +} enum set_needs_rebalance_ctx { SET_NEEDS_REBALANCE_opt_change, @@ -41,9 +63,6 @@ enum set_needs_rebalance_ctx { SET_NEEDS_REBALANCE_other, }; -int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_inode_opts *, - struct bkey_i *, enum set_needs_rebalance_ctx, u32); - /* Inodes in different snapshots may have different IO options: */ struct snapshot_io_opts_entry { u32 snapshot; @@ -53,6 +72,9 @@ struct snapshot_io_opts_entry { struct per_snapshot_io_opts { u64 cur_inum; bool metadata; + bool fs_scan_cookie; + bool inum_scan_cookie; + struct bch_devs_mask dev_cookie; struct bch_inode_opts fs_io_opts; DARRAY(struct snapshot_io_opts_entry) d; @@ -76,20 +98,27 @@ DEFINE_CLASS(per_snapshot_io_opts, struct per_snapshot_io_opts, per_snapshot_io_opts_init(c), struct bch_fs *c); -int bch2_update_rebalance_opts(struct btree_trans *, - struct bch_inode_opts *, - struct btree_iter *, - struct bkey_s_c, - enum set_needs_rebalance_ctx); - int bch2_bkey_get_io_opts(struct btree_trans *, struct per_snapshot_io_opts *, struct bkey_s_c, struct bch_inode_opts *opts); +int bch2_update_rebalance_opts(struct btree_trans *, + struct per_snapshot_io_opts *, + struct bch_inode_opts *, + struct btree_iter *, + unsigned level, + struct bkey_s_c, + enum set_needs_rebalance_ctx); + +int bch2_bkey_set_needs_rebalance(struct btree_trans *, + struct per_snapshot_io_opts *, struct bch_inode_opts *, + struct bkey_i *, enum set_needs_rebalance_ctx, u32); + struct rebalance_scan { enum rebalance_scan_type { REBALANCE_SCAN_fs, REBALANCE_SCAN_metadata, + REBALANCE_SCAN_pending, REBALANCE_SCAN_device, REBALANCE_SCAN_inum, } type; @@ -101,7 +130,7 @@ struct rebalance_scan { }; int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, struct rebalance_scan); -int bch2_set_rebalance_needs_scan(struct bch_fs *, struct rebalance_scan); +int bch2_set_rebalance_needs_scan(struct bch_fs *, struct rebalance_scan, bool); int bch2_set_fs_needs_rebalance(struct bch_fs *); static inline void bch2_rebalance_wakeup(struct bch_fs *c) @@ -114,6 +143,7 @@ static inline void bch2_rebalance_wakeup(struct bch_fs *c) } void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *); +void bch2_rebalance_scan_pending_to_text(struct printbuf *, struct bch_fs *); void bch2_rebalance_stop(struct bch_fs *); int bch2_rebalance_start(struct bch_fs *); diff --git a/libbcachefs/data/rebalance_format.h b/libbcachefs/data/rebalance_format.h index ff9a1342..0c60ec0c 100644 --- a/libbcachefs/data/rebalance_format.h +++ b/libbcachefs/data/rebalance_format.h @@ -2,52 +2,177 @@ #ifndef _BCACHEFS_REBALANCE_FORMAT_H #define _BCACHEFS_REBALANCE_FORMAT_H -struct bch_extent_rebalance { +/* + * rebalance on disk data structures: + * + * extents will contain a bch_extent_rebalance if they have background + * processing pending; additionally, indirect extents will always have a + * bch_extent_rebalance if they had any io path options set on the inode, since + * we don't (yet) have backpointers that would let us look up the "owning" inode + * of an indirect extent to recover the io path options. + * + * We also have 4 btrees for keeping track of pending rebalance work: + * + * BTREE_ID_rebalance_scan: + * Inum 0: + * Holds "scan cookies", which are created on option change to indicate that + * new options need to be propagated to each extent; this happens before the + * actual data processing. 
+ * + * A scan cookie may be for the entire filesystem, a specific device, or a + * specific inode. + * + * Inum 1: + * Btree nodes that need background processing cannot be tracked by the + * other rebalance btrees; instead they have backpointers + * (KEY_TYPE_backpointer) created here. + * + * This has the added benefit that btree nodes will be processed before + * regular data, which is beneficial if e.g. we're recovering from data + * being degraded. + * + * BTREE_ID_rebalance_work: + * The main "pending rebalance work" btree: it's a simple bitset btree where + * a set bit indicates that an an extent in BTREE_ID_extents or + * BTREE_ID_reflink needs to be processed. + * + * BTREE_ID_rebalance_hipri: + * If bch_extent_rebalance.hipri is set, the extent will be tracked here + * instead of BTREE_ID_rebalance_work and processed ahead of extents in + * BTREE_ID_rebalance_work; this is so that we can evacuate failed devices + * before other work. + * + * BTREE_ID_rebalance_pending: + * If we'd like to move an extent to a specific target, but can't because the + * target is full, we set bch_extent_rebalance.pending and switch to tracking + * it here; pending rebalance work is re-attempted on device resize, add, or + * label change. + */ + +struct bch_extent_rebalance_v1 { #if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:6, - unused:3, + __u64 type:6, + unused:3, - promote_target_from_inode:1, - erasure_code_from_inode:1, - data_checksum_from_inode:1, - background_compression_from_inode:1, - data_replicas_from_inode:1, - background_target_from_inode:1, + promote_target_from_inode:1, + erasure_code_from_inode:1, + data_checksum_from_inode:1, + background_compression_from_inode:1, + data_replicas_from_inode:1, + background_target_from_inode:1, - promote_target:16, - erasure_code:1, - data_checksum:4, - data_replicas:4, - background_compression:8, /* enum bch_compression_opt */ - background_target:16; + promote_target:16, + erasure_code:1, + data_checksum:4, + data_replicas:4, + background_compression:8, /* enum bch_compression_opt */ + background_target:16; #elif defined (__BIG_ENDIAN_BITFIELD) - __u64 background_target:16, - background_compression:8, - data_replicas:4, - data_checksum:4, - erasure_code:1, - promote_target:16, + __u64 background_target:16, + background_compression:8, + data_replicas:4, + data_checksum:4, + erasure_code:1, + promote_target:16, - background_target_from_inode:1, - data_replicas_from_inode:1, - background_compression_from_inode:1, - data_checksum_from_inode:1, - erasure_code_from_inode:1, - promote_target_from_inode:1, + background_target_from_inode:1, + data_replicas_from_inode:1, + background_compression_from_inode:1, + data_checksum_from_inode:1, + erasure_code_from_inode:1, + promote_target_from_inode:1, - unused:3, - type:6; + unused:3, + type:6; +#endif +}; + +struct bch_extent_rebalance_v2 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:8, + unused:2, + ptrs_moving:5, + hipri:1, + pending:1, + need_rb:5, + + data_replicas_from_inode:1, + data_checksum_from_inode:1, + erasure_code_from_inode:1, + background_compression_from_inode:1, + background_target_from_inode:1, + promote_target_from_inode:1, + + data_replicas:3, + data_checksum:4, + erasure_code:1, + background_compression:8, /* enum bch_compression_opt */ + background_target:10, + promote_target:10; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 promote_target:10, + background_target:10, + background_compression:8, + erasure_code:1, + data_checksum:4, + data_replicas:3, + + 
promote_target_from_inode:1, + background_target_from_inode:1, + background_compression_from_inode:1, + erasure_code_from_inode:1, + data_checksum_from_inode:1, + data_replicas_from_inode:1, + + need_rb:5, + pending:1, + hipri:1, + ptrs_moving:5, + unused:2, + type:8; +#endif +}; + +struct bch_extent_rebalance_bp { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:9, + idx:55; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 idx:55, + type:9; #endif }; /* subset of BCH_INODE_OPTS */ #define BCH_REBALANCE_OPTS() \ - x(data_checksum) \ - x(background_compression) \ x(data_replicas) \ - x(promote_target) \ + x(data_checksum) \ + x(erasure_code) \ + x(background_compression) \ x(background_target) \ - x(erasure_code) + x(promote_target) + +enum bch_rebalance_opts { +#define x(n) BCH_REBALANCE_##n, + BCH_REBALANCE_OPTS() +#undef x +}; + +#define BCH_REBALANCE_ACCOUNTING() \ + x(replicas, 0) \ + x(checksum, 1) \ + x(erasure_code, 2) \ + x(compression, 3) \ + x(target, 4) \ + x(high_priority, 5) \ + x(pending, 6) \ + +enum bch_rebalance_accounting_type { +#define x(t, n) BCH_REBALANCE_ACCOUNTING_##t = n, + BCH_REBALANCE_ACCOUNTING() +#undef x + BCH_REBALANCE_ACCOUNTING_NR, +}; #endif /* _BCACHEFS_REBALANCE_FORMAT_H */ diff --git a/libbcachefs/data/update.c b/libbcachefs/data/update.c index ad7c002c..35cf9c80 100644 --- a/libbcachefs/data/update.c +++ b/libbcachefs/data/update.c @@ -197,14 +197,14 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, insert = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + bkey_val_bytes(&new->k) + - sizeof(struct bch_extent_rebalance)); + sizeof(struct bch_extent_rebalance_v2)); ret = PTR_ERR_OR_ZERO(insert); if (ret) goto err; bkey_reassemble(insert, k); - new = bch2_trans_kmalloc(trans, bkey_bytes(&new->k)); + new = bch2_trans_kmalloc(trans, bkey_bytes(&new->k) + sizeof(struct bch_extent_rebalance_v2)); ret = PTR_ERR_OR_ZERO(new); if (ret) goto err; @@ -327,7 +327,18 @@ restart_drop_extra_replicas: bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, insert->k.p) ?: bch2_bkey_get_io_opts(trans, NULL, k, &opts) ?: - bch2_bkey_set_needs_rebalance(c, &opts, insert, + /* + * this set_needs_rebalance call is only for verifying + * that the data we just wrote was written correctly, + * otherwise we could fail to flag incorrectly written + * data due to needs_rb already being set on the + * existing extent + */ + bch2_bkey_set_needs_rebalance(trans, NULL, &opts, &new->k_i, + SET_NEEDS_REBALANCE_foreground, + m->op.opts.change_cookie) ?: + /* this is the real set_needs_rebalance() call */ + bch2_bkey_set_needs_rebalance(trans, NULL, &opts, insert, SET_NEEDS_REBALANCE_foreground, m->op.opts.change_cookie) ?: bch2_trans_update(trans, &iter, insert, @@ -451,7 +462,8 @@ static void data_update_trace(struct data_update *u, int ret) trace_data_update_no_io(c, buf.buf); } count_event(c, data_update_no_io); - } else if (ret != -BCH_ERR_data_update_fail_no_rw_devs) { + } else if (ret != -BCH_ERR_data_update_fail_no_rw_devs && + ret != -BCH_ERR_insufficient_devices) { if (trace_data_update_fail_enabled()) { CLASS(printbuf, buf)(); bch2_data_update_to_text(&buf, u); @@ -774,7 +786,13 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m) prt_printf(&buf, "\nret:\t%s\n", bch2_err_str(-BCH_ERR_data_update_fail_no_rw_devs)); trace_data_update_fail(c, buf.buf); } - count_event(c, data_update_fail); + + /* + * It's not counted as a failure because it'll end up on + * the rebalance pending list + * + * count_event(c, data_update_fail); + */ } 
return bch_err_throw(c, data_update_fail_no_rw_devs); diff --git a/libbcachefs/data/write.c b/libbcachefs/data/write.c index a49469f0..a6d16e0c 100644 --- a/libbcachefs/data/write.c +++ b/libbcachefs/data/write.c @@ -355,7 +355,7 @@ int bch2_extent_update(struct btree_trans *trans, bch2_inode_opts_get_inode(c, &inode, &opts); - try(bch2_bkey_set_needs_rebalance(c, &opts, k, + try(bch2_bkey_set_needs_rebalance(trans, NULL, &opts, k, SET_NEEDS_REBALANCE_foreground, change_cookie)); try(bch2_trans_update(trans, iter, k, 0)); @@ -390,6 +390,13 @@ static int bch2_write_index_default(struct bch_write_op *op) bch2_trans_begin(trans); k = bch2_keylist_front(keys); + + /* + * If we did a degraded write, bch2_bkey_set_needs_rebalance() will add + * pointers to BCH_SB_MEMBER_INVALID so the extent is accounted as + * degraded + */ + bch2_bkey_buf_realloc(&sk, k->k.u64s + 1 + BCH_REPLICAS_MAX); bch2_bkey_buf_copy(&sk, k); int ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &sk.k->k.p.snapshot); @@ -1227,8 +1234,15 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, return 0; } + /* + * If we did a degraded write, bch2_bkey_set_needs_rebalance() will add + * pointers to BCH_SB_MEMBER_INVALID so the extent is accounted as + * degraded + */ struct bkey_i *new = errptr_try(bch2_trans_kmalloc_nomemzero(trans, - bkey_bytes(k.k) + sizeof(struct bch_extent_rebalance))); + bkey_bytes(k.k) + + sizeof(struct bch_extent_rebalance_v2) + + sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX)); bkey_reassemble(new, k); bch2_cut_front(c, bkey_start_pos(&orig->k), new); @@ -1256,7 +1270,7 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, return bch2_extent_update_i_size_sectors(trans, iter, min(new->k.p.offset << 9, new_i_size), 0, &inode) ?: (bch2_inode_opts_get_inode(c, &inode, &opts), - bch2_bkey_set_needs_rebalance(c, &opts, new, + bch2_bkey_set_needs_rebalance(trans, NULL, &opts, new, SET_NEEDS_REBALANCE_foreground, op->opts.change_cookie)) ?: bch2_trans_update(trans, iter, new, @@ -1273,7 +1287,8 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, bkey_start_pos(&orig->k), orig->k.p, BTREE_ITER_intent, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + &op->res, NULL, + BCH_TRANS_COMMIT_no_enospc, ({ bch2_nocow_write_convert_one_unwritten(trans, &iter, op, orig, k, op->new_i_size); })); if (ret) diff --git a/libbcachefs/debug/sysfs.c b/libbcachefs/debug/sysfs.c index 02ef020b..a512306d 100644 --- a/libbcachefs/debug/sysfs.c +++ b/libbcachefs/debug/sysfs.c @@ -165,6 +165,7 @@ write_attribute(trigger_freelist_wakeup); write_attribute(trigger_recalc_capacity); write_attribute(trigger_delete_dead_snapshots); write_attribute(trigger_emergency_read_only); +write_attribute(trigger_check_inconsistent_replicas); read_attribute(gc_gens_pos); read_attribute(uuid); @@ -218,6 +219,7 @@ read_attribute(copy_gc_wait); sysfs_pd_controller_attribute(rebalance); read_attribute(rebalance_status); +read_attribute(rebalance_scan_pending); read_attribute(snapshot_delete_status); read_attribute(recovery_status); @@ -314,6 +316,51 @@ static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c) prt_printf(out, "reserved:\t\t%llu\n", b.reserved); } +static int bkey_check_inconsistent_replicas(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p, prev; + bool have_prev = 
false, have_inconsistent = false; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.crc.live_size != k.k->size) + continue; + + if (!have_prev) { + prev = p; + continue; + } + + have_inconsistent |= prev.crc.csum_type == p.crc.csum_type && + bch2_crc_cmp(prev.crc.csum, p.crc.csum); + } + + if (have_inconsistent) { + CLASS(printbuf, buf)(); + bch2_bkey_val_to_text(&buf, c, k); + pr_info("%s", buf.buf); + } + + return 0; +} + +static void bch2_check_inconsistent_replicas(struct bch_fs *c) +{ + CLASS(btree_trans, trans)(c); + for_each_btree_key(trans, iter, + BTREE_ID_extents, POS_MIN, + BTREE_ITER_all_snapshots, k, ({ + bkey_check_inconsistent_replicas(c, k); + })); + + for_each_btree_key(trans, iter, + BTREE_ID_reflink, POS_MIN, + BTREE_ITER_all_snapshots, k, ({ + bkey_check_inconsistent_replicas(c, k); + })); +} + SHOW(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); @@ -340,6 +387,9 @@ SHOW(bch2_fs) if (attr == &sysfs_rebalance_status) bch2_rebalance_status_to_text(out, c); + if (attr == &sysfs_rebalance_scan_pending) + bch2_rebalance_scan_pending_to_text(out, c); + if (attr == &sysfs_snapshot_delete_status) bch2_snapshot_delete_status_to_text(out, c); @@ -487,6 +537,9 @@ STORE(bch2_fs) printbuf_exit(&buf); } + if (attr == &sysfs_trigger_check_inconsistent_replicas) + bch2_check_inconsistent_replicas(c); + #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp __free(kfree) = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -517,6 +570,7 @@ struct attribute *bch2_fs_files[] = { &sysfs_btree_write_stats, &sysfs_rebalance_status, + &sysfs_rebalance_scan_pending, &sysfs_snapshot_delete_status, &sysfs_recovery_status, @@ -622,6 +676,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_recalc_capacity, &sysfs_trigger_delete_dead_snapshots, &sysfs_trigger_emergency_read_only, + &sysfs_trigger_check_inconsistent_replicas, &sysfs_gc_gens_pos, diff --git a/libbcachefs/fs/inode.h b/libbcachefs/fs/inode.h index d9b9d1b2..2f5b257e 100644 --- a/libbcachefs/fs/inode.h +++ b/libbcachefs/fs/inode.h @@ -294,7 +294,7 @@ int bch2_inode_set_casefold(struct btree_trans *, subvol_inum, #include "data/rebalance.h" -static inline struct bch_extent_rebalance +static inline struct bch_extent_rebalance_v2 bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode) { struct bch_inode_opts io_opts; diff --git a/libbcachefs/init/dev.c b/libbcachefs/init/dev.c index 511b8f5a..0ef499a8 100644 --- a/libbcachefs/init/dev.c +++ b/libbcachefs/init/dev.c @@ -543,6 +543,17 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, bch_notice(ca, "%s", bch2_member_states[new_state]); + bool do_rebalance_scan = + new_state == BCH_MEMBER_STATE_rw || + new_state == BCH_MEMBER_STATE_failed; + + struct rebalance_scan s = new_state == BCH_MEMBER_STATE_rw + ? 
(struct rebalance_scan) { .type = REBALANCE_SCAN_pending } + : (struct rebalance_scan) { .type = REBALANCE_SCAN_device, .dev = ca->dev_idx }; + + if (do_rebalance_scan) + try(bch2_set_rebalance_needs_scan(c, s, false)); + scoped_guard(mutex, &c->sb_lock) { struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); SET_BCH_MEMBER_STATE(m, new_state); @@ -552,7 +563,8 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, if (new_state == BCH_MEMBER_STATE_rw) __bch2_dev_read_write(c, ca); - bch2_rebalance_wakeup(c); + if (do_rebalance_scan) + try(bch2_set_rebalance_needs_scan(c, s, true)); return ret; } @@ -740,6 +752,9 @@ int bch2_dev_add(struct bch_fs *c, const char *path, struct printbuf *err) if (ret) goto err; + struct rebalance_scan s = { .type = REBALANCE_SCAN_pending }; + try(bch2_set_rebalance_needs_scan(c, s, false)); + scoped_guard(rwsem_write, &c->state_lock) { scoped_guard(mutex, &c->sb_lock) { SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true); @@ -824,6 +839,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path, struct printbuf *err) }; kobject_uevent_env(&ca->disk_sb.bdev->bd_device.kobj, KOBJ_CHANGE, envp); } + + try(bch2_set_rebalance_needs_scan(c, s, true)); out: bch_err_fn(c, ret); return ret; @@ -936,6 +953,11 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets, struct p return -EINVAL; } + bool wakeup_rebalance_pending = nbuckets > ca->mi.nbuckets; + struct rebalance_scan s = { .type = REBALANCE_SCAN_pending }; + if (wakeup_rebalance_pending) + try(bch2_set_rebalance_needs_scan(c, s, false)); + if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) { prt_printf(err, "New device size too big (%llu greater than max %u)\n", nbuckets, BCH_MEMBER_NBUCKETS_MAX); @@ -979,6 +1001,9 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets, struct p } bch2_recalc_capacity(c); + + if (wakeup_rebalance_pending) + try(bch2_set_rebalance_needs_scan(c, s, true)); return 0; } diff --git a/libbcachefs/init/fs.c b/libbcachefs/init/fs.c index 466ca787..ba84dc13 100644 --- a/libbcachefs/init/fs.c +++ b/libbcachefs/init/fs.c @@ -370,14 +370,12 @@ void bch2_fs_read_only(struct bch_fs *c) test_bit(BCH_FS_clean_shutdown, &c->flags) && c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) { BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); + BUG_ON(!c->sb.clean); BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty)); BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); BUG_ON(c->btree_write_buffer.inc.keys.nr); BUG_ON(c->btree_write_buffer.flushing.keys.nr); bch2_verify_accounting_clean(c); - - bch_verbose(c, "marking filesystem clean"); - bch2_fs_mark_clean(c); } else { /* Make sure error counts/counters are persisted */ guard(mutex)(&c->sb_lock); @@ -473,7 +471,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) try(bch2_fs_init_rw(c)); try(bch2_sb_members_v2_init(c)); - try(bch2_fs_mark_dirty(c)); clear_bit(BCH_FS_clean_shutdown, &c->flags); @@ -1052,7 +1049,6 @@ static int bch2_fs_init(struct bch_fs *c, struct bch_sb *sb, init_rwsem(&c->state_lock); mutex_init(&c->sb_lock); - mutex_init(&c->replicas_gc_lock); mutex_init(&c->btree_root_lock); INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); diff --git a/libbcachefs/journal/init.c b/libbcachefs/journal/init.c index f4b0f930..0dab3532 100644 --- a/libbcachefs/journal/init.c +++ b/libbcachefs/journal/init.c @@ -377,6 +377,7 @@ int bch2_fs_journal_start(struct journal *j, struct journal_start_info info) struct journal_replay *i, **_i; struct genradix_iter 
iter; bool had_entries = false; + int ret = 0; /* * @@ -445,12 +446,26 @@ int bch2_fs_journal_start(struct journal *j, struct journal_start_info info) if (journal_entry_empty(&i->j)) j->last_empty_seq = le64_to_cpu(i->j.seq); - struct bch_devs_list seq_devs = {}; - darray_for_each(i->ptrs, ptr) - seq_devs.data[seq_devs.nr++] = ptr->dev; + if (!info.clean) { + struct bch_devs_list seq_devs = {}; + darray_for_each(i->ptrs, ptr) + seq_devs.data[seq_devs.nr++] = ptr->dev; - p = journal_seq_pin(j, seq); - bch2_devlist_to_replicas(&p->devs.e, BCH_DATA_journal, seq_devs); + p = journal_seq_pin(j, seq); + bch2_devlist_to_replicas(&p->devs.e, BCH_DATA_journal, seq_devs); + + CLASS(printbuf, buf)(); + bch2_replicas_entry_to_text(&buf, &p->devs.e); + + fsck_err_on(!test_bit(JOURNAL_degraded, &j->flags) && + !bch2_replicas_marked(c, &p->devs.e), + c, journal_entry_replicas_not_marked, + "superblock not marked as containing replicas for journal entry %llu\n%s", + le64_to_cpu(i->j.seq), buf.buf); + + if (bch2_replicas_entry_get(c, &p->devs.e)) + p->devs.e.nr_devs = 0; + } had_entries = true; } @@ -464,7 +479,9 @@ int bch2_fs_journal_start(struct journal *j, struct journal_start_info info) c->last_bucket_seq_cleanup = journal_cur_seq(j); } - return 0; + try(bch2_replicas_gc_reffed(c)); +fsck_err: + return ret; } void bch2_journal_set_replay_done(struct journal *j) diff --git a/libbcachefs/journal/journal.c b/libbcachefs/journal/journal.c index a51027f2..410c29e4 100644 --- a/libbcachefs/journal/journal.c +++ b/libbcachefs/journal/journal.c @@ -358,7 +358,6 @@ static int journal_entry_open(struct journal *j) lockdep_assert_held(&j->lock); BUG_ON(journal_entry_is_open(j)); - BUG_ON(c->sb.clean); if (j->blocked) return bch_err_throw(c, journal_blocked); @@ -435,7 +434,8 @@ static int journal_entry_open(struct journal *j) bkey_extent_init(&buf->key); buf->noflush = false; - buf->must_flush = false; + /* if filesystem is clean, the first journal write must be a flush */ + buf->must_flush = c->sb.clean; buf->separate_flush = false; buf->flush_time = 0; buf->need_flush_to_write_buffer = true; @@ -1097,6 +1097,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "last_seq:\t%llu\n", j->last_seq); prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); + prt_printf(out, "last_empty_seq:\t%llu\n", j->last_empty_seq); prt_printf(out, "watermark:\t%s\n", bch2_watermarks[j->watermark]); prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); diff --git a/libbcachefs/journal/read.c b/libbcachefs/journal/read.c index 7bf6dd7a..a506d942 100644 --- a/libbcachefs/journal/read.c +++ b/libbcachefs/journal/read.c @@ -1351,7 +1351,7 @@ int bch2_journal_read(struct bch_fs *c, struct journal_start_info *info) struct journal_list jlist; struct journal_replay *i, **_i; struct genradix_iter radix_iter; - bool degraded = false, last_write_torn = false; + bool last_write_torn = false; u64 seq; int ret = 0; @@ -1376,7 +1376,7 @@ int bch2_journal_read(struct bch_fs *c, struct journal_start_info *info) system_unbound_wq, &jlist.cl); else - degraded = true; + set_bit(JOURNAL_degraded, &c->journal.flags); } while (closure_sync_timeout(&jlist.cl, sysctl_hung_task_timeout_secs * HZ / 2)) @@ -1514,17 +1514,6 @@ int bch2_journal_read(struct bch_fs *c, struct journal_start_info *info) replicas_entry_add_dev(&replicas.e, ptr->dev); 
bch2_replicas_entry_sort(&replicas.e); - - CLASS(printbuf, buf)(); - bch2_replicas_entry_to_text(&buf, &replicas.e); - - if (!degraded && - !bch2_replicas_marked(c, &replicas.e) && - (le64_to_cpu(i->j.seq) == info->seq_read_start || - fsck_err(c, journal_entry_replicas_not_marked, - "superblock not marked as containing replicas for journal entry %llu\n%s", - le64_to_cpu(i->j.seq), buf.buf))) - try(bch2_mark_replicas(c, &replicas.e)); } fsck_err: return ret; diff --git a/libbcachefs/journal/reclaim.c b/libbcachefs/journal/reclaim.c index a0fade1a..f3a56341 100644 --- a/libbcachefs/journal/reclaim.c +++ b/libbcachefs/journal/reclaim.c @@ -346,25 +346,47 @@ void bch2_journal_update_last_seq(struct journal *j) } } -void bch2_journal_update_last_seq_ondisk(struct journal *j, u64 last_seq_ondisk) +void bch2_journal_update_last_seq_ondisk(struct journal *j, u64 last_seq_ondisk, + bool clean) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + union bch_replicas_padded replicas; + unsigned nr_refs = 0; size_t dirty_entry_bytes = 0; - scoped_guard(mutex, &j->last_seq_ondisk_lock) - while (j->last_seq_ondisk < last_seq_ondisk) { - struct journal_entry_pin_list *pin_list = journal_seq_pin(j, j->last_seq_ondisk); + scoped_guard(mutex, &j->last_seq_ondisk_lock) { + for (u64 seq = j->last_seq_ondisk; + seq < (clean ? j->pin.back : last_seq_ondisk); + seq++) { + struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); + + if (pin_list->devs.e.nr_devs) { + if (nr_refs && + !bch2_replicas_entry_eq(&replicas.e, &pin_list->devs.e)) { + bch2_replicas_entry_put_many(c, &replicas.e, nr_refs); + nr_refs = 0; + } + + memcpy(&replicas, &pin_list->devs, replicas_entry_bytes(&pin_list->devs.e)); + pin_list->devs.e.nr_devs = 0; + nr_refs++; + } dirty_entry_bytes += pin_list->bytes; pin_list->bytes = 0; - - j->last_seq_ondisk++; } + j->last_seq_ondisk = last_seq_ondisk; + } + scoped_guard(spinlock, &j->lock) { if (WARN_ON(j->dirty_entry_bytes < dirty_entry_bytes)) dirty_entry_bytes = j->dirty_entry_bytes; j->dirty_entry_bytes -= dirty_entry_bytes; } + + if (nr_refs) + bch2_replicas_entry_put_many(c, &replicas.e, nr_refs); } bool __bch2_journal_pin_put(struct journal *j, u64 seq) @@ -975,39 +997,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) try(bch2_journal_error(j)); - guard(mutex)(&c->replicas_gc_lock); - bch2_replicas_gc_start(c, 1 << BCH_DATA_journal); - - /* - * Now that we've populated replicas_gc, write to the journal to mark - * active journal devices. This handles the case where the journal might - * be empty. Otherwise we could clear all journal replicas and - * temporarily put the fs into an unrecoverable state. Journal recovery - * expects to find devices marked for journal data on unclean mount. 
- */ - int ret = bch2_journal_meta(&c->journal); - if (ret) - goto err; - - seq = 0; - scoped_guard(spinlock, &j->lock) - while (!ret) { - seq = max(seq, j->last_seq); - if (seq > j->seq_ondisk) - break; - - union bch_replicas_padded replicas; - memcpy(&replicas, &journal_seq_pin(j, seq)->devs, sizeof(replicas)); - seq++; - - if (replicas.e.nr_devs) { - spin_unlock(&j->lock); - ret = bch2_mark_replicas(c, &replicas.e); - spin_lock(&j->lock); - } - } -err: - return bch2_replicas_gc_end(c, ret); + return 0; } bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) diff --git a/libbcachefs/journal/reclaim.h b/libbcachefs/journal/reclaim.h index e1956ba9..29bc1e86 100644 --- a/libbcachefs/journal/reclaim.h +++ b/libbcachefs/journal/reclaim.h @@ -44,7 +44,7 @@ journal_seq_pin(struct journal *j, u64 seq) } void bch2_journal_update_last_seq(struct journal *); -void bch2_journal_update_last_seq_ondisk(struct journal *, u64); +void bch2_journal_update_last_seq_ondisk(struct journal *, u64, bool); bool __bch2_journal_pin_put(struct journal *, u64); void bch2_journal_pin_put(struct journal *, u64); diff --git a/libbcachefs/journal/types.h b/libbcachefs/journal/types.h index b149553b..599f6907 100644 --- a/libbcachefs/journal/types.h +++ b/libbcachefs/journal/types.h @@ -150,6 +150,7 @@ enum journal_space_from { }; #define JOURNAL_FLAGS() \ + x(degraded) \ x(replay_done) \ x(running) \ x(may_skip_flush) \ diff --git a/libbcachefs/journal/write.c b/libbcachefs/journal/write.c index f1f707ce..c447ca83 100644 --- a/libbcachefs/journal/write.c +++ b/libbcachefs/journal/write.c @@ -196,10 +196,17 @@ static CLOSURE_CALLBACK(journal_write_done) ? j->flush_write_time : j->noflush_write_time, j->write_start_time); + struct bch_replicas_entry_v1 *r = &journal_seq_pin(j, seq)->devs.e; if (w->had_error) { - struct bch_replicas_entry_v1 *r = &journal_seq_pin(j, seq)->devs.e; + bch2_replicas_entry_put(c, r); + r->nr_devs = 0; + } + if (!r->nr_devs && !w->empty) { bch2_devlist_to_replicas(r, BCH_DATA_journal, w->devs_written); + err = bch2_replicas_entry_get(c, r); + if (err) + r->nr_devs = 0; } if (!w->devs_written.nr) @@ -261,7 +268,7 @@ again: * properly - when the flush completes replcias * refs need to have been dropped * */ - bch2_journal_update_last_seq_ondisk(j, w->last_seq); + bch2_journal_update_last_seq_ondisk(j, w->last_seq, w->empty); last_seq_ondisk_updated = true; spin_lock(&j->lock); goto again; @@ -657,7 +664,6 @@ CLOSURE_CALLBACK(bch2_journal_write) unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]); int ret; - BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); BUG_ON(!w->write_started); BUG_ON(w->write_allocated); BUG_ON(w->write_done); @@ -718,15 +724,24 @@ CLOSURE_CALLBACK(bch2_journal_write) w->devs_written = bch2_bkey_devs(c, bkey_i_to_s_c(&w->key)); - /* - * Mark journal replicas before we submit the write to guarantee - * recovery will find the journal entries after a crash. - */ - struct bch_replicas_entry_v1 *r = &journal_seq_pin(j, le64_to_cpu(w->data->seq))->devs.e; - bch2_devlist_to_replicas(r, BCH_DATA_journal, w->devs_written); - ret = bch2_mark_replicas(c, r); - if (ret) - goto err; + if (!c->sb.clean) { + /* + * Mark journal replicas before we submit the write to guarantee + * recovery will find the journal entries after a crash. 
+ * + * If the filesystem is clean, we have to defer this until after + * the write completes, so the filesystem isn't marked dirty + * before anything is in the journal: + */ + struct bch_replicas_entry_v1 *r = &journal_seq_pin(j, le64_to_cpu(w->data->seq))->devs.e; + bch2_devlist_to_replicas(r, BCH_DATA_journal, w->devs_written); + + ret = bch2_replicas_entry_get(c, r); + if (ret) { + r->nr_devs = 0; + goto err; + } + } if (c->opts.nochanges) goto no_io; diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index adf7d6f6..4c3e7a5a 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -108,6 +108,11 @@ static const char * const __bch2_fs_usage_types[] = { NULL }; +const char * const __bch2_rebalance_accounting_types[] = { + BCH_REBALANCE_ACCOUNTING() + NULL +}; + #undef x static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[], @@ -132,6 +137,7 @@ PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt); PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type); PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type); PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type); +PRT_STR_OPT_BOUNDSCHECKED(rebalance_accounting_type, enum bch_rebalance_accounting_type); static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, struct printbuf *err) @@ -525,7 +531,8 @@ void bch2_opts_to_text(struct printbuf *out, } } -static int opt_hook_io(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum bch_opt_id id, bool post) +static int opt_hook_io(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum bch_opt_id id, + u64 v, bool post) { if (!test_bit(BCH_FS_started, &c->flags)) return 0; @@ -544,11 +551,23 @@ static int opt_hook_io(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum bch_ .inum = inum, }; - try(bch2_set_rebalance_needs_scan(c, s)); - if (post) - bch2_rebalance_wakeup(c); + try(bch2_set_rebalance_needs_scan(c, s, post)); break; } + case Opt_metadata_target: + case Opt_metadata_checksum: + case Opt_metadata_replicas: + try(bch2_set_rebalance_needs_scan(c, + (struct rebalance_scan) { .type = REBALANCE_SCAN_metadata, .dev = inum }, post)); + break; + case Opt_durability: + if (!post && v > ca->mi.durability) + try(bch2_set_rebalance_needs_scan(c, + (struct rebalance_scan) { .type = REBALANCE_SCAN_pending}, post)); + + try(bch2_set_rebalance_needs_scan(c, + (struct rebalance_scan) { .type = REBALANCE_SCAN_device, .dev = inum }, post)); + break; default: break; } @@ -578,7 +597,7 @@ int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum b } if (change) - try(opt_hook_io(c, ca, inum, id, false)); + try(opt_hook_io(c, ca, inum, id, v, false)); return 0; } @@ -594,7 +613,7 @@ int bch2_opts_hooks_pre_set(struct bch_fs *c) void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum bch_opt_id id, u64 v) { - opt_hook_io(c, ca, inum, id, true); + opt_hook_io(c, ca, inum, id, v, true); switch (id) { case Opt_rebalance_enabled: diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 43ae8e21..94ebac75 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -25,6 +25,7 @@ extern const char * const __bch2_str_hash_types[]; extern const char * const bch2_str_hash_opts[]; extern const char * const __bch2_data_types[]; extern const char * const bch2_member_states[]; +extern const char * const __bch2_rebalance_accounting_types[]; extern const char * const bch2_d_types[]; void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type); @@ -34,6 +35,7 @@ void 
bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt); void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type); void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type); void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type); +void bch2_prt_rebalance_accounting_type(struct printbuf *, enum bch_rebalance_accounting_type); static inline const char *bch2_d_type_str(unsigned d_type) { diff --git a/libbcachefs/sb/clean.c b/libbcachefs/sb/clean.c index 18a350bc..1c36d6dd 100644 --- a/libbcachefs/sb/clean.c +++ b/libbcachefs/sb/clean.c @@ -256,18 +256,10 @@ const struct bch_sb_field_ops bch_sb_field_ops_clean = { .to_text = bch2_sb_clean_to_text, }; -int bch2_fs_mark_dirty(struct bch_fs *c) +void bch2_fs_mark_dirty(struct bch_fs *c) { - /* - * Unconditionally write superblock, to verify it hasn't changed before - * we go rw: - */ - - guard(mutex)(&c->sb_lock); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); - - return bch2_write_super(c); } void bch2_fs_mark_clean(struct bch_fs *c) @@ -277,7 +269,6 @@ void bch2_fs_mark_clean(struct bch_fs *c) unsigned u64s; int ret; - guard(mutex)(&c->sb_lock); if (BCH_SB_CLEAN(c->disk_sb.sb)) return; @@ -321,6 +312,4 @@ void bch2_fs_mark_clean(struct bch_fs *c) } bch2_journal_pos_from_member_info_set(c); - - bch2_write_super(c); } diff --git a/libbcachefs/sb/clean.h b/libbcachefs/sb/clean.h index 71caef28..6d811f12 100644 --- a/libbcachefs/sb/clean.h +++ b/libbcachefs/sb/clean.h @@ -10,7 +10,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry ** extern const struct bch_sb_field_ops bch_sb_field_ops_clean; -int bch2_fs_mark_dirty(struct bch_fs *); +void bch2_fs_mark_dirty(struct bch_fs *); void bch2_fs_mark_clean(struct bch_fs *); #endif /* _BCACHEFS_SB_CLEAN_H */ diff --git a/libbcachefs/sb/downgrade.c b/libbcachefs/sb/downgrade.c index 1abb011f..0a04a620 100644 --- a/libbcachefs/sb/downgrade.c +++ b/libbcachefs/sb/downgrade.c @@ -110,7 +110,16 @@ BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set)\ x(btree_node_accounting, \ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch) + BCH_FSCK_ERR_accounting_mismatch) \ + x(rebalance_v2, \ + BIT_ULL(BCH_RECOVERY_PASS_check_rebalance_work), \ + BCH_FSCK_ERR_accounting_mismatch, \ + BCH_FSCK_ERR_extent_io_opts_not_set) + +#define UPGRADE_TABLE_INCOMPAT() \ + x(rebalance_v2, \ + BIT_ULL(BCH_RECOVERY_PASS_check_rebalance_work), \ + BCH_FSCK_ERR_extent_io_opts_not_set) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ @@ -175,17 +184,32 @@ struct upgrade_downgrade_entry { UPGRADE_TABLE() #undef x +#define x(ver, passes, ...) static const u16 upgrade_incompat_##ver##_errors[] = { __VA_ARGS__ }; +UPGRADE_TABLE_INCOMPAT() +#undef x + static const struct upgrade_downgrade_entry upgrade_table[] = { -#define x(ver, passes, ...) { \ - .recovery_passes = passes, \ - .version = bcachefs_metadata_version_##ver,\ - .nr_errors = ARRAY_SIZE(upgrade_##ver##_errors), \ - .errors = upgrade_##ver##_errors, \ +#define x(ver, passes, ...) { \ + .recovery_passes = passes, \ + .version = bcachefs_metadata_version_##ver, \ + .nr_errors = ARRAY_SIZE(upgrade_##ver##_errors), \ + .errors = upgrade_##ver##_errors, \ }, UPGRADE_TABLE() #undef x }; +static const struct upgrade_downgrade_entry upgrade_table_incompat[] = { +#define x(ver, passes, ...) 
{ \ + .recovery_passes = passes, \ + .version = bcachefs_metadata_version_##ver, \ + .nr_errors = ARRAY_SIZE(upgrade_incompat_##ver##_errors), \ + .errors = upgrade_incompat_##ver##_errors, \ +}, +UPGRADE_TABLE_INCOMPAT() +#undef x +}; + static int have_stripes(struct bch_fs *c) { if (IS_ERR_OR_NULL(c->btree_roots_known[BTREE_ID_stripes].b)) @@ -219,17 +243,17 @@ int bch2_sb_set_upgrade_extra(struct bch_fs *c) return ret < 0 ? ret : 0; } -void bch2_sb_set_upgrade(struct bch_fs *c, - unsigned old_version, - unsigned new_version) +static void __bch2_sb_set_upgrade(struct bch_fs *c, + unsigned old_version, + unsigned new_version, + const struct upgrade_downgrade_entry *table, + size_t nr_entries) { lockdep_assert_held(&c->sb_lock); struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - for (const struct upgrade_downgrade_entry *i = upgrade_table; - i < upgrade_table + ARRAY_SIZE(upgrade_table); - i++) + for (const struct upgrade_downgrade_entry *i = table; i < table + nr_entries; i++) if (i->version > old_version && i->version <= new_version) { u64 passes = i->recovery_passes; @@ -245,6 +269,24 @@ void bch2_sb_set_upgrade(struct bch_fs *c, } } +void bch2_sb_set_upgrade(struct bch_fs *c, + unsigned old_version, + unsigned new_version) +{ + return __bch2_sb_set_upgrade(c, old_version, new_version, + upgrade_table, + ARRAY_SIZE(upgrade_table)); +} + +void bch2_sb_set_upgrade_incompat(struct bch_fs *c, + unsigned old_version, + unsigned new_version) +{ + return __bch2_sb_set_upgrade(c, old_version, new_version, + upgrade_table_incompat, + ARRAY_SIZE(upgrade_table_incompat)); +} + #define x(ver, passes, ...) static const u16 downgrade_##ver##_errors[] = { __VA_ARGS__ }; DOWNGRADE_TABLE() #undef x diff --git a/libbcachefs/sb/downgrade.h b/libbcachefs/sb/downgrade.h index 095b7cc9..f0d77eb2 100644 --- a/libbcachefs/sb/downgrade.h +++ b/libbcachefs/sb/downgrade.h @@ -6,6 +6,7 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade; int bch2_sb_downgrade_update(struct bch_fs *); void bch2_sb_set_upgrade(struct bch_fs *, unsigned, unsigned); +void bch2_sb_set_upgrade_incompat(struct bch_fs *, unsigned, unsigned); int bch2_sb_set_upgrade_extra(struct bch_fs *); void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned); diff --git a/libbcachefs/sb/errors_format.h b/libbcachefs/sb/errors_format.h index 42070f99..30e8789d 100644 --- a/libbcachefs/sb/errors_format.h +++ b/libbcachefs/sb/errors_format.h @@ -160,6 +160,10 @@ enum bch_fsck_flags { x(extent_ptrs_redundant_stripe, 139, 0) \ x(extent_ptrs_unwritten, 140, 0) \ x(extent_ptrs_written_and_unwritten, 141, 0) \ + x(extent_ptrs_all_invalid, 338, 0) \ + x(extent_rebalance_bad_pending, 332, 0) \ + x(extent_rebalance_bad_hipri, 333, 0) \ + x(extent_rebalance_bad_replicas, 339, 0) \ x(ptr_to_invalid_device, 142, 0) \ x(ptr_to_removed_device, 322, FSCK_AUTOFIX) \ x(ptr_to_duplicate_device, 143, 0) \ @@ -339,9 +343,15 @@ enum bch_fsck_flags { x(dirent_cf_name_too_big, 304, 0) \ x(dirent_stray_data_after_cf_name, 305, 0) \ x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \ - x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \ x(validate_error_in_commit, 329, 0) \ - x(MAX, 330, 0) + x(extent_io_opts_not_set, 330, FSCK_AUTOFIX) \ + x(extent_io_opts_unneeded, 331, FSCK_AUTOFIX) \ + x(rebalance_bp_to_missing_btree_ptr, 310, FSCK_AUTOFIX) \ + x(rebalance_bp_to_leaf_node_key, 334, FSCK_AUTOFIX) \ + x(btree_ptr_with_no_rebalance_bp, 335, FSCK_AUTOFIX) \ + x(btree_ptr_with_bad_rebalance_bp, 336, FSCK_AUTOFIX) \ + 
x(btree_ptr_to_bad_rebalance_bp, 337, FSCK_AUTOFIX) \ + x(MAX, 340, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, diff --git a/libbcachefs/sb/io.c b/libbcachefs/sb/io.c index 02059426..d477bc54 100644 --- a/libbcachefs/sb/io.c +++ b/libbcachefs/sb/io.c @@ -1024,6 +1024,11 @@ int bch2_write_super(struct bch_fs *c) closure_init_stack(cl); memset(&sb_written, 0, sizeof(sb_written)); + if (bch2_sb_has_journal(c->disk_sb.sb)) + bch2_fs_mark_dirty(c); + else + bch2_fs_mark_clean(c); + /* * Note: we do writes to RO devices here, and we might want to change * that in the future. @@ -1280,6 +1285,8 @@ void bch2_sb_upgrade_incompat(struct bch_fs *c) c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), c->sb.version)); + + bch2_sb_set_upgrade_incompat(c, c->sb.version_incompat_allowed, c->sb.version); bch2_write_super(c); } diff --git a/libbcachefs/vfs/fs.c b/libbcachefs/vfs/fs.c index cf9717ee..b7774ed7 100644 --- a/libbcachefs/vfs/fs.c +++ b/libbcachefs/vfs/fs.c @@ -115,12 +115,12 @@ static int bch2_write_inode_trans(struct btree_trans *trans, struct bch_inode_unpacked inode_u; try(bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent)); - struct bch_extent_rebalance old_r = bch2_inode_rebalance_opts_get(c, &inode_u); + struct bch_extent_rebalance_v2 old_r = bch2_inode_rebalance_opts_get(c, &inode_u); if (set) try(set(trans, inode, &inode_u, p)); - struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u); + struct bch_extent_rebalance_v2 new_r = bch2_inode_rebalance_opts_get(c, &inode_u); *rebalance_changed = memcmp(&old_r, &new_r, sizeof(new_r)); if (*rebalance_changed) try(bch2_set_rebalance_needs_scan_trans(trans,