From dc8c10a4b0adf2da9438aced70bc06a78fb009c8 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Thu, 30 Oct 2025 16:42:35 -0400
Subject: [PATCH] Update bcachefs sources to 5fe20ac58af4 bcachefs: Don't bail
 out of check_inode() if check_has_case_insensitive() fails

---
 .bcachefs_revision                 |   2 +-
 libbcachefs/alloc/accounting.c     |  62 ++++----
 libbcachefs/alloc/accounting.h     |  23 ++-
 libbcachefs/alloc/replicas.c       | 220 +++++++++-------------------
 libbcachefs/alloc/replicas.h       |  29 ++--
 libbcachefs/alloc/replicas_types.h |   6 +
 libbcachefs/btree/interior.c       | 111 ++++++++-------
 libbcachefs/btree/interior.h       |   5 -
 libbcachefs/btree/iter.c           |  20 +++
 libbcachefs/btree/update.c         |   2 +-
 libbcachefs/btree/write_buffer.c   |   5 +-
 libbcachefs/data/ec_types.h        |   6 -
 libbcachefs/data/move.c            |   3 -
 libbcachefs/data/rebalance.c       |  66 +++++++--
 libbcachefs/data/rebalance.h       |  18 ++-
 libbcachefs/data/update.c          |   5 +-
 libbcachefs/data/write.c           |  17 ++-
 libbcachefs/debug/sysfs.c          |   5 +
 libbcachefs/fs/check.c             |   5 +-
 libbcachefs/fs/dirent.c            |   2 +-
 libbcachefs/fs/namei.c             |  34 ++---
 libbcachefs/fs/quota.c             |   2 +-
 libbcachefs/fs/str_hash.c          | 218 ++++++++++++++--------------
 libbcachefs/init/dev.c             |  19 ++-
 libbcachefs/journal/init.c         |  10 +-
 libbcachefs/journal/journal.c      |   1 +
 libbcachefs/journal/journal.h      |  10 +-
 libbcachefs/journal/reclaim.c      |  16 ++-
 libbcachefs/journal/reclaim.h      |   2 +-
 libbcachefs/journal/types.h        |  13 +-
 libbcachefs/journal/write.c        |  25 ++--
 libbcachefs/opts.c                 |  56 +++++---
 libbcachefs/sb/members.h           |   5 +-
 libbcachefs/util/darray.h          |   2 +-
 libbcachefs/util/eytzinger.h       |  59 ++++++--
 libbcachefs/util/util.c            |  51 +++++--
 libbcachefs/util/util.h            |   2 +-
 libbcachefs/vfs/fs.c               |   5 +-
 38 files changed, 610 insertions(+), 532 deletions(-)

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 3da8aa16..ca5f3e9d 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-b552eb12225133c8bf869b461faba6b72e35d2be
+5fe20ac58af402e8ad9ace0bcf9daad524e3005d
diff --git a/libbcachefs/alloc/accounting.c b/libbcachefs/alloc/accounting.c
index 598fbd63..4fdf28cc 100644
--- a/libbcachefs/alloc/accounting.c
+++ b/libbcachefs/alloc/accounting.c
@@ -440,25 +440,39 @@ static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e)
 	return true;
 }
 
-void bch2_accounting_mem_gc(struct bch_fs *c)
+void __bch2_accounting_maybe_kill(struct bch_fs *c, struct bpos pos)
 {
-	struct bch_accounting_mem *acc = &c->accounting;
+	struct disk_accounting_pos acc_k;
+	bpos_to_disk_accounting_pos(&acc_k, pos);
 
-	guard(percpu_write)(&c->mark_lock);
-	struct accounting_mem_entry *dst = acc->k.data;
+	if (acc_k.type != BCH_DISK_ACCOUNTING_replicas)
+		return;
 
-	darray_for_each(acc->k, src) {
-		if (accounting_mem_entry_is_zero(src)) {
-			free_percpu(src->v[0]);
-			free_percpu(src->v[1]);
-		} else {
-			*dst++ = *src;
+	guard(mutex)(&c->sb_lock);
+	scoped_guard(percpu_write, &c->mark_lock) {
+		struct bch_accounting_mem *acc = &c->accounting;
+
+		unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+					       accounting_pos_cmp, &pos);
+
+		if (idx < acc->k.nr) {
+			struct accounting_mem_entry *e = acc->k.data + idx;
+			if (!accounting_mem_entry_is_zero(e))
+				return;
+
+			free_percpu(e->v[0]);
+			free_percpu(e->v[1]);
+
+			swap(*e, darray_last(acc->k));
+			--acc->k.nr;
+			eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+					accounting_pos_cmp, NULL);
 		}
+
+		bch2_replicas_entry_kill(c, &acc_k.replicas);
 	}
-	acc->k.nr = dst - acc->k.data;
-	eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
-			accounting_pos_cmp, 
NULL);
+	bch2_write_super(c);
 }
 
 /*
@@ -472,9 +486,6 @@ void bch2_accounting_mem_gc(struct bch_fs *c)
 int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage)
 {
 	struct bch_accounting_mem *acc = &c->accounting;
-	int ret = 0;
-
-	darray_init(usage);
 
 	guard(percpu_read)(&c->mark_lock);
 	darray_for_each(acc->k, i) {
@@ -492,24 +503,19 @@ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage)
 		bch2_accounting_mem_read_counters(acc, i - acc->k.data, &sectors, 1, false);
 		u.r.sectors = sectors;
 
-		ret = darray_make_room(usage, replicas_usage_bytes(&u.r));
-		if (ret)
-			break;
+		try(darray_make_room(usage, replicas_usage_bytes(&u.r)));
 
 		memcpy(&darray_top(*usage), &u.r, replicas_usage_bytes(&u.r));
 		usage->nr += replicas_usage_bytes(&u.r);
 	}
 
-	if (ret)
-		darray_exit(usage);
-	return ret;
+	return 0;
 }
 
 int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned accounting_types_mask)
 {
 	struct bch_accounting_mem *acc = &c->accounting;
-	int ret = 0;
 
 	darray_init(out_buf);
 
@@ -521,10 +527,8 @@ int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned acc
 		if (!(accounting_types_mask & BIT(a_p.type)))
 			continue;
 
-		ret = darray_make_room(out_buf, sizeof(struct bkey_i_accounting) +
-				       sizeof(u64) * i->nr_counters);
-		if (ret)
-			break;
+		try(darray_make_room(out_buf, sizeof(struct bkey_i_accounting) +
+				     sizeof(u64) * i->nr_counters));
 
 		struct bkey_i_accounting *a_out =
 			bkey_accounting_init((void *) &darray_top(*out_buf));
@@ -537,9 +541,7 @@ int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned acc
 		out_buf->nr += bkey_bytes(&a_out->k);
 	}
 
-	if (ret)
-		darray_exit(out_buf);
-	return ret;
+	return 0;
 }
 
 static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc)
diff --git a/libbcachefs/alloc/accounting.h b/libbcachefs/alloc/accounting.h
index c537869e..470bba42 100644
--- a/libbcachefs/alloc/accounting.h
+++ b/libbcachefs/alloc/accounting.h
@@ -43,6 +43,21 @@ static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst,
 	dst->k.bversion = src.k->bversion;
 }
 
+void __bch2_accounting_maybe_kill(struct bch_fs *, struct bpos pos);
+
+static inline void bch2_accounting_accumulate_maybe_kill(struct bch_fs *c,
+				struct bkey_i_accounting *dst,
+				struct bkey_s_c_accounting src)
+{
+	bch2_accounting_accumulate(dst, src);
+
+	for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++)
+		if (dst->v.d[i])
+			return;
+
+	__bch2_accounting_maybe_kill(c, dst->k.p);
+}
+
 static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
 					      enum bch_data_type data_type,
 					      s64 sectors)
@@ -137,7 +152,6 @@ enum bch_accounting_mode {
 
 int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
 int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
-void bch2_accounting_mem_gc(struct bch_fs *);
 
 static inline bool bch2_accounting_is_mem(struct disk_accounting_pos *acc)
 {
@@ -205,13 +219,10 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
 
 	while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
 				      accounting_pos_cmp, &a.k->p)) >= acc->k.nr) {
-		int ret = 0;
 		if (unlikely(write_locked))
-			ret = bch2_accounting_mem_insert_locked(c, a, mode);
+			try(bch2_accounting_mem_insert_locked(c, a, mode));
 		else
-			ret = bch2_accounting_mem_insert(c, a, mode);
-		if (ret)
-			return ret;
+			try(bch2_accounting_mem_insert(c, a, mode));
 	}
 
 	struct accounting_mem_entry *e = &acc->k.data[idx];
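
A note on the pattern running through these hunks: the old "ret = f(); if (ret) break/return ret;" chains are being collapsed with bcachefs's try() helper, which evaluates an int-returning expression and returns the error from the enclosing function on nonzero. Below is a hedged, standalone C sketch of that idiom; the macro body and the step()/do_steps() helpers are illustrative assumptions based on how the converted call sites read, not the definition shipped in this patch.

	#include <stdio.h>

	/*
	 * Sketch of a try()-style early-return macro.  Uses a GNU C
	 * statement expression; on a nonzero result it returns from the
	 * function that invoked the macro, so error plumbing disappears
	 * from the call sites.
	 */
	#define try(_expr)						\
	({								\
		int _ret = (_expr);					\
		if (_ret)						\
			return _ret;					\
		0;							\
	})

	static int step(int err)
	{
		return err;		/* 0 on success, negative errno on failure */
	}

	static int do_steps(int err_on_second)
	{
		try(step(0));			/* succeeds, falls through */
		try(step(err_on_second));	/* nonzero: early return */
		return 0;
	}

	int main(void)
	{
		printf("%d %d\n", do_steps(0), do_steps(-5));	/* prints: 0 -5 */
		return 0;
	}

The errptr_try() and bkey_try() calls appearing in later hunks follow the same shape for pointer-returning and bkey-returning functions; the design choice is to make the success path read straight through while every fallible call still propagates its error.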
diff --git a/libbcachefs/alloc/replicas.c b/libbcachefs/alloc/replicas.c index ba1afb74..7b32f371 100644 --- a/libbcachefs/alloc/replicas.c +++ b/libbcachefs/alloc/replicas.c @@ -12,6 +12,21 @@ #include +DEFINE_CLASS(bch_replicas_cpu, struct bch_replicas_cpu, + kfree(_T.entries), + (struct bch_replicas_cpu) {}, void) + +static inline struct bch_replicas_entry_v1 * +cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) +{ + return (void *) r->entries + r->entry_size * i; +} + +#define for_each_cpu_replicas_entry(_r, _i) \ + for (struct bch_replicas_entry_v1 *_i = (_r)->entries; \ + (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size; \ + _i = (void *) (_i) + (_r)->entry_size) + static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, struct bch_replicas_cpu *); @@ -129,15 +144,14 @@ bad: void bch2_cpu_replicas_to_text(struct printbuf *out, struct bch_replicas_cpu *r) { - struct bch_replicas_entry_v1 *e; bool first = true; - for_each_cpu_replicas_entry(r, e) { + for_each_cpu_replicas_entry(r, i) { if (!first) prt_printf(out, " "); first = false; - bch2_replicas_entry_to_text(out, e); + bch2_replicas_entry_to_text(out, i); } } @@ -246,45 +260,27 @@ cpu_replicas_add_entry(struct bch_fs *c, return new; } -static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, - struct bch_replicas_entry_v1 *search) +static inline struct bch_replicas_entry_v1 * +replicas_entry_search(struct bch_replicas_cpu *r, + struct bch_replicas_entry_v1 *search) { - int idx, entry_size = replicas_entry_bytes(search); + verify_replicas_entry(search); - if (unlikely(entry_size > r->entry_size)) - return -1; - -#define entry_cmp(_l, _r) memcmp(_l, _r, entry_size) - idx = eytzinger0_find(r->entries, r->nr, r->entry_size, - entry_cmp, search); -#undef entry_cmp - - return idx < r->nr ? idx : -1; -} - -int bch2_replicas_entry_idx(struct bch_fs *c, - struct bch_replicas_entry_v1 *search) -{ - bch2_replicas_entry_sort(search); - - return __replicas_entry_idx(&c->replicas, search); -} - -static bool __replicas_has_entry(struct bch_replicas_cpu *r, - struct bch_replicas_entry_v1 *search) -{ - return __replicas_entry_idx(r, search) >= 0; + size_t entry_size = replicas_entry_bytes(search); + int idx = likely(entry_size <= r->entry_size) + ? eytzinger0_find_r(r->entries, r->nr, r->entry_size, + bch2_memcmp, (void *) entry_size, search) + : -1; + return idx >= 0 ? 
cpu_replicas_entry(r, idx) : NULL; } bool bch2_replicas_marked_locked(struct bch_fs *c, struct bch_replicas_entry_v1 *search) { - verify_replicas_entry(search); - return !search->nr_devs || - (__replicas_has_entry(&c->replicas, search) && + (replicas_entry_search(&c->replicas, search) && (likely((!c->replicas_gc.entries)) || - __replicas_has_entry(&c->replicas_gc, search))); + replicas_entry_search(&c->replicas_gc, search))); } bool bch2_replicas_marked(struct bch_fs *c, @@ -298,40 +294,31 @@ noinline static int bch2_mark_replicas_slowpath(struct bch_fs *c, struct bch_replicas_entry_v1 *new_entry) { - struct bch_replicas_cpu new_r, new_gc; - int ret = 0; - verify_replicas_entry(new_entry); - memset(&new_r, 0, sizeof(new_r)); - memset(&new_gc, 0, sizeof(new_gc)); + CLASS(bch_replicas_cpu, new_r)(); + CLASS(bch_replicas_cpu, new_gc)(); guard(mutex)(&c->sb_lock); if (c->replicas_gc.entries && - !__replicas_has_entry(&c->replicas_gc, new_entry)) { + !replicas_entry_search(&c->replicas_gc, new_entry)) { new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry); - if (!new_gc.entries) { - ret = bch_err_throw(c, ENOMEM_cpu_replicas); - goto out; - } + if (!new_gc.entries) + return bch_err_throw(c, ENOMEM_cpu_replicas); } - if (!__replicas_has_entry(&c->replicas, new_entry)) { + if (!replicas_entry_search(&c->replicas, new_entry)) { new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry); - if (!new_r.entries) { - ret = bch_err_throw(c, ENOMEM_cpu_replicas); - goto out; - } + if (!new_r.entries) + return bch_err_throw(c, ENOMEM_cpu_replicas); - ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); - if (ret) - goto out; + try(bch2_cpu_replicas_to_sb_replicas(c, &new_r)); } if (!new_r.entries && !new_gc.entries) - goto out; + return 0; /* allocations done, now commit: */ @@ -345,12 +332,8 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, if (new_gc.entries) swap(new_gc, c->replicas_gc); } -out: - kfree(new_r.entries); - kfree(new_gc.entries); - bch_err_msg(c, ret, "adding replicas entry"); - return ret; + return 0; } int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r) @@ -387,9 +370,6 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) { - struct bch_replicas_entry_v1 *e; - unsigned i = 0; - lockdep_assert_held(&c->replicas_gc_lock); guard(mutex)(&c->sb_lock); @@ -401,7 +381,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) for_each_cpu_replicas_entry(&c->replicas, e) { /* Preserve unknown data types */ if (e->data_type >= BCH_DATA_NR || - !((1 << e->data_type) & typemask)) { + !(BIT(e->data_type) & typemask)) { c->replicas_gc.nr++; c->replicas_gc.entry_size = max_t(unsigned, c->replicas_gc.entry_size, @@ -417,9 +397,10 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) return bch_err_throw(c, ENOMEM_replicas_gc); } + unsigned i = 0; for_each_cpu_replicas_entry(&c->replicas, e) if (e->data_type >= BCH_DATA_NR || - !((1 << e->data_type) & typemask)) + !(BIT(e->data_type) & typemask)) memcpy(cpu_replicas_entry(&c->replicas_gc, i++), e, c->replicas_gc.entry_size); @@ -427,73 +408,23 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) return 0; } -/* - * New much simpler mechanism for clearing out unneeded replicas entries - drop - * replicas entries that have 0 sectors used. 
- * - * However, we don't track sector counts for journal usage, so this doesn't drop - * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism - * is retained for that. - */ -int bch2_replicas_gc2(struct bch_fs *c) +void bch2_replicas_entry_kill(struct bch_fs *c, struct bch_replicas_entry_v1 *kill) { - struct bch_replicas_cpu new = { 0 }; - unsigned nr; - int ret = 0; + lockdep_assert_held(&c->mark_lock); + lockdep_assert_held(&c->sb_lock); - bch2_accounting_mem_gc(c); -retry: - nr = READ_ONCE(c->replicas.nr); - new.entry_size = READ_ONCE(c->replicas.entry_size); - new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); - if (!new.entries) { - bch_err(c, "error allocating c->replicas_gc"); - return bch_err_throw(c, ENOMEM_replicas_gc); - } + struct bch_replicas_cpu *r = &c->replicas; - guard(mutex)(&c->sb_lock); - scoped_guard(percpu_write, &c->mark_lock) { - if (nr != c->replicas.nr || - new.entry_size != c->replicas.entry_size) { - kfree(new.entries); - goto retry; - } + struct bch_replicas_entry_v1 *e = replicas_entry_search(&c->replicas, kill); + if (WARN(!e, "replicas entry not found in sb")) + return; - for (unsigned i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry_v1 *e = - cpu_replicas_entry(&c->replicas, i); + memcpy(e, cpu_replicas_entry(r, --r->nr), r->entry_size); - struct disk_accounting_pos k = { - .type = BCH_DISK_ACCOUNTING_replicas, - }; + bch2_cpu_replicas_sort(r); - unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e), - "embedded variable length struct"); - - struct bpos p = disk_accounting_pos_to_bpos(&k); - - struct bch_accounting_mem *acc = &c->accounting; - bool kill = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &p) >= acc->k.nr; - - if (e->data_type == BCH_DATA_journal || !kill) - memcpy(cpu_replicas_entry(&new, new.nr++), - e, new.entry_size); - } - - bch2_cpu_replicas_sort(&new); - - ret = bch2_cpu_replicas_to_sb_replicas(c, &new); - - if (!ret) - swap(c->replicas, new); - - kfree(new.entries); - } - - if (!ret) - bch2_write_super(c); - return ret; + int ret = bch2_cpu_replicas_to_sb_replicas(c, r); + WARN(ret, "bch2_cpu_replicas_to_sb_replicas() error: %s", bch2_err_str(ret)); } /* Replicas tracking - superblock: */ @@ -502,7 +433,6 @@ static int __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, struct bch_replicas_cpu *cpu_r) { - struct bch_replicas_entry_v1 *e, *dst; unsigned nr = 0, entry_size = 0, idx = 0; for_each_replicas_entry(sb_r, e) { @@ -519,7 +449,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, cpu_r->entry_size = entry_size; for_each_replicas_entry(sb_r, e) { - dst = cpu_replicas_entry(cpu_r, idx++); + struct bch_replicas_entry_v1 *dst = cpu_replicas_entry(cpu_r, idx++); memcpy(dst, e, replicas_entry_bytes(e)); bch2_replicas_entry_sort(dst); } @@ -531,7 +461,6 @@ static int __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, struct bch_replicas_cpu *cpu_r) { - struct bch_replicas_entry_v0 *e; unsigned nr = 0, entry_size = 0, idx = 0; for_each_replicas_entry(sb_r, e) { @@ -550,14 +479,14 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, cpu_r->nr = nr; cpu_r->entry_size = entry_size; - for_each_replicas_entry(sb_r, e) { + for_each_replicas_entry(sb_r, src) { struct bch_replicas_entry_v1 *dst = cpu_replicas_entry(cpu_r, idx++); - dst->data_type = e->data_type; - dst->nr_devs = e->nr_devs; + dst->data_type = src->data_type; + dst->nr_devs = src->nr_devs; dst->nr_required = 
1; - memcpy(dst->devs, e->devs, e->nr_devs); + memcpy(dst->devs, src->devs, src->nr_devs); bch2_replicas_entry_sort(dst); } @@ -568,7 +497,7 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) { struct bch_sb_field_replicas *sb_v1; struct bch_sb_field_replicas_v0 *sb_v0; - struct bch_replicas_cpu new_r = { 0, 0, NULL }; + CLASS(bch_replicas_cpu, new_r)(); if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas))) try(__bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r)); @@ -580,8 +509,6 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) guard(percpu_write)(&c->mark_lock); swap(c->replicas, new_r); - kfree(new_r.entries); - return 0; } @@ -590,7 +517,6 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, { struct bch_sb_field_replicas_v0 *sb_r; struct bch_replicas_entry_v0 *dst; - struct bch_replicas_entry_v1 *src; size_t bytes; bytes = sizeof(struct bch_sb_field_replicas); @@ -628,7 +554,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, struct bch_replicas_cpu *r) { struct bch_sb_field_replicas *sb_r; - struct bch_replicas_entry_v1 *dst, *src; + struct bch_replicas_entry_v1 *dst; bool need_v1 = false; size_t bytes; @@ -707,12 +633,11 @@ static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f, { struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); - struct bch_replicas_cpu cpu_r; + CLASS(bch_replicas_cpu, cpu_r)(); try(__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r)); + try(bch2_cpu_replicas_validate(&cpu_r, sb, err)); - int ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); - kfree(cpu_r.entries); - return ret; + return 0; } static void bch2_sb_replicas_to_text(struct printbuf *out, @@ -720,7 +645,6 @@ static void bch2_sb_replicas_to_text(struct printbuf *out, struct bch_sb_field *f) { struct bch_sb_field_replicas *r = field_to_type(f, replicas); - struct bch_replicas_entry_v1 *e; bool first = true; for_each_replicas_entry(r, e) { @@ -743,12 +667,11 @@ static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field * { struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); - struct bch_replicas_cpu cpu_r; + CLASS(bch_replicas_cpu, cpu_r)(); try(__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r)); + try(bch2_cpu_replicas_validate(&cpu_r, sb, err)); - int ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); - kfree(cpu_r.entries); - return ret; + return 0; } static void bch2_sb_replicas_v0_to_text(struct printbuf *out, @@ -756,7 +679,6 @@ static void bch2_sb_replicas_v0_to_text(struct printbuf *out, struct bch_sb_field *f) { struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); - struct bch_replicas_entry_v0 *e; bool first = true; for_each_replicas_entry(sb_r, e) { @@ -779,8 +701,6 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { bool bch2_can_read_fs_with_devs(struct bch_fs *c, struct bch_devs_mask devs, unsigned flags, struct printbuf *err) { - struct bch_replicas_entry_v1 *e; - guard(percpu_read)(&c->mark_lock); for_each_cpu_replicas_entry(&c->replicas, e) { unsigned nr_online = 0, nr_failed = 0, dflags = 0; @@ -910,8 +830,6 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) replicas_v0 = bch2_sb_field_get(sb, replicas_v0); if (replicas) { - struct bch_replicas_entry_v1 *r; - for_each_replicas_entry(replicas, r) { if (r->data_type >= sizeof(data_has) * 8) continue; @@ -922,9 +840,7 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) } } else if (replicas_v0) { - struct bch_replicas_entry_v0 *r; - - 
for_each_replicas_entry_v0(replicas_v0, r) { + for_each_replicas_entry(replicas_v0, r) { if (r->data_type >= sizeof(data_has) * 8) continue; diff --git a/libbcachefs/alloc/replicas.h b/libbcachefs/alloc/replicas.h index 8565e58c..b2b86089 100644 --- a/libbcachefs/alloc/replicas.h +++ b/libbcachefs/alloc/replicas.h @@ -13,15 +13,6 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *, struct bch_fs *, struct printbuf *); void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); -static inline struct bch_replicas_entry_v1 * -cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) -{ - return (void *) r->entries + r->entry_size * i; -} - -int bch2_replicas_entry_idx(struct bch_fs *, - struct bch_replicas_entry_v1 *); - void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *, enum bch_data_type, struct bch_devs_list); @@ -53,12 +44,15 @@ unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); int bch2_replicas_gc_end(struct bch_fs *, int); int bch2_replicas_gc_start(struct bch_fs *, unsigned); -int bch2_replicas_gc2(struct bch_fs *); +void bch2_replicas_entry_kill(struct bch_fs *, struct bch_replicas_entry_v1 *); -#define for_each_cpu_replicas_entry(_r, _i) \ - for (_i = (_r)->entries; \ - (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ - _i = (void *) (_i) + (_r)->entry_size) +static inline bool bch2_replicas_entry_has_dev(struct bch_replicas_entry_v1 *r, unsigned dev) +{ + for (unsigned i = 0; i < r->nr_devs; i++) + if (r->devs[i] == dev) + return true; + return false; +} /* iterate over superblock replicas - used by userspace tools: */ @@ -66,12 +60,7 @@ int bch2_replicas_gc2(struct bch_fs *); ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i))) #define for_each_replicas_entry(_r, _i) \ - for (_i = (_r)->entries; \ - (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ - (_i) = replicas_entry_next(_i)) - -#define for_each_replicas_entry_v0(_r, _i) \ - for (_i = (_r)->entries; \ + for (typeof(&(_r)->entries[0]) _i = (_r)->entries; \ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ (_i) = replicas_entry_next(_i)) diff --git a/libbcachefs/alloc/replicas_types.h b/libbcachefs/alloc/replicas_types.h index fed71c86..418e702e 100644 --- a/libbcachefs/alloc/replicas_types.h +++ b/libbcachefs/alloc/replicas_types.h @@ -8,4 +8,10 @@ struct bch_replicas_cpu { struct bch_replicas_entry_v1 *entries; }; +union bch_replicas_padded { + u8 bytes[struct_size_t(struct bch_replicas_entry_v1, + devs, BCH_BKEY_PTRS_MAX)]; + struct bch_replicas_entry_v1 e; +}; + #endif /* _BCACHEFS_REPLICAS_TYPES_H */ diff --git a/libbcachefs/btree/interior.c b/libbcachefs/btree/interior.c index 34c8f560..2517bdd5 100644 --- a/libbcachefs/btree/interior.c +++ b/libbcachefs/btree/interior.c @@ -609,6 +609,18 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans * closure_wake_up(&c->btree_interior_update_wait); } +static void bch2_btree_update_add_key(btree_update_nodes *nodes, + unsigned level, struct bkey_i *k) +{ + BUG_ON(darray_make_room(nodes, 1)); + + struct btree_update_node *n = &darray_top(*nodes); + nodes->nr++; + + *n = (struct btree_update_node) { .level = level }; + bkey_copy(&n->key, k); +} + static void bch2_btree_update_add_node(struct bch_fs *c, btree_update_nodes *nodes, struct btree *b) { BUG_ON(darray_make_room(nodes, 1)); @@ -649,20 +661,26 @@ static void btree_update_new_nodes_mark_sb(struct btree_update *as) static int btree_update_nodes_written_trans(struct btree_trans *trans, 
struct btree_update *as) { - struct jset_entry *e = errptr_try(bch2_trans_jset_entry_alloc(trans, as->journal_u64s)); - - memcpy(e, as->journal_entries, as->journal_u64s * sizeof(u64)); - trans->journal_pin = &as->journal; darray_for_each(as->old_nodes, i) try(bch2_key_trigger_old(trans, as->btree_id, i->level + 1, bkey_i_to_s_c(&i->key), BTREE_TRIGGER_transactional)); - darray_for_each(as->new_nodes, i) + darray_for_each(as->new_nodes, i) { try(bch2_key_trigger_new(trans, as->btree_id, i->level + 1, bkey_i_to_s(&i->key), BTREE_TRIGGER_transactional)); + journal_entry_set(errptr_try(bch2_trans_jset_entry_alloc(trans, + jset_u64s(i->key.k.u64s))), + i->root + ? BCH_JSET_ENTRY_btree_root + : BCH_JSET_ENTRY_btree_keys, + as->btree_id, + i->root ? i->level : i->level + 1, + &i->key, i->key.k.u64s); + } + return 0; } @@ -749,11 +767,12 @@ static void btree_update_nodes_written(struct btree_update *as) * all our new nodes, to avoid racing with * btree_node_update_key(): */ - darray_for_each(as->new_nodes, i) { - BUG_ON(i->b->will_make_reachable != (unsigned long) as); - i->b->will_make_reachable = 0; - clear_btree_node_will_make_reachable(i->b); - } + darray_for_each(as->new_nodes, i) + if (i->b) { + BUG_ON(i->b->will_make_reachable != (unsigned long) as); + i->b->will_make_reachable = 0; + clear_btree_node_will_make_reachable(i->b); + } } /* @@ -841,11 +860,12 @@ static void btree_update_nodes_written(struct btree_update *as) bch2_journal_pin_drop(&c->journal, &as->journal); - darray_for_each(as->new_nodes, i) { - btree_node_lock_nopath_nofail(trans, &i->b->c, SIX_LOCK_read); - btree_node_write_if_need(trans, i->b, SIX_LOCK_read); - six_unlock_read(&i->b->c.lock); - } + darray_for_each(as->new_nodes, i) + if (i->b) { + btree_node_lock_nopath_nofail(trans, &i->b->c, SIX_LOCK_read); + btree_node_write_if_need(trans, i->b, SIX_LOCK_read); + six_unlock_read(&i->b->c.lock); + } for (unsigned i = 0; i < as->nr_open_buckets; i++) bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); @@ -931,25 +951,13 @@ static void btree_update_reparent(struct btree_update *as, static void btree_update_updated_root(struct btree_update *as, struct btree *b) { - struct bkey_i *insert = &b->key; struct bch_fs *c = as->c; BUG_ON(as->mode != BTREE_UPDATE_none); + as->mode = BTREE_UPDATE_root; - BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > - ARRAY_SIZE(as->journal_entries)); - - as->journal_u64s += - journal_entry_set((void *) &as->journal_entries[as->journal_u64s], - BCH_JSET_ENTRY_btree_root, - b->c.btree_id, b->c.level, - insert, insert->k.u64s); - - scoped_guard(mutex, &c->btree_interior_update_lock) { + scoped_guard(mutex, &c->btree_interior_update_lock) list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); - - as->mode = BTREE_UPDATE_root; - } } /* @@ -1323,7 +1331,6 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, { struct bch_fs *c = as->c; struct bkey_packed *k; - CLASS(printbuf, buf)(); unsigned long old, new; BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && @@ -1344,15 +1351,6 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, dump_stack(); } - BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > - ARRAY_SIZE(as->journal_entries)); - - as->journal_u64s += - journal_entry_set((void *) &as->journal_entries[as->journal_u64s], - BCH_JSET_ENTRY_btree_keys, - b->c.btree_id, b->c.level, - insert, insert->k.u64s); - while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) 
bch2_btree_node_iter_advance(node_iter, b); @@ -2105,6 +2103,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_update_get_open_buckets(as, n); bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0); + bch2_btree_update_add_key(&as->new_nodes, n->c.level, &delete); bch2_btree_update_add_node(c, &as->new_nodes, n); bch2_btree_node_free_inmem(trans, trans->paths + path, b); @@ -2386,15 +2385,6 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, struct bch_fs *c = trans->c; if (!btree_node_will_make_reachable(b)) { - if (!skip_triggers) { - try(bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1, - bkey_i_to_s_c(&b->key), - BTREE_TRIGGER_transactional)); - try(bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1, - bkey_i_to_s(new_key), - BTREE_TRIGGER_transactional)); - } - if (!btree_node_is_root(c, b)) { CLASS(btree_node_iter, parent_iter)(trans, b->c.btree_id, @@ -2404,15 +2394,32 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, BTREE_ITER_intent); try(bch2_btree_iter_traverse(&parent_iter)); - try(bch2_trans_update(trans, &parent_iter, new_key, BTREE_TRIGGER_norun)); + try(bch2_trans_update(trans, &parent_iter, new_key, skip_triggers ? BTREE_TRIGGER_norun : 0)); } else { - struct jset_entry *e = errptr_try(bch2_trans_jset_entry_alloc(trans, - jset_u64s(new_key->k.u64s))); + if (!skip_triggers) + try(bch2_key_trigger(trans, b->c.btree_id, b->c.level + 1, + bkey_i_to_s_c(&b->key), + bkey_i_to_s(new_key), + BTREE_TRIGGER_insert| + BTREE_TRIGGER_overwrite| + BTREE_TRIGGER_transactional)); - journal_entry_set(e, + journal_entry_set(errptr_try(bch2_trans_jset_entry_alloc(trans, + jset_u64s(b->key.k.u64s))), + BCH_JSET_ENTRY_overwrite, + b->c.btree_id, b->c.level + 1, + &b->key, b->key.k.u64s); + + journal_entry_set(errptr_try(bch2_trans_jset_entry_alloc(trans, + jset_u64s(new_key->k.u64s))), BCH_JSET_ENTRY_btree_root, b->c.btree_id, b->c.level, new_key, new_key->k.u64s); + + /* + * propagated back to c->btree_roots[].key by + * bch2_journal_entry_to_btree_root() incorrect for + */ } try(bch2_trans_commit(trans, NULL, NULL, commit_flags)); diff --git a/libbcachefs/btree/interior.h b/libbcachefs/btree/interior.h index fdbb797b..de45d62f 100644 --- a/libbcachefs/btree/interior.h +++ b/libbcachefs/btree/interior.h @@ -8,8 +8,6 @@ #define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES) -#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1)) - int bch2_btree_node_check_topology(struct btree_trans *, struct btree *); #define BTREE_UPDATE_MODES() \ @@ -111,9 +109,6 @@ struct btree_update { BCH_REPLICAS_MAX]; open_bucket_idx_t nr_open_buckets; - unsigned journal_u64s; - u64 journal_entries[BTREE_UPDATE_JOURNAL_RES]; - /* Only here to reduce stack usage on recursive splits: */ struct keylist parent_keys; /* diff --git a/libbcachefs/btree/iter.c b/libbcachefs/btree/iter.c index 9c8713cf..a52575de 100644 --- a/libbcachefs/btree/iter.c +++ b/libbcachefs/btree/iter.c @@ -736,6 +736,19 @@ void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) /* Btree path: traverse, set_pos: */ +static noinline_for_stack int btree_node_root_err(struct btree_trans *trans, struct btree *b) +{ + struct bch_fs *c = trans->c; + CLASS(printbuf, buf)(); + bch2_log_msg_start(c, &buf); + + prt_str(&buf, "btree root doesn't cover expected range:\n"); + bch2_btree_pos_to_text(&buf, c, b); + prt_newline(&buf); + + return __bch2_topology_error(c, &buf); +} + static inline int 
btree_path_lock_root(struct btree_trans *trans, struct btree_path *path, unsigned depth_want, @@ -783,6 +796,13 @@ static inline int btree_path_lock_root(struct btree_trans *trans, if (likely(b == READ_ONCE(r->b) && b->c.level == path->level && !race_fault())) { + if (unlikely(!bpos_eq(b->data->min_key, POS_MIN) || + !bpos_eq(b->key.k.p, SPOS_MAX))) { + ret = btree_node_root_err(trans, b); + six_unlock_type(&b->c.lock, lock_type); + return ret; + } + for (i = 0; i < path->level; i++) path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root); path->l[path->level].b = b; diff --git a/libbcachefs/btree/update.c b/libbcachefs/btree/update.c index 0003ef55..b21a4e11 100644 --- a/libbcachefs/btree/update.c +++ b/libbcachefs/btree/update.c @@ -557,7 +557,7 @@ void *__bch2_trans_subbuf_alloc(struct btree_trans *trans, int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, enum btree_id btree, struct bpos start, struct bpos end) { - bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent); + bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent|BTREE_ITER_with_updates); struct bkey_s_c k = bkey_try(bch2_btree_iter_peek_prev(iter)); if (bpos_lt(iter->pos, start)) diff --git a/libbcachefs/btree/write_buffer.c b/libbcachefs/btree/write_buffer.c index 64341936..94c5e6ed 100644 --- a/libbcachefs/btree/write_buffer.c +++ b/libbcachefs/btree/write_buffer.c @@ -158,8 +158,9 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u); if (k.k->type == KEY_TYPE_accounting) - bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k), - bkey_s_c_to_accounting(k)); + bch2_accounting_accumulate_maybe_kill(trans->c, + bkey_i_to_accounting(&wb->k), + bkey_s_c_to_accounting(k)); } *accounting_accumulated = true; diff --git a/libbcachefs/data/ec_types.h b/libbcachefs/data/ec_types.h index 809446c7..a7dedea7 100644 --- a/libbcachefs/data/ec_types.h +++ b/libbcachefs/data/ec_types.h @@ -4,12 +4,6 @@ #include "bcachefs_format.h" -union bch_replicas_padded { - u8 bytes[struct_size_t(struct bch_replicas_entry_v1, - devs, BCH_BKEY_PTRS_MAX)]; - struct bch_replicas_entry_v1 e; -}; - struct stripe { size_t heap_idx; u16 sectors; diff --git a/libbcachefs/data/move.c b/libbcachefs/data/move.c index 465bec83..01732cee 100644 --- a/libbcachefs/data/move.c +++ b/libbcachefs/data/move.c @@ -994,7 +994,6 @@ int bch2_data_job(struct bch_fs *c, true, rereplicate_pred, c) ?: ret; bch2_btree_interior_updates_flush(c); - ret = bch2_replicas_gc2(c) ?: ret; break; case BCH_DATA_OP_migrate: if (op->migrate.dev >= c->sb.nr_devices) @@ -1010,7 +1009,6 @@ int bch2_data_job(struct bch_fs *c, true, migrate_pred, op) ?: ret; bch2_btree_interior_updates_flush(c); - ret = bch2_replicas_gc2(c) ?: ret; break; case BCH_DATA_OP_rewrite_old_nodes: ret = bch2_scan_old_btree_nodes(c, stats); @@ -1020,7 +1018,6 @@ int bch2_data_job(struct bch_fs *c, writepoint_hashed((unsigned long) current), true, drop_extra_replicas_pred, c) ?: ret; - ret = bch2_replicas_gc2(c) ?: ret; break; default: ret = -EINVAL; diff --git a/libbcachefs/data/rebalance.c b/libbcachefs/data/rebalance.c index 809e3e11..28a96cf6 100644 --- a/libbcachefs/data/rebalance.c +++ b/libbcachefs/data/rebalance.c @@ -296,7 +296,7 @@ int bch2_bkey_get_io_opts(struct btree_trans *trans, if (!snapshot_opts) { bch2_inode_opts_get(c, opts, metadata); - if (k.k->p.snapshot) { + if (!metadata && k.k->p.snapshot) { struct bch_inode_unpacked inode; int 
ret = bch2_inode_find_by_inum_snapshot(trans, k.k->p.inode, k.k->p.snapshot, &inode, BTREE_ITER_cached); @@ -313,7 +313,7 @@ int bch2_bkey_get_io_opts(struct btree_trans *trans, snapshot_opts->d.nr = 0; } - if (k.k->p.snapshot) { + if (!metadata && k.k->p.snapshot) { if (snapshot_opts->cur_inum != k.k->p.inode) { snapshot_opts->d.nr = 0; @@ -362,6 +362,8 @@ int bch2_bkey_get_io_opts(struct btree_trans *trans, #undef x } + BUG_ON(metadata && opts->erasure_code); + return 0; } @@ -374,10 +376,46 @@ static const char * const bch2_rebalance_state_strs[] = { #undef x }; -int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum) +static u64 rebalance_scan_encode(struct rebalance_scan s) +{ + switch (s.type) { + case REBALANCE_SCAN_fs: + return 0; + case REBALANCE_SCAN_metadata: + return 1; + case REBALANCE_SCAN_device: + return s.dev + 32; + case REBALANCE_SCAN_inum: + return s.inum; + default: + BUG(); + } +} + +static struct rebalance_scan rebalance_scan_decode(u64 v) +{ + if (v == 0) + return (struct rebalance_scan) { .type = REBALANCE_SCAN_fs }; + if (v == 1) + return (struct rebalance_scan) { .type = REBALANCE_SCAN_metadata }; + if (v < BCACHEFS_ROOT_INO) + return (struct rebalance_scan) { + .type = REBALANCE_SCAN_device, + .dev = v - 32, + }; + + return (struct rebalance_scan) { + .type = REBALANCE_SCAN_inum, + .inum = v, + }; +} + +int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, struct rebalance_scan s) { CLASS(btree_iter, iter)(trans, BTREE_ID_rebalance_work, - SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), + SPOS(rebalance_scan_encode(s), + REBALANCE_WORK_SCAN_OFFSET, + U32_MAX), BTREE_ITER_intent); struct bkey_s_c k = bkey_try(bch2_btree_iter_peek_slot(&iter)); @@ -394,16 +432,17 @@ int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum) return bch2_trans_update(trans, &iter, &cookie->k_i, 0); } -int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) +int bch2_set_rebalance_needs_scan(struct bch_fs *c, struct rebalance_scan s) { CLASS(btree_trans, trans)(c); return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_set_rebalance_needs_scan_trans(trans, inum)); + bch2_set_rebalance_needs_scan_trans(trans, s)); } int bch2_set_fs_needs_rebalance(struct bch_fs *c) { - return bch2_set_rebalance_needs_scan(c, 0); + return bch2_set_rebalance_needs_scan(c, + (struct rebalance_scan) { .type = REBALANCE_SCAN_fs }); } static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie) @@ -647,7 +686,7 @@ root_err: noinline_for_stack static int do_rebalance_scan(struct moving_context *ctxt, struct per_snapshot_io_opts *snapshot_io_opts, - u64 inum, u64 cookie, u64 *sectors_scanned) + u64 scan_v, u64 cookie, u64 *sectors_scanned) { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; @@ -658,7 +697,8 @@ static int do_rebalance_scan(struct moving_context *ctxt, r->state = BCH_REBALANCE_scanning; - if (!inum) { + struct rebalance_scan s = rebalance_scan_decode(scan_v); + if (s.type == REBALANCE_SCAN_fs) { r->scan_start = BBPOS_MIN; r->scan_end = BBPOS_MAX; @@ -670,16 +710,16 @@ static int do_rebalance_scan(struct moving_context *ctxt, try(do_rebalance_scan_btree(ctxt, snapshot_io_opts, btree, 0, POS_MIN, SPOS_MAX)); } - } else { - r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0)); - r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX)); + } else if (s.type == REBALANCE_SCAN_inum) { + r->scan_start = BBPOS(BTREE_ID_extents, POS(s.inum, 0)); + r->scan_end = 
BBPOS(BTREE_ID_extents, POS(s.inum, U64_MAX)); try(do_rebalance_scan_btree(ctxt, snapshot_io_opts, BTREE_ID_extents, 0, r->scan_start.pos, r->scan_end.pos)); } try(commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_clear_rebalance_needs_scan(trans, inum, cookie))); + bch2_clear_rebalance_needs_scan(trans, scan_v, cookie))); *sectors_scanned += atomic64_read(&r->scan_stats.sectors_seen); /* diff --git a/libbcachefs/data/rebalance.h b/libbcachefs/data/rebalance.h index 31aee292..6b46d3e0 100644 --- a/libbcachefs/data/rebalance.h +++ b/libbcachefs/data/rebalance.h @@ -84,8 +84,22 @@ int bch2_bkey_get_io_opts(struct btree_trans *, struct per_snapshot_io_opts *, struct bkey_s_c, struct bch_inode_opts *opts); -int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64); -int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); +struct rebalance_scan { + enum rebalance_scan_type { + REBALANCE_SCAN_fs, + REBALANCE_SCAN_metadata, + REBALANCE_SCAN_device, + REBALANCE_SCAN_inum, + } type; + + union { + unsigned dev; + u64 inum; + }; +}; + +int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, struct rebalance_scan); +int bch2_set_rebalance_needs_scan(struct bch_fs *, struct rebalance_scan); int bch2_set_fs_needs_rebalance(struct bch_fs *); static inline void bch2_rebalance_wakeup(struct bch_fs *c) diff --git a/libbcachefs/data/update.c b/libbcachefs/data/update.c index 43eb764a..a636b0c1 100644 --- a/libbcachefs/data/update.c +++ b/libbcachefs/data/update.c @@ -693,6 +693,9 @@ static int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, struct bch_inode_opts *io_opts, unsigned buf_bytes) { + /* be paranoid */ + buf_bytes = round_up(buf_bytes, c->opts.block_size); + unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE); m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL); @@ -702,7 +705,7 @@ static int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ); bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0); - if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) { + if (bch2_bio_alloc_pages(&m->op.wbio.bio, c->opts.block_size, buf_bytes, GFP_KERNEL)) { kfree(m->bvecs); m->bvecs = NULL; return -ENOMEM; diff --git a/libbcachefs/data/write.c b/libbcachefs/data/write.c index 4e9eea1d..3784a103 100644 --- a/libbcachefs/data/write.c +++ b/libbcachefs/data/write.c @@ -807,6 +807,19 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, struct bio *bio; unsigned output_available = min(wp->sectors_free << 9, src->bi_iter.bi_size); + + /* + * XXX: we'll want to delete this later, there's no reason we can't + * issue > 2MB bios if we're allocating high order pages + * + * But bch2_bio_alloc_pages() BUGS() if we ask it to allocate more pages + * than fit in the bio, and we're using bio_alloc_bioset() which is + * limited to BIO_MAX_VECS + */ + output_available = min(output_available, BIO_MAX_VECS * PAGE_SIZE); + + BUG_ON(output_available & (c->opts.block_size - 1)); + unsigned pages = DIV_ROUND_UP(output_available + (buf ? 
((unsigned long) buf & (PAGE_SIZE - 1)) @@ -814,8 +827,7 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, pages = min(pages, BIO_MAX_VECS); - bio = bio_alloc_bioset(NULL, pages, 0, - GFP_NOFS, &c->bio_write); + bio = bio_alloc_bioset(NULL, pages, 0, GFP_NOFS, &c->bio_write); wbio = wbio_init(bio); wbio->put_bio = true; /* copy WRITE_SYNC flag */ @@ -839,6 +851,7 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, if (bio->bi_iter.bi_size < output_available) *page_alloc_failed = bch2_bio_alloc_pages(bio, + c->opts.block_size, output_available - bio->bi_iter.bi_size, GFP_NOFS) != 0; diff --git a/libbcachefs/debug/sysfs.c b/libbcachefs/debug/sysfs.c index 06921929..02ef020b 100644 --- a/libbcachefs/debug/sysfs.c +++ b/libbcachefs/debug/sysfs.c @@ -196,6 +196,7 @@ read_attribute(btree_reserve_cache); read_attribute(open_buckets); read_attribute(open_buckets_partial); read_attribute(nocow_lock_table); +read_attribute(replicas); read_attribute(read_refs); read_attribute(write_refs); @@ -389,6 +390,9 @@ SHOW(bch2_fs) if (attr == &sysfs_nocow_lock_table) bch2_nocow_locks_to_text(out, &c->nocow_locks); + if (attr == &sysfs_replicas) + bch2_cpu_replicas_to_text(out, &c->replicas); + if (attr == &sysfs_disk_groups) bch2_disk_groups_to_text(out, c); @@ -600,6 +604,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_open_buckets_partial, &sysfs_write_refs, &sysfs_nocow_lock_table, + &sysfs_replicas, &sysfs_io_timers_read, &sysfs_io_timers_write, diff --git a/libbcachefs/fs/check.c b/libbcachefs/fs/check.c index 25f59ee6..4c947264 100644 --- a/libbcachefs/fs/check.c +++ b/libbcachefs/fs/check.c @@ -913,6 +913,9 @@ static int check_inode(struct btree_trans *trans, } ret = bch2_check_inode_has_case_insensitive(trans, &u, &s->ids, &do_update); + if (bch2_err_matches(ret, ENOENT)) /* disconnected inode; will be fixed by a later pass */ + ret = 0; + bch_err_msg(c, ret, "bch2_check_inode_has_case_insensitive()"); if (ret) goto err; @@ -1627,7 +1630,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, new_d->k.p.inode = d.k->p.inode; new_d->k.p.snapshot = d.k->p.snapshot; - struct btree_iter dup_iter = {}; + CLASS(btree_iter_uninit, dup_iter)(trans); return bch2_hash_delete_at(trans, bch2_dirent_hash_desc, hash_info, iter, BTREE_UPDATE_internal_snapshot_node) ?: diff --git a/libbcachefs/fs/dirent.c b/libbcachefs/fs/dirent.c index b1a0d78e..1bd5be4b 100644 --- a/libbcachefs/fs/dirent.c +++ b/libbcachefs/fs/dirent.c @@ -549,7 +549,7 @@ int bch2_dirent_lookup_trans(struct btree_trans *trans, hash_info, dir, &lookup_name, flags)); int ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum); - return ret > 0 ? -ENOENT : 0; + return ret > 0 ? 
-ENOENT : ret; } u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, diff --git a/libbcachefs/fs/namei.c b/libbcachefs/fs/namei.c index c69b2f95..24f582fa 100644 --- a/libbcachefs/fs/namei.c +++ b/libbcachefs/fs/namei.c @@ -832,10 +832,8 @@ int bch2_check_inode_has_case_insensitive(struct btree_trans *trans, prt_printf(&buf, "casefolded dir with has_case_insensitive not set\ninum %llu:%u ", inode->bi_inum, inode->bi_snapshot); - ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, inode->bi_snapshot, - snapshot_overwrites, &buf); - if (ret) - goto out; + try(bch2_inum_snapshot_to_path(trans, inode->bi_inum, inode->bi_snapshot, + snapshot_overwrites, &buf)); if (fsck_err(trans, inode_has_case_insensitive_not_set, "%s", buf.buf)) { inode->bi_flags |= BCH_INODE_has_case_insensitive; @@ -844,7 +842,7 @@ int bch2_check_inode_has_case_insensitive(struct btree_trans *trans, } if (!(inode->bi_flags & BCH_INODE_has_case_insensitive)) - goto out; + return 0; struct bch_inode_unpacked dir = *inode; u32 snapshot = dir.bi_snapshot; @@ -852,30 +850,22 @@ int bch2_check_inode_has_case_insensitive(struct btree_trans *trans, while (!(dir.bi_inum == BCACHEFS_ROOT_INO && dir.bi_subvol == BCACHEFS_ROOT_SUBVOL)) { if (dir.bi_parent_subvol) { - ret = bch2_subvolume_get_snapshot(trans, dir.bi_parent_subvol, &snapshot); - if (ret) - goto out; + try(bch2_subvolume_get_snapshot(trans, dir.bi_parent_subvol, &snapshot)); snapshot_overwrites = NULL; } - ret = bch2_inode_find_by_inum_snapshot(trans, dir.bi_dir, snapshot, &dir, 0); - if (ret) - goto out; + try(bch2_inode_find_by_inum_snapshot(trans, dir.bi_dir, snapshot, &dir, 0)); if (!(dir.bi_flags & BCH_INODE_has_case_insensitive)) { prt_printf(&buf, "parent of casefolded dir with has_case_insensitive not set\n"); - ret = bch2_inum_snapshot_to_path(trans, dir.bi_inum, dir.bi_snapshot, - snapshot_overwrites, &buf); - if (ret) - goto out; + try(bch2_inum_snapshot_to_path(trans, dir.bi_inum, dir.bi_snapshot, + snapshot_overwrites, &buf)); if (fsck_err(trans, inode_parent_has_case_insensitive_not_set, "%s", buf.buf)) { dir.bi_flags |= BCH_INODE_has_case_insensitive; - ret = __bch2_fsck_write_inode(trans, &dir); - if (ret) - goto out; + try(__bch2_fsck_write_inode(trans, &dir)); } } @@ -886,15 +876,11 @@ int bch2_check_inode_has_case_insensitive(struct btree_trans *trans, if (!repairing_parents) break; } -out: -fsck_err: - bch_err_fn(trans->c, ret); - if (ret) - return ret; if (repairing_parents) return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: bch_err_throw(trans->c, transaction_restart_nested); - return 0; +fsck_err: + return ret; } diff --git a/libbcachefs/fs/quota.c b/libbcachefs/fs/quota.c index 972ebc1e..b5b9dcbd 100644 --- a/libbcachefs/fs/quota.c +++ b/libbcachefs/fs/quota.c @@ -118,7 +118,7 @@ static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q) prt_printf(out, "d_fieldmask\t%x\n", q->d_fieldmask); prt_printf(out, "d_spc_hardlimit\t%llu\n", q->d_spc_hardlimit); prt_printf(out, "d_spc_softlimit\t%llu\n", q->d_spc_softlimit); - prt_printf(out, "d_ino_hardlimit\%llu\n", q->d_ino_hardlimit); + prt_printf(out, "d_ino_hardlimit\t%llu\n", q->d_ino_hardlimit); prt_printf(out, "d_ino_softlimit\t%llu\n", q->d_ino_softlimit); prt_printf(out, "d_space\t%llu\n", q->d_space); prt_printf(out, "d_ino_count\t%llu\n", q->d_ino_count); diff --git a/libbcachefs/fs/str_hash.c b/libbcachefs/fs/str_hash.c index 70c9b703..3d99c914 100644 --- a/libbcachefs/fs/str_hash.c +++ b/libbcachefs/fs/str_hash.c @@ -218,6 +218,50 @@ static 
noinline int check_inode_hash_info_matches_root(struct btree_trans *trans return 0; } +static int str_hash_dup_entries(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc *desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c k, + struct btree_iter *dup_iter, struct bkey_s_c dup_k, + bool *updated_before_k_pos) +{ + struct bch_fs *c = trans->c; + CLASS(printbuf, buf)(); + int ret = hash_pick_winner(trans, *desc, hash_info, k, dup_k); + if (ret < 0) + return ret; + + if (!fsck_err(trans, hash_table_key_duplicate, + "duplicate hash table keys%s:\n%s", + ret != 2 ? "" : ", both point to valid inodes", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), + prt_newline(&buf), + bch2_bkey_val_to_text(&buf, c, dup_k), + buf.buf))) + return 0; + + switch (ret) { + case 0: + try(bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0)); + break; + case 1: + try(bch2_hash_delete_at(trans, *desc, hash_info, dup_iter, 0)); + break; + case 2: + try(bch2_fsck_rename_dirent(trans, s, *desc, hash_info, + bkey_s_c_to_dirent(k), + updated_before_k_pos)); + try(bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0)); + break; + } + + return bch2_trans_commit_lazy(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); +fsck_err: + return ret; +} + /* Put a str_hash key in its proper location, checking for duplicates */ int bch2_str_hash_repair_key(struct btree_trans *trans, struct snapshots_seen *s, @@ -227,96 +271,65 @@ int bch2_str_hash_repair_key(struct btree_trans *trans, struct btree_iter *dup_iter, struct bkey_s_c dup_k, bool *updated_before_k_pos) { - struct bch_fs *c = trans->c; - CLASS(printbuf, buf)(); - bool free_snapshots_seen = false; - int ret = 0; + CLASS(snapshots_seen, s_onstack)(); if (!s) { - s = bch2_trans_kmalloc(trans, sizeof(*s)); - ret = PTR_ERR_OR_ZERO(s); - if (ret) - goto out; - + s = &s_onstack; s->pos = k_iter->pos; - darray_init(&s->ids); - ret = bch2_get_snapshot_overwrites(trans, desc->btree_id, k_iter->pos, &s->ids); - if (ret) - goto out; - - free_snapshots_seen = true; + try(bch2_get_snapshot_overwrites(trans, desc->btree_id, k_iter->pos, &s->ids)); } if (!dup_k.k) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto out; + struct bkey_i *new = errptr_try(bch2_bkey_make_mut_noupdate(trans, k)); - dup_k = bch2_hash_set_or_get_in_snapshot(trans, dup_iter, *desc, hash_info, + dup_k = bkey_try(bch2_hash_set_or_get_in_snapshot(trans, dup_iter, *desc, hash_info, (subvol_inum) { 0, new->k.p.inode }, new->k.p.snapshot, new, STR_HASH_must_create| - BTREE_ITER_with_updates| - BTREE_UPDATE_internal_snapshot_node); - ret = bkey_err(dup_k); - if (ret) - goto out; - if (dup_k.k) - goto duplicate_entries; + BTREE_UPDATE_internal_snapshot_node)); - if (bpos_lt(new->k.p, k.k->p)) - *updated_before_k_pos = true; - - ret = bch2_insert_snapshot_whiteouts(trans, desc->btree_id, - k_iter->pos, new->k.p) ?: - bch2_hash_delete_at(trans, *desc, hash_info, k_iter, - BTREE_ITER_with_updates| - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - bch_err_throw(c, transaction_restart_commit); - } else { -duplicate_entries: - ret = hash_pick_winner(trans, *desc, hash_info, k, dup_k); - if (ret < 0) - goto out; - - if (!fsck_err(trans, hash_table_key_duplicate, - "duplicate hash table keys%s:\n%s", - ret != 2 ? 
"" : ", both point to valid inodes", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), - prt_newline(&buf), - bch2_bkey_val_to_text(&buf, c, dup_k), - buf.buf))) - goto out; - - switch (ret) { - case 0: - ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); - break; - case 1: - ret = bch2_hash_delete_at(trans, *desc, hash_info, dup_iter, 0); - break; - case 2: - ret = bch2_fsck_rename_dirent(trans, s, *desc, hash_info, - bkey_s_c_to_dirent(k), - updated_before_k_pos) ?: - bch2_hash_delete_at(trans, *desc, hash_info, k_iter, - BTREE_ITER_with_updates); - goto out; + if (!dup_k.k) { + try(bch2_insert_snapshot_whiteouts(trans, desc->btree_id, + k_iter->pos, new->k.p)); + try(bch2_hash_delete_at(trans, *desc, hash_info, k_iter, + BTREE_UPDATE_internal_snapshot_node)); + try(bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new)); + try(bch2_trans_commit_lazy(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc)); } - - ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: - bch_err_throw(c, transaction_restart_commit); } -out: + + if (dup_k.k) + try(str_hash_dup_entries(trans, s, desc, hash_info, + k_iter, k, dup_iter, dup_k, + updated_before_k_pos)); + return 0; +} + +static int str_hash_bad_hash(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc *desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c hash_k, + bool *updated_before_k_pos, + struct btree_iter *iter, u64 hash) +{ + CLASS(printbuf, buf)(); + int ret = 0; + /* + * Before doing any repair, check hash_info itself: + */ + try(check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info)); + + if (fsck_err(trans, hash_table_key_wrong_offset, + "hash table key at wrong offset: should be at %llu\n%s", + hash, + (bch2_bkey_val_to_text(&buf, trans->c, hash_k), buf.buf))) + ret = bch2_str_hash_repair_key(trans, s, desc, hash_info, + k_iter, hash_k, + iter, bkey_s_c_null, + updated_before_k_pos); fsck_err: - bch2_trans_iter_exit(dup_iter); - if (free_snapshots_seen) - darray_exit(&s->ids); return ret; } @@ -327,57 +340,36 @@ int __bch2_str_hash_check_key(struct btree_trans *trans, struct btree_iter *k_iter, struct bkey_s_c hash_k, bool *updated_before_k_pos) { - struct bch_fs *c = trans->c; - struct btree_iter iter = {}; - CLASS(printbuf, buf)(); + u64 hash = desc->hash_bkey(hash_info, hash_k); + + CLASS(btree_iter, iter)(trans, desc->btree_id, + SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), + BTREE_ITER_slots); + + if (hash_k.k->p.offset < hash) + return str_hash_bad_hash(trans, s, desc, hash_info, k_iter, hash_k, + updated_before_k_pos, &iter, hash); + struct bkey_s_c k; int ret = 0; - - u64 hash = desc->hash_bkey(hash_info, hash_k); - if (hash_k.k->p.offset < hash) - goto bad_hash; - - bch2_trans_iter_init(trans, &iter, desc->btree_id, - SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), - BTREE_ITER_slots| - BTREE_ITER_with_updates); - for_each_btree_key_continue_norestart(iter, - BTREE_ITER_slots| - BTREE_ITER_with_updates, k, ret) { + BTREE_ITER_slots, k, ret) { if (bkey_eq(k.k->p, hash_k.k->p)) break; if (k.k->type == desc->key_type && !desc->cmp_bkey(k, hash_k)) { - ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, - hash_info) ?: - bch2_str_hash_repair_key(trans, s, desc, hash_info, - k_iter, hash_k, - &iter, k, updated_before_k_pos); + /* dup */ + try(check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info)); + try(bch2_str_hash_repair_key(trans, s, desc, hash_info, k_iter, hash_k, + &iter, k, 
updated_before_k_pos)); break; } if (bkey_deleted(k.k)) - goto bad_hash; + return str_hash_bad_hash(trans, s, desc, hash_info, k_iter, hash_k, + updated_before_k_pos, &iter, hash); } - bch2_trans_iter_exit(&iter); -fsck_err: - return ret; -bad_hash: - bch2_trans_iter_exit(&iter); - /* - * Before doing any repair, check hash_info itself: - */ - try(check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info)); - if (fsck_err(trans, hash_table_key_wrong_offset, - "hash table key at wrong offset: should be at %llu\n%s", - hash, - (bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) - ret = bch2_str_hash_repair_key(trans, s, desc, hash_info, - k_iter, hash_k, - &iter, bkey_s_c_null, - updated_before_k_pos); return ret; } diff --git a/libbcachefs/init/dev.c b/libbcachefs/init/dev.c index 8671db2e..511b8f5a 100644 --- a/libbcachefs/init/dev.c +++ b/libbcachefs/init/dev.c @@ -447,8 +447,13 @@ int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb, struct prin lockdep_assert_held(&c->state_lock); if (le64_to_cpu(sb->sb->seq) > - le64_to_cpu(c->disk_sb.sb->seq)) - bch2_sb_to_fs(c, sb->sb); + le64_to_cpu(c->disk_sb.sb->seq)) { + /* + * rewind, we'll lose some updates but it's not safe to call + * bch2_sb_to_fs() after fs is started + */ + sb->sb->seq = c->disk_sb.sb->seq; + } BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx)); @@ -628,11 +633,11 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags, goto err; } - ret = bch2_replicas_gc2(c); - if (ret) { - prt_printf(err, "bch2_replicas_gc2() error: %s\n", bch2_err_str(ret)); - goto err; - } + /* + * flushing the journal should be sufficient, but it's the write buffer + * flush that kills superblock replicas entries after they've gone to 0 + * so bch2_dev_has_data() returns the correct value: + */ data = bch2_dev_has_data(c, ca); if (data) { diff --git a/libbcachefs/journal/init.c b/libbcachefs/journal/init.c index df552cfb..e7410baa 100644 --- a/libbcachefs/journal/init.c +++ b/libbcachefs/journal/init.c @@ -9,6 +9,7 @@ #include "journal/seq_blacklist.h" #include "alloc/foreground.h" +#include "alloc/replicas.h" #include "btree/update.h" /* allocate journal on a device: */ @@ -440,11 +441,12 @@ int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq) if (journal_entry_empty(&i->j)) j->last_empty_seq = le64_to_cpu(i->j.seq); - p = journal_seq_pin(j, seq); - - p->devs.nr = 0; + struct bch_devs_list seq_devs = {}; darray_for_each(i->ptrs, ptr) - bch2_dev_list_add_dev(&p->devs, ptr->dev); + seq_devs.data[seq_devs.nr++] = ptr->dev; + + p = journal_seq_pin(j, seq); + bch2_devlist_to_replicas(&p->devs.e, BCH_DATA_journal, seq_devs); had_entries = true; } diff --git a/libbcachefs/journal/journal.c b/libbcachefs/journal/journal.c index 91d80258..2b167bf8 100644 --- a/libbcachefs/journal/journal.c +++ b/libbcachefs/journal/journal.c @@ -442,6 +442,7 @@ static int journal_entry_open(struct journal *j) buf->write_started = false; buf->write_allocated = false; buf->write_done = false; + buf->had_error = false; memset(buf->data, 0, sizeof(*buf->data)); buf->data->seq = cpu_to_le64(journal_cur_seq(j)); diff --git a/libbcachefs/journal/journal.h b/libbcachefs/journal/journal.h index 40c1eaa3..88e18364 100644 --- a/libbcachefs/journal/journal.h +++ b/libbcachefs/journal/journal.h @@ -410,20 +410,14 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re unsigned u64s, unsigned flags, struct btree_trans *trans) { - int ret; - EBUG_ON(res->ref); EBUG_ON(!test_bit(JOURNAL_running, &j->flags)); 
 	res->u64s = u64s;
 
-	if (journal_res_get_fast(j, res, flags))
-		goto out;
+	if (!journal_res_get_fast(j, res, flags))
+		try(bch2_journal_res_get_slowpath(j, res, flags, trans));
 
-	ret = bch2_journal_res_get_slowpath(j, res, flags, trans);
-	if (ret)
-		return ret;
-out:
 	if (!(flags & JOURNAL_RES_GET_CHECK)) {
 		lock_acquire_shared(&j->res_map, 0,
 				    (flags & JOURNAL_RES_GET_NONBLOCK) != 0,
diff --git a/libbcachefs/journal/reclaim.c b/libbcachefs/journal/reclaim.c
index 0c9bb010..d15b54fc 100644
--- a/libbcachefs/journal/reclaim.c
+++ b/libbcachefs/journal/reclaim.c
@@ -956,8 +956,8 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
 	scoped_guard(spinlock, &j->lock)
 		fifo_for_each_entry_ptr(p, &j->pin, iter)
 			if (dev_idx >= 0
-			    ? bch2_dev_list_has_dev(p->devs, dev_idx)
-			    : p->devs.nr < c->opts.metadata_replicas)
+			    ? bch2_replicas_entry_has_dev(&p->devs.e, dev_idx)
+			    : p->devs.e.nr_devs < c->opts.metadata_replicas)
 				seq = iter;
 
 	bch2_journal_flush_pins(j, seq);
@@ -981,13 +981,12 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
 	seq = 0;
 	scoped_guard(spinlock, &j->lock)
 		while (!ret) {
-			union bch_replicas_padded replicas;
-
 			seq = max(seq, journal_last_seq(j));
-			if (seq >= j->pin.back)
+			if (seq > j->seq_ondisk)
 				break;
-			bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
-						 journal_seq_pin(j, seq)->devs);
+
+			union bch_replicas_padded replicas;
+			memcpy(&replicas, &journal_seq_pin(j, seq)->devs, sizeof(replicas));
 			seq++;
 
 			if (replicas.e.nr_devs) {
@@ -1021,6 +1020,9 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
 		prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count));
 		guard(printbuf_indent)(out);
 
+		bch2_replicas_entry_to_text(out, &pin_list->devs.e);
+		prt_newline(out);
+
 		prt_printf(out, "unflushed:\n");
 		for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++)
 			list_for_each_entry(pin, &pin_list->unflushed[i], list)
diff --git a/libbcachefs/journal/reclaim.h b/libbcachefs/journal/reclaim.h
index 09332c7d..2578abfa 100644
--- a/libbcachefs/journal/reclaim.h
+++ b/libbcachefs/journal/reclaim.h
@@ -26,7 +26,7 @@ static inline void journal_pin_list_init(struct journal_entry_pin_list *p, int c
 	for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++)
 		INIT_LIST_HEAD(&p->flushed[i]);
 	atomic_set(&p->count, count);
-	p->devs.nr = 0;
+	p->devs.e.nr_devs = 0;
 	p->bytes = 0;
 }
 
diff --git a/libbcachefs/journal/types.h b/libbcachefs/journal/types.h
index 1687096f..43d9e842 100644
--- a/libbcachefs/journal/types.h
+++ b/libbcachefs/journal/types.h
@@ -5,6 +5,7 @@
 #include <linux/cache.h>
 #include <linux/workqueue.h>
 
+#include "alloc/replicas_types.h"
 #include "alloc/types.h"
 #include "init/dev_types.h"
 #include "util/fifo.h"
@@ -48,6 +49,7 @@ struct journal_buf {
 	bool			write_started:1;
 	bool			write_allocated:1;
 	bool			write_done:1;
+	bool			had_error:1;
 
 	u8			idx;
 };
@@ -70,7 +72,7 @@ struct journal_entry_pin_list {
 	struct list_head	unflushed[JOURNAL_PIN_TYPE_NR];
 	struct list_head	flushed[JOURNAL_PIN_TYPE_NR];
 	atomic_t		count;
-	struct bch_devs_list	devs;
+	union bch_replicas_padded devs;
 	size_t			bytes;
 };
 
@@ -113,7 +115,14 @@ union journal_res_state {
 
 /* bytes: */
 #define JOURNAL_ENTRY_SIZE_MIN	(64U << 10)		/* 64k */
-#define JOURNAL_ENTRY_SIZE_MAX	(4U << 22)		/* 16M */
+
+/*
+ * The block layer is fragile with large bios - it should be able to process any
+ * IO incrementally, but...
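+ * (UIO_MAXIOV is 1024, and bio_kmalloc() refuses more than UIO_MAXIOV
+ * segments; 1024 single-page segments at 4k per page is 4MB, which is
+ * where the cap below comes from)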
+ *
+ * 4MB corresponds to bio_kmalloc() -> UIO_MAXIOV
+ */
+#define JOURNAL_ENTRY_SIZE_MAX	(4U << 20)		/* 4M */
 
 /*
  * We stash some journal state as sentinel values in cur_entry_offset:
diff --git a/libbcachefs/journal/write.c b/libbcachefs/journal/write.c
index 0e6deef8..477ad0a0 100644
--- a/libbcachefs/journal/write.c
+++ b/libbcachefs/journal/write.c
@@ -188,7 +188,6 @@ static CLOSURE_CALLBACK(journal_write_done)
 	closure_type(w, struct journal_buf, io);
 	struct journal *j = container_of(w, struct journal, buf[w->idx]);
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	union bch_replicas_padded replicas;
 	u64 seq = le64_to_cpu(w->data->seq);
 	int err = 0;
 
@@ -196,14 +195,15 @@ static CLOSURE_CALLBACK(journal_write_done)
 		       ? j->flush_write_time
 		       : j->noflush_write_time, j->write_start_time);
 
-	if (!w->devs_written.nr) {
-		err = bch_err_throw(c, journal_write_err);
-	} else {
-		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
-					 w->devs_written);
-		err = bch2_mark_replicas(c, &replicas.e);
+	if (w->had_error) {
+		struct bch_replicas_entry_v1 *r = &journal_seq_pin(j, seq)->devs.e;
+
+		bch2_devlist_to_replicas(r, BCH_DATA_journal, w->devs_written);
 	}
 
+	if (!w->devs_written.nr)
+		err = bch_err_throw(c, journal_write_err);
+
 	if (err && !bch2_journal_error(j)) {
 		CLASS(printbuf, buf)();
 		bch2_log_msg_start(c, &buf);
@@ -222,8 +222,7 @@ static CLOSURE_CALLBACK(journal_write_done)
 	closure_debug_destroy(cl);
 
 	spin_lock(&j->lock);
-	if (seq >= j->pin.front)
-		journal_seq_pin(j, seq)->devs = w->devs_written;
+	BUG_ON(seq < j->pin.front);
 	if (err && (!j->err_seq || seq < j->err_seq))
 		j->err_seq = seq;
 	w->write_done = true;
@@ -334,6 +333,7 @@ static void journal_write_endio(struct bio *bio)
 		unsigned long flags;
 		spin_lock_irqsave(&j->err_lock, flags);
 		bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
+		w->had_error = true;
 		spin_unlock_irqrestore(&j->err_lock, flags);
 	}
 
@@ -632,7 +632,6 @@ CLOSURE_CALLBACK(bch2_journal_write)
 	closure_type(w, struct journal_buf, io);
 	struct journal *j = container_of(w, struct journal, buf[w->idx]);
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	union bch_replicas_padded replicas;
 	unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]);
 	int ret;
 
@@ -701,9 +700,9 @@ CLOSURE_CALLBACK(bch2_journal_write)
 	 * Mark journal replicas before we submit the write to guarantee
 	 * recovery will find the journal entries after a crash.
 	 */
-	bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
-				 w->devs_written);
-	ret = bch2_mark_replicas(c, &replicas.e);
+	struct bch_replicas_entry_v1 *r = &journal_seq_pin(j, le64_to_cpu(w->data->seq))->devs.e;
+	bch2_devlist_to_replicas(r, BCH_DATA_journal, w->devs_written);
+	ret = bch2_mark_replicas(c, r);
 	if (ret)
 		goto err;
 
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index 535170e3..da5c5e72 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -525,6 +525,37 @@ void bch2_opts_to_text(struct printbuf *out,
 	}
 }
 
+static int opt_hook_io(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum bch_opt_id id, bool post)
+{
+	if (!test_bit(BCH_FS_started, &c->flags))
+		return 0;
+
+	switch (id) {
+	case Opt_foreground_target:
+	case Opt_background_target:
+	case Opt_promote_target:
+	case Opt_compression:
+	case Opt_background_compression:
+	case Opt_data_checksum:
+	case Opt_data_replicas:
+	case Opt_erasure_code: {
+		struct rebalance_scan s = {
+			.type	= !inum ?
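+				/* no inode number means scan the whole filesystem */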
REBALANCE_SCAN_fs : REBALANCE_SCAN_inum, + .inum = inum, + }; + + try(bch2_set_rebalance_needs_scan(c, s)); + if (post) + bch2_rebalance_wakeup(c); + break; + } + default: + break; + } + + return 0; +} + int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum bch_opt_id id, u64 v, bool change) { @@ -546,16 +577,8 @@ int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum b break; } - if (change && - test_bit(BCH_FS_started, &c->flags) && - (id == Opt_foreground_target || - id == Opt_background_target || - id == Opt_promote_target || - id == Opt_compression || - id == Opt_background_compression || - id == Opt_data_checksum || - id == Opt_data_replicas)) - try(bch2_set_rebalance_needs_scan(c, inum)); + if (change) + try(opt_hook_io(c, ca, inum, id, false)); return 0; } @@ -571,17 +594,7 @@ int bch2_opts_hooks_pre_set(struct bch_fs *c) void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum bch_opt_id id, u64 v) { - if (test_bit(BCH_FS_started, &c->flags) && - (id == Opt_foreground_target || - id == Opt_background_target || - id == Opt_promote_target || - id == Opt_compression || - id == Opt_background_compression || - id == Opt_data_checksum || - id == Opt_data_replicas)) { - bch2_set_rebalance_needs_scan(c, inum); - bch2_rebalance_wakeup(c); - } + opt_hook_io(c, ca, inum, id, true); switch (id) { case Opt_rebalance_enabled: @@ -838,6 +851,7 @@ void bch2_inode_opts_get(struct bch_fs *c, struct bch_inode_opts *ret, bool meta ret->background_target = c->opts.metadata_target ?: c->opts.foreground_target; ret->data_replicas = c->opts.metadata_replicas; ret->data_checksum = c->opts.metadata_checksum; + ret->erasure_code = false; } else { bch2_io_opts_fixups(ret); } diff --git a/libbcachefs/sb/members.h b/libbcachefs/sb/members.h index f8cc0762..56ec9fd7 100644 --- a/libbcachefs/sb/members.h +++ b/libbcachefs/sb/members.h @@ -72,10 +72,7 @@ static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, unsigned dev) { - darray_for_each(devs, i) - if (*i == dev) - return true; - return false; + return darray_find(devs, dev) != NULL; } static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, diff --git a/libbcachefs/util/darray.h b/libbcachefs/util/darray.h index b4f284fe..b20ce3d9 100644 --- a/libbcachefs/util/darray.h +++ b/libbcachefs/util/darray.h @@ -96,7 +96,7 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t, bool); #define darray_find_p(_d, _i, cond) \ ({ \ - typeof((_d).data) _ret = NULL; \ + typeof(&(_d).data[0]) _ret = NULL; \ \ darray_for_each(_d, _i) \ if (cond) { \ diff --git a/libbcachefs/util/eytzinger.h b/libbcachefs/util/eytzinger.h index 643c1f71..b14ae1ff 100644 --- a/libbcachefs/util/eytzinger.h +++ b/libbcachefs/util/eytzinger.h @@ -278,20 +278,51 @@ static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size, return n - 1; } -#define eytzinger0_find(base, nr, size, _cmp, search) \ -({ \ - size_t _size = (size); \ - void *_base1 = (void *)(base) - _size; \ - const void *_search = (search); \ - size_t _nr = (nr); \ - size_t _i = 1; \ - int _res; \ - \ - while (_i <= _nr && \ - (_res = _cmp(_search, _base1 + _i * _size))) \ - _i = eytzinger1_child(_i, _res > 0); \ - _i - 1; \ -}) +/* 0 == not found */ +static inline int eytzinger1_find_r(void *base, unsigned nr, unsigned size, + cmp_r_func_t cmp_fn, const void *priv, + const void *search) +{ + unsigned i = 1; + while (i <= nr) { + int cmp = 
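+			/* nodes are 1-indexed: node i lives at base + i * size */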
cmp_fn(search, base + i * size, priv); + if (!cmp) + return i; + i = eytzinger1_child(i, cmp > 0); + } + + return 0; +} + +/* 0 == not found */ +static inline int eytzinger1_find(void *base, unsigned nr, unsigned size, + cmp_func_t cmp_fn, const void *search) +{ + unsigned i = 1; + while (i <= nr) { + int cmp = cmp_fn(search, base + i * size); + if (!cmp) + return i; + i = eytzinger1_child(i, cmp > 0); + } + + return 0; +} + +/* -1 == not found */ +static inline int eytzinger0_find_r(void *base, unsigned nr, unsigned size, + cmp_r_func_t cmp_fn, const void *priv, + const void *search) +{ + return eytzinger1_find_r(base - size, nr, size, cmp_fn, priv, search) - 1; +} + +/* -1 == not found */ +static inline int eytzinger0_find(void *base, unsigned nr, unsigned size, + cmp_func_t cmp_fn, const void *search) +{ + return eytzinger1_find(base - size, nr, size, cmp_fn, search) - 1; +} void eytzinger0_sort_r(void *, size_t, size_t, cmp_r_func_t, swap_r_func_t, const void *); diff --git a/libbcachefs/util/util.c b/libbcachefs/util/util.c index 5b1edd11..c9ecd942 100644 --- a/libbcachefs/util/util.c +++ b/libbcachefs/util/util.c @@ -612,24 +612,51 @@ void bch2_bio_map(struct bio *bio, void *base, size_t size) bio_add_virt_nofail(bio, base, size); } -int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) +int bch2_bio_alloc_pages(struct bio *bio, unsigned bs, size_t size, gfp_t gfp_mask) { + BUG_ON(size & (bs - 1)); + unsigned bs_pages = DIV_ROUND_UP(bs, PAGE_SIZE); + + /* + * XXX: we could do this by allocating higher order pages, but + * + * - the page allocator gets slower at a certain order (5?) - we'd have + * to check for this + * + * - bch2_bio_free_pages_pool() probably does not handle compound pages + * yet + */ + DARRAY_PREALLOCATED(struct page *, 16) pages; + darray_init(&pages); + darray_make_room_gfp(&pages, bs_pages, gfp_mask|__GFP_NOFAIL); + + int ret = 0; while (size) { - struct page *page = alloc_pages(gfp_mask, 0); - unsigned len = min_t(size_t, PAGE_SIZE, size); + while (pages.nr < bs_pages) { + struct page *page = alloc_pages(gfp_mask, 0); + if (!page) { + ret = -ENOMEM; + goto out; + } - if (!page) - return -ENOMEM; - - if (unlikely(!bio_add_page(bio, page, len, 0))) { - __free_page(page); - break; + BUG_ON(darray_push(&pages, page)); } - size -= len; - } + while (pages.nr) { + BUG_ON(!size); - return 0; + unsigned len = min(PAGE_SIZE, size); + size -= len; + + struct page *page = darray_pop(&pages); + BUG_ON(!bio_add_page(bio, page, len, 0)); + } + } +out: + darray_for_each(pages, i) + __free_page(*i); + darray_exit(&pages); + return ret; } u64 bch2_get_random_u64_below(u64 ceil) diff --git a/libbcachefs/util/util.h b/libbcachefs/util/util.h index 05da1468..70e2b84c 100644 --- a/libbcachefs/util/util.h +++ b/libbcachefs/util/util.h @@ -370,7 +370,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) } void bch2_bio_map(struct bio *bio, void *base, size_t); -int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); +int bch2_bio_alloc_pages(struct bio *, unsigned, size_t, gfp_t); #define closure_bio_submit(bio, cl) \ do { \ diff --git a/libbcachefs/vfs/fs.c b/libbcachefs/vfs/fs.c index dea16ea6..2b367dd1 100644 --- a/libbcachefs/vfs/fs.c +++ b/libbcachefs/vfs/fs.c @@ -123,7 +123,10 @@ static int bch2_write_inode_trans(struct btree_trans *trans, struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u); *rebalance_changed = memcmp(&old_r, &new_r, sizeof(new_r)); if (*rebalance_changed) - 
try(bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum)); + try(bch2_set_rebalance_needs_scan_trans(trans, + (struct rebalance_scan) { + .type = REBALANCE_SCAN_inum, + .inum = inode_u.bi_inum })); try(bch2_inode_write(trans, &iter, &inode_u)); try(bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc));
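
A note on the eytzinger0_find() rework above: it is now a proper function
returning -1 on a missed lookup, where the old macro yielded an out-of-range
index >= nr. Unsigned callers that test idx < nr keep working either way,
since (unsigned) -1 also compares >= nr. A minimal caller sketch, assuming a
u64 table already in eytzinger0 order - lookup_u64() and cmp_u64() are
illustrative, not part of the patch:

	static int cmp_u64(const void *_l, const void *_r)
	{
		const u64 *l = _l, *r = _r;

		return *l < *r ? -1 : *l > *r ? 1 : 0;
	}

	static const u64 *lookup_u64(const u64 *tbl, unsigned nr, u64 search)
	{
		int idx = eytzinger0_find((void *) tbl, nr, sizeof(tbl[0]),
					  cmp_u64, &search);

		return idx >= 0 ? tbl + idx : NULL;	/* -1 == not found */
	}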