diff --git a/.bcachefs_revision b/.bcachefs_revision index 667f3d1d..10afd2ea 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -d7354b97c0100568c5696b192e30335a3666062f +3adea2c857ddebd719c40731b113640d94984a9a diff --git a/libbcachefs/Makefile b/libbcachefs/Makefile index f86b5f2d..da60bbc0 100644 --- a/libbcachefs/Makefile +++ b/libbcachefs/Makefile @@ -107,6 +107,7 @@ bcachefs-y := \ util/two_state_shared_lock.o \ util/util.o \ util/varint.o \ + vendor/bio_iov_iter.o \ vendor/closure.o \ vendor/min_heap.o \ vfs/fiemap.o \ diff --git a/libbcachefs/alloc/replicas.c b/libbcachefs/alloc/replicas.c index f0722e5b..7b32f371 100644 --- a/libbcachefs/alloc/replicas.c +++ b/libbcachefs/alloc/replicas.c @@ -16,40 +16,25 @@ DEFINE_CLASS(bch_replicas_cpu, struct bch_replicas_cpu, kfree(_T.entries), (struct bch_replicas_cpu) {}, void) -static inline struct bch_replicas_entry_cpu * +static inline struct bch_replicas_entry_v1 * cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) { return (void *) r->entries + r->entry_size * i; } -static inline unsigned __cpu_replicas_entry_bytes(unsigned v1_bytes) -{ - return offsetof(struct bch_replicas_entry_cpu, e) + v1_bytes; -} - -static inline unsigned cpu_replicas_entry_bytes(struct bch_replicas_entry_cpu *e) -{ - return __cpu_replicas_entry_bytes(replicas_entry_bytes(&e->e)); -} - #define for_each_cpu_replicas_entry(_r, _i) \ - for (struct bch_replicas_entry_cpu *_i = (_r)->entries; \ + for (struct bch_replicas_entry_v1 *_i = (_r)->entries; \ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size; \ _i = (void *) (_i) + (_r)->entry_size) static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, struct bch_replicas_cpu *); -static int cpu_replicas_entry_cmp(const struct bch_replicas_entry_cpu *l, - const struct bch_replicas_entry_cpu *r, - size_t size) +/* Some (buggy!) 
compilers don't allow memcmp to be passed as a pointer */ +static int bch2_memcmp(const void *l, const void *r, const void *priv) { - return memcmp(&l->e, &r->e, size - offsetof(struct bch_replicas_entry_cpu, e)); -} - -static int cpu_replicas_entry_cmp_r(const void *l, const void *r, const void *priv) -{ - return cpu_replicas_entry_cmp(l, r, (size_t) priv); + size_t size = (size_t) priv; + return memcmp(l, r, size); } /* Replicas tracking - in memory: */ @@ -75,8 +60,7 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e) static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) { eytzinger0_sort_r(r->entries, r->nr, r->entry_size, - cpu_replicas_entry_cmp_r, NULL, - (void *)(size_t)r->entry_size); + bch2_memcmp, NULL, (void *)(size_t)r->entry_size); } static void bch2_replicas_entry_v0_to_text(struct printbuf *out, @@ -101,13 +85,6 @@ void bch2_replicas_entry_to_text(struct printbuf *out, prt_printf(out, "]"); } -static void bch2_replicas_entry_cpu_to_text(struct printbuf *out, - struct bch_replicas_entry_cpu *e) -{ - prt_printf(out, "ref=%u ", atomic_read(&e->ref)); - bch2_replicas_entry_to_text(out, &e->e); -} - static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r, struct bch_sb *sb, struct printbuf *err) @@ -174,7 +151,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out, prt_printf(out, " "); first = false; - bch2_replicas_entry_cpu_to_text(out, i); + bch2_replicas_entry_to_text(out, i); } } @@ -255,44 +232,6 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e, bch2_replicas_entry_sort(e); } -/* @l is bch_replicas_entry_v1, @r is bch_replicas_entry_cpu */ -static int replicas_entry_search_cmp(const void *_l, const void *_r, const void *priv) -{ - const struct bch_replicas_entry_v1 *l = _l; - const struct bch_replicas_entry_cpu *r = _r; - size_t size = (size_t) priv; - - return memcmp(l, &r->e, size); -} - -static inline struct bch_replicas_entry_cpu * -replicas_entry_search(struct bch_replicas_cpu *r, - struct bch_replicas_entry_v1 *search) -{ - verify_replicas_entry(search); - - size_t entry_size = replicas_entry_bytes(search); - int idx = likely(__cpu_replicas_entry_bytes(entry_size) <= r->entry_size) - ? eytzinger0_find_r(r->entries, r->nr, r->entry_size, - replicas_entry_search_cmp, - (void *) entry_size, search) - : -1; - return idx >= 0 ? 
cpu_replicas_entry(r, idx) : NULL; -} - -bool bch2_replicas_marked_locked(struct bch_fs *c, - struct bch_replicas_entry_v1 *search) -{ - return !search->nr_devs || replicas_entry_search(&c->replicas, search); -} - -bool bch2_replicas_marked(struct bch_fs *c, - struct bch_replicas_entry_v1 *search) -{ - guard(percpu_read)(&c->mark_lock); - return bch2_replicas_marked_locked(c, search); -} - static struct bch_replicas_cpu cpu_replicas_add_entry(struct bch_fs *c, struct bch_replicas_cpu *old, @@ -301,12 +240,9 @@ cpu_replicas_add_entry(struct bch_fs *c, struct bch_replicas_cpu new = { .nr = old->nr + 1, .entry_size = max_t(unsigned, old->entry_size, - __cpu_replicas_entry_bytes(replicas_entry_bytes(new_entry))), + replicas_entry_bytes(new_entry)), }; - /* alignment */ - new.entry_size = round_up(new.entry_size, sizeof(atomic_t)); - new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL); if (!new.entries) return new; @@ -316,7 +252,7 @@ cpu_replicas_add_entry(struct bch_fs *c, cpu_replicas_entry(old, i), old->entry_size); - memcpy(&cpu_replicas_entry(&new, old->nr)->e, + memcpy(cpu_replicas_entry(&new, old->nr), new_entry, replicas_entry_bytes(new_entry)); @@ -324,56 +260,152 @@ cpu_replicas_add_entry(struct bch_fs *c, return new; } +static inline struct bch_replicas_entry_v1 * +replicas_entry_search(struct bch_replicas_cpu *r, + struct bch_replicas_entry_v1 *search) +{ + verify_replicas_entry(search); + + size_t entry_size = replicas_entry_bytes(search); + int idx = likely(entry_size <= r->entry_size) + ? eytzinger0_find_r(r->entries, r->nr, r->entry_size, + bch2_memcmp, (void *) entry_size, search) + : -1; + return idx >= 0 ? cpu_replicas_entry(r, idx) : NULL; +} + +bool bch2_replicas_marked_locked(struct bch_fs *c, + struct bch_replicas_entry_v1 *search) +{ + return !search->nr_devs || + (replicas_entry_search(&c->replicas, search) && + (likely((!c->replicas_gc.entries)) || + replicas_entry_search(&c->replicas_gc, search))); +} + +bool bch2_replicas_marked(struct bch_fs *c, + struct bch_replicas_entry_v1 *search) +{ + guard(percpu_read)(&c->mark_lock); + return bch2_replicas_marked_locked(c, search); +} + noinline static int bch2_mark_replicas_slowpath(struct bch_fs *c, - struct bch_replicas_entry_v1 *new_entry, - unsigned ref) + struct bch_replicas_entry_v1 *new_entry) { verify_replicas_entry(new_entry); + CLASS(bch_replicas_cpu, new_r)(); + CLASS(bch_replicas_cpu, new_gc)(); + guard(mutex)(&c->sb_lock); - bool write_sb = false; - scoped_guard(percpu_write, &c->mark_lock) { - if (!replicas_entry_search(&c->replicas, new_entry)) { - CLASS(bch_replicas_cpu, new_r)(); - - new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry); - if (!new_r.entries) - return bch_err_throw(c, ENOMEM_cpu_replicas); - - try(bch2_cpu_replicas_to_sb_replicas(c, &new_r)); - - swap(c->replicas, new_r); - write_sb = true; - } - - atomic_add(ref, &replicas_entry_search(&c->replicas, new_entry)->ref); + if (c->replicas_gc.entries && + !replicas_entry_search(&c->replicas_gc, new_entry)) { + new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry); + if (!new_gc.entries) + return bch_err_throw(c, ENOMEM_cpu_replicas); } - /* After dropping mark_lock */ - if (write_sb) + if (!replicas_entry_search(&c->replicas, new_entry)) { + new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry); + if (!new_r.entries) + return bch_err_throw(c, ENOMEM_cpu_replicas); + + try(bch2_cpu_replicas_to_sb_replicas(c, &new_r)); + } + + if (!new_r.entries && + !new_gc.entries) + return 0; + + /* allocations done, now commit: */ + + 
if (new_r.entries) bch2_write_super(c); + /* don't update in memory replicas until changes are persistent */ + scoped_guard(percpu_write, &c->mark_lock) { + if (new_r.entries) + swap(c->replicas, new_r); + if (new_gc.entries) + swap(new_gc, c->replicas_gc); + } + return 0; } int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r) { return likely(bch2_replicas_marked(c, r)) - ? 0 : bch2_mark_replicas_slowpath(c, r, 0); + ? 0 : bch2_mark_replicas_slowpath(c, r); } -static void __replicas_entry_kill(struct bch_fs *c, struct bch_replicas_entry_cpu *e) +/* + * Old replicas_gc mechanism: only used for journal replicas entries now, should + * die at some point: + */ + +int bch2_replicas_gc_end(struct bch_fs *c, int ret) { - struct bch_replicas_cpu *r = &c->replicas; + lockdep_assert_held(&c->replicas_gc_lock); - memcpy(e, cpu_replicas_entry(r, --r->nr), r->entry_size); - bch2_cpu_replicas_sort(r); + guard(mutex)(&c->sb_lock); + scoped_guard(percpu_write, &c->mark_lock) { + ret = ret ?: + bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc); + if (!ret) + swap(c->replicas, c->replicas_gc); - int ret = bch2_cpu_replicas_to_sb_replicas(c, r); - if (WARN(ret, "bch2_cpu_replicas_to_sb_replicas() error: %s", bch2_err_str(ret))) - return; + kfree(c->replicas_gc.entries); + c->replicas_gc.entries = NULL; + } + + if (!ret) + bch2_write_super(c); + + return ret; +} + +int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) +{ + lockdep_assert_held(&c->replicas_gc_lock); + + guard(mutex)(&c->sb_lock); + BUG_ON(c->replicas_gc.entries); + + c->replicas_gc.nr = 0; + c->replicas_gc.entry_size = 0; + + for_each_cpu_replicas_entry(&c->replicas, e) { + /* Preserve unknown data types */ + if (e->data_type >= BCH_DATA_NR || + !(BIT(e->data_type) & typemask)) { + c->replicas_gc.nr++; + c->replicas_gc.entry_size = + max_t(unsigned, c->replicas_gc.entry_size, + replicas_entry_bytes(e)); + } + } + + c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, + c->replicas_gc.entry_size, + GFP_KERNEL); + if (!c->replicas_gc.entries) { + bch_err(c, "error allocating c->replicas_gc"); + return bch_err_throw(c, ENOMEM_replicas_gc); + } + + unsigned i = 0; + for_each_cpu_replicas_entry(&c->replicas, e) + if (e->data_type >= BCH_DATA_NR || + !(BIT(e->data_type) & typemask)) + memcpy(cpu_replicas_entry(&c->replicas_gc, i++), + e, c->replicas_gc.entry_size); + + bch2_cpu_replicas_sort(&c->replicas_gc); + return 0; } void bch2_replicas_entry_kill(struct bch_fs *c, struct bch_replicas_entry_v1 *kill) @@ -381,95 +413,18 @@ void bch2_replicas_entry_kill(struct bch_fs *c, struct bch_replicas_entry_v1 *ki lockdep_assert_held(&c->mark_lock); lockdep_assert_held(&c->sb_lock); - struct bch_replicas_entry_cpu *e = replicas_entry_search(&c->replicas, kill); + struct bch_replicas_cpu *r = &c->replicas; + struct bch_replicas_entry_v1 *e = replicas_entry_search(&c->replicas, kill); if (WARN(!e, "replicas entry not found in sb")) return; - __replicas_entry_kill(c, e); + memcpy(e, cpu_replicas_entry(r, --r->nr), r->entry_size); - /* caller does write_super() after dropping mark_lock */ -} + bch2_cpu_replicas_sort(r); -void bch2_replicas_entry_put_many(struct bch_fs *c, struct bch_replicas_entry_v1 *r, unsigned nr) -{ - if (!r->nr_devs) - return; - - BUG_ON(r->data_type != BCH_DATA_journal); - verify_replicas_entry(r); - - scoped_guard(percpu_read, &c->mark_lock) { - struct bch_replicas_entry_cpu *e = replicas_entry_search(&c->replicas, r); - - int v = atomic_sub_return(nr, &e->ref); - BUG_ON(v < 0); - if (v) - return; - } - - 
guard(mutex)(&c->sb_lock); - scoped_guard(percpu_write, &c->mark_lock) { - struct bch_replicas_entry_cpu *e = replicas_entry_search(&c->replicas, r); - if (e && !atomic_read(&e->ref)) - __replicas_entry_kill(c, e); - } - - bch2_write_super(c); -} - -static inline bool bch2_replicas_entry_get_inmem(struct bch_fs *c, struct bch_replicas_entry_v1 *r) -{ - guard(percpu_read)(&c->mark_lock); - struct bch_replicas_entry_cpu *e = replicas_entry_search(&c->replicas, r); - if (e) - atomic_inc(&e->ref); - return e != NULL; -} - -int bch2_replicas_entry_get(struct bch_fs *c, struct bch_replicas_entry_v1 *r) -{ - if (!r->nr_devs) - return 0; - - BUG_ON(r->data_type != BCH_DATA_journal); - verify_replicas_entry(r); - - return bch2_replicas_entry_get_inmem(c, r) - ? 0 - : bch2_mark_replicas_slowpath(c, r, 1); -} - -int bch2_replicas_gc_reffed(struct bch_fs *c) -{ - bool write_sb = false; - - guard(mutex)(&c->sb_lock); - - scoped_guard(percpu_write, &c->mark_lock) { - unsigned dst = 0; - for (unsigned i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry_cpu *e = - cpu_replicas_entry(&c->replicas, i); - - if (e->e.data_type != BCH_DATA_journal || - atomic_read(&e->ref)) - memcpy(cpu_replicas_entry(&c->replicas, dst++), - e, - c->replicas.entry_size); - } - - if (c->replicas.nr != dst) { - c->replicas.nr = dst; - bch2_cpu_replicas_sort(&c->replicas); - - try(bch2_cpu_replicas_to_sb_replicas(c, &c->replicas)); - } - } - - if (write_sb) - bch2_write_super(c); - return 0; + int ret = bch2_cpu_replicas_to_sb_replicas(c, r); + WARN(ret, "bch2_cpu_replicas_to_sb_replicas() error: %s", bch2_err_str(ret)); } /* Replicas tracking - superblock: */ @@ -486,9 +441,6 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, nr++; } - entry_size = __cpu_replicas_entry_bytes(entry_size); - entry_size = round_up(entry_size, sizeof(atomic_t)); - cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); if (!cpu_r->entries) return -BCH_ERR_ENOMEM_cpu_replicas; @@ -496,10 +448,10 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, cpu_r->nr = nr; cpu_r->entry_size = entry_size; - for_each_replicas_entry(sb_r, src) { - struct bch_replicas_entry_cpu *dst = cpu_replicas_entry(cpu_r, idx++); - memcpy(&dst->e, src, replicas_entry_bytes(src)); - bch2_replicas_entry_sort(&dst->e); + for_each_replicas_entry(sb_r, e) { + struct bch_replicas_entry_v1 *dst = cpu_replicas_entry(cpu_r, idx++); + memcpy(dst, e, replicas_entry_bytes(e)); + bch2_replicas_entry_sort(dst); } return 0; @@ -517,13 +469,9 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, nr++; } - entry_size = __cpu_replicas_entry_bytes(entry_size); - entry_size += sizeof(struct bch_replicas_entry_v1) - sizeof(struct bch_replicas_entry_v0); - entry_size = round_up(entry_size, sizeof(atomic_t)); - cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); if (!cpu_r->entries) return -BCH_ERR_ENOMEM_cpu_replicas; @@ -532,14 +480,14 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, cpu_r->entry_size = entry_size; for_each_replicas_entry(sb_r, src) { - struct bch_replicas_entry_cpu *dst = + struct bch_replicas_entry_v1 *dst = cpu_replicas_entry(cpu_r, idx++); - dst->e.data_type = src->data_type; - dst->e.nr_devs = src->nr_devs; - dst->e.nr_required = 1; - memcpy(dst->e.devs, src->devs, src->nr_devs); - bch2_replicas_entry_sort(&dst->e); + dst->data_type = src->data_type; + dst->nr_devs = src->nr_devs; + dst->nr_required = 1; + memcpy(dst->devs, src->devs, src->nr_devs); + 
bch2_replicas_entry_sort(dst); } return 0; @@ -547,12 +495,6 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) { - /* - * If called after fs is started (after journal read), we'll be blowing - * away refcounts - */ - BUG_ON(test_bit(BCH_FS_started, &c->flags)); - struct bch_sb_field_replicas *sb_v1; struct bch_sb_field_replicas_v0 *sb_v0; CLASS(bch_replicas_cpu, new_r)(); @@ -580,7 +522,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, bytes = sizeof(struct bch_sb_field_replicas); for_each_cpu_replicas_entry(r, src) - bytes += replicas_entry_bytes(&src->e) - 1; + bytes += replicas_entry_bytes(src) - 1; sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0, DIV_ROUND_UP(bytes, sizeof(u64))); @@ -596,9 +538,9 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, dst = sb_r->entries; for_each_cpu_replicas_entry(r, src) { - dst->data_type = src->e.data_type; - dst->nr_devs = src->e.nr_devs; - memcpy(dst->devs, src->e.devs, src->e.nr_devs); + dst->data_type = src->data_type; + dst->nr_devs = src->nr_devs; + memcpy(dst->devs, src->devs, src->nr_devs); dst = replicas_entry_next(dst); @@ -619,8 +561,8 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, bytes = sizeof(struct bch_sb_field_replicas); for_each_cpu_replicas_entry(r, src) { - bytes += replicas_entry_bytes(&src->e); - if (src->e.nr_required != 1) + bytes += replicas_entry_bytes(src); + if (src->nr_required != 1) need_v1 = true; } @@ -641,7 +583,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, dst = sb_r->entries; for_each_cpu_replicas_entry(r, src) { - memcpy(dst, &src->e, replicas_entry_bytes(&src->e)); + memcpy(dst, src, replicas_entry_bytes(src)); dst = replicas_entry_next(dst); @@ -660,26 +602,24 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, sort_r(cpu_r->entries, cpu_r->nr, cpu_r->entry_size, - cpu_replicas_entry_cmp_r, NULL, + bch2_memcmp, NULL, (void *)(size_t)cpu_r->entry_size); for (i = 0; i < cpu_r->nr; i++) { - struct bch_replicas_entry_cpu *e = + struct bch_replicas_entry_v1 *e = cpu_replicas_entry(cpu_r, i); - try(bch2_replicas_entry_sb_validate(&e->e, sb, err)); + try(bch2_replicas_entry_sb_validate(e, sb, err)); if (i + 1 < cpu_r->nr) { - struct bch_replicas_entry_cpu *n = + struct bch_replicas_entry_v1 *n = cpu_replicas_entry(cpu_r, i + 1); - int cmp = cpu_replicas_entry_cmp(e, n, cpu_r->entry_size); + BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); - BUG_ON(cmp > 0); - - if (!cmp) { + if (!memcmp(e, n, cpu_r->entry_size)) { prt_printf(err, "duplicate replicas entry "); - bch2_replicas_entry_to_text(err, &e->e); + bch2_replicas_entry_to_text(err, e); return -BCH_ERR_invalid_sb_replicas; } } @@ -762,9 +702,7 @@ bool bch2_can_read_fs_with_devs(struct bch_fs *c, struct bch_devs_mask devs, unsigned flags, struct printbuf *err) { guard(percpu_read)(&c->mark_lock); - for_each_cpu_replicas_entry(&c->replicas, i) { - struct bch_replicas_entry_v1 *e = &i->e; - + for_each_cpu_replicas_entry(&c->replicas, e) { unsigned nr_online = 0, nr_failed = 0, dflags = 0; bool metadata = e->data_type < BCH_DATA_user; @@ -882,25 +820,6 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, return bch2_can_read_fs_with_devs(c, devs, flags, err); } -bool bch2_sb_has_journal(struct bch_sb *sb) -{ - struct bch_sb_field_replicas *replicas = bch2_sb_field_get(sb, replicas); - struct bch_sb_field_replicas_v0 *replicas_v0 = bch2_sb_field_get(sb, replicas_v0); - - 
if (replicas) { - for_each_replicas_entry(replicas, r) - if (r->data_type == BCH_DATA_journal) - return true; - } else if (replicas_v0) { - for_each_replicas_entry(replicas_v0, r) - if (r->data_type == BCH_DATA_journal) - return true; - } - - - return false; -} - unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) { struct bch_sb_field_replicas *replicas; @@ -944,4 +863,5 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) void bch2_fs_replicas_exit(struct bch_fs *c) { kfree(c->replicas.entries); + kfree(c->replicas_gc.entries); } diff --git a/libbcachefs/alloc/replicas.h b/libbcachefs/alloc/replicas.h index f9743ec5..b2b86089 100644 --- a/libbcachefs/alloc/replicas.h +++ b/libbcachefs/alloc/replicas.h @@ -39,22 +39,13 @@ bool bch2_can_read_fs_with_devs(struct bch_fs *, struct bch_devs_mask, bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask, unsigned, struct printbuf *, bool); -bool bch2_sb_has_journal(struct bch_sb *); unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned); unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); -void bch2_replicas_entry_put_many(struct bch_fs *, struct bch_replicas_entry_v1 *, unsigned); -static inline void bch2_replicas_entry_put(struct bch_fs *c, struct bch_replicas_entry_v1 *r) -{ - bch2_replicas_entry_put_many(c, r, 1); -} - -int bch2_replicas_entry_get(struct bch_fs *, struct bch_replicas_entry_v1 *); - +int bch2_replicas_gc_end(struct bch_fs *, int); +int bch2_replicas_gc_start(struct bch_fs *, unsigned); void bch2_replicas_entry_kill(struct bch_fs *, struct bch_replicas_entry_v1 *); -int bch2_replicas_gc_reffed(struct bch_fs *); - static inline bool bch2_replicas_entry_has_dev(struct bch_replicas_entry_v1 *r, unsigned dev) { for (unsigned i = 0; i < r->nr_devs; i++) @@ -63,12 +54,6 @@ static inline bool bch2_replicas_entry_has_dev(struct bch_replicas_entry_v1 *r, return false; } -static inline bool bch2_replicas_entry_eq(struct bch_replicas_entry_v1 *l, - struct bch_replicas_entry_v1 *r) -{ - return l->nr_devs == r->nr_devs && !memcmp(l, r, replicas_entry_bytes(l)); -} - /* iterate over superblock replicas - used by userspace tools: */ #define replicas_entry_next(_i) \ diff --git a/libbcachefs/alloc/replicas_types.h b/libbcachefs/alloc/replicas_types.h index 50d8f87c..418e702e 100644 --- a/libbcachefs/alloc/replicas_types.h +++ b/libbcachefs/alloc/replicas_types.h @@ -2,16 +2,10 @@ #ifndef _BCACHEFS_REPLICAS_TYPES_H #define _BCACHEFS_REPLICAS_TYPES_H -/* unsized - bch_replicas_entry_v1 is variable length */ -struct bch_replicas_entry_cpu { - atomic_t ref; - struct bch_replicas_entry_v1 e; -}; - struct bch_replicas_cpu { - unsigned nr; - unsigned entry_size; - struct bch_replicas_entry_cpu *entries; + unsigned nr; + unsigned entry_size; + struct bch_replicas_entry_v1 *entries; }; union bch_replicas_padded { diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 9fd7a62e..8539b4e1 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -808,6 +808,8 @@ struct bch_fs { struct bch_accounting_mem accounting; struct bch_replicas_cpu replicas; + struct bch_replicas_cpu replicas_gc; + struct mutex replicas_gc_lock; struct journal_entry_res btree_root_journal_res; struct journal_entry_res clock_journal_res; diff --git a/libbcachefs/btree/key_cache.c b/libbcachefs/btree/key_cache.c index f2e213d2..0532e9a9 100644 --- a/libbcachefs/btree/key_cache.c +++ b/libbcachefs/btree/key_cache.c @@ -438,10 +438,10 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, * sequence 
number with a new btree node write, we want to re-journal * the update */ - if (ck->journal.seq == j->last_seq) + if (ck->journal.seq == journal_last_seq(j)) commit_flags |= BCH_WATERMARK_reclaim; - if (ck->journal.seq != j->last_seq || + if (ck->journal.seq != journal_last_seq(j) || !journal_low_on_space(&c->journal)) commit_flags |= BCH_TRANS_COMMIT_no_journal_res; diff --git a/libbcachefs/data/ec.c b/libbcachefs/data/ec.c index 5d62d98c..9b701c98 100644 --- a/libbcachefs/data/ec.c +++ b/libbcachefs/data/ec.c @@ -1030,11 +1030,24 @@ static int ec_stripe_key_update(struct btree_trans *trans, return bch2_trans_update(trans, &iter, &new->k_i, 0); } +struct stripe_update_bucket_stats { + u32 nr_bp_to_deleted; + u32 nr_no_match; + u32 nr_cached; + u32 nr_done; + + u32 sectors_bp_to_deleted; + u32 sectors_no_match; + u32 sectors_cached; + u32 sectors_done; +}; + static int ec_stripe_update_extent(struct btree_trans *trans, struct bch_dev *ca, struct bpos bucket, u8 gen, struct ec_stripe_buf *s, struct bkey_s_c_backpointer bp, + struct stripe_update_bucket_stats *stats, struct wb_maybe_flush *last_flushed) { struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; @@ -1063,6 +1076,9 @@ static int ec_stripe_update_extent(struct btree_trans *trans, * extent no longer exists - we could flush the btree * write buffer and retry to verify, but no need: */ + stats->nr_bp_to_deleted++; + stats->sectors_bp_to_deleted += bp.v->bucket_len; + count_event(c, ec_stripe_update_extent_fail); return 0; } @@ -1075,8 +1091,18 @@ static int ec_stripe_update_extent(struct btree_trans *trans, * It doesn't generally make sense to erasure code cached ptrs: * XXX: should we be incrementing a counter? */ - if (!ptr_c || ptr_c->cached) + if (!ptr_c) { + stats->nr_no_match++; + stats->sectors_no_match += bp.v->bucket_len; + count_event(c, ec_stripe_update_extent_fail); return 0; + } + if (ptr_c->cached) { + stats->nr_cached++; + stats->sectors_cached += bp.v->bucket_len; + count_event(c, ec_stripe_update_extent_fail); + return 0; + } unsigned dev = v->ptrs[block].dev; @@ -1106,6 +1132,14 @@ static int ec_stripe_update_extent(struct btree_trans *trans, try(bch2_bkey_set_needs_rebalance(trans->c, &opts, n, SET_NEEDS_REBALANCE_other, 0)); try(bch2_trans_update(trans, &iter, n, 0)); + try(bch2_trans_commit(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc)); + + stats->nr_done++; + stats->sectors_done += bp.v->bucket_len; + + count_event(c, ec_stripe_update_extent); return 0; } @@ -1126,12 +1160,11 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b struct wb_maybe_flush last_flushed __cleanup(wb_maybe_flush_exit); wb_maybe_flush_init(&last_flushed); - return for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers, + struct stripe_update_bucket_stats stats = {}; + + try(for_each_btree_key_max(trans, bp_iter, BTREE_ID_backpointers, bucket_pos_to_bp_start(ca, bucket_pos), - bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k, - NULL, NULL, - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc, ({ + bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k, ({ if (bkey_ge(bp_k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket_pos), 0))) break; @@ -1143,8 +1176,26 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b continue; wb_maybe_flush_inc(&last_flushed); - ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, bp, &last_flushed); - })); + ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, bp, + &stats, 
&last_flushed); + }))); + + if (trace_stripe_update_bucket_enabled()) { + CLASS(printbuf, buf)(); + + prt_printf(&buf, "bp_to_deleted:\t%u %u\n", + stats.nr_bp_to_deleted, stats.sectors_bp_to_deleted); + prt_printf(&buf, "no_match:\t%u %u\n", + stats.nr_no_match, stats.sectors_no_match); + prt_printf(&buf, "cached:\t%u %u\n", + stats.nr_cached, stats.sectors_cached); + prt_printf(&buf, "done:\t%u %u\n", + stats.nr_done, stats.sectors_done); + + trace_stripe_update_bucket(c, buf.buf); + } + + return 0; } static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) diff --git a/libbcachefs/data/rebalance.c b/libbcachefs/data/rebalance.c index 28a96cf6..0e0e69a8 100644 --- a/libbcachefs/data/rebalance.c +++ b/libbcachefs/data/rebalance.c @@ -306,9 +306,11 @@ int bch2_bkey_get_io_opts(struct btree_trans *trans, bch2_inode_opts_get_inode(c, &inode, opts); } } else { - if (snapshot_opts->fs_io_opts.change_cookie != atomic_read(&c->opt_change_cookie)) { + if (snapshot_opts->fs_io_opts.change_cookie != atomic_read(&c->opt_change_cookie) || + snapshot_opts->metadata != metadata) { bch2_inode_opts_get(c, &snapshot_opts->fs_io_opts, metadata); + snapshot_opts->metadata = metadata; snapshot_opts->cur_inum = 0; snapshot_opts->d.nr = 0; } diff --git a/libbcachefs/data/rebalance.h b/libbcachefs/data/rebalance.h index 6b46d3e0..97755d67 100644 --- a/libbcachefs/data/rebalance.h +++ b/libbcachefs/data/rebalance.h @@ -52,6 +52,8 @@ struct snapshot_io_opts_entry { struct per_snapshot_io_opts { u64 cur_inum; + bool metadata; + struct bch_inode_opts fs_io_opts; DARRAY(struct snapshot_io_opts_entry) d; }; diff --git a/libbcachefs/debug/trace.h b/libbcachefs/debug/trace.h index c84bccd1..9ffe59de 100644 --- a/libbcachefs/debug/trace.h +++ b/libbcachefs/debug/trace.h @@ -346,6 +346,11 @@ TRACE_EVENT(stripe_create, __entry->ret) ); +DEFINE_EVENT(fs_str, stripe_update_bucket, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + /* Journal */ DEFINE_EVENT(bch_fs, journal_full, diff --git a/libbcachefs/init/fs.c b/libbcachefs/init/fs.c index 594263a4..466ca787 100644 --- a/libbcachefs/init/fs.c +++ b/libbcachefs/init/fs.c @@ -375,6 +375,9 @@ void bch2_fs_read_only(struct bch_fs *c) BUG_ON(c->btree_write_buffer.inc.keys.nr); BUG_ON(c->btree_write_buffer.flushing.keys.nr); bch2_verify_accounting_clean(c); + + bch_verbose(c, "marking filesystem clean"); + bch2_fs_mark_clean(c); } else { /* Make sure error counts/counters are persisted */ guard(mutex)(&c->sb_lock); @@ -470,6 +473,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) try(bch2_fs_init_rw(c)); try(bch2_sb_members_v2_init(c)); + try(bch2_fs_mark_dirty(c)); clear_bit(BCH_FS_clean_shutdown, &c->flags); @@ -1048,6 +1052,7 @@ static int bch2_fs_init(struct bch_fs *c, struct bch_sb *sb, init_rwsem(&c->state_lock); mutex_init(&c->sb_lock); + mutex_init(&c->replicas_gc_lock); mutex_init(&c->btree_root_lock); INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); diff --git a/libbcachefs/init/recovery.c b/libbcachefs/init/recovery.c index 25cdb194..b001065b 100644 --- a/libbcachefs/init/recovery.c +++ b/libbcachefs/init/recovery.c @@ -610,7 +610,8 @@ fsck_err: int bch2_fs_recovery(struct bch_fs *c) { struct bch_sb_field_clean *clean = NULL; - struct journal_start_info journal_start = {}; + struct jset *last_journal_entry = NULL; + u64 last_seq = 0, blacklist_seq, journal_seq; int ret = 0; if (c->sb.clean) { @@ -636,7 +637,7 @@ int bch2_fs_recovery(struct bch_fs *c) struct journal_replay **i; bch_verbose(c, "starting 
journal read"); - ret = bch2_journal_read(c, &journal_start); + ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq); if (ret) goto err; @@ -647,21 +648,22 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->opts.read_journal_only) goto out; - if (mustfix_fsck_err_on(c->sb.clean && !journal_start.clean, - c, clean_but_journal_not_empty, - "filesystem marked clean but journal not empty")) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->sb.clean = false; - } - - struct jset *last_journal_entry = NULL; genradix_for_each_reverse(&c->journal_entries, iter, i) if (!journal_replay_ignore(*i)) { last_journal_entry = &(*i)->j; break; } + if (mustfix_fsck_err_on(c->sb.clean && + last_journal_entry && + !journal_entry_empty(last_journal_entry), c, + clean_but_journal_not_empty, + "filesystem marked clean but journal not empty")) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->sb.clean = false; + } + if (!last_journal_entry) { fsck_err_on(!c->sb.clean, c, dirty_but_no_journal_entries, @@ -703,12 +705,11 @@ use_clean: goto err; } - - journal_start.start_seq = le64_to_cpu(clean->journal_seq) + 1; + blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; } - c->journal_replay_seq_start = journal_start.seq_read_start; - c->journal_replay_seq_end = journal_start.seq_read_end; + c->journal_replay_seq_start = last_seq; + c->journal_replay_seq_end = blacklist_seq - 1; zero_out_btree_mem_ptr(&c->journal_keys); @@ -755,15 +756,13 @@ use_clean: * journal sequence numbers: */ if (!c->sb.clean) - journal_start.start_seq += JOURNAL_BUF_NR * 4; + journal_seq += JOURNAL_BUF_NR * 4; - if (journal_start.seq_read_end && - journal_start.seq_read_end + 1 != journal_start.start_seq) { - u64 blacklist_seq = journal_start.seq_read_end + 1; + if (blacklist_seq != journal_seq) { ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu", - blacklist_seq, journal_start.start_seq) ?: + blacklist_seq, journal_seq) ?: bch2_journal_seq_blacklist_add(c, - blacklist_seq, journal_start.start_seq); + blacklist_seq, journal_seq); if (ret) { bch_err_msg(c, ret, "error creating new journal seq blacklist entry"); goto err; @@ -771,10 +770,8 @@ use_clean: } ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu", - journal_start.start_seq, - journal_start.seq_read_start, - journal_start.seq_read_end) ?: - bch2_fs_journal_start(&c->journal, journal_start); + journal_seq, last_seq, blacklist_seq - 1) ?: + bch2_fs_journal_start(&c->journal, last_seq, journal_seq); if (ret) goto err; @@ -1017,8 +1014,7 @@ int bch2_fs_initialize(struct bch_fs *c) * journal_res_get() will crash if called before this has * set up the journal.pin FIFO and journal.cur pointer: */ - struct journal_start_info journal_start = { .start_seq = 1 }; - ret = bch2_fs_journal_start(&c->journal, journal_start); + ret = bch2_fs_journal_start(&c->journal, 1, 1); if (ret) goto err; diff --git a/libbcachefs/journal/init.c b/libbcachefs/journal/init.c index 651228cc..e7410baa 100644 --- a/libbcachefs/journal/init.c +++ b/libbcachefs/journal/init.c @@ -11,7 +11,6 @@ #include "alloc/foreground.h" #include "alloc/replicas.h" #include "btree/update.h" -#include "init/error.h" /* allocate journal on a device: */ @@ -368,30 +367,29 @@ void bch2_fs_journal_stop(struct journal *j) clear_bit(JOURNAL_running, &j->flags); } -int bch2_fs_journal_start(struct journal *j, struct journal_start_info info) +int bch2_fs_journal_start(struct 
journal *j, u64 last_seq, u64 cur_seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_entry_pin_list *p; struct journal_replay *i, **_i; struct genradix_iter iter; bool had_entries = false; - int ret = 0; /* * * XXX pick most recent non blacklisted sequence number */ - info.start_seq = max(info.start_seq, bch2_journal_last_blacklisted_seq(c)); + cur_seq = max(cur_seq, bch2_journal_last_blacklisted_seq(c)); - if (info.start_seq >= JOURNAL_SEQ_MAX) { + if (cur_seq >= JOURNAL_SEQ_MAX) { bch_err(c, "cannot start: journal seq overflow"); return -EINVAL; } /* Clean filesystem? */ - u64 cur_seq = info.start_seq; - u64 last_seq = info.seq_read_start ?: info.start_seq; + if (!last_seq) + last_seq = cur_seq; u64 nr = cur_seq - last_seq; if (nr * sizeof(struct journal_entry_pin_list) > 1U << 30) { @@ -421,7 +419,6 @@ int bch2_fs_journal_start(struct journal *j, struct journal_start_info info) j->seq_write_started = cur_seq - 1; j->seq_ondisk = cur_seq - 1; j->pin.front = last_seq; - j->last_seq = last_seq; j->pin.back = cur_seq; atomic64_set(&j->seq, cur_seq - 1); @@ -444,26 +441,12 @@ int bch2_fs_journal_start(struct journal *j, struct journal_start_info info) if (journal_entry_empty(&i->j)) j->last_empty_seq = le64_to_cpu(i->j.seq); - if (!info.clean) { - struct bch_devs_list seq_devs = {}; - darray_for_each(i->ptrs, ptr) - seq_devs.data[seq_devs.nr++] = ptr->dev; + struct bch_devs_list seq_devs = {}; + darray_for_each(i->ptrs, ptr) + seq_devs.data[seq_devs.nr++] = ptr->dev; - p = journal_seq_pin(j, seq); - bch2_devlist_to_replicas(&p->devs.e, BCH_DATA_journal, seq_devs); - - CLASS(printbuf, buf)(); - bch2_replicas_entry_to_text(&buf, &p->devs.e); - - fsck_err_on(!test_bit(JOURNAL_degraded, &j->flags) && - !bch2_replicas_marked(c, &p->devs.e), - c, journal_entry_replicas_not_marked, - "superblock not marked as containing replicas for journal entry %llu\n%s", - le64_to_cpu(i->j.seq), buf.buf); - - if (bch2_replicas_entry_get(c, &p->devs.e)) - p->devs.e.nr_devs = 0; - } + p = journal_seq_pin(j, seq); + bch2_devlist_to_replicas(&p->devs.e, BCH_DATA_journal, seq_devs); had_entries = true; } @@ -477,9 +460,7 @@ int bch2_fs_journal_start(struct journal *j, struct journal_start_info info) c->last_bucket_seq_cleanup = journal_cur_seq(j); } - try(bch2_replicas_gc_reffed(c)); -fsck_err: - return ret; + return 0; } void bch2_journal_set_replay_done(struct journal *j) @@ -604,7 +585,6 @@ void bch2_fs_journal_init_early(struct journal *j) init_waitqueue_head(&j->reclaim_wait); init_waitqueue_head(&j->pin_flush_wait); mutex_init(&j->reclaim_lock); - mutex_init(&j->last_seq_ondisk_lock); mutex_init(&j->discard_lock); lockdep_init_map(&j->res_map, "journal res", &res_key, 0); diff --git a/libbcachefs/journal/init.h b/libbcachefs/journal/init.h index 6fc55c50..6d49c29a 100644 --- a/libbcachefs/journal/init.h +++ b/libbcachefs/journal/init.h @@ -11,7 +11,7 @@ int bch2_fs_journal_alloc(struct bch_fs *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); void bch2_fs_journal_stop(struct journal *); -int bch2_fs_journal_start(struct journal *, struct journal_start_info); +int bch2_fs_journal_start(struct journal *, u64, u64); void bch2_journal_set_replay_done(struct journal *); void bch2_dev_journal_exit(struct bch_dev *); diff --git a/libbcachefs/journal/journal.c b/libbcachefs/journal/journal.c index 1c1d3472..2b167bf8 100644 --- a/libbcachefs/journal/journal.c +++ b/libbcachefs/journal/journal.c @@ -187,7 +187,7 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq) 
lockdep_assert_held(&j->lock); if (__bch2_journal_pin_put(j, seq)) - bch2_journal_update_last_seq(j); + bch2_journal_reclaim_fast(j); bch2_journal_do_writes(j); /* @@ -235,10 +235,10 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t /* Close out old buffer: */ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); - size_t bytes = roundup_pow_of_two(vstruct_bytes(buf->data)); - - journal_seq_pin(j, journal_cur_seq(j))->bytes = bytes; - j->dirty_entry_bytes += bytes; + struct journal_entry_pin_list *pin_list = + journal_seq_pin(j, journal_cur_seq(j)); + pin_list->bytes = roundup_pow_of_two(vstruct_bytes(buf->data)); + j->dirty_entry_bytes += pin_list->bytes; if (trace_journal_entry_close_enabled() && trace) { CLASS(printbuf, err)(); @@ -280,7 +280,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t * contain either what the old pin protected or what the new pin * protects. * - * After the old pin is dropped j->last_seq won't include the old + * After the old pin is dropped journal_last_seq() won't include the old * pin, so we can only write the updated last_seq on the entry that * contains whatever the new pin protects. * @@ -291,7 +291,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t * Hence, we want update/set last_seq on the current journal entry right * before we open a new one: */ - buf->last_seq = j->last_seq; + buf->last_seq = journal_last_seq(j); buf->data->last_seq = cpu_to_le64(buf->last_seq); BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq)); @@ -358,6 +358,7 @@ static int journal_entry_open(struct journal *j) lockdep_assert_held(&j->lock); BUG_ON(journal_entry_is_open(j)); + BUG_ON(c->sb.clean); if (j->blocked) return bch_err_throw(c, journal_blocked); @@ -415,7 +416,7 @@ static int journal_entry_open(struct journal *j) /* * The fifo_push() needs to happen at the same time as j->seq is - * incremented for j->last_seq to be calculated correctly + * incremented for journal_last_seq() to be calculated correctly */ atomic64_inc(&j->seq); journal_pin_list_init(fifo_push_ref(&j->pin), 1); @@ -1091,7 +1092,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size); prt_printf(out, "seq:\t%llu\n", journal_cur_seq(j)); prt_printf(out, "seq_ondisk:\t%llu\n", j->seq_ondisk); - prt_printf(out, "last_seq:\t%llu\n", j->last_seq); + prt_printf(out, "last_seq:\t%llu\n", journal_last_seq(j)); prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); prt_printf(out, "watermark:\t%s\n", bch2_watermarks[j->watermark]); diff --git a/libbcachefs/journal/journal.h b/libbcachefs/journal/journal.h index 272a3a64..88e18364 100644 --- a/libbcachefs/journal/journal.h +++ b/libbcachefs/journal/journal.h @@ -129,6 +129,11 @@ static inline bool journal_low_on_space(struct journal *j) /* Sequence number of oldest dirty journal entry */ +static inline u64 journal_last_seq(struct journal *j) +{ + return j->pin.front; +} + static inline u64 journal_cur_seq(struct journal *j) { return atomic64_read(&j->seq); diff --git a/libbcachefs/journal/read.c b/libbcachefs/journal/read.c index 4c5753b9..2a697a23 100644 --- a/libbcachefs/journal/read.c +++ b/libbcachefs/journal/read.c @@ -1346,17 +1346,18 @@ fsck_err: return ret; } -int bch2_journal_read(struct bch_fs *c, struct journal_start_info *info) +int bch2_journal_read(struct bch_fs 
*c, + u64 *last_seq, + u64 *blacklist_seq, + u64 *start_seq) { struct journal_list jlist; struct journal_replay *i, **_i; struct genradix_iter radix_iter; - bool last_write_torn = false; + bool degraded = false, last_write_torn = false; u64 seq; int ret = 0; - memset(info, 0, sizeof(*info)); - closure_init_stack(&jlist.cl); mutex_init(&jlist.lock); jlist.last_seq = 0; @@ -1376,7 +1377,7 @@ int bch2_journal_read(struct bch_fs *c, struct journal_start_info *info) system_unbound_wq, &jlist.cl); else - set_bit(JOURNAL_degraded, &c->journal.flags); + degraded = true; } while (closure_sync_timeout(&jlist.cl, sysctl_hung_task_timeout_secs * HZ / 2)) @@ -1385,6 +1386,10 @@ int bch2_journal_read(struct bch_fs *c, struct journal_start_info *info) if (jlist.ret) return jlist.ret; + *last_seq = 0; + *start_seq = 0; + *blacklist_seq = 0; + /* * Find most recent flush entry, and ignore newer non flush entries - * those entries will be blacklisted: @@ -1395,8 +1400,8 @@ int bch2_journal_read(struct bch_fs *c, struct journal_start_info *info) if (journal_replay_ignore(i)) continue; - if (!info->start_seq) - info->start_seq = le64_to_cpu(i->j.seq) + 1; + if (!*start_seq) + *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; if (JSET_NO_FLUSH(&i->j)) { i->ignore_blacklisted = true; @@ -1421,28 +1426,27 @@ int bch2_journal_read(struct bch_fs *c, struct journal_start_info *info) le64_to_cpu(i->j.seq))) i->j.last_seq = i->j.seq; - info->seq_read_start = le64_to_cpu(i->j.last_seq); - info->seq_read_end = le64_to_cpu(i->j.seq); - info->clean = journal_entry_empty(&i->j); + *last_seq = le64_to_cpu(i->j.last_seq); + *blacklist_seq = le64_to_cpu(i->j.seq) + 1; break; } - if (!info->start_seq) { + if (!*start_seq) { bch_info(c, "journal read done, but no entries found"); return 0; } - if (!info->seq_read_end) { + if (!*last_seq) { fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes, "journal read done, but no entries found after dropping non-flushes"); return 0; } - u64 drop_before = info->seq_read_start; + u64 drop_before = *last_seq; { CLASS(printbuf, buf)(); prt_printf(&buf, "journal read done, replaying entries %llu-%llu", - info->seq_read_start, info->seq_read_end); + *last_seq, *blacklist_seq - 1); /* * Drop blacklisted entries and entries older than last_seq (or start of @@ -1453,11 +1457,9 @@ int bch2_journal_read(struct bch_fs *c, struct journal_start_info *info) prt_printf(&buf, " (rewinding from %llu)", c->opts.journal_rewind); } - info->seq_read_start = drop_before; - if (info->seq_read_end + 1 != info->start_seq) - prt_printf(&buf, " (unflushed %llu-%llu)", - info->seq_read_end + 1, - info->start_seq - 1); + *last_seq = drop_before; + if (*start_seq != *blacklist_seq) + prt_printf(&buf, " (unflushed %llu-%llu)", *blacklist_seq, *start_seq - 1); bch_info(c, "%s", buf.buf); } @@ -1481,7 +1483,7 @@ int bch2_journal_read(struct bch_fs *c, struct journal_start_info *info) } } - try(bch2_journal_check_for_missing(c, drop_before, info->seq_read_end)); + try(bch2_journal_check_for_missing(c, drop_before, *blacklist_seq - 1)); genradix_for_each(&c->journal_entries, radix_iter, _i) { union bch_replicas_padded replicas = { @@ -1514,6 +1516,17 @@ int bch2_journal_read(struct bch_fs *c, struct journal_start_info *info) replicas_entry_add_dev(&replicas.e, ptr->dev); bch2_replicas_entry_sort(&replicas.e); + + CLASS(printbuf, buf)(); + bch2_replicas_entry_to_text(&buf, &replicas.e); + + if (!degraded && + !bch2_replicas_marked(c, &replicas.e) && + (le64_to_cpu(i->j.seq) == *last_seq || + fsck_err(c, 
journal_entry_replicas_not_marked, + "superblock not marked as containing replicas for journal entry %llu\n%s", + le64_to_cpu(i->j.seq), buf.buf))) + try(bch2_mark_replicas(c, &replicas.e)); } fsck_err: return ret; diff --git a/libbcachefs/journal/read.h b/libbcachefs/journal/read.h index 556a7ff1..ff3c8690 100644 --- a/libbcachefs/journal/read.h +++ b/libbcachefs/journal/read.h @@ -70,6 +70,6 @@ struct u64_range { struct u64_range bch2_journal_entry_missing_range(struct bch_fs *, u64, u64); -int bch2_journal_read(struct bch_fs *, struct journal_start_info *); +int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *); #endif /* _BCACHEFS_JOURNAL_READ_H */ diff --git a/libbcachefs/journal/reclaim.c b/libbcachefs/journal/reclaim.c index 703f626b..d15b54fc 100644 --- a/libbcachefs/journal/reclaim.c +++ b/libbcachefs/journal/reclaim.c @@ -211,7 +211,7 @@ void bch2_journal_space_available(struct journal *j) continue; while (ja->dirty_idx != ja->cur_idx && - ja->bucket_seq[ja->dirty_idx] < j->last_seq) + ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j)) ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; while (ja->dirty_idx_ondisk != ja->dirty_idx && @@ -325,66 +325,37 @@ void bch2_journal_do_discards(struct journal *j) * entry, holding it open to ensure it gets replayed during recovery: */ -void bch2_journal_update_last_seq(struct journal *j) +void bch2_journal_reclaim_fast(struct journal *j) { + bool popped = false; + lockdep_assert_held(&j->lock); /* * Unpin journal entries whose reference counts reached zero, meaning * all btree nodes got written out */ - u64 old = j->last_seq; struct journal_entry_pin_list *pin_list; - while (j->last_seq < j->pin.back && - j->last_seq <= j->seq_ondisk && - !atomic_read(&(pin_list = journal_seq_pin(j, j->last_seq))->count)) - j->last_seq++; + while (!fifo_empty(&j->pin) && + j->pin.front <= j->seq_ondisk && + !atomic_read(&(pin_list = &fifo_peek_front(&j->pin))->count)) { - if (old != j->last_seq) { + if (WARN_ON(j->dirty_entry_bytes < pin_list->bytes)) + pin_list->bytes = j->dirty_entry_bytes; + + j->dirty_entry_bytes -= pin_list->bytes; + pin_list->bytes = 0; + + j->pin.front++; + popped = true; + } + + if (popped) { bch2_journal_space_available(j); __closure_wake_up(&j->reclaim_flush_wait); } } -void bch2_journal_update_last_seq_ondisk(struct journal *j, u64 last_seq_ondisk) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - union bch_replicas_padded replicas; - unsigned nr_refs = 0; - size_t dirty_entry_bytes = 0; - - scoped_guard(mutex, &j->last_seq_ondisk_lock) - while (j->last_seq_ondisk < last_seq_ondisk) { - struct journal_entry_pin_list *pin_list = journal_seq_pin(j, j->last_seq_ondisk); - - if (pin_list->devs.e.nr_devs) { - if (nr_refs && - !bch2_replicas_entry_eq(&replicas.e, &pin_list->devs.e)) { - bch2_replicas_entry_put_many(c, &replicas.e, nr_refs); - nr_refs = 0; - } - - memcpy(&replicas, &pin_list->devs, replicas_entry_bytes(&pin_list->devs.e)); - pin_list->devs.e.nr_devs = 0; - nr_refs++; - } - - dirty_entry_bytes += pin_list->bytes; - pin_list->bytes = 0; - - j->last_seq_ondisk++; - } - - scoped_guard(spinlock, &j->lock) { - if (WARN_ON(j->dirty_entry_bytes < dirty_entry_bytes)) - dirty_entry_bytes = j->dirty_entry_bytes; - j->dirty_entry_bytes -= dirty_entry_bytes; - } - - if (nr_refs) - bch2_replicas_entry_put_many(c, &replicas.e, nr_refs); -} - bool __bch2_journal_pin_put(struct journal *j, u64 seq) { struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); @@ -396,7 +367,7 @@ void bch2_journal_pin_put(struct 
journal *j, u64 seq) { if (__bch2_journal_pin_put(j, seq)) { guard(spinlock)(&j->lock); - bch2_journal_update_last_seq(j); + bch2_journal_reclaim_fast(j); } } @@ -423,7 +394,7 @@ static inline bool __journal_pin_drop(struct journal *j, * writing a new last_seq will now make another bucket available: */ return atomic_dec_and_test(&pin_list->count) && - pin_list == journal_seq_pin(j, j->last_seq); + pin_list == &fifo_peek_front(&j->pin); } void bch2_journal_pin_drop(struct journal *j, @@ -431,7 +402,7 @@ void bch2_journal_pin_drop(struct journal *j, { guard(spinlock)(&j->lock); if (__journal_pin_drop(j, pin)) - bch2_journal_update_last_seq(j); + bch2_journal_reclaim_fast(j); } static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin, @@ -482,7 +453,7 @@ void bch2_journal_pin_copy(struct journal *j, u64 seq = READ_ONCE(src->seq); - if (seq < j->last_seq) { + if (seq < journal_last_seq(j)) { /* * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on * the src pin - with the pin dropped, the entry to pin might no @@ -497,13 +468,13 @@ void bch2_journal_pin_copy(struct journal *j, bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn)); if (reclaim) - bch2_journal_update_last_seq(j); + bch2_journal_reclaim_fast(j); /* * If the journal is currently full, we might want to call flush_fn * immediately: */ - if (seq == j->last_seq) + if (seq == journal_last_seq(j)) journal_wake(j); } @@ -514,19 +485,19 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, bool wake; scoped_guard(spinlock, &j->lock) { - BUG_ON(seq < j->last_seq); + BUG_ON(seq < journal_last_seq(j)); bool reclaim = __journal_pin_drop(j, pin); bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn)); if (reclaim) - bch2_journal_update_last_seq(j); + bch2_journal_reclaim_fast(j); /* * If the journal is currently full, we might want to call flush_fn * immediately: */ - wake = seq == j->last_seq; + wake = seq == journal_last_seq(j); } if (wake) @@ -958,8 +929,8 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, */ guard(spinlock)(&j->lock); return !test_bit(JOURNAL_replay_done, &j->flags) || - j->last_seq > seq_to_flush || - j->last_seq == j->pin.back; + journal_last_seq(j) > seq_to_flush || + !fifo_used(&j->pin); } bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) @@ -993,7 +964,39 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) try(bch2_journal_error(j)); - return 0; + guard(mutex)(&c->replicas_gc_lock); + bch2_replicas_gc_start(c, 1 << BCH_DATA_journal); + + /* + * Now that we've populated replicas_gc, write to the journal to mark + * active journal devices. This handles the case where the journal might + * be empty. Otherwise we could clear all journal replicas and + * temporarily put the fs into an unrecoverable state. Journal recovery + * expects to find devices marked for journal data on unclean mount. 
+ */ + int ret = bch2_journal_meta(&c->journal); + if (ret) + goto err; + + seq = 0; + scoped_guard(spinlock, &j->lock) + while (!ret) { + seq = max(seq, journal_last_seq(j)); + if (seq > j->seq_ondisk) + break; + + union bch_replicas_padded replicas; + memcpy(&replicas, &journal_seq_pin(j, seq)->devs, sizeof(replicas)); + seq++; + + if (replicas.e.nr_devs) { + spin_unlock(&j->lock); + ret = bch2_mark_replicas(c, &replicas.e); + spin_lock(&j->lock); + } + } +err: + return bch2_replicas_gc_end(c, ret); } bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) @@ -1007,7 +1010,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 if (!test_bit(JOURNAL_running, &j->flags)) return true; - *seq = max(*seq, j->last_seq); + *seq = max(*seq, j->pin.front); if (*seq >= j->pin.back) return true; diff --git a/libbcachefs/journal/reclaim.h b/libbcachefs/journal/reclaim.h index e1956ba9..2578abfa 100644 --- a/libbcachefs/journal/reclaim.h +++ b/libbcachefs/journal/reclaim.h @@ -43,9 +43,7 @@ journal_seq_pin(struct journal *j, u64 seq) return &j->pin.data[seq & j->pin.mask]; } -void bch2_journal_update_last_seq(struct journal *); -void bch2_journal_update_last_seq_ondisk(struct journal *, u64); - +void bch2_journal_reclaim_fast(struct journal *); bool __bch2_journal_pin_put(struct journal *, u64); void bch2_journal_pin_put(struct journal *, u64); void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); diff --git a/libbcachefs/journal/types.h b/libbcachefs/journal/types.h index 51061443..43d9e842 100644 --- a/libbcachefs/journal/types.h +++ b/libbcachefs/journal/types.h @@ -149,7 +149,6 @@ enum journal_space_from { }; #define JOURNAL_FLAGS() \ - x(degraded) \ x(replay_done) \ x(running) \ x(may_skip_flush) \ @@ -266,8 +265,6 @@ struct journal { u64 front, back, size, mask; struct journal_entry_pin_list *data; } pin; - u64 last_seq; - size_t dirty_entry_bytes; struct journal_space space[journal_space_nr]; @@ -279,7 +276,6 @@ struct journal { spinlock_t err_lock; struct mutex reclaim_lock; - struct mutex last_seq_ondisk_lock; /* * Used for waiting until journal reclaim has freed up space in the * journal: @@ -356,11 +352,4 @@ struct journal_entry_res { unsigned u64s; }; -struct journal_start_info { - u64 seq_read_start; - u64 seq_read_end; - u64 start_seq; - bool clean; -}; - #endif /* _BCACHEFS_JOURNAL_TYPES_H */ diff --git a/libbcachefs/journal/write.c b/libbcachefs/journal/write.c index bf83c796..477ad0a0 100644 --- a/libbcachefs/journal/write.c +++ b/libbcachefs/journal/write.c @@ -189,7 +189,6 @@ static CLOSURE_CALLBACK(journal_write_done) struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); u64 seq = le64_to_cpu(w->data->seq); - u64 seq_wrote = seq; int err = 0; bch2_time_stats_update(!JSET_NO_FLUSH(w->data) @@ -198,12 +197,8 @@ static CLOSURE_CALLBACK(journal_write_done) if (w->had_error) { struct bch_replicas_entry_v1 *r = &journal_seq_pin(j, seq)->devs.e; - bch2_replicas_entry_put(c, r); bch2_devlist_to_replicas(r, BCH_DATA_journal, w->devs_written); - err = bch2_replicas_entry_get(c, r); - if (err) - r->nr_devs = 0; } if (!w->devs_written.nr) @@ -230,6 +225,7 @@ static CLOSURE_CALLBACK(journal_write_done) BUG_ON(seq < j->pin.front); if (err && (!j->err_seq || seq < j->err_seq)) j->err_seq = seq; + w->write_done = true; if (!j->free_buf || j->free_buf_size < w->buf_size) { swap(j->free_buf, w->data); @@ -247,31 +243,22 @@ static 
CLOSURE_CALLBACK(journal_write_done) } bool completed = false; - bool last_seq_ondisk_updated = false; -again: + bool do_discards = false; + for (seq = journal_last_unwritten_seq(j); seq <= journal_cur_seq(j); seq++) { w = j->buf + (seq & JOURNAL_BUF_MASK); - if (!w->write_done && seq != seq_wrote) + if (!w->write_done) break; if (!j->err_seq && !w->noflush) { - if (j->last_seq_ondisk < w->last_seq) { - spin_unlock(&j->lock); - /* - * this needs to happen _before_ updating - * j->flushed_seq_ondisk, for flushing to work - * properly - when the flush completes replcias - * refs need to have been dropped - * */ - bch2_journal_update_last_seq_ondisk(j, w->last_seq); - last_seq_ondisk_updated = true; - spin_lock(&j->lock); - goto again; - } - j->flushed_seq_ondisk = seq; + j->last_seq_ondisk = w->last_seq; + + closure_wake_up(&c->freelist_wait); + bch2_reset_alloc_cursors(c); + do_discards = true; } j->seq_ondisk = seq; @@ -290,10 +277,8 @@ again: completed = true; } - j->buf[seq_wrote & JOURNAL_BUF_MASK].write_done = true; - if (completed) { - bch2_journal_update_last_seq(j); + bch2_journal_reclaim_fast(j); bch2_journal_space_available(j); track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false); @@ -301,8 +286,6 @@ again: journal_wake(j); } - j->pin.front = min(j->pin.back, j->last_seq_ondisk); - if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { struct journal_buf *buf = journal_cur_buf(j); @@ -325,11 +308,8 @@ again: bch2_journal_do_writes(j); spin_unlock(&j->lock); - if (last_seq_ondisk_updated) { - bch2_reset_alloc_cursors(c); - closure_wake_up(&c->freelist_wait); + if (do_discards) bch2_do_discards(c); - } closure_put(&c->cl); } @@ -655,6 +635,7 @@ CLOSURE_CALLBACK(bch2_journal_write) unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]); int ret; + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); BUG_ON(!w->write_started); BUG_ON(w->write_allocated); BUG_ON(w->write_done); @@ -721,11 +702,9 @@ CLOSURE_CALLBACK(bch2_journal_write) */ struct bch_replicas_entry_v1 *r = &journal_seq_pin(j, le64_to_cpu(w->data->seq))->devs.e; bch2_devlist_to_replicas(r, BCH_DATA_journal, w->devs_written); - ret = bch2_replicas_entry_get(c, r); - if (ret) { - r->nr_devs = 0; + ret = bch2_mark_replicas(c, r); + if (ret) goto err; - } if (c->opts.nochanges) goto no_io; diff --git a/libbcachefs/sb/clean.c b/libbcachefs/sb/clean.c index 1c36d6dd..18a350bc 100644 --- a/libbcachefs/sb/clean.c +++ b/libbcachefs/sb/clean.c @@ -256,10 +256,18 @@ const struct bch_sb_field_ops bch_sb_field_ops_clean = { .to_text = bch2_sb_clean_to_text, }; -void bch2_fs_mark_dirty(struct bch_fs *c) +int bch2_fs_mark_dirty(struct bch_fs *c) { + /* + * Unconditionally write superblock, to verify it hasn't changed before + * we go rw: + */ + + guard(mutex)(&c->sb_lock); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); + + return bch2_write_super(c); } void bch2_fs_mark_clean(struct bch_fs *c) @@ -269,6 +277,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) unsigned u64s; int ret; + guard(mutex)(&c->sb_lock); if (BCH_SB_CLEAN(c->disk_sb.sb)) return; @@ -312,4 +321,6 @@ void bch2_fs_mark_clean(struct bch_fs *c) } bch2_journal_pos_from_member_info_set(c); + + bch2_write_super(c); } diff --git a/libbcachefs/sb/clean.h b/libbcachefs/sb/clean.h index 6d811f12..71caef28 100644 --- a/libbcachefs/sb/clean.h +++ b/libbcachefs/sb/clean.h @@ -10,7 +10,7 @@ void bch2_journal_super_entries_add_common(struct 
bch_fs *, struct jset_entry ** extern const struct bch_sb_field_ops bch_sb_field_ops_clean; -void bch2_fs_mark_dirty(struct bch_fs *); +int bch2_fs_mark_dirty(struct bch_fs *); void bch2_fs_mark_clean(struct bch_fs *); #endif /* _BCACHEFS_SB_CLEAN_H */ diff --git a/libbcachefs/sb/counters_format.h b/libbcachefs/sb/counters_format.h index d6c54dbb..46be5fe8 100644 --- a/libbcachefs/sb/counters_format.h +++ b/libbcachefs/sb/counters_format.h @@ -31,6 +31,8 @@ enum counters_flags { x(data_update_fail, 82, TYPE_COUNTER) \ x(data_update_key, 37, TYPE_SECTORS) \ x(data_update_key_fail, 38, TYPE_COUNTER) \ + x(ec_stripe_update_extent, 99, TYPE_COUNTER) \ + x(ec_stripe_update_extent_fail, 100, TYPE_COUNTER) \ x(io_move_read, 35, TYPE_SECTORS) \ x(io_move_write, 36, TYPE_SECTORS) \ x(io_move_start_fail, 39, TYPE_COUNTER) \ diff --git a/libbcachefs/sb/io.c b/libbcachefs/sb/io.c index 0c358c19..11842325 100644 --- a/libbcachefs/sb/io.c +++ b/libbcachefs/sb/io.c @@ -1021,11 +1021,6 @@ int bch2_write_super(struct bch_fs *c) closure_init_stack(cl); memset(&sb_written, 0, sizeof(sb_written)); - if (bch2_sb_has_journal(c->disk_sb.sb)) - bch2_fs_mark_dirty(c); - else - bch2_fs_mark_clean(c); - /* * Note: we do writes to RO devices here, and we might want to change * that in the future. diff --git a/libbcachefs/vendor/bio_iov_iter.c b/libbcachefs/vendor/bio_iov_iter.c new file mode 100644 index 00000000..95f4e32a --- /dev/null +++ b/libbcachefs/vendor/bio_iov_iter.c @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include +#include + +#include "vendor/bio_iov_iter.h" + +static inline bool bio_full(struct bio *bio, unsigned len) +{ + if (bio->bi_vcnt >= bio->bi_max_vecs) + return true; + if (bio->bi_iter.bi_size > UINT_MAX - len) + return true; + return false; +} + +static inline void bio_release_page(struct bio *bio, struct page *page) +{ + if (bio_flagged(bio, BIO_PAGE_PINNED)) + unpin_user_page(page); +} + +#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) + +static unsigned int get_contig_folio_len(unsigned int *num_pages, + struct page **pages, unsigned int i, + struct folio *folio, size_t left, + size_t offset) +{ + size_t bytes = left; + size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, bytes); + unsigned int j; + + /* + * We might COW a single page in the middle of + * a large folio, so we have to check that all + * pages belong to the same folio. + */ + bytes -= contig_sz; + for (j = i + 1; j < i + *num_pages; j++) { + size_t next = min_t(size_t, PAGE_SIZE, bytes); + + if (page_folio(pages[j]) != folio || + pages[j] != pages[j - 1] + 1) { + break; + } + contig_sz += next; + bytes -= next; + } + *num_pages = j - i; + + return contig_sz; +} + +static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +{ + iov_iter_extraction_t extraction_flags = 0; + unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; + unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; + struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; + struct page **pages = (struct page **)bv; + ssize_t size; + unsigned int num_pages, i = 0; + size_t offset, folio_offset, left, len; + int ret = 0; + + /* + * Move page array up in the allocated memory for the bio vecs as far as + * possible so that we can start filling biovecs from the beginning + * without overwriting the temporary page array. 
+ */ + BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); + pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); + + if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue)) + extraction_flags |= ITER_ALLOW_P2PDMA; + + size = iov_iter_extract_pages(iter, &pages, + UINT_MAX - bio->bi_iter.bi_size, + nr_pages, extraction_flags, &offset); + if (unlikely(size <= 0)) + return size ? size : -EFAULT; + + nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); + for (left = size, i = 0; left > 0; left -= len, i += num_pages) { + struct page *page = pages[i]; + struct folio *folio = page_folio(page); + unsigned int old_vcnt = bio->bi_vcnt; + + folio_offset = ((size_t)folio_page_idx(folio, page) << + PAGE_SHIFT) + offset; + + len = min(folio_size(folio) - folio_offset, left); + + num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); + + if (num_pages > 1) + len = get_contig_folio_len(&num_pages, pages, i, + folio, left, offset); + + if (!bio_add_folio(bio, folio, len, folio_offset)) { + WARN_ON_ONCE(1); + ret = -EINVAL; + goto out; + } + + if (bio_flagged(bio, BIO_PAGE_PINNED)) { + /* + * We're adding another fragment of a page that already + * was part of the last segment. Undo our pin as the + * page was pinned when an earlier fragment of it was + * added to the bio and __bio_release_pages expects a + * single pin per page. + */ + if (offset && bio->bi_vcnt == old_vcnt) + unpin_user_folio(folio, 1); + } + offset = 0; + } + + iov_iter_revert(iter, left); +out: + while (i < nr_pages) + bio_release_page(bio, pages[i++]); + + return ret; +} + +/* + * Aligns the bio size to the len_align_mask, releasing excessive bio vecs that + * __bio_iov_iter_get_pages may have inserted, and reverts the trimmed length + * for the next iteration. + */ +static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter, + unsigned len_align_mask) +{ + size_t nbytes = bio->bi_iter.bi_size & len_align_mask; + + if (!nbytes) + return 0; + + iov_iter_revert(iter, nbytes); + bio->bi_iter.bi_size -= nbytes; + do { + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (nbytes < bv->bv_len) { + bv->bv_len -= nbytes; + break; + } + + bio_release_page(bio, bv->bv_page); + bio->bi_vcnt--; + nbytes -= bv->bv_len; + } while (nbytes); + + if (!bio->bi_vcnt) + return -EFAULT; + return 0; +} + +int bch2_bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, + unsigned len_align_mask) +{ + int ret = 0; + + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return -EIO; + + if (iov_iter_is_bvec(iter)) { + bio_iov_bvec_set(bio, iter); + iov_iter_advance(iter, bio->bi_iter.bi_size); + return 0; + } + + if (iov_iter_extract_will_pin(iter)) + bio_set_flag(bio, BIO_PAGE_PINNED); + do { + ret = __bio_iov_iter_get_pages(bio, iter); + } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); + + if (bio->bi_vcnt) + return bio_iov_iter_align_down(bio, iter, len_align_mask); + return ret; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/libbcachefs/vendor/bio_iov_iter.h b/libbcachefs/vendor/bio_iov_iter.h new file mode 100644 index 00000000..e95e92e2 --- /dev/null +++ b/libbcachefs/vendor/bio_iov_iter.h @@ -0,0 +1,6 @@ +#ifndef _BCACHEFS_VENDOR_BIO_IOV_ITER_H +#define _BCACHEFS_VENDOR_BIO_IOV_ITER_H + +int bch2_bio_iov_iter_get_pages(struct bio *, struct iov_iter *, unsigned); + +#endif /* _BCACHEFS_VENDOR_BIO_IOV_ITER_H */ diff --git a/libbcachefs/vfs/direct.c b/libbcachefs/vfs/direct.c index cbdcf747..7c9565b9 100644 --- a/libbcachefs/vfs/direct.c +++ b/libbcachefs/vfs/direct.c @@ -13,6 +13,8 @@ #include "vfs/direct.h" 
#include "vfs/pagecache.h" +#include "vendor/bio_iov_iter.h" + #include "util/enumerated_ref.h" #include @@ -148,11 +150,7 @@ start: bio->bi_iter.bi_sector = offset >> 9; bio->bi_private = dio; -#if LINUX_VERSION_CODE < KERNEL_VERSION(6,18,0) - ret = bio_iov_iter_get_pages(bio, iter); -#else - ret = bio_iov_iter_get_pages(bio, iter, 0); -#endif + ret = bch2_bio_iov_iter_get_pages(bio, iter, 0); if (ret < 0) { /* XXX: fault inject this path */ bio->bi_status = BLK_STS_RESOURCE; @@ -465,11 +463,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) EBUG_ON(current->faults_disabled_mapping); current->faults_disabled_mapping = mapping; -#if LINUX_VERSION_CODE < KERNEL_VERSION(6,18,0) - ret = bio_iov_iter_get_pages(bio, &dio->iter); -#else - ret = bio_iov_iter_get_pages(bio, &dio->iter, 0); -#endif + ret = bch2_bio_iov_iter_get_pages(bio, &dio->iter, 0); dropped_locks = fdm_dropped_locks();