diff --git a/.bcachefs_revision b/.bcachefs_revision index ca5f3e9d..667f3d1d 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -5fe20ac58af402e8ad9ace0bcf9daad524e3005d +d7354b97c0100568c5696b192e30335a3666062f diff --git a/libbcachefs/alloc/replicas.c b/libbcachefs/alloc/replicas.c index 7b32f371..f0722e5b 100644 --- a/libbcachefs/alloc/replicas.c +++ b/libbcachefs/alloc/replicas.c @@ -16,25 +16,40 @@ DEFINE_CLASS(bch_replicas_cpu, struct bch_replicas_cpu, kfree(_T.entries), (struct bch_replicas_cpu) {}, void) -static inline struct bch_replicas_entry_v1 * +static inline struct bch_replicas_entry_cpu * cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) { return (void *) r->entries + r->entry_size * i; } +static inline unsigned __cpu_replicas_entry_bytes(unsigned v1_bytes) +{ + return offsetof(struct bch_replicas_entry_cpu, e) + v1_bytes; +} + +static inline unsigned cpu_replicas_entry_bytes(struct bch_replicas_entry_cpu *e) +{ + return __cpu_replicas_entry_bytes(replicas_entry_bytes(&e->e)); +} + #define for_each_cpu_replicas_entry(_r, _i) \ - for (struct bch_replicas_entry_v1 *_i = (_r)->entries; \ + for (struct bch_replicas_entry_cpu *_i = (_r)->entries; \ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size; \ _i = (void *) (_i) + (_r)->entry_size) static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, struct bch_replicas_cpu *); -/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */ -static int bch2_memcmp(const void *l, const void *r, const void *priv) +static int cpu_replicas_entry_cmp(const struct bch_replicas_entry_cpu *l, + const struct bch_replicas_entry_cpu *r, + size_t size) { - size_t size = (size_t) priv; - return memcmp(l, r, size); + return memcmp(&l->e, &r->e, size - offsetof(struct bch_replicas_entry_cpu, e)); +} + +static int cpu_replicas_entry_cmp_r(const void *l, const void *r, const void *priv) +{ + return cpu_replicas_entry_cmp(l, r, (size_t) priv); } /* Replicas tracking - in memory: */ @@ -60,7 +75,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e) static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) { eytzinger0_sort_r(r->entries, r->nr, r->entry_size, - bch2_memcmp, NULL, (void *)(size_t)r->entry_size); + cpu_replicas_entry_cmp_r, NULL, + (void *)(size_t)r->entry_size); } static void bch2_replicas_entry_v0_to_text(struct printbuf *out, @@ -85,6 +101,13 @@ void bch2_replicas_entry_to_text(struct printbuf *out, prt_printf(out, "]"); } +static void bch2_replicas_entry_cpu_to_text(struct printbuf *out, + struct bch_replicas_entry_cpu *e) +{ + prt_printf(out, "ref=%u ", atomic_read(&e->ref)); + bch2_replicas_entry_to_text(out, &e->e); +} + static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r, struct bch_sb *sb, struct printbuf *err) @@ -151,7 +174,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out, prt_printf(out, " "); first = false; - bch2_replicas_entry_to_text(out, i); + bch2_replicas_entry_cpu_to_text(out, i); } } @@ -232,6 +255,44 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e, bch2_replicas_entry_sort(e); } +/* @l is bch_replicas_entry_v1, @r is bch_replicas_entry_cpu */ +static int replicas_entry_search_cmp(const void *_l, const void *_r, const void *priv) +{ + const struct bch_replicas_entry_v1 *l = _l; + const struct bch_replicas_entry_cpu *r = _r; + size_t size = (size_t) priv; + + return memcmp(l, &r->e, size); +} + +static inline struct bch_replicas_entry_cpu * +replicas_entry_search(struct bch_replicas_cpu *r, + 
struct bch_replicas_entry_v1 *search) +{ + verify_replicas_entry(search); + + size_t entry_size = replicas_entry_bytes(search); + int idx = likely(__cpu_replicas_entry_bytes(entry_size) <= r->entry_size) + ? eytzinger0_find_r(r->entries, r->nr, r->entry_size, + replicas_entry_search_cmp, + (void *) entry_size, search) + : -1; + return idx >= 0 ? cpu_replicas_entry(r, idx) : NULL; +} + +bool bch2_replicas_marked_locked(struct bch_fs *c, + struct bch_replicas_entry_v1 *search) +{ + return !search->nr_devs || replicas_entry_search(&c->replicas, search); +} + +bool bch2_replicas_marked(struct bch_fs *c, + struct bch_replicas_entry_v1 *search) +{ + guard(percpu_read)(&c->mark_lock); + return bch2_replicas_marked_locked(c, search); +} + static struct bch_replicas_cpu cpu_replicas_add_entry(struct bch_fs *c, struct bch_replicas_cpu *old, @@ -240,9 +301,12 @@ cpu_replicas_add_entry(struct bch_fs *c, struct bch_replicas_cpu new = { .nr = old->nr + 1, .entry_size = max_t(unsigned, old->entry_size, - replicas_entry_bytes(new_entry)), + __cpu_replicas_entry_bytes(replicas_entry_bytes(new_entry))), }; + /* alignment */ + new.entry_size = round_up(new.entry_size, sizeof(atomic_t)); + new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL); if (!new.entries) return new; @@ -252,7 +316,7 @@ cpu_replicas_add_entry(struct bch_fs *c, cpu_replicas_entry(old, i), old->entry_size); - memcpy(cpu_replicas_entry(&new, old->nr), + memcpy(&cpu_replicas_entry(&new, old->nr)->e, new_entry, replicas_entry_bytes(new_entry)); @@ -260,152 +324,56 @@ cpu_replicas_add_entry(struct bch_fs *c, return new; } -static inline struct bch_replicas_entry_v1 * -replicas_entry_search(struct bch_replicas_cpu *r, - struct bch_replicas_entry_v1 *search) -{ - verify_replicas_entry(search); - - size_t entry_size = replicas_entry_bytes(search); - int idx = likely(entry_size <= r->entry_size) - ? eytzinger0_find_r(r->entries, r->nr, r->entry_size, - bch2_memcmp, (void *) entry_size, search) - : -1; - return idx >= 0 ? 
cpu_replicas_entry(r, idx) : NULL; -} - -bool bch2_replicas_marked_locked(struct bch_fs *c, - struct bch_replicas_entry_v1 *search) -{ - return !search->nr_devs || - (replicas_entry_search(&c->replicas, search) && - (likely((!c->replicas_gc.entries)) || - replicas_entry_search(&c->replicas_gc, search))); -} - -bool bch2_replicas_marked(struct bch_fs *c, - struct bch_replicas_entry_v1 *search) -{ - guard(percpu_read)(&c->mark_lock); - return bch2_replicas_marked_locked(c, search); -} - noinline static int bch2_mark_replicas_slowpath(struct bch_fs *c, - struct bch_replicas_entry_v1 *new_entry) + struct bch_replicas_entry_v1 *new_entry, + unsigned ref) { verify_replicas_entry(new_entry); - CLASS(bch_replicas_cpu, new_r)(); - CLASS(bch_replicas_cpu, new_gc)(); - guard(mutex)(&c->sb_lock); + bool write_sb = false; - if (c->replicas_gc.entries && - !replicas_entry_search(&c->replicas_gc, new_entry)) { - new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry); - if (!new_gc.entries) - return bch_err_throw(c, ENOMEM_cpu_replicas); - } - - if (!replicas_entry_search(&c->replicas, new_entry)) { - new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry); - if (!new_r.entries) - return bch_err_throw(c, ENOMEM_cpu_replicas); - - try(bch2_cpu_replicas_to_sb_replicas(c, &new_r)); - } - - if (!new_r.entries && - !new_gc.entries) - return 0; - - /* allocations done, now commit: */ - - if (new_r.entries) - bch2_write_super(c); - - /* don't update in memory replicas until changes are persistent */ scoped_guard(percpu_write, &c->mark_lock) { - if (new_r.entries) + if (!replicas_entry_search(&c->replicas, new_entry)) { + CLASS(bch_replicas_cpu, new_r)(); + + new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry); + if (!new_r.entries) + return bch_err_throw(c, ENOMEM_cpu_replicas); + + try(bch2_cpu_replicas_to_sb_replicas(c, &new_r)); + swap(c->replicas, new_r); - if (new_gc.entries) - swap(new_gc, c->replicas_gc); + write_sb = true; + } + + atomic_add(ref, &replicas_entry_search(&c->replicas, new_entry)->ref); } + /* After dropping mark_lock */ + if (write_sb) + bch2_write_super(c); + return 0; } int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r) { return likely(bch2_replicas_marked(c, r)) - ? 0 : bch2_mark_replicas_slowpath(c, r); + ? 
0 : bch2_mark_replicas_slowpath(c, r, 0); } -/* - * Old replicas_gc mechanism: only used for journal replicas entries now, should - * die at some point: - */ - -int bch2_replicas_gc_end(struct bch_fs *c, int ret) +static void __replicas_entry_kill(struct bch_fs *c, struct bch_replicas_entry_cpu *e) { - lockdep_assert_held(&c->replicas_gc_lock); + struct bch_replicas_cpu *r = &c->replicas; - guard(mutex)(&c->sb_lock); - scoped_guard(percpu_write, &c->mark_lock) { - ret = ret ?: - bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc); - if (!ret) - swap(c->replicas, c->replicas_gc); + memcpy(e, cpu_replicas_entry(r, --r->nr), r->entry_size); + bch2_cpu_replicas_sort(r); - kfree(c->replicas_gc.entries); - c->replicas_gc.entries = NULL; - } - - if (!ret) - bch2_write_super(c); - - return ret; -} - -int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) -{ - lockdep_assert_held(&c->replicas_gc_lock); - - guard(mutex)(&c->sb_lock); - BUG_ON(c->replicas_gc.entries); - - c->replicas_gc.nr = 0; - c->replicas_gc.entry_size = 0; - - for_each_cpu_replicas_entry(&c->replicas, e) { - /* Preserve unknown data types */ - if (e->data_type >= BCH_DATA_NR || - !(BIT(e->data_type) & typemask)) { - c->replicas_gc.nr++; - c->replicas_gc.entry_size = - max_t(unsigned, c->replicas_gc.entry_size, - replicas_entry_bytes(e)); - } - } - - c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, - c->replicas_gc.entry_size, - GFP_KERNEL); - if (!c->replicas_gc.entries) { - bch_err(c, "error allocating c->replicas_gc"); - return bch_err_throw(c, ENOMEM_replicas_gc); - } - - unsigned i = 0; - for_each_cpu_replicas_entry(&c->replicas, e) - if (e->data_type >= BCH_DATA_NR || - !(BIT(e->data_type) & typemask)) - memcpy(cpu_replicas_entry(&c->replicas_gc, i++), - e, c->replicas_gc.entry_size); - - bch2_cpu_replicas_sort(&c->replicas_gc); - return 0; + int ret = bch2_cpu_replicas_to_sb_replicas(c, r); + if (WARN(ret, "bch2_cpu_replicas_to_sb_replicas() error: %s", bch2_err_str(ret))) + return; } void bch2_replicas_entry_kill(struct bch_fs *c, struct bch_replicas_entry_v1 *kill) @@ -413,18 +381,95 @@ void bch2_replicas_entry_kill(struct bch_fs *c, struct bch_replicas_entry_v1 *ki lockdep_assert_held(&c->mark_lock); lockdep_assert_held(&c->sb_lock); - struct bch_replicas_cpu *r = &c->replicas; + struct bch_replicas_entry_cpu *e = replicas_entry_search(&c->replicas, kill); - struct bch_replicas_entry_v1 *e = replicas_entry_search(&c->replicas, kill); if (WARN(!e, "replicas entry not found in sb")) return; - memcpy(e, cpu_replicas_entry(r, --r->nr), r->entry_size); + __replicas_entry_kill(c, e); - bch2_cpu_replicas_sort(r); + /* caller does write_super() after dropping mark_lock */ +} - int ret = bch2_cpu_replicas_to_sb_replicas(c, r); - WARN(ret, "bch2_cpu_replicas_to_sb_replicas() error: %s", bch2_err_str(ret)); +void bch2_replicas_entry_put_many(struct bch_fs *c, struct bch_replicas_entry_v1 *r, unsigned nr) +{ + if (!r->nr_devs) + return; + + BUG_ON(r->data_type != BCH_DATA_journal); + verify_replicas_entry(r); + + scoped_guard(percpu_read, &c->mark_lock) { + struct bch_replicas_entry_cpu *e = replicas_entry_search(&c->replicas, r); + + int v = atomic_sub_return(nr, &e->ref); + BUG_ON(v < 0); + if (v) + return; + } + + guard(mutex)(&c->sb_lock); + scoped_guard(percpu_write, &c->mark_lock) { + struct bch_replicas_entry_cpu *e = replicas_entry_search(&c->replicas, r); + if (e && !atomic_read(&e->ref)) + __replicas_entry_kill(c, e); + } + + bch2_write_super(c); +} + +static inline bool bch2_replicas_entry_get_inmem(struct 
bch_fs *c, struct bch_replicas_entry_v1 *r) +{ + guard(percpu_read)(&c->mark_lock); + struct bch_replicas_entry_cpu *e = replicas_entry_search(&c->replicas, r); + if (e) + atomic_inc(&e->ref); + return e != NULL; +} + +int bch2_replicas_entry_get(struct bch_fs *c, struct bch_replicas_entry_v1 *r) +{ + if (!r->nr_devs) + return 0; + + BUG_ON(r->data_type != BCH_DATA_journal); + verify_replicas_entry(r); + + return bch2_replicas_entry_get_inmem(c, r) + ? 0 + : bch2_mark_replicas_slowpath(c, r, 1); +} + +int bch2_replicas_gc_reffed(struct bch_fs *c) +{ + bool write_sb = false; + + guard(mutex)(&c->sb_lock); + + scoped_guard(percpu_write, &c->mark_lock) { + unsigned dst = 0; + for (unsigned i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry_cpu *e = + cpu_replicas_entry(&c->replicas, i); + + if (e->e.data_type != BCH_DATA_journal || + atomic_read(&e->ref)) + memcpy(cpu_replicas_entry(&c->replicas, dst++), + e, + c->replicas.entry_size); + } + + if (c->replicas.nr != dst) { + c->replicas.nr = dst; + bch2_cpu_replicas_sort(&c->replicas); + + try(bch2_cpu_replicas_to_sb_replicas(c, &c->replicas)); + write_sb = true; + } + } + + if (write_sb) + bch2_write_super(c); + return 0; } /* Replicas tracking - superblock: */ @@ -441,6 +486,9 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, nr++; } + entry_size = __cpu_replicas_entry_bytes(entry_size); + entry_size = round_up(entry_size, sizeof(atomic_t)); + cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); if (!cpu_r->entries) return -BCH_ERR_ENOMEM_cpu_replicas; @@ -448,10 +496,10 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, cpu_r->nr = nr; cpu_r->entry_size = entry_size; - for_each_replicas_entry(sb_r, e) { - struct bch_replicas_entry_v1 *dst = cpu_replicas_entry(cpu_r, idx++); - memcpy(dst, e, replicas_entry_bytes(e)); - bch2_replicas_entry_sort(dst); + for_each_replicas_entry(sb_r, src) { + struct bch_replicas_entry_cpu *dst = cpu_replicas_entry(cpu_r, idx++); + memcpy(&dst->e, src, replicas_entry_bytes(src)); + bch2_replicas_entry_sort(&dst->e); } return 0; @@ -469,9 +517,13 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, nr++; } + entry_size = __cpu_replicas_entry_bytes(entry_size); + entry_size += sizeof(struct bch_replicas_entry_v1) - sizeof(struct bch_replicas_entry_v0); + entry_size = round_up(entry_size, sizeof(atomic_t)); + cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); if (!cpu_r->entries) return -BCH_ERR_ENOMEM_cpu_replicas; @@ -480,14 +532,14 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, cpu_r->entry_size = entry_size; for_each_replicas_entry(sb_r, src) { - struct bch_replicas_entry_v1 *dst = + struct bch_replicas_entry_cpu *dst = cpu_replicas_entry(cpu_r, idx++); - dst->data_type = src->data_type; - dst->nr_devs = src->nr_devs; - dst->nr_required = 1; - memcpy(dst->devs, src->devs, src->nr_devs); - bch2_replicas_entry_sort(dst); + dst->e.data_type = src->data_type; + dst->e.nr_devs = src->nr_devs; + dst->e.nr_required = 1; + memcpy(dst->e.devs, src->devs, src->nr_devs); + bch2_replicas_entry_sort(&dst->e); } return 0; @@ -495,6 +547,12 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) { + /* + * If called after fs is started (after journal read), we'll be blowing + * away refcounts + */ + BUG_ON(test_bit(BCH_FS_started, &c->flags)); + struct bch_sb_field_replicas *sb_v1; struct bch_sb_field_replicas_v0 *sb_v0; CLASS(bch_replicas_cpu, 
new_r)(); @@ -522,7 +580,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, bytes = sizeof(struct bch_sb_field_replicas); for_each_cpu_replicas_entry(r, src) - bytes += replicas_entry_bytes(src) - 1; + bytes += replicas_entry_bytes(&src->e) - 1; sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0, DIV_ROUND_UP(bytes, sizeof(u64))); @@ -538,9 +596,9 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, dst = sb_r->entries; for_each_cpu_replicas_entry(r, src) { - dst->data_type = src->data_type; - dst->nr_devs = src->nr_devs; - memcpy(dst->devs, src->devs, src->nr_devs); + dst->data_type = src->e.data_type; + dst->nr_devs = src->e.nr_devs; + memcpy(dst->devs, src->e.devs, src->e.nr_devs); dst = replicas_entry_next(dst); @@ -561,8 +619,8 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, bytes = sizeof(struct bch_sb_field_replicas); for_each_cpu_replicas_entry(r, src) { - bytes += replicas_entry_bytes(src); - if (src->nr_required != 1) + bytes += replicas_entry_bytes(&src->e); + if (src->e.nr_required != 1) need_v1 = true; } @@ -583,7 +641,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, dst = sb_r->entries; for_each_cpu_replicas_entry(r, src) { - memcpy(dst, src, replicas_entry_bytes(src)); + memcpy(dst, &src->e, replicas_entry_bytes(&src->e)); dst = replicas_entry_next(dst); @@ -602,24 +660,26 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, sort_r(cpu_r->entries, cpu_r->nr, cpu_r->entry_size, - bch2_memcmp, NULL, + cpu_replicas_entry_cmp_r, NULL, (void *)(size_t)cpu_r->entry_size); for (i = 0; i < cpu_r->nr; i++) { - struct bch_replicas_entry_v1 *e = + struct bch_replicas_entry_cpu *e = cpu_replicas_entry(cpu_r, i); - try(bch2_replicas_entry_sb_validate(e, sb, err)); + try(bch2_replicas_entry_sb_validate(&e->e, sb, err)); if (i + 1 < cpu_r->nr) { - struct bch_replicas_entry_v1 *n = + struct bch_replicas_entry_cpu *n = cpu_replicas_entry(cpu_r, i + 1); - BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); + int cmp = cpu_replicas_entry_cmp(e, n, cpu_r->entry_size); - if (!memcmp(e, n, cpu_r->entry_size)) { + BUG_ON(cmp > 0); + + if (!cmp) { prt_printf(err, "duplicate replicas entry "); - bch2_replicas_entry_to_text(err, e); + bch2_replicas_entry_to_text(err, &e->e); return -BCH_ERR_invalid_sb_replicas; } } @@ -702,7 +762,9 @@ bool bch2_can_read_fs_with_devs(struct bch_fs *c, struct bch_devs_mask devs, unsigned flags, struct printbuf *err) { guard(percpu_read)(&c->mark_lock); - for_each_cpu_replicas_entry(&c->replicas, e) { + for_each_cpu_replicas_entry(&c->replicas, i) { + struct bch_replicas_entry_v1 *e = &i->e; + unsigned nr_online = 0, nr_failed = 0, dflags = 0; bool metadata = e->data_type < BCH_DATA_user; @@ -820,6 +882,25 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, return bch2_can_read_fs_with_devs(c, devs, flags, err); } +bool bch2_sb_has_journal(struct bch_sb *sb) +{ + struct bch_sb_field_replicas *replicas = bch2_sb_field_get(sb, replicas); + struct bch_sb_field_replicas_v0 *replicas_v0 = bch2_sb_field_get(sb, replicas_v0); + + if (replicas) { + for_each_replicas_entry(replicas, r) + if (r->data_type == BCH_DATA_journal) + return true; + } else if (replicas_v0) { + for_each_replicas_entry(replicas_v0, r) + if (r->data_type == BCH_DATA_journal) + return true; + } + + + return false; +} + unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) { struct bch_sb_field_replicas *replicas; @@ -863,5 +944,4 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev 
*ca) void bch2_fs_replicas_exit(struct bch_fs *c) { kfree(c->replicas.entries); - kfree(c->replicas_gc.entries); } diff --git a/libbcachefs/alloc/replicas.h b/libbcachefs/alloc/replicas.h index b2b86089..f9743ec5 100644 --- a/libbcachefs/alloc/replicas.h +++ b/libbcachefs/alloc/replicas.h @@ -39,13 +39,22 @@ bool bch2_can_read_fs_with_devs(struct bch_fs *, struct bch_devs_mask, bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask, unsigned, struct printbuf *, bool); +bool bch2_sb_has_journal(struct bch_sb *); unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned); unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); -int bch2_replicas_gc_end(struct bch_fs *, int); -int bch2_replicas_gc_start(struct bch_fs *, unsigned); +void bch2_replicas_entry_put_many(struct bch_fs *, struct bch_replicas_entry_v1 *, unsigned); +static inline void bch2_replicas_entry_put(struct bch_fs *c, struct bch_replicas_entry_v1 *r) +{ + bch2_replicas_entry_put_many(c, r, 1); +} + +int bch2_replicas_entry_get(struct bch_fs *, struct bch_replicas_entry_v1 *); + void bch2_replicas_entry_kill(struct bch_fs *, struct bch_replicas_entry_v1 *); +int bch2_replicas_gc_reffed(struct bch_fs *); + static inline bool bch2_replicas_entry_has_dev(struct bch_replicas_entry_v1 *r, unsigned dev) { for (unsigned i = 0; i < r->nr_devs; i++) @@ -54,6 +63,12 @@ static inline bool bch2_replicas_entry_has_dev(struct bch_replicas_entry_v1 *r, return false; } +static inline bool bch2_replicas_entry_eq(struct bch_replicas_entry_v1 *l, + struct bch_replicas_entry_v1 *r) +{ + return l->nr_devs == r->nr_devs && !memcmp(l, r, replicas_entry_bytes(l)); +} + /* iterate over superblock replicas - used by userspace tools: */ #define replicas_entry_next(_i) \ diff --git a/libbcachefs/alloc/replicas_types.h b/libbcachefs/alloc/replicas_types.h index 418e702e..50d8f87c 100644 --- a/libbcachefs/alloc/replicas_types.h +++ b/libbcachefs/alloc/replicas_types.h @@ -2,10 +2,16 @@ #ifndef _BCACHEFS_REPLICAS_TYPES_H #define _BCACHEFS_REPLICAS_TYPES_H +/* unsized - bch_replicas_entry_v1 is variable length */ +struct bch_replicas_entry_cpu { + atomic_t ref; + struct bch_replicas_entry_v1 e; +}; + struct bch_replicas_cpu { - unsigned nr; - unsigned entry_size; - struct bch_replicas_entry_v1 *entries; + unsigned nr; + unsigned entry_size; + struct bch_replicas_entry_cpu *entries; }; union bch_replicas_padded { diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 8539b4e1..9fd7a62e 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -808,8 +808,6 @@ struct bch_fs { struct bch_accounting_mem accounting; struct bch_replicas_cpu replicas; - struct bch_replicas_cpu replicas_gc; - struct mutex replicas_gc_lock; struct journal_entry_res btree_root_journal_res; struct journal_entry_res clock_journal_res; diff --git a/libbcachefs/btree/key_cache.c b/libbcachefs/btree/key_cache.c index 0532e9a9..f2e213d2 100644 --- a/libbcachefs/btree/key_cache.c +++ b/libbcachefs/btree/key_cache.c @@ -438,10 +438,10 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, * sequence number with a new btree node write, we want to re-journal * the update */ - if (ck->journal.seq == journal_last_seq(j)) + if (ck->journal.seq == j->last_seq) commit_flags |= BCH_WATERMARK_reclaim; - if (ck->journal.seq != journal_last_seq(j) || + if (ck->journal.seq != j->last_seq || !journal_low_on_space(&c->journal)) commit_flags |= BCH_TRANS_COMMIT_no_journal_res; diff --git a/libbcachefs/data/extents.c b/libbcachefs/data/extents.c index 
93fe7b74..4d7f7287 100644 --- a/libbcachefs/data/extents.c +++ b/libbcachefs/data/extents.c @@ -598,31 +598,16 @@ static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, bch2_crc_cmp(l.csum, r.csum)); } -static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, - struct bch_extent_crc_unpacked n) +static union bch_extent_entry *bkey_crc_find(struct bkey_i *k, struct bch_extent_crc_unpacked crc) { - return !crc_is_compressed(u) && - u.csum_type && - u.uncompressed_size > u.live_size && - bch2_csum_type_is_encryption(u.csum_type) == - bch2_csum_type_is_encryption(n.csum_type); -} + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + struct bch_extent_crc_unpacked i; + union bch_extent_entry *entry; -bool bch2_can_narrow_extent_crcs(struct bkey_s_c k, - struct bch_extent_crc_unpacked n) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_extent_crc_unpacked crc; - const union bch_extent_entry *i; - - if (!n.csum_type) - return false; - - bkey_for_each_crc(k.k, ptrs, crc, i) - if (can_narrow_crc(crc, n)) - return true; - - return false; + bkey_for_each_crc(&k->k, ptrs, i, entry) + if (!bch2_crc_unpacked_cmp(i, crc)) + return entry; + return NULL; } /* @@ -634,44 +619,31 @@ bool bch2_can_narrow_extent_crcs(struct bkey_s_c k, * currently live (so that readers won't have to bounce) while we've got the * checksum we need: */ -bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) +bool bch2_bkey_narrow_crc(struct bkey_i *k, + struct bch_extent_crc_unpacked old, + struct bch_extent_crc_unpacked new) { - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); - struct bch_extent_crc_unpacked u; - struct extent_ptr_decoded p; - union bch_extent_entry *i; - bool ret = false; + BUG_ON(crc_is_compressed(new)); + BUG_ON(new.offset); + BUG_ON(new.live_size != k->k.size); - /* Find a checksum entry that covers only live data: */ - if (!n.csum_type) { - bkey_for_each_crc(&k->k, ptrs, u, i) - if (!crc_is_compressed(u) && - u.csum_type && - u.live_size == u.uncompressed_size) { - n = u; - goto found; - } + + union bch_extent_entry *old_e = bkey_crc_find(k, old); + if (!old_e) return false; + + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + union bch_extent_entry *i; + + bkey_extent_entry_for_each_from(ptrs, i, extent_entry_next(old_e)) { + if (extent_entry_is_crc(i)) + break; + if (extent_entry_is_ptr(i)) + i->ptr.offset += old.offset; } -found: - BUG_ON(crc_is_compressed(n)); - BUG_ON(n.offset); - BUG_ON(n.live_size != k->k.size); -restart_narrow_pointers: - ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); - - bkey_for_each_ptr_decode(&k->k, ptrs, p, i) - if (can_narrow_crc(p.crc, n)) { - bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr); - p.ptr.offset += p.crc.offset; - p.crc = n; - bch2_extent_ptr_decoded_append(k, &p); - ret = true; - goto restart_narrow_pointers; - } - - return ret; + bch2_extent_crc_pack(entry_to_crc(old_e), new, extent_entry_type(old_e)); + return true; } static void bch2_extent_crc_pack(union bch_extent_crc *dst, diff --git a/libbcachefs/data/extents.h b/libbcachefs/data/extents.h index 286afa94..82844873 100644 --- a/libbcachefs/data/extents.h +++ b/libbcachefs/data/extents.h @@ -461,9 +461,9 @@ bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); /* Extent checksum entries: */ -bool bch2_can_narrow_extent_crcs(struct bkey_s_c, - struct bch_extent_crc_unpacked); -bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); +bool bch2_bkey_narrow_crc(struct bkey_i *, + 
struct bch_extent_crc_unpacked, + struct bch_extent_crc_unpacked); void bch2_extent_crc_append(struct bkey_i *, struct bch_extent_crc_unpacked); diff --git a/libbcachefs/data/read.c b/libbcachefs/data/read.c index 4ebd9971..53b1295d 100644 --- a/libbcachefs/data/read.c +++ b/libbcachefs/data/read.c @@ -784,7 +784,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, bkey_reassemble(new, k); - if (!bch2_bkey_narrow_crcs(new, *new_crc)) + if (!bch2_bkey_narrow_crc(new, rbio->pick.crc, *new_crc)) return bch_err_throw(c, rbio_narrow_crcs_fail); return bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node); @@ -794,7 +794,8 @@ static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) { struct bch_fs *c = rbio->c; - if (crc_is_compressed(rbio->pick.crc)) + if (!rbio->pick.crc.csum_type || + crc_is_compressed(rbio->pick.crc)) return; u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; @@ -1070,6 +1071,13 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, bch2_fs_inconsistent(c, "%s", buf.buf); } +static inline bool can_narrow_crc(struct bch_extent_crc_unpacked n) +{ + return n.csum_type && + n.uncompressed_size > n.live_size && + !crc_is_compressed(n); +} + int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bvec_iter iter, struct bpos read_pos, enum btree_id data_btree, struct bkey_s_c k, @@ -1170,8 +1178,7 @@ retry_pick: bio_flagged(&orig->bio, BIO_CHAIN)) flags |= BCH_READ_must_clone; - narrow_crcs = !(flags & BCH_READ_in_retry) && - bch2_can_narrow_extent_crcs(k, pick.crc); + narrow_crcs = !(flags & BCH_READ_in_retry) && can_narrow_crc(pick.crc); if (narrow_crcs && (flags & BCH_READ_user_mapped)) flags |= BCH_READ_must_bounce; diff --git a/libbcachefs/data/update.c b/libbcachefs/data/update.c index a636b0c1..7f508674 100644 --- a/libbcachefs/data/update.c +++ b/libbcachefs/data/update.c @@ -298,7 +298,6 @@ restart_drop_extra_replicas: extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) bch2_extent_ptr_decoded_append(insert, &p); - bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 }); bch2_bkey_drop_extra_cached_ptrs(c, &m->op.opts, bkey_i_to_s(insert)); ret = bch2_sum_sector_overwrites(trans, &iter, insert, @@ -784,6 +783,53 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m) return 0; } +/* + * When an extent has both checksummed and non-checksummed pointers, special + * handling: + * + * We don't want to blindly apply an existing checksum to non-checksummed data, + * or lose our ability to detect that different replicas in the same extent have + * or had different data, so: + * + * - prefer to read from the specific replica being rewritten + * - if we're rewriting a replica without a checksum, only rewrite that specific + * replica in this data update + */ +static void checksummed_and_non_checksummed_handling(struct data_update *u, struct bkey_ptrs_c ptrs) +{ + bool have_checksummed = false, have_non_checksummed = false; + + struct bkey_s_c k = bkey_i_to_s_c(u->k.k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.crc.csum_type) + have_checksummed = true; + else + have_non_checksummed = true; + } + + if (unlikely(have_checksummed && have_non_checksummed)) { + unsigned ptr_bit = 1; + int rewrite_checksummed = -1; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (ptr_bit & u->opts.ptrs_rewrite) { + if (rewrite_checksummed < 0) { 
rewrite_checksummed = p.crc.csum_type != 0; + u->opts.read_dev = p.ptr.dev; + } + + if (rewrite_checksummed != (p.crc.csum_type != 0) || + (!rewrite_checksummed && p.ptr.dev != u->opts.read_dev)) + u->opts.ptrs_rewrite &= ~ptr_bit; + } + + ptr_bit <<= 1; + } + } +} + int bch2_data_update_init(struct btree_trans *trans, struct btree_iter *iter, struct moving_context *ctxt, @@ -844,6 +890,9 @@ int bch2_data_update_init(struct btree_trans *trans, unsigned buf_bytes = 0; bool unwritten = false; + if (m->opts.ptrs_rewrite) + checksummed_and_non_checksummed_handling(m, ptrs); + scoped_guard(rcu) { unsigned ptr_bit = 1; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { @@ -957,6 +1006,11 @@ int bch2_data_update_init(struct btree_trans *trans, } } + /* + * Check if we have checksummed and non-checksummed pointers, prefer to + * read from the pointer we're operating on + */ + m->ptrs_held = bkey_get_dev_refs(c, k); if (c->opts.nocow_enabled) { diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index 5fbda02e..d11a1a45 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -226,6 +226,7 @@ x(EINVAL, erasure_coding_found_btree_node) \ x(EINVAL, option_negative) \ x(EINVAL, topology_repair) \ + x(EINVAL, unaligned_io) \ x(BCH_ERR_topology_repair, topology_repair_drop_this_node) \ x(BCH_ERR_topology_repair, topology_repair_drop_prev_node) \ x(BCH_ERR_topology_repair, topology_repair_did_fill_from_scan) \ diff --git a/libbcachefs/init/fs.c b/libbcachefs/init/fs.c index 1ea8c224..594263a4 100644 --- a/libbcachefs/init/fs.c +++ b/libbcachefs/init/fs.c @@ -375,9 +375,6 @@ void bch2_fs_read_only(struct bch_fs *c) BUG_ON(c->btree_write_buffer.inc.keys.nr); BUG_ON(c->btree_write_buffer.flushing.keys.nr); bch2_verify_accounting_clean(c); - - bch_verbose(c, "marking filesystem clean"); - bch2_fs_mark_clean(c); } else { /* Make sure error counts/counters are persisted */ guard(mutex)(&c->sb_lock); @@ -473,7 +470,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) try(bch2_fs_init_rw(c)); try(bch2_sb_members_v2_init(c)); - try(bch2_fs_mark_dirty(c)); clear_bit(BCH_FS_clean_shutdown, &c->flags); @@ -918,7 +914,7 @@ static int bch2_fs_opt_version_init(struct bch_fs *c) } if (c->sb.version_incompat_allowed != c->sb.version) { - prt_printf(&p, "\nallowing incompatible features above "); + prt_printf(&p, "\nallowing incompatible features up to "); bch2_version_to_text(&p, c->sb.version_incompat_allowed); } @@ -1052,7 +1048,6 @@ static int bch2_fs_init(struct bch_fs *c, struct bch_sb *sb, init_rwsem(&c->state_lock); mutex_init(&c->sb_lock); - mutex_init(&c->replicas_gc_lock); mutex_init(&c->btree_root_lock); INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); diff --git a/libbcachefs/init/recovery.c b/libbcachefs/init/recovery.c index b001065b..25cdb194 100644 --- a/libbcachefs/init/recovery.c +++ b/libbcachefs/init/recovery.c @@ -610,8 +610,7 @@ fsck_err: int bch2_fs_recovery(struct bch_fs *c) { struct bch_sb_field_clean *clean = NULL; - struct jset *last_journal_entry = NULL; - u64 last_seq = 0, blacklist_seq, journal_seq; + struct journal_start_info journal_start = {}; int ret = 0; if (c->sb.clean) { @@ -637,7 +636,7 @@ int bch2_fs_recovery(struct bch_fs *c) struct journal_replay **i; bch_verbose(c, "starting journal read"); - ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq); + ret = bch2_journal_read(c, &journal_start); if (ret) goto err; @@ -648,22 +647,21 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->opts.read_journal_only) goto out; + if 
(mustfix_fsck_err_on(c->sb.clean && !journal_start.clean, + c, clean_but_journal_not_empty, + "filesystem marked clean but journal not empty")) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->sb.clean = false; + } + + struct jset *last_journal_entry = NULL; genradix_for_each_reverse(&c->journal_entries, iter, i) if (!journal_replay_ignore(*i)) { last_journal_entry = &(*i)->j; break; } - if (mustfix_fsck_err_on(c->sb.clean && - last_journal_entry && - !journal_entry_empty(last_journal_entry), c, - clean_but_journal_not_empty, - "filesystem marked clean but journal not empty")) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->sb.clean = false; - } - if (!last_journal_entry) { fsck_err_on(!c->sb.clean, c, dirty_but_no_journal_entries, @@ -705,11 +703,12 @@ use_clean: goto err; } - blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; + + journal_start.start_seq = le64_to_cpu(clean->journal_seq) + 1; } - c->journal_replay_seq_start = last_seq; - c->journal_replay_seq_end = blacklist_seq - 1; + c->journal_replay_seq_start = journal_start.seq_read_start; + c->journal_replay_seq_end = journal_start.seq_read_end; zero_out_btree_mem_ptr(&c->journal_keys); @@ -756,13 +755,15 @@ use_clean: * journal sequence numbers: */ if (!c->sb.clean) - journal_seq += JOURNAL_BUF_NR * 4; + journal_start.start_seq += JOURNAL_BUF_NR * 4; - if (blacklist_seq != journal_seq) { + if (journal_start.seq_read_end && + journal_start.seq_read_end + 1 != journal_start.start_seq) { + u64 blacklist_seq = journal_start.seq_read_end + 1; ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu", - blacklist_seq, journal_seq) ?: + blacklist_seq, journal_start.start_seq) ?: bch2_journal_seq_blacklist_add(c, - blacklist_seq, journal_seq); + blacklist_seq, journal_start.start_seq); if (ret) { bch_err_msg(c, ret, "error creating new journal seq blacklist entry"); goto err; @@ -770,8 +771,10 @@ use_clean: } ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu", - journal_seq, last_seq, blacklist_seq - 1) ?: - bch2_fs_journal_start(&c->journal, last_seq, journal_seq); + journal_start.start_seq, + journal_start.seq_read_start, + journal_start.seq_read_end) ?: + bch2_fs_journal_start(&c->journal, journal_start); if (ret) goto err; @@ -1014,7 +1017,8 @@ int bch2_fs_initialize(struct bch_fs *c) * journal_res_get() will crash if called before this has * set up the journal.pin FIFO and journal.cur pointer: */ - ret = bch2_fs_journal_start(&c->journal, 1, 1); + struct journal_start_info journal_start = { .start_seq = 1 }; + ret = bch2_fs_journal_start(&c->journal, journal_start); if (ret) goto err; diff --git a/libbcachefs/journal/init.c b/libbcachefs/journal/init.c index e7410baa..651228cc 100644 --- a/libbcachefs/journal/init.c +++ b/libbcachefs/journal/init.c @@ -11,6 +11,7 @@ #include "alloc/foreground.h" #include "alloc/replicas.h" #include "btree/update.h" +#include "init/error.h" /* allocate journal on a device: */ @@ -367,29 +368,30 @@ void bch2_fs_journal_stop(struct journal *j) clear_bit(JOURNAL_running, &j->flags); } -int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq) +int bch2_fs_journal_start(struct journal *j, struct journal_start_info info) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_entry_pin_list *p; struct journal_replay *i, **_i; struct genradix_iter iter; bool had_entries = false; + int ret = 0; /* * * XXX pick most 
recent non blacklisted sequence number */ - cur_seq = max(cur_seq, bch2_journal_last_blacklisted_seq(c)); + info.start_seq = max(info.start_seq, bch2_journal_last_blacklisted_seq(c)); - if (cur_seq >= JOURNAL_SEQ_MAX) { + if (info.start_seq >= JOURNAL_SEQ_MAX) { bch_err(c, "cannot start: journal seq overflow"); return -EINVAL; } /* Clean filesystem? */ - if (!last_seq) - last_seq = cur_seq; + u64 cur_seq = info.start_seq; + u64 last_seq = info.seq_read_start ?: info.start_seq; u64 nr = cur_seq - last_seq; if (nr * sizeof(struct journal_entry_pin_list) > 1U << 30) { @@ -419,6 +421,7 @@ int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq) j->seq_write_started = cur_seq - 1; j->seq_ondisk = cur_seq - 1; j->pin.front = last_seq; + j->last_seq = last_seq; j->pin.back = cur_seq; atomic64_set(&j->seq, cur_seq - 1); @@ -441,12 +444,26 @@ int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq) if (journal_entry_empty(&i->j)) j->last_empty_seq = le64_to_cpu(i->j.seq); - struct bch_devs_list seq_devs = {}; - darray_for_each(i->ptrs, ptr) - seq_devs.data[seq_devs.nr++] = ptr->dev; + if (!info.clean) { + struct bch_devs_list seq_devs = {}; + darray_for_each(i->ptrs, ptr) + seq_devs.data[seq_devs.nr++] = ptr->dev; - p = journal_seq_pin(j, seq); - bch2_devlist_to_replicas(&p->devs.e, BCH_DATA_journal, seq_devs); + p = journal_seq_pin(j, seq); + bch2_devlist_to_replicas(&p->devs.e, BCH_DATA_journal, seq_devs); + + CLASS(printbuf, buf)(); + bch2_replicas_entry_to_text(&buf, &p->devs.e); + + fsck_err_on(!test_bit(JOURNAL_degraded, &j->flags) && + !bch2_replicas_marked(c, &p->devs.e), + c, journal_entry_replicas_not_marked, + "superblock not marked as containing replicas for journal entry %llu\n%s", + le64_to_cpu(i->j.seq), buf.buf); + + if (bch2_replicas_entry_get(c, &p->devs.e)) + p->devs.e.nr_devs = 0; + } had_entries = true; } @@ -460,7 +477,9 @@ int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq) c->last_bucket_seq_cleanup = journal_cur_seq(j); } - return 0; + try(bch2_replicas_gc_reffed(c)); +fsck_err: + return ret; } void bch2_journal_set_replay_done(struct journal *j) @@ -585,6 +604,7 @@ void bch2_fs_journal_init_early(struct journal *j) init_waitqueue_head(&j->reclaim_wait); init_waitqueue_head(&j->pin_flush_wait); mutex_init(&j->reclaim_lock); + mutex_init(&j->last_seq_ondisk_lock); mutex_init(&j->discard_lock); lockdep_init_map(&j->res_map, "journal res", &res_key, 0); diff --git a/libbcachefs/journal/init.h b/libbcachefs/journal/init.h index 6d49c29a..6fc55c50 100644 --- a/libbcachefs/journal/init.h +++ b/libbcachefs/journal/init.h @@ -11,7 +11,7 @@ int bch2_fs_journal_alloc(struct bch_fs *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); void bch2_fs_journal_stop(struct journal *); -int bch2_fs_journal_start(struct journal *, u64, u64); +int bch2_fs_journal_start(struct journal *, struct journal_start_info); void bch2_journal_set_replay_done(struct journal *); void bch2_dev_journal_exit(struct bch_dev *); diff --git a/libbcachefs/journal/journal.c b/libbcachefs/journal/journal.c index 2b167bf8..1c1d3472 100644 --- a/libbcachefs/journal/journal.c +++ b/libbcachefs/journal/journal.c @@ -187,7 +187,7 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq) lockdep_assert_held(&j->lock); if (__bch2_journal_pin_put(j, seq)) - bch2_journal_reclaim_fast(j); + bch2_journal_update_last_seq(j); bch2_journal_do_writes(j); /* @@ -235,10 +235,10 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t /* 
Close out old buffer: */ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); - struct journal_entry_pin_list *pin_list = - journal_seq_pin(j, journal_cur_seq(j)); - pin_list->bytes = roundup_pow_of_two(vstruct_bytes(buf->data)); - j->dirty_entry_bytes += pin_list->bytes; + size_t bytes = roundup_pow_of_two(vstruct_bytes(buf->data)); + + journal_seq_pin(j, journal_cur_seq(j))->bytes = bytes; + j->dirty_entry_bytes += bytes; if (trace_journal_entry_close_enabled() && trace) { CLASS(printbuf, err)(); @@ -280,7 +280,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t * contain either what the old pin protected or what the new pin * protects. * - * After the old pin is dropped journal_last_seq() won't include the old + * After the old pin is dropped j->last_seq won't include the old * pin, so we can only write the updated last_seq on the entry that * contains whatever the new pin protects. * @@ -291,7 +291,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t * Hence, we want update/set last_seq on the current journal entry right * before we open a new one: */ - buf->last_seq = journal_last_seq(j); + buf->last_seq = j->last_seq; buf->data->last_seq = cpu_to_le64(buf->last_seq); BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq)); @@ -358,7 +358,6 @@ static int journal_entry_open(struct journal *j) lockdep_assert_held(&j->lock); BUG_ON(journal_entry_is_open(j)); - BUG_ON(c->sb.clean); if (j->blocked) return bch_err_throw(c, journal_blocked); @@ -416,7 +415,7 @@ static int journal_entry_open(struct journal *j) /* * The fifo_push() needs to happen at the same time as j->seq is - * incremented for journal_last_seq() to be calculated correctly + * incremented for j->last_seq to be calculated correctly */ atomic64_inc(&j->seq); journal_pin_list_init(fifo_push_ref(&j->pin), 1); @@ -1092,7 +1091,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size); prt_printf(out, "seq:\t%llu\n", journal_cur_seq(j)); prt_printf(out, "seq_ondisk:\t%llu\n", j->seq_ondisk); - prt_printf(out, "last_seq:\t%llu\n", journal_last_seq(j)); + prt_printf(out, "last_seq:\t%llu\n", j->last_seq); prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); prt_printf(out, "watermark:\t%s\n", bch2_watermarks[j->watermark]); diff --git a/libbcachefs/journal/journal.h b/libbcachefs/journal/journal.h index 88e18364..272a3a64 100644 --- a/libbcachefs/journal/journal.h +++ b/libbcachefs/journal/journal.h @@ -129,11 +129,6 @@ static inline bool journal_low_on_space(struct journal *j) /* Sequence number of oldest dirty journal entry */ -static inline u64 journal_last_seq(struct journal *j) -{ - return j->pin.front; -} - static inline u64 journal_cur_seq(struct journal *j) { return atomic64_read(&j->seq); diff --git a/libbcachefs/journal/read.c b/libbcachefs/journal/read.c index 2a697a23..4c5753b9 100644 --- a/libbcachefs/journal/read.c +++ b/libbcachefs/journal/read.c @@ -1346,18 +1346,17 @@ fsck_err: return ret; } -int bch2_journal_read(struct bch_fs *c, - u64 *last_seq, - u64 *blacklist_seq, - u64 *start_seq) +int bch2_journal_read(struct bch_fs *c, struct journal_start_info *info) { struct journal_list jlist; struct journal_replay *i, **_i; struct genradix_iter radix_iter; - bool degraded = false, last_write_torn = false; + bool last_write_torn = false; u64 seq; int ret = 0; + 
memset(info, 0, sizeof(*info)); + closure_init_stack(&jlist.cl); mutex_init(&jlist.lock); jlist.last_seq = 0; @@ -1377,7 +1376,7 @@ int bch2_journal_read(struct bch_fs *c, system_unbound_wq, &jlist.cl); else - degraded = true; + set_bit(JOURNAL_degraded, &c->journal.flags); } while (closure_sync_timeout(&jlist.cl, sysctl_hung_task_timeout_secs * HZ / 2)) @@ -1386,10 +1385,6 @@ int bch2_journal_read(struct bch_fs *c, if (jlist.ret) return jlist.ret; - *last_seq = 0; - *start_seq = 0; - *blacklist_seq = 0; - /* * Find most recent flush entry, and ignore newer non flush entries - * those entries will be blacklisted: @@ -1400,8 +1395,8 @@ int bch2_journal_read(struct bch_fs *c, if (journal_replay_ignore(i)) continue; - if (!*start_seq) - *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; + if (!info->start_seq) + info->start_seq = le64_to_cpu(i->j.seq) + 1; if (JSET_NO_FLUSH(&i->j)) { i->ignore_blacklisted = true; @@ -1426,27 +1421,28 @@ int bch2_journal_read(struct bch_fs *c, le64_to_cpu(i->j.seq))) i->j.last_seq = i->j.seq; - *last_seq = le64_to_cpu(i->j.last_seq); - *blacklist_seq = le64_to_cpu(i->j.seq) + 1; + info->seq_read_start = le64_to_cpu(i->j.last_seq); + info->seq_read_end = le64_to_cpu(i->j.seq); + info->clean = journal_entry_empty(&i->j); break; } - if (!*start_seq) { + if (!info->start_seq) { bch_info(c, "journal read done, but no entries found"); return 0; } - if (!*last_seq) { + if (!info->seq_read_end) { fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes, "journal read done, but no entries found after dropping non-flushes"); return 0; } - u64 drop_before = *last_seq; + u64 drop_before = info->seq_read_start; { CLASS(printbuf, buf)(); prt_printf(&buf, "journal read done, replaying entries %llu-%llu", - *last_seq, *blacklist_seq - 1); + info->seq_read_start, info->seq_read_end); /* * Drop blacklisted entries and entries older than last_seq (or start of @@ -1457,9 +1453,11 @@ int bch2_journal_read(struct bch_fs *c, prt_printf(&buf, " (rewinding from %llu)", c->opts.journal_rewind); } - *last_seq = drop_before; - if (*start_seq != *blacklist_seq) - prt_printf(&buf, " (unflushed %llu-%llu)", *blacklist_seq, *start_seq - 1); + info->seq_read_start = drop_before; + if (info->seq_read_end + 1 != info->start_seq) + prt_printf(&buf, " (unflushed %llu-%llu)", + info->seq_read_end + 1, + info->start_seq - 1); bch_info(c, "%s", buf.buf); } @@ -1483,7 +1481,7 @@ int bch2_journal_read(struct bch_fs *c, } } - try(bch2_journal_check_for_missing(c, drop_before, *blacklist_seq - 1)); + try(bch2_journal_check_for_missing(c, drop_before, info->seq_read_end)); genradix_for_each(&c->journal_entries, radix_iter, _i) { union bch_replicas_padded replicas = { @@ -1516,17 +1514,6 @@ int bch2_journal_read(struct bch_fs *c, replicas_entry_add_dev(&replicas.e, ptr->dev); bch2_replicas_entry_sort(&replicas.e); - - CLASS(printbuf, buf)(); - bch2_replicas_entry_to_text(&buf, &replicas.e); - - if (!degraded && - !bch2_replicas_marked(c, &replicas.e) && - (le64_to_cpu(i->j.seq) == *last_seq || - fsck_err(c, journal_entry_replicas_not_marked, - "superblock not marked as containing replicas for journal entry %llu\n%s", - le64_to_cpu(i->j.seq), buf.buf))) - try(bch2_mark_replicas(c, &replicas.e)); } fsck_err: return ret; diff --git a/libbcachefs/journal/read.h b/libbcachefs/journal/read.h index ff3c8690..556a7ff1 100644 --- a/libbcachefs/journal/read.h +++ b/libbcachefs/journal/read.h @@ -70,6 +70,6 @@ struct u64_range { struct u64_range bch2_journal_entry_missing_range(struct bch_fs *, u64, u64); 
-int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *); +int bch2_journal_read(struct bch_fs *, struct journal_start_info *); #endif /* _BCACHEFS_JOURNAL_READ_H */ diff --git a/libbcachefs/journal/reclaim.c b/libbcachefs/journal/reclaim.c index d15b54fc..703f626b 100644 --- a/libbcachefs/journal/reclaim.c +++ b/libbcachefs/journal/reclaim.c @@ -211,7 +211,7 @@ void bch2_journal_space_available(struct journal *j) continue; while (ja->dirty_idx != ja->cur_idx && - ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j)) + ja->bucket_seq[ja->dirty_idx] < j->last_seq) ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; while (ja->dirty_idx_ondisk != ja->dirty_idx && @@ -325,37 +325,66 @@ void bch2_journal_do_discards(struct journal *j) * entry, holding it open to ensure it gets replayed during recovery: */ -void bch2_journal_reclaim_fast(struct journal *j) +void bch2_journal_update_last_seq(struct journal *j) { - bool popped = false; - lockdep_assert_held(&j->lock); /* * Unpin journal entries whose reference counts reached zero, meaning * all btree nodes got written out */ + u64 old = j->last_seq; struct journal_entry_pin_list *pin_list; - while (!fifo_empty(&j->pin) && - j->pin.front <= j->seq_ondisk && - !atomic_read(&(pin_list = &fifo_peek_front(&j->pin))->count)) { + while (j->last_seq < j->pin.back && + j->last_seq <= j->seq_ondisk && + !atomic_read(&(pin_list = journal_seq_pin(j, j->last_seq))->count)) + j->last_seq++; - if (WARN_ON(j->dirty_entry_bytes < pin_list->bytes)) - pin_list->bytes = j->dirty_entry_bytes; - - j->dirty_entry_bytes -= pin_list->bytes; - pin_list->bytes = 0; - - j->pin.front++; - popped = true; - } - - if (popped) { + if (old != j->last_seq) { bch2_journal_space_available(j); __closure_wake_up(&j->reclaim_flush_wait); } } +void bch2_journal_update_last_seq_ondisk(struct journal *j, u64 last_seq_ondisk) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + union bch_replicas_padded replicas; + unsigned nr_refs = 0; + size_t dirty_entry_bytes = 0; + + scoped_guard(mutex, &j->last_seq_ondisk_lock) + while (j->last_seq_ondisk < last_seq_ondisk) { + struct journal_entry_pin_list *pin_list = journal_seq_pin(j, j->last_seq_ondisk); + + if (pin_list->devs.e.nr_devs) { + if (nr_refs && + !bch2_replicas_entry_eq(&replicas.e, &pin_list->devs.e)) { + bch2_replicas_entry_put_many(c, &replicas.e, nr_refs); + nr_refs = 0; + } + + memcpy(&replicas, &pin_list->devs, replicas_entry_bytes(&pin_list->devs.e)); + pin_list->devs.e.nr_devs = 0; + nr_refs++; + } + + dirty_entry_bytes += pin_list->bytes; + pin_list->bytes = 0; + + j->last_seq_ondisk++; + } + + scoped_guard(spinlock, &j->lock) { + if (WARN_ON(j->dirty_entry_bytes < dirty_entry_bytes)) + dirty_entry_bytes = j->dirty_entry_bytes; + j->dirty_entry_bytes -= dirty_entry_bytes; + } + + if (nr_refs) + bch2_replicas_entry_put_many(c, &replicas.e, nr_refs); +} + bool __bch2_journal_pin_put(struct journal *j, u64 seq) { struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); @@ -367,7 +396,7 @@ void bch2_journal_pin_put(struct journal *j, u64 seq) { if (__bch2_journal_pin_put(j, seq)) { guard(spinlock)(&j->lock); - bch2_journal_reclaim_fast(j); + bch2_journal_update_last_seq(j); } } @@ -394,7 +423,7 @@ static inline bool __journal_pin_drop(struct journal *j, * writing a new last_seq will now make another bucket available: */ return atomic_dec_and_test(&pin_list->count) && - pin_list == &fifo_peek_front(&j->pin); + pin_list == journal_seq_pin(j, j->last_seq); } void bch2_journal_pin_drop(struct journal *j, @@ 
-402,7 +431,7 @@ void bch2_journal_pin_drop(struct journal *j, { guard(spinlock)(&j->lock); if (__journal_pin_drop(j, pin)) - bch2_journal_reclaim_fast(j); + bch2_journal_update_last_seq(j); } static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin, @@ -453,7 +482,7 @@ void bch2_journal_pin_copy(struct journal *j, u64 seq = READ_ONCE(src->seq); - if (seq < journal_last_seq(j)) { + if (seq < j->last_seq) { /* * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on * the src pin - with the pin dropped, the entry to pin might no @@ -468,13 +497,13 @@ void bch2_journal_pin_copy(struct journal *j, bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn)); if (reclaim) - bch2_journal_reclaim_fast(j); + bch2_journal_update_last_seq(j); /* * If the journal is currently full, we might want to call flush_fn * immediately: */ - if (seq == journal_last_seq(j)) + if (seq == j->last_seq) journal_wake(j); } @@ -485,19 +514,19 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, bool wake; scoped_guard(spinlock, &j->lock) { - BUG_ON(seq < journal_last_seq(j)); + BUG_ON(seq < j->last_seq); bool reclaim = __journal_pin_drop(j, pin); bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn)); if (reclaim) - bch2_journal_reclaim_fast(j); + bch2_journal_update_last_seq(j); /* * If the journal is currently full, we might want to call flush_fn * immediately: */ - wake = seq == journal_last_seq(j); + wake = seq == j->last_seq; } if (wake) @@ -929,8 +958,8 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, */ guard(spinlock)(&j->lock); return !test_bit(JOURNAL_replay_done, &j->flags) || - journal_last_seq(j) > seq_to_flush || - !fifo_used(&j->pin); + j->last_seq > seq_to_flush || + j->last_seq == j->pin.back; } bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) @@ -964,39 +993,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) try(bch2_journal_error(j)); - guard(mutex)(&c->replicas_gc_lock); - bch2_replicas_gc_start(c, 1 << BCH_DATA_journal); - - /* - * Now that we've populated replicas_gc, write to the journal to mark - * active journal devices. This handles the case where the journal might - * be empty. Otherwise we could clear all journal replicas and - * temporarily put the fs into an unrecoverable state. Journal recovery - * expects to find devices marked for journal data on unclean mount. 
- */ - int ret = bch2_journal_meta(&c->journal); - if (ret) - goto err; - - seq = 0; - scoped_guard(spinlock, &j->lock) - while (!ret) { - seq = max(seq, journal_last_seq(j)); - if (seq > j->seq_ondisk) - break; - - union bch_replicas_padded replicas; - memcpy(&replicas, &journal_seq_pin(j, seq)->devs, sizeof(replicas)); - seq++; - - if (replicas.e.nr_devs) { - spin_unlock(&j->lock); - ret = bch2_mark_replicas(c, &replicas.e); - spin_lock(&j->lock); - } - } -err: - return bch2_replicas_gc_end(c, ret); + return 0; } bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) @@ -1010,7 +1007,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 if (!test_bit(JOURNAL_running, &j->flags)) return true; - *seq = max(*seq, j->pin.front); + *seq = max(*seq, j->last_seq); if (*seq >= j->pin.back) return true; diff --git a/libbcachefs/journal/reclaim.h b/libbcachefs/journal/reclaim.h index 2578abfa..e1956ba9 100644 --- a/libbcachefs/journal/reclaim.h +++ b/libbcachefs/journal/reclaim.h @@ -43,7 +43,9 @@ journal_seq_pin(struct journal *j, u64 seq) return &j->pin.data[seq & j->pin.mask]; } -void bch2_journal_reclaim_fast(struct journal *); +void bch2_journal_update_last_seq(struct journal *); +void bch2_journal_update_last_seq_ondisk(struct journal *, u64); + bool __bch2_journal_pin_put(struct journal *, u64); void bch2_journal_pin_put(struct journal *, u64); void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); diff --git a/libbcachefs/journal/types.h b/libbcachefs/journal/types.h index 43d9e842..51061443 100644 --- a/libbcachefs/journal/types.h +++ b/libbcachefs/journal/types.h @@ -149,6 +149,7 @@ enum journal_space_from { }; #define JOURNAL_FLAGS() \ + x(degraded) \ x(replay_done) \ x(running) \ x(may_skip_flush) \ @@ -265,6 +266,8 @@ struct journal { u64 front, back, size, mask; struct journal_entry_pin_list *data; } pin; + u64 last_seq; + size_t dirty_entry_bytes; struct journal_space space[journal_space_nr]; @@ -276,6 +279,7 @@ struct journal { spinlock_t err_lock; struct mutex reclaim_lock; + struct mutex last_seq_ondisk_lock; /* * Used for waiting until journal reclaim has freed up space in the * journal: @@ -352,4 +356,11 @@ struct journal_entry_res { unsigned u64s; }; +struct journal_start_info { + u64 seq_read_start; + u64 seq_read_end; + u64 start_seq; + bool clean; +}; + #endif /* _BCACHEFS_JOURNAL_TYPES_H */ diff --git a/libbcachefs/journal/write.c b/libbcachefs/journal/write.c index 477ad0a0..bf83c796 100644 --- a/libbcachefs/journal/write.c +++ b/libbcachefs/journal/write.c @@ -189,6 +189,7 @@ static CLOSURE_CALLBACK(journal_write_done) struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); u64 seq = le64_to_cpu(w->data->seq); + u64 seq_wrote = seq; int err = 0; bch2_time_stats_update(!JSET_NO_FLUSH(w->data) @@ -197,8 +198,12 @@ static CLOSURE_CALLBACK(journal_write_done) if (w->had_error) { struct bch_replicas_entry_v1 *r = &journal_seq_pin(j, seq)->devs.e; + bch2_replicas_entry_put(c, r); bch2_devlist_to_replicas(r, BCH_DATA_journal, w->devs_written); + err = bch2_replicas_entry_get(c, r); + if (err) + r->nr_devs = 0; } if (!w->devs_written.nr) @@ -225,7 +230,6 @@ static CLOSURE_CALLBACK(journal_write_done) BUG_ON(seq < j->pin.front); if (err && (!j->err_seq || seq < j->err_seq)) j->err_seq = seq; - w->write_done = true; if (!j->free_buf || j->free_buf_size < w->buf_size) { swap(j->free_buf, w->data); @@ -243,22 +247,31 @@ static 
CLOSURE_CALLBACK(journal_write_done) } bool completed = false; - bool do_discards = false; - + bool last_seq_ondisk_updated = false; +again: for (seq = journal_last_unwritten_seq(j); seq <= journal_cur_seq(j); seq++) { w = j->buf + (seq & JOURNAL_BUF_MASK); - if (!w->write_done) + if (!w->write_done && seq != seq_wrote) break; if (!j->err_seq && !w->noflush) { - j->flushed_seq_ondisk = seq; - j->last_seq_ondisk = w->last_seq; + if (j->last_seq_ondisk < w->last_seq) { + spin_unlock(&j->lock); + /* + * this needs to happen _before_ updating + * j->flushed_seq_ondisk, for flushing to work + * properly - when the flush completes replicas + * refs need to have been dropped + */ + bch2_journal_update_last_seq_ondisk(j, w->last_seq); + last_seq_ondisk_updated = true; + spin_lock(&j->lock); + goto again; + } - closure_wake_up(&c->freelist_wait); - bch2_reset_alloc_cursors(c); - do_discards = true; + j->flushed_seq_ondisk = seq; } j->seq_ondisk = seq; @@ -277,8 +290,10 @@ static CLOSURE_CALLBACK(journal_write_done) completed = true; } + j->buf[seq_wrote & JOURNAL_BUF_MASK].write_done = true; + if (completed) { - bch2_journal_reclaim_fast(j); + bch2_journal_update_last_seq(j); bch2_journal_space_available(j); track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false); @@ -286,6 +301,8 @@ static CLOSURE_CALLBACK(journal_write_done) journal_wake(j); } + j->pin.front = min(j->pin.back, j->last_seq_ondisk); + if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { struct journal_buf *buf = journal_cur_buf(j); @@ -308,8 +325,11 @@ static CLOSURE_CALLBACK(journal_write_done) bch2_journal_do_writes(j); spin_unlock(&j->lock); - if (do_discards) + if (last_seq_ondisk_updated) { + bch2_reset_alloc_cursors(c); + closure_wake_up(&c->freelist_wait); bch2_do_discards(c); + } closure_put(&c->cl); } @@ -635,7 +655,6 @@ CLOSURE_CALLBACK(bch2_journal_write) unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]); int ret; - BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); BUG_ON(!w->write_started); BUG_ON(w->write_allocated); BUG_ON(w->write_done); @@ -702,9 +721,11 @@ CLOSURE_CALLBACK(bch2_journal_write) */ struct bch_replicas_entry_v1 *r = &journal_seq_pin(j, le64_to_cpu(w->data->seq))->devs.e; bch2_devlist_to_replicas(r, BCH_DATA_journal, w->devs_written); - ret = bch2_mark_replicas(c, r); - if (ret) + ret = bch2_replicas_entry_get(c, r); + if (ret) { + r->nr_devs = 0; goto err; + } if (c->opts.nochanges) goto no_io; diff --git a/libbcachefs/sb/clean.c b/libbcachefs/sb/clean.c index 18a350bc..1c36d6dd 100644 --- a/libbcachefs/sb/clean.c +++ b/libbcachefs/sb/clean.c @@ -256,18 +256,10 @@ const struct bch_sb_field_ops bch_sb_field_ops_clean = { .to_text = bch2_sb_clean_to_text, }; -int bch2_fs_mark_dirty(struct bch_fs *c) +void bch2_fs_mark_dirty(struct bch_fs *c) { - /* - * Unconditionally write superblock, to verify it hasn't changed before - * we go rw: - */ - - guard(mutex)(&c->sb_lock); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); - - return bch2_write_super(c); } void bch2_fs_mark_clean(struct bch_fs *c) { @@ -277,7 +269,6 @@ void bch2_fs_mark_clean(struct bch_fs *c) unsigned u64s; int ret; - guard(mutex)(&c->sb_lock); if (BCH_SB_CLEAN(c->disk_sb.sb)) return; @@ -321,6 +312,4 @@ void bch2_fs_mark_clean(struct bch_fs *c) } bch2_journal_pos_from_member_info_set(c); - - bch2_write_super(c); } diff --git a/libbcachefs/sb/clean.h b/libbcachefs/sb/clean.h index 
71caef28..6d811f12 100644 --- a/libbcachefs/sb/clean.h +++ b/libbcachefs/sb/clean.h @@ -10,7 +10,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry ** extern const struct bch_sb_field_ops bch_sb_field_ops_clean; -int bch2_fs_mark_dirty(struct bch_fs *); +void bch2_fs_mark_dirty(struct bch_fs *); void bch2_fs_mark_clean(struct bch_fs *); #endif /* _BCACHEFS_SB_CLEAN_H */ diff --git a/libbcachefs/sb/io.c b/libbcachefs/sb/io.c index 11842325..0c358c19 100644 --- a/libbcachefs/sb/io.c +++ b/libbcachefs/sb/io.c @@ -1021,6 +1021,11 @@ int bch2_write_super(struct bch_fs *c) closure_init_stack(cl); memset(&sb_written, 0, sizeof(sb_written)); + if (bch2_sb_has_journal(c->disk_sb.sb)) + bch2_fs_mark_dirty(c); + else + bch2_fs_mark_clean(c); + /* * Note: we do writes to RO devices here, and we might want to change * that in the future. diff --git a/libbcachefs/snapshots/snapshot.c b/libbcachefs/snapshots/snapshot.c index 7159c80c..bf6d76b3 100644 --- a/libbcachefs/snapshots/snapshot.c +++ b/libbcachefs/snapshots/snapshot.c @@ -1136,8 +1136,10 @@ void bch2_snapshot_delete_status_to_text(struct printbuf *out, struct bch_fs *c) } scoped_guard(mutex, &d->progress_lock) { - bch2_snapshot_delete_nodes_to_text(out, d); + prt_str(out, "Current position: "); bch2_bbpos_to_text(out, d->pos); + prt_newline(out); + bch2_snapshot_delete_nodes_to_text(out, d); } } diff --git a/libbcachefs/util/printbuf.h b/libbcachefs/util/printbuf.h index 5fa5265d..37e0b82c 100644 --- a/libbcachefs/util/printbuf.h +++ b/libbcachefs/util/printbuf.h @@ -71,7 +71,7 @@ enum printbuf_si { PRINTBUF_UNITS_10, /* use powers of 10^3 (standard SI) */ }; -#define PRINTBUF_INLINE_TABSTOPS 6 +#define PRINTBUF_INLINE_TABSTOPS 8 struct printbuf { char *buf; diff --git a/libbcachefs/vfs/direct.c b/libbcachefs/vfs/direct.c index c6c4a37e..cbdcf747 100644 --- a/libbcachefs/vfs/direct.c +++ b/libbcachefs/vfs/direct.c @@ -86,7 +86,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) /* bios must be 512 byte aligned: */ if ((offset|iter->count) & (SECTOR_SIZE - 1)) - return -EINVAL; + return bch_err_throw(c, unaligned_io); ret = min_t(loff_t, iter->count, max_t(loff_t, 0, i_size_read(&inode->v) - offset)); @@ -627,7 +627,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) goto err_put_write_ref; if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) { - ret = -EINVAL; + ret = bch_err_throw(c, unaligned_io); goto err_put_write_ref; }
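---

A note on the lifecycle this patch introduces, since it fully replaces the old
replicas_gc mechanism: each journal write now takes a reference on the replicas
entry describing the devices it went to (bch2_replicas_entry_get(), called from
bch2_journal_write() and bch2_fs_journal_start()), and drops it once
last_seq_ondisk advances past that sequence number
(bch2_replicas_entry_put_many(), via bch2_journal_update_last_seq_ondisk()).
When an entry's refcount hits zero it is removed from both the in-memory table
and the superblock; bch2_replicas_gc_reffed() only remains to drop journal
entries that came in from the superblock at startup with no surviving
references.

Below is a minimal, single-threaded model of that get/put/reclaim cycle. It is
illustrative only, not kernel code: a plain int stands in for atomic_t, a
linear scan for the eytzinger-ordered table (so no re-sort is needed after the
swap-with-last delete), and printf for bch2_write_super().

#include <stdio.h>
#include <string.h>

#define MAX_ENTRIES	8
#define MAX_DEVS	4

enum data_type { DATA_journal, DATA_btree, DATA_user };

struct replicas_entry {			/* models bch_replicas_entry_v1 */
	enum data_type	data_type;
	unsigned	nr_devs;
	unsigned	devs[MAX_DEVS];
};

struct replicas_entry_cpu {		/* models bch_replicas_entry_cpu */
	int			ref;	/* atomic_t in the real code */
	struct replicas_entry	e;
};

static struct replicas_entry_cpu table[MAX_ENTRIES];
static unsigned nr_entries;

static struct replicas_entry_cpu *entry_search(const struct replicas_entry *search)
{
	for (unsigned i = 0; i < nr_entries; i++)
		if (table[i].e.data_type == search->data_type &&
		    table[i].e.nr_devs == search->nr_devs &&
		    !memcmp(table[i].e.devs, search->devs,
			    search->nr_devs * sizeof(search->devs[0])))
			return &table[i];
	return NULL;
}

/* models bch2_replicas_entry_get(): take a ref, adding the entry on a miss */
static int entry_get(const struct replicas_entry *r)
{
	struct replicas_entry_cpu *e = entry_search(r);

	if (!e) {
		if (nr_entries == MAX_ENTRIES)
			return -1;	/* ENOMEM_cpu_replicas in the real code */
		e = &table[nr_entries++];
		e->e = *r;
		printf("sb write: entry added\n");
	}
	e->ref++;
	return 0;
}

/* models bch2_replicas_entry_put(): drop a ref, deleting the entry on zero */
static void entry_put(const struct replicas_entry *r)
{
	struct replicas_entry_cpu *e = entry_search(r);

	if (!e || --e->ref)
		return;
	*e = table[--nr_entries];	/* swap-with-last delete */
	printf("sb write: unreferenced entry dropped\n");
}

int main(void)
{
	struct replicas_entry journal_devs = {
		.data_type	= DATA_journal,
		.nr_devs	= 2,
		.devs		= { 0, 1 },
	};

	entry_get(&journal_devs);	/* journal write issued: pin list takes a ref */
	entry_put(&journal_devs);	/* last_seq_ondisk passes it: ref dropped */
	return 0;
}

The point of the refcount is that superblock replicas entries for the journal
now expire exactly when the journal pin FIFO no longer needs them, instead of
requiring the periodic scan-and-rebuild that bch2_replicas_gc_start()/end()
performed under replicas_gc_lock.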