From 8b31dfb3500fc642ccd36f0aaa0c3ab1b54abb1c Mon Sep 17 00:00:00 2001 From: Kent Overstreet <kent.overstreet@linux.dev> Date: Fri, 6 Dec 2024 20:48:25 -0500 Subject: [PATCH] Update bcachefs sources to 55a65a994ed5 bcachefs: bcachefs_metadata_version_persistent_inode_cursors Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev> --- .bcachefs_revision | 2 +- bch_bindgen/src/bkey.rs | 2 + include/linux/min_heap.h | 4 +- libbcachefs/alloc_background.c | 85 +++++----- libbcachefs/alloc_background_format.h | 4 +- libbcachefs/alloc_foreground.c | 115 ++++++------- libbcachefs/alloc_foreground.h | 2 +- libbcachefs/backpointers.c | 9 +- libbcachefs/bcachefs.h | 4 +- libbcachefs/bcachefs_format.h | 10 +- libbcachefs/bkey_types.h | 12 +- libbcachefs/btree_gc.c | 13 +- libbcachefs/btree_io.c | 2 +- libbcachefs/btree_node_scan.c | 133 +++++++++------ libbcachefs/btree_node_scan_types.h | 1 - libbcachefs/btree_trans_commit.c | 50 +++--- libbcachefs/btree_update.c | 2 +- libbcachefs/btree_update_interior.c | 14 +- libbcachefs/darray.h | 2 +- libbcachefs/dirent.c | 2 +- libbcachefs/disk_accounting.c | 44 +++-- libbcachefs/disk_accounting.h | 3 +- libbcachefs/errcode.h | 1 + libbcachefs/error.c | 13 +- libbcachefs/extents.c | 13 +- libbcachefs/fs-common.c | 4 +- libbcachefs/fs-io-direct.c | 42 +---- libbcachefs/fs.c | 7 +- libbcachefs/fsck.c | 228 ++------------------------ libbcachefs/fsck.h | 8 + libbcachefs/inode.c | 123 ++++++++++---- libbcachefs/inode.h | 10 ++ libbcachefs/inode_format.h | 8 + libbcachefs/journal.c | 17 +- libbcachefs/journal.h | 2 +- libbcachefs/journal_io.c | 205 +++++++++++++---------- libbcachefs/journal_io.h | 2 +- libbcachefs/journal_reclaim.c | 3 +- libbcachefs/logged_ops.c | 5 +- libbcachefs/logged_ops_format.h | 5 +- libbcachefs/move.c | 2 +- libbcachefs/opts.h | 9 +- libbcachefs/recovery_passes.c | 52 +++--- libbcachefs/sb-clean.c | 6 +- libbcachefs/sb-errors_format.h | 7 +- libbcachefs/snapshot.c | 5 +- libbcachefs/str_hash.c | 209 +++++++++++++++++++++++ libbcachefs/str_hash.h | 7 + libbcachefs/subvolume.c | 14 +- libbcachefs/subvolume.h | 2 +- libbcachefs/super-io.c | 10 +- libbcachefs/super.c | 7 +- libbcachefs/trace.h | 24 +++ 53 files changed, 881 insertions(+), 684 deletions(-) create mode 100644 libbcachefs/str_hash.c diff --git a/.bcachefs_revision b/.bcachefs_revision index 534b898f..64016eff 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -4a32728376a81dd9e75851a49159ff33602840f4 +55a65a994ed5fba038fda00f78416faf6f308bb8 diff --git a/bch_bindgen/src/bkey.rs b/bch_bindgen/src/bkey.rs index 91d515b7..2a012508 100644 --- a/bch_bindgen/src/bkey.rs +++ b/bch_bindgen/src/bkey.rs @@ -50,6 +50,7 @@ pub enum BkeyValC<'a> { logged_op_truncate(&'a c::bch_logged_op_truncate), logged_op_finsert(&'a c::bch_logged_op_finsert), accounting(&'a c::bch_accounting), + inode_alloc_cursor(&'a c::bch_inode_alloc_cursor), } impl<'a, 'b> BkeySC<'a> { @@ -106,6 +107,7 @@ impl<'a, 'b> BkeySC<'a> { KEY_TYPE_logged_op_truncate => logged_op_truncate(transmute(self.v)), KEY_TYPE_logged_op_finsert => logged_op_finsert(transmute(self.v)), KEY_TYPE_accounting => accounting(transmute(self.v)), + KEY_TYPE_inode_alloc_cursor => inode_alloc_cursor(transmute(self.v)), KEY_TYPE_MAX => unreachable!(), } } diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 43a7b9dc..fe17b482 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -15,8 +15,8 @@ */ #define MIN_HEAP_PREALLOCATED(_type, _name, _nr) \ struct _name { \ - int nr; \ - int size; \ + size_t nr; \ + size_t size; \ _type *data; \ _type preallocated[_nr]; \ } diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index b2d57045..7641a3b4 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -323,7 +323,8 @@ void bch2_alloc_v4_swab(struct bkey_s k) { struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; - a->journal_seq = swab64(a->journal_seq); + a->journal_seq_nonempty = swab64(a->journal_seq_nonempty); + a->journal_seq_empty = swab64(a->journal_seq_empty); a->flags = swab32(a->flags); a->dirty_sectors = swab32(a->dirty_sectors); a->cached_sectors = swab32(a->cached_sectors); @@ -346,16 +347,17 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen); bch2_prt_data_type(out, a->data_type); prt_newline(out); - prt_printf(out, "journal_seq %llu\n", a->journal_seq); - prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); - prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); - prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); - prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); - prt_printf(out, "cached_sectors %u\n", a->cached_sectors); - prt_printf(out, "stripe %u\n", a->stripe); - prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); - prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]); - prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]); + prt_printf(out, "journal_seq_nonempty %llu\n", a->journal_seq_nonempty); + prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty); + prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); + prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); + prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); + prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); + prt_printf(out, "cached_sectors %u\n", a->cached_sectors); + prt_printf(out, "stripe %u\n", a->stripe); + prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); + prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]); + prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]); if (ca) prt_printf(out, "fragmentation %llu\n", alloc_lru_idx_fragmentation(*a, ca)); @@ -384,7 +386,7 @@ void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); *out = (struct bch_alloc_v4) { - .journal_seq = u.journal_seq, + .journal_seq_nonempty = u.journal_seq, .flags = u.need_discard, .gen = u.gen, .oldest_gen = u.oldest_gen, @@ -856,7 +858,10 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (flags & BTREE_TRIGGER_transactional) { alloc_data_type_set(new_a, new_a->data_type); - if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) { + int is_empty_delta = (int) data_type_is_empty(new_a->data_type) - + (int) data_type_is_empty(old_a->data_type); + + if (is_empty_delta < 0) { new_a->io_time[READ] = bch2_current_io_time(c, READ); new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE); SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); @@ -927,20 +932,31 @@ int bch2_trigger_alloc(struct btree_trans *trans, if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { u64 transaction_seq = trans->journal_res.seq; + BUG_ON(!transaction_seq); + BUG_ON(transaction_seq < new_a->journal_seq_nonempty); + BUG_ON(transaction_seq < new_a->journal_seq_empty); - if (log_fsck_err_on(transaction_seq && new_a->journal_seq > transaction_seq, + if (log_fsck_err_on(transaction_seq && new_a->journal_seq_nonempty > transaction_seq, trans, alloc_key_journal_seq_in_future, "bucket journal seq in future (currently at %llu)\n%s", journal_cur_seq(&c->journal), (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf))) - new_a->journal_seq = transaction_seq; + new_a->journal_seq_nonempty = transaction_seq; int is_empty_delta = (int) data_type_is_empty(new_a->data_type) - (int) data_type_is_empty(old_a->data_type); - /* Record journal sequence number of empty -> nonempty transition: */ - if (is_empty_delta < 0) - new_a->journal_seq = max(new_a->journal_seq, transaction_seq); + /* + * Record journal sequence number of empty -> nonempty transition: + * Note that there may be multiple empty -> nonempty + * transitions, data in a bucket may be overwritten while we're + * still writing to it - so be careful to only record the first: + * */ + if (is_empty_delta < 0 && + new_a->journal_seq_empty <= c->journal.flushed_seq_ondisk) { + new_a->journal_seq_nonempty = transaction_seq; + new_a->journal_seq_empty = 0; + } /* * Bucket becomes empty: mark it as waiting for a journal flush, @@ -949,18 +965,21 @@ int bch2_trigger_alloc(struct btree_trans *trans, * intermediate sequence numbers: */ if (is_empty_delta > 0) { - if (new_a->journal_seq == transaction_seq || - bch2_journal_noflush_seq(&c->journal, new_a->journal_seq)) - new_a->journal_seq = 0; - else { - new_a->journal_seq = transaction_seq; + if (new_a->journal_seq_nonempty == transaction_seq || + bch2_journal_noflush_seq(&c->journal, + new_a->journal_seq_nonempty, + transaction_seq)) { + new_a->journal_seq_nonempty = new_a->journal_seq_empty = 0; + } else { + new_a->journal_seq_empty = transaction_seq; ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, - new.k->p.inode, new.k->p.offset, - transaction_seq); + c->journal.flushed_seq_ondisk, + new.k->p.inode, new.k->p.offset, + transaction_seq); if (bch2_fs_fatal_err_on(ret, c, - "setting bucket_needs_journal_commit: %s", bch2_err_str(ret))) + "setting bucket_needs_journal_commit: %s", + bch2_err_str(ret))) goto err; } } @@ -978,7 +997,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, #define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) #define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr) -#define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk) +#define bucket_flushed(a) (a->journal_seq_empty <= c->journal.flushed_seq_ondisk) if (statechange(a->data_type == BCH_DATA_free) && bucket_flushed(new_a)) @@ -1840,16 +1859,6 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; } - if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { - if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, - trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s", - a->v.journal_seq, - c->journal.flushed_seq_ondisk, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = -EIO; - goto out; - } - if (!fastpath) { if (discard_in_flight_add(ca, iter.pos.offset, true)) goto out; diff --git a/libbcachefs/alloc_background_format.h b/libbcachefs/alloc_background_format.h index befdaa95..74023836 100644 --- a/libbcachefs/alloc_background_format.h +++ b/libbcachefs/alloc_background_format.h @@ -58,7 +58,7 @@ LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) struct bch_alloc_v4 { struct bch_val v; - __u64 journal_seq; + __u64 journal_seq_nonempty; __u32 flags; __u8 gen; __u8 oldest_gen; @@ -70,7 +70,7 @@ struct bch_alloc_v4 { __u32 stripe; __u32 nr_external_backpointers; /* end of fields in original version of alloc_v4 */ - __u64 _fragmentation_lru; /* obsolete */ + __u64 journal_seq_empty; __u32 stripe_sectors; __u32 pad; } __packed __aligned(8); diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index 095bfe7c..57d5f14c 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -200,14 +200,35 @@ static inline unsigned open_buckets_reserved(enum bch_watermark watermark) } } +static inline bool may_alloc_bucket(struct bch_fs *c, + struct bpos bucket, + struct bucket_alloc_state *s) +{ + if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) { + s->skipped_open++; + return false; + } + + if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, bucket.inode, bucket.offset)) { + s->skipped_need_journal_commit++; + return false; + } + + if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) { + s->skipped_nocow++; + return false; + } + + return true; +} + static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, u64 bucket, u8 gen, enum bch_watermark watermark, struct bucket_alloc_state *s, struct closure *cl) { - struct open_bucket *ob; - if (unlikely(is_superblock_bucket(c, ca, bucket))) return NULL; @@ -216,22 +237,6 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * return NULL; } - if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { - s->skipped_open++; - return NULL; - } - - if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) { - s->skipped_need_journal_commit++; - return NULL; - } - - if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) { - s->skipped_nocow++; - return NULL; - } - spin_lock(&c->freelist_lock); if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) { @@ -250,10 +255,9 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * return NULL; } - ob = bch2_open_bucket_alloc(c); + struct open_bucket *ob = bch2_open_bucket_alloc(c); spin_lock(&ob->lock); - ob->valid = true; ob->sectors_free = ca->mi.bucket_size; ob->dev = ca->dev_idx; @@ -279,8 +283,11 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc { struct bch_fs *c = trans->c; u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); - u8 gen; + if (!may_alloc_bucket(c, POS(ca->dev_idx, b), s)) + return NULL; + + u8 gen; int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true); if (ret < 0) return ERR_PTR(ret); @@ -300,6 +307,7 @@ bch2_bucket_alloc_early(struct btree_trans *trans, struct bucket_alloc_state *s, struct closure *cl) { + struct bch_fs *c = trans->c; struct btree_iter iter, citer; struct bkey_s_c k, ck; struct open_bucket *ob = NULL; @@ -359,7 +367,10 @@ again: s->buckets_seen++; - ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, a->gen, watermark, s, cl); + ob = may_alloc_bucket(c, k.k->p, s) + ? __try_alloc_bucket(c, ca, k.k->p.offset, a->gen, + watermark, s, cl) + : NULL; next: bch2_set_btree_iter_dontneed(&citer); bch2_trans_iter_exit(trans, &citer); @@ -626,9 +637,9 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, unsigned i; for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) - ret.devs[ret.nr++] = i; + ret.data[ret.nr++] = i; - bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); + bubble_sort(ret.data, ret.nr, dev_stripe_cmp); return ret; } @@ -700,18 +711,13 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, struct closure *cl) { struct bch_fs *c = trans->c; - struct dev_alloc_list devs_sorted = - bch2_dev_alloc_list(c, stripe, devs_may_alloc); int ret = -BCH_ERR_insufficient_devices; BUG_ON(*nr_effective >= nr_replicas); - for (unsigned i = 0; i < devs_sorted.nr; i++) { - struct bch_dev_usage usage; - struct open_bucket *ob; - - unsigned dev = devs_sorted.devs[i]; - struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); + struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); + darray_for_each(devs_sorted, i) { + struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i); if (!ca) continue; @@ -720,8 +726,9 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, - cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); + struct bch_dev_usage usage; + struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, + cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); if (!IS_ERR(ob)) bch2_dev_stripe_increment_inlined(ca, stripe, &usage); bch2_dev_put(ca); @@ -765,10 +772,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, struct closure *cl) { struct bch_fs *c = trans->c; - struct dev_alloc_list devs_sorted; - struct ec_stripe_head *h; - struct open_bucket *ob; - unsigned i, ec_idx; int ret = 0; if (nr_replicas < 2) @@ -777,34 +780,32 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, if (ec_open_bucket(c, ptrs)) return 0; - h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); + struct ec_stripe_head *h = + bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); if (IS_ERR(h)) return PTR_ERR(h); if (!h) return 0; - devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); - - for (i = 0; i < devs_sorted.nr; i++) - for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { + struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); + darray_for_each(devs_sorted, i) + for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { if (!h->s->blocks[ec_idx]) continue; - ob = c->open_buckets + h->s->blocks[ec_idx]; - if (ob->dev == devs_sorted.devs[i] && - !test_and_set_bit(ec_idx, h->s->blocks_allocated)) - goto got_bucket; - } - goto out_put_head; -got_bucket: - ob->ec_idx = ec_idx; - ob->ec = h->s; - ec_stripe_new_get(h->s, STRIPE_REF_io); + struct open_bucket *ob = c->open_buckets + h->s->blocks[ec_idx]; + if (ob->dev == *i && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) { + ob->ec_idx = ec_idx; + ob->ec = h->s; + ec_stripe_new_get(h->s, STRIPE_REF_io); - ret = add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob); -out_put_head: + ret = add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, ob); + goto out; + } + } +out: bch2_ec_stripe_head_put(c, h); return ret; } diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h index 4f87745d..f25481a0 100644 --- a/libbcachefs/alloc_foreground.h +++ b/libbcachefs/alloc_foreground.h @@ -20,7 +20,7 @@ void bch2_reset_alloc_cursors(struct bch_fs *); struct dev_alloc_list { unsigned nr; - u8 devs[BCH_SB_MEMBERS_MAX]; + u8 data[BCH_SB_MEMBERS_MAX]; }; struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c index 72f37171..72311dc7 100644 --- a/libbcachefs/backpointers.c +++ b/libbcachefs/backpointers.c @@ -42,16 +42,17 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke u32 bucket_offset; struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); rcu_read_unlock(); - prt_printf(out, "bucket=%llu:%llu:%u", bucket.inode, bucket.offset, bucket_offset); + prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset); } else { rcu_read_unlock(); - prt_printf(out, "sector=%llu:%llu", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT); + prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT); } bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); - prt_printf(out, " suboffset=%u len=%u pos=", + prt_printf(out, " suboffset=%u len=%u gen=%u pos=", (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), - bp.v->bucket_len); + bp.v->bucket_len, + bp.v->bucket_gen); bch2_bpos_to_text(out, bp.v->pos); } diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 58d42b3f..d27550ef 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -1048,6 +1048,7 @@ struct bch_fs { * for signaling to the toplevel code which pass we want to run now. */ enum bch_recovery_pass curr_recovery_pass; + enum bch_recovery_pass next_recovery_pass; /* bitmask of recovery passes that we actually ran */ u64 recovery_passes_complete; /* never rewinds version of curr_recovery_pass */ @@ -1063,9 +1064,6 @@ struct bch_fs { struct btree_node *verify_ondisk; struct mutex verify_lock; - u64 *unused_inode_hints; - unsigned inode_shard_bits; - /* * A btree node on disk could have too many bsets for an iterator to fit * on the stack - have to dynamically allocate them diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index f140c336..09e53bef 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -418,7 +418,8 @@ static inline void bkey_init(struct bkey *k) x(snapshot_tree, 31) \ x(logged_op_truncate, 32) \ x(logged_op_finsert, 33) \ - x(accounting, 34) + x(accounting, 34) \ + x(inode_alloc_cursor, 35) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -682,7 +683,8 @@ struct bch_sb_field_ext { x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \ x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \ x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \ - x(inode_depth, BCH_VERSION(1, 17)) + x(inode_depth, BCH_VERSION(1, 17)) \ + x(persistent_inode_cursors, BCH_VERSION(1, 18)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -850,6 +852,7 @@ LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT, LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48); LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, struct bch_sb, flags[5], 48, 64); +LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { @@ -1347,7 +1350,8 @@ enum btree_id_flags { BIT_ULL(KEY_TYPE_set)) \ x(logged_ops, 17, 0, \ BIT_ULL(KEY_TYPE_logged_op_truncate)| \ - BIT_ULL(KEY_TYPE_logged_op_finsert)) \ + BIT_ULL(KEY_TYPE_logged_op_finsert)| \ + BIT_ULL(KEY_TYPE_inode_alloc_cursor)) \ x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \ BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \ x(subvolume_children, 19, 0, \ diff --git a/libbcachefs/bkey_types.h b/libbcachefs/bkey_types.h index 2af6279b..b4f328f9 100644 --- a/libbcachefs/bkey_types.h +++ b/libbcachefs/bkey_types.h @@ -213,16 +213,16 @@ BCH_BKEY_TYPES(); enum bch_validate_flags { BCH_VALIDATE_write = BIT(0), BCH_VALIDATE_commit = BIT(1), - BCH_VALIDATE_journal = BIT(2), - BCH_VALIDATE_silent = BIT(3), + BCH_VALIDATE_silent = BIT(2), }; #define BKEY_VALIDATE_CONTEXTS() \ x(unknown) \ - x(commit) \ + x(superblock) \ x(journal) \ x(btree_root) \ - x(btree_node) + x(btree_node) \ + x(commit) struct bkey_validate_context { enum { @@ -230,10 +230,12 @@ struct bkey_validate_context { BKEY_VALIDATE_CONTEXTS() #undef x } from:8; + enum bch_validate_flags flags:8; u8 level; enum btree_id btree; bool root:1; - enum bch_validate_flags flags:8; + unsigned journal_offset; + u64 journal_seq; }; #endif /* _BCACHEFS_BKEY_TYPES_H */ diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index e59924cf..24f2f3bd 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -29,6 +29,7 @@ #include "move.h" #include "recovery_passes.h" #include "reflink.h" +#include "recovery.h" #include "replicas.h" #include "super-io.h" #include "trace.h" @@ -359,11 +360,9 @@ again: if (ret) break; - if (!btree_id_is_alloc(b->c.btree_id)) { - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); - if (ret) - break; - } + ret = bch2_btree_lost_data(c, b->c.btree_id); + if (ret) + break; continue; } @@ -525,7 +524,7 @@ int bch2_check_topology(struct bch_fs *c) bch2_btree_id_to_text(&buf, i); if (r->error) { - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + ret = bch2_btree_lost_data(c, i); if (ret) break; reconstruct_root: @@ -741,7 +740,7 @@ static int bch2_gc_btrees(struct bch_fs *c) (printbuf_reset(&buf), bch2_btree_id_to_text(&buf, btree), buf.buf))) - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); + ret = bch2_btree_lost_data(c, btree); } fsck_err: printbuf_exit(&buf); diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 9df9fc1c..d99f8a78 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -26,7 +26,7 @@ static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn) { bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn)); - prt_printf(out, " seq %llux\n", bn->keys.seq); + prt_printf(out, " seq %llx %llu\n", bn->keys.seq, BTREE_NODE_SEQ(bn)); prt_str(out, "min: "); bch2_bpos_to_text(out, bn->min_key); prt_newline(out); diff --git a/libbcachefs/btree_node_scan.c b/libbcachefs/btree_node_scan.c index eeafb5e7..a5282ff1 100644 --- a/libbcachefs/btree_node_scan.c +++ b/libbcachefs/btree_node_scan.c @@ -12,6 +12,7 @@ #include "recovery_passes.h" #include <linux/kthread.h> +#include <linux/min_heap.h> #include <linux/sort.h> struct find_btree_nodes_worker { @@ -31,8 +32,6 @@ static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, con if (n->range_updated) prt_str(out, " range updated"); - if (n->overwritten) - prt_str(out, " overwritten"); for (unsigned i = 0; i < n->nr_ptrs; i++) { prt_char(out, ' '); @@ -140,6 +139,24 @@ static int found_btree_node_cmp_pos(const void *_l, const void *_r) -found_btree_node_cmp_time(l, r); } +static inline bool found_btree_node_cmp_pos_less(const void *l, const void *r, void *arg) +{ + return found_btree_node_cmp_pos(l, r) < 0; +} + +static inline void found_btree_node_swap(void *_l, void *_r, void *arg) +{ + struct found_btree_node *l = _l; + struct found_btree_node *r = _r; + + swap(*l, *r); +} + +const struct min_heap_callbacks found_btree_node_heap_cbs = { + .less = found_btree_node_cmp_pos_less, + .swp = found_btree_node_swap, +}; + static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, struct bio *bio, struct btree_node *bn, u64 offset) { @@ -295,55 +312,48 @@ err: return f->ret ?: ret; } -static void bubble_up(struct found_btree_node *n, struct found_btree_node *end) +static bool nodes_overlap(const struct found_btree_node *l, + const struct found_btree_node *r) { - while (n + 1 < end && - found_btree_node_cmp_pos(n, n + 1) > 0) { - swap(n[0], n[1]); - n++; - } + return (l->btree_id == r->btree_id && + l->level == r->level && + bpos_gt(l->max_key, r->min_key)); } static int handle_overwrites(struct bch_fs *c, - struct found_btree_node *start, - struct found_btree_node *end) + struct found_btree_node *l, + found_btree_nodes *nodes_heap) { - struct found_btree_node *n; -again: - for (n = start + 1; - n < end && - n->btree_id == start->btree_id && - n->level == start->level && - bpos_lt(n->min_key, start->max_key); - n++) { - int cmp = found_btree_node_cmp_time(start, n); + struct found_btree_node *r; + + while ((r = min_heap_peek(nodes_heap)) && + nodes_overlap(l, r)) { + int cmp = found_btree_node_cmp_time(l, r); if (cmp > 0) { - if (bpos_cmp(start->max_key, n->max_key) >= 0) - n->overwritten = true; + if (bpos_cmp(l->max_key, r->max_key) >= 0) + min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); else { - n->range_updated = true; - n->min_key = bpos_successor(start->max_key); - n->range_updated = true; - bubble_up(n, end); - goto again; + r->range_updated = true; + r->min_key = bpos_successor(l->max_key); + r->range_updated = true; + min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL); } } else if (cmp < 0) { - BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0); + BUG_ON(bpos_eq(l->min_key, r->min_key)); - start->max_key = bpos_predecessor(n->min_key); - start->range_updated = true; - } else if (n->level) { - n->overwritten = true; + l->max_key = bpos_predecessor(r->min_key); + l->range_updated = true; + } else if (r->level) { + min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); } else { - if (bpos_cmp(start->max_key, n->max_key) >= 0) - n->overwritten = true; + if (bpos_cmp(l->max_key, r->max_key) >= 0) + min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); else { - n->range_updated = true; - n->min_key = bpos_successor(start->max_key); - n->range_updated = true; - bubble_up(n, end); - goto again; + r->range_updated = true; + r->min_key = bpos_successor(l->max_key); + r->range_updated = true; + min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL); } } } @@ -355,6 +365,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) { struct find_btree_nodes *f = &c->found_btree_nodes; struct printbuf buf = PRINTBUF; + found_btree_nodes nodes_heap = {}; size_t dst; int ret = 0; @@ -409,29 +420,57 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) bch2_print_string_as_lines(KERN_INFO, buf.buf); } - dst = 0; - darray_for_each(f->nodes, i) { - if (i->overwritten) - continue; + swap(nodes_heap, f->nodes); - ret = handle_overwrites(c, i, &darray_top(f->nodes)); + { + /* darray must have same layout as a heap */ + min_heap_char real_heap; + BUILD_BUG_ON(sizeof(nodes_heap.nr) != sizeof(real_heap.nr)); + BUILD_BUG_ON(sizeof(nodes_heap.size) != sizeof(real_heap.size)); + BUILD_BUG_ON(offsetof(found_btree_nodes, nr) != offsetof(min_heap_char, nr)); + BUILD_BUG_ON(offsetof(found_btree_nodes, size) != offsetof(min_heap_char, size)); + } + + min_heapify_all(&nodes_heap, &found_btree_node_heap_cbs, NULL); + + if (nodes_heap.nr) { + ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap)); if (ret) goto err; - BUG_ON(i->overwritten); - f->nodes.data[dst++] = *i; + min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL); } - f->nodes.nr = dst; - if (c->opts.verbose) { + while (true) { + ret = handle_overwrites(c, &darray_last(f->nodes), &nodes_heap); + if (ret) + goto err; + + if (!nodes_heap.nr) + break; + + ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap)); + if (ret) + goto err; + + min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL); + } + + for (struct found_btree_node *n = f->nodes.data; n < &darray_last(f->nodes); n++) + BUG_ON(nodes_overlap(n, n + 1)); + + if (0 && c->opts.verbose) { printbuf_reset(&buf); prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__); found_btree_nodes_to_text(&buf, c, f->nodes); bch2_print_string_as_lines(KERN_INFO, buf.buf); + } else { + bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr); } eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); err: + darray_exit(&nodes_heap); printbuf_exit(&buf); return ret; } diff --git a/libbcachefs/btree_node_scan_types.h b/libbcachefs/btree_node_scan_types.h index b6c36c45..2811b685 100644 --- a/libbcachefs/btree_node_scan_types.h +++ b/libbcachefs/btree_node_scan_types.h @@ -6,7 +6,6 @@ struct found_btree_node { bool range_updated:1; - bool overwritten:1; u8 btree_id; u8 level; unsigned sectors_written; diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c index 78d72c26..9011cc3f 100644 --- a/libbcachefs/btree_trans_commit.c +++ b/libbcachefs/btree_trans_commit.c @@ -719,19 +719,29 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, goto fatal_err; } + struct bkey_validate_context validate_context = { .from = BKEY_VALIDATE_commit }; + + if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) + validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit; + + for (struct jset_entry *i = trans->journal_entries; + i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + i = vstruct_next(i)) { + ret = bch2_journal_entry_validate(c, NULL, i, + bcachefs_metadata_version_current, + CPU_BIG_ENDIAN, validate_context); + if (unlikely(ret)) { + bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n", + trans->fn); + goto fatal_err; + } + } + trans_for_each_update(trans, i) { - enum bch_validate_flags invalid_flags = 0; + validate_context.level = i->level; + validate_context.btree = i->btree_id; - if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; - - ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), - (struct bkey_validate_context) { - .from = BKEY_VALIDATE_commit, - .level = i->level, - .btree = i->btree_id, - .flags = invalid_flags, - }); + ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), validate_context); if (unlikely(ret)){ bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", trans->fn, (void *) i->ip_allocated); @@ -740,24 +750,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, btree_insert_entry_checks(trans, i); } - for (struct jset_entry *i = trans->journal_entries; - i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); - i = vstruct_next(i)) { - enum bch_validate_flags invalid_flags = 0; - - if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; - - ret = bch2_journal_entry_validate(c, NULL, i, - bcachefs_metadata_version_current, - CPU_BIG_ENDIAN, invalid_flags); - if (unlikely(ret)) { - bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n", - trans->fn); - goto fatal_err; - } - } - if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { struct journal *j = &c->journal; struct jset_entry *entry; diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c index 06fd5aa6..b1da6dba 100644 --- a/libbcachefs/btree_update.c +++ b/libbcachefs/btree_update.c @@ -588,7 +588,7 @@ struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsi int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, enum btree_id btree, struct bpos end) { - bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_intent); + bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent); struct bkey_s_c k = bch2_btree_iter_peek_prev(iter); int ret = bkey_err(k); if (ret) diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index f2a1d5d3..7d9dab95 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -1607,8 +1607,6 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (ret) return ret; - bch2_btree_interior_update_will_free_node(as, b); - if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { struct btree *n[2]; @@ -1707,6 +1705,8 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (ret) goto err; + bch2_btree_interior_update_will_free_node(as, b); + if (n3) { bch2_btree_update_get_open_buckets(as, n3); bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); @@ -2063,9 +2063,6 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, trace_and_count(c, btree_node_merge, trans, b); - bch2_btree_interior_update_will_free_node(as, b); - bch2_btree_interior_update_will_free_node(as, m); - n = bch2_btree_node_alloc(as, trans, b->c.level); SET_BTREE_NODE_SEQ(n->data, @@ -2101,6 +2098,9 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, if (ret) goto err_free_update; + bch2_btree_interior_update_will_free_node(as, b); + bch2_btree_interior_update_will_free_node(as, m); + bch2_trans_verify_paths(trans); bch2_btree_update_get_open_buckets(as, n); @@ -2155,8 +2155,6 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, if (ret) goto out; - bch2_btree_interior_update_will_free_node(as, b); - n = bch2_btree_node_alloc_replacement(as, trans, b); bch2_btree_build_aux_trees(n); @@ -2180,6 +2178,8 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, if (ret) goto err; + bch2_btree_interior_update_will_free_node(as, b); + bch2_btree_update_get_open_buckets(as, n); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); diff --git a/libbcachefs/darray.h b/libbcachefs/darray.h index 8f4c3f06..c6151495 100644 --- a/libbcachefs/darray.h +++ b/libbcachefs/darray.h @@ -83,7 +83,7 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++) #define darray_for_each_reverse(_d, _i) \ - for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i) + for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i) #define darray_init(_d) \ do { \ diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index 41813f9c..600eee93 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -266,7 +266,7 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, } else { target->subvol = le32_to_cpu(d.v->d_child_subvol); - ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_cached, &s); + ret = bch2_subvolume_get(trans, target->subvol, true, &s); target->inum = le64_to_cpu(s.inode); } diff --git a/libbcachefs/disk_accounting.c b/libbcachefs/disk_accounting.c index 77534838..a0e49cbe 100644 --- a/libbcachefs/disk_accounting.c +++ b/libbcachefs/disk_accounting.c @@ -324,6 +324,14 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, NULL); + + if (trace_accounting_mem_insert_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_accounting_to_text(&buf, c, a.s_c); + trace_accounting_mem_insert(c, buf.buf); + printbuf_exit(&buf); + } return 0; err: free_percpu(n.v[1]); @@ -722,11 +730,18 @@ int bch2_accounting_read(struct bch_fs *c) iter.flags &= ~BTREE_ITER_with_journal; int ret = for_each_btree_key_continue(trans, iter, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ + struct bkey u; + struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u); + if (k.k->type != KEY_TYPE_accounting) continue; struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, k.k->p); + + if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) + break; + if (!bch2_accounting_is_mem(acc_k)) { struct disk_accounting_pos next = { .type = acc_k.type + 1 }; bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); @@ -746,6 +761,7 @@ int bch2_accounting_read(struct bch_fs *c) if (i->k->k.type == KEY_TYPE_accounting) { struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, i->k->k.p); + if (!bch2_accounting_is_mem(acc_k)) continue; @@ -782,15 +798,16 @@ int bch2_accounting_read(struct bch_fs *c) keys->gap = keys->nr = dst - keys->data; percpu_down_write(&c->mark_lock); - unsigned i = 0; - while (i < acc->k.nr) { - unsigned idx = inorder_to_eytzinger0(i, acc->k.nr); + darray_for_each_reverse(acc->k, i) { struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos); + bpos_to_disk_accounting_pos(&acc_k, i->pos); u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false); + memset(v, 0, sizeof(v)); + + for (unsigned j = 0; j < i->nr_counters; j++) + v[j] = percpu_u64_get(i->v[0] + j); /* * If the entry counters are zeroed, it should be treated as @@ -799,26 +816,25 @@ int bch2_accounting_read(struct bch_fs *c) * Remove it, so that if it's re-added it gets re-marked in the * superblock: */ - ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters) + ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) ? -BCH_ERR_remove_disk_accounting_entry - : bch2_disk_accounting_validate_late(trans, acc_k, - v, acc->k.data[idx].nr_counters); + : bch2_disk_accounting_validate_late(trans, acc_k, v, i->nr_counters); if (ret == -BCH_ERR_remove_disk_accounting_entry) { - free_percpu(acc->k.data[idx].v[0]); - free_percpu(acc->k.data[idx].v[1]); - darray_remove_item(&acc->k, &acc->k.data[idx]); - eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, NULL); + free_percpu(i->v[0]); + free_percpu(i->v[1]); + darray_remove_item(&acc->k, i); ret = 0; continue; } if (ret) goto fsck_err; - i++; } + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); + preempt_disable(); struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); diff --git a/libbcachefs/disk_accounting.h b/libbcachefs/disk_accounting.h index cb20f723..4a3d9ff8 100644 --- a/libbcachefs/disk_accounting.h +++ b/libbcachefs/disk_accounting.h @@ -120,7 +120,8 @@ void bch2_accounting_mem_gc(struct bch_fs *); static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc) { - return acc.type != BCH_DISK_ACCOUNTING_inum; + return acc.type < BCH_DISK_ACCOUNTING_TYPE_NR && + acc.type != BCH_DISK_ACCOUNTING_inum; } /* diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index 924d2400..775425df 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -264,6 +264,7 @@ x(EIO, missing_indirect_extent) \ x(EIO, invalidate_stripe_to_dev) \ x(EIO, no_encryption_key) \ + x(EIO, insufficient_journal_devices) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/libbcachefs/error.c b/libbcachefs/error.c index 7af5c594..038da6a6 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -477,8 +477,8 @@ int __bch2_bkey_fsck_err(struct bch_fs *c, unsigned fsck_flags = 0; if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) { - //if (test_bit(err, c->sb.errors_silent)) - // return -BCH_ERR_fsck_delete_bkey; + if (test_bit(err, c->sb.errors_silent)) + return -BCH_ERR_fsck_delete_bkey; fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX; } @@ -486,9 +486,14 @@ int __bch2_bkey_fsck_err(struct bch_fs *c, fsck_flags |= fsck_flags_extra[err]; struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "invalid bkey in %s btree=", + prt_printf(&buf, "invalid bkey in %s", bch2_bkey_validate_contexts[from.from]); + + if (from.from == BKEY_VALIDATE_journal) + prt_printf(&buf, " journal seq=%llu offset=%u", + from.journal_seq, from.journal_offset); + + prt_str(&buf, " btree="); bch2_btree_id_to_text(&buf, from.btree); prt_printf(&buf, " level=%u: ", from.level); diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 2fc9ace5..05d5f71a 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -1238,6 +1238,12 @@ static int extent_ptr_validate(struct bch_fs *c, { int ret = 0; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr(ptrs, ptr2) + bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, + c, ptr_to_duplicate_device, + "multiple pointers to same device (%u)", ptr->dev); + /* bad pointers are repaired by check_fix_ptrs(): */ rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); @@ -1252,13 +1258,6 @@ static int extent_ptr_validate(struct bch_fs *c, unsigned bucket_size = ca->mi.bucket_size; rcu_read_unlock(); - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr(ptrs, ptr2) - bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, - c, ptr_to_duplicate_device, - "multiple pointers to same device (%u)", ptr->dev); - - bkey_fsck_err_on(bucket >= nbuckets, c, ptr_after_last_bucket, "pointer past last bucket (%llu > %llu)", bucket, nbuckets); diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c index d2966b67..7d279f21 100644 --- a/libbcachefs/fs-common.c +++ b/libbcachefs/fs-common.c @@ -69,9 +69,7 @@ int bch2_create_trans(struct btree_trans *trans, if (!snapshot_src.inum) { /* Inode wasn't specified, just snapshot: */ struct bch_subvolume s; - - ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, - BTREE_ITER_cached, &s); + ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, &s); if (ret) goto err; diff --git a/libbcachefs/fs-io-direct.c b/libbcachefs/fs-io-direct.c index b0367b9d..2089c36b 100644 --- a/libbcachefs/fs-io-direct.c +++ b/libbcachefs/fs-io-direct.c @@ -226,7 +226,6 @@ struct dio_write { struct mm_struct *mm; const struct iovec *iov; unsigned loop:1, - have_mm_ref:1, extending:1, sync:1, flush:1; @@ -391,9 +390,6 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio) kfree(dio->iov); - if (dio->have_mm_ref) - mmdrop(dio->mm); - ret = dio->op.error ?: ((long) dio->written << 9); bio_put(&dio->op.wbio.bio); @@ -533,24 +529,9 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) if (unlikely(dio->iter.count) && !dio->sync && - !dio->loop) { - /* - * Rest of write will be submitted asynchronously - - * unless copying the iov fails: - */ - if (likely(!bch2_dio_write_copy_iov(dio))) { - /* - * aio guarantees that mm_struct outlives the - * request, but io_uring does not - */ - if (dio->mm) { - mmgrab(dio->mm); - dio->have_mm_ref = true; - } - } else { - dio->sync = sync = true; - } - } + !dio->loop && + bch2_dio_write_copy_iov(dio)) + dio->sync = sync = true; dio->loop = true; closure_call(&dio->op.cl, bch2_write, NULL, NULL); @@ -578,25 +559,15 @@ err: static noinline __cold void bch2_dio_write_continue(struct dio_write *dio) { - struct mm_struct *mm = dio->have_mm_ref ? dio->mm: NULL; + struct mm_struct *mm = dio->mm; bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE); - if (mm) { - if (unlikely(!mmget_not_zero(mm))) { - /* process exited */ - dio->op.error = -ESRCH; - bch2_dio_write_done(dio); - return; - } - + if (mm) kthread_use_mm(mm); - } bch2_dio_write_loop(dio); - if (mm) { + if (mm) kthread_unuse_mm(mm); - mmput(mm); - } } static void bch2_dio_write_loop_async(struct bch_write_op *op) @@ -670,7 +641,6 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) dio->mm = current->mm; dio->iov = NULL; dio->loop = false; - dio->have_mm_ref = false; dio->extending = extending; dio->sync = is_sync_kiocb(req) || extending; dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled; diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index c6e7df7c..3f83f131 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -499,7 +499,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) struct bch_inode_unpacked inode_u; struct bch_subvolume subvol; int ret = lockrestart_do(trans, - bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: + bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); bch2_trans_put(trans); @@ -569,8 +569,7 @@ retry: inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol; inum.inum = inode_u.bi_inum; - ret = bch2_subvolume_get(trans, inum.subvol, true, - BTREE_ITER_with_updates, &subvol) ?: + ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: bch2_trans_commit(trans, NULL, &journal_seq, 0); if (unlikely(ret)) { bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, @@ -651,7 +650,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, struct bch_subvolume subvol; struct bch_inode_unpacked inode_u; - ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: + ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index d67a3f52..a436c072 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -109,7 +109,7 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol, u32 *snapshot, u64 *inum) { struct bch_subvolume s; - int ret = bch2_subvolume_get(trans, subvol, false, 0, &s); + int ret = bch2_subvolume_get(trans, subvol, false, &s); *snapshot = le32_to_cpu(s.snapshot); *inum = le64_to_cpu(s.inode); @@ -226,8 +226,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) }; struct bch_subvolume subvol; - ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol), - false, 0, &subvol); + ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol), false, &subvol); bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u", le32_to_cpu(st.master_subvol), snapshot); if (ret) @@ -942,69 +941,16 @@ static int get_visible_inodes(struct btree_trans *trans, return ret; } -static int dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d) -{ - if (d.v->d_type == DT_SUBVOL) { - u32 snap; - u64 inum; - int ret = subvol_lookup(trans, le32_to_cpu(d.v->d_child_subvol), &snap, &inum); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - return !ret; - } else { - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0); - int ret = bkey_err(k); - if (ret) - return ret; - - ret = bkey_is_inode(k.k); - bch2_trans_iter_exit(trans, &iter); - return ret; - } -} - /* * Prefer to delete the first one, since that will be the one at the wrong * offset: * return value: 0 -> delete k1, 1 -> delete k2 */ -static int hash_pick_winner(struct btree_trans *trans, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c k1, - struct bkey_s_c k2) -{ - if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && - !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) - return 0; - - switch (desc.btree_id) { - case BTREE_ID_dirents: { - int ret = dirent_has_target(trans, bkey_s_c_to_dirent(k1)); - if (ret < 0) - return ret; - if (!ret) - return 0; - - ret = dirent_has_target(trans, bkey_s_c_to_dirent(k2)); - if (ret < 0) - return ret; - if (!ret) - return 1; - return 2; - } - default: - return 0; - } -} - -static int fsck_update_backpointers(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_i *new) +int bch2_fsck_update_backpointers(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_i *new) { if (new->k.type != KEY_TYPE_dirent) return 0; @@ -1032,153 +978,6 @@ err: return ret; } -static int fsck_rename_dirent(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c_dirent old) -{ - struct qstr old_name = bch2_dirent_get_name(old); - struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); - int ret = PTR_ERR_OR_ZERO(new); - if (ret) - return ret; - - bkey_dirent_init(&new->k_i); - dirent_copy_target(new, old); - new->k.p = old.k->p; - - for (unsigned i = 0; i < 1000; i++) { - unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", - old_name.len, old_name.name, i); - unsigned u64s = BKEY_U64s + dirent_val_u64s(len); - - if (u64s > U8_MAX) - return -EINVAL; - - new->k.u64s = u64s; - - ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, - (subvol_inum) { 0, old.k->p.inode }, - old.k->p.snapshot, &new->k_i, - BTREE_UPDATE_internal_snapshot_node); - if (!bch2_err_matches(ret, EEXIST)) - break; - } - - if (ret) - return ret; - - return fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); -} - -static int hash_check_key(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c hash_k) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter = { NULL }; - struct printbuf buf = PRINTBUF; - struct bkey_s_c k; - u64 hash; - int ret = 0; - - if (hash_k.k->type != desc.key_type) - return 0; - - hash = desc.hash_bkey(hash_info, hash_k); - - if (likely(hash == hash_k.k->p.offset)) - return 0; - - if (hash_k.k->p.offset < hash) - goto bad_hash; - - for_each_btree_key_norestart(trans, iter, desc.btree_id, - SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), - BTREE_ITER_slots, k, ret) { - if (bkey_eq(k.k->p, hash_k.k->p)) - break; - - if (k.k->type == desc.key_type && - !desc.cmp_bkey(k, hash_k)) - goto duplicate_entries; - - if (bkey_deleted(k.k)) { - bch2_trans_iter_exit(trans, &iter); - goto bad_hash; - } - } -out: - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); - return ret; -bad_hash: - if (fsck_err(trans, hash_table_key_wrong_offset, - "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s", - bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k); - if (IS_ERR(new)) - return PTR_ERR(new); - - k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, hash_info, - (subvol_inum) { 0, hash_k.k->p.inode }, - hash_k.k->p.snapshot, new, - STR_HASH_must_create| - BTREE_ITER_with_updates| - BTREE_UPDATE_internal_snapshot_node); - ret = bkey_err(k); - if (ret) - goto out; - if (k.k) - goto duplicate_entries; - - ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, - BTREE_UPDATE_internal_snapshot_node) ?: - fsck_update_backpointers(trans, s, desc, hash_info, new) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; - goto out; - } -fsck_err: - goto out; -duplicate_entries: - ret = hash_pick_winner(trans, desc, hash_info, hash_k, k); - if (ret < 0) - goto out; - - if (!fsck_err(trans, hash_table_key_duplicate, - "duplicate hash table keys%s:\n%s", - ret != 2 ? "" : ", both point to valid inodes", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, hash_k), - prt_newline(&buf), - bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) - goto out; - - switch (ret) { - case 0: - ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); - break; - case 1: - ret = bch2_hash_delete_at(trans, desc, hash_info, &iter, 0); - break; - case 2: - ret = fsck_rename_dirent(trans, s, desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: - bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); - goto out; - } - - ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: - -BCH_ERR_transaction_restart_nested; - goto out; -} - static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, @@ -1421,7 +1220,7 @@ static int check_inode(struct btree_trans *trans, if (u.bi_subvol) { struct bch_subvolume s; - ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s); + ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; @@ -2497,7 +2296,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); dir->first_this_inode = false; - ret = hash_check_key(trans, s, bch2_dirent_hash_desc, hash_info, iter, k); + ret = bch2_str_hash_check_key(trans, s, bch2_dirent_hash_desc, hash_info, iter, k); if (ret < 0) goto err; if (ret) { @@ -2611,7 +2410,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); inode->first_this_inode = false; - ret = hash_check_key(trans, NULL, bch2_xattr_hash_desc, hash_info, iter, k); + ret = bch2_str_hash_check_key(trans, NULL, bch2_xattr_hash_desc, hash_info, iter, k); bch_err_fn(c, ret); return ret; } @@ -2864,7 +2663,7 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot); ret = bkey_err(d.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) - break; + goto out; if (!ret && (ret = dirent_points_to_inode(c, d, &inode))) bch2_trans_iter_exit(trans, &dirent_iter); @@ -2899,7 +2698,7 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) if (ret) { /* Should have been caught in dirents pass */ bch_err_msg(c, ret, "error looking up parent directory"); - break; + goto out; } min_bi_depth = parent_inode.bi_depth; @@ -2930,8 +2729,7 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); } - redo_bi_depth = false; - break; + goto out; } } diff --git a/libbcachefs/fsck.h b/libbcachefs/fsck.h index 4481b40a..57494827 100644 --- a/libbcachefs/fsck.h +++ b/libbcachefs/fsck.h @@ -2,6 +2,14 @@ #ifndef _BCACHEFS_FSCK_H #define _BCACHEFS_FSCK_H +#include "str_hash.h" + +int bch2_fsck_update_backpointers(struct btree_trans *, + struct snapshots_seen *, + const struct bch_hash_desc, + struct bch_hash_info *, + struct bkey_i *); + int bch2_check_inodes(struct bch_fs *); int bch2_check_extents(struct bch_fs *); int bch2_check_indirect_extents(struct bch_fs *); diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 8818e418..d1fc44a7 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -799,6 +799,28 @@ void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); } +int bch2_inode_alloc_cursor_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) +{ + int ret = 0; + + bkey_fsck_err_on(k.k->p.inode != LOGGED_OPS_INUM_inode_cursors, + c, inode_alloc_cursor_inode_bad, + "k.p.inode bad"); +fsck_err: + return ret; +} + +void bch2_inode_alloc_cursor_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_inode_alloc_cursor i = bkey_s_c_to_inode_alloc_cursor(k); + + prt_printf(out, "idx %llu generation %llu", + le64_to_cpu(i.v->idx), + le64_to_cpu(i.v->gen)); +} + void bch2_inode_init_early(struct bch_fs *c, struct bch_inode_unpacked *inode_u) { @@ -859,6 +881,59 @@ static inline u32 bkey_generation(struct bkey_s_c k) } } +static struct bkey_i_inode_alloc_cursor * +bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max) +{ + struct bch_fs *c = trans->c; + + u64 cursor_idx = c->opts.shard_inode_numbers ? cpu : 0; + + cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits); + + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, + BTREE_ID_logged_ops, + POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx), + BTREE_ITER_cached); + int ret = bkey_err(k); + if (ret) + return ERR_PTR(ret); + + struct bkey_i_inode_alloc_cursor *cursor = + k.k->type == KEY_TYPE_inode_alloc_cursor + ? bch2_bkey_make_mut_typed(trans, &iter, &k, 0, inode_alloc_cursor) + : bch2_bkey_alloc(trans, &iter, 0, inode_alloc_cursor); + ret = PTR_ERR_OR_ZERO(cursor); + if (ret) + goto err; + + cursor->v.bits = c->opts.shard_inode_numbers_bits; + + unsigned bits = (c->opts.inodes_32bit ? 31 : 63); + if (c->opts.shard_inode_numbers) { + bits -= cursor->v.bits; + + *min = (cpu << bits); + *max = (cpu << bits) | ~(ULLONG_MAX << bits); + + *min = max_t(u64, *min, BLOCKDEV_INODE_MAX); + } else { + *min = BLOCKDEV_INODE_MAX; + *max = ~(ULLONG_MAX << bits); + } + + if (le64_to_cpu(cursor->v.idx) < *min) + cursor->v.idx = cpu_to_le64(*min); + + if (le64_to_cpu(cursor->v.idx) >= *max) { + cursor->v.idx = cpu_to_le64(*min); + le32_add_cpu(&cursor->v.gen, 1); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret ? ERR_PTR(ret) : cursor; +} + /* * This just finds an empty slot: */ @@ -867,35 +942,20 @@ int bch2_inode_create(struct btree_trans *trans, struct bch_inode_unpacked *inode_u, u32 snapshot, u64 cpu) { - struct bch_fs *c = trans->c; - struct bkey_s_c k; - u64 min, max, start, pos, *hint; - int ret = 0; - unsigned bits = (c->opts.inodes_32bit ? 31 : 63); + u64 min, max; + struct bkey_i_inode_alloc_cursor *cursor = + bch2_inode_alloc_cursor_get(trans, cpu, &min, &max); + int ret = PTR_ERR_OR_ZERO(cursor); + if (ret) + return ret; - if (c->opts.shard_inode_numbers) { - bits -= c->inode_shard_bits; + u64 start = le64_to_cpu(cursor->v.idx); + u64 pos = start; - min = (cpu << bits); - max = (cpu << bits) | ~(ULLONG_MAX << bits); - - min = max_t(u64, min, BLOCKDEV_INODE_MAX); - hint = c->unused_inode_hints + cpu; - } else { - min = BLOCKDEV_INODE_MAX; - max = ~(ULLONG_MAX << bits); - hint = c->unused_inode_hints; - } - - start = READ_ONCE(*hint); - - if (start >= max || start < min) - start = min; - - pos = start; bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), BTREE_ITER_all_snapshots| BTREE_ITER_intent); + struct bkey_s_c k; again: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && @@ -925,6 +985,7 @@ again: /* Retry from start */ pos = start = min; bch2_btree_iter_set_pos(iter, POS(0, pos)); + le32_add_cpu(&cursor->v.gen, 1); goto again; found_slot: bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); @@ -935,9 +996,9 @@ found_slot: return ret; } - *hint = k.k->p.offset; inode_u->bi_inum = k.k->p.offset; - inode_u->bi_generation = bkey_generation(k); + inode_u->bi_generation = le64_to_cpu(cursor->v.gen); + cursor->v.idx = cpu_to_le64(k.k->p.offset + 1); return 0; } @@ -999,8 +1060,6 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) { struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; - struct bkey_i_inode_generation delete; - struct bch_inode_unpacked inode_u; struct bkey_s_c k; u32 snapshot; int ret; @@ -1040,13 +1099,7 @@ retry: goto err; } - bch2_inode_unpack(k, &inode_u); - - bkey_inode_generation_init(&delete.k_i); - delete.k.p = iter.pos; - delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); - - ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: + ret = bch2_btree_delete_at(trans, &iter, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); err: diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index 5bca6950..d2e13452 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -68,6 +68,16 @@ void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bk .min_val_size = 8, \ }) +int bch2_inode_alloc_cursor_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +void bch2_inode_alloc_cursor_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_inode_alloc_cursor ((struct bkey_ops) { \ + .key_validate = bch2_inode_alloc_cursor_validate, \ + .val_to_text = bch2_inode_alloc_cursor_to_text, \ + .min_val_size = 16, \ +}) + #if 0 typedef struct { u64 lo; diff --git a/libbcachefs/inode_format.h b/libbcachefs/inode_format.h index be1e7476..1b93e189 100644 --- a/libbcachefs/inode_format.h +++ b/libbcachefs/inode_format.h @@ -165,4 +165,12 @@ LE64_BITMASK(INODEv3_FIELDS_START, struct bch_inode_v3, bi_flags, 31, 36); LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); +struct bch_inode_alloc_cursor { + struct bch_val v; + __u8 bits; + __u8 pad; + __le32 gen; + __le64 idx; +}; + #endif /* _BCACHEFS_INODE_FORMAT_H */ diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index dc665219..2cd20114 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -807,10 +807,11 @@ int bch2_journal_flush(struct journal *j) } /* - * bch2_journal_noflush_seq - tell the journal not to issue any flushes before + * bch2_journal_noflush_seq - ask the journal not to issue any flushes in the + * range [start, end) * @seq */ -bool bch2_journal_noflush_seq(struct journal *j, u64 seq) +bool bch2_journal_noflush_seq(struct journal *j, u64 start, u64 end) { struct bch_fs *c = container_of(j, struct bch_fs, journal); u64 unwritten_seq; @@ -819,15 +820,15 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq) if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) return false; - if (seq <= c->journal.flushed_seq_ondisk) + if (c->journal.flushed_seq_ondisk >= start) return false; spin_lock(&j->lock); - if (seq <= c->journal.flushed_seq_ondisk) + if (c->journal.flushed_seq_ondisk >= start) goto out; for (unwritten_seq = journal_last_unwritten_seq(j); - unwritten_seq < seq; + unwritten_seq < end; unwritten_seq++) { struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); @@ -1564,6 +1565,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) printbuf_indent_sub(out, 2); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { + if (!ca->mi.durability) + continue; + struct journal_device *ja = &ca->journal; if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d)) @@ -1573,6 +1577,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) continue; prt_printf(out, "dev %u:\n", ca->dev_idx); + prt_printf(out, "durability %u:\n", ca->mi.durability); printbuf_indent_add(out, 2); prt_printf(out, "nr\t%u\n", ja->nr); prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size); @@ -1584,6 +1589,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) printbuf_indent_sub(out, 2); } + prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required); + rcu_read_unlock(); --out->atomic; diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index a6a2e888..cb0df066 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -404,7 +404,7 @@ void bch2_journal_flush_async(struct journal *, struct closure *); int bch2_journal_flush_seq(struct journal *, u64, unsigned); int bch2_journal_flush(struct journal *); -bool bch2_journal_noflush_seq(struct journal *, u64); +bool bch2_journal_noflush_seq(struct journal *, u64, u64); int bch2_journal_meta(struct journal *); void bch2_journal_halt(struct journal *); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index bb69d808..7f2efe85 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -301,7 +301,7 @@ static void journal_entry_err_msg(struct printbuf *out, journal_entry_err_msg(&_buf, version, jset, entry); \ prt_printf(&_buf, msg, ##__VA_ARGS__); \ \ - switch (flags & BCH_VALIDATE_write) { \ + switch (from.flags & BCH_VALIDATE_write) { \ case READ: \ mustfix_fsck_err(c, _err, "%s", _buf.buf); \ break; \ @@ -390,15 +390,12 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_i *k = entry->start; - struct bkey_validate_context from = { - .from = BKEY_VALIDATE_journal, - .level = entry->level, - .btree = entry->btree_id, - .flags = flags|BCH_VALIDATE_journal, - }; + + from.level = entry->level; + from.btree = entry->btree_id; while (k != vstruct_last(entry)) { int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); @@ -435,11 +432,15 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_i *k = entry->start; int ret = 0; + from.root = true; + from.level = entry->level + 1; + from.btree = entry->btree_id; + if (journal_entry_err_on(!entry->u64s || le16_to_cpu(entry->u64s) != k->k.u64s, c, version, jset, entry, @@ -456,13 +457,6 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, return 0; } - struct bkey_validate_context from = { - .from = BKEY_VALIDATE_journal, - .level = entry->level + 1, - .btree = entry->btree_id, - .root = true, - .flags = flags, - }; ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); if (ret == FSCK_DELETED_KEY) ret = 0; @@ -480,7 +474,7 @@ static int journal_entry_prio_ptrs_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { /* obsolete, don't care: */ return 0; @@ -495,7 +489,7 @@ static int journal_entry_blacklist_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; @@ -522,7 +516,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct jset_entry_blacklist_v2 *bl_entry; int ret = 0; @@ -564,7 +558,7 @@ static int journal_entry_usage_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); @@ -598,7 +592,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); @@ -642,7 +636,7 @@ static int journal_entry_clock_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct jset_entry_clock *clock = container_of(entry, struct jset_entry_clock, entry); @@ -682,7 +676,7 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct jset_entry_dev_usage *u = container_of(entry, struct jset_entry_dev_usage, entry); @@ -739,7 +733,7 @@ static int journal_entry_log_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { return 0; } @@ -756,10 +750,11 @@ static int journal_entry_overwrite_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { + from.flags = 0; return journal_entry_btree_keys_validate(c, jset, entry, - version, big_endian, READ); + version, big_endian, from); } static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, @@ -772,10 +767,10 @@ static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { return journal_entry_btree_keys_validate(c, jset, entry, - version, big_endian, READ); + version, big_endian, from); } static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, @@ -788,7 +783,7 @@ static int journal_entry_datetime_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { unsigned bytes = vstruct_bytes(entry); unsigned expected = 16; @@ -818,7 +813,7 @@ static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs * struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, - enum bch_validate_flags); + struct bkey_validate_context); void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); }; @@ -836,11 +831,11 @@ int bch2_journal_entry_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { return entry->type < BCH_JSET_ENTRY_NR ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, - version, big_endian, flags) + version, big_endian, from) : 0; } @@ -858,10 +853,18 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, static int jset_validate_entries(struct bch_fs *c, struct jset *jset, enum bch_validate_flags flags) { + struct bkey_validate_context from = { + .flags = flags, + .from = BKEY_VALIDATE_journal, + .journal_seq = le64_to_cpu(jset->seq), + }; + unsigned version = le32_to_cpu(jset->version); int ret = 0; vstruct_for_each(jset, entry) { + from.journal_offset = (u64 *) entry - jset->_data; + if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), c, version, jset, entry, journal_entry_past_jset_end, @@ -870,8 +873,8 @@ static int jset_validate_entries(struct bch_fs *c, struct jset *jset, break; } - ret = bch2_journal_entry_validate(c, jset, entry, - version, JSET_BIG_ENDIAN(jset), flags); + ret = bch2_journal_entry_validate(c, jset, entry, version, + JSET_BIG_ENDIAN(jset), from); if (ret) break; } @@ -884,13 +887,17 @@ static int jset_validate(struct bch_fs *c, struct jset *jset, u64 sector, enum bch_validate_flags flags) { - unsigned version; + struct bkey_validate_context from = { + .flags = flags, + .from = BKEY_VALIDATE_journal, + .journal_seq = le64_to_cpu(jset->seq), + }; int ret = 0; if (le64_to_cpu(jset->magic) != jset_magic(c)) return JOURNAL_ENTRY_NONE; - version = le32_to_cpu(jset->version); + unsigned version = le32_to_cpu(jset->version); if (journal_entry_err_on(!bch2_version_compatible(version), c, version, jset, NULL, jset_unsupported_version, @@ -935,15 +942,16 @@ static int jset_validate_early(struct bch_fs *c, unsigned bucket_sectors_left, unsigned sectors_read) { - size_t bytes = vstruct_bytes(jset); - unsigned version; - enum bch_validate_flags flags = BCH_VALIDATE_journal; + struct bkey_validate_context from = { + .from = BKEY_VALIDATE_journal, + .journal_seq = le64_to_cpu(jset->seq), + }; int ret = 0; if (le64_to_cpu(jset->magic) != jset_magic(c)) return JOURNAL_ENTRY_NONE; - version = le32_to_cpu(jset->version); + unsigned version = le32_to_cpu(jset->version); if (journal_entry_err_on(!bch2_version_compatible(version), c, version, jset, NULL, jset_unsupported_version, @@ -956,6 +964,7 @@ static int jset_validate_early(struct bch_fs *c, return -EINVAL; } + size_t bytes = vstruct_bytes(jset); if (bytes > (sectors_read << 9) && sectors_read < bucket_sectors_left) return JOURNAL_ENTRY_REREAD; @@ -1240,8 +1249,6 @@ int bch2_journal_read(struct bch_fs *c, * those entries will be blacklisted: */ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { - enum bch_validate_flags flags = BCH_VALIDATE_journal; - i = *_i; if (journal_replay_ignore(i)) @@ -1261,6 +1268,10 @@ int bch2_journal_read(struct bch_fs *c, continue; } + struct bkey_validate_context from = { + .from = BKEY_VALIDATE_journal, + .journal_seq = le64_to_cpu(i->j.seq), + }; if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), c, le32_to_cpu(i->j.version), &i->j, NULL, jset_last_seq_newer_than_seq, @@ -1420,27 +1431,50 @@ fsck_err: /* journal write: */ +static void journal_advance_devs_to_next_bucket(struct journal *j, + struct dev_alloc_list *devs, + unsigned sectors, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + + darray_for_each(*devs, i) { + struct bch_dev *ca = rcu_dereference(c->devs[*i]); + if (!ca) + continue; + + struct journal_device *ja = &ca->journal; + + if (sectors > ja->sectors_free && + sectors <= ca->mi.bucket_size && + bch2_journal_dev_buckets_available(j, ja, + journal_space_discarded)) { + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + ja->sectors_free = ca->mi.bucket_size; + + /* + * ja->bucket_seq[ja->cur_idx] must always have + * something sensible: + */ + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq); + } + } +} + static void __journal_write_alloc(struct journal *j, struct journal_buf *w, - struct dev_alloc_list *devs_sorted, + struct dev_alloc_list *devs, unsigned sectors, unsigned *replicas, unsigned replicas_want) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_device *ja; - struct bch_dev *ca; - unsigned i; - if (*replicas >= replicas_want) - return; - - for (i = 0; i < devs_sorted->nr; i++) { - ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); + darray_for_each(*devs, i) { + struct bch_dev *ca = rcu_dereference(c->devs[*i]); if (!ca) continue; - ja = &ca->journal; + struct journal_device *ja = &ca->journal; /* * Check that we can use this device, and aren't already using @@ -1486,65 +1520,53 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_devs_mask devs; - struct journal_device *ja; - struct bch_dev *ca; struct dev_alloc_list devs_sorted; unsigned sectors = vstruct_sectors(w->data, c->block_bits); unsigned target = c->opts.metadata_target ?: c->opts.foreground_target; - unsigned i, replicas = 0, replicas_want = + unsigned replicas = 0, replicas_want = READ_ONCE(c->opts.metadata_replicas); unsigned replicas_need = min_t(unsigned, replicas_want, READ_ONCE(c->opts.metadata_replicas_required)); + bool advance_done = false; rcu_read_lock(); -retry: - devs = target_rw_devs(c, BCH_DATA_journal, target); - devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); - - __journal_write_alloc(j, w, &devs_sorted, - sectors, &replicas, replicas_want); - - if (replicas >= replicas_want) - goto done; - - for (i = 0; i < devs_sorted.nr; i++) { - ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); - if (!ca) - continue; - - ja = &ca->journal; - - if (sectors > ja->sectors_free && - sectors <= ca->mi.bucket_size && - bch2_journal_dev_buckets_available(j, ja, - journal_space_discarded)) { - ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - ja->sectors_free = ca->mi.bucket_size; - - /* - * ja->bucket_seq[ja->cur_idx] must always have - * something sensible: - */ - ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); - } + /* We might run more than once if we have to stop and do discards: */ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key)); + bkey_for_each_ptr(ptrs, p) { + struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev); + if (ca) + replicas += ca->mi.durability; } - __journal_write_alloc(j, w, &devs_sorted, - sectors, &replicas, replicas_want); +retry_target: + devs = target_rw_devs(c, BCH_DATA_journal, target); + devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); +retry_alloc: + __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); + + if (likely(replicas >= replicas_want)) + goto done; + + if (!advance_done) { + journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq); + advance_done = true; + goto retry_alloc; + } if (replicas < replicas_want && target) { /* Retry from all devices: */ target = 0; - goto retry; + advance_done = false; + goto retry_target; } done: rcu_read_unlock(); BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); - return replicas >= replicas_need ? 0 : -EROFS; + return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; } static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) @@ -2032,7 +2054,7 @@ CLOSURE_CALLBACK(bch2_journal_write) bch2_journal_do_discards(j); } - if (ret) { + if (ret && !bch2_journal_error(j)) { struct printbuf buf = PRINTBUF; buf.atomic++; @@ -2044,8 +2066,9 @@ CLOSURE_CALLBACK(bch2_journal_write) spin_unlock(&j->lock); bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_exit(&buf); - goto err; } + if (ret) + goto err; /* * write is allocated, no longer need to account for it in diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h index 2ca9cde3..12b39fcb 100644 --- a/libbcachefs/journal_io.h +++ b/libbcachefs/journal_io.h @@ -63,7 +63,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, int bch2_journal_entry_validate(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, - enum bch_validate_flags); + struct bkey_validate_context); void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, struct jset_entry *); diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index b7936ad3..3c824260 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -146,7 +146,8 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne rcu_read_lock(); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { - if (!ca->journal.nr) + if (!ca->journal.nr || + !ca->mi.durability) continue; min_bucket_size = min(min_bucket_size, ca->mi.bucket_size); diff --git a/libbcachefs/logged_ops.c b/libbcachefs/logged_ops.c index 1ac51af1..75f27ec2 100644 --- a/libbcachefs/logged_ops.c +++ b/libbcachefs/logged_ops.c @@ -65,7 +65,8 @@ int bch2_resume_logged_ops(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_max(trans, iter, BTREE_ID_logged_ops, - POS(LOGGED_OPS_INUM, 0), POS(LOGGED_OPS_INUM, U64_MAX), + POS(LOGGED_OPS_INUM_logged_ops, 0), + POS(LOGGED_OPS_INUM_logged_ops, U64_MAX), BTREE_ITER_prefetch, k, resume_logged_op(trans, &iter, k))); bch_err_fn(c, ret); @@ -76,7 +77,7 @@ static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) { struct btree_iter iter; int ret = bch2_bkey_get_empty_slot(trans, &iter, - BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM, U64_MAX)); + BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM_logged_ops, U64_MAX)); if (ret) return ret; diff --git a/libbcachefs/logged_ops_format.h b/libbcachefs/logged_ops_format.h index 0b370a96..cfb67c95 100644 --- a/libbcachefs/logged_ops_format.h +++ b/libbcachefs/logged_ops_format.h @@ -2,7 +2,10 @@ #ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H #define _BCACHEFS_LOGGED_OPS_FORMAT_H -#define LOGGED_OPS_INUM 0 +enum logged_ops_inums { + LOGGED_OPS_INUM_logged_ops, + LOGGED_OPS_INUM_inode_cursors, +}; struct bch_logged_op_truncate { struct bch_val v; diff --git a/libbcachefs/move.c b/libbcachefs/move.c index a50f27f4..9b066c61 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -825,7 +825,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, b = bch2_backpointer_get_node(trans, bp, &iter, &last_flushed); ret = PTR_ERR_OR_ZERO(b); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) - continue; + goto next; if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index ea69099e..2df5c8f7 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -222,15 +222,20 @@ enum fsck_err_opts { BCH_SB_ERASURE_CODE, false, \ NULL, "Enable erasure coding (DO NOT USE YET)") \ x(inodes_32bit, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT, \ OPT_BOOL(), \ BCH_SB_INODE_32BIT, true, \ NULL, "Constrain inode numbers to 32 bits") \ x(shard_inode_numbers, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT, \ OPT_BOOL(), \ BCH_SB_SHARD_INUMS, true, \ NULL, "Shard new inode numbers by CPU id") \ + x(shard_inode_numbers_bits, u8, \ + OPT_FS|OPT_FORMAT, \ + OPT_UINT(0, 8), \ + BCH_SB_SHARD_INUMS_NBITS, 0, \ + NULL, "Shard new inode numbers by CPU id") \ x(inodes_use_key_cache, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/libbcachefs/recovery_passes.c b/libbcachefs/recovery_passes.c index f6d3a99c..0b3c951c 100644 --- a/libbcachefs/recovery_passes.c +++ b/libbcachefs/recovery_passes.c @@ -103,27 +103,31 @@ u64 bch2_recovery_passes_from_stable(u64 v) static int __bch2_run_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { - if (c->opts.recovery_passes & BIT_ULL(pass)) - return 0; - if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns)) return -BCH_ERR_not_in_recovery; + if (c->recovery_passes_complete & BIT_ULL(pass)) + return 0; + + bool print = !(c->opts.recovery_passes & BIT_ULL(pass)); + if (pass < BCH_RECOVERY_PASS_set_may_go_rw && c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) { - bch_info(c, "need recovery pass %s (%u), but already rw", - bch2_recovery_passes[pass], pass); + if (print) + bch_info(c, "need recovery pass %s (%u), but already rw", + bch2_recovery_passes[pass], pass); return -BCH_ERR_cannot_rewind_recovery; } - bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", - bch2_recovery_passes[pass], pass, - bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); + if (print) + bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", + bch2_recovery_passes[pass], pass, + bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); c->opts.recovery_passes |= BIT_ULL(pass); - if (c->curr_recovery_pass >= pass) { - c->curr_recovery_pass = pass; + if (c->curr_recovery_pass > pass) { + c->next_recovery_pass = pass; c->recovery_passes_complete &= (1ULL << pass) >> 1; return -BCH_ERR_restart_recovery; } else { @@ -264,7 +268,9 @@ int bch2_run_recovery_passes(struct bch_fs *c) */ c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; - while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) { + c->next_recovery_pass = c->curr_recovery_pass + 1; + spin_lock_irq(&c->recovery_pass_lock); unsigned pass = c->curr_recovery_pass; @@ -285,31 +291,25 @@ int bch2_run_recovery_passes(struct bch_fs *c) ret = bch2_run_recovery_pass(c, pass) ?: bch2_journal_flush(&c->journal); + if (!ret && !test_bit(BCH_FS_error, &c->flags)) + bch2_clear_recovery_pass_required(c, pass); + spin_lock_irq(&c->recovery_pass_lock); - if (c->curr_recovery_pass < pass) { + if (c->next_recovery_pass < c->curr_recovery_pass) { /* * bch2_run_explicit_recovery_pass() was called: we * can't always catch -BCH_ERR_restart_recovery because * it may have been called from another thread (btree * node read completion) */ - spin_unlock_irq(&c->recovery_pass_lock); - continue; - } else if (c->curr_recovery_pass == pass) { - c->curr_recovery_pass++; + ret = 0; + c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass); } else { - BUG(); + c->recovery_passes_complete |= BIT_ULL(pass); + c->recovery_pass_done = max(c->recovery_pass_done, pass); } + c->curr_recovery_pass = c->next_recovery_pass; spin_unlock_irq(&c->recovery_pass_lock); - - if (ret) - break; - - c->recovery_passes_complete |= BIT_ULL(pass); - c->recovery_pass_done = max(c->recovery_pass_done, pass); - - if (!test_bit(BCH_FS_error, &c->flags)) - bch2_clear_recovery_pass_required(c, pass); } return ret; diff --git a/libbcachefs/sb-clean.c b/libbcachefs/sb-clean.c index 00527528..59c8770e 100644 --- a/libbcachefs/sb-clean.c +++ b/libbcachefs/sb-clean.c @@ -23,6 +23,10 @@ int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) { + struct bkey_validate_context from = { + .flags = write, + .from = BKEY_VALIDATE_superblock, + }; struct jset_entry *entry; int ret; @@ -40,7 +44,7 @@ int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *cle ret = bch2_journal_entry_validate(c, NULL, entry, le16_to_cpu(c->disk_sb.sb->version), BCH_SB_BIG_ENDIAN(c->disk_sb.sb), - write); + from); if (ret) return ret; } diff --git a/libbcachefs/sb-errors_format.h b/libbcachefs/sb-errors_format.h index 4248ff3c..e26317c3 100644 --- a/libbcachefs/sb-errors_format.h +++ b/libbcachefs/sb-errors_format.h @@ -58,7 +58,7 @@ enum bch_fsck_flags { x(bset_empty, 45, 0) \ x(bset_bad_seq, 46, 0) \ x(bset_blacklisted_journal_seq, 47, 0) \ - x(first_bset_blacklisted_journal_seq, 48, 0) \ + x(first_bset_blacklisted_journal_seq, 48, FSCK_AUTOFIX) \ x(btree_node_bad_btree, 49, 0) \ x(btree_node_bad_level, 50, 0) \ x(btree_node_bad_min_key, 51, 0) \ @@ -168,7 +168,7 @@ enum bch_fsck_flags { x(ptr_to_incorrect_stripe, 151, 0) \ x(ptr_gen_newer_than_bucket_gen, 152, 0) \ x(ptr_too_stale, 153, 0) \ - x(stale_dirty_ptr, 154, 0) \ + x(stale_dirty_ptr, 154, FSCK_AUTOFIX) \ x(ptr_bucket_data_type_mismatch, 155, 0) \ x(ptr_cached_and_erasure_coded, 156, 0) \ x(ptr_crc_uncompressed_size_too_small, 157, 0) \ @@ -211,6 +211,7 @@ enum bch_fsck_flags { x(bkey_in_missing_snapshot, 190, 0) \ x(inode_pos_inode_nonzero, 191, 0) \ x(inode_pos_blockdev_range, 192, 0) \ + x(inode_alloc_cursor_inode_bad, 301, 0) \ x(inode_unpack_error, 193, 0) \ x(inode_str_hash_invalid, 194, 0) \ x(inode_v3_fields_start_bad, 195, 0) \ @@ -311,7 +312,7 @@ enum bch_fsck_flags { x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ - x(MAX, 301, 0) + x(MAX, 302, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, diff --git a/libbcachefs/snapshot.c b/libbcachefs/snapshot.c index f368270d..99f04551 100644 --- a/libbcachefs/snapshot.c +++ b/libbcachefs/snapshot.c @@ -570,8 +570,7 @@ static int check_snapshot_tree(struct btree_trans *trans, goto err; } - ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), - false, 0, &subvol); + ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; @@ -811,7 +810,7 @@ static int check_snapshot(struct btree_trans *trans, if (should_have_subvol) { id = le32_to_cpu(s.subvol); - ret = bch2_subvolume_get(trans, id, 0, false, &subvol); + ret = bch2_subvolume_get(trans, id, false, &subvol); if (bch2_err_matches(ret, ENOENT)) bch_err(c, "snapshot points to nonexistent subvolume:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); diff --git a/libbcachefs/str_hash.c b/libbcachefs/str_hash.c new file mode 100644 index 00000000..c3276a7e --- /dev/null +++ b/libbcachefs/str_hash.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_cache.h" +#include "btree_update.h" +#include "dirent.h" +#include "fsck.h" +#include "str_hash.h" +#include "subvolume.h" + +static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d) +{ + if (d.v->d_type == DT_SUBVOL) { + struct bch_subvolume subvol; + int ret = bch2_subvolume_get(trans, le32_to_cpu(d.v->d_child_subvol), + false, &subvol); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + return !ret; + } else { + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0); + int ret = bkey_err(k); + if (ret) + return ret; + + ret = bkey_is_inode(k.k); + bch2_trans_iter_exit(trans, &iter); + return ret; + } +} + +static int fsck_rename_dirent(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c_dirent old) +{ + struct qstr old_name = bch2_dirent_get_name(old); + struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); + int ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + bkey_dirent_init(&new->k_i); + dirent_copy_target(new, old); + new->k.p = old.k->p; + + for (unsigned i = 0; i < 1000; i++) { + unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", + old_name.len, old_name.name, i); + unsigned u64s = BKEY_U64s + dirent_val_u64s(len); + + if (u64s > U8_MAX) + return -EINVAL; + + new->k.u64s = u64s; + + ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, + (subvol_inum) { 0, old.k->p.inode }, + old.k->p.snapshot, &new->k_i, + BTREE_UPDATE_internal_snapshot_node); + if (!bch2_err_matches(ret, EEXIST)) + break; + } + + if (ret) + return ret; + + return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); +} + +static int hash_pick_winner(struct btree_trans *trans, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c k1, + struct bkey_s_c k2) +{ + if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && + !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) + return 0; + + switch (desc.btree_id) { + case BTREE_ID_dirents: { + int ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k1)); + if (ret < 0) + return ret; + if (!ret) + return 0; + + ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k2)); + if (ret < 0) + return ret; + if (!ret) + return 1; + return 2; + } + default: + return 0; + } +} + +int bch2_str_hash_check_key(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c hash_k) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter = { NULL }; + struct printbuf buf = PRINTBUF; + struct bkey_s_c k; + u64 hash; + int ret = 0; + + if (hash_k.k->type != desc.key_type) + return 0; + + hash = desc.hash_bkey(hash_info, hash_k); + + if (likely(hash == hash_k.k->p.offset)) + return 0; + + if (hash_k.k->p.offset < hash) + goto bad_hash; + + for_each_btree_key_norestart(trans, iter, desc.btree_id, + SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), + BTREE_ITER_slots, k, ret) { + if (bkey_eq(k.k->p, hash_k.k->p)) + break; + + if (k.k->type == desc.key_type && + !desc.cmp_bkey(k, hash_k)) + goto duplicate_entries; + + if (bkey_deleted(k.k)) { + bch2_trans_iter_exit(trans, &iter); + goto bad_hash; + } + } +out: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +bad_hash: + if (fsck_err(trans, hash_table_key_wrong_offset, + "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s", + bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k); + if (IS_ERR(new)) + return PTR_ERR(new); + + k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, hash_info, + (subvol_inum) { 0, hash_k.k->p.inode }, + hash_k.k->p.snapshot, new, + STR_HASH_must_create| + BTREE_ITER_with_updates| + BTREE_UPDATE_internal_snapshot_node); + ret = bkey_err(k); + if (ret) + goto out; + if (k.k) + goto duplicate_entries; + + ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, + BTREE_UPDATE_internal_snapshot_node) ?: + bch2_fsck_update_backpointers(trans, s, desc, hash_info, new) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; + goto out; + } +fsck_err: + goto out; +duplicate_entries: + ret = hash_pick_winner(trans, desc, hash_info, hash_k, k); + if (ret < 0) + goto out; + + if (!fsck_err(trans, hash_table_key_duplicate, + "duplicate hash table keys%s:\n%s", + ret != 2 ? "" : ", both point to valid inodes", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), + prt_newline(&buf), + bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) + goto out; + + switch (ret) { + case 0: + ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); + break; + case 1: + ret = bch2_hash_delete_at(trans, desc, hash_info, &iter, 0); + break; + case 2: + ret = fsck_rename_dirent(trans, s, desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: + bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); + goto out; + } + + ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: + -BCH_ERR_transaction_restart_nested; + goto out; +} diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index 00c78505..0c20f3af 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -393,4 +393,11 @@ int bch2_hash_delete(struct btree_trans *trans, return ret; } +struct snapshots_seen; +int bch2_str_hash_check_key(struct btree_trans *, + struct snapshots_seen *, + const struct bch_hash_desc, + struct bch_hash_info *, + struct btree_iter *, struct bkey_s_c); + #endif /* _BCACHEFS_STR_HASH_H */ diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c index 5e5ae405..0e756e35 100644 --- a/libbcachefs/subvolume.c +++ b/libbcachefs/subvolume.c @@ -286,11 +286,11 @@ int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol) static __always_inline int bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, bool inconsistent_if_not_found, - int iter_flags, struct bch_subvolume *s) { int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol), - iter_flags, subvolume, s); + BTREE_ITER_cached| + BTREE_ITER_with_updates, subvolume, s); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) && inconsistent_if_not_found, trans->c, "missing subvolume %u", subvol); @@ -299,16 +299,15 @@ bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, bool inconsistent_if_not_found, - int iter_flags, struct bch_subvolume *s) { - return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, iter_flags, s); + return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, s); } int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol) { struct bch_subvolume s; - int ret = bch2_subvolume_get_inlined(trans, subvol, true, 0, &s); + int ret = bch2_subvolume_get_inlined(trans, subvol, true, &s); if (ret) return ret; @@ -328,7 +327,7 @@ int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, struct bch_snapshot snap; return bch2_snapshot_lookup(trans, snapshot, &snap) ?: - bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol); + bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, subvol); } int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, @@ -396,8 +395,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d struct bch_subvolume s; return lockrestart_do(trans, - bch2_subvolume_get(trans, subvolid_to_delete, true, - BTREE_ITER_cached, &s)) ?: + bch2_subvolume_get(trans, subvolid_to_delete, true, &s)) ?: for_each_btree_key_commit(trans, iter, BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h index d53d292c..910f6196 100644 --- a/libbcachefs/subvolume.h +++ b/libbcachefs/subvolume.h @@ -24,7 +24,7 @@ int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, int bch2_subvol_has_children(struct btree_trans *, u32); int bch2_subvolume_get(struct btree_trans *, unsigned, - bool, int, struct bch_subvolume *); + bool, struct bch_subvolume *); int __bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *, bool); int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 912c3696..dbc09e30 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -432,7 +432,10 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb)); prt_str(out, " > incompat_allowed "); bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)); - return -BCH_ERR_invalid_sb_version; + if (flags & BCH_VALIDATE_write) + return -BCH_ERR_invalid_sb_version; + else + SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb)); } if (!flags) { @@ -457,6 +460,11 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true); } +#ifdef __KERNEL__ + if (!BCH_SB_SHARD_INUMS_NBITS(sb)) + SET_BCH_SB_SHARD_INUMS_NBITS(sb, ilog2(roundup_pow_of_two(num_online_cpus()))); +#endif + for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { const struct bch_option *opt = bch2_opt_table + opt_id; diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 14157820..bd6aecea 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -586,7 +586,6 @@ static void __bch2_fs_free(struct bch_fs *c) #endif kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); - kfree(c->unused_inode_hints); if (c->write_ref_wq) destroy_workqueue(c->write_ref_wq); @@ -872,8 +871,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) (btree_blocks(c) + 1) * 2 * sizeof(struct sort_iter_set); - c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); - if (!(c->btree_update_wq = alloc_workqueue("bcachefs", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", @@ -900,9 +897,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) !(c->online_reserved = alloc_percpu(u64)) || mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, c->opts.btree_node_size) || - mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || - !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, - sizeof(u64), GFP_KERNEL))) { + mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) { ret = -BCH_ERR_ENOMEM_fs_other_alloc; goto err; } diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h index 2d5932d2..7baf66be 100644 --- a/libbcachefs/trace.h +++ b/libbcachefs/trace.h @@ -199,6 +199,30 @@ DECLARE_EVENT_CLASS(bio, (unsigned long long)__entry->sector, __entry->nr_sector) ); +/* disk_accounting.c */ + +TRACE_EVENT(accounting_mem_insert, + TP_PROTO(struct bch_fs *c, const char *acc), + TP_ARGS(c, acc), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(unsigned, new_nr ) + __string(acc, acc ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->new_nr = c->accounting.k.nr; + __assign_str(acc); + ), + + TP_printk("%d,%d entries %u added %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->new_nr, + __get_str(acc)) +); + /* fs.c: */ TRACE_EVENT(bch2_sync_fs, TP_PROTO(struct super_block *sb, int wait),