From 1055935ffe151de39c45e33ea13d3370e46c8fbd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Dec 2024 22:31:09 -0500 Subject: [PATCH] Update bcachefs sources to 864591728963 bcachefs: Dropped superblock write is no longer a fatal error Signed-off-by: Kent Overstreet --- .bcachefs_revision | 2 +- Makefile.compiler | 2 +- include/linux/blk_types.h | 1 + include/linux/kmemleak.h | 4 + include/linux/kobject.h | 1 + libbcachefs/alloc_background.c | 2 - libbcachefs/alloc_foreground.c | 4 - libbcachefs/backpointers.c | 146 ++++++--- libbcachefs/bcachefs.h | 7 +- libbcachefs/bcachefs_format.h | 51 ++- libbcachefs/btree_gc.c | 29 +- libbcachefs/btree_gc.h | 4 +- libbcachefs/btree_io.c | 32 +- libbcachefs/btree_io.h | 6 +- libbcachefs/btree_iter.c | 86 +++-- libbcachefs/btree_iter.h | 29 +- libbcachefs/btree_key_cache.c | 58 +++- libbcachefs/btree_locking.c | 14 + libbcachefs/btree_locking.h | 44 +-- libbcachefs/btree_node_scan.c | 2 +- libbcachefs/btree_trans_commit.c | 89 +++--- libbcachefs/btree_types.h | 59 ++-- libbcachefs/btree_update.c | 13 +- libbcachefs/btree_update.h | 1 + libbcachefs/btree_update_interior.c | 36 +-- libbcachefs/btree_write_buffer.c | 10 + libbcachefs/buckets.c | 57 ++-- libbcachefs/buckets.h | 9 +- libbcachefs/clock.c | 25 +- libbcachefs/data_update.c | 2 +- libbcachefs/disk_accounting.c | 48 +-- libbcachefs/disk_accounting.h | 4 +- libbcachefs/ec.c | 25 +- libbcachefs/errcode.h | 5 + libbcachefs/fs-common.c | 28 +- libbcachefs/fs-io-buffered.c | 9 - libbcachefs/fsck.c | 93 ++++-- libbcachefs/inode.c | 35 +- libbcachefs/inode_format.h | 6 +- libbcachefs/io_write.c | 6 +- libbcachefs/journal_io.c | 4 +- libbcachefs/move.c | 4 +- libbcachefs/opts.h | 7 +- libbcachefs/recovery.c | 33 +- libbcachefs/recovery_passes_types.h | 2 +- libbcachefs/sb-counters_format.h | 165 +++++----- libbcachefs/sb-downgrade.c | 1 - libbcachefs/six.c | 17 +- libbcachefs/six.h | 1 + libbcachefs/snapshot.c | 475 ++++++++++++---------------- libbcachefs/snapshot.h | 10 +- libbcachefs/str_hash.c | 129 ++++++-- libbcachefs/str_hash.h | 25 +- libbcachefs/subvolume.c | 50 ++- libbcachefs/subvolume_types.h | 2 +- libbcachefs/super-io.c | 11 +- libbcachefs/super.c | 8 +- libbcachefs/sysfs.c | 14 +- libbcachefs/trace.h | 43 ++- libbcachefs/util.h | 10 + libbcachefs/varint.c | 5 +- 61 files changed, 1196 insertions(+), 904 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index 64016eff..5d86ae9c 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -55a65a994ed5fba038fda00f78416faf6f308bb8 +864591728963d416c49e502bfee56a283eda31a5 diff --git a/Makefile.compiler b/Makefile.compiler index e0842496..8c102968 100644 --- a/Makefile.compiler +++ b/Makefile.compiler @@ -13,7 +13,7 @@ cc-cross-prefix = $(firstword $(foreach c, $(1), \ $(if $(shell command -v -- $(c)gcc 2>/dev/null), $(c)))) # output directory for tests below -TMPOUT = $(if $(KBUILD_EXTMOD),$(firstword $(KBUILD_EXTMOD))/).tmp_$$$$ +TMPOUT = .tmp_$$$$ # try-run # Usage: option = $(call try-run, $(CC)...-o "$$TMP",option-ok,otherwise) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 3cbf8c9e..2384a5e3 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -6,6 +6,7 @@ #define __LINUX_BLK_TYPES_H #include +#include #include #include #include diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 6a3cd1bf..93a73c07 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -26,6 +26,7 @@ extern void kmemleak_free_part(const void *ptr, size_t size) __ref; extern void kmemleak_free_percpu(const void __percpu *ptr) __ref; extern void kmemleak_update_trace(const void *ptr) __ref; extern void kmemleak_not_leak(const void *ptr) __ref; +extern void kmemleak_transient_leak(const void *ptr) __ref; extern void kmemleak_ignore(const void *ptr) __ref; extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref; extern void kmemleak_no_scan(const void *ptr) __ref; @@ -93,6 +94,9 @@ static inline void kmemleak_update_trace(const void *ptr) static inline void kmemleak_not_leak(const void *ptr) { } +static inline void kmemleak_transient_leak(const void *ptr) +{ +} static inline void kmemleak_ignore(const void *ptr) { } diff --git a/include/linux/kobject.h b/include/linux/kobject.h index c33b2126..24096a62 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 7641a3b4..94e7bc88 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -933,8 +933,6 @@ int bch2_trigger_alloc(struct btree_trans *trans, if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { u64 transaction_seq = trans->journal_res.seq; BUG_ON(!transaction_seq); - BUG_ON(transaction_seq < new_a->journal_seq_nonempty); - BUG_ON(transaction_seq < new_a->journal_seq_empty); if (log_fsck_err_on(transaction_seq && new_a->journal_seq_nonempty > transaction_seq, trans, alloc_key_journal_seq_in_future, diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index 57d5f14c..6df41c33 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -107,14 +107,10 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) return; } - percpu_down_read(&c->mark_lock); spin_lock(&ob->lock); - ob->valid = false; ob->data_type = 0; - spin_unlock(&ob->lock); - percpu_up_read(&c->mark_lock); spin_lock(&c->freelist_lock); bch2_open_bucket_hash_remove(c, ob); diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c index 72311dc7..ebeb6a5f 100644 --- a/libbcachefs/backpointers.c +++ b/libbcachefs/backpointers.c @@ -178,7 +178,7 @@ static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } -static int bch2_backpointers_maybe_flush(struct btree_trans *trans, +static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, struct bkey_s_c visiting_k, struct bkey_buf *last_flushed) { @@ -201,17 +201,30 @@ static int backpointer_target_not_found(struct btree_trans *trans, * looking at may have already been deleted - failure to find what it * pointed to is not an error: */ - ret = bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed); + ret = last_flushed + ? bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed) + : 0; if (ret) return ret; prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", bp.v->level ? "btree node" : "extent"); bch2_bkey_val_to_text(&buf, c, bp.s_c); - prt_printf(&buf, "\n "); + prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, target_k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry) + if (p.ptr.dev == bp.k->p.inode) { + prt_printf(&buf, "\n "); + struct bkey_i_backpointer bp2; + bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i)); + } + if (fsck_err(trans, backpointer_to_missing_ptr, "%s", buf.buf)) ret = bch2_backpointer_del(trans, bp.k->p); @@ -491,7 +504,7 @@ check_existing_bp: struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k); struct bkey_s_c other_extent = - bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, &s->last_flushed); + bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL); ret = bkey_err(other_extent); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) ret = 0; @@ -553,11 +566,11 @@ check_existing_bp: goto err; missing: printbuf_reset(&buf); - prt_str(&buf, "missing backpointer "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i)); - prt_newline(&buf); + prt_str(&buf, "missing backpointer\n for: "); bch2_bkey_val_to_text(&buf, c, orig_k); - prt_printf(&buf, "\n got: "); + prt_printf(&buf, "\n want: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i)); + prt_printf(&buf, "\n got: "); bch2_bkey_val_to_text(&buf, c, bp_k); if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf)) @@ -586,12 +599,16 @@ static int check_extent_to_backpointers(struct btree_trans *trans, rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches); + bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty); rcu_read_unlock(); - if (check) { + if (check || empty) { struct bkey_i_backpointer bp; bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); - int ret = check_bp_exists(trans, s, &bp, k); + + int ret = check + ? check_bp_exists(trans, s, &bp, k) + : bch2_bucket_backpointer_mod(trans, k, &bp, true); if (ret) return ret; } @@ -825,12 +842,15 @@ static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t } } -static int check_bucket_backpointer_mismatch_one(struct btree_trans *trans, struct bkey_s_c alloc_k, - struct bkey_buf *last_flushed) +static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos); + +static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k, + struct bkey_buf *last_flushed) { - int ret = 0; + struct bch_fs *c = trans->c; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); + bool need_commit = false; if (a->data_type == BCH_DATA_sb || a->data_type == BCH_DATA_journal || @@ -846,6 +866,7 @@ static int check_bucket_backpointer_mismatch_one(struct btree_trans *trans, stru struct btree_iter iter; struct bkey_s_c bp_k; + int ret = 0; for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers, bucket_pos_to_bp_start(ca, alloc_k.k->p), bucket_pos_to_bp_end(ca, alloc_k.k->p), 0, bp_k, ret) { @@ -854,6 +875,17 @@ static int check_bucket_backpointer_mismatch_one(struct btree_trans *trans, stru struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen && + (bp.v->bucket_gen != a->gen || + bp.v->pad)) { + ret = bch2_backpointer_del(trans, bp_k.k->p); + if (ret) + break; + + need_commit = true; + continue; + } + if (bp.v->bucket_gen != a->gen) continue; @@ -863,30 +895,40 @@ static int check_bucket_backpointer_mismatch_one(struct btree_trans *trans, stru if (ret) goto err; - /* Cached pointers don't have backpointers: */ - - if (sectors[ALLOC_dirty] != a->dirty_sectors || - sectors[ALLOC_stripe] != a->stripe_sectors) { - ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed); + if (need_commit) { + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret) goto err; + } - __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches); + /* Cached pointers don't have backpointers: */ + + if (sectors[ALLOC_dirty] != a->dirty_sectors || + sectors[ALLOC_stripe] != a->stripe_sectors) { + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) { + ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed); + if (ret) + goto err; + } + + if (sectors[ALLOC_dirty] > a->dirty_sectors || + sectors[ALLOC_stripe] > a->stripe_sectors) { + ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: + -BCH_ERR_transaction_restart_nested; + goto err; + } + + if (!sectors[ALLOC_dirty] && + !sectors[ALLOC_stripe]) + __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty); + else + __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches); } err: bch2_dev_put(ca); return ret; } -static int check_bucket_backpointer_mismatches(struct btree_trans *trans, - struct bkey_buf *last_flushed) -{ - return for_each_btree_key(trans, iter, BTREE_ID_alloc, - POS_MIN, BTREE_ITER_prefetch, k, ({ - check_bucket_backpointer_mismatch_one(trans, k, last_flushed); - })); -} - static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) { switch (k.k->type) { @@ -896,6 +938,9 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) rcu_read_lock(); struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key; while (pos.inode <= k.k->p.inode) { + if (pos.inode >= c->sb.nr_devices) + break; + struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode); if (!ca) goto next; @@ -941,7 +986,7 @@ err: } static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans, - struct bpos start, struct bpos *end) + struct bpos start, struct bpos *end) { struct bch_fs *c = trans->c; int ret = 0; @@ -1022,7 +1067,11 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) ca->bucket_backpointer_mismatches = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), sizeof(unsigned long), GFP_KERNEL); - if (!ca->bucket_backpointer_mismatches) { + ca->bucket_backpointer_empty = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), + sizeof(unsigned long), + GFP_KERNEL); + if (!ca->bucket_backpointer_mismatches || + !ca->bucket_backpointer_empty) { bch2_dev_put(ca); ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; goto err_free_bitmaps; @@ -1035,21 +1084,25 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) bch2_bkey_buf_init(&s.last_flushed); bkey_init(&s.last_flushed.k->k); - ret = check_bucket_backpointer_mismatches(trans, &s.last_flushed); + ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, + POS_MIN, BTREE_ITER_prefetch, k, ({ + check_bucket_backpointer_mismatch(trans, k, &s.last_flushed); + })); if (ret) goto err; - u64 nr_buckets = 0, nr_mismatches = 0; + u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0; for_each_member_device(c, ca) { - nr_buckets += ca->mi.nbuckets; - nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets); + nr_buckets += ca->mi.nbuckets; + nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets); + nr_empty += bitmap_weight(ca->bucket_backpointer_empty, ca->mi.nbuckets); } - if (!nr_mismatches) + if (!nr_mismatches && !nr_empty) goto err; bch_info(c, "scanning for missing backpointers in %llu/%llu buckets", - nr_mismatches, nr_buckets); + nr_mismatches + nr_empty, nr_buckets); while (1) { ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end); @@ -1086,6 +1139,8 @@ err: bch2_btree_cache_unpin(c); err_free_bitmaps: for_each_member_device(c, ca) { + kvfree(ca->bucket_backpointer_empty); + ca->bucket_backpointer_empty = NULL; kvfree(ca->bucket_backpointer_mismatches); ca->bucket_backpointer_mismatches = NULL; } @@ -1122,6 +1177,25 @@ static int check_one_backpointer(struct btree_trans *trans, return ret; } +static int check_bucket_backpointers_to_extents(struct btree_trans *trans, + struct bch_dev *ca, struct bpos bucket) +{ + u32 restart_count = trans->restart_count; + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers, + bucket_pos_to_bp_start(ca, bucket), + bucket_pos_to_bp_end(ca, bucket), + 0, k, + check_one_backpointer(trans, BBPOS_MIN, BBPOS_MAX, k, &last_flushed) + ); + + bch2_bkey_buf_exit(&last_flushed, trans->c); + return ret ?: trans_was_restarted(trans, restart_count); +} + static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, struct bbpos start, struct bbpos end) diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index d27550ef..161cf2f0 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -547,17 +547,16 @@ struct bch_dev { /* * Buckets: - * Per-bucket arrays are protected by c->mark_lock, bucket_lock and - * gc_gens_lock, for device resize - holding any is sufficient for - * access: Or rcu_read_lock(), but only for dev_ptr_stale(): + * Per-bucket arrays are protected by either rcu_read_lock or + * state_lock, for device resize. */ GENRADIX(struct bucket) buckets_gc; struct bucket_gens __rcu *bucket_gens; u8 *oldest_gen; unsigned long *buckets_nouse; - struct rw_semaphore bucket_lock; unsigned long *bucket_backpointer_mismatches; + unsigned long *bucket_backpointer_empty; struct bch_dev_usage __percpu *usage; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 09e53bef..06809305 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -684,7 +684,8 @@ struct bch_sb_field_ext { x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \ x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \ x(inode_depth, BCH_VERSION(1, 17)) \ - x(persistent_inode_cursors, BCH_VERSION(1, 18)) + x(persistent_inode_cursors, BCH_VERSION(1, 18)) \ + x(autofix_errors, BCH_VERSION(1, 19)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -1287,14 +1288,18 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); /* Btree: */ enum btree_id_flags { - BTREE_ID_EXTENTS = BIT(0), - BTREE_ID_SNAPSHOTS = BIT(1), - BTREE_ID_SNAPSHOT_FIELD = BIT(2), - BTREE_ID_DATA = BIT(3), + BTREE_IS_extents = BIT(0), + BTREE_IS_snapshots = BIT(1), + BTREE_IS_snapshot_field = BIT(2), + BTREE_IS_data = BIT(3), + BTREE_IS_write_buffer = BIT(4), }; #define BCH_BTREE_IDS() \ - x(extents, 0, BTREE_ID_EXTENTS|BTREE_ID_SNAPSHOTS|BTREE_ID_DATA,\ + x(extents, 0, \ + BTREE_IS_extents| \ + BTREE_IS_snapshots| \ + BTREE_IS_data, \ BIT_ULL(KEY_TYPE_whiteout)| \ BIT_ULL(KEY_TYPE_error)| \ BIT_ULL(KEY_TYPE_cookie)| \ @@ -1302,17 +1307,20 @@ enum btree_id_flags { BIT_ULL(KEY_TYPE_reservation)| \ BIT_ULL(KEY_TYPE_reflink_p)| \ BIT_ULL(KEY_TYPE_inline_data)) \ - x(inodes, 1, BTREE_ID_SNAPSHOTS, \ + x(inodes, 1, \ + BTREE_IS_snapshots, \ BIT_ULL(KEY_TYPE_whiteout)| \ BIT_ULL(KEY_TYPE_inode)| \ BIT_ULL(KEY_TYPE_inode_v2)| \ BIT_ULL(KEY_TYPE_inode_v3)| \ BIT_ULL(KEY_TYPE_inode_generation)) \ - x(dirents, 2, BTREE_ID_SNAPSHOTS, \ + x(dirents, 2, \ + BTREE_IS_snapshots, \ BIT_ULL(KEY_TYPE_whiteout)| \ BIT_ULL(KEY_TYPE_hash_whiteout)| \ BIT_ULL(KEY_TYPE_dirent)) \ - x(xattrs, 3, BTREE_ID_SNAPSHOTS, \ + x(xattrs, 3, \ + BTREE_IS_snapshots, \ BIT_ULL(KEY_TYPE_whiteout)| \ BIT_ULL(KEY_TYPE_cookie)| \ BIT_ULL(KEY_TYPE_hash_whiteout)| \ @@ -1326,7 +1334,9 @@ enum btree_id_flags { BIT_ULL(KEY_TYPE_quota)) \ x(stripes, 6, 0, \ BIT_ULL(KEY_TYPE_stripe)) \ - x(reflink, 7, BTREE_ID_EXTENTS|BTREE_ID_DATA, \ + x(reflink, 7, \ + BTREE_IS_extents| \ + BTREE_IS_data, \ BIT_ULL(KEY_TYPE_reflink_v)| \ BIT_ULL(KEY_TYPE_indirect_inline_data)| \ BIT_ULL(KEY_TYPE_error)) \ @@ -1334,29 +1344,38 @@ enum btree_id_flags { BIT_ULL(KEY_TYPE_subvolume)) \ x(snapshots, 9, 0, \ BIT_ULL(KEY_TYPE_snapshot)) \ - x(lru, 10, 0, \ + x(lru, 10, \ + BTREE_IS_write_buffer, \ BIT_ULL(KEY_TYPE_set)) \ - x(freespace, 11, BTREE_ID_EXTENTS, \ + x(freespace, 11, \ + BTREE_IS_extents, \ BIT_ULL(KEY_TYPE_set)) \ x(need_discard, 12, 0, \ BIT_ULL(KEY_TYPE_set)) \ - x(backpointers, 13, 0, \ + x(backpointers, 13, \ + BTREE_IS_write_buffer, \ BIT_ULL(KEY_TYPE_backpointer)) \ x(bucket_gens, 14, 0, \ BIT_ULL(KEY_TYPE_bucket_gens)) \ x(snapshot_trees, 15, 0, \ BIT_ULL(KEY_TYPE_snapshot_tree)) \ - x(deleted_inodes, 16, BTREE_ID_SNAPSHOT_FIELD, \ + x(deleted_inodes, 16, \ + BTREE_IS_snapshot_field| \ + BTREE_IS_write_buffer, \ BIT_ULL(KEY_TYPE_set)) \ x(logged_ops, 17, 0, \ BIT_ULL(KEY_TYPE_logged_op_truncate)| \ BIT_ULL(KEY_TYPE_logged_op_finsert)| \ BIT_ULL(KEY_TYPE_inode_alloc_cursor)) \ - x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \ + x(rebalance_work, 18, \ + BTREE_IS_snapshot_field| \ + BTREE_IS_write_buffer, \ BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \ x(subvolume_children, 19, 0, \ BIT_ULL(KEY_TYPE_set)) \ - x(accounting, 20, BTREE_ID_SNAPSHOT_FIELD, \ + x(accounting, 20, \ + BTREE_IS_snapshot_field| \ + BTREE_IS_write_buffer, \ BIT_ULL(KEY_TYPE_accounting)) \ enum btree_id { diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 24f2f3bd..dd1d9b74 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -733,16 +733,8 @@ static int bch2_gc_btrees(struct bch_fs *c) continue; ret = bch2_gc_btree(trans, btree, true); - - if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), - trans, btree_node_read_error, - "btree node read error for %s", - (printbuf_reset(&buf), - bch2_btree_id_to_text(&buf, btree), - buf.buf))) - ret = bch2_btree_lost_data(c, btree); } -fsck_err: + printbuf_exit(&buf); bch2_trans_put(trans); bch_err_fn(c, ret); @@ -811,7 +803,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, old = bch2_alloc_to_v4(k, &old_convert); gc = new = *old; - percpu_down_read(&c->mark_lock); __bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset)); old_gc = gc; @@ -822,7 +813,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, gc.data_type = old->data_type; gc.dirty_sectors = old->dirty_sectors; } - percpu_up_read(&c->mark_lock); /* * gc.data_type doesn't yet include need_discard & need_gc_gen states - @@ -840,11 +830,9 @@ static int bch2_alloc_write_key(struct btree_trans *trans, * safe w.r.t. transaction restarts, so fixup the gc_bucket so * we don't run it twice: */ - percpu_down_read(&c->mark_lock); struct bucket *gc_m = gc_bucket(ca, iter->pos.offset); gc_m->data_type = gc.data_type; gc_m->dirty_sectors = gc.dirty_sectors; - percpu_up_read(&c->mark_lock); } if (fsck_err_on(new.data_type != gc.data_type, @@ -1088,7 +1076,6 @@ static int gc_btree_gens_key(struct btree_trans *trans, if (unlikely(test_bit(BCH_FS_going_ro, &c->flags))) return -EROFS; - percpu_down_read(&c->mark_lock); rcu_read_lock(); bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); @@ -1097,7 +1084,6 @@ static int gc_btree_gens_key(struct btree_trans *trans, if (dev_ptr_stale(ca, ptr) > 16) { rcu_read_unlock(); - percpu_up_read(&c->mark_lock); goto update; } } @@ -1112,7 +1098,6 @@ static int gc_btree_gens_key(struct btree_trans *trans, *gen = ptr->gen; } rcu_read_unlock(); - percpu_up_read(&c->mark_lock); return 0; update: u = bch2_bkey_make_mut(trans, iter, &k, 0); @@ -1141,7 +1126,6 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev return ret; a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; - alloc_data_type_set(&a_mut->v, a_mut->v.data_type); return bch2_trans_update(trans, iter, &a_mut->k_i, 0); } @@ -1254,9 +1238,16 @@ void bch2_gc_gens_async(struct bch_fs *c) bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens); } -void bch2_fs_gc_init(struct bch_fs *c) +void bch2_fs_btree_gc_exit(struct bch_fs *c) +{ +} + +int bch2_fs_btree_gc_init(struct bch_fs *c) { seqcount_init(&c->gc_pos_lock); - INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work); + + init_rwsem(&c->gc_lock); + mutex_init(&c->gc_gens_lock); + return 0; } diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h index 8a47e8bd..9693a90a 100644 --- a/libbcachefs/btree_gc.h +++ b/libbcachefs/btree_gc.h @@ -82,6 +82,8 @@ void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *); int bch2_gc_gens(struct bch_fs *); void bch2_gc_gens_async(struct bch_fs *); -void bch2_fs_gc_init(struct bch_fs *); + +void bch2_fs_btree_gc_exit(struct bch_fs *); +int bch2_fs_btree_gc_init(struct bch_fs *); #endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index d99f8a78..e371e60e 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -489,8 +489,8 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) if (b->nsets == MAX_BSETS && !btree_node_write_in_flight(b) && should_compact_all(c, b)) { - bch2_btree_node_write(c, b, SIX_LOCK_write, - BTREE_WRITE_init_next_bset); + bch2_btree_node_write_trans(trans, b, SIX_LOCK_write, + BTREE_WRITE_init_next_bset); reinit_iter = true; } @@ -2345,6 +2345,34 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, } } +void bch2_btree_node_write_trans(struct btree_trans *trans, struct btree *b, + enum six_lock_type lock_type_held, + unsigned flags) +{ + struct bch_fs *c = trans->c; + + if (lock_type_held == SIX_LOCK_intent || + (lock_type_held == SIX_LOCK_read && + six_lock_tryupgrade(&b->c.lock))) { + __bch2_btree_node_write(c, b, flags); + + /* don't cycle lock unnecessarily: */ + if (btree_node_just_written(b) && + six_trylock_write(&b->c.lock)) { + bch2_btree_post_write_cleanup(c, b); + __bch2_btree_node_unlock_write(trans, b); + } + + if (lock_type_held == SIX_LOCK_read) + six_lock_downgrade(&b->c.lock); + } else { + __bch2_btree_node_write(c, b, flags); + if (lock_type_held == SIX_LOCK_write && + btree_node_just_written(b)) + bch2_btree_post_write_cleanup(c, b); + } +} + static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) { struct bucket_table *tbl; diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index 9b01ca3d..6f9e4a6d 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -144,11 +144,13 @@ enum btree_write_flags { void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); void bch2_btree_node_write(struct bch_fs *, struct btree *, enum six_lock_type, unsigned); +void bch2_btree_node_write_trans(struct btree_trans *, struct btree *, + enum six_lock_type, unsigned); -static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, +static inline void btree_node_write_if_need(struct btree_trans *trans, struct btree *b, enum six_lock_type lock_held) { - bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); + bch2_btree_node_write_trans(trans, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); } bool bch2_btree_flush_all_reads(struct bch_fs *); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 9c54891c..c291d495 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -699,6 +699,19 @@ void bch2_trans_node_add(struct btree_trans *trans, bch2_trans_revalidate_updates_in_node(trans, b); } +void bch2_trans_node_drop(struct btree_trans *trans, + struct btree *b) +{ + struct btree_path *path; + unsigned i, level = b->c.level; + + trans_for_each_path(trans, path, i) + if (path->l[level].b == b) { + btree_node_unlock(trans, path, level); + path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); + } +} + /* * A btree node has been modified in such a way as to invalidate iterators - fix * them: @@ -1854,7 +1867,7 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey * !bkey_eq(path->pos, ck->key.pos)); *u = ck->k->k; - k = bkey_i_to_s_c(ck->k); + k = (struct bkey_s_c) { u, &ck->k->v }; } return k; @@ -2144,21 +2157,18 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, } static noinline -struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) +void btree_trans_peek_journal(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c *k) { struct btree_path *path = btree_iter_path(trans, iter); struct bkey_i *next_journal = bch2_btree_journal_peek(trans, iter, - k.k ? k.k->p : path_l(path)->b->key.k.p); - + k->k ? k->k->p : path_l(path)->b->key.k.p); if (next_journal) { iter->k = next_journal->k; - k = bkey_i_to_s_c(next_journal); + *k = bkey_i_to_s_c(next_journal); } - - return k; } static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, @@ -2175,21 +2185,19 @@ static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, } static noinline -struct bkey_s_c btree_trans_peek_prev_journal(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) +void btree_trans_peek_prev_journal(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c *k) { struct btree_path *path = btree_iter_path(trans, iter); struct bkey_i *next_journal = bch2_btree_journal_peek_prev(trans, iter, - k.k ? k.k->p : path_l(path)->b->key.k.p); + k->k ? k->k->p : path_l(path)->b->key.k.p); if (next_journal) { iter->k = next_journal->k; - k = bkey_i_to_s_c(next_journal); + *k = bkey_i_to_s_c(next_journal); } - - return k; } /* @@ -2234,10 +2242,15 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u); - if (k.k && !bkey_err(k)) { - iter->k = u; - k.k = &iter->k; - } + if (!k.k) + return k; + + if ((iter->flags & BTREE_ITER_all_snapshots) && + !bpos_eq(pos, k.k->p)) + return bkey_s_c_null; + + iter->k = u; + k.k = &iter->k; return k; } @@ -2260,7 +2273,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp /* ensure that iter->k is consistent with iter->pos: */ bch2_btree_iter_set_pos(iter, iter->pos); k = bkey_s_c_err(ret); - goto out; + break; } struct btree_path *path = btree_iter_path(trans, iter); @@ -2270,7 +2283,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp /* No btree nodes at requested level: */ bch2_btree_iter_set_pos(iter, SPOS_MAX); k = bkey_s_c_null; - goto out; + break; } btree_path_set_should_be_locked(trans, path); @@ -2281,15 +2294,14 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp k.k && (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { k = k2; - ret = bkey_err(k); - if (ret) { + if (bkey_err(k)) { bch2_btree_iter_set_pos(iter, iter->pos); - goto out; + break; } } if (unlikely(iter->flags & BTREE_ITER_with_journal)) - k = btree_trans_peek_journal(trans, iter, k); + btree_trans_peek_journal(trans, iter, &k); if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) @@ -2318,12 +2330,11 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp /* End of btree: */ bch2_btree_iter_set_pos(iter, SPOS_MAX); k = bkey_s_c_null; - goto out; + break; } } -out: - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(iter); return k; } @@ -2424,7 +2435,8 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en continue; } - if (bkey_whiteout(k.k)) { + if (bkey_whiteout(k.k) && + !(iter->flags & BTREE_ITER_key_cache_fill)) { search_key = bkey_successor(iter, k.k->p); continue; } @@ -2547,7 +2559,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru } if (unlikely(iter->flags & BTREE_ITER_with_journal)) - k = btree_trans_peek_prev_journal(trans, iter, k); + btree_trans_peek_prev_journal(trans, iter, &k); if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) @@ -2784,6 +2796,11 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k); if (unlikely(!k.k)) goto out_no_locked; + + if (unlikely(k.k->type == KEY_TYPE_whiteout && + (iter->flags & BTREE_ITER_filter_snapshots) && + !(iter->flags & BTREE_ITER_key_cache_fill))) + iter->k.type = KEY_TYPE_deleted; } else { struct bpos next; struct bpos end = iter->pos; @@ -3028,7 +3045,7 @@ void bch2_trans_iter_init_outlined(struct btree_trans *trans, unsigned flags) { bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, - bch2_btree_iter_flags(trans, btree_id, flags), + bch2_btree_iter_flags(trans, btree_id, 0, flags), _RET_IP_); } @@ -3044,8 +3061,11 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, flags |= BTREE_ITER_snapshot_field; flags |= BTREE_ITER_all_snapshots; + if (!depth && btree_id_cached(trans->c, btree_id)) + flags |= BTREE_ITER_with_key_cache; + bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, - __bch2_btree_iter_flags(trans, btree_id, flags), + bch2_btree_iter_flags(trans, btree_id, depth, flags), _RET_IP_); iter->min_depth = depth; diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 3477fc8c..b9538e6e 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -372,6 +372,7 @@ static inline void bch2_btree_path_downgrade(struct btree_trans *trans, void bch2_trans_downgrade(struct btree_trans *); void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *); +void bch2_trans_node_drop(struct btree_trans *trans, struct btree *); void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); @@ -446,10 +447,17 @@ static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 sna void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); -static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, - unsigned btree_id, - unsigned flags) +static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, + unsigned btree_id, + unsigned level, + unsigned flags) { + if (level || !btree_id_cached(trans->c, btree_id)) { + flags &= ~BTREE_ITER_cached; + flags &= ~BTREE_ITER_with_key_cache; + } else if (!(flags & BTREE_ITER_cached)) + flags |= BTREE_ITER_with_key_cache; + if (!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) && btree_id_is_extents(btree_id)) flags |= BTREE_ITER_is_extents; @@ -468,19 +476,6 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, return flags; } -static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, - unsigned btree_id, - unsigned flags) -{ - if (!btree_id_cached(trans->c, btree_id)) { - flags &= ~BTREE_ITER_cached; - flags &= ~BTREE_ITER_with_key_cache; - } else if (!(flags & BTREE_ITER_cached)) - flags |= BTREE_ITER_with_key_cache; - - return __bch2_btree_iter_flags(trans, btree_id, flags); -} - static inline void bch2_trans_iter_init_common(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, @@ -517,7 +512,7 @@ static inline void bch2_trans_iter_init(struct btree_trans *trans, if (__builtin_constant_p(btree_id) && __builtin_constant_p(flags)) bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, - bch2_btree_iter_flags(trans, btree_id, flags), + bch2_btree_iter_flags(trans, btree_id, 0, flags), _THIS_IP_); else bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags); diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index 3bd40ea0..7636a5e9 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -197,7 +197,9 @@ out: return ck; } -static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *path, +static int btree_key_cache_create(struct btree_trans *trans, + struct btree_path *path, + struct btree_path *ck_path, struct bkey_s_c k) { struct bch_fs *c = trans->c; @@ -217,7 +219,7 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path * key_u64s = min(256U, (key_u64s * 3) / 2); key_u64s = roundup_pow_of_two(key_u64s); - struct bkey_cached *ck = bkey_cached_alloc(trans, path, key_u64s); + struct bkey_cached *ck = bkey_cached_alloc(trans, ck_path, key_u64s); int ret = PTR_ERR_OR_ZERO(ck); if (ret) return ret; @@ -226,19 +228,19 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path * ck = bkey_cached_reuse(bc); if (unlikely(!ck)) { bch_err(c, "error allocating memory for key cache item, btree %s", - bch2_btree_id_str(path->btree_id)); + bch2_btree_id_str(ck_path->btree_id)); return -BCH_ERR_ENOMEM_btree_key_cache_create; } } ck->c.level = 0; - ck->c.btree_id = path->btree_id; - ck->key.btree_id = path->btree_id; - ck->key.pos = path->pos; + ck->c.btree_id = ck_path->btree_id; + ck->key.btree_id = ck_path->btree_id; + ck->key.pos = ck_path->pos; ck->flags = 1U << BKEY_CACHED_ACCESSED; if (unlikely(key_u64s > ck->u64s)) { - mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); + mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); struct bkey_i *new_k = allocate_dropping_locks(trans, ret, kmalloc(key_u64s * sizeof(u64), _gfp)); @@ -258,22 +260,29 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path * bkey_reassemble(ck->k, k); + ret = bch2_btree_node_lock_write(trans, path, &path_l(path)->b->c); + if (unlikely(ret)) + goto err; + ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params); + + bch2_btree_node_unlock_write(trans, path, path_l(path)->b); + if (unlikely(ret)) /* raced with another fill? */ goto err; atomic_long_inc(&bc->nr_keys); six_unlock_write(&ck->c.lock); - enum six_lock_type lock_want = __btree_lock_want(path, 0); + enum six_lock_type lock_want = __btree_lock_want(ck_path, 0); if (lock_want == SIX_LOCK_read) six_lock_downgrade(&ck->c.lock); - btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); - path->uptodate = BTREE_ITER_UPTODATE; + btree_path_cached_set(trans, ck_path, ck, (enum btree_node_locked_type) lock_want); + ck_path->uptodate = BTREE_ITER_UPTODATE; return 0; err: bkey_cached_free(bc, ck); - mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); + mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); return ret; } @@ -293,6 +302,7 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos, + BTREE_ITER_intent| BTREE_ITER_key_cache_fill| BTREE_ITER_cached_nofill); iter.flags &= ~BTREE_ITER_with_journal; @@ -306,9 +316,19 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, if (unlikely(ret)) goto out; - ret = btree_key_cache_create(trans, ck_path, k); + ret = btree_key_cache_create(trans, btree_iter_path(trans, &iter), ck_path, k); if (ret) goto err; + + if (trace_key_cache_fill_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bpos_to_text(&buf, ck_path->pos); + prt_char(&buf, ' '); + bch2_bkey_val_to_text(&buf, trans->c, k); + trace_key_cache_fill(trans, buf.buf); + printbuf_exit(&buf); + } out: /* We're not likely to need this iterator again: */ bch2_set_btree_iter_dontneed(&iter); @@ -593,8 +613,18 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, bkey_cached_free(bc, ck); mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - path->should_be_locked = false; + + struct btree_path *path2; + unsigned i; + trans_for_each_path(trans, path2, i) + if (path2->l[0].b == (void *) ck) { + __bch2_btree_path_unlock(trans, path2); + path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop); + path2->should_be_locked = false; + btree_path_set_dirty(path2, BTREE_ITER_NEED_TRAVERSE); + } + + bch2_trans_verify_locks(trans); } static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c index d343df9f..85039314 100644 --- a/libbcachefs/btree_locking.c +++ b/libbcachefs/btree_locking.c @@ -818,6 +818,17 @@ void bch2_trans_unlock_long(struct btree_trans *trans) bch2_trans_srcu_unlock(trans); } +void bch2_trans_unlock_write(struct btree_trans *trans) +{ + struct btree_path *path; + unsigned i; + + trans_for_each_path(trans, path, i) + for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) + if (btree_node_write_locked(path, l)) + bch2_btree_node_unlock_write(trans, path, path->l[l].b); +} + int __bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock) { @@ -856,6 +867,9 @@ void bch2_btree_path_verify_locks(struct btree_path *path) (want == BTREE_NODE_UNLOCKED || have != BTREE_NODE_WRITE_LOCKED) && want != have); + + BUG_ON(btree_node_locked(path, l) && + path->l[l].lock_seq != six_lock_seq(&path->l[l].b->c.lock)); } } diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index 7474ab6c..b54ef48e 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -16,6 +16,7 @@ void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags); void bch2_trans_unlock_noassert(struct btree_trans *); +void bch2_trans_unlock_write(struct btree_trans *); static inline bool is_btree_node(struct btree_path *path, unsigned l) { @@ -75,13 +76,6 @@ static inline void mark_btree_node_locked_noreset(struct btree_path *path, path->nodes_locked |= (type + 1) << (level << 1); } -static inline void mark_btree_node_unlocked(struct btree_path *path, - unsigned level) -{ - EBUG_ON(btree_node_write_locked(path, level)); - mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED); -} - static inline void mark_btree_node_locked(struct btree_trans *trans, struct btree_path *path, unsigned level, @@ -124,19 +118,25 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans, /* unlock: */ +void bch2_btree_node_unlock_write(struct btree_trans *, + struct btree_path *, struct btree *); + static inline void btree_node_unlock(struct btree_trans *trans, struct btree_path *path, unsigned level) { int lock_type = btree_node_locked_type(path, level); EBUG_ON(level >= BTREE_MAX_DEPTH); - EBUG_ON(lock_type == BTREE_NODE_WRITE_LOCKED); if (lock_type != BTREE_NODE_UNLOCKED) { + if (unlikely(lock_type == BTREE_NODE_WRITE_LOCKED)) { + bch2_btree_node_unlock_write(trans, path, path->l[level].b); + lock_type = BTREE_NODE_INTENT_LOCKED; + } six_unlock_type(&path->l[level].b->c.lock, lock_type); btree_trans_lock_hold_time_update(trans, path, level); + mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED); } - mark_btree_node_unlocked(path, level); } static inline int btree_path_lowest_level_locked(struct btree_path *path) @@ -162,28 +162,32 @@ static inline void __bch2_btree_path_unlock(struct btree_trans *trans, * Updates the saved lock sequence number, so that bch2_btree_node_relock() will * succeed: */ +static inline void +__bch2_btree_node_unlock_write(struct btree_trans *trans, struct btree *b) +{ + if (!b->c.lock.write_lock_recurse) { + struct btree_path *linked; + unsigned i; + + trans_for_each_path_with_node(trans, b, linked, i) + linked->l[b->c.level].lock_seq++; + } + + six_unlock_write(&b->c.lock); +} + static inline void bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path, struct btree *b) { - struct btree_path *linked; - unsigned i; - EBUG_ON(path->l[b->c.level].b != b); EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock)); EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write); mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); - - trans_for_each_path_with_node(trans, b, linked, i) - linked->l[b->c.level].lock_seq++; - - six_unlock_write(&b->c.lock); + __bch2_btree_node_unlock_write(trans, b); } -void bch2_btree_node_unlock_write(struct btree_trans *, - struct btree_path *, struct btree *); - int bch2_six_check_for_deadlock(struct six_lock *lock, void *p); /* lock: */ diff --git a/libbcachefs/btree_node_scan.c b/libbcachefs/btree_node_scan.c index a5282ff1..a7f06dee 100644 --- a/libbcachefs/btree_node_scan.c +++ b/libbcachefs/btree_node_scan.c @@ -152,7 +152,7 @@ static inline void found_btree_node_swap(void *_l, void *_r, void *arg) swap(*l, *r); } -const struct min_heap_callbacks found_btree_node_heap_cbs = { +static const struct min_heap_callbacks found_btree_node_heap_cbs = { .less = found_btree_node_cmp_pos_less, .swp = found_btree_node_swap, }; diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c index 9011cc3f..6b79b672 100644 --- a/libbcachefs/btree_trans_commit.c +++ b/libbcachefs/btree_trans_commit.c @@ -133,7 +133,7 @@ static inline int bch2_trans_lock_write(struct btree_trans *trans) return 0; } -static inline void bch2_trans_unlock_write(struct btree_trans *trans) +static inline void bch2_trans_unlock_updates_write(struct btree_trans *trans) { if (likely(trans->write_locked)) { trans_for_each_update(trans, i) @@ -249,7 +249,7 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, new |= 1 << BTREE_NODE_need_write; } while (!try_cmpxchg(&b->flags, &old, new)); - btree_node_write_if_need(c, b, SIX_LOCK_read); + btree_node_write_if_need(trans, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); bch2_trans_put(trans); @@ -384,7 +384,7 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, struct bkey_i *new_k; int ret; - bch2_trans_unlock_write(trans); + bch2_trans_unlock_updates_write(trans); bch2_trans_unlock(trans); new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); @@ -479,8 +479,7 @@ static int run_one_mem_trigger(struct btree_trans *trans, old, flags); } -static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, - bool overwrite) +static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i) { verify_update_old_key(trans, i); @@ -507,10 +506,10 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k), BTREE_TRIGGER_insert| BTREE_TRIGGER_overwrite|flags) ?: 1; - } else if (overwrite && !i->overwrite_trigger_run) { + } else if (!i->overwrite_trigger_run) { i->overwrite_trigger_run = true; return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1; - } else if (!overwrite && !i->insert_trigger_run) { + } else if (!i->insert_trigger_run) { i->insert_trigger_run = true; return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1; } else { @@ -519,39 +518,45 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ } static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, - unsigned btree_id_start) + unsigned *btree_id_updates_start) { - for (int overwrite = 1; overwrite >= 0; --overwrite) { - bool trans_trigger_run; + bool trans_trigger_run; - /* - * Running triggers will append more updates to the list of updates as - * we're walking it: - */ - do { - trans_trigger_run = false; + /* + * Running triggers will append more updates to the list of updates as + * we're walking it: + */ + do { + trans_trigger_run = false; - for (unsigned i = btree_id_start; - i < trans->nr_updates && trans->updates[i].btree_id <= btree_id; - i++) { - if (trans->updates[i].btree_id != btree_id) - continue; - - int ret = run_one_trans_trigger(trans, trans->updates + i, overwrite); - if (ret < 0) - return ret; - if (ret) - trans_trigger_run = true; + for (unsigned i = *btree_id_updates_start; + i < trans->nr_updates && trans->updates[i].btree_id <= btree_id; + i++) { + if (trans->updates[i].btree_id < btree_id) { + *btree_id_updates_start = i; + continue; } - } while (trans_trigger_run); - } + + int ret = run_one_trans_trigger(trans, trans->updates + i); + if (ret < 0) + return ret; + if (ret) + trans_trigger_run = true; + } + } while (trans_trigger_run); + + trans_for_each_update(trans, i) + BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && + i->btree_id == btree_id && + btree_node_type_has_trans_triggers(i->bkey_type) && + (!i->insert_trigger_run || !i->overwrite_trigger_run)); return 0; } static int bch2_trans_commit_run_triggers(struct btree_trans *trans) { - unsigned btree_id = 0, btree_id_start = 0; + unsigned btree_id = 0, btree_id_updates_start = 0; int ret = 0; /* @@ -565,27 +570,15 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) if (btree_id == BTREE_ID_alloc) continue; - while (btree_id_start < trans->nr_updates && - trans->updates[btree_id_start].btree_id < btree_id) - btree_id_start++; - - ret = run_btree_triggers(trans, btree_id, btree_id_start); + ret = run_btree_triggers(trans, btree_id, &btree_id_updates_start); if (ret) return ret; } - for (unsigned idx = 0; idx < trans->nr_updates; idx++) { - struct btree_insert_entry *i = trans->updates + idx; - - if (i->btree_id > BTREE_ID_alloc) - break; - if (i->btree_id == BTREE_ID_alloc) { - ret = run_btree_triggers(trans, BTREE_ID_alloc, idx); - if (ret) - return ret; - break; - } - } + btree_id_updates_start = 0; + ret = run_btree_triggers(trans, BTREE_ID_alloc, &btree_id_updates_start); + if (ret) + return ret; #ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) @@ -875,7 +868,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags if (!ret && unlikely(trans->journal_replay_not_finished)) bch2_drop_overwrites_from_journal(trans); - bch2_trans_unlock_write(trans); + bch2_trans_unlock_updates_write(trans); if (!ret && trans->journal_pin) bch2_journal_pin_add(&c->journal, trans->journal_res.seq, diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index baab5288..a6f251eb 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -790,53 +790,64 @@ static inline bool btree_node_type_has_triggers(enum btree_node_type type) return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS; } -static inline bool btree_node_type_is_extents(enum btree_node_type type) -{ - const u64 mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1)) - BCH_BTREE_IDS() -#undef x - ; - - return BIT_ULL(type) & mask; -} - static inline bool btree_id_is_extents(enum btree_id btree) -{ - return btree_node_type_is_extents(__btree_node_type(0, btree)); -} - -static inline bool btree_type_has_snapshots(enum btree_id id) { const u64 mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr) +#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_extents)) << nr) BCH_BTREE_IDS() #undef x ; - return BIT_ULL(id) & mask; + return BIT_ULL(btree) & mask; } -static inline bool btree_type_has_snapshot_field(enum btree_id id) +static inline bool btree_node_type_is_extents(enum btree_node_type type) +{ + return type != BKEY_TYPE_btree && btree_id_is_extents(type - 1); +} + +static inline bool btree_type_has_snapshots(enum btree_id btree) { const u64 mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr) +#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_snapshots)) << nr) BCH_BTREE_IDS() #undef x ; - return BIT_ULL(id) & mask; + return BIT_ULL(btree) & mask; } -static inline bool btree_type_has_ptrs(enum btree_id id) +static inline bool btree_type_has_snapshot_field(enum btree_id btree) { const u64 mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_DATA)) << nr) +#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_IS_snapshot_field|BTREE_IS_snapshots))) << nr) BCH_BTREE_IDS() #undef x ; - return BIT_ULL(id) & mask; + return BIT_ULL(btree) & mask; +} + +static inline bool btree_type_has_ptrs(enum btree_id btree) +{ + const u64 mask = 0 +#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_data)) << nr) + BCH_BTREE_IDS() +#undef x + ; + + return BIT_ULL(btree) & mask; +} + +static inline bool btree_type_uses_write_buffer(enum btree_id btree) +{ + const u64 mask = 0 +#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_write_buffer)) << nr) + BCH_BTREE_IDS() +#undef x + ; + + return BIT_ULL(btree) & mask; } struct btree_root { diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c index b1da6dba..13d794f2 100644 --- a/libbcachefs/btree_update.c +++ b/libbcachefs/btree_update.c @@ -823,10 +823,17 @@ int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, return bch2_trans_update_buffered(trans, btree, &k); } -static int __bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf, unsigned u64s) +int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) { + unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64)); + prt_chars(buf, '\0', u64s * sizeof(u64) - buf->pos); + + int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; + if (ret) + return ret; + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s)); - int ret = PTR_ERR_OR_ZERO(e); + ret = PTR_ERR_OR_ZERO(e); if (ret) return ret; @@ -862,7 +869,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, c->journal.early_journal_entries.nr += jset_u64s(u64s); } else { ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags, - __bch2_trans_log_msg(trans, &buf, u64s)); + bch2_trans_log_msg(trans, &buf)); } err: printbuf_exit(&buf); diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index 58df2019..8f22ef9a 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -159,6 +159,7 @@ void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *, unsigned); +int bch2_trans_log_msg(struct btree_trans *, struct printbuf *); __printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...); __printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...); diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 7d9dab95..f4aeadbe 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -238,7 +238,6 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, struct btree *b) { struct bch_fs *c = trans->c; - unsigned i, level = b->c.level; bch2_btree_node_lock_write_nofail(trans, path, &b->c); @@ -249,13 +248,9 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, mutex_unlock(&c->btree_cache.lock); six_unlock_write(&b->c.lock); - mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); + mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); - trans_for_each_path(trans, path, i) - if (path->l[level].b == b) { - btree_node_unlock(trans, path, level); - path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); - } + bch2_trans_node_drop(trans, b); } static void bch2_btree_node_free_never_used(struct btree_update *as, @@ -264,8 +259,6 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, { struct bch_fs *c = as->c; struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL]; - struct btree_path *path; - unsigned i, level = b->c.level; BUG_ON(!list_empty(&b->write_blocked)); BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as)); @@ -287,11 +280,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, six_unlock_intent(&b->c.lock); - trans_for_each_path(trans, path, i) - if (path->l[level].b == b) { - btree_node_unlock(trans, path, level); - path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); - } + bch2_trans_node_drop(trans, b); } static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, @@ -803,7 +792,7 @@ err: mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); six_unlock_write(&b->c.lock); - btree_node_write_if_need(c, b, SIX_LOCK_intent); + btree_node_write_if_need(trans, b, SIX_LOCK_intent); btree_node_unlock(trans, path, b->c.level); bch2_path_put(trans, path_idx, true); } @@ -824,7 +813,7 @@ err: b = as->new_nodes[i]; btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - btree_node_write_if_need(c, b, SIX_LOCK_read); + btree_node_write_if_need(trans, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); } @@ -1709,14 +1698,14 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (n3) { bch2_btree_update_get_open_buckets(as, n3); - bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); + bch2_btree_node_write_trans(trans, n3, SIX_LOCK_intent, 0); } if (n2) { bch2_btree_update_get_open_buckets(as, n2); - bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0); + bch2_btree_node_write_trans(trans, n2, SIX_LOCK_intent, 0); } bch2_btree_update_get_open_buckets(as, n1); - bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); + bch2_btree_node_write_trans(trans, n1, SIX_LOCK_intent, 0); /* * The old node must be freed (in memory) _before_ unlocking the new @@ -1911,7 +1900,7 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans * BUG_ON(ret); bch2_btree_update_get_open_buckets(as, n); - bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0); bch2_trans_node_add(trans, path, n); six_unlock_intent(&n->c.lock); @@ -2104,7 +2093,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_trans_verify_paths(trans); bch2_btree_update_get_open_buckets(as, n); - bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0); bch2_btree_node_free_inmem(trans, trans->paths + path, b); bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m); @@ -2181,7 +2170,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, bch2_btree_interior_update_will_free_node(as, b); bch2_btree_update_get_open_buckets(as, n); - bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0); bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b); @@ -2291,7 +2280,8 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) bool now = false, pending = false; spin_lock(&c->btree_node_rewrites_lock); - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay && + bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { list_add(&a->list, &c->btree_node_rewrites); now = true; } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c index 49ce2d1e..b56c4987 100644 --- a/libbcachefs/btree_write_buffer.c +++ b/libbcachefs/btree_write_buffer.c @@ -312,6 +312,8 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) darray_for_each(wb->sorted, i) { struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx]; + BUG_ON(!btree_type_uses_write_buffer(k->btree)); + for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++) prefetch(&wb->flushing.keys.data[n->idx]); @@ -632,6 +634,14 @@ int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, bch2_bkey_buf_init(&tmp); if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) { + if (trace_write_buffer_maybe_flush_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, referring_k); + trace_write_buffer_maybe_flush(trans, _RET_IP_, buf.buf); + printbuf_exit(&buf); + } + bch2_bkey_buf_reassemble(&tmp, c, referring_k); if (bkey_is_btree_ptr(referring_k.k)) { diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 1e43ece2..345b117a 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -262,8 +262,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret = 0; - percpu_down_read(&c->mark_lock); - bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) { ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update); if (ret) @@ -364,7 +362,6 @@ found: bch_info(c, "new key %s", buf.buf); } - percpu_up_read(&c->mark_lock); struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, BTREE_ITER_intent|BTREE_ITER_all_snapshots); @@ -373,8 +370,6 @@ found: BTREE_UPDATE_internal_snapshot_node| BTREE_TRIGGER_norun); bch2_trans_iter_exit(trans, &iter); - percpu_down_read(&c->mark_lock); - if (ret) goto err; @@ -382,7 +377,6 @@ found: bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); } err: - percpu_up_read(&c->mark_lock); printbuf_exit(&buf); return ret; } @@ -547,7 +541,8 @@ static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c k, const struct extent_ptr_decoded *p, s64 sectors, enum bch_data_type ptr_data_type, - struct bch_alloc_v4 *a) + struct bch_alloc_v4 *a, + bool insert) { u32 *dst_sectors = p->has_ec ? &a->stripe_sectors : !p->ptr.cached ? &a->dirty_sectors : @@ -557,8 +552,8 @@ static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca, if (ret) return ret; - - alloc_data_type_set(a, ptr_data_type); + if (insert) + alloc_data_type_set(a, ptr_data_type); return 0; } @@ -591,7 +586,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, if (flags & BTREE_TRIGGER_transactional) { struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); ret = PTR_ERR_OR_ZERO(a) ?: - __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v); + __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v, insert); if (ret) goto err; @@ -603,22 +598,19 @@ static int bch2_trigger_pointer(struct btree_trans *trans, } if (flags & BTREE_TRIGGER_gc) { - percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, bucket.offset); if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", p.ptr.dev, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = -BCH_ERR_trigger_pointer; - goto err_unlock; + goto err; } bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; - ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new); + ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new, insert); alloc_to_bucket(g, new); bucket_unlock(g); -err_unlock: - percpu_up_read(&c->mark_lock); if (!ret) ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); @@ -996,11 +988,10 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev * struct bch_fs *c = trans->c; int ret = 0; - percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, b); if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s", ca->dev_idx, bch2_data_type_str(data_type))) - goto err_unlock; + goto err; bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g); @@ -1010,26 +1001,24 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev * "different types of data in same bucket: %s, %s", bch2_data_type_str(g->data_type), bch2_data_type_str(data_type))) - goto err; + goto err_unlock; if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size", ca->dev_idx, b, g->gen, bch2_data_type_str(g->data_type ?: data_type), g->dirty_sectors, sectors)) - goto err; + goto err_unlock; g->data_type = data_type; g->dirty_sectors += sectors; struct bch_alloc_v4 new = bucket_m_to_alloc(*g); bucket_unlock(g); - percpu_up_read(&c->mark_lock); ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); return ret; -err: - bucket_unlock(g); err_unlock: - percpu_up_read(&c->mark_lock); + bucket_unlock(g); +err: return -BCH_ERR_metadata_bucket_inconsistency; } @@ -1269,7 +1258,7 @@ int bch2_buckets_nouse_alloc(struct bch_fs *c) for_each_member_device(c, ca) { BUG_ON(ca->buckets_nouse); - ca->buckets_nouse = kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) * + ca->buckets_nouse = bch2_kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long), GFP_KERNEL|__GFP_ZERO); if (!ca->buckets_nouse) { @@ -1295,10 +1284,14 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bool resize = ca->bucket_gens != NULL; int ret; - BUG_ON(resize && ca->buckets_nouse); + if (resize) + lockdep_assert_held(&c->state_lock); - bucket_gens = kvmalloc(struct_size(bucket_gens, b, nbuckets), - GFP_KERNEL|__GFP_ZERO); + if (resize && ca->buckets_nouse) + return -BCH_ERR_no_resize_with_buckets_nouse; + + bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets), + GFP_KERNEL|__GFP_ZERO); if (!bucket_gens) { ret = -BCH_ERR_ENOMEM_bucket_gens; goto err; @@ -1309,11 +1302,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bucket_gens->nbuckets_minus_first = bucket_gens->nbuckets - bucket_gens->first_bucket; - if (resize) { - down_write(&ca->bucket_lock); - percpu_down_write(&c->mark_lock); - } - old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); if (resize) { @@ -1331,11 +1319,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) nbuckets = ca->mi.nbuckets; - if (resize) { - percpu_up_write(&c->mark_lock); - up_write(&ca->bucket_lock); - } - ret = 0; err: if (bucket_gens) diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 3bebc4c3..a9acdd6c 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -82,16 +82,15 @@ static inline void bucket_lock(struct bucket *b) static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) { - return genradix_ptr(&ca->buckets_gc, b); + return bucket_valid(ca, b) + ? genradix_ptr(&ca->buckets_gc, b) + : NULL; } static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) { return rcu_dereference_check(ca->bucket_gens, - !ca->fs || - percpu_rwsem_is_held(&ca->fs->mark_lock) || - lockdep_is_held(&ca->fs->state_lock) || - lockdep_is_held(&ca->bucket_lock)); + lockdep_is_held(&ca->fs->state_lock)); } static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) diff --git a/libbcachefs/clock.c b/libbcachefs/clock.c index 1d6b691e..1f8e035d 100644 --- a/libbcachefs/clock.c +++ b/libbcachefs/clock.c @@ -14,21 +14,13 @@ static inline bool io_timer_cmp(const void *l, const void *r, void __always_unus return (*_l)->expire < (*_r)->expire; } -static inline void io_timer_swp(void *l, void *r, void __always_unused *args) -{ - struct io_timer **_l = (struct io_timer **)l; - struct io_timer **_r = (struct io_timer **)r; - - swap(*_l, *_r); -} +static const struct min_heap_callbacks callbacks = { + .less = io_timer_cmp, + .swp = NULL, +}; void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) { - const struct min_heap_callbacks callbacks = { - .less = io_timer_cmp, - .swp = io_timer_swp, - }; - spin_lock(&clock->timer_lock); if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) { @@ -48,11 +40,6 @@ out: void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) { - const struct min_heap_callbacks callbacks = { - .less = io_timer_cmp, - .swp = io_timer_swp, - }; - spin_lock(&clock->timer_lock); for (size_t i = 0; i < clock->timers.nr; i++) @@ -142,10 +129,6 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now) { struct io_timer *ret = NULL; - const struct min_heap_callbacks callbacks = { - .less = io_timer_cmp, - .swp = io_timer_swp, - }; if (clock->timers.nr && time_after_eq64(now, clock->timers.data[0]->expire)) { diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c index 31b2aeb0..58521493 100644 --- a/libbcachefs/data_update.c +++ b/libbcachefs/data_update.c @@ -620,7 +620,7 @@ int bch2_data_update_init(struct btree_trans *trans, * and we have to check for this because we go rw before repairing the * snapshots table - just skip it, we can move it later. */ - if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot))) + if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot))) return -BCH_ERR_data_update_done; if (!bkey_get_dev_refs(c, k)) diff --git a/libbcachefs/disk_accounting.c b/libbcachefs/disk_accounting.c index a0e49cbe..b32e91ba 100644 --- a/libbcachefs/disk_accounting.c +++ b/libbcachefs/disk_accounting.c @@ -79,6 +79,8 @@ static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_ memcpy_u64s_small(acc->v.d, d, nr); } +static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos); + int bch2_disk_accounting_mod(struct btree_trans *trans, struct disk_accounting_pos *k, s64 *d, unsigned nr, bool gc) @@ -96,9 +98,16 @@ int bch2_disk_accounting_mod(struct btree_trans *trans, accounting_key_init(&k_i.k, k, d, nr); - return likely(!gc) - ? bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k) - : bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); + if (unlikely(gc)) { + int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); + if (ret == -BCH_ERR_btree_insert_need_mark_replicas) + ret = drop_locks_do(trans, + bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?: + bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); + return ret; + } else { + return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k); + } } int bch2_mod_dev_cached_sectors(struct btree_trans *trans, @@ -471,32 +480,6 @@ int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned acc return ret; } -void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct bch_accounting_mem *acc = &c->accounting; - - percpu_down_read(&c->mark_lock); - out->atomic++; - - eytzinger0_for_each(i, acc->k.nr) { - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, acc->k.data[i].pos); - - bch2_accounting_key_to_text(out, &acc_k); - - u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); - - prt_str(out, ":"); - for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++) - prt_printf(out, " %llu", v[j]); - prt_newline(out); - } - - --out->atomic; - percpu_up_read(&c->mark_lock); -} - static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc) { darray_for_each(acc->k, e) { @@ -931,10 +914,13 @@ void bch2_verify_accounting_clean(struct bch_fs *c) bpos_to_disk_accounting_pos(&acc_k, k.k->p); if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) - continue; + break; - if (acc_k.type == BCH_DISK_ACCOUNTING_inum) + if (!bch2_accounting_is_mem(acc_k)) { + struct disk_accounting_pos next = { .type = acc_k.type + 1 }; + bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); continue; + } bch2_accounting_mem_read(c, k.k->p, v, nr); diff --git a/libbcachefs/disk_accounting.h b/libbcachefs/disk_accounting.h index 4a3d9ff8..5360cbb3 100644 --- a/libbcachefs/disk_accounting.h +++ b/libbcachefs/disk_accounting.h @@ -138,7 +138,8 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, bpos_to_disk_accounting_pos(&acc_k, a.k->p); bool gc = mode == BCH_ACCOUNTING_gc; - EBUG_ON(gc && !acc->gc_running); + if (gc && !acc->gc_running) + return 0; if (!bch2_accounting_is_mem(acc_k)) return 0; @@ -255,7 +256,6 @@ static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *); int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned); -void bch2_fs_accounting_to_text(struct printbuf *, struct bch_fs *); int bch2_gc_accounting_start(struct bch_fs *); int bch2_gc_accounting_done(struct bch_fs *); diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 6b297f90..d2a5e76e 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -305,13 +305,12 @@ static int mark_stripe_bucket(struct btree_trans *trans, } if (flags & BTREE_TRIGGER_gc) { - percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, bucket.offset); if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", ptr->dev, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ret = -BCH_ERR_mark_stripe; - goto err_unlock; + goto err; } bucket_lock(g); @@ -319,8 +318,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); alloc_to_bucket(g, new); bucket_unlock(g); -err_unlock: - percpu_up_read(&c->mark_lock); + if (!ret) ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); } @@ -1058,6 +1056,11 @@ static inline void ec_stripes_heap_swap(void *l, void *r, void *h) ec_stripes_heap_set_backpointer(_h, j); } +static const struct min_heap_callbacks callbacks = { + .less = ec_stripes_heap_cmp, + .swp = ec_stripes_heap_swap, +}; + static void heap_verify_backpointer(struct bch_fs *c, size_t idx) { ec_stripes_heap *h = &c->ec_stripes_heap; @@ -1070,11 +1073,6 @@ static void heap_verify_backpointer(struct bch_fs *c, size_t idx) void bch2_stripes_heap_del(struct bch_fs *c, struct stripe *m, size_t idx) { - const struct min_heap_callbacks callbacks = { - .less = ec_stripes_heap_cmp, - .swp = ec_stripes_heap_swap, - }; - mutex_lock(&c->ec_stripes_heap_lock); heap_verify_backpointer(c, idx); @@ -1085,11 +1083,6 @@ void bch2_stripes_heap_del(struct bch_fs *c, void bch2_stripes_heap_insert(struct bch_fs *c, struct stripe *m, size_t idx) { - const struct min_heap_callbacks callbacks = { - .less = ec_stripes_heap_cmp, - .swp = ec_stripes_heap_swap, - }; - mutex_lock(&c->ec_stripes_heap_lock); BUG_ON(min_heap_full(&c->ec_stripes_heap)); @@ -1108,10 +1101,6 @@ void bch2_stripes_heap_insert(struct bch_fs *c, void bch2_stripes_heap_update(struct bch_fs *c, struct stripe *m, size_t idx) { - const struct min_heap_callbacks callbacks = { - .less = ec_stripes_heap_cmp, - .swp = ec_stripes_heap_swap, - }; ec_stripes_heap *h = &c->ec_stripes_heap; bool do_deletes; size_t i; diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index 775425df..4590cd0c 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -118,6 +118,7 @@ x(ENOENT, ENOENT_dev_not_found) \ x(ENOENT, ENOENT_dev_idx_not_found) \ x(ENOENT, ENOENT_inode_no_backpointer) \ + x(ENOENT, ENOENT_no_snapshot_tree_subvol) \ x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ x(EEXIST, EEXIST_str_hash_set) \ @@ -196,6 +197,9 @@ x(EINVAL, opt_parse_error) \ x(EINVAL, remove_with_metadata_missing_unimplemented)\ x(EINVAL, remove_would_lose_data) \ + x(EINVAL, no_resize_with_buckets_nouse) \ + x(EINVAL, inode_unpack_error) \ + x(EINVAL, varint_decode_error) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ @@ -313,6 +317,7 @@ static inline long bch2_err_class(long err) #define BLK_STS_REMOVED ((__force blk_status_t)128) +#include const char *bch2_blk_status_to_str(blk_status_t); #endif /* _BCACHFES_ERRCODE_H */ diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c index 7d279f21..2c3d46ac 100644 --- a/libbcachefs/fs-common.c +++ b/libbcachefs/fs-common.c @@ -574,6 +574,11 @@ static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsig printbuf_nul_terminate(out); } +static inline void prt_str_reversed(struct printbuf *out, const char *s) +{ + prt_bytes_reversed(out, s, strlen(s)); +} + static inline void reverse_bytes(void *b, size_t n) { char *e = b + n, *s = b; @@ -596,17 +601,20 @@ int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printb struct bch_inode_unpacked inode; ret = bch2_inode_find_by_inum_trans(trans, inum, &inode); if (ret) - goto err; + goto disconnected; if (!inode.bi_dir && !inode.bi_dir_offset) { ret = -BCH_ERR_ENOENT_inode_no_backpointer; - goto err; + goto disconnected; } + inum.subvol = inode.bi_parent_subvol ?: inum.subvol; + inum.inum = inode.bi_dir; + u32 snapshot; ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) - goto err; + goto disconnected; struct btree_iter d_iter; struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter, @@ -614,23 +622,19 @@ int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printb 0, dirent); ret = bkey_err(d.s_c); if (ret) - goto err; + goto disconnected; struct qstr dirent_name = bch2_dirent_get_name(d); prt_bytes_reversed(path, dirent_name.name, dirent_name.len); prt_char(path, '/'); - if (d.v->d_type == DT_SUBVOL) - inum.subvol = le32_to_cpu(d.v->d_parent_subvol); - inum.inum = d.k->p.inode; - bch2_trans_iter_exit(trans, &d_iter); } if (orig_pos == path->pos) prt_char(path, '/'); - +out: ret = path->allocation_failure ? -ENOMEM : 0; if (ret) goto err; @@ -639,4 +643,10 @@ int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printb return 0; err: return ret; +disconnected: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto err; + + prt_str_reversed(path, "(disconnected)"); + goto out; } diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c index ff8b8df5..ab1d5db2 100644 --- a/libbcachefs/fs-io-buffered.c +++ b/libbcachefs/fs-io-buffered.c @@ -625,15 +625,6 @@ do_io: BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, sectors << 9, offset << 9)); - /* Check for writing past i_size: */ - WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > - round_up(i_size, block_bytes(c)) && - !test_bit(BCH_FS_emergency_ro, &c->flags), - "writing past i_size: %llu > %llu (unrounded %llu)\n", - bio_end_sector(&w->io->op.wbio.bio) << 9, - round_up(i_size, block_bytes(c)), - i_size); - w->io->op.res.sectors += reserved_sectors; w->io->op.i_sectors_delta -= dirty_sectors; w->io->op.new_i_size = i_size; diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index a436c072..206fc046 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -205,6 +205,36 @@ err: return ret; } +/* + * Find any subvolume associated with a tree of snapshots + * We can't rely on master_subvol - it might have been deleted. + */ +static int find_snapshot_tree_subvol(struct btree_trans *trans, + u32 tree_id, u32 *subvol) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_snapshot) + continue; + + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); + if (le32_to_cpu(s.v->tree) != tree_id) + continue; + + if (s.v->subvol) { + *subvol = le32_to_cpu(s.v->subvol); + goto found; + } + } + ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol; +found: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + /* Get lost+found, create if it doesn't exist: */ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, struct bch_inode_unpacked *lostfound, @@ -223,19 +253,24 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, if (ret) return ret; - subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) }; + u32 subvolid; + ret = find_snapshot_tree_subvol(trans, + bch2_snapshot_tree(c, snapshot), &subvolid); + bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u", + bch2_snapshot_tree(c, snapshot)); + if (ret) + return ret; struct bch_subvolume subvol; - ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol), false, &subvol); - bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u", - le32_to_cpu(st.master_subvol), snapshot); + ret = bch2_subvolume_get(trans, subvolid, false, &subvol); + bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot); if (ret) return ret; if (!subvol.inode) { struct btree_iter iter; struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)), + BTREE_ID_subvolumes, POS(0, subvolid), 0, subvolume); ret = PTR_ERR_OR_ZERO(subvol); if (ret) @@ -245,13 +280,16 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, bch2_trans_iter_exit(trans, &iter); } - root_inum.inum = le64_to_cpu(subvol.inode); + subvol_inum root_inum = { + .subvol = subvolid, + .inum = le64_to_cpu(subvol.inode) + }; struct bch_inode_unpacked root_inode; struct bch_hash_info root_hash_info; ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode); bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", - root_inum.inum, le32_to_cpu(st.master_subvol)); + root_inum.inum, subvolid); if (ret) return ret; @@ -458,7 +496,9 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * continue; struct bch_inode_unpacked child_inode; - bch2_inode_unpack(k, &child_inode); + ret = bch2_inode_unpack(k, &child_inode); + if (ret) + break; if (!inode_should_reattach(&child_inode)) { ret = maybe_delete_dirent(trans, @@ -809,9 +849,8 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w, { struct bch_inode_unpacked u; - BUG_ON(bch2_inode_unpack(inode, &u)); - - return darray_push(&w->inodes, ((struct inode_walker_entry) { + return bch2_inode_unpack(inode, &u) ?: + darray_push(&w->inodes, ((struct inode_walker_entry) { .inode = u, .snapshot = inode.k->p.snapshot, })); @@ -1065,7 +1104,7 @@ static int get_snapshot_root_inode(struct btree_trans *trans, goto err; BUG(); found_root: - BUG_ON(bch2_inode_unpack(k, root)); + ret = bch2_inode_unpack(k, root); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -1096,7 +1135,9 @@ static int check_inode(struct btree_trans *trans, if (!bkey_is_inode(k.k)) return 0; - BUG_ON(bch2_inode_unpack(k, &u)); + ret = bch2_inode_unpack(k, &u); + if (ret) + goto err; if (snapshot_root->bi_inum != u.bi_inum) { ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum); @@ -1107,7 +1148,7 @@ static int check_inode(struct btree_trans *trans, if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed || INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root), trans, inode_snapshot_mismatch, - "inodes in different snapshots don't match")) { + "inode hash info in different snapshots don't match")) { u.bi_hash_seed = snapshot_root->bi_hash_seed; SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root)); do_update = true; @@ -1318,7 +1359,9 @@ static int find_oldest_inode_needs_reattach(struct btree_trans *trans, break; struct bch_inode_unpacked parent_inode; - bch2_inode_unpack(k, &parent_inode); + ret = bch2_inode_unpack(k, &parent_inode); + if (ret) + break; if (!inode_should_reattach(&parent_inode)) break; @@ -1341,7 +1384,9 @@ static int check_unreachable_inode(struct btree_trans *trans, return 0; struct bch_inode_unpacked inode; - BUG_ON(bch2_inode_unpack(k, &inode)); + ret = bch2_inode_unpack(k, &inode); + if (ret) + return ret; if (!inode_should_reattach(&inode)) return 0; @@ -2296,7 +2341,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); dir->first_this_inode = false; - ret = bch2_str_hash_check_key(trans, s, bch2_dirent_hash_desc, hash_info, iter, k); + ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k); if (ret < 0) goto err; if (ret) { @@ -2410,7 +2455,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); inode->first_this_inode = false; - ret = bch2_str_hash_check_key(trans, NULL, bch2_xattr_hash_desc, hash_info, iter, k); + ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k); bch_err_fn(c, ret); return ret; } @@ -2653,7 +2698,9 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) int ret = 0; struct bch_inode_unpacked inode; - BUG_ON(bch2_inode_unpack(inode_k, &inode)); + ret = bch2_inode_unpack(inode_k, &inode); + if (ret) + return ret; while (!inode.bi_subvol) { struct btree_iter dirent_iter; @@ -2864,7 +2911,9 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, /* Should never fail, checked by bch2_inode_invalid: */ struct bch_inode_unpacked u; - BUG_ON(bch2_inode_unpack(k, &u)); + _ret3 = bch2_inode_unpack(k, &u); + if (_ret3) + break; /* * Backpointer and directory structure checks are sufficient for @@ -2942,7 +2991,9 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite if (!bkey_is_inode(k.k)) return 0; - BUG_ON(bch2_inode_unpack(k, &u)); + ret = bch2_inode_unpack(k, &u); + if (ret) + return ret; if (S_ISDIR(u.bi_mode)) return 0; diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index d1fc44a7..04ec0520 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -48,10 +48,10 @@ static int inode_decode_field(const u8 *in, const u8 *end, u8 *p; if (in >= end) - return -1; + return -BCH_ERR_inode_unpack_error; if (!*in) - return -1; + return -BCH_ERR_inode_unpack_error; /* * position of highest set bit indicates number of bytes: @@ -61,7 +61,7 @@ static int inode_decode_field(const u8 *in, const u8 *end, bytes = byte_table[shift - 1]; if (in + bytes > end) - return -1; + return -BCH_ERR_inode_unpack_error; p = (u8 *) be + 16 - bytes; memcpy(p, in, bytes); @@ -177,7 +177,7 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, return ret; \ \ if (field_bits > sizeof(unpacked->_name) * 8) \ - return -1; \ + return -BCH_ERR_inode_unpack_error; \ \ unpacked->_name = field[1]; \ in += ret; @@ -218,7 +218,7 @@ static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, \ unpacked->_name = v[0]; \ if (v[1] || v[0] != unpacked->_name) \ - return -1; \ + return -BCH_ERR_inode_unpack_error; \ fieldnr++; BCH_INODE_FIELDS_v2() @@ -269,7 +269,7 @@ static int bch2_inode_unpack_v3(struct bkey_s_c k, \ unpacked->_name = v[0]; \ if (v[1] || v[0] != unpacked->_name) \ - return -1; \ + return -BCH_ERR_inode_unpack_error; \ fieldnr++; BCH_INODE_FIELDS_v3() @@ -886,7 +886,7 @@ bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *m { struct bch_fs *c = trans->c; - u64 cursor_idx = c->opts.shard_inode_numbers ? cpu : 0; + u64 cursor_idx = c->opts.inodes_32bit ? 0 : cpu + 1; cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits); @@ -907,19 +907,16 @@ bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *m if (ret) goto err; - cursor->v.bits = c->opts.shard_inode_numbers_bits; - - unsigned bits = (c->opts.inodes_32bit ? 31 : 63); - if (c->opts.shard_inode_numbers) { - bits -= cursor->v.bits; - - *min = (cpu << bits); - *max = (cpu << bits) | ~(ULLONG_MAX << bits); - - *min = max_t(u64, *min, BLOCKDEV_INODE_MAX); - } else { + if (c->opts.inodes_32bit) { *min = BLOCKDEV_INODE_MAX; - *max = ~(ULLONG_MAX << bits); + *max = INT_MAX; + } else { + cursor->v.bits = c->opts.shard_inode_numbers_bits; + + unsigned bits = 63 - c->opts.shard_inode_numbers_bits; + + *min = max(cpu << bits, (u64) INT_MAX + 1); + *max = (cpu << bits) | ~(ULLONG_MAX << bits); } if (le64_to_cpu(cursor->v.idx) < *min) diff --git a/libbcachefs/inode_format.h b/libbcachefs/inode_format.h index 1b93e189..b99a5bf1 100644 --- a/libbcachefs/inode_format.h +++ b/libbcachefs/inode_format.h @@ -102,7 +102,8 @@ struct bch_inode_generation { x(bi_subvol, 32) \ x(bi_parent_subvol, 32) \ x(bi_nocow, 8) \ - x(bi_depth, 32) + x(bi_depth, 32) \ + x(bi_inodes_32bit, 8) /* subset of BCH_INODE_FIELDS */ #define BCH_INODE_OPTS() \ @@ -115,7 +116,8 @@ struct bch_inode_generation { x(foreground_target, 16) \ x(background_target, 16) \ x(erasure_code, 16) \ - x(nocow, 8) + x(nocow, 8) \ + x(inodes_32bit, 8) enum inode_opt_id { #define x(name, ...) \ diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c index 20da357e..3e71860f 100644 --- a/libbcachefs/io_write.c +++ b/libbcachefs/io_write.c @@ -1356,6 +1356,9 @@ err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; + bch2_trans_put(trans); + darray_exit(&buckets); + if (ret) { struct printbuf buf = PRINTBUF; bch2_write_op_error(&buf, op); @@ -1366,9 +1369,6 @@ err: op->flags |= BCH_WRITE_SUBMITTED; } - bch2_trans_put(trans); - darray_exit(&buckets); - /* fallback to cow write path? */ if (!(op->flags & BCH_WRITE_SUBMITTED)) { closure_sync(&op->cl); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 7f2efe85..e1773ac2 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -1114,8 +1114,10 @@ reread: (printbuf_reset(&err), prt_str(&err, "journal "), bch2_csum_err_msg(&err, csum_type, j->csum, csum), - err.buf))) + err.buf))) { saw_bad = true; + bch2_fatal_error(c); + } ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), j->encrypted_start, diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 9b066c61..c493ea62 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -414,7 +414,9 @@ static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, continue; struct bch_inode_unpacked inode; - BUG_ON(bch2_inode_unpack(k, &inode)); + _ret3 = bch2_inode_unpack(k, &inode); + if (_ret3) + break; struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot }; bch2_inode_opts_get(&e.io_opts, trans->c, &inode); diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 2df5c8f7..e763d52e 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -222,15 +222,10 @@ enum fsck_err_opts { BCH_SB_ERASURE_CODE, false, \ NULL, "Enable erasure coding (DO NOT USE YET)") \ x(inodes_32bit, u8, \ - OPT_FS|OPT_FORMAT, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_INODE_32BIT, true, \ NULL, "Constrain inode numbers to 32 bits") \ - x(shard_inode_numbers, u8, \ - OPT_FS|OPT_FORMAT, \ - OPT_BOOL(), \ - BCH_SB_SHARD_INUMS, true, \ - NULL, "Shard new inode numbers by CPU id") \ x(shard_inode_numbers_bits, u8, \ OPT_FS|OPT_FORMAT, \ OPT_UINT(0, 8), \ diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 7849916e..98825437 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -107,6 +107,12 @@ out: return ret; } +static void kill_btree(struct bch_fs *c, enum btree_id btree) +{ + bch2_btree_id_root(c, btree)->alive = false; + bch2_shoot_down_journal_keys(c, btree, 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); +} + /* for -o reconstruct_alloc: */ static void bch2_reconstruct_alloc(struct bch_fs *c) { @@ -157,16 +163,9 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); - bch2_shoot_down_journal_keys(c, BTREE_ID_alloc, - 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers, - 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard, - 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - bch2_shoot_down_journal_keys(c, BTREE_ID_freespace, - 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - bch2_shoot_down_journal_keys(c, BTREE_ID_bucket_gens, - 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) + if (btree_id_is_alloc(i)) + kill_btree(c, i); } /* @@ -573,9 +572,6 @@ static int read_btree_roots(struct bch_fs *c) if (!r->alive) continue; - if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc) - continue; - printbuf_reset(&buf); bch2_btree_id_level_to_text(&buf, i, r->level); @@ -785,6 +781,11 @@ int bch2_fs_recovery(struct bch_fs *c) c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + if (c->sb.version_upgrade_complete < bcachefs_metadata_version_autofix_errors) { + SET_BCH_SB_ERROR_ACTION(c->disk_sb.sb, BCH_ON_ERROR_fix_safe); + write_sb = true; + } + if (write_sb) bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -882,15 +883,15 @@ use_clean: c->journal_replay_seq_start = last_seq; c->journal_replay_seq_end = blacklist_seq - 1; - if (c->opts.reconstruct_alloc) - bch2_reconstruct_alloc(c); - zero_out_btree_mem_ptr(&c->journal_keys); ret = journal_replay_early(c, clean); if (ret) goto err; + if (c->opts.reconstruct_alloc) + bch2_reconstruct_alloc(c); + /* * After an unclean shutdown, skip then next few journal sequence * numbers as they may have been referenced by btree writes that diff --git a/libbcachefs/recovery_passes_types.h b/libbcachefs/recovery_passes_types.h index f967b23d..71baad41 100644 --- a/libbcachefs/recovery_passes_types.h +++ b/libbcachefs/recovery_passes_types.h @@ -49,7 +49,7 @@ x(fs_upgrade_for_subvolumes, 22, 0) \ x(check_inodes, 24, PASS_FSCK) \ x(check_extents, 25, PASS_FSCK) \ - x(check_indirect_extents, 26, PASS_FSCK) \ + x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \ x(check_dirents, 27, PASS_FSCK) \ x(check_xattrs, 28, PASS_FSCK) \ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ diff --git a/libbcachefs/sb-counters_format.h b/libbcachefs/sb-counters_format.h index 62ea4782..fdcf598f 100644 --- a/libbcachefs/sb-counters_format.h +++ b/libbcachefs/sb-counters_format.h @@ -2,86 +2,91 @@ #ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H #define _BCACHEFS_SB_COUNTERS_FORMAT_H -#define BCH_PERSISTENT_COUNTERS() \ - x(io_read, 0) \ - x(io_write, 1) \ - x(io_move, 2) \ - x(bucket_invalidate, 3) \ - x(bucket_discard, 4) \ - x(bucket_alloc, 5) \ - x(bucket_alloc_fail, 6) \ - x(btree_cache_scan, 7) \ - x(btree_cache_reap, 8) \ - x(btree_cache_cannibalize, 9) \ - x(btree_cache_cannibalize_lock, 10) \ - x(btree_cache_cannibalize_lock_fail, 11) \ - x(btree_cache_cannibalize_unlock, 12) \ - x(btree_node_write, 13) \ - x(btree_node_read, 14) \ - x(btree_node_compact, 15) \ - x(btree_node_merge, 16) \ - x(btree_node_split, 17) \ - x(btree_node_rewrite, 18) \ - x(btree_node_alloc, 19) \ - x(btree_node_free, 20) \ - x(btree_node_set_root, 21) \ - x(btree_path_relock_fail, 22) \ - x(btree_path_upgrade_fail, 23) \ - x(btree_reserve_get_fail, 24) \ - x(journal_entry_full, 25) \ - x(journal_full, 26) \ - x(journal_reclaim_finish, 27) \ - x(journal_reclaim_start, 28) \ - x(journal_write, 29) \ - x(read_promote, 30) \ - x(read_bounce, 31) \ - x(read_split, 33) \ - x(read_retry, 32) \ - x(read_reuse_race, 34) \ - x(move_extent_read, 35) \ - x(move_extent_write, 36) \ - x(move_extent_finish, 37) \ - x(move_extent_fail, 38) \ - x(move_extent_start_fail, 39) \ - x(copygc, 40) \ - x(copygc_wait, 41) \ - x(gc_gens_end, 42) \ - x(gc_gens_start, 43) \ - x(trans_blocked_journal_reclaim, 44) \ - x(trans_restart_btree_node_reused, 45) \ - x(trans_restart_btree_node_split, 46) \ - x(trans_restart_fault_inject, 47) \ - x(trans_restart_iter_upgrade, 48) \ - x(trans_restart_journal_preres_get, 49) \ - x(trans_restart_journal_reclaim, 50) \ - x(trans_restart_journal_res_get, 51) \ - x(trans_restart_key_cache_key_realloced, 52) \ - x(trans_restart_key_cache_raced, 53) \ - x(trans_restart_mark_replicas, 54) \ - x(trans_restart_mem_realloced, 55) \ - x(trans_restart_memory_allocation_failure, 56) \ - x(trans_restart_relock, 57) \ - x(trans_restart_relock_after_fill, 58) \ - x(trans_restart_relock_key_cache_fill, 59) \ - x(trans_restart_relock_next_node, 60) \ - x(trans_restart_relock_parent_for_fill, 61) \ - x(trans_restart_relock_path, 62) \ - x(trans_restart_relock_path_intent, 63) \ - x(trans_restart_too_many_iters, 64) \ - x(trans_restart_traverse, 65) \ - x(trans_restart_upgrade, 66) \ - x(trans_restart_would_deadlock, 67) \ - x(trans_restart_would_deadlock_write, 68) \ - x(trans_restart_injected, 69) \ - x(trans_restart_key_cache_upgrade, 70) \ - x(trans_traverse_all, 71) \ - x(transaction_commit, 72) \ - x(write_super, 73) \ - x(trans_restart_would_deadlock_recursion_limit, 74) \ - x(trans_restart_write_buffer_flush, 75) \ - x(trans_restart_split_race, 76) \ - x(write_buffer_flush_slowpath, 77) \ - x(write_buffer_flush_sync, 78) +enum counters_flags { + TYPE_COUNTER = BIT(0), /* event counters */ + TYPE_SECTORS = BIT(1), /* amount counters, the unit is sectors */ +}; + +#define BCH_PERSISTENT_COUNTERS() \ + x(io_read, 0, TYPE_SECTORS) \ + x(io_write, 1, TYPE_SECTORS) \ + x(io_move, 2, TYPE_SECTORS) \ + x(bucket_invalidate, 3, TYPE_COUNTER) \ + x(bucket_discard, 4, TYPE_COUNTER) \ + x(bucket_alloc, 5, TYPE_COUNTER) \ + x(bucket_alloc_fail, 6, TYPE_COUNTER) \ + x(btree_cache_scan, 7, TYPE_COUNTER) \ + x(btree_cache_reap, 8, TYPE_COUNTER) \ + x(btree_cache_cannibalize, 9, TYPE_COUNTER) \ + x(btree_cache_cannibalize_lock, 10, TYPE_COUNTER) \ + x(btree_cache_cannibalize_lock_fail, 11, TYPE_COUNTER) \ + x(btree_cache_cannibalize_unlock, 12, TYPE_COUNTER) \ + x(btree_node_write, 13, TYPE_COUNTER) \ + x(btree_node_read, 14, TYPE_COUNTER) \ + x(btree_node_compact, 15, TYPE_COUNTER) \ + x(btree_node_merge, 16, TYPE_COUNTER) \ + x(btree_node_split, 17, TYPE_COUNTER) \ + x(btree_node_rewrite, 18, TYPE_COUNTER) \ + x(btree_node_alloc, 19, TYPE_COUNTER) \ + x(btree_node_free, 20, TYPE_COUNTER) \ + x(btree_node_set_root, 21, TYPE_COUNTER) \ + x(btree_path_relock_fail, 22, TYPE_COUNTER) \ + x(btree_path_upgrade_fail, 23, TYPE_COUNTER) \ + x(btree_reserve_get_fail, 24, TYPE_COUNTER) \ + x(journal_entry_full, 25, TYPE_COUNTER) \ + x(journal_full, 26, TYPE_COUNTER) \ + x(journal_reclaim_finish, 27, TYPE_COUNTER) \ + x(journal_reclaim_start, 28, TYPE_COUNTER) \ + x(journal_write, 29, TYPE_COUNTER) \ + x(read_promote, 30, TYPE_COUNTER) \ + x(read_bounce, 31, TYPE_COUNTER) \ + x(read_split, 33, TYPE_COUNTER) \ + x(read_retry, 32, TYPE_COUNTER) \ + x(read_reuse_race, 34, TYPE_COUNTER) \ + x(move_extent_read, 35, TYPE_SECTORS) \ + x(move_extent_write, 36, TYPE_SECTORS) \ + x(move_extent_finish, 37, TYPE_SECTORS) \ + x(move_extent_fail, 38, TYPE_COUNTER) \ + x(move_extent_start_fail, 39, TYPE_COUNTER) \ + x(copygc, 40, TYPE_COUNTER) \ + x(copygc_wait, 41, TYPE_COUNTER) \ + x(gc_gens_end, 42, TYPE_COUNTER) \ + x(gc_gens_start, 43, TYPE_COUNTER) \ + x(trans_blocked_journal_reclaim, 44, TYPE_COUNTER) \ + x(trans_restart_btree_node_reused, 45, TYPE_COUNTER) \ + x(trans_restart_btree_node_split, 46, TYPE_COUNTER) \ + x(trans_restart_fault_inject, 47, TYPE_COUNTER) \ + x(trans_restart_iter_upgrade, 48, TYPE_COUNTER) \ + x(trans_restart_journal_preres_get, 49, TYPE_COUNTER) \ + x(trans_restart_journal_reclaim, 50, TYPE_COUNTER) \ + x(trans_restart_journal_res_get, 51, TYPE_COUNTER) \ + x(trans_restart_key_cache_key_realloced, 52, TYPE_COUNTER) \ + x(trans_restart_key_cache_raced, 53, TYPE_COUNTER) \ + x(trans_restart_mark_replicas, 54, TYPE_COUNTER) \ + x(trans_restart_mem_realloced, 55, TYPE_COUNTER) \ + x(trans_restart_memory_allocation_failure, 56, TYPE_COUNTER) \ + x(trans_restart_relock, 57, TYPE_COUNTER) \ + x(trans_restart_relock_after_fill, 58, TYPE_COUNTER) \ + x(trans_restart_relock_key_cache_fill, 59, TYPE_COUNTER) \ + x(trans_restart_relock_next_node, 60, TYPE_COUNTER) \ + x(trans_restart_relock_parent_for_fill, 61, TYPE_COUNTER) \ + x(trans_restart_relock_path, 62, TYPE_COUNTER) \ + x(trans_restart_relock_path_intent, 63, TYPE_COUNTER) \ + x(trans_restart_too_many_iters, 64, TYPE_COUNTER) \ + x(trans_restart_traverse, 65, TYPE_COUNTER) \ + x(trans_restart_upgrade, 66, TYPE_COUNTER) \ + x(trans_restart_would_deadlock, 67, TYPE_COUNTER) \ + x(trans_restart_would_deadlock_write, 68, TYPE_COUNTER) \ + x(trans_restart_injected, 69, TYPE_COUNTER) \ + x(trans_restart_key_cache_upgrade, 70, TYPE_COUNTER) \ + x(trans_traverse_all, 71, TYPE_COUNTER) \ + x(transaction_commit, 72, TYPE_COUNTER) \ + x(write_super, 73, TYPE_COUNTER) \ + x(trans_restart_would_deadlock_recursion_limit, 74, TYPE_COUNTER) \ + x(trans_restart_write_buffer_flush, 75, TYPE_COUNTER) \ + x(trans_restart_split_race, 76, TYPE_COUNTER) \ + x(write_buffer_flush_slowpath, 77, TYPE_COUNTER) \ + x(write_buffer_flush_sync, 78, TYPE_COUNTER) enum bch_persistent_counters { #define x(t, n, ...) BCH_COUNTER_##t, diff --git a/libbcachefs/sb-downgrade.c b/libbcachefs/sb-downgrade.c index fe453e17..051214fd 100644 --- a/libbcachefs/sb-downgrade.c +++ b/libbcachefs/sb-downgrade.c @@ -83,7 +83,6 @@ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ BCH_FSCK_ERR_inode_has_child_snapshots_wrong) \ x(backpointer_bucket_gen, \ - BIT_ULL(BCH_RECOVERY_PASS_check_backpointers_to_extents)|\ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ BCH_FSCK_ERR_backpointer_to_missing_ptr, \ BCH_FSCK_ERR_ptr_to_missing_backpointer) \ diff --git a/libbcachefs/six.c b/libbcachefs/six.c index 617d07e5..537bf049 100644 --- a/libbcachefs/six.c +++ b/libbcachefs/six.c @@ -616,8 +616,6 @@ void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long if (type != SIX_LOCK_write) six_release(&lock->dep_map, ip); - else - lock->seq++; if (type == SIX_LOCK_intent && lock->intent_lock_recurse) { @@ -625,6 +623,15 @@ void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long return; } + if (type == SIX_LOCK_write && + lock->write_lock_recurse) { + --lock->write_lock_recurse; + return; + } + + if (type == SIX_LOCK_write) + lock->seq++; + do_six_unlock_type(lock, type); } EXPORT_SYMBOL_GPL(six_unlock_ip); @@ -735,13 +742,13 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type) atomic_add(l[type].lock_val, &lock->state); } break; + case SIX_LOCK_write: + lock->write_lock_recurse++; + fallthrough; case SIX_LOCK_intent: EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); lock->intent_lock_recurse++; break; - case SIX_LOCK_write: - BUG(); - break; } } EXPORT_SYMBOL_GPL(six_lock_increment); diff --git a/libbcachefs/six.h b/libbcachefs/six.h index 68d46fd7..c142e06b 100644 --- a/libbcachefs/six.h +++ b/libbcachefs/six.h @@ -137,6 +137,7 @@ struct six_lock { atomic_t state; u32 seq; unsigned intent_lock_recurse; + unsigned write_lock_recurse; struct task_struct *owner; unsigned __percpu *readers; raw_spinlock_t wait_lock; diff --git a/libbcachefs/snapshot.c b/libbcachefs/snapshot.c index 99f04551..cf6b3256 100644 --- a/libbcachefs/snapshot.c +++ b/libbcachefs/snapshot.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "bkey_buf.h" +#include "btree_cache.h" #include "btree_key_cache.h" #include "btree_update.h" #include "buckets.h" @@ -279,23 +280,6 @@ fsck_err: return ret; } -static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id) -{ - struct snapshot_t *t = snapshot_t_mut(c, id); - u32 parent = id; - - while ((parent = bch2_snapshot_parent_early(c, parent)) && - parent - id - 1 < IS_ANCESTOR_BITMAP) - __set_bit(parent - id - 1, t->is_ancestor); -} - -static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id) -{ - mutex_lock(&c->snapshot_table_lock); - __set_is_ancestor_bitmap(c, id); - mutex_unlock(&c->snapshot_table_lock); -} - static int __bch2_mark_snapshot(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s_c new, @@ -317,6 +301,7 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, if (new.k->type == KEY_TYPE_snapshot) { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); + t->live = true; t->parent = le32_to_cpu(s.v->parent); t->children[0] = le32_to_cpu(s.v->children[0]); t->children[1] = le32_to_cpu(s.v->children[1]); @@ -335,7 +320,11 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, t->skip[2] = 0; } - __set_is_ancestor_bitmap(c, id); + u32 parent = id; + + while ((parent = bch2_snapshot_parent_early(c, parent)) && + parent - id - 1 < IS_ANCESTOR_BITMAP) + __set_bit(parent - id - 1, t->is_ancestor); if (BCH_SNAPSHOT_DELETED(s.v)) { set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); @@ -365,70 +354,6 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, BTREE_ITER_with_updates, snapshot, s); } -static int bch2_snapshot_live(struct btree_trans *trans, u32 id) -{ - struct bch_snapshot v; - int ret; - - if (!id) - return 0; - - ret = bch2_snapshot_lookup(trans, id, &v); - if (bch2_err_matches(ret, ENOENT)) - bch_err(trans->c, "snapshot node %u not found", id); - if (ret) - return ret; - - return !BCH_SNAPSHOT_DELETED(&v); -} - -/* - * If @k is a snapshot with just one live child, it's part of a linear chain, - * which we consider to be an equivalence class: and then after snapshot - * deletion cleanup, there should only be a single key at a given position in - * this equivalence class. - * - * This sets the equivalence class of @k to be the child's equivalence class, if - * it's part of such a linear chain: this correctly sets equivalence classes on - * startup if we run leaf to root (i.e. in natural key order). - */ -static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - unsigned i, nr_live = 0, live_idx = 0; - struct bkey_s_c_snapshot snap; - u32 id = k.k->p.offset, child[2]; - - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - snap = bkey_s_c_to_snapshot(k); - - child[0] = le32_to_cpu(snap.v->children[0]); - child[1] = le32_to_cpu(snap.v->children[1]); - - for (i = 0; i < 2; i++) { - int ret = bch2_snapshot_live(trans, child[i]); - - if (ret < 0) - return ret; - - if (ret) - live_idx = i; - nr_live += ret; - } - - mutex_lock(&c->snapshot_table_lock); - - snapshot_t_mut(c, id)->equiv = nr_live == 1 - ? snapshot_t_mut(c, child[live_idx])->equiv - : id; - - mutex_unlock(&c->snapshot_table_lock); - - return 0; -} - /* fsck: */ static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child) @@ -570,6 +495,9 @@ static int check_snapshot_tree(struct btree_trans *trans, goto err; } + if (!st.v->master_subvol) + goto out; + ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; @@ -613,6 +541,7 @@ static int check_snapshot_tree(struct btree_trans *trans, u->v.master_subvol = cpu_to_le32(subvol_id); st = snapshot_tree_i_to_s_c(u); } +out: err: fsck_err: bch2_trans_iter_exit(trans, &snapshot_iter); @@ -913,7 +842,7 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) { struct bch_fs *c = trans->c; - if (bch2_snapshot_equiv(c, id)) + if (bch2_snapshot_exists(c, id)) return 0; /* Do we need to reconstruct the snapshot_tree entry as well? */ @@ -962,8 +891,7 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, - bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?: - bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i)); + bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0); } /* Figure out which snapshot nodes belong in the same tree: */ @@ -1061,7 +989,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) snapshot_id_list_to_text(&buf, t); darray_for_each(*t, id) { - if (fsck_err_on(!bch2_snapshot_equiv(c, *id), + if (fsck_err_on(!bch2_snapshot_exists(c, *id), trans, snapshot_node_missing, "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { if (t->nr > 1) { @@ -1094,10 +1022,12 @@ int bch2_check_key_has_snapshot(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret = 0; - if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), + if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot), trans, bkey_in_missing_snapshot, "key in missing snapshot %s, delete?", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + (bch2_btree_id_to_text(&buf, iter->btree_id), + prt_char(&buf, ' '), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node) ?: 1; fsck_err: @@ -1111,13 +1041,11 @@ fsck_err: int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) { struct btree_iter iter; - struct bkey_i_snapshot *s; - int ret = 0; - - s = bch2_bkey_get_mut_typed(trans, &iter, + struct bkey_i_snapshot *s = + bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), 0, snapshot); - ret = PTR_ERR_OR_ZERO(s); + int ret = PTR_ERR_OR_ZERO(s); if (unlikely(ret)) { bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, "missing snapshot %u", id); @@ -1305,10 +1233,6 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, goto err; new_snapids[i] = iter.pos.offset; - - mutex_lock(&c->snapshot_table_lock); - snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i]; - mutex_unlock(&c->snapshot_table_lock); } err: bch2_trans_iter_exit(trans, &iter); @@ -1414,102 +1338,95 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, * that key to snapshot leaf nodes, where we can mutate it */ -static int delete_dead_snapshots_process_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - snapshot_id_list *deleted, - snapshot_id_list *equiv_seen, - struct bpos *last_pos) -{ - int ret = bch2_check_key_has_snapshot(trans, iter, k); - if (ret) - return ret < 0 ? ret : 0; +struct snapshot_interior_delete { + u32 id; + u32 live_child; +}; +typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; - struct bch_fs *c = trans->c; - u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); - if (!equiv) /* key for invalid snapshot node, but we chose not to delete */ +static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id) +{ + darray_for_each(*l, i) + if (i->id == id) + return i->live_child; + return 0; +} + +static unsigned __live_child(struct snapshot_table *t, u32 id, + snapshot_id_list *delete_leaves, + interior_delete_list *delete_interior) +{ + struct snapshot_t *s = __snapshot_t(t, id); + if (!s) return 0; - if (!bkey_eq(k.k->p, *last_pos)) - equiv_seen->nr = 0; + for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) + if (s->children[i] && + !snapshot_list_has_id(delete_leaves, s->children[i]) && + !interior_delete_has_id(delete_interior, s->children[i])) + return s->children[i]; - if (snapshot_list_has_id(deleted, k.k->p.snapshot)) - return bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node); - - if (!bpos_eq(*last_pos, k.k->p) && - snapshot_list_has_id(equiv_seen, equiv)) - return bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node); - - *last_pos = k.k->p; - - ret = snapshot_list_add_nodup(c, equiv_seen, equiv); - if (ret) - return ret; - - /* - * When we have a linear chain of snapshot nodes, we consider - * those to form an equivalence class: we're going to collapse - * them all down to a single node, and keep the leaf-most node - - * which has the same id as the equivalence class id. - * - * If there are multiple keys in different snapshots at the same - * position, we're only going to keep the one in the newest - * snapshot (we delete the others above) - the rest have been - * overwritten and are redundant, and for the key we're going to keep we - * need to move it to the equivalance class ID if it's not there - * already. - */ - if (equiv != k.k->p.snapshot) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - int ret = PTR_ERR_OR_ZERO(new); - if (ret) - return ret; - - new->k.p.snapshot = equiv; - - struct btree_iter new_iter; - bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p, - BTREE_ITER_all_snapshots| - BTREE_ITER_cached| - BTREE_ITER_intent); - - ret = bch2_btree_iter_traverse(&new_iter) ?: - bch2_trans_update(trans, &new_iter, new, - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &new_iter); - if (ret) - return ret; + for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) { + u32 live_child = s->children[i] + ? __live_child(t, s->children[i], delete_leaves, delete_interior) + : 0; + if (live_child) + return live_child; } return 0; } -static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k) +static unsigned live_child(struct bch_fs *c, u32 id, + snapshot_id_list *delete_leaves, + interior_delete_list *delete_interior) { - struct bkey_s_c_snapshot snap; - u32 children[2]; - int ret; + rcu_read_lock(); + u32 ret = __live_child(rcu_dereference(c->snapshots), id, + delete_leaves, delete_interior); + rcu_read_unlock(); + return ret; +} - if (k.k->type != KEY_TYPE_snapshot) - return 0; +static int delete_dead_snapshots_process_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + snapshot_id_list *delete_leaves, + interior_delete_list *delete_interior) +{ + if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot)) + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node); - snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_DELETED(snap.v) || - BCH_SNAPSHOT_SUBVOL(snap.v)) - return 0; + u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot); + if (live_child) { + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + int ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; - children[0] = le32_to_cpu(snap.v->children[0]); - children[1] = le32_to_cpu(snap.v->children[1]); + new->k.p.snapshot = live_child; - ret = bch2_snapshot_live(trans, children[0]) ?: - bch2_snapshot_live(trans, children[1]); - if (ret < 0) + struct btree_iter dst_iter; + struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter, + iter->btree_id, new->k.p, + BTREE_ITER_all_snapshots| + BTREE_ITER_intent); + ret = bkey_err(dst_k); + if (ret) + return ret; + + ret = (bkey_deleted(dst_k.k) + ? bch2_trans_update(trans, &dst_iter, new, + BTREE_UPDATE_internal_snapshot_node) + : 0) ?: + bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node); + bch2_trans_iter_exit(trans, &dst_iter); return ret; - return !ret; + } + + return 0; } /* @@ -1517,26 +1434,57 @@ static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c * it doesn't have child snapshot nodes - it's now redundant and we can mark it * as deleted. */ -static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k) +static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k, + snapshot_id_list *delete_leaves, + interior_delete_list *delete_interior) { - int ret = bch2_snapshot_needs_delete(trans, k); + if (k.k->type != KEY_TYPE_snapshot) + return 0; - return ret <= 0 - ? ret - : bch2_snapshot_node_set_deleted(trans, k.k->p.offset); + struct bch_fs *c = trans->c; + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); + unsigned live_children = 0; + + if (BCH_SNAPSHOT_SUBVOL(s.v)) + return 0; + + for (unsigned i = 0; i < 2; i++) { + u32 child = le32_to_cpu(s.v->children[i]); + + live_children += child && + !snapshot_list_has_id(delete_leaves, child); + } + + if (live_children == 0) { + return snapshot_list_add(c, delete_leaves, s.k->p.offset); + } else if (live_children == 1) { + struct snapshot_interior_delete d = { + .id = s.k->p.offset, + .live_child = live_child(c, s.k->p.offset, delete_leaves, delete_interior), + }; + + if (!d.live_child) { + bch_err(c, "error finding live child of snapshot %u", d.id); + return -EINVAL; + } + + return darray_push(delete_interior, d); + } else { + return 0; + } } static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, - snapshot_id_list *skip) + interior_delete_list *skip) { rcu_read_lock(); - while (snapshot_list_has_id(skip, id)) + while (interior_delete_has_id(skip, id)) id = __bch2_snapshot_parent(c, id); while (n--) { do { id = __bch2_snapshot_parent(c, id); - } while (snapshot_list_has_id(skip, id)); + } while (interior_delete_has_id(skip, id)); } rcu_read_unlock(); @@ -1545,7 +1493,7 @@ static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, - snapshot_id_list *deleted) + interior_delete_list *deleted) { struct bch_fs *c = trans->c; u32 nr_deleted_ancestors = 0; @@ -1555,7 +1503,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, if (k.k->type != KEY_TYPE_snapshot) return 0; - if (snapshot_list_has_id(deleted, k.k->p.offset)) + if (interior_delete_has_id(deleted, k.k->p.offset)) return 0; s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot); @@ -1564,7 +1512,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, return ret; darray_for_each(*deleted, i) - nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i); + nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id); if (!nr_deleted_ancestors) return 0; @@ -1582,7 +1530,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) { u32 id = le32_to_cpu(s->v.skip[j]); - if (snapshot_list_has_id(deleted, id)) { + if (interior_delete_has_id(deleted, id)) { id = bch2_snapshot_nth_parent_skip(c, parent, depth > 1 @@ -1601,51 +1549,44 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, int bch2_delete_dead_snapshots(struct bch_fs *c) { - struct btree_trans *trans; - snapshot_id_list deleted = { 0 }; - snapshot_id_list deleted_interior = { 0 }; - int ret = 0; - if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) return 0; - trans = bch2_trans_get(c); + struct btree_trans *trans = bch2_trans_get(c); + snapshot_id_list delete_leaves = {}; + interior_delete_list delete_interior = {}; + int ret = 0; /* * For every snapshot node: If we have no live children and it's not * pointed to by a subvolume, delete it: */ - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - NULL, NULL, 0, - bch2_delete_redundant_snapshot(trans, k)); - bch_err_msg(c, ret, "deleting redundant snapshots"); - if (ret) - goto err; - - ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - bch2_snapshot_set_equiv(trans, k)); - bch_err_msg(c, ret, "in bch2_snapshots_set_equiv"); - if (ret) - goto err; - - ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ({ - if (k.k->type != KEY_TYPE_snapshot) - continue; - - BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v) - ? snapshot_list_add(c, &deleted, k.k->p.offset) - : 0; - })); + ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, + check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior)); bch_err_msg(c, ret, "walking snapshots"); if (ret) goto err; + if (!delete_leaves.nr && !delete_interior.nr) + goto err; + + { + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "deleting leaves"); + darray_for_each(delete_leaves, i) + prt_printf(&buf, " %u", *i); + + prt_printf(&buf, " interior"); + darray_for_each(delete_interior, i) + prt_printf(&buf, " %u->%u", i->id, i->live_child); + + ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf)); + printbuf_exit(&buf); + if (ret) + goto err; + } + for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { - struct bpos last_pos = POS_MIN; - snapshot_id_list equiv_seen = { 0 }; struct disk_reservation res = { 0 }; if (!btree_type_has_snapshots(btree)) @@ -1655,33 +1596,24 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) btree, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, - delete_dead_snapshots_process_key(trans, &iter, k, &deleted, - &equiv_seen, &last_pos)); + delete_dead_snapshots_process_key(trans, &iter, k, + &delete_leaves, + &delete_interior)); bch2_disk_reservation_put(c, &res); - darray_exit(&equiv_seen); bch_err_msg(c, ret, "deleting keys from dying snapshots"); if (ret) goto err; } - bch2_trans_unlock(trans); - down_write(&c->snapshot_create_lock); - - ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ({ - u32 snapshot = k.k->p.offset; - u32 equiv = bch2_snapshot_equiv(c, snapshot); - - equiv != snapshot - ? snapshot_list_add(c, &deleted_interior, snapshot) - : 0; - })); - - bch_err_msg(c, ret, "walking snapshots"); - if (ret) - goto err_create_lock; + darray_for_each(delete_leaves, i) { + ret = commit_do(trans, NULL, NULL, 0, + bch2_snapshot_node_delete(trans, *i)); + bch_err_msg(c, ret, "deleting snapshot %u", *i); + if (ret) + goto err; + } /* * Fixing children of deleted snapshots can't be done completely @@ -1691,30 +1623,20 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior)); + bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior)); if (ret) - goto err_create_lock; + goto err; - darray_for_each(deleted, i) { + darray_for_each(delete_interior, i) { ret = commit_do(trans, NULL, NULL, 0, - bch2_snapshot_node_delete(trans, *i)); - bch_err_msg(c, ret, "deleting snapshot %u", *i); + bch2_snapshot_node_delete(trans, i->id)); + bch_err_msg(c, ret, "deleting snapshot %u", i->id); if (ret) - goto err_create_lock; + goto err; } - - darray_for_each(deleted_interior, i) { - ret = commit_do(trans, NULL, NULL, 0, - bch2_snapshot_node_delete(trans, *i)); - bch_err_msg(c, ret, "deleting snapshot %u", *i); - if (ret) - goto err_create_lock; - } -err_create_lock: - up_write(&c->snapshot_create_lock); err: - darray_exit(&deleted_interior); - darray_exit(&deleted); + darray_exit(&delete_interior); + darray_exit(&delete_leaves); bch2_trans_put(trans); bch_err_fn(c, ret); return ret; @@ -1767,37 +1689,36 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, return ret; } +static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap) +{ + /* If there's one child, it's redundant and keys will be moved to the child */ + return !!snap.v->children[0] + !!snap.v->children[1] == 1; +} + static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k) { - struct bch_fs *c = trans->c; - struct bkey_s_c_snapshot snap; - int ret = 0; - if (k.k->type != KEY_TYPE_snapshot) return 0; - snap = bkey_s_c_to_snapshot(k); + struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k); if (BCH_SNAPSHOT_DELETED(snap.v) || - bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset || - (ret = bch2_snapshot_needs_delete(trans, k)) > 0) { - set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); - return 0; - } + interior_snapshot_needs_delete(snap)) + set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags); - return ret; + return 0; } int bch2_snapshots_read(struct bch_fs *c) { + /* + * Initializing the is_ancestor bitmaps requires ancestors to already be + * initialized - so mark in reverse: + */ int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, + for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots, + POS_MAX, 0, k, __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: - bch2_snapshot_set_equiv(trans, k) ?: - bch2_check_snapshot_needs_deletion(trans, k)) ?: - for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); + bch2_check_snapshot_needs_deletion(trans, k))); bch_err_fn(c, ret); /* diff --git a/libbcachefs/snapshot.h b/libbcachefs/snapshot.h index ae23d45f..00373cf3 100644 --- a/libbcachefs/snapshot.h +++ b/libbcachefs/snapshot.h @@ -119,19 +119,19 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) return id; } -static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) +static inline bool __bch2_snapshot_exists(struct bch_fs *c, u32 id) { const struct snapshot_t *s = snapshot_t(c, id); - return s ? s->equiv : 0; + return s ? s->live : 0; } -static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) +static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) { rcu_read_lock(); - id = __bch2_snapshot_equiv(c, id); + bool ret = __bch2_snapshot_exists(c, id); rcu_read_unlock(); - return id; + return ret; } static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) diff --git a/libbcachefs/str_hash.c b/libbcachefs/str_hash.c index c3276a7e..f5977c5c 100644 --- a/libbcachefs/str_hash.c +++ b/libbcachefs/str_hash.c @@ -101,38 +101,108 @@ static int hash_pick_winner(struct btree_trans *trans, } } -int bch2_str_hash_check_key(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c hash_k) +static int repair_inode_hash_info(struct btree_trans *trans, + struct bch_inode_unpacked *snapshot_root) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, + SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot - 1), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != snapshot_root->bi_inum) + break; + if (!bkey_is_inode(k.k)) + continue; + + struct bch_inode_unpacked inode; + ret = bch2_inode_unpack(k, &inode); + if (ret) + break; + + if (fsck_err_on(inode.bi_hash_seed != snapshot_root->bi_hash_seed || + INODE_STR_HASH(&inode) != INODE_STR_HASH(snapshot_root), + trans, inode_snapshot_mismatch, + "inode hash info in different snapshots don't match")) { + inode.bi_hash_seed = snapshot_root->bi_hash_seed; + SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root)); + ret = __bch2_fsck_write_inode(trans, &inode) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; + break; + } + } +fsck_err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* + * All versions of the same inode in different snapshots must have the same hash + * seed/type: verify that the hash info we're using matches the root + */ +static int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, + struct bch_hash_info *hash_info) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, U32_MAX), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inum) + break; + if (bkey_is_inode(k.k)) + goto found; + } + bch_err(c, "%s(): inum %llu not found", __func__, inum); + ret = -BCH_ERR_fsck_repair_unimplemented; + goto err; +found:; + struct bch_inode_unpacked inode; + ret = bch2_inode_unpack(k, &inode); + if (ret) + goto err; + + struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); + if (memcmp(hash_info, &hash2, sizeof(hash2))) { + ret = repair_inode_hash_info(trans, &inode); + if (!ret) { + bch_err(c, "inode hash info mismatch with root, but mismatch not found"); + ret = -BCH_ERR_fsck_repair_unimplemented; + } + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int __bch2_str_hash_check_key(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc *desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c hash_k) { struct bch_fs *c = trans->c; struct btree_iter iter = { NULL }; struct printbuf buf = PRINTBUF; struct bkey_s_c k; - u64 hash; int ret = 0; - if (hash_k.k->type != desc.key_type) - return 0; - - hash = desc.hash_bkey(hash_info, hash_k); - - if (likely(hash == hash_k.k->p.offset)) - return 0; - + u64 hash = desc->hash_bkey(hash_info, hash_k); if (hash_k.k->p.offset < hash) goto bad_hash; - for_each_btree_key_norestart(trans, iter, desc.btree_id, + for_each_btree_key_norestart(trans, iter, desc->btree_id, SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), BTREE_ITER_slots, k, ret) { if (bkey_eq(k.k->p, hash_k.k->p)) break; - if (k.k->type == desc.key_type && - !desc.cmp_bkey(k, hash_k)) + if (k.k->type == desc->key_type && + !desc->cmp_bkey(k, hash_k)) goto duplicate_entries; if (bkey_deleted(k.k)) { @@ -145,16 +215,23 @@ out: printbuf_exit(&buf); return ret; bad_hash: + /* + * Before doing any repair, check hash_info itself: + */ + ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info); + if (ret) + goto out; + if (fsck_err(trans, hash_table_key_wrong_offset, "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s", - bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, + bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k); if (IS_ERR(new)) return PTR_ERR(new); - k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, hash_info, + k = bch2_hash_set_or_get_in_snapshot(trans, &iter, *desc, hash_info, (subvol_inum) { 0, hash_k.k->p.inode }, hash_k.k->p.snapshot, new, STR_HASH_must_create| @@ -166,9 +243,9 @@ bad_hash: if (k.k) goto duplicate_entries; - ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, + ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, BTREE_UPDATE_internal_snapshot_node) ?: - bch2_fsck_update_backpointers(trans, s, desc, hash_info, new) ?: + bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: -BCH_ERR_transaction_restart_nested; goto out; @@ -176,7 +253,7 @@ bad_hash: fsck_err: goto out; duplicate_entries: - ret = hash_pick_winner(trans, desc, hash_info, hash_k, k); + ret = hash_pick_winner(trans, *desc, hash_info, hash_k, k); if (ret < 0) goto out; @@ -192,14 +269,14 @@ duplicate_entries: switch (ret) { case 0: - ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); + ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); break; case 1: - ret = bch2_hash_delete_at(trans, desc, hash_info, &iter, 0); + ret = bch2_hash_delete_at(trans, *desc, hash_info, &iter, 0); break; case 2: - ret = fsck_rename_dirent(trans, s, desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: - bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); + ret = fsck_rename_dirent(trans, s, *desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: + bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); goto out; } diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index 0c20f3af..55a4ac7b 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -394,10 +394,25 @@ int bch2_hash_delete(struct btree_trans *trans, } struct snapshots_seen; -int bch2_str_hash_check_key(struct btree_trans *, - struct snapshots_seen *, - const struct bch_hash_desc, - struct bch_hash_info *, - struct btree_iter *, struct bkey_s_c); +int __bch2_str_hash_check_key(struct btree_trans *, + struct snapshots_seen *, + const struct bch_hash_desc *, + struct bch_hash_info *, + struct btree_iter *, struct bkey_s_c); + +static inline int bch2_str_hash_check_key(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc *desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c hash_k) +{ + if (hash_k.k->type != desc->key_type) + return 0; + + if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset)) + return 0; + + return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k); +} #endif /* _BCACHEFS_STR_HASH_H */ diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c index 0e756e35..e3d04752 100644 --- a/libbcachefs/subvolume.c +++ b/libbcachefs/subvolume.c @@ -409,26 +409,56 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d */ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) { - struct btree_iter iter; - struct bkey_s_c_subvolume subvol; - u32 snapid; - int ret = 0; + struct btree_iter subvol_iter = {}, snapshot_iter = {}, snapshot_tree_iter = {}; - subvol = bch2_bkey_get_iter_typed(trans, &iter, + struct bkey_s_c_subvolume subvol = + bch2_bkey_get_iter_typed(trans, &subvol_iter, BTREE_ID_subvolumes, POS(0, subvolid), BTREE_ITER_cached|BTREE_ITER_intent, subvolume); - ret = bkey_err(subvol); + int ret = bkey_err(subvol); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, "missing subvolume %u", subvolid); if (ret) - return ret; + goto err; - snapid = le32_to_cpu(subvol.v->snapshot); + u32 snapid = le32_to_cpu(subvol.v->snapshot); - ret = bch2_btree_delete_at(trans, &iter, 0) ?: + struct bkey_s_c_snapshot snapshot = + bch2_bkey_get_iter_typed(trans, &snapshot_iter, + BTREE_ID_snapshots, POS(0, snapid), + 0, snapshot); + ret = bkey_err(subvol); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, + "missing snapshot %u", snapid); + if (ret) + goto err; + + u32 treeid = le32_to_cpu(snapshot.v->tree); + + struct bkey_s_c_snapshot_tree snapshot_tree = + bch2_bkey_get_iter_typed(trans, &snapshot_tree_iter, + BTREE_ID_snapshot_trees, POS(0, treeid), + 0, snapshot_tree); + + if (le32_to_cpu(snapshot_tree.v->master_subvol) == subvolid) { + struct bkey_i_snapshot_tree *snapshot_tree_mut = + bch2_bkey_make_mut_typed(trans, &snapshot_tree_iter, + &snapshot_tree.s_c, + 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(snapshot_tree_mut); + if (ret) + goto err; + + snapshot_tree_mut->v.master_subvol = 0; + } + + ret = bch2_btree_delete_at(trans, &subvol_iter, 0) ?: bch2_snapshot_node_set_deleted(trans, snapid); - bch2_trans_iter_exit(trans, &iter); +err: + bch2_trans_iter_exit(trans, &snapshot_tree_iter); + bch2_trans_iter_exit(trans, &snapshot_iter); + bch2_trans_iter_exit(trans, &subvol_iter); return ret; } diff --git a/libbcachefs/subvolume_types.h b/libbcachefs/subvolume_types.h index f2ec4277..1549d6da 100644 --- a/libbcachefs/subvolume_types.h +++ b/libbcachefs/subvolume_types.h @@ -9,13 +9,13 @@ typedef DARRAY(u32) snapshot_id_list; #define IS_ANCESTOR_BITMAP 128 struct snapshot_t { + bool live; u32 parent; u32 skip[3]; u32 depth; u32 children[2]; u32 subvol; /* Nonzero only if a subvolume points to this node: */ u32 tree; - u32 equiv; unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; }; diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index dbc09e30..8037ccba 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -1084,9 +1084,16 @@ int bch2_write_super(struct bch_fs *c) ": Superblock write was silently dropped! (seq %llu expected %llu)", le64_to_cpu(ca->sb_read_scratch->seq), ca->disk_sb.seq); - bch2_fs_fatal_error(c, "%s", buf.buf); + + if (c->opts.errors != BCH_ON_ERROR_continue && + c->opts.errors != BCH_ON_ERROR_fix_safe) { + ret = -BCH_ERR_erofs_sb_err; + bch2_fs_fatal_error(c, "%s", buf.buf); + } else { + bch_err(c, "%s", buf.buf); + } + printbuf_exit(&buf); - ret = -BCH_ERR_erofs_sb_err; } if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { diff --git a/libbcachefs/super.c b/libbcachefs/super.c index bd6aecea..d97ea7bd 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -563,6 +563,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_io_clock_exit(&c->io_clock[WRITE]); bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); + bch2_fs_btree_gc_exit(c); bch2_journal_keys_put_initial(c); bch2_find_btree_nodes_exit(&c->found_btree_nodes); BUG_ON(atomic_read(&c->journal_keys.ref)); @@ -770,13 +771,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) spin_lock_init(&c->recovery_pass_lock); sema_init(&c->online_fsck_mutex, 1); - init_rwsem(&c->gc_lock); - mutex_init(&c->gc_gens_lock); - for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); - bch2_fs_gc_init(c); bch2_fs_copygc_init(c); bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); bch2_fs_btree_iter_init_early(c); @@ -911,6 +908,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_btree_cache_init(c) ?: bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: bch2_fs_btree_interior_update_init(c) ?: + bch2_fs_btree_gc_init(c) ?: bch2_fs_buckets_waiting_for_journal_init(c) ?: bch2_fs_btree_write_buffer_init(c) ?: bch2_fs_subvolumes_init(c) ?: @@ -1306,8 +1304,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, init_completion(&ca->ref_completion); init_completion(&ca->io_ref_completion); - init_rwsem(&ca->bucket_lock); - INIT_WORK(&ca->io_error_work, bch2_io_error_work); bch2_time_stats_quantiles_init(&ca->io_latency[READ]); diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 97733c76..a7eb1f51 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -203,7 +203,6 @@ read_attribute(disk_groups); read_attribute(has_data); read_attribute(alloc_debug); -read_attribute(accounting); read_attribute(usage_base); #define x(t, n, ...) read_attribute(t); @@ -397,9 +396,6 @@ SHOW(bch2_fs) if (attr == &sysfs_alloc_debug) bch2_fs_alloc_debug_to_text(out, c); - if (attr == &sysfs_accounting) - bch2_fs_accounting_to_text(out, c); - if (attr == &sysfs_usage_base) bch2_fs_usage_base_to_text(out, c); @@ -509,15 +505,22 @@ SHOW(bch2_fs_counters) printbuf_tabstop_push(out, 32); - #define x(t, ...) \ + #define x(t, n, f, ...) \ if (attr == &sysfs_##t) { \ counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\ counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\ + if (f & TYPE_SECTORS) { \ + counter <<= 9; \ + counter_since_mount <<= 9; \ + } \ + \ prt_printf(out, "since mount:\t"); \ + (f & TYPE_COUNTER) ? prt_u64(out, counter_since_mount) :\ prt_human_readable_u64(out, counter_since_mount); \ prt_newline(out); \ \ prt_printf(out, "since filesystem creation:\t"); \ + (f & TYPE_COUNTER) ? prt_u64(out, counter) : \ prt_human_readable_u64(out, counter); \ prt_newline(out); \ } @@ -595,7 +598,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_disk_groups, &sysfs_alloc_debug, - &sysfs_accounting, &sysfs_usage_base, NULL }; diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h index 7baf66be..9d40b7d4 100644 --- a/libbcachefs/trace.h +++ b/libbcachefs/trace.h @@ -1338,6 +1338,12 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced, __entry->new_u64s) ); +DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + TRACE_EVENT(path_downgrade, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, @@ -1374,10 +1380,21 @@ TRACE_EVENT(path_downgrade, __entry->pos_snapshot) ); -DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) +TRACE_EVENT(key_cache_fill, + TP_PROTO(struct btree_trans *trans, const char *key), + TP_ARGS(trans, key), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __string(key, key ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __assign_str(key); + ), + + TP_printk("%s %s", __entry->trans_fn, __get_str(key)) ); TRACE_EVENT(write_buffer_flush, @@ -1436,6 +1453,24 @@ TRACE_EVENT(write_buffer_flush_slowpath, TP_printk("%zu/%zu", __entry->slowpath, __entry->total) ); +TRACE_EVENT(write_buffer_maybe_flush, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *key), + TP_ARGS(trans, caller_ip, key), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __string(key, key ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __assign_str(key); + ), + + TP_printk("%s %pS %s", __entry->trans_fn, (void *) __entry->caller_ip, __get_str(key)) +); + DEFINE_EVENT(fs_str, rebalance_extent, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) diff --git a/libbcachefs/util.h b/libbcachefs/util.h index c292b9ce..1a172011 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -55,6 +55,16 @@ static inline size_t buf_pages(void *p, size_t len) PAGE_SIZE); } +static inline void *bch2_kvmalloc(size_t n, gfp_t flags) +{ + void *p = unlikely(n >= INT_MAX) + ? vmalloc(n) + : kvmalloc(n, flags & ~__GFP_ZERO); + if (p && (flags & __GFP_ZERO)) + memset(p, 0, n); + return p; +} + #define init_heap(heap, _size, gfp) \ ({ \ (heap)->nr = 0; \ diff --git a/libbcachefs/varint.c b/libbcachefs/varint.c index 6a78553d..6620ecae 100644 --- a/libbcachefs/varint.c +++ b/libbcachefs/varint.c @@ -9,6 +9,7 @@ #include #endif +#include "errcode.h" #include "varint.h" /** @@ -53,7 +54,7 @@ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) u64 v; if (unlikely(in + bytes > end)) - return -1; + return -BCH_ERR_varint_decode_error; if (likely(bytes < 9)) { __le64 v_le = 0; @@ -115,7 +116,7 @@ int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out) unsigned bytes = ffz(*in) + 1; if (unlikely(in + bytes > end)) - return -1; + return -BCH_ERR_varint_decode_error; if (likely(bytes < 9)) { v >>= bytes;