From 8e02744a90673d9e4ea06cfa0ff123e7bada962f Mon Sep 17 00:00:00 2001 From: Kent Overstreet <kent.overstreet@linux.dev> Date: Fri, 29 Mar 2024 22:09:24 -0400 Subject: [PATCH] Update bcachefs sources to 794723fc10c4 bcachefs: Topology repair now uses nodes found by scanning to fill holes Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev> --- .bcachefs_revision | 2 +- c_src/cmd_list_journal.c | 2 +- include/linux/bio.h | 4 + include/linux/bitmap.h | 1 + include/linux/bitops.h | 1 - include/linux/kernel.h | 3 - include/linux/rcupdate.h | 1 + include/linux/slab.h | 5 + libbcachefs/alloc_background.c | 15 +- libbcachefs/alloc_foreground.c | 10 +- libbcachefs/backpointers.c | 13 +- libbcachefs/backpointers.h | 32 +- libbcachefs/bcachefs.h | 9 +- libbcachefs/bset.c | 14 +- libbcachefs/bset.h | 2 + libbcachefs/btree_cache.c | 5 +- libbcachefs/btree_gc.c | 508 +++++++++--------- libbcachefs/btree_io.c | 26 +- libbcachefs/btree_iter.c | 52 +- libbcachefs/btree_journal_iter.c | 48 +- libbcachefs/btree_journal_iter.h | 8 +- libbcachefs/btree_key_cache.c | 2 +- libbcachefs/btree_node_scan.c | 495 +++++++++++++++++ libbcachefs/btree_node_scan.h | 11 + libbcachefs/btree_node_scan_types.h | 30 ++ libbcachefs/btree_trans_commit.c | 2 +- libbcachefs/btree_update.c | 6 + libbcachefs/btree_update_interior.c | 207 ++++--- libbcachefs/btree_update_interior.h | 5 +- libbcachefs/btree_write_buffer.c | 16 +- libbcachefs/buckets.c | 18 +- libbcachefs/chardev.c | 2 +- libbcachefs/data_update.c | 9 + libbcachefs/debug.c | 2 +- libbcachefs/ec.c | 6 +- libbcachefs/errcode.h | 3 +- libbcachefs/error.c | 2 +- libbcachefs/error.h | 10 +- libbcachefs/extents.c | 61 ++- libbcachefs/extents.h | 31 +- libbcachefs/eytzinger.c | 234 ++++++++ libbcachefs/eytzinger.h | 63 ++- libbcachefs/fs-io-direct.c | 4 +- libbcachefs/fs.c | 4 +- libbcachefs/fsck.c | 53 +- libbcachefs/inode.c | 2 +- libbcachefs/io_misc.c | 2 + libbcachefs/journal.c | 12 +- libbcachefs/journal_io.c | 15 +- libbcachefs/journal_seq_blacklist.c | 3 +- libbcachefs/logged_ops.c | 4 +- libbcachefs/movinggc.c | 3 +- libbcachefs/opts.h | 4 +- libbcachefs/recovery.c | 353 +++--------- libbcachefs/recovery.h | 32 +- libbcachefs/recovery_passes.c | 204 +++++++ libbcachefs/recovery_passes.h | 16 + ...covery_types.h => recovery_passes_types.h} | 12 +- libbcachefs/reflink.c | 3 +- libbcachefs/replicas.c | 19 +- libbcachefs/sb-downgrade.c | 2 +- libbcachefs/sb-errors_types.h | 5 +- libbcachefs/snapshot.c | 61 ++- libbcachefs/snapshot.h | 63 +-- libbcachefs/subvolume.c | 72 +++ libbcachefs/subvolume.h | 3 + libbcachefs/subvolume_types.h | 2 + libbcachefs/super-io.c | 10 +- libbcachefs/super.c | 4 + libbcachefs/util.c | 143 ----- libbcachefs/util.h | 7 +- linux/bio.c | 14 + 72 files changed, 2043 insertions(+), 1064 deletions(-) create mode 100644 libbcachefs/btree_node_scan.c create mode 100644 libbcachefs/btree_node_scan.h create mode 100644 libbcachefs/btree_node_scan_types.h create mode 100644 libbcachefs/eytzinger.c create mode 100644 libbcachefs/recovery_passes.c create mode 100644 libbcachefs/recovery_passes.h rename libbcachefs/{recovery_types.h => recovery_passes_types.h} (92%) diff --git a/.bcachefs_revision b/.bcachefs_revision index c9361961..9abb2e20 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -83338f5b2cb8406cda8bf7be3f566ab97c696917 +794723fc10c4a1ff28d4b11c436277ba783f47e6 diff --git a/c_src/cmd_list_journal.c b/c_src/cmd_list_journal.c index 38ec8a28..9e1427d1 100644 --- a/c_src/cmd_list_journal.c +++ b/c_src/cmd_list_journal.c @@ -223,7 +223,7 @@ int cmd_list_journal(int argc, char *argv[]) opt_set(opts, degraded, true); opt_set(opts, errors, BCH_ON_ERROR_continue); opt_set(opts, fix_errors, FSCK_FIX_yes); - opt_set(opts, keep_journal, true); + opt_set(opts, retain_recovery_info ,true); opt_set(opts, read_journal_only,true); while ((opt = getopt_long(argc, argv, "an:t:k:vh", diff --git a/include/linux/bio.h b/include/linux/bio.h index 1f8acca2..b3755406 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -235,6 +235,7 @@ enum { struct bio *bio_alloc_bioset(struct block_device *, unsigned, blk_opf_t, gfp_t, struct bio_set *); + extern void bio_put(struct bio *); int bio_add_page(struct bio *, struct page *, unsigned, unsigned); @@ -242,6 +243,9 @@ int bio_add_page(struct bio *, struct page *, unsigned, unsigned); struct bio *bio_alloc_clone(struct block_device *, struct bio *, gfp_t, struct bio_set *); +struct bio *bio_alloc(struct block_device *, unsigned, + blk_opf_t, gfp_t); + struct bio *bio_kmalloc(unsigned int, gfp_t); extern void bio_endio(struct bio *); diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index db2dfdb2..5192c3e1 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -3,6 +3,7 @@ #include <string.h> #include <linux/bitops.h> +#include <linux/kernel.h> #include <stdlib.h> #define DECLARE_BITMAP(name,bits) \ diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 758476b1..93739532 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -2,7 +2,6 @@ #define _TOOLS_LINUX_BITOPS_H_ #include <asm/types.h> -#include <linux/kernel.h> #include <linux/compiler.h> #include <linux/page.h> diff --git a/include/linux/kernel.h b/include/linux/kernel.h index ef0b1a7d..8f1cbc2d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -111,9 +111,6 @@ #define struct_group(NAME, MEMBERS...) \ __struct_group(/* no tag */, NAME, /* no attrs */, MEMBERS) -#define swap(a, b) \ - do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) - /* This counts to 12. Any more, it will return 13th argument. */ #define __COUNT_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n #define COUNT_ARGS(X...) __COUNT_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f5260270..6b7785e6 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -13,6 +13,7 @@ #define kfree_rcu(ptr, rcu_head) kfree(ptr) /* XXX */ #define kfree_rcu_mightsleep(ptr) kfree(ptr) /* XXX */ +#define kvfree_rcu(ptr, rcu_head) kfree(ptr) /* XXX */ #define kvfree_rcu_mightsleep(ptr) kfree(ptr) /* XXX */ #define RCU_INIT_POINTER(p, v) WRITE_ONCE(p, v) diff --git a/include/linux/slab.h b/include/linux/slab.h index ca0c7934..35f7b1ba 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -20,6 +20,11 @@ #define ARCH_KMALLOC_MINALIGN 16 #define KMALLOC_MAX_SIZE SIZE_MAX +static inline size_t kmalloc_size_roundup(size_t s) +{ + return roundup_pow_of_two(s); +} + static inline void *kmalloc_noprof(size_t size, gfp_t flags) { unsigned i; diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index c47f72f2..893e38f9 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -532,13 +532,13 @@ int bch2_bucket_gens_init(struct bch_fs *c) u8 gen = bch2_alloc_to_v4(k, &a)->gen; unsigned offset; struct bpos pos = alloc_gens_pos(iter.pos, &offset); + int ret2 = 0; if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) { - ret = commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); - if (ret) - break; + ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret2) + goto iter_err; have_bucket_gens_key = false; } @@ -549,7 +549,8 @@ int bch2_bucket_gens_init(struct bch_fs *c) } g.v.gens[offset] = gen; - 0; +iter_err: + ret2; })); if (have_bucket_gens_key && !ret) @@ -852,7 +853,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, bucket_journal_seq); if (ret) { bch2_fs_fatal_error(c, - "error setting bucket_needs_journal_commit: %i", ret); + "setting bucket_needs_journal_commit: %s", bch2_err_str(ret)); return ret; } } diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index ca58193d..214b15c8 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -1356,15 +1356,17 @@ retry: /* Don't retry from all devices if we're out of open buckets: */ if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) { - int ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, + int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, target, erasure_code, nr_replicas, &nr_effective, &have_cache, watermark, flags, cl); - if (!ret || - bch2_err_matches(ret, BCH_ERR_transaction_restart) || - bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) + if (!ret2 || + bch2_err_matches(ret2, BCH_ERR_transaction_restart) || + bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) { + ret = ret2; goto alloc_done; + } } /* diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c index 8cb35ea5..762c8ddf 100644 --- a/libbcachefs/backpointers.c +++ b/libbcachefs/backpointers.c @@ -29,8 +29,7 @@ static bool extent_matches_bp(struct bch_fs *c, if (p.ptr.cached) continue; - bch2_extent_ptr_to_bp(c, btree_id, level, k, p, - &bucket2, &bp2); + bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bucket2, &bp2); if (bpos_eq(bucket, bucket2) && !memcmp(&bp, &bp2, sizeof(bp))) return true; @@ -44,6 +43,11 @@ int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + + /* these will be caught by fsck */ + if (!bch2_dev_exists2(c, bp.k->p.inode)) + return 0; + struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); int ret = 0; @@ -378,7 +382,7 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ backpointer_to_missing_alloc, "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", alloc_iter.pos.inode, alloc_iter.pos.offset, - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = bch2_btree_delete_at(trans, bp_iter, 0); goto out; } @@ -502,8 +506,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans, if (p.ptr.cached) continue; - bch2_extent_ptr_to_bp(c, btree, level, - k, p, &bucket_pos, &bp); + bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bucket_pos, &bp); ret = check_bp_exists(trans, s, bucket_pos, bp, k); if (ret) diff --git a/libbcachefs/backpointers.h b/libbcachefs/backpointers.h index 327365a9..da012ca7 100644 --- a/libbcachefs/backpointers.h +++ b/libbcachefs/backpointers.h @@ -90,20 +90,40 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i); } -static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level, - struct bkey_s_c k, struct extent_ptr_decoded p) +static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, + struct extent_ptr_decoded p, + const union bch_extent_entry *entry) { - return level ? BCH_DATA_btree : - p.has_ec ? BCH_DATA_stripe : - BCH_DATA_user; + switch (k.k->type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: + return BCH_DATA_btree; + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: + return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user; + case KEY_TYPE_stripe: { + const struct bch_extent_ptr *ptr = &entry->ptr; + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + + BUG_ON(ptr < s.v->ptrs || + ptr >= s.v->ptrs + s.v->nr_blocks); + + return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant + ? BCH_DATA_parity + : BCH_DATA_user; + } + default: + BUG(); + } } static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, + const union bch_extent_entry *entry, struct bpos *bucket_pos, struct bch_backpointer *bp) { - enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); s64 sectors = level ? btree_sectors(c) : k.k->size; u32 bucket_offset; diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 339dc3e1..f129fc4f 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -209,7 +209,7 @@ #include "fifo.h" #include "nocow_locking_types.h" #include "opts.h" -#include "recovery_types.h" +#include "recovery_passes_types.h" #include "sb-errors_types.h" #include "seqmutex.h" #include "time_stats.h" @@ -456,6 +456,7 @@ enum bch_time_stats { #include "alloc_types.h" #include "btree_types.h" +#include "btree_node_scan_types.h" #include "btree_write_buffer_types.h" #include "buckets_types.h" #include "buckets_waiting_for_journal_types.h" @@ -810,7 +811,6 @@ struct bch_fs { /* snapshot.c: */ struct snapshot_table __rcu *snapshots; - size_t snapshot_table_size; struct mutex snapshot_table_lock; struct rw_semaphore snapshot_create_lock; @@ -826,6 +826,7 @@ struct bch_fs { struct btree_root btree_roots_known[BTREE_ID_NR]; DARRAY(struct btree_root) btree_roots_extra; struct mutex btree_root_lock; + unsigned long btrees_lost_data; /* bitmask */ struct btree_cache btree_cache; @@ -849,6 +850,8 @@ struct bch_fs { struct workqueue_struct *btree_interior_update_worker; struct work_struct btree_interior_update_work; + struct workqueue_struct *btree_node_rewrite_worker; + struct list_head pending_node_rewrites; struct mutex pending_node_rewrites_lock; @@ -1102,6 +1105,8 @@ struct bch_fs { struct journal_keys journal_keys; struct list_head journal_iters; + struct find_btree_nodes found_btree_nodes; + u64 last_bucket_seq_cleanup; u64 counters_on_mount[BCH_COUNTER_NR]; diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index 3fd1085b..3bb47784 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -134,18 +134,24 @@ void bch2_dump_btree_node_iter(struct btree *b, printbuf_exit(&buf); } -#ifdef CONFIG_BCACHEFS_DEBUG - -void __bch2_verify_btree_nr_keys(struct btree *b) +struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b) { struct bset_tree *t; struct bkey_packed *k; - struct btree_nr_keys nr = { 0 }; + struct btree_nr_keys nr = {}; for_each_bset(b, t) bset_tree_for_each_key(b, t, k) if (!bkey_deleted(k)) btree_keys_account_key_add(&nr, t - b->set, k); + return nr; +} + +#ifdef CONFIG_BCACHEFS_DEBUG + +void __bch2_verify_btree_nr_keys(struct btree *b) +{ + struct btree_nr_keys nr = bch2_btree_node_count_keys(b); BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); } diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h index 79c77baa..120a79fd 100644 --- a/libbcachefs/bset.h +++ b/libbcachefs/bset.h @@ -458,6 +458,8 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, /* Accounting: */ +struct btree_nr_keys bch2_btree_node_count_keys(struct btree *); + static inline void btree_keys_account_key(struct btree_nr_keys *n, unsigned bset, struct bkey_packed *k, diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 562561a9..84474324 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -808,7 +808,8 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) prt_printf(&buf, "\nmax "); bch2_bpos_to_text(&buf, b->data->max_key); - bch2_fs_inconsistent(c, "%s", buf.buf); + bch2_fs_topology_error(c, "%s", buf.buf); + printbuf_exit(&buf); } @@ -1134,6 +1135,8 @@ void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) b = btree_cache_find(bc, k); if (!b) return; + + BUG_ON(b == btree_node_root(trans->c, b)); wait_on_io: /* not allowed to wait on io with btree locks held: */ diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 584aee70..6280da12 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -7,11 +7,13 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "backpointers.h" #include "bkey_methods.h" #include "bkey_buf.h" #include "btree_journal_iter.h" #include "btree_key_cache.h" #include "btree_locking.h" +#include "btree_node_scan.h" #include "btree_update_interior.h" #include "btree_io.h" #include "btree_gc.h" @@ -24,7 +26,7 @@ #include "journal.h" #include "keylist.h" #include "move.h" -#include "recovery.h" +#include "recovery_passes.h" #include "reflink.h" #include "replicas.h" #include "super-io.h" @@ -40,6 +42,7 @@ #define DROP_THIS_NODE 10 #define DROP_PREV_NODE 11 +#define DID_FILL_FROM_SCAN 12 static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k) { @@ -70,90 +73,6 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) __gc_pos_set(c, new_pos); } -/* - * Missing: if an interior btree node is empty, we need to do something - - * perhaps just kill it - */ -static int bch2_gc_check_topology(struct bch_fs *c, - struct btree *b, - struct bkey_buf *prev, - struct bkey_buf cur, - bool is_last) -{ - struct bpos node_start = b->data->min_key; - struct bpos node_end = b->data->max_key; - struct bpos expected_start = bkey_deleted(&prev->k->k) - ? node_start - : bpos_successor(prev->k->k.p); - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; - int ret = 0; - - if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { - struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); - - if (!bpos_eq(expected_start, bp->v.min_key)) { - bch2_topology_error(c); - - if (bkey_deleted(&prev->k->k)) { - prt_printf(&buf1, "start of node: "); - bch2_bpos_to_text(&buf1, node_start); - } else { - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k)); - } - bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k)); - - if (__fsck_err(c, - FSCK_CAN_FIX| - FSCK_CAN_IGNORE| - FSCK_NO_RATELIMIT, - btree_node_topology_bad_min_key, - "btree node with incorrect min_key at btree %s level %u:\n" - " prev %s\n" - " cur %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) { - bch_info(c, "Halting mark and sweep to start topology repair pass"); - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); - goto err; - } else { - set_bit(BCH_FS_initial_gc_unfixed, &c->flags); - } - } - } - - if (is_last && !bpos_eq(cur.k->k.p, node_end)) { - bch2_topology_error(c); - - printbuf_reset(&buf1); - printbuf_reset(&buf2); - - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k)); - bch2_bpos_to_text(&buf2, node_end); - - if (__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE|FSCK_NO_RATELIMIT, - btree_node_topology_bad_max_key, - "btree node with incorrect max_key at btree %s level %u:\n" - " %s\n" - " expected %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf) && - should_restart_for_topology_repair(c)) { - bch_info(c, "Halting mark and sweep to start topology repair pass"); - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); - goto err; - } else { - set_bit(BCH_FS_initial_gc_unfixed, &c->flags); - } - } - - bch2_bkey_buf_copy(prev, c, cur.k); -err: -fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); - return ret; -} - static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) { switch (b->key.k.type) { @@ -212,6 +131,17 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) struct bkey_i_btree_ptr_v2 *new; int ret; + if (c->opts.verbose) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, " -> "); + bch2_bpos_to_text(&buf, new_min); + + bch_info(c, "%s(): %s", __func__, buf.buf); + printbuf_exit(&buf); + } + new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); if (!new) return -BCH_ERR_ENOMEM_gc_repair_key; @@ -237,6 +167,17 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) struct bkey_i_btree_ptr_v2 *new; int ret; + if (c->opts.verbose) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, " -> "); + bch2_bpos_to_text(&buf, new_max); + + bch_info(c, "%s(): %s", __func__, buf.buf); + printbuf_exit(&buf); + } + ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p); if (ret) return ret; @@ -268,127 +209,138 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) return 0; } -static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, - struct btree *prev, struct btree *cur) +static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b, + struct btree *prev, struct btree *cur, + struct bpos *pulled_from_scan) { struct bpos expected_start = !prev ? b->data->min_key : bpos_successor(prev->key.k.p); - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + struct printbuf buf = PRINTBUF; int ret = 0; - if (!prev) { - prt_printf(&buf1, "start of node: "); - bch2_bpos_to_text(&buf1, b->data->min_key); - } else { - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key)); + BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && + !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, + b->data->min_key)); + + if (bpos_eq(expected_start, cur->data->min_key)) + return 0; + + prt_printf(&buf, " at btree %s level %u:\n parent: ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + if (prev) { + prt_printf(&buf, "\n prev: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key)); } - bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key)); + prt_str(&buf, "\n next: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key)); - if (prev && - bpos_gt(expected_start, cur->data->min_key) && - BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { - /* cur overwrites prev: */ + if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */ + if (b->c.level == 1 && + bpos_lt(*pulled_from_scan, cur->data->min_key)) { + ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0, + expected_start, + bpos_predecessor(cur->data->min_key)); + if (ret) + goto err; - if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key, - cur->data->min_key), c, - btree_node_topology_overwritten_by_next_node, - "btree node overwritten by next node at btree %s level %u:\n" - " node %s\n" - " next %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) { - ret = DROP_PREV_NODE; - goto out; + *pulled_from_scan = cur->data->min_key; + ret = DID_FILL_FROM_SCAN; + } else { + if (mustfix_fsck_err(c, btree_node_topology_bad_min_key, + "btree node with incorrect min_key%s", buf.buf)) + ret = set_node_min(c, cur, expected_start); } - - if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p, - bpos_predecessor(cur->data->min_key)), c, - btree_node_topology_bad_max_key, - "btree node with incorrect max_key at btree %s level %u:\n" - " node %s\n" - " next %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) - ret = set_node_max(c, prev, - bpos_predecessor(cur->data->min_key)); - } else { - /* prev overwrites cur: */ - - if (mustfix_fsck_err_on(bpos_ge(expected_start, - cur->data->max_key), c, - btree_node_topology_overwritten_by_prev_node, - "btree node overwritten by prev node at btree %s level %u:\n" - " prev %s\n" - " node %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) { - ret = DROP_THIS_NODE; - goto out; + } else { /* overlap */ + if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */ + if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */ + if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_next_node, + "btree node overwritten by next node%s", buf.buf)) + ret = DROP_PREV_NODE; + } else { + if (mustfix_fsck_err(c, btree_node_topology_bad_max_key, + "btree node with incorrect max_key%s", buf.buf)) + ret = set_node_max(c, prev, + bpos_predecessor(cur->data->min_key)); + } + } else { + if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */ + if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_prev_node, + "btree node overwritten by prev node%s", buf.buf)) + ret = DROP_THIS_NODE; + } else { + if (mustfix_fsck_err(c, btree_node_topology_bad_min_key, + "btree node with incorrect min_key%s", buf.buf)) + ret = set_node_min(c, cur, expected_start); + } } - - if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c, - btree_node_topology_bad_min_key, - "btree node with incorrect min_key at btree %s level %u:\n" - " prev %s\n" - " node %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) - ret = set_node_min(c, cur, expected_start); } -out: +err: fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); + printbuf_exit(&buf); return ret; } static int btree_repair_node_end(struct bch_fs *c, struct btree *b, - struct btree *child) + struct btree *child, struct bpos *pulled_from_scan) { - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + struct printbuf buf = PRINTBUF; int ret = 0; - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key)); - bch2_bpos_to_text(&buf2, b->key.k.p); + if (bpos_eq(child->key.k.p, b->key.k.p)) + return 0; - if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c, - btree_node_topology_bad_max_key, - "btree node with incorrect max_key at btree %s level %u:\n" - " %s\n" - " expected %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) { - ret = set_node_max(c, child, b->key.k.p); - if (ret) - goto err; + prt_printf(&buf, "at btree %s level %u:\n parent: ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + prt_str(&buf, "\n child: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key)); + + if (mustfix_fsck_err(c, btree_node_topology_bad_max_key, + "btree node with incorrect max_key%s", buf.buf)) { + if (b->c.level == 1 && + bpos_lt(*pulled_from_scan, b->key.k.p)) { + ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0, + bpos_successor(child->key.k.p), b->key.k.p); + if (ret) + goto err; + + *pulled_from_scan = b->key.k.p; + ret = DID_FILL_FROM_SCAN; + } else { + ret = set_node_max(c, child, b->key.k.p); + } } err: fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); + printbuf_exit(&buf); return ret; } -static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b) +static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b, + struct bpos *pulled_from_scan) { struct bch_fs *c = trans->c; struct btree_and_journal_iter iter; struct bkey_s_c k; struct bkey_buf prev_k, cur_k; struct btree *prev = NULL, *cur = NULL; - bool have_child, dropped_children = false; + bool have_child, new_pass = false; struct printbuf buf = PRINTBUF; int ret = 0; if (!b->c.level) return 0; -again: - prev = NULL; - have_child = dropped_children = false; + bch2_bkey_buf_init(&prev_k); bch2_bkey_buf_init(&cur_k); +again: + cur = prev = NULL; + have_child = new_pass = false; bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); iter.prefetch = true; @@ -415,9 +367,10 @@ again: b->c.level - 1, buf.buf)) { bch2_btree_node_evict(trans, cur_k.k); - ret = bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, cur_k.k->k.p); cur = NULL; + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: + bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); if (ret) break; continue; @@ -427,7 +380,23 @@ again: if (ret) break; - ret = btree_repair_node_boundaries(c, b, prev, cur); + if (bch2_btree_node_is_stale(c, cur)) { + bch_info(c, "btree node %s older than nodes found by scanning", buf.buf); + six_unlock_read(&cur->c.lock); + bch2_btree_node_evict(trans, cur_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); + cur = NULL; + if (ret) + break; + continue; + } + + ret = btree_check_node_boundaries(c, b, prev, cur, pulled_from_scan); + if (ret == DID_FILL_FROM_SCAN) { + new_pass = true; + ret = 0; + } if (ret == DROP_THIS_NODE) { six_unlock_read(&cur->c.lock); @@ -445,6 +414,7 @@ again: prev = NULL; if (ret == DROP_PREV_NODE) { + bch_info(c, "dropped prev node"); bch2_btree_node_evict(trans, prev_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, prev_k.k->k.p); @@ -452,8 +422,6 @@ again: break; bch2_btree_and_journal_iter_exit(&iter); - bch2_bkey_buf_exit(&prev_k, c); - bch2_bkey_buf_exit(&cur_k, c); goto again; } else if (ret) break; @@ -465,7 +433,11 @@ again: if (!ret && !IS_ERR_OR_NULL(prev)) { BUG_ON(cur); - ret = btree_repair_node_end(c, b, prev); + ret = btree_repair_node_end(c, b, prev, pulled_from_scan); + if (ret == DID_FILL_FROM_SCAN) { + new_pass = true; + ret = 0; + } } if (!IS_ERR_OR_NULL(prev)) @@ -479,6 +451,10 @@ again: goto err; bch2_btree_and_journal_iter_exit(&iter); + + if (new_pass) + goto again; + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); iter.prefetch = true; @@ -495,7 +471,7 @@ again: if (ret) goto err; - ret = bch2_btree_repair_topology_recurse(trans, cur); + ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan); six_unlock_read(&cur->c.lock); cur = NULL; @@ -503,7 +479,7 @@ again: bch2_btree_node_evict(trans, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); - dropped_children = true; + new_pass = true; } if (ret) @@ -530,12 +506,14 @@ fsck_err: six_unlock_read(&cur->c.lock); bch2_btree_and_journal_iter_exit(&iter); - bch2_bkey_buf_exit(&prev_k, c); - bch2_bkey_buf_exit(&cur_k, c); - if (!ret && dropped_children) + if (!ret && new_pass) goto again; + BUG_ON(!ret && bch2_btree_node_check_topology(trans, b)); + + bch2_bkey_buf_exit(&prev_k, c); + bch2_bkey_buf_exit(&cur_k, c); printbuf_exit(&buf); return ret; } @@ -543,32 +521,63 @@ fsck_err: int bch2_check_topology(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - struct btree *b; - unsigned i; + struct bpos pulled_from_scan = POS_MIN; int ret = 0; - for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { + for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { struct btree_root *r = bch2_btree_id_root(c, i); + bool reconstructed_root = false; - if (!r->alive) - continue; + if (r->error) { + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + if (ret) + break; +reconstruct_root: + bch_info(c, "btree root %s unreadable, must recover from scan", bch2_btree_id_str(i)); - b = r->b; - if (btree_node_fake(b)) - continue; + r->alive = false; + r->error = 0; + + if (!bch2_btree_has_scanned_nodes(c, i)) { + mustfix_fsck_err(c, btree_root_unreadable_and_scan_found_nothing, + "no nodes found for btree %s, continue?", bch2_btree_id_str(i)); + bch2_btree_root_alloc_fake(c, i, 0); + } else { + bch2_btree_root_alloc_fake(c, i, 1); + ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX); + if (ret) + break; + } + + bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + reconstructed_root = true; + } + + struct btree *b = r->b; btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - ret = bch2_btree_repair_topology_recurse(trans, b); + ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan); six_unlock_read(&b->c.lock); if (ret == DROP_THIS_NODE) { - bch_err(c, "empty btree root - repair unimplemented"); - ret = -BCH_ERR_fsck_repair_unimplemented; + bch2_btree_node_hash_remove(&c->btree_cache, b); + mutex_lock(&c->btree_cache.lock); + list_move(&b->list, &c->btree_cache.freeable); + mutex_unlock(&c->btree_cache.lock); + + r->b = NULL; + + if (!reconstructed_root) + goto reconstruct_root; + + bch_err(c, "empty btree root %s", bch2_btree_id_str(i)); + bch2_btree_root_alloc_fake(c, i, 0); + r->alive = false; + ret = 0; } } - +fsck_err: bch2_trans_put(trans); - return ret; } @@ -591,7 +600,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, p, entry_c); if (fsck_err_on(!g->gen_valid, c, ptr_to_missing_alloc_key, @@ -657,7 +666,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id continue; if (fsck_err_on(bucket_data_type(g->data_type) && - bucket_data_type(g->data_type) != data_type, c, + bucket_data_type(g->data_type) != + bucket_data_type(data_type), c, ptr_bucket_data_type_mismatch, "bucket %u:%zu different types of data in same bucket: %s, %s\n" "while marking %s", @@ -698,18 +708,13 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id } if (do_update) { - struct bkey_ptrs ptrs; - union bch_extent_entry *entry; - struct bch_extent_ptr *ptr; - struct bkey_i *new; - if (is_root) { bch_err(c, "cannot update btree roots yet"); ret = -EINVAL; goto err; } - new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); + struct bkey_i *new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); if (!new) { ret = -BCH_ERR_ENOMEM_gc_repair_key; bch_err_msg(c, ret, "allocating new key"); @@ -724,7 +729,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id * btree node isn't there anymore, the read path will * sort it out: */ - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bucket *g = PTR_GC_BUCKET(ca, ptr); @@ -732,19 +737,26 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id ptr->gen = g->gen; } } else { - bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_GC_BUCKET(ca, ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr); + struct bkey_ptrs ptrs; + union bch_extent_entry *entry; +restart_drop_ptrs: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry); - (ptr->cached && - (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) || - (!ptr->cached && - gen_cmp(ptr->gen, g->gen) < 0) || - gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX || - (g->data_type && - g->data_type != data_type); - })); + if ((p.ptr.cached && + (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) || + (!p.ptr.cached && + gen_cmp(p.ptr.gen, g->gen) < 0) || + gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX || + (g->data_type && + g->data_type != data_type)) { + bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr); + goto restart_drop_ptrs; + } + } again: ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); bkey_extent_entry_for_each(ptrs, entry) { @@ -774,12 +786,6 @@ found: } } - ret = bch2_journal_key_insert_take(c, btree_id, level, new); - if (ret) { - kfree(new); - goto err; - } - if (level) bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new); @@ -793,6 +799,12 @@ found: bch_info(c, "new key %s", buf.buf); } + ret = bch2_journal_key_insert_take(c, btree_id, level, new); + if (ret) { + kfree(new); + goto err; + } + *k = bkey_i_to_s_c(new); } err: @@ -819,10 +831,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, BUG_ON(bch2_journal_seq_verify && k->k->version.lo > atomic64_read(&c->journal.seq)); - ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k); - if (ret) - goto err; - if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c, bkey_version_in_future, "key version number higher than recorded: %llu > %llu", @@ -831,8 +839,13 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, atomic64_set(&c->key_version, k->k->version.lo); } + ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k); + if (ret) + goto err; + ret = commit_do(trans, NULL, NULL, 0, - bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC)); + bch2_key_trigger(trans, btree_id, level, old, + unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC)); fsck_err: err: bch_err_fn(c, ret); @@ -841,42 +854,30 @@ err: static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial) { - struct bch_fs *c = trans->c; struct btree_node_iter iter; struct bkey unpacked; struct bkey_s_c k; - struct bkey_buf prev, cur; int ret = 0; + ret = bch2_btree_node_check_topology(trans, b); + if (ret) + return ret; + if (!btree_node_type_needs_gc(btree_node_type(b))) return 0; bch2_btree_node_iter_init_from_start(&iter, b); - bch2_bkey_buf_init(&prev); - bch2_bkey_buf_init(&cur); - bkey_init(&prev.k->k); while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, &k, initial); if (ret) - break; + return ret; bch2_btree_node_iter_advance(&iter, b); - - if (b->c.level) { - bch2_bkey_buf_reassemble(&cur, c, k); - - ret = bch2_gc_check_topology(c, b, &prev, cur, - bch2_btree_node_iter_end(&iter)); - if (ret) - break; - } } - bch2_bkey_buf_exit(&cur, c); - bch2_bkey_buf_exit(&prev, c); - return ret; + return 0; } static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, @@ -925,14 +926,16 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b struct bch_fs *c = trans->c; struct btree_and_journal_iter iter; struct bkey_s_c k; - struct bkey_buf cur, prev; + struct bkey_buf cur; struct printbuf buf = PRINTBUF; int ret = 0; + ret = bch2_btree_node_check_topology(trans, b); + if (ret) + return ret; + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - bch2_bkey_buf_init(&prev); bch2_bkey_buf_init(&cur); - bkey_init(&prev.k->k); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { BUG_ON(bpos_lt(k.k->p, b->data->min_key)); @@ -943,20 +946,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b if (ret) goto fsck_err; - if (b->c.level) { - bch2_bkey_buf_reassemble(&cur, c, k); - k = bkey_i_to_s_c(cur.k); - - bch2_btree_and_journal_iter_advance(&iter); - - ret = bch2_gc_check_topology(c, b, - &prev, cur, - !bch2_btree_and_journal_iter_peek(&iter).k); - if (ret) - goto fsck_err; - } else { - bch2_btree_and_journal_iter_advance(&iter); - } + bch2_btree_and_journal_iter_advance(&iter); } if (b->c.level > target_depth) { @@ -1015,7 +1005,6 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b } fsck_err: bch2_bkey_buf_exit(&cur, c); - bch2_bkey_buf_exit(&prev, c); bch2_btree_and_journal_iter_exit(&iter); printbuf_exit(&buf); return ret; @@ -1033,9 +1022,6 @@ static int bch2_gc_btree_init(struct btree_trans *trans, b = bch2_btree_id_root(c, btree_id)->b; - if (btree_node_fake(b)) - return 0; - six_lock_read(&b->c.lock, NULL, NULL); printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->data->min_key); @@ -1392,11 +1378,11 @@ static int bch2_alloc_write_key(struct btree_trans *trans, *old, b->data_type); gc = *b; - percpu_up_read(&c->mark_lock); if (gc.data_type != old_gc.data_type || gc.dirty_sectors != old_gc.dirty_sectors) bch2_dev_usage_update_m(c, ca, &old_gc, &gc); + percpu_up_read(&c->mark_lock); if (metadata_only && gc.data_type != BCH_DATA_sb && diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 624c8287..9cd9b657 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -654,6 +654,7 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) */ bch2_bset_set_no_aux_tree(b, b->set); bch2_btree_build_aux_trees(b); + b->nr = bch2_btree_node_count_keys(b); struct bkey_s_c k; struct bkey unpacked; @@ -1066,7 +1067,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, ret = bset_encrypt(c, i, b->written << 9); if (bch2_fs_fatal_err_on(ret, c, - "error decrypting btree node: %i", ret)) + "decrypting btree node: %s", bch2_err_str(ret))) goto fsck_err; btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && @@ -1107,7 +1108,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, ret = bset_encrypt(c, i, b->written << 9); if (bch2_fs_fatal_err_on(ret, c, - "error decrypting btree node: %i\n", ret)) + "decrypting btree node: %s", bch2_err_str(ret))) goto fsck_err; sectors = vstruct_sectors(bne, c->block_bits); @@ -1265,8 +1266,10 @@ fsck_err: if (ret == -BCH_ERR_btree_node_read_err_want_retry || ret == -BCH_ERR_btree_node_read_err_must_retry) retry_read = 1; - else + else { set_btree_node_read_error(b); + set_bit(b->c.btree_id, &c->btrees_lost_data); + } goto out; } @@ -1327,6 +1330,7 @@ start: if (!can_retry) { set_btree_node_read_error(b); + set_bit(b->c.btree_id, &c->btrees_lost_data); break; } } @@ -1338,7 +1342,7 @@ start: if (saw_error && !btree_node_read_error(b)) { printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->key.k.p); - bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error", + bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error", __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf); bch2_btree_node_rewrite_async(c, b); @@ -1526,9 +1530,10 @@ fsck_err: ret = -1; } - if (ret) + if (ret) { set_btree_node_read_error(b); - else if (*saw_error) + set_bit(b->c.btree_id, &c->btrees_lost_data); + } else if (*saw_error) bch2_btree_node_rewrite_async(c, b); for (i = 0; i < ra->nr; i++) { @@ -1657,13 +1662,14 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, prt_str(&buf, "btree node read error: no device to read from\n at "); bch2_btree_pos_to_text(&buf, c, b); - bch_err(c, "%s", buf.buf); + bch_err_ratelimited(c, "%s", buf.buf); if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) bch2_fatal_error(c); set_btree_node_read_error(b); + set_bit(b->c.btree_id, &c->btrees_lost_data); clear_btree_node_read_in_flight(b); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); printbuf_exit(&buf); @@ -1874,8 +1880,8 @@ out: return; err: set_btree_node_noevict(b); - if (!bch2_err_matches(ret, EROFS)) - bch2_fs_fatal_error(c, "fatal error writing btree node: %s", bch2_err_str(ret)); + bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c, + "writing btree node: %s", bch2_err_str(ret)); goto out; } @@ -2131,7 +2137,7 @@ do_write: ret = bset_encrypt(c, i, b->written << 9); if (bch2_fs_fatal_err_on(ret, c, - "error encrypting btree node: %i\n", ret)) + "encrypting btree node: %s", bch2_err_str(ret))) goto err; nonce = btree_nonce(i, b->written << 9); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 51bcdc6c..2a211a4b 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -927,8 +927,22 @@ static __always_inline int btree_path_down(struct btree_trans *trans, if (ret) goto err; } else { - bch2_bkey_buf_unpack(&tmp, c, l->b, - bch2_btree_node_iter_peek(&l->iter, l->b)); + struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b); + if (!k) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "node not found at pos "); + bch2_bpos_to_text(&buf, path->pos); + prt_str(&buf, " within parent node "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key)); + + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); + ret = -BCH_ERR_btree_need_topology_repair; + goto err; + } + + bch2_bkey_buf_unpack(&tmp, c, l->b, k); if ((flags & BTREE_ITER_PREFETCH) && c->opts.btree_node_prefetch) { @@ -962,7 +976,6 @@ err: return ret; } - static int bch2_btree_path_traverse_all(struct btree_trans *trans) { struct bch_fs *c = trans->c; @@ -2790,6 +2803,31 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) struct btree_transaction_stats *s = btree_trans_stats(trans); s->max_mem = max(s->max_mem, new_bytes); + if (trans->used_mempool) { + if (trans->mem_bytes >= new_bytes) + goto out_change_top; + + /* No more space from mempool item, need malloc new one */ + new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN); + if (unlikely(!new_mem)) { + bch2_trans_unlock(trans); + + new_mem = kmalloc(new_bytes, GFP_KERNEL); + if (!new_mem) + return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); + + ret = bch2_trans_relock(trans); + if (ret) { + kfree(new_mem); + return ERR_PTR(ret); + } + } + memcpy(new_mem, trans->mem, trans->mem_top); + trans->used_mempool = false; + mempool_free(trans->mem, &c->btree_trans_mem_pool); + goto out_new_mem; + } + new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); if (unlikely(!new_mem)) { bch2_trans_unlock(trans); @@ -2798,6 +2836,8 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); new_bytes = BTREE_TRANS_MEM_MAX; + memcpy(new_mem, trans->mem, trans->mem_top); + trans->used_mempool = true; kfree(trans->mem); } @@ -2811,7 +2851,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) if (ret) return ERR_PTR(ret); } - +out_new_mem: trans->mem = new_mem; trans->mem_bytes = new_bytes; @@ -2819,7 +2859,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); } - +out_change_top: p = trans->mem + trans->mem_top; trans->mem_top += size; memset(p, 0, size); @@ -3093,7 +3133,7 @@ void bch2_trans_put(struct btree_trans *trans) if (paths_allocated != trans->_paths_allocated) kvfree_rcu_mightsleep(paths_allocated); - if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) + if (trans->used_mempool) mempool_free(trans->mem, &c->btree_trans_mem_pool); else kfree(trans->mem); diff --git a/libbcachefs/btree_journal_iter.c b/libbcachefs/btree_journal_iter.c index 50e04356..5cbcbfe8 100644 --- a/libbcachefs/btree_journal_iter.c +++ b/libbcachefs/btree_journal_iter.c @@ -261,6 +261,22 @@ int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, return bch2_journal_key_insert(c, id, level, &whiteout); } +bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree, + unsigned level, struct bpos pos) +{ + struct journal_keys *keys = &trans->c->journal_keys; + size_t idx = bch2_journal_key_search(keys, btree, level, pos); + + if (!trans->journal_replay_not_finished) + return false; + + return (idx < keys->size && + keys->data[idx].btree_id == btree && + keys->data[idx].level == level && + bpos_eq(keys->data[idx].k->k.p, pos) && + bkey_deleted(&keys->data[idx].k->k)); +} + void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, unsigned level, struct bpos pos) { @@ -363,7 +379,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) { - struct bkey_s_c btree_k, journal_k, ret; + struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret; if (iter->prefetch && iter->journal.level) btree_and_journal_iter_prefetch(iter); @@ -375,9 +391,10 @@ again: bpos_lt(btree_k.k->p, iter->pos)) bch2_journal_iter_advance_btree(iter); - while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && - bpos_lt(journal_k.k->p, iter->pos)) - bch2_journal_iter_advance(&iter->journal); + if (iter->trans->journal_replay_not_finished) + while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && + bpos_lt(journal_k.k->p, iter->pos)) + bch2_journal_iter_advance(&iter->journal); ret = journal_k.k && (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p)) @@ -435,7 +452,9 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, bch2_btree_node_iter_init_from_start(&node_iter, b); __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key); - list_add(&iter->journal.list, &trans->c->journal_iters); + if (trans->journal_replay_not_finished && + !test_bit(BCH_FS_may_go_rw, &trans->c->flags)) + list_add(&iter->journal.list, &trans->c->journal_iters); } /* sort and dedup all keys in the journal: */ @@ -548,3 +567,22 @@ int bch2_journal_keys_sort(struct bch_fs *c) bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr); return 0; } + +void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree, + unsigned level_min, unsigned level_max, + struct bpos start, struct bpos end) +{ + struct journal_keys *keys = &c->journal_keys; + size_t dst = 0; + + move_gap(keys, keys->nr); + + darray_for_each(*keys, i) + if (!(i->btree_id == btree && + i->level >= level_min && + i->level <= level_max && + bpos_ge(i->k->k.p, start) && + bpos_le(i->k->k.p, end))) + keys->data[dst++] = *i; + keys->nr = keys->gap = dst; +} diff --git a/libbcachefs/btree_journal_iter.h b/libbcachefs/btree_journal_iter.h index c9d19da3..af25046e 100644 --- a/libbcachefs/btree_journal_iter.h +++ b/libbcachefs/btree_journal_iter.h @@ -40,8 +40,8 @@ int bch2_journal_key_insert(struct bch_fs *, enum btree_id, unsigned, struct bkey_i *); int bch2_journal_key_delete(struct bch_fs *, enum btree_id, unsigned, struct bpos); -void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, - unsigned, struct bpos); +bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned, struct bpos); +void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos); void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); @@ -66,4 +66,8 @@ void bch2_journal_entries_free(struct bch_fs *); int bch2_journal_keys_sort(struct bch_fs *); +void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id, + unsigned, unsigned, + struct bpos, struct bpos); + #endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index 8a71d434..581edcb0 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -676,7 +676,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, !bch2_err_matches(ret, BCH_ERR_transaction_restart) && !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && !bch2_journal_error(j), c, - "error flushing key cache: %s", bch2_err_str(ret)); + "flushing key cache: %s", bch2_err_str(ret)); if (ret) goto out; diff --git a/libbcachefs/btree_node_scan.c b/libbcachefs/btree_node_scan.c new file mode 100644 index 00000000..3f33be7e --- /dev/null +++ b/libbcachefs/btree_node_scan.c @@ -0,0 +1,495 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_cache.h" +#include "btree_io.h" +#include "btree_journal_iter.h" +#include "btree_node_scan.h" +#include "btree_update_interior.h" +#include "buckets.h" +#include "error.h" +#include "journal_io.h" +#include "recovery_passes.h" + +#include <linux/kthread.h> +#include <linux/sort.h> + +struct find_btree_nodes_worker { + struct closure *cl; + struct find_btree_nodes *f; + struct bch_dev *ca; +}; + +static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n) +{ + prt_printf(out, "%s l=%u seq=%u cookie=%llx ", bch2_btree_id_str(n->btree_id), n->level, n->seq, n->cookie); + bch2_bpos_to_text(out, n->min_key); + prt_str(out, "-"); + bch2_bpos_to_text(out, n->max_key); + + if (n->range_updated) + prt_str(out, " range updated"); + if (n->overwritten) + prt_str(out, " overwritten"); + + for (unsigned i = 0; i < n->nr_ptrs; i++) { + prt_char(out, ' '); + bch2_extent_ptr_to_text(out, c, n->ptrs + i); + } +} + +static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes) +{ + printbuf_indent_add(out, 2); + darray_for_each(nodes, i) { + found_btree_node_to_text(out, c, i); + prt_newline(out); + } + printbuf_indent_sub(out, 2); +} + +static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f) +{ + struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k); + + set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs); + bp->k.p = f->max_key; + bp->v.seq = cpu_to_le64(f->cookie); + bp->v.sectors_written = 0; + bp->v.flags = 0; + bp->v.min_key = f->min_key; + SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated); + memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs); +} + +static bool found_btree_node_is_readable(struct btree_trans *trans, + const struct found_btree_node *f) +{ + struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } k; + + found_btree_node_to_key(&k.k, f); + + struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false); + bool ret = !IS_ERR_OR_NULL(b); + if (ret) + six_unlock_read(&b->c.lock); + + /* + * We might update this node's range; if that happens, we need the node + * to be re-read so the read path can trim keys that are no longer in + * this node + */ + if (b != btree_node_root(trans->c, b)) + bch2_btree_node_evict(trans, &k.k); + return ret; +} + +static int found_btree_node_cmp_cookie(const void *_l, const void *_r) +{ + const struct found_btree_node *l = _l; + const struct found_btree_node *r = _r; + + return cmp_int(l->btree_id, r->btree_id) ?: + cmp_int(l->level, r->level) ?: + cmp_int(l->cookie, r->cookie); +} + +/* + * Given two found btree nodes, if their sequence numbers are equal, take the + * one that's readable: + */ +static int found_btree_node_cmp_time(const struct found_btree_node *l, + const struct found_btree_node *r) +{ + return cmp_int(l->seq, r->seq); +} + +static int found_btree_node_cmp_pos(const void *_l, const void *_r) +{ + const struct found_btree_node *l = _l; + const struct found_btree_node *r = _r; + + return cmp_int(l->btree_id, r->btree_id) ?: + -cmp_int(l->level, r->level) ?: + bpos_cmp(l->min_key, r->min_key) ?: + -found_btree_node_cmp_time(l, r); +} + +static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, + struct bio *bio, struct btree_node *bn, u64 offset) +{ + struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); + + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); + bio->bi_iter.bi_sector = offset; + bch2_bio_map(bio, bn, PAGE_SIZE); + + submit_bio_wait(bio); + if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, + "IO error in try_read_btree_node() at %llu: %s", + offset, bch2_blk_status_to_str(bio->bi_status))) + return; + + if (le64_to_cpu(bn->magic) != bset_magic(c)) + return; + + rcu_read_lock(); + struct found_btree_node n = { + .btree_id = BTREE_NODE_ID(bn), + .level = BTREE_NODE_LEVEL(bn), + .seq = BTREE_NODE_SEQ(bn), + .cookie = le64_to_cpu(bn->keys.seq), + .min_key = bn->min_key, + .max_key = bn->max_key, + .nr_ptrs = 1, + .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr, + .ptrs[0].offset = offset, + .ptrs[0].dev = ca->dev_idx, + .ptrs[0].gen = *bucket_gen(ca, sector_to_bucket(ca, offset)), + }; + rcu_read_unlock(); + + if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) { + mutex_lock(&f->lock); + if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) { + bch_err(c, "try_read_btree_node() can't handle endian conversion"); + f->ret = -EINVAL; + goto unlock; + } + + if (darray_push(&f->nodes, n)) + f->ret = -ENOMEM; +unlock: + mutex_unlock(&f->lock); + } +} + +static int read_btree_nodes_worker(void *p) +{ + struct find_btree_nodes_worker *w = p; + struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes); + struct bch_dev *ca = w->ca; + void *buf = (void *) __get_free_page(GFP_KERNEL); + struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL); + unsigned long last_print = jiffies; + + if (!buf || !bio) { + bch_err(c, "read_btree_nodes_worker: error allocating bio/buf"); + w->f->ret = -ENOMEM; + goto err; + } + + for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++) + for (unsigned bucket_offset = 0; + bucket_offset + btree_sectors(c) <= ca->mi.bucket_size; + bucket_offset += btree_sectors(c)) { + if (time_after(jiffies, last_print + HZ * 30)) { + u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset; + u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size; + + bch_info(ca, "%s: %2u%% done", __func__, + (unsigned) div64_u64(cur_sector * 100, end_sector)); + last_print = jiffies; + } + + try_read_btree_node(w->f, ca, bio, buf, + bucket * ca->mi.bucket_size + bucket_offset); + } +err: + bio_put(bio); + free_page((unsigned long) buf); + percpu_ref_get(&ca->io_ref); + closure_put(w->cl); + kfree(w); + return 0; +} + +static int read_btree_nodes(struct find_btree_nodes *f) +{ + struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); + struct closure cl; + int ret = 0; + + closure_init_stack(&cl); + + for_each_online_member(c, ca) { + struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); + struct task_struct *t; + + if (!w) { + percpu_ref_put(&ca->io_ref); + ret = -ENOMEM; + goto err; + } + + percpu_ref_get(&ca->io_ref); + closure_get(&cl); + w->cl = &cl; + w->f = f; + w->ca = ca; + + t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); + ret = IS_ERR_OR_NULL(t); + if (ret) { + percpu_ref_put(&ca->io_ref); + closure_put(&cl); + f->ret = ret; + bch_err(c, "error starting kthread: %i", ret); + break; + } + } +err: + closure_sync(&cl); + return f->ret ?: ret; +} + +static void bubble_up(struct found_btree_node *n, struct found_btree_node *end) +{ + while (n + 1 < end && + found_btree_node_cmp_pos(n, n + 1) > 0) { + swap(n[0], n[1]); + n++; + } +} + +static int handle_overwrites(struct bch_fs *c, + struct found_btree_node *start, + struct found_btree_node *end) +{ + struct found_btree_node *n; +again: + for (n = start + 1; + n < end && + n->btree_id == start->btree_id && + n->level == start->level && + bpos_lt(n->min_key, start->max_key); + n++) { + int cmp = found_btree_node_cmp_time(start, n); + + if (cmp > 0) { + if (bpos_cmp(start->max_key, n->max_key) >= 0) + n->overwritten = true; + else { + n->range_updated = true; + n->min_key = bpos_successor(start->max_key); + n->range_updated = true; + bubble_up(n, end); + goto again; + } + } else if (cmp < 0) { + BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0); + + start->max_key = bpos_predecessor(n->min_key); + start->range_updated = true; + } else { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "overlapping btree nodes with same seq! halting\n "); + found_btree_node_to_text(&buf, c, start); + prt_str(&buf, "\n "); + found_btree_node_to_text(&buf, c, n); + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); + return -1; + } + } + + return 0; +} + +int bch2_scan_for_btree_nodes(struct bch_fs *c) +{ + struct find_btree_nodes *f = &c->found_btree_nodes; + struct printbuf buf = PRINTBUF; + size_t dst; + int ret = 0; + + if (f->nodes.nr) + return 0; + + mutex_init(&f->lock); + + ret = read_btree_nodes(f); + if (ret) + return ret; + + if (!f->nodes.nr) { + bch_err(c, "%s: no btree nodes found", __func__); + ret = -EINVAL; + goto err; + } + + if (0 && c->opts.verbose) { + printbuf_reset(&buf); + prt_printf(&buf, "%s: nodes found:\n", __func__); + found_btree_nodes_to_text(&buf, c, f->nodes); + bch2_print_string_as_lines(KERN_INFO, buf.buf); + } + + sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL); + + dst = 0; + darray_for_each(f->nodes, i) { + struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL; + + if (prev && + prev->cookie == i->cookie) { + if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) { + bch_err(c, "%s: found too many replicas for btree node", __func__); + ret = -EINVAL; + goto err; + } + prev->ptrs[prev->nr_ptrs++] = i->ptrs[0]; + } else { + f->nodes.data[dst++] = *i; + } + } + f->nodes.nr = dst; + + sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); + + if (0 && c->opts.verbose) { + printbuf_reset(&buf); + prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__); + found_btree_nodes_to_text(&buf, c, f->nodes); + bch2_print_string_as_lines(KERN_INFO, buf.buf); + } + + dst = 0; + darray_for_each(f->nodes, i) { + if (i->overwritten) + continue; + + ret = handle_overwrites(c, i, &darray_top(f->nodes)); + if (ret) + goto err; + + BUG_ON(i->overwritten); + f->nodes.data[dst++] = *i; + } + f->nodes.nr = dst; + + if (c->opts.verbose) { + printbuf_reset(&buf); + prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__); + found_btree_nodes_to_text(&buf, c, f->nodes); + bch2_print_string_as_lines(KERN_INFO, buf.buf); + } + + eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); +err: + printbuf_exit(&buf); + return ret; +} + +static int found_btree_node_range_start_cmp(const void *_l, const void *_r) +{ + const struct found_btree_node *l = _l; + const struct found_btree_node *r = _r; + + return cmp_int(l->btree_id, r->btree_id) ?: + -cmp_int(l->level, r->level) ?: + bpos_cmp(l->max_key, r->min_key); +} + +#define for_each_found_btree_node_in_range(_f, _search, _idx) \ + for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \ + sizeof((_f)->nodes.data[0]), \ + found_btree_node_range_start_cmp, &search); \ + _idx < (_f)->nodes.nr && \ + (_f)->nodes.data[_idx].btree_id == _search.btree_id && \ + (_f)->nodes.data[_idx].level == _search.level && \ + bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key); \ + _idx = eytzinger0_next(_idx, (_f)->nodes.nr)) + +bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b) +{ + struct find_btree_nodes *f = &c->found_btree_nodes; + + struct found_btree_node search = { + .btree_id = b->c.btree_id, + .level = b->c.level, + .min_key = b->data->min_key, + .max_key = b->key.k.p, + }; + + for_each_found_btree_node_in_range(f, search, idx) + if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data)) + return true; + return false; +} + +bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) +{ + struct found_btree_node search = { + .btree_id = btree, + .level = 0, + .min_key = POS_MIN, + .max_key = SPOS_MAX, + }; + + for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx) + return true; + return false; +} + +int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, + unsigned level, struct bpos node_min, struct bpos node_max) +{ + struct find_btree_nodes *f = &c->found_btree_nodes; + + int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + if (ret) + return ret; + + if (c->opts.verbose) { + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level); + bch2_bpos_to_text(&buf, node_min); + prt_str(&buf, " - "); + bch2_bpos_to_text(&buf, node_max); + + bch_info(c, "%s(): %s", __func__, buf.buf); + printbuf_exit(&buf); + } + + struct found_btree_node search = { + .btree_id = btree, + .level = level, + .min_key = node_min, + .max_key = node_max, + }; + + for_each_found_btree_node_in_range(f, search, idx) { + struct found_btree_node n = f->nodes.data[idx]; + + n.range_updated |= bpos_lt(n.min_key, node_min); + n.min_key = bpos_max(n.min_key, node_min); + + n.range_updated |= bpos_gt(n.max_key, node_max); + n.max_key = bpos_min(n.max_key, node_max); + + struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp; + + found_btree_node_to_key(&tmp.k, &n); + + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k)); + bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); + printbuf_exit(&buf); + + BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL)); + + ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k); + if (ret) + return ret; + } + + return 0; +} + +void bch2_find_btree_nodes_exit(struct find_btree_nodes *f) +{ + darray_exit(&f->nodes); +} diff --git a/libbcachefs/btree_node_scan.h b/libbcachefs/btree_node_scan.h new file mode 100644 index 00000000..08687b20 --- /dev/null +++ b/libbcachefs/btree_node_scan.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_NODE_SCAN_H +#define _BCACHEFS_BTREE_NODE_SCAN_H + +int bch2_scan_for_btree_nodes(struct bch_fs *); +bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *); +bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id); +int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos); +void bch2_find_btree_nodes_exit(struct find_btree_nodes *); + +#endif /* _BCACHEFS_BTREE_NODE_SCAN_H */ diff --git a/libbcachefs/btree_node_scan_types.h b/libbcachefs/btree_node_scan_types.h new file mode 100644 index 00000000..abb7b27d --- /dev/null +++ b/libbcachefs/btree_node_scan_types.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H +#define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H + +#include "darray.h" + +struct found_btree_node { + bool range_updated:1; + bool overwritten:1; + u8 btree_id; + u8 level; + u32 seq; + u64 cookie; + + struct bpos min_key; + struct bpos max_key; + + unsigned nr_ptrs; + struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX]; +}; + +typedef DARRAY(struct found_btree_node) found_btree_nodes; + +struct find_btree_nodes { + int ret; + struct mutex lock; + found_btree_nodes nodes; +}; + +#endif /* _BCACHEFS_BTREE_NODE_SCAN_TYPES_H */ diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c index 30d69a6d..96669fed 100644 --- a/libbcachefs/btree_trans_commit.c +++ b/libbcachefs/btree_trans_commit.c @@ -318,7 +318,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && i->k->k.p.snapshot && - bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); + bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0); } static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c index a4b40c16..8e47e260 100644 --- a/libbcachefs/btree_update.c +++ b/libbcachefs/btree_update.c @@ -38,6 +38,9 @@ static noinline int extent_front_merge(struct btree_trans *trans, struct bkey_i *update; int ret; + if (unlikely(trans->journal_replay_not_finished)) + return 0; + update = bch2_bkey_make_mut_noupdate(trans, k); ret = PTR_ERR_OR_ZERO(update); if (ret) @@ -69,6 +72,9 @@ static noinline int extent_back_merge(struct btree_trans *trans, struct bch_fs *c = trans->c; int ret; + if (unlikely(trans->journal_replay_not_finished)) + return 0; + ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?: bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p); if (ret < 0) diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 642213ef..8d74dfcb 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "bkey_buf.h" #include "bkey_methods.h" #include "btree_cache.h" #include "btree_gc.h" @@ -18,6 +19,7 @@ #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" +#include "recovery_passes.h" #include "replicas.h" #include "super-io.h" #include "trace.h" @@ -44,56 +46,103 @@ static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans, return path_idx; } -/* Debug code: */ - /* * Verify that child nodes correctly span parent node's range: */ -static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) +int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) { -#ifdef CONFIG_BCACHEFS_DEBUG - struct bpos next_node = b->data->min_key; - struct btree_node_iter iter; + struct bch_fs *c = trans->c; + struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2 + ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key + : b->data->min_key; + struct btree_and_journal_iter iter; struct bkey_s_c k; - struct bkey_s_c_btree_ptr_v2 bp; - struct bkey unpacked; - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + struct printbuf buf = PRINTBUF; + struct bkey_buf prev; + int ret = 0; - BUG_ON(!b->c.level); + BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && + !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, + b->data->min_key)); - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) - return; + if (!b->c.level) + return 0; - bch2_btree_node_iter_init_from_start(&iter, b); + bch2_bkey_buf_init(&prev); + bkey_init(&prev.k->k); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - while (1) { - k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked); + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { if (k.k->type != KEY_TYPE_btree_ptr_v2) - break; - bp = bkey_s_c_to_btree_ptr_v2(k); + goto out; - if (!bpos_eq(next_node, bp.v->min_key)) { - bch2_dump_btree_node(c, b); - bch2_bpos_to_text(&buf1, next_node); - bch2_bpos_to_text(&buf2, bp.v->min_key); - panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf); + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); + + struct bpos expected_min = bkey_deleted(&prev.k->k) + ? node_min + : bpos_successor(prev.k->k.p); + + if (!bpos_eq(expected_min, bp.v->min_key)) { + bch2_topology_error(c); + + printbuf_reset(&buf); + prt_str(&buf, "end of prev node doesn't match start of next node\n"), + prt_printf(&buf, " in btree %s level %u node ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, "\n prev "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); + prt_str(&buf, "\n next "); + bch2_bkey_val_to_text(&buf, c, k); + + need_fsck_err(c, btree_node_topology_bad_min_key, "%s", buf.buf); + goto topology_repair; } - bch2_btree_node_iter_advance(&iter, b); - - if (bch2_btree_node_iter_end(&iter)) { - if (!bpos_eq(k.k->p, b->key.k.p)) { - bch2_dump_btree_node(c, b); - bch2_bpos_to_text(&buf1, b->key.k.p); - bch2_bpos_to_text(&buf2, k.k->p); - panic("expected end %s got %s\n", buf1.buf, buf2.buf); - } - break; - } - - next_node = bpos_successor(k.k->p); + bch2_bkey_buf_reassemble(&prev, c, k); + bch2_btree_and_journal_iter_advance(&iter); } -#endif + + if (bkey_deleted(&prev.k->k)) { + bch2_topology_error(c); + + printbuf_reset(&buf); + prt_str(&buf, "empty interior node\n"); + prt_printf(&buf, " in btree %s level %u node ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + need_fsck_err(c, btree_node_topology_empty_interior_node, "%s", buf.buf); + goto topology_repair; + } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) { + bch2_topology_error(c); + + printbuf_reset(&buf); + prt_str(&buf, "last child node doesn't end at end of parent node\n"); + prt_printf(&buf, " in btree %s level %u node ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, "\n last key "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); + + need_fsck_err(c, btree_node_topology_bad_max_key, "%s", buf.buf); + goto topology_repair; + } +out: +fsck_err: + bch2_btree_and_journal_iter_exit(&iter); + bch2_bkey_buf_exit(&prev, c); + printbuf_exit(&buf); + return ret; +topology_repair: + if ((c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) { + bch2_inconsistent_error(c); + ret = -BCH_ERR_btree_need_topology_repair; + } else { + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); + } + goto out; } /* Calculate ideal packed bkey format for new btree nodes: */ @@ -646,7 +695,7 @@ static void btree_update_nodes_written(struct btree_update *as) bch2_trans_unlock(trans); bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, - "%s(): error %s", __func__, bch2_err_str(ret)); + "%s", bch2_err_str(ret)); err: if (as->b) { @@ -1067,13 +1116,18 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, flags &= ~BCH_WATERMARK_MASK; flags |= watermark; - if (!(flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark < c->journal.watermark) { + if (watermark < c->journal.watermark) { struct journal_res res = { 0 }; + unsigned journal_flags = watermark|JOURNAL_RES_GET_CHECK; + + if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && + watermark != BCH_WATERMARK_reclaim) + journal_flags |= JOURNAL_RES_GET_NONBLOCK; ret = drop_locks_do(trans, - bch2_journal_res_get(&c->journal, &res, 1, - watermark|JOURNAL_RES_GET_CHECK)); + bch2_journal_res_get(&c->journal, &res, 1, journal_flags)); + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) + ret = -BCH_ERR_journal_reclaim_would_deadlock; if (ret) return ERR_PTR(ret); } @@ -1117,6 +1171,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, closure_init(&as->cl, NULL); as->c = c; as->start_time = start_time; + as->ip_started = _RET_IP_; as->mode = BTREE_INTERIOR_NO_UPDATE; as->took_gc_lock = true; as->btree_id = path->btree_id; @@ -1192,7 +1247,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, err: bch2_btree_update_free(as, trans); if (!bch2_err_matches(ret, ENOSPC) && - !bch2_err_matches(ret, EROFS)) + !bch2_err_matches(ret, EROFS) && + ret != -BCH_ERR_journal_reclaim_would_deadlock) bch_err_fn_ratelimited(c, ret); return ERR_PTR(ret); } @@ -1373,9 +1429,16 @@ static void __btree_split_node(struct btree_update *as, if (bkey_deleted(k)) continue; + uk = bkey_unpack_key(b, k); + + if (b->c.level && + u64s < n1_u64s && + u64s + k->u64s >= n1_u64s && + bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p)) + n1_u64s += k->u64s; + i = u64s >= n1_u64s; u64s += k->u64s; - uk = bkey_unpack_key(b, k); if (!i) n1_pos = uk.p; bch2_bkey_format_add_key(&format[i], &uk); @@ -1434,8 +1497,7 @@ static void __btree_split_node(struct btree_update *as, bch2_verify_btree_nr_keys(n[i]); - if (b->c.level) - btree_node_interior_verify(as->c, n[i]); + BUG_ON(bch2_btree_node_check_topology(trans, n[i])); } } @@ -1466,7 +1528,7 @@ static void btree_split_insert_keys(struct btree_update *as, __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); - btree_node_interior_verify(as->c, b); + BUG_ON(bch2_btree_node_check_topology(trans, b)); } } @@ -1481,9 +1543,14 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, u64 start_time = local_clock(); int ret = 0; + bch2_verify_btree_nr_keys(b); BUG_ON(!parent && (b != btree_node_root(c, b))); BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1)); + ret = bch2_btree_node_check_topology(trans, b); + if (ret) + return ret; + bch2_btree_interior_update_will_free_node(as, b); if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { @@ -1703,7 +1770,11 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t goto split; } - btree_node_interior_verify(c, b); + ret = bch2_btree_node_check_topology(trans, b); + if (ret) { + bch2_btree_node_unlock_write(trans, path, b); + return ret; + } bch2_btree_insert_keys_interior(as, trans, path, b, keys); @@ -1721,7 +1792,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t bch2_btree_node_unlock_write(trans, path, b); - btree_node_interior_verify(c, b); + BUG_ON(bch2_btree_node_check_topology(trans, b)); return 0; split: /* @@ -1811,9 +1882,12 @@ int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, { struct bch_fs *c = trans->c; struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b; + + if (btree_node_fake(b)) + return bch2_btree_split_leaf(trans, path, flags); + struct btree_update *as = - bch2_btree_update_start(trans, trans->paths + path, - b->c.level, true, flags); + bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags); if (IS_ERR(as)) return PTR_ERR(as); @@ -2114,7 +2188,7 @@ static void async_btree_node_rewrite_work(struct work_struct *work) ret = bch2_trans_do(c, NULL, NULL, 0, async_btree_node_rewrite_trans(trans, a)); - bch_err_fn(c, ret); + bch_err_fn_ratelimited(c, ret); bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); kfree(a); } @@ -2161,7 +2235,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); } - queue_work(c->btree_interior_update_worker, &a->work); + queue_work(c->btree_node_rewrite_worker, &a->work); } void bch2_do_pending_node_rewrites(struct bch_fs *c) @@ -2173,7 +2247,7 @@ void bch2_do_pending_node_rewrites(struct bch_fs *c) list_del(&a->list); bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); - queue_work(c->btree_interior_update_worker, &a->work); + queue_work(c->btree_node_rewrite_worker, &a->work); } mutex_unlock(&c->pending_node_rewrites_lock); } @@ -2384,7 +2458,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) bch2_btree_set_root_inmem(c, b); } -static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) +static int __bch2_btree_root_alloc_fake(struct btree_trans *trans, enum btree_id id, unsigned level) { struct bch_fs *c = trans->c; struct closure cl; @@ -2403,7 +2477,7 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) set_btree_node_fake(b); set_btree_node_need_rewrite(b); - b->c.level = 0; + b->c.level = level; b->c.btree_id = id; bkey_btree_ptr_init(&b->key); @@ -2430,9 +2504,9 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) return 0; } -void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) +void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level) { - bch2_trans_run(c, __bch2_btree_root_alloc(trans, id)); + bch2_trans_run(c, __bch2_btree_root_alloc_fake(trans, id, level)); } void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) @@ -2441,12 +2515,12 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->btree_interior_update_lock); list_for_each_entry(as, &c->btree_interior_update_list, list) - prt_printf(out, "%p m %u w %u r %u j %llu\n", - as, - as->mode, - as->nodes_written, - closure_nr_remaining(&as->cl), - as->journal.seq); + prt_printf(out, "%ps: mode=%u nodes_written=%u cl.remaining=%u journal_seq=%llu\n", + (void *) as->ip_started, + as->mode, + as->nodes_written, + closure_nr_remaining(&as->cl), + as->journal.seq); mutex_unlock(&c->btree_interior_update_lock); } @@ -2510,6 +2584,8 @@ bch2_btree_roots_to_journal_entries(struct bch_fs *c, void bch2_fs_btree_interior_update_exit(struct bch_fs *c) { + if (c->btree_node_rewrite_worker) + destroy_workqueue(c->btree_node_rewrite_worker); if (c->btree_interior_update_worker) destroy_workqueue(c->btree_interior_update_worker); mempool_exit(&c->btree_interior_update_pool); @@ -2534,6 +2610,11 @@ int bch2_fs_btree_interior_update_init(struct bch_fs *c) if (!c->btree_interior_update_worker) return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; + c->btree_node_rewrite_worker = + alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND); + if (!c->btree_node_rewrite_worker) + return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; + if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, sizeof(struct btree_update))) return -BCH_ERR_ENOMEM_btree_interior_update_pool_init; diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index 3439b037..cffb2c67 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -10,6 +10,8 @@ #define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1)) +int bch2_btree_node_check_topology(struct btree_trans *, struct btree *); + /* * Tracks an in progress split/rewrite of a btree node and the update to the * parent node: @@ -32,6 +34,7 @@ struct btree_update { struct closure cl; struct bch_fs *c; u64 start_time; + unsigned long ip_started; struct list_head list; struct list_head unwritten_list; @@ -162,7 +165,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *, struct bkey_i *, unsigned, bool); void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); -void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); +void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned); static inline unsigned btree_update_reserve_required(struct bch_fs *c, struct btree *b) diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c index b77e7b38..baf63e2f 100644 --- a/libbcachefs/btree_write_buffer.c +++ b/libbcachefs/btree_write_buffer.c @@ -11,6 +11,7 @@ #include "journal_reclaim.h" #include <linux/prefetch.h> +#include <linux/sort.h> static int bch2_btree_write_buffer_journal_flush(struct journal *, struct journal_entry_pin *, u64); @@ -46,6 +47,14 @@ static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_ke #endif } +static int wb_key_seq_cmp(const void *_l, const void *_r) +{ + const struct btree_write_buffered_key *l = _l; + const struct btree_write_buffered_key *r = _r; + + return cmp_int(l->journal_seq, r->journal_seq); +} + /* Compare excluding idx, the low 24 bits: */ static inline bool wb_key_eq(const void *_l, const void *_r) { @@ -357,6 +366,11 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) */ trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr); + sort(wb->flushing.keys.data, + wb->flushing.keys.nr, + sizeof(wb->flushing.keys.data[0]), + wb_key_seq_cmp, NULL); + darray_for_each(wb->flushing.keys, i) { if (!i->journal_seq) continue; @@ -378,7 +392,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) } } err: - bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)); + bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret)); trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0); bch2_journal_pin_drop(j, &wb->flushing.pin); wb->flushing.keys.nr = 0; diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index c2f46b26..941401a2 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -525,6 +525,7 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, "different types of data in same bucket: %s, %s", bch2_data_type_str(g->data_type), bch2_data_type_str(data_type))) { + BUG(); ret = -EIO; goto err; } @@ -628,6 +629,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans, bch2_data_type_str(ptr_data_type), (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + BUG(); ret = -EIO; goto err; } @@ -815,14 +817,14 @@ static int __mark_pointer(struct btree_trans *trans, static int bch2_trigger_pointer(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, - s64 *sectors, - unsigned flags) + const union bch_extent_entry *entry, + s64 *sectors, unsigned flags) { bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); struct bpos bucket; struct bch_backpointer bp; - bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp); + bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, entry, &bucket, &bp); *sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len); if (flags & BTREE_TRIGGER_TRANSACTIONAL) { @@ -851,7 +853,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, if (flags & BTREE_TRIGGER_GC) { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); percpu_down_read(&c->mark_lock); struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); @@ -979,7 +981,7 @@ static int __trigger_extent(struct btree_trans *trans, bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { s64 disk_sectors; - ret = bch2_trigger_pointer(trans, btree_id, level, k, p, &disk_sectors, flags); + ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags); if (ret < 0) return ret; @@ -990,8 +992,8 @@ static int __trigger_extent(struct btree_trans *trans, ret = !gc ? bch2_update_cached_sectors_list(trans, p.ptr.dev, disk_sectors) : update_cached_sectors(c, k, p.ptr.dev, disk_sectors, 0, true); - bch2_fs_fatal_err_on(ret && gc, c, "%s(): no replicas entry while updating cached sectors", - __func__); + bch2_fs_fatal_err_on(ret && gc, c, "%s: no replicas entry while updating cached sectors", + bch2_err_str(ret)); if (ret) return ret; } @@ -1020,7 +1022,7 @@ static int __trigger_extent(struct btree_trans *trans, struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); - bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf); + bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf); printbuf_exit(&buf); } if (ret) diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index 38defa19..cbfa6459 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -7,7 +7,7 @@ #include "chardev.h" #include "journal.h" #include "move.h" -#include "recovery.h" +#include "recovery_passes.h" #include "replicas.h" #include "super.h" #include "super-io.h" diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c index 4150feca..b564404d 100644 --- a/libbcachefs/data_update.c +++ b/libbcachefs/data_update.c @@ -14,6 +14,7 @@ #include "move.h" #include "nocow_locking.h" #include "rebalance.h" +#include "snapshot.h" #include "subvolume.h" #include "trace.h" @@ -509,6 +510,14 @@ int bch2_data_update_init(struct btree_trans *trans, unsigned ptrs_locked = 0; int ret = 0; + /* + * fs is corrupt we have a key for a snapshot node that doesn't exist, + * and we have to check for this because we go rw before repairing the + * snapshots table - just skip it, we can move it later. + */ + if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot))) + return -BCH_ERR_data_update_done; + bch2_bkey_buf_init(&m->k); bch2_bkey_buf_reassemble(&m->k, c, k); m->btree_id = btree_id; diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index b1f147e6..208ce6f0 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -170,7 +170,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf); + bch2_fs_fatal_error(c, ": btree node verify failed for: %s\n", buf.buf); printbuf_exit(&buf); } out: diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index b98e2c2b..08207524 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -448,7 +448,7 @@ int bch2_trigger_stripe(struct btree_trans *trans, struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, new); - bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); + bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf); printbuf_exit(&buf); return ret; } @@ -1868,10 +1868,10 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri return -BCH_ERR_stripe_alloc_blocked; ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe); + bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c, + "reading stripe key: %s", bch2_err_str(ret)); if (ret) { bch2_stripe_close(c, h->s); - if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch2_fs_fatal_error(c, "error reading stripe key: %s", bch2_err_str(ret)); return ret; } diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index af25d8ec..01a79fa3 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -252,7 +252,8 @@ x(BCH_ERR_nopromote, nopromote_in_flight) \ x(BCH_ERR_nopromote, nopromote_no_writes) \ x(BCH_ERR_nopromote, nopromote_enomem) \ - x(0, need_inode_lock) + x(0, need_inode_lock) \ + x(0, invalid_snapshot_node) enum bch_errcode { BCH_ERR_START = 2048, diff --git a/libbcachefs/error.c b/libbcachefs/error.c index 04343120..f942a394 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "error.h" -#include "recovery.h" +#include "recovery_passes.h" #include "super.h" #include "thread_with_file.h" diff --git a/libbcachefs/error.h b/libbcachefs/error.h index 94491190..36caedf7 100644 --- a/libbcachefs/error.h +++ b/libbcachefs/error.h @@ -32,6 +32,12 @@ bool bch2_inconsistent_error(struct bch_fs *); int bch2_topology_error(struct bch_fs *); +#define bch2_fs_topology_error(c, ...) \ +({ \ + bch_err(c, "btree topology error: " __VA_ARGS__); \ + bch2_topology_error(c); \ +}) + #define bch2_fs_inconsistent(c, ...) \ ({ \ bch_err(c, __VA_ARGS__); \ @@ -191,9 +197,9 @@ do { \ void bch2_fatal_error(struct bch_fs *); -#define bch2_fs_fatal_error(c, ...) \ +#define bch2_fs_fatal_error(c, _msg, ...) \ do { \ - bch_err(c, __VA_ARGS__); \ + bch_err(c, "%s(): fatal error " _msg, __func__, ##__VA_ARGS__); \ bch2_fatal_error(c); \ } while (0) diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 61395b11..0e3ca99f 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -189,13 +189,18 @@ int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); int ret = 0; - bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, err, - btree_ptr_v2_val_too_big, + bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, + c, err, btree_ptr_v2_val_too_big, "value too big (%zu > %zu)", bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); + bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p), + c, err, btree_ptr_v2_min_key_bad, + "min_key > key"); + ret = bch2_bkey_ptrs_invalid(c, k, flags, err); fsck_err: return ret; @@ -973,6 +978,31 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) return bkey_deleted(k.k); } +void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr) +{ + struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + ? bch_dev_bkey_exists(c, ptr->dev) + : NULL; + + if (!ca) { + prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + ptr->cached ? " cached" : ""); + } else { + u32 offset; + u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); + + prt_printf(out, "ptr: %u:%llu:%u gen %u", + ptr->dev, b, offset, ptr->gen); + if (ptr->cached) + prt_str(out, " cached"); + if (ptr->unwritten) + prt_str(out, " unwritten"); + if (ca && ptr_stale(ca, ptr)) + prt_printf(out, " stale"); + } +} + void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { @@ -988,31 +1018,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, " "); switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: { - const struct bch_extent_ptr *ptr = entry_to_ptr(entry); - struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] - ? bch_dev_bkey_exists(c, ptr->dev) - : NULL; - - if (!ca) { - prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, - (u64) ptr->offset, ptr->gen, - ptr->cached ? " cached" : ""); - } else { - u32 offset; - u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); - - prt_printf(out, "ptr: %u:%llu:%u gen %u", - ptr->dev, b, offset, ptr->gen); - if (ptr->cached) - prt_str(out, " cached"); - if (ptr->unwritten) - prt_str(out, " unwritten"); - if (ca && ptr_stale(ca, ptr)) - prt_printf(out, " stale"); - } + case BCH_EXTENT_ENTRY_ptr: + bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry)); break; - } + case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: case BCH_EXTENT_ENTRY_crc128: { diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 6219f2c0..528e817e 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -108,17 +108,17 @@ static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *en static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) { - return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; + return __extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; } static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e) { - return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr; + return __extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr; } static inline bool extent_entry_is_crc(const union bch_extent_entry *e) { - switch (extent_entry_type(e)) { + switch (__extent_entry_type(e)) { case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: case BCH_EXTENT_ENTRY_crc128: @@ -596,30 +596,6 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) return ret; } -static inline unsigned bch2_bkey_ptr_data_type(struct bkey_s_c k, const struct bch_extent_ptr *ptr) -{ - switch (k.k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - return BCH_DATA_btree; - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - return BCH_DATA_user; - case KEY_TYPE_stripe: { - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - - BUG_ON(ptr < s.v->ptrs || - ptr >= s.v->ptrs + s.v->nr_blocks); - - return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant - ? BCH_DATA_parity - : BCH_DATA_user; - } - default: - BUG(); - } -} - unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); @@ -700,6 +676,7 @@ bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s); void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); +void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c, diff --git a/libbcachefs/eytzinger.c b/libbcachefs/eytzinger.c new file mode 100644 index 00000000..a064ccff --- /dev/null +++ b/libbcachefs/eytzinger.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "eytzinger.h" + +/** + * is_aligned - is this pointer & size okay for word-wide copying? + * @base: pointer to data + * @size: size of each element + * @align: required alignment (typically 4 or 8) + * + * Returns true if elements can be copied using word loads and stores. + * The size must be a multiple of the alignment, and the base address must + * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. + * + * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)" + * to "if ((a | b) & mask)", so we do that by hand. + */ +__attribute_const__ __always_inline +static bool is_aligned(const void *base, size_t size, unsigned char align) +{ + unsigned char lsbits = (unsigned char)size; + + (void)base; +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + lsbits |= (unsigned char)(uintptr_t)base; +#endif + return (lsbits & (align - 1)) == 0; +} + +/** + * swap_words_32 - swap two elements in 32-bit chunks + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size (must be a multiple of 4) + * + * Exchange the two objects in memory. This exploits base+index addressing, + * which basically all CPUs have, to minimize loop overhead computations. + * + * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the + * bottom of the loop, even though the zero flag is still valid from the + * subtract (since the intervening mov instructions don't alter the flags). + * Gcc 8.1.0 doesn't have that problem. + */ +static void swap_words_32(void *a, void *b, size_t n) +{ + do { + u32 t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; + } while (n); +} + +/** + * swap_words_64 - swap two elements in 64-bit chunks + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size (must be a multiple of 8) + * + * Exchange the two objects in memory. This exploits base+index + * addressing, which basically all CPUs have, to minimize loop overhead + * computations. + * + * We'd like to use 64-bit loads if possible. If they're not, emulating + * one requires base+index+4 addressing which x86 has but most other + * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads, + * but it's possible to have 64-bit loads without 64-bit pointers (e.g. + * x32 ABI). Are there any cases the kernel needs to worry about? + */ +static void swap_words_64(void *a, void *b, size_t n) +{ + do { +#ifdef CONFIG_64BIT + u64 t = *(u64 *)(a + (n -= 8)); + *(u64 *)(a + n) = *(u64 *)(b + n); + *(u64 *)(b + n) = t; +#else + /* Use two 32-bit transfers to avoid base+index+4 addressing */ + u32 t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; + + t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; +#endif + } while (n); +} + +/** + * swap_bytes - swap two elements a byte at a time + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size + * + * This is the fallback if alignment doesn't allow using larger chunks. + */ +static void swap_bytes(void *a, void *b, size_t n) +{ + do { + char t = ((char *)a)[--n]; + ((char *)a)[n] = ((char *)b)[n]; + ((char *)b)[n] = t; + } while (n); +} + +/* + * The values are arbitrary as long as they can't be confused with + * a pointer, but small integers make for the smallest compare + * instructions. + */ +#define SWAP_WORDS_64 (swap_r_func_t)0 +#define SWAP_WORDS_32 (swap_r_func_t)1 +#define SWAP_BYTES (swap_r_func_t)2 +#define SWAP_WRAPPER (swap_r_func_t)3 + +struct wrapper { + cmp_func_t cmp; + swap_func_t swap_f; +}; + +/* + * The function pointer is last to make tail calls most efficient if the + * compiler decides not to inline this function. + */ +static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv) +{ + if (swap_func == SWAP_WRAPPER) { + ((const struct wrapper *)priv)->swap_f(a, b, (int)size); + return; + } + + if (swap_func == SWAP_WORDS_64) + swap_words_64(a, b, size); + else if (swap_func == SWAP_WORDS_32) + swap_words_32(a, b, size); + else if (swap_func == SWAP_BYTES) + swap_bytes(a, b, size); + else + swap_func(a, b, (int)size, priv); +} + +#define _CMP_WRAPPER ((cmp_r_func_t)0L) + +static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv) +{ + if (cmp == _CMP_WRAPPER) + return ((const struct wrapper *)priv)->cmp(a, b); + return cmp(a, b, priv); +} + +static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size, + cmp_r_func_t cmp_func, const void *priv, + size_t l, size_t r) +{ + return do_cmp(base + inorder_to_eytzinger0(l, n) * size, + base + inorder_to_eytzinger0(r, n) * size, + cmp_func, priv); +} + +static inline void eytzinger0_do_swap(void *base, size_t n, size_t size, + swap_r_func_t swap_func, const void *priv, + size_t l, size_t r) +{ + do_swap(base + inorder_to_eytzinger0(l, n) * size, + base + inorder_to_eytzinger0(r, n) * size, + size, swap_func, priv); +} + +void eytzinger0_sort_r(void *base, size_t n, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv) +{ + int i, c, r; + + /* called from 'sort' without swap function, let's pick the default */ + if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_f) + swap_func = NULL; + + if (!swap_func) { + if (is_aligned(base, size, 8)) + swap_func = SWAP_WORDS_64; + else if (is_aligned(base, size, 4)) + swap_func = SWAP_WORDS_32; + else + swap_func = SWAP_BYTES; + } + + /* heapify */ + for (i = n / 2 - 1; i >= 0; --i) { + for (r = i; r * 2 + 1 < n; r = c) { + c = r * 2 + 1; + + if (c + 1 < n && + eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0) + c++; + + if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0) + break; + + eytzinger0_do_swap(base, n, size, swap_func, priv, r, c); + } + } + + /* sort */ + for (i = n - 1; i > 0; --i) { + eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i); + + for (r = 0; r * 2 + 1 < i; r = c) { + c = r * 2 + 1; + + if (c + 1 < i && + eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0) + c++; + + if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0) + break; + + eytzinger0_do_swap(base, n, size, swap_func, priv, r, c); + } + } +} + +void eytzinger0_sort(void *base, size_t n, size_t size, + cmp_func_t cmp_func, + swap_func_t swap_func) +{ + struct wrapper w = { + .cmp = cmp_func, + .swap_f = swap_func, + }; + + return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); +} diff --git a/libbcachefs/eytzinger.h b/libbcachefs/eytzinger.h index b04750db..ee0e2df3 100644 --- a/libbcachefs/eytzinger.h +++ b/libbcachefs/eytzinger.h @@ -5,23 +5,33 @@ #include <linux/bitops.h> #include <linux/log2.h> -#include "util.h" +#ifdef EYTZINGER_DEBUG +#define EYTZINGER_BUG_ON(cond) BUG_ON(cond) +#else +#define EYTZINGER_BUG_ON(cond) +#endif /* * Traversal for trees in eytzinger layout - a full binary tree layed out in an - * array - */ - -/* - * One based indexing version: + * array. * - * With one based indexing each level of the tree starts at a power of two - - * good for cacheline alignment: + * Consider using an eytzinger tree any time you would otherwise be doing binary + * search over an array. Binary search is a worst case scenario for branch + * prediction and prefetching, but in an eytzinger tree every node's children + * are adjacent in memory, thus we can prefetch children before knowing the + * result of the comparison, assuming multiple nodes fit on a cacheline. + * + * Two variants are provided, for one based indexing and zero based indexing. + * + * Zero based indexing is more convenient, but one based indexing has better + * alignment and thus better performance because each new level of the tree + * starts at a power of two, and thus if element 0 was cacheline aligned, each + * new level will be as well. */ static inline unsigned eytzinger1_child(unsigned i, unsigned child) { - EBUG_ON(child > 1); + EYTZINGER_BUG_ON(child > 1); return (i << 1) + child; } @@ -58,7 +68,7 @@ static inline unsigned eytzinger1_last(unsigned size) static inline unsigned eytzinger1_next(unsigned i, unsigned size) { - EBUG_ON(i > size); + EYTZINGER_BUG_ON(i > size); if (eytzinger1_right_child(i) <= size) { i = eytzinger1_right_child(i); @@ -74,7 +84,7 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size) static inline unsigned eytzinger1_prev(unsigned i, unsigned size) { - EBUG_ON(i > size); + EYTZINGER_BUG_ON(i > size); if (eytzinger1_left_child(i) <= size) { i = eytzinger1_left_child(i) + 1; @@ -101,7 +111,7 @@ static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, unsigned shift = __fls(size) - b; int s; - EBUG_ON(!i || i > size); + EYTZINGER_BUG_ON(!i || i > size); i ^= 1U << b; i <<= 1; @@ -126,7 +136,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, unsigned shift; int s; - EBUG_ON(!i || i > size); + EYTZINGER_BUG_ON(!i || i > size); /* * sign bit trick: @@ -164,7 +174,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) static inline unsigned eytzinger0_child(unsigned i, unsigned child) { - EBUG_ON(child > 1); + EYTZINGER_BUG_ON(child > 1); return (i << 1) + 1 + child; } @@ -231,11 +241,9 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) (_i) != -1; \ (_i) = eytzinger0_next((_i), (_size))) -typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); - /* return greatest node <= @search, or -1 if not found */ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, - eytzinger_cmp_fn cmp, const void *search) + cmp_func_t cmp, const void *search) { unsigned i, n = 0; @@ -244,21 +252,24 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, do { i = n; - n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); + n = eytzinger0_child(i, cmp(base + i * size, search) <= 0); } while (n < nr); if (n & 1) { /* @i was greater than @search, return previous node: */ - - if (i == eytzinger0_first(nr)) - return -1; - return eytzinger0_prev(i, nr); } else { return i; } } +static inline ssize_t eytzinger0_find_gt(void *base, size_t nr, size_t size, + cmp_func_t cmp, const void *search) +{ + ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); + return eytzinger0_next(idx, size); +} + #define eytzinger0_find(base, nr, size, _cmp, search) \ ({ \ void *_base = (base); \ @@ -269,13 +280,13 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, int _res; \ \ while (_i < _nr && \ - (_res = _cmp(_search, _base + _i * _size, _size))) \ + (_res = _cmp(_search, _base + _i * _size))) \ _i = eytzinger0_child(_i, _res > 0); \ _i; \ }) -void eytzinger0_sort(void *, size_t, size_t, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t)); +void eytzinger0_sort_r(void *, size_t, size_t, + cmp_r_func_t, swap_r_func_t, const void *); +void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t); #endif /* _EYTZINGER_H */ diff --git a/libbcachefs/fs-io-direct.c b/libbcachefs/fs-io-direct.c index 33cb6da3..f49e6c0f 100644 --- a/libbcachefs/fs-io-direct.c +++ b/libbcachefs/fs-io-direct.c @@ -536,7 +536,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) if (likely(!dio->iter.count) || dio->op.error) break; - bio_reset(bio, NULL, REQ_OP_WRITE); + bio_reset(bio, NULL, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); } out: return bch2_dio_write_done(dio); @@ -618,7 +618,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) bio = bio_alloc_bioset(NULL, bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), - REQ_OP_WRITE, + REQ_OP_WRITE | REQ_SYNC | REQ_IDLE, GFP_KERNEL, &c->dio_write_bioset); dio = container_of(bio, struct dio_write, op.wbio.bio); diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 3f073845..b5ea9fa1 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -108,7 +108,8 @@ retry: goto retry; bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, - "inode %u:%llu not found when updating", + "%s: inode %u:%llu not found when updating", + bch2_err_str(ret), inode_inum(inode).subvol, inode_inum(inode).inum); @@ -1996,6 +1997,7 @@ out: return dget(sb->s_root); err_put_super: + __bch2_fs_stop(c); deactivate_locked_super(sb); return ERR_PTR(bch2_err_class(ret)); } diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index f48033be..aca6fae4 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -12,7 +12,7 @@ #include "fsck.h" #include "inode.h" #include "keylist.h" -#include "recovery.h" +#include "recovery_passes.h" #include "snapshot.h" #include "super.h" #include "xattr.h" @@ -1114,10 +1114,9 @@ int bch2_check_inodes(struct bch_fs *c) return ret; } -static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) +static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; - u32 restart_count = trans->restart_count; int ret = 0; s64 count2; @@ -1149,7 +1148,14 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) } fsck_err: bch_err_fn(c, ret); - return ret ?: trans_was_restarted(trans, restart_count); + return ret; +} + +static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) +{ + u32 restart_count = trans->restart_count; + return check_i_sectors_notnested(trans, w) ?: + trans_was_restarted(trans, restart_count); } struct extent_end { @@ -1533,7 +1539,7 @@ int bch2_check_extents(struct bch_fs *c) check_extent(trans, &iter, k, &w, &s, &extent_ends) ?: check_extent_overbig(trans, &iter, k); })) ?: - check_i_sectors(trans, &w)); + check_i_sectors_notnested(trans, &w)); bch2_disk_reservation_put(c, &res); extent_ends_exit(&extent_ends); @@ -1563,10 +1569,9 @@ int bch2_check_indirect_extents(struct bch_fs *c) return ret; } -static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) +static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; - u32 restart_count = trans->restart_count; int ret = 0; s64 count2; @@ -1598,7 +1603,14 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) } fsck_err: bch_err_fn(c, ret); - return ret ?: trans_was_restarted(trans, restart_count); + return ret; +} + +static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) +{ + u32 restart_count = trans->restart_count; + return check_subdir_count_notnested(trans, w) ?: + trans_was_restarted(trans, restart_count); } static int check_dirent_inode_dirent(struct btree_trans *trans, @@ -2003,7 +2015,8 @@ int bch2_check_dirents(struct bch_fs *c) k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s))); + check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?: + check_subdir_count_notnested(trans, &dir)); snapshots_seen_exit(&s); inode_walker_exit(&dir); @@ -2022,8 +2035,10 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, int ret; ret = check_key_has_snapshot(trans, iter, k); - if (ret) + if (ret < 0) return ret; + if (ret) + return 0; i = walk_inode(trans, inode, k); ret = PTR_ERR_OR_ZERO(i); @@ -2083,17 +2098,21 @@ static int check_root_trans(struct btree_trans *trans) if (mustfix_fsck_err_on(ret, c, root_subvol_missing, "root subvol missing")) { - struct bkey_i_subvolume root_subvol; + struct bkey_i_subvolume *root_subvol = + bch2_trans_kmalloc(trans, sizeof(*root_subvol)); + ret = PTR_ERR_OR_ZERO(root_subvol); + if (ret) + goto err; snapshot = U32_MAX; inum = BCACHEFS_ROOT_INO; - bkey_subvolume_init(&root_subvol.k_i); - root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL; - root_subvol.v.flags = 0; - root_subvol.v.snapshot = cpu_to_le32(snapshot); - root_subvol.v.inode = cpu_to_le64(inum); - ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0); + bkey_subvolume_init(&root_subvol->k_i); + root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL; + root_subvol->v.flags = 0; + root_subvol->v.snapshot = cpu_to_le32(snapshot); + root_subvol->v.inode = cpu_to_le64(inum); + ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0); bch_err_msg(c, ret, "writing root subvol"); if (ret) goto err; diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 2b5e0677..ca4a066e 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -552,8 +552,8 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out, prt_printf(out, "bi_sectors=%llu", inode->bi_sectors); prt_newline(out); - prt_newline(out); prt_printf(out, "bi_version=%llu", inode->bi_version); + prt_newline(out); #define x(_name, _bits) \ prt_printf(out, #_name "=%llu", (u64) inode->_name); \ diff --git a/libbcachefs/io_misc.c b/libbcachefs/io_misc.c index 1baf7859..82f9170d 100644 --- a/libbcachefs/io_misc.c +++ b/libbcachefs/io_misc.c @@ -264,6 +264,7 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans, ret = 0; err: bch2_logged_op_finish(trans, op_k); + bch_err_fn(c, ret); return ret; } @@ -476,6 +477,7 @@ case LOGGED_OP_FINSERT_finish: break; } err: + bch_err_fn(c, ret); bch2_logged_op_finish(trans, op_k); bch2_trans_iter_exit(trans, &iter); return ret; diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index f314b2e7..9c9a25db 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -511,18 +511,18 @@ retry: if (journal_res_get_fast(j, res, flags)) return 0; + if (bch2_journal_error(j)) + return -BCH_ERR_erofs_journal_err; + + if (j->blocked) + return -BCH_ERR_journal_res_get_blocked; + if ((flags & BCH_WATERMARK_MASK) < j->watermark) { ret = JOURNAL_ERR_journal_full; can_discard = j->can_discard; goto out; } - if (j->blocked) - return -BCH_ERR_journal_res_get_blocked; - - if (bch2_journal_error(j)) - return -BCH_ERR_erofs_journal_err; - if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) { ret = JOURNAL_ERR_max_in_flight; goto out; diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index d76c3c0c..725fcf46 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -1082,9 +1082,7 @@ reread: ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), j->encrypted_start, vstruct_end(j) - (void *) j->encrypted_start); - bch2_fs_fatal_err_on(ret, c, - "error decrypting journal entry: %s", - bch2_err_str(ret)); + bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)); mutex_lock(&jlist->lock); ret = journal_entry_add(c, ca, (struct journal_ptr) { @@ -1820,7 +1818,8 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) jset_entry_for_each_key(i, k) { ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); if (ret) { - bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer"); + bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s", + bch2_err_str(ret)); bch2_journal_keys_to_write_buffer_end(c, &wb); return ret; } @@ -1848,7 +1847,8 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) bch2_journal_super_entries_add_common(c, &end, seq); u64s = (u64 *) end - (u64 *) start; - BUG_ON(u64s > j->entry_u64s_reserved); + + WARN_ON(u64s > j->entry_u64s_reserved); le32_add_cpu(&jset->u64s, u64s); @@ -1856,7 +1856,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) bytes = vstruct_bytes(jset); if (sectors > w->sectors) { - bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)", + bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)", vstruct_bytes(jset), w->sectors << 9, u64s, w->u64s_reserved, j->entry_u64s_reserved); return -EINVAL; @@ -1884,8 +1884,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset->encrypted_start, vstruct_end(jset) - (void *) jset->encrypted_start); - if (bch2_fs_fatal_err_on(ret, c, - "error decrypting journal entry: %i", ret)) + if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret))) return ret; jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c index b5303874..37a024e0 100644 --- a/libbcachefs/journal_seq_blacklist.c +++ b/libbcachefs/journal_seq_blacklist.c @@ -95,8 +95,7 @@ out: return ret ?: bch2_blacklist_table_initialize(c); } -static int journal_seq_blacklist_table_cmp(const void *_l, - const void *_r, size_t size) +static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r) { const struct journal_seq_blacklist_table_entry *l = _l; const struct journal_seq_blacklist_table_entry *r = _r; diff --git a/libbcachefs/logged_ops.c b/libbcachefs/logged_ops.c index ad598105..9fac838d 100644 --- a/libbcachefs/logged_ops.c +++ b/libbcachefs/logged_ops.c @@ -101,8 +101,8 @@ void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k) struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); - bch2_fs_fatal_error(c, "%s: error deleting logged operation %s: %s", - __func__, buf.buf, bch2_err_str(ret)); + bch2_fs_fatal_error(c, "deleting logged operation %s: %s", + buf.buf, bch2_err_str(ret)); printbuf_exit(&buf); } } diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 69e06a84..0d2b82d8 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -155,8 +155,7 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, if (bch2_err_matches(ret, EROFS)) return ret; - if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_tryflush()", - __func__, bch2_err_str(ret))) + if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret))) return ret; ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 136083c1..9da66395 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -363,11 +363,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Don't replay the journal") \ - x(keep_journal, u8, \ + x(retain_recovery_info, u8, \ 0, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ - NULL, "Don't free journal entries/keys after startup")\ + NULL, "Don't free journal entries/keys, scanned btree nodes after startup")\ x(read_entire_journal, u8, \ 0, \ OPT_BOOL(), \ diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 2af219ae..b2b8fdf2 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -1,35 +1,31 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "backpointers.h" -#include "bkey_buf.h" #include "alloc_background.h" -#include "btree_gc.h" +#include "bkey_buf.h" #include "btree_journal_iter.h" +#include "btree_node_scan.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_io.h" #include "buckets.h" #include "dirent.h" -#include "ec.h" #include "errcode.h" #include "error.h" #include "fs-common.h" -#include "fsck.h" #include "journal_io.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" -#include "lru.h" #include "logged_ops.h" #include "move.h" #include "quota.h" #include "rebalance.h" #include "recovery.h" +#include "recovery_passes.h" #include "replicas.h" #include "sb-clean.h" #include "sb-downgrade.h" #include "snapshot.h" -#include "subvolume.h" #include "super-io.h" #include <linux/sort.h> @@ -52,7 +48,7 @@ static bool btree_id_is_alloc(enum btree_id id) } /* for -o reconstruct_alloc: */ -static void do_reconstruct_alloc(struct bch_fs *c) +static void bch2_reconstruct_alloc(struct bch_fs *c) { bch2_journal_log_msg(c, "dropping alloc info"); bch_info(c, "dropping and reconstructing all alloc info"); @@ -87,13 +83,17 @@ static void do_reconstruct_alloc(struct bch_fs *c) c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - struct journal_keys *keys = &c->journal_keys; - size_t src, dst; - for (src = 0, dst = 0; src < keys->nr; src++) - if (!btree_id_is_alloc(keys->data[src].btree_id)) - keys->data[dst++] = keys->data[src]; - keys->nr = dst; + bch2_shoot_down_journal_keys(c, BTREE_ID_alloc, + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers, + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard, + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + bch2_shoot_down_journal_keys(c, BTREE_ID_freespace, + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + bch2_shoot_down_journal_keys(c, BTREE_ID_bucket_gens, + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); } /* @@ -184,7 +184,7 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) return cmp_int(l->journal_seq, r->journal_seq); } -static int bch2_journal_replay(struct bch_fs *c) +int bch2_journal_replay(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; DARRAY(struct journal_key *) keys_sorted = { 0 }; @@ -192,6 +192,7 @@ static int bch2_journal_replay(struct bch_fs *c) u64 start_seq = c->journal_replay_seq_start; u64 end_seq = c->journal_replay_seq_start; struct btree_trans *trans = bch2_trans_get(c); + bool immediate_flush = false; int ret = 0; if (keys->nr) { @@ -203,6 +204,8 @@ static int bch2_journal_replay(struct bch_fs *c) BUG_ON(!atomic_read(&keys->ref)); + move_gap(keys, keys->nr); + /* * First, attempt to replay keys in sorted order. This is more * efficient - better locality of btree access - but some might fail if @@ -211,6 +214,13 @@ static int bch2_journal_replay(struct bch_fs *c) darray_for_each(*keys, k) { cond_resched(); + /* + * k->allocated means the key wasn't read in from the journal, + * rather it was from early repair code + */ + if (k->allocated) + immediate_flush = true; + /* Skip fastpath if we're low on space in the journal */ ret = c->journal.watermark ? -1 : commit_do(trans, NULL, NULL, @@ -262,7 +272,7 @@ static int bch2_journal_replay(struct bch_fs *c) bch2_trans_put(trans); trans = NULL; - if (!c->opts.keep_journal) + if (!c->opts.retain_recovery_info) bch2_journal_keys_put_initial(c); replay_now_at(j, j->replay_journal_seq_end); @@ -270,6 +280,12 @@ static int bch2_journal_replay(struct bch_fs *c) bch2_journal_set_replay_done(j); + /* if we did any repair, flush it immediately */ + if (immediate_flush) { + bch2_journal_flush_all_pins(&c->journal); + ret = bch2_journal_meta(&c->journal); + } + if (keys->nr) bch2_journal_log_msg(c, "journal replay finished"); err: @@ -419,10 +435,9 @@ static int journal_replay_early(struct bch_fs *c, static int read_btree_roots(struct bch_fs *c) { - unsigned i; int ret = 0; - for (i = 0; i < btree_id_nr_alive(c); i++) { + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); if (!r->alive) @@ -431,186 +446,46 @@ static int read_btree_roots(struct bch_fs *c) if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc) continue; - if (r->error) { - __fsck_err(c, - btree_id_is_alloc(i) - ? FSCK_CAN_IGNORE : 0, - btree_root_bkey_invalid, - "invalid btree root %s", - bch2_btree_id_str(i)); - if (i == BTREE_ID_alloc) + if (mustfix_fsck_err_on((ret = r->error), + c, btree_root_bkey_invalid, + "invalid btree root %s", + bch2_btree_id_str(i)) || + mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)), + c, btree_root_read_error, + "error reading btree root %s l=%u: %s", + bch2_btree_id_str(i), r->level, bch2_err_str(ret))) { + if (btree_id_is_alloc(i)) { + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs); c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); - } + r->error = 0; + } else if (!(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) { + bch_info(c, "will run btree node scan"); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); + } - ret = bch2_btree_root_read(c, i, &r->key, r->level); - if (ret) { - fsck_err(c, - btree_root_read_error, - "error reading btree root %s", - bch2_btree_id_str(i)); - if (btree_id_is_alloc(i)) - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + set_bit(i, &c->btrees_lost_data); ret = 0; } } - for (i = 0; i < BTREE_ID_NR; i++) { + for (unsigned i = 0; i < BTREE_ID_NR; i++) { struct btree_root *r = bch2_btree_id_root(c, i); - if (!r->b) { + if (!r->b && !r->error) { r->alive = false; r->level = 0; - bch2_btree_root_alloc(c, i); + bch2_btree_root_alloc_fake(c, i, 0); } } fsck_err: return ret; } -static int bch2_initialize_subvolumes(struct bch_fs *c) -{ - struct bkey_i_snapshot_tree root_tree; - struct bkey_i_snapshot root_snapshot; - struct bkey_i_subvolume root_volume; - int ret; - - bkey_snapshot_tree_init(&root_tree.k_i); - root_tree.k.p.offset = 1; - root_tree.v.master_subvol = cpu_to_le32(1); - root_tree.v.root_snapshot = cpu_to_le32(U32_MAX); - - bkey_snapshot_init(&root_snapshot.k_i); - root_snapshot.k.p.offset = U32_MAX; - root_snapshot.v.flags = 0; - root_snapshot.v.parent = 0; - root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL); - root_snapshot.v.tree = cpu_to_le32(1); - SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); - - bkey_subvolume_init(&root_volume.k_i); - root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; - root_volume.v.flags = 0; - root_volume.v.snapshot = cpu_to_le32(U32_MAX); - root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); - - ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?: - bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?: - bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0); - bch_err_fn(c, ret); - return ret; -} - -static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) -{ - struct btree_iter iter; - struct bkey_s_c k; - struct bch_inode_unpacked inode; - int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); - ret = bkey_err(k); - if (ret) - return ret; - - if (!bkey_is_inode(k.k)) { - bch_err(trans->c, "root inode not found"); - ret = -BCH_ERR_ENOENT_inode; - goto err; - } - - ret = bch2_inode_unpack(k, &inode); - BUG_ON(ret); - - inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; - - ret = bch2_inode_write(trans, &iter, &inode); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* set bi_subvol on root inode */ -noinline_for_stack -static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) -{ - int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, - __bch2_fs_upgrade_for_subvolumes(trans)); - bch_err_fn(c, ret); - return ret; -} - -const char * const bch2_recovery_passes[] = { -#define x(_fn, ...) #_fn, - BCH_RECOVERY_PASSES() -#undef x - NULL -}; - -static int bch2_check_allocations(struct bch_fs *c) -{ - return bch2_gc(c, true, c->opts.norecovery); -} - -static int bch2_set_may_go_rw(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - - /* - * After we go RW, the journal keys buffer can't be modified (except for - * setting journal_key->overwritten: it will be accessed by multiple - * threads - */ - move_gap(keys, keys->nr); - - set_bit(BCH_FS_may_go_rw, &c->flags); - - if (keys->nr || c->opts.fsck || !c->sb.clean) - return bch2_fs_read_write_early(c); - return 0; -} - -struct recovery_pass_fn { - int (*fn)(struct bch_fs *); - unsigned when; -}; - -static struct recovery_pass_fn recovery_pass_fns[] = { -#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when }, - BCH_RECOVERY_PASSES() -#undef x -}; - -u64 bch2_recovery_passes_to_stable(u64 v) -{ - static const u8 map[] = { -#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, - BCH_RECOVERY_PASSES() -#undef x - }; - - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(map); i++) - if (v & BIT_ULL(i)) - ret |= BIT_ULL(map[i]); - return ret; -} - -u64 bch2_recovery_passes_from_stable(u64 v) -{ - static const u8 map[] = { -#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, - BCH_RECOVERY_PASSES() -#undef x - }; - - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(map); i++) - if (v & BIT_ULL(i)) - ret |= BIT_ULL(map[i]); - return ret; -} - static bool check_version_upgrade(struct bch_fs *c) { unsigned latest_version = bcachefs_metadata_version_current; @@ -683,96 +558,6 @@ static bool check_version_upgrade(struct bch_fs *c) return false; } -u64 bch2_fsck_recovery_passes(void) -{ - u64 ret = 0; - - for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) - if (recovery_pass_fns[i].when & PASS_FSCK) - ret |= BIT_ULL(i); - return ret; -} - -static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) -{ - struct recovery_pass_fn *p = recovery_pass_fns + pass; - - if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) - return false; - if (c->recovery_passes_explicit & BIT_ULL(pass)) - return true; - if ((p->when & PASS_FSCK) && c->opts.fsck) - return true; - if ((p->when & PASS_UNCLEAN) && !c->sb.clean) - return true; - if (p->when & PASS_ALWAYS) - return true; - return false; -} - -static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) -{ - struct recovery_pass_fn *p = recovery_pass_fns + pass; - int ret; - - if (!(p->when & PASS_SILENT)) - bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), - bch2_recovery_passes[pass]); - ret = p->fn(c); - if (ret) - return ret; - if (!(p->when & PASS_SILENT)) - bch2_print(c, KERN_CONT " done\n"); - - return 0; -} - -static int bch2_run_recovery_passes(struct bch_fs *c) -{ - int ret = 0; - - while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { - if (should_run_recovery_pass(c, c->curr_recovery_pass)) { - unsigned pass = c->curr_recovery_pass; - - ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); - if (bch2_err_matches(ret, BCH_ERR_restart_recovery) || - (ret && c->curr_recovery_pass < pass)) - continue; - if (ret) - break; - - c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass); - } - c->curr_recovery_pass++; - c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); - } - - return ret; -} - -int bch2_run_online_recovery_passes(struct bch_fs *c) -{ - int ret = 0; - - for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { - struct recovery_pass_fn *p = recovery_pass_fns + i; - - if (!(p->when & PASS_ONLINE)) - continue; - - ret = bch2_run_recovery_pass(c, i); - if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) { - i = c->curr_recovery_pass; - continue; - } - if (ret) - break; - } - - return ret; -} - int bch2_fs_recovery(struct bch_fs *c) { struct bch_sb_field_clean *clean = NULL; @@ -811,6 +596,9 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } + c->opts.retain_recovery_info |= c->opts.norecovery; + c->opts.nochanges |= c->opts.norecovery; + if (!c->opts.nochanges) { mutex_lock(&c->sb_lock); bool write_sb = false; @@ -881,7 +669,7 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { + if (!c->sb.clean || c->opts.fsck || c->opts.retain_recovery_info) { struct genradix_iter iter; struct journal_replay **i; @@ -961,7 +749,7 @@ use_clean: c->journal_replay_seq_end = blacklist_seq - 1; if (c->opts.reconstruct_alloc) - do_reconstruct_alloc(c); + bch2_reconstruct_alloc(c); zero_out_btree_mem_ptr(&c->journal_keys); @@ -1013,6 +801,12 @@ use_clean: clear_bit(BCH_FS_fsck_running, &c->flags); + /* fsync if we fixed errors */ + if (test_bit(BCH_FS_errors_fixed, &c->flags)) { + bch2_journal_flush_all_pins(&c->journal); + bch2_journal_meta(&c->journal); + } + /* If we fixed errors, verify that fs is actually clean now: */ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && test_bit(BCH_FS_errors_fixed, &c->flags) && @@ -1109,9 +903,10 @@ use_clean: out: bch2_flush_fsck_errs(c); - if (!c->opts.keep_journal && - test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + if (!c->opts.retain_recovery_info) { bch2_journal_keys_put_initial(c); + bch2_find_btree_nodes_exit(&c->found_btree_nodes); + } kfree(clean); if (!ret && @@ -1151,11 +946,11 @@ int bch2_fs_initialize(struct bch_fs *c) } mutex_unlock(&c->sb_lock); - c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns); + c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; set_bit(BCH_FS_may_go_rw, &c->flags); for (unsigned i = 0; i < BTREE_ID_NR; i++) - bch2_btree_root_alloc(c, i); + bch2_btree_root_alloc_fake(c, i, 0); for_each_member_device(c, ca) bch2_dev_usage_init(ca); @@ -1226,7 +1021,7 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - c->recovery_pass_done = ARRAY_SIZE(recovery_pass_fns) - 1; + c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1; if (enabled_qtypes(c)) { ret = bch2_fs_quota_read(c); diff --git a/libbcachefs/recovery.h b/libbcachefs/recovery.h index 4e9d2471..3962fd87 100644 --- a/libbcachefs/recovery.h +++ b/libbcachefs/recovery.h @@ -2,37 +2,7 @@ #ifndef _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H -extern const char * const bch2_recovery_passes[]; - -u64 bch2_recovery_passes_to_stable(u64 v); -u64 bch2_recovery_passes_from_stable(u64 v); - -/* - * For when we need to rewind recovery passes and run a pass we skipped: - */ -static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, - enum bch_recovery_pass pass) -{ - if (c->recovery_passes_explicit & BIT_ULL(pass)) - return 0; - - bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", - bch2_recovery_passes[pass], pass, - bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); - - c->recovery_passes_explicit |= BIT_ULL(pass); - - if (c->curr_recovery_pass >= pass) { - c->curr_recovery_pass = pass; - c->recovery_passes_complete &= (1ULL << pass) >> 1; - return -BCH_ERR_restart_recovery; - } else { - return 0; - } -} - -int bch2_run_online_recovery_passes(struct bch_fs *); -u64 bch2_fsck_recovery_passes(void); +int bch2_journal_replay(struct bch_fs *); int bch2_fs_recovery(struct bch_fs *); int bch2_fs_initialize(struct bch_fs *); diff --git a/libbcachefs/recovery_passes.c b/libbcachefs/recovery_passes.c new file mode 100644 index 00000000..6fa095fe --- /dev/null +++ b/libbcachefs/recovery_passes.c @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "alloc_background.h" +#include "backpointers.h" +#include "btree_gc.h" +#include "btree_node_scan.h" +#include "ec.h" +#include "fsck.h" +#include "inode.h" +#include "journal.h" +#include "lru.h" +#include "logged_ops.h" +#include "rebalance.h" +#include "recovery.h" +#include "recovery_passes.h" +#include "snapshot.h" +#include "subvolume.h" +#include "super.h" + +const char * const bch2_recovery_passes[] = { +#define x(_fn, ...) #_fn, + BCH_RECOVERY_PASSES() +#undef x + NULL +}; + +static int bch2_check_allocations(struct bch_fs *c) +{ + return bch2_gc(c, true, c->opts.norecovery); +} + +static int bch2_set_may_go_rw(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + + /* + * After we go RW, the journal keys buffer can't be modified (except for + * setting journal_key->overwritten: it will be accessed by multiple + * threads + */ + move_gap(keys, keys->nr); + + set_bit(BCH_FS_may_go_rw, &c->flags); + + if (keys->nr || c->opts.fsck || !c->sb.clean) + return bch2_fs_read_write_early(c); + return 0; +} + +struct recovery_pass_fn { + int (*fn)(struct bch_fs *); + unsigned when; +}; + +static struct recovery_pass_fn recovery_pass_fns[] = { +#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when }, + BCH_RECOVERY_PASSES() +#undef x +}; + +u64 bch2_recovery_passes_to_stable(u64 v) +{ + static const u8 map[] = { +#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, + BCH_RECOVERY_PASSES() +#undef x + }; + + u64 ret = 0; + for (unsigned i = 0; i < ARRAY_SIZE(map); i++) + if (v & BIT_ULL(i)) + ret |= BIT_ULL(map[i]); + return ret; +} + +u64 bch2_recovery_passes_from_stable(u64 v) +{ + static const u8 map[] = { +#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, + BCH_RECOVERY_PASSES() +#undef x + }; + + u64 ret = 0; + for (unsigned i = 0; i < ARRAY_SIZE(map); i++) + if (v & BIT_ULL(i)) + ret |= BIT_ULL(map[i]); + return ret; +} + +/* + * For when we need to rewind recovery passes and run a pass we skipped: + */ +int bch2_run_explicit_recovery_pass(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + if (c->recovery_passes_explicit & BIT_ULL(pass)) + return 0; + + bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", + bch2_recovery_passes[pass], pass, + bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); + + c->recovery_passes_explicit |= BIT_ULL(pass); + + if (c->curr_recovery_pass >= pass) { + c->curr_recovery_pass = pass; + c->recovery_passes_complete &= (1ULL << pass) >> 1; + return -BCH_ERR_restart_recovery; + } else { + return 0; + } +} + +u64 bch2_fsck_recovery_passes(void) +{ + u64 ret = 0; + + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) + if (recovery_pass_fns[i].when & PASS_FSCK) + ret |= BIT_ULL(i); + return ret; +} + +static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) +{ + struct recovery_pass_fn *p = recovery_pass_fns + pass; + + if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) + return false; + if (c->recovery_passes_explicit & BIT_ULL(pass)) + return true; + if ((p->when & PASS_FSCK) && c->opts.fsck) + return true; + if ((p->when & PASS_UNCLEAN) && !c->sb.clean) + return true; + if (p->when & PASS_ALWAYS) + return true; + return false; +} + +static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) +{ + struct recovery_pass_fn *p = recovery_pass_fns + pass; + int ret; + + if (!(p->when & PASS_SILENT)) + bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), + bch2_recovery_passes[pass]); + ret = p->fn(c); + if (ret) + return ret; + if (!(p->when & PASS_SILENT)) + bch2_print(c, KERN_CONT " done\n"); + + return 0; +} + +int bch2_run_online_recovery_passes(struct bch_fs *c) +{ + int ret = 0; + + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { + struct recovery_pass_fn *p = recovery_pass_fns + i; + + if (!(p->when & PASS_ONLINE)) + continue; + + ret = bch2_run_recovery_pass(c, i); + if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) { + i = c->curr_recovery_pass; + continue; + } + if (ret) + break; + } + + return ret; +} + +int bch2_run_recovery_passes(struct bch_fs *c) +{ + int ret = 0; + + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { + if (should_run_recovery_pass(c, c->curr_recovery_pass)) { + unsigned pass = c->curr_recovery_pass; + + ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); + if (bch2_err_matches(ret, BCH_ERR_restart_recovery) || + (ret && c->curr_recovery_pass < pass)) + continue; + if (ret) + break; + + c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass); + } + c->curr_recovery_pass++; + c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); + } + + return ret; +} diff --git a/libbcachefs/recovery_passes.h b/libbcachefs/recovery_passes.h new file mode 100644 index 00000000..abefa677 --- /dev/null +++ b/libbcachefs/recovery_passes.h @@ -0,0 +1,16 @@ +#ifndef _BCACHEFS_RECOVERY_PASSES_H +#define _BCACHEFS_RECOVERY_PASSES_H + +extern const char * const bch2_recovery_passes[]; + +u64 bch2_recovery_passes_to_stable(u64 v); +u64 bch2_recovery_passes_from_stable(u64 v); + +u64 bch2_fsck_recovery_passes(void); + +int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); + +int bch2_run_online_recovery_passes(struct bch_fs *); +int bch2_run_recovery_passes(struct bch_fs *); + +#endif /* _BCACHEFS_RECOVERY_PASSES_H */ diff --git a/libbcachefs/recovery_types.h b/libbcachefs/recovery_passes_types.h similarity index 92% rename from libbcachefs/recovery_types.h rename to libbcachefs/recovery_passes_types.h index 1361e34d..840542cf 100644 --- a/libbcachefs/recovery_types.h +++ b/libbcachefs/recovery_passes_types.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_RECOVERY_TYPES_H -#define _BCACHEFS_RECOVERY_TYPES_H +#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H +#define _BCACHEFS_RECOVERY_PASSES_TYPES_H #define PASS_SILENT BIT(0) #define PASS_FSCK BIT(1) @@ -13,11 +13,12 @@ * must never change: */ #define BCH_RECOVERY_PASSES() \ + x(scan_for_btree_nodes, 37, 0) \ + x(check_topology, 4, 0) \ x(alloc_read, 0, PASS_ALWAYS) \ x(stripes_read, 1, PASS_ALWAYS) \ x(initialize_subvolumes, 2, 0) \ x(snapshots_read, 3, PASS_ALWAYS) \ - x(check_topology, 4, 0) \ x(check_allocations, 5, PASS_FSCK) \ x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \ x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \ @@ -37,7 +38,6 @@ x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ x(fs_upgrade_for_subvolumes, 22, 0) \ - x(resume_logged_ops, 23, PASS_ALWAYS) \ x(check_inodes, 24, PASS_FSCK) \ x(check_extents, 25, PASS_FSCK) \ x(check_indirect_extents, 26, PASS_FSCK) \ @@ -47,6 +47,7 @@ x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ x(check_nlinks, 31, PASS_FSCK) \ + x(resume_logged_ops, 23, PASS_ALWAYS) \ x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \ x(fix_reflink_p, 33, 0) \ x(set_fs_needs_rebalance, 34, 0) \ @@ -56,6 +57,7 @@ enum bch_recovery_pass { #define x(n, id, when) BCH_RECOVERY_PASS_##n, BCH_RECOVERY_PASSES() #undef x + BCH_RECOVERY_PASS_NR }; /* But we also need stable identifiers that can be used in the superblock */ @@ -65,4 +67,4 @@ enum bch_recovery_pass_stable { #undef x }; -#endif /* _BCACHEFS_RECOVERY_TYPES_H */ +#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */ diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c index c47c66c2..ff786473 100644 --- a/libbcachefs/reflink.c +++ b/libbcachefs/reflink.c @@ -185,8 +185,7 @@ not_found: } else { bkey_error_init(update); update->k.p = p.k->p; - update->k.p.offset = next_idx; - update->k.size = next_idx - *idx; + update->k.size = p.k->size; set_bkey_val_u64s(&update->k, 0); } diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index cc2672c1..678b9c20 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -6,12 +6,15 @@ #include "replicas.h" #include "super-io.h" +#include <linux/sort.h> + static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, struct bch_replicas_cpu *); /* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */ -static int bch2_memcmp(const void *l, const void *r, size_t size) +static int bch2_memcmp(const void *l, const void *r, const void *priv) { + size_t size = (size_t) priv; return memcmp(l, r, size); } @@ -39,7 +42,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e) static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) { - eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL); + eytzinger0_sort_r(r->entries, r->nr, r->entry_size, + bch2_memcmp, NULL, (void *)(size_t)r->entry_size); } static void bch2_replicas_entry_v0_to_text(struct printbuf *out, @@ -228,7 +232,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, verify_replicas_entry(search); -#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size) +#define entry_cmp(_l, _r) memcmp(_l, _r, entry_size) idx = eytzinger0_find(r->entries, r->nr, r->entry_size, entry_cmp, search); #undef entry_cmp @@ -824,10 +828,11 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, { unsigned i; - sort_cmp_size(cpu_r->entries, - cpu_r->nr, - cpu_r->entry_size, - bch2_memcmp, NULL); + sort_r(cpu_r->entries, + cpu_r->nr, + cpu_r->entry_size, + bch2_memcmp, NULL, + (void *)(size_t)cpu_r->entry_size); for (i = 0; i < cpu_r->nr; i++) { struct bch_replicas_entry_v1 *e = diff --git a/libbcachefs/sb-downgrade.c b/libbcachefs/sb-downgrade.c index e4396cb0..d6f81179 100644 --- a/libbcachefs/sb-downgrade.c +++ b/libbcachefs/sb-downgrade.c @@ -7,7 +7,7 @@ #include "bcachefs.h" #include "darray.h" -#include "recovery.h" +#include "recovery_passes.h" #include "sb-downgrade.h" #include "sb-errors.h" #include "super-io.h" diff --git a/libbcachefs/sb-errors_types.h b/libbcachefs/sb-errors_types.h index 5178bf57..2fec03a2 100644 --- a/libbcachefs/sb-errors_types.h +++ b/libbcachefs/sb-errors_types.h @@ -265,7 +265,10 @@ x(subvol_children_bad, 257) \ x(subvol_loop, 258) \ x(subvol_unreachable, 259) \ - x(btree_node_bkey_bad_u64s, 260) + x(btree_node_bkey_bad_u64s, 260) \ + x(btree_node_topology_empty_interior_node, 261) \ + x(btree_ptr_v2_min_key_bad, 262) \ + x(btree_root_unreadable_and_scan_found_nothing, 263) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, diff --git a/libbcachefs/snapshot.c b/libbcachefs/snapshot.c index ac6ba04d..4e074136 100644 --- a/libbcachefs/snapshot.c +++ b/libbcachefs/snapshot.c @@ -91,23 +91,29 @@ static int bch2_snapshot_tree_create(struct btree_trans *trans, /* Snapshot nodes: */ +static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor) +{ + while (id && id < ancestor) { + const struct snapshot_t *s = __snapshot_t(t, id); + id = s ? s->parent : 0; + } + return id == ancestor; +} + static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) { - struct snapshot_table *t; - rcu_read_lock(); - t = rcu_dereference(c->snapshots); - - while (id && id < ancestor) - id = __snapshot_t(t, id)->parent; + bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor); rcu_read_unlock(); - return id == ancestor; + return ret; } static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) { const struct snapshot_t *s = __snapshot_t(t, id); + if (!s) + return 0; if (s->skip[2] <= ancestor) return s->skip[2]; @@ -120,13 +126,15 @@ static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ances bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) { - struct snapshot_table *t; bool ret; - EBUG_ON(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots); - rcu_read_lock(); - t = rcu_dereference(c->snapshots); + struct snapshot_table *t = rcu_dereference(c->snapshots); + + if (unlikely(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots)) { + ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor); + goto out; + } while (id && id < ancestor - IS_ANCESTOR_BITMAP) id = get_ancestor_below(t, id, ancestor); @@ -134,11 +142,11 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) if (id && id < ancestor) { ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor); - EBUG_ON(ret != bch2_snapshot_is_ancestor_early(c, id, ancestor)); + EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor)); } else { ret = id == ancestor; } - +out: rcu_read_unlock(); return ret; @@ -147,36 +155,39 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) { size_t idx = U32_MAX - id; - size_t new_size; struct snapshot_table *new, *old; - new_size = max(16UL, roundup_pow_of_two(idx + 1)); + size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1)); + size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]); - new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL); + new = kvzalloc(new_bytes, GFP_KERNEL); if (!new) return NULL; + new->nr = new_size; + old = rcu_dereference_protected(c->snapshots, true); if (old) - memcpy(new->s, - rcu_dereference_protected(c->snapshots, true)->s, - sizeof(new->s[0]) * c->snapshot_table_size); + memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr); rcu_assign_pointer(c->snapshots, new); - c->snapshot_table_size = new_size; - kvfree_rcu_mightsleep(old); + kvfree_rcu(old, rcu); - return &rcu_dereference_protected(c->snapshots, true)->s[idx]; + return &rcu_dereference_protected(c->snapshots, + lockdep_is_held(&c->snapshot_table_lock))->s[idx]; } static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id) { size_t idx = U32_MAX - id; + struct snapshot_table *table = + rcu_dereference_protected(c->snapshots, + lockdep_is_held(&c->snapshot_table_lock)); lockdep_assert_held(&c->snapshot_table_lock); - if (likely(idx < c->snapshot_table_size)) - return &rcu_dereference_protected(c->snapshots, true)->s[idx]; + if (likely(table && idx < table->nr)) + return &table->s[idx]; return __snapshot_t_mut(c, id); } @@ -547,7 +558,7 @@ static int check_snapshot_tree(struct btree_trans *trans, "snapshot tree points to missing subvolume:\n %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || - fsck_err_on(!bch2_snapshot_is_ancestor_early(c, + fsck_err_on(!bch2_snapshot_is_ancestor(c, le32_to_cpu(subvol.snapshot), root_id), c, snapshot_tree_to_wrong_subvol, diff --git a/libbcachefs/snapshot.h b/libbcachefs/snapshot.h index 7c66ffc0..331f20fd 100644 --- a/libbcachefs/snapshot.h +++ b/libbcachefs/snapshot.h @@ -33,7 +33,11 @@ int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id) { - return &t->s[U32_MAX - id]; + u32 idx = U32_MAX - id; + + return likely(t && idx < t->nr) + ? &t->s[idx] + : NULL; } static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) @@ -44,7 +48,8 @@ static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) { rcu_read_lock(); - id = snapshot_t(c, id)->tree; + const struct snapshot_t *s = snapshot_t(c, id); + id = s ? s->tree : 0; rcu_read_unlock(); return id; @@ -52,7 +57,8 @@ static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) { - return snapshot_t(c, id)->parent; + const struct snapshot_t *s = snapshot_t(c, id); + return s ? s->parent : 0; } static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) @@ -66,19 +72,19 @@ static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) { -#ifdef CONFIG_BCACHEFS_DEBUG - u32 parent = snapshot_t(c, id)->parent; + const struct snapshot_t *s = snapshot_t(c, id); + if (!s) + return 0; - if (parent && - snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1) + u32 parent = s->parent; + if (IS_ENABLED(CONFIG_BCACHEFS_DEBU) && + parent && + s->depth != snapshot_t(c, parent)->depth + 1) panic("id %u depth=%u parent %u depth=%u\n", id, snapshot_t(c, id)->depth, parent, snapshot_t(c, parent)->depth); return parent; -#else - return snapshot_t(c, id)->parent; -#endif } static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) @@ -116,7 +122,8 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) { - return snapshot_t(c, id)->equiv; + const struct snapshot_t *s = snapshot_t(c, id); + return s ? s->equiv : 0; } static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) @@ -133,38 +140,22 @@ static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) return id == bch2_snapshot_equiv(c, id); } -static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) +static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) { - const struct snapshot_t *s; - bool ret; - rcu_read_lock(); - s = snapshot_t(c, id); - ret = s->children[0]; + const struct snapshot_t *s = snapshot_t(c, id); + int ret = s ? s->children[0] : -BCH_ERR_invalid_snapshot_node; rcu_read_unlock(); return ret; } -static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) +static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) { - return !bch2_snapshot_is_internal_node(c, id); -} - -static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s; - u32 parent = __bch2_snapshot_parent(c, id); - - if (!parent) - return 0; - - s = snapshot_t(c, __bch2_snapshot_parent(c, id)); - if (id == s->children[0]) - return s->children[1]; - if (id == s->children[1]) - return s->children[0]; - return 0; + int ret = bch2_snapshot_is_internal_node(c, id); + if (ret < 0) + return ret; + return !ret; } static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) @@ -249,7 +240,7 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, struct bpos pos) { if (!btree_type_has_snapshots(id) || - bch2_snapshot_is_leaf(trans->c, pos.snapshot)) + bch2_snapshot_is_leaf(trans->c, pos.snapshot) > 0) return 0; return __bch2_key_has_snapshot_overwrites(trans, id, pos); diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c index ce7aed12..88a79c82 100644 --- a/libbcachefs/subvolume.c +++ b/libbcachefs/subvolume.c @@ -595,6 +595,78 @@ err: return ret; } +int bch2_initialize_subvolumes(struct bch_fs *c) +{ + struct bkey_i_snapshot_tree root_tree; + struct bkey_i_snapshot root_snapshot; + struct bkey_i_subvolume root_volume; + int ret; + + bkey_snapshot_tree_init(&root_tree.k_i); + root_tree.k.p.offset = 1; + root_tree.v.master_subvol = cpu_to_le32(1); + root_tree.v.root_snapshot = cpu_to_le32(U32_MAX); + + bkey_snapshot_init(&root_snapshot.k_i); + root_snapshot.k.p.offset = U32_MAX; + root_snapshot.v.flags = 0; + root_snapshot.v.parent = 0; + root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL); + root_snapshot.v.tree = cpu_to_le32(1); + SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); + + bkey_subvolume_init(&root_volume.k_i); + root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; + root_volume.v.flags = 0; + root_volume.v.snapshot = cpu_to_le32(U32_MAX); + root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); + + ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?: + bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?: + bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0); + bch_err_fn(c, ret); + return ret; +} + +static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bch_inode_unpacked inode; + int ret; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); + ret = bkey_err(k); + if (ret) + return ret; + + if (!bkey_is_inode(k.k)) { + bch_err(trans->c, "root inode not found"); + ret = -BCH_ERR_ENOENT_inode; + goto err; + } + + ret = bch2_inode_unpack(k, &inode); + BUG_ON(ret); + + inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; + + ret = bch2_inode_write(trans, &iter, &inode); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* set bi_subvol on root inode */ +int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) +{ + int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, + __bch2_fs_upgrade_for_subvolumes(trans)); + bch_err_fn(c, ret); + return ret; +} + int bch2_fs_subvolumes_init(struct bch_fs *c) { INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h index 903c0516..d2015d54 100644 --- a/libbcachefs/subvolume.h +++ b/libbcachefs/subvolume.h @@ -37,6 +37,9 @@ void bch2_delete_dead_snapshots_async(struct bch_fs *); int bch2_subvolume_unlink(struct btree_trans *, u32); int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool); +int bch2_initialize_subvolumes(struct bch_fs *); +int bch2_fs_upgrade_for_subvolumes(struct bch_fs *); + int bch2_fs_subvolumes_init(struct bch_fs *); #endif /* _BCACHEFS_SUBVOLUME_H */ diff --git a/libbcachefs/subvolume_types.h b/libbcachefs/subvolume_types.h index ae644adf..9b10c894 100644 --- a/libbcachefs/subvolume_types.h +++ b/libbcachefs/subvolume_types.h @@ -20,6 +20,8 @@ struct snapshot_t { }; struct snapshot_table { + struct rcu_head rcu; + size_t nr; #ifndef RUST_BINDGEN DECLARE_FLEX_ARRAY(struct snapshot_t, s); #else diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 010daebf..61fc0ce7 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -8,7 +8,7 @@ #include "journal.h" #include "journal_sb.h" #include "journal_seq_blacklist.h" -#include "recovery.h" +#include "recovery_passes.h" #include "replicas.h" #include "quota.h" #include "sb-clean.h" @@ -985,7 +985,7 @@ int bch2_write_super(struct bch_fs *c) prt_str(&buf, " > "); bch2_version_to_text(&buf, bcachefs_metadata_version_current); prt_str(&buf, ")"); - bch2_fs_fatal_error(c, "%s", buf.buf); + bch2_fs_fatal_error(c, ": %s", buf.buf); printbuf_exit(&buf); return -BCH_ERR_sb_not_downgraded; } @@ -1005,7 +1005,7 @@ int bch2_write_super(struct bch_fs *c) if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) { bch2_fs_fatal_error(c, - "Superblock write was silently dropped! (seq %llu expected %llu)", + ": Superblock write was silently dropped! (seq %llu expected %llu)", le64_to_cpu(ca->sb_read_scratch->seq), ca->disk_sb.seq); percpu_ref_put(&ca->io_ref); @@ -1015,7 +1015,7 @@ int bch2_write_super(struct bch_fs *c) if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { bch2_fs_fatal_error(c, - "Superblock modified by another process (seq %llu expected %llu)", + ": Superblock modified by another process (seq %llu expected %llu)", le64_to_cpu(ca->sb_read_scratch->seq), ca->disk_sb.seq); percpu_ref_put(&ca->io_ref); @@ -1066,7 +1066,7 @@ int bch2_write_super(struct bch_fs *c) !can_mount_with_written || (can_mount_without_written && !can_mount_with_written), c, - "Unable to write superblock to sufficient devices (from %ps)", + ": Unable to write superblock to sufficient devices (from %ps)", (void *) _RET_IP_)) ret = -1; out: diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 1cabdd47..6269bec9 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -15,6 +15,7 @@ #include "btree_gc.h" #include "btree_journal_iter.h" #include "btree_key_cache.h" +#include "btree_node_scan.h" #include "btree_update_interior.h" #include "btree_io.h" #include "btree_write_buffer.h" @@ -87,6 +88,7 @@ const char * const bch2_fs_flag_strs[] = { NULL }; +__printf(2, 0) static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args) { #ifdef __KERNEL__ @@ -534,6 +536,7 @@ static void __bch2_fs_free(struct bch_fs *c) for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); + bch2_find_btree_nodes_exit(&c->found_btree_nodes); bch2_free_pending_node_rewrites(c); bch2_fs_sb_errors_exit(c); bch2_fs_counters_exit(c); @@ -558,6 +561,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); bch2_journal_keys_put_initial(c); + bch2_find_btree_nodes_exit(&c->found_btree_nodes); BUG_ON(atomic_read(&c->journal_keys.ref)); bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); diff --git a/libbcachefs/util.c b/libbcachefs/util.c index 216fadf1..92c6ad75 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -707,149 +707,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) } } -static int alignment_ok(const void *base, size_t align) -{ - return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || - ((unsigned long)base & (align - 1)) == 0; -} - -static void u32_swap(void *a, void *b, size_t size) -{ - u32 t = *(u32 *)a; - *(u32 *)a = *(u32 *)b; - *(u32 *)b = t; -} - -static void u64_swap(void *a, void *b, size_t size) -{ - u64 t = *(u64 *)a; - *(u64 *)a = *(u64 *)b; - *(u64 *)b = t; -} - -static void generic_swap(void *a, void *b, size_t size) -{ - char t; - - do { - t = *(char *)a; - *(char *)a++ = *(char *)b; - *(char *)b++ = t; - } while (--size > 0); -} - -static inline int do_cmp(void *base, size_t n, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - size_t l, size_t r) -{ - return cmp_func(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, - size); -} - -static inline void do_swap(void *base, size_t n, size_t size, - void (*swap_func)(void *, void *, size_t), - size_t l, size_t r) -{ - swap_func(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, - size); -} - -void eytzinger0_sort(void *base, size_t n, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t)) -{ - int i, c, r; - - if (!swap_func) { - if (size == 4 && alignment_ok(base, 4)) - swap_func = u32_swap; - else if (size == 8 && alignment_ok(base, 8)) - swap_func = u64_swap; - else - swap_func = generic_swap; - } - - /* heapify */ - for (i = n / 2 - 1; i >= 0; --i) { - for (r = i; r * 2 + 1 < n; r = c) { - c = r * 2 + 1; - - if (c + 1 < n && - do_cmp(base, n, size, cmp_func, c, c + 1) < 0) - c++; - - if (do_cmp(base, n, size, cmp_func, r, c) >= 0) - break; - - do_swap(base, n, size, swap_func, r, c); - } - } - - /* sort */ - for (i = n - 1; i > 0; --i) { - do_swap(base, n, size, swap_func, 0, i); - - for (r = 0; r * 2 + 1 < i; r = c) { - c = r * 2 + 1; - - if (c + 1 < i && - do_cmp(base, n, size, cmp_func, c, c + 1) < 0) - c++; - - if (do_cmp(base, n, size, cmp_func, r, c) >= 0) - break; - - do_swap(base, n, size, swap_func, r, c); - } - } -} - -void sort_cmp_size(void *base, size_t num, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t size)) -{ - /* pre-scale counters for performance */ - int i = (num/2 - 1) * size, n = num * size, c, r; - - if (!swap_func) { - if (size == 4 && alignment_ok(base, 4)) - swap_func = u32_swap; - else if (size == 8 && alignment_ok(base, 8)) - swap_func = u64_swap; - else - swap_func = generic_swap; - } - - /* heapify */ - for ( ; i >= 0; i -= size) { - for (r = i; r * 2 + size < n; r = c) { - c = r * 2 + size; - if (c < n - size && - cmp_func(base + c, base + c + size, size) < 0) - c += size; - if (cmp_func(base + r, base + c, size) >= 0) - break; - swap_func(base + r, base + c, size); - } - } - - /* sort */ - for (i = n - size; i > 0; i -= size) { - swap_func(base, base + i, size); - for (r = 0; r * 2 + size < i; r = c) { - c = r * 2 + size; - if (c < i - size && - cmp_func(base + c, base + c + size, size) < 0) - c += size; - if (cmp_func(base + r, base + c, size) >= 0) - break; - swap_func(base + r, base + c, size); - } - } -} - #if 0 void eytzinger1_test(void) { diff --git a/libbcachefs/util.h b/libbcachefs/util.h index 7ffbddb8..b4621b68 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -631,10 +631,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes) memset(s + bytes, c, rem); } -void sort_cmp_size(void *base, size_t num, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t)); - /* just the memmove, doesn't update @_nr */ #define __array_insert_item(_array, _nr, _pos) \ memmove(&(_array)[(_pos) + 1], \ @@ -683,6 +679,9 @@ static inline void __move_gap(void *array, size_t element_size, /* Move the gap in a gap buffer: */ #define move_gap(_d, _new_gap) \ do { \ + BUG_ON(_new_gap > (_d)->nr); \ + BUG_ON((_d)->gap > (_d)->nr); \ + \ __move_gap((_d)->data, sizeof((_d)->data[0]), \ (_d)->nr, (_d)->size, (_d)->gap, _new_gap); \ (_d)->gap = _new_gap; \ diff --git a/linux/bio.c b/linux/bio.c index 93a791c4..08dcd584 100644 --- a/linux/bio.c +++ b/linux/bio.c @@ -306,6 +306,20 @@ struct bio *bio_kmalloc(unsigned int nr_iovecs, gfp_t gfp_mask) return bio; } +struct bio *bio_alloc(struct block_device *bdev, unsigned nr_iovecs, + blk_opf_t opf, gfp_t gfp_mask) +{ + struct bio *bio; + + bio = kmalloc(sizeof(struct bio) + + sizeof(struct bio_vec) * nr_iovecs, gfp_mask); + if (unlikely(!bio)) + return NULL; + bio_init(bio, bdev, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs, opf); + bio->bi_pool = NULL; + return bio; +} + static struct bio_vec *bvec_alloc(mempool_t *pool, int *nr_vecs, gfp_t gfp_mask) {