diff --git a/.bcachefs_revision b/.bcachefs_revision index e086bdcf..193d8bd7 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -42284b8b2bb980c80140b640de7cb12bc1e4541c +aa439f3b94eb3141f9b6d71f780300e7fef44af9 diff --git a/cmd_migrate.c b/cmd_migrate.c index bde7288b..fc863f89 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -605,8 +605,6 @@ static void copy_fs(struct bch_fs *c, int src_fd, const char *src_path, darray_free(s.extents); genradix_free(&s.hardlinks); - - bch2_alloc_write_all(c, false); } static void find_superblock_space(ranges extents, diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index eb2e6642..df340ebb 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -38,15 +38,6 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { #undef x }; -struct bkey_alloc_buf { - struct bkey_i k; - struct bch_alloc_v3 v; - -#define x(_name, _bits) + _bits / 8 - u8 _pad[0 + BCH_ALLOC_FIELDS_V2()]; -#undef x -} __attribute__((packed, aligned(8))); - /* Persistent alloc info: */ static inline u64 alloc_field_v1_get(const struct bch_alloc *a, @@ -253,24 +244,25 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) return ret; } -static void bch2_alloc_pack(struct bch_fs *c, - struct bkey_alloc_buf *dst, - const struct bkey_alloc_unpacked src) +struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *trans, + const struct bkey_alloc_unpacked src) { - bch2_alloc_pack_v3(dst, src); + struct bkey_alloc_buf *dst; + + dst = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); + if (!IS_ERR(dst)) + bch2_alloc_pack_v3(dst, src); + + return dst; } int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter, struct bkey_alloc_unpacked *u, unsigned trigger_flags) { - struct bkey_alloc_buf *a; + struct bkey_alloc_buf *a = bch2_alloc_pack(trans, *u); - a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); - if (IS_ERR(a)) - return PTR_ERR(a); - - bch2_alloc_pack(trans->c, a, *u); - return bch2_trans_update(trans, iter, &a->k, trigger_flags); + return PTR_ERR_OR_ZERO(a) ?: + bch2_trans_update(trans, iter, &a->k, trigger_flags); } static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) @@ -340,119 +332,54 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, #undef x } -static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bch_dev *ca; - struct bucket *g; - struct bkey_alloc_unpacked u; - - if (!bkey_is_alloc(k.k)) - return 0; - - ca = bch_dev_bkey_exists(c, k.k->p.inode); - g = bucket(ca, k.k->p.offset); - u = bch2_alloc_unpack(k); - - *bucket_gen(ca, k.k->p.offset) = u.gen; - g->_mark.gen = u.gen; - g->_mark.data_type = u.data_type; - g->_mark.dirty_sectors = u.dirty_sectors; - g->_mark.cached_sectors = u.cached_sectors; - g->_mark.stripe = u.stripe != 0; - g->stripe = u.stripe; - g->stripe_redundancy = u.stripe_redundancy; - g->io_time[READ] = u.read_time; - g->io_time[WRITE] = u.write_time; - g->oldest_gen = u.oldest_gen; - g->gen_valid = 1; - - return 0; -} - -int bch2_alloc_read(struct bch_fs *c) -{ - struct btree_trans trans; - int ret; - - bch2_trans_init(&trans, c, 0, 0); - down_read(&c->gc_lock); - ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_alloc, bch2_alloc_read_fn); - up_read(&c->gc_lock); - bch2_trans_exit(&trans); - if (ret) { - bch_err(c, "error reading alloc info: %i", ret); - return ret; - } - - return 0; -} - -static int bch2_alloc_write_key(struct btree_trans *trans, - struct 
btree_iter *iter, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c k; - struct bkey_alloc_unpacked old_u, new_u; - int ret; -retry: - bch2_trans_begin(trans); - - ret = bch2_btree_key_cache_flush(trans, - BTREE_ID_alloc, iter->pos); - if (ret) - goto err; - - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - goto err; - - old_u = bch2_alloc_unpack(k); - new_u = alloc_mem_to_key(c, iter); - - if (!bkey_alloc_unpacked_cmp(old_u, new_u)) - return 0; - - ret = bch2_alloc_write(trans, iter, &new_u, - BTREE_TRIGGER_NORUN) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL|flags); -err: - if (ret == -EINTR) - goto retry; - return ret; -} - -int bch2_alloc_write_all(struct bch_fs *c, unsigned flags) +int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only) { struct btree_trans trans; struct btree_iter iter; + struct bkey_s_c k; struct bch_dev *ca; - unsigned i; - int ret = 0; + struct bucket *g; + struct bkey_alloc_unpacked u; + int ret; - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_trans_init(&trans, c, 0, 0); - for_each_member_device(ca, c, i) { - bch2_btree_iter_set_pos(&iter, - POS(ca->dev_idx, ca->mi.first_bucket)); + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + ca = bch_dev_bkey_exists(c, k.k->p.inode); + g = __bucket(ca, k.k->p.offset, gc); + u = bch2_alloc_unpack(k); - while (iter.pos.offset < ca->mi.nbuckets) { - ret = bch2_alloc_write_key(&trans, &iter, flags); - if (ret) { - percpu_ref_put(&ca->ref); - goto err; - } - bch2_btree_iter_advance(&iter); + if (!gc) + *bucket_gen(ca, k.k->p.offset) = u.gen; + + g->_mark.gen = u.gen; + g->io_time[READ] = u.read_time; + g->io_time[WRITE] = u.write_time; + g->oldest_gen = !gc ? u.oldest_gen : u.gen; + g->gen_valid = 1; + + if (!gc || + (metadata_only && + (u.data_type == BCH_DATA_user || + u.data_type == BCH_DATA_cached || + u.data_type == BCH_DATA_parity))) { + g->_mark.data_type = u.data_type; + g->_mark.dirty_sectors = u.dirty_sectors; + g->_mark.cached_sectors = u.cached_sectors; + g->_mark.stripe = u.stripe != 0; + g->stripe = u.stripe; + g->stripe_redundancy = u.stripe_redundancy; } + } -err: bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + + if (ret) + bch_err(c, "error reading alloc info: %i", ret); + return ret; } @@ -463,19 +390,20 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, { struct bch_fs *c = trans->c; struct btree_iter iter; + struct bkey_s_c k; struct bkey_alloc_unpacked u; u64 *time, now; int ret = 0; bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr), BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&iter); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); if (ret) goto out; - u = alloc_mem_to_key(c, &iter); + u = bch2_alloc_unpack(k); time = rw == READ ? 
&u.read_time : &u.write_time; now = atomic64_read(&c->io_clock[rw].now); @@ -664,20 +592,20 @@ static int bucket_invalidate_btree(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_iter iter; + struct bkey_s_c k; int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&iter); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); if (ret) goto err; - *u = alloc_mem_to_key(c, &iter); - + *u = bch2_alloc_unpack(k); u->gen++; u->data_type = 0; u->dirty_sectors = 0; @@ -859,8 +787,7 @@ static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) static bool allocator_thread_running(struct bch_dev *ca) { unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw && - test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) && - test_bit(BCH_FS_ALLOC_REPLAY_DONE, &ca->fs->flags) + test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) ? ALLOCATOR_running : ALLOCATOR_stopped; alloc_thread_set_state(ca, state); diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index 86b64177..98c7866e 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -38,40 +38,23 @@ static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, ; } +struct bkey_alloc_buf { + struct bkey_i k; + struct bch_alloc_v3 v; + +#define x(_name, _bits) + _bits / 8 + u8 _pad[0 + BCH_ALLOC_FIELDS_V2()]; +#undef x +} __attribute__((packed, aligned(8))); + struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); +struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *, + const struct bkey_alloc_unpacked); int bch2_alloc_write(struct btree_trans *, struct btree_iter *, struct bkey_alloc_unpacked *, unsigned); int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); -static inline struct bkey_alloc_unpacked -alloc_mem_to_key(struct bch_fs *c, struct btree_iter *iter) -{ - struct bch_dev *ca; - struct bucket *g; - struct bkey_alloc_unpacked ret; - - percpu_down_read(&c->mark_lock); - ca = bch_dev_bkey_exists(c, iter->pos.inode); - g = bucket(ca, iter->pos.offset); - ret = (struct bkey_alloc_unpacked) { - .dev = iter->pos.inode, - .bucket = iter->pos.offset, - .gen = g->mark.gen, - .oldest_gen = g->oldest_gen, - .data_type = g->mark.data_type, - .dirty_sectors = g->mark.dirty_sectors, - .cached_sectors = g->mark.cached_sectors, - .read_time = g->io_time[READ], - .write_time = g->io_time[WRITE], - .stripe = g->stripe, - .stripe_redundancy = g->stripe_redundancy, - }; - percpu_up_read(&c->mark_lock); - - return ret; -} - #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c); @@ -101,7 +84,7 @@ static inline bool bkey_is_alloc(const struct bkey *k) k->type == KEY_TYPE_alloc_v3; } -int bch2_alloc_read(struct bch_fs *); +int bch2_alloc_read(struct bch_fs *, bool, bool); static inline void bch2_wake_allocator(struct bch_dev *ca) { @@ -139,7 +122,6 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_stop(struct bch_dev *); int bch2_dev_allocator_start(struct bch_dev *); -int bch2_alloc_write_all(struct bch_fs *, unsigned); void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 696c7c93..ddd700c3 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -510,8 
+510,6 @@ enum { BCH_FS_INITIAL_GC_DONE, BCH_FS_INITIAL_GC_UNFIXED, BCH_FS_TOPOLOGY_REPAIR_DONE, - BCH_FS_ALLOC_REPLAY_DONE, - BCH_FS_BTREE_INTERIOR_REPLAY_DONE, BCH_FS_FSCK_DONE, BCH_FS_STARTED, BCH_FS_RW, @@ -531,7 +529,6 @@ enum { /* misc: */ BCH_FS_NEED_ANOTHER_GC, BCH_FS_DELETED_NODES, - BCH_FS_NEED_ALLOC_WRITE, BCH_FS_REBUILD_REPLICAS, BCH_FS_HOLD_BTREE_WRITES, }; @@ -860,7 +857,6 @@ struct bch_fs { u64 reflink_hint; reflink_gc_table reflink_gc_table; size_t reflink_gc_nr; - size_t reflink_gc_idx; /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 8ec718cd..5153f0e4 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -1427,6 +1427,7 @@ LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); +LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); /* * Features: diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 9e3213b9..101cef7e 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -9,6 +9,7 @@ #include "alloc_foreground.h" #include "bkey_methods.h" #include "bkey_buf.h" +#include "btree_key_cache.h" #include "btree_locking.h" #include "btree_update_interior.h" #include "btree_io.h" @@ -505,7 +506,6 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - struct bucket *g2 = PTR_BUCKET(ca, &p.ptr); enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); if (fsck_err_on(!g->gen_valid, c, @@ -516,9 +516,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, p.ptr.gen, (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { if (!p.ptr.cached) { - g2->_mark.gen = g->_mark.gen = p.ptr.gen; - g2->gen_valid = g->gen_valid = true; - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + g->_mark.gen = p.ptr.gen; + g->gen_valid = true; } else { do_update = true; } @@ -532,9 +531,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, bch2_data_types[ptr_data_type(k->k, &p.ptr)], p.ptr.gen, g->mark.gen, (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { - g2->_mark.data_type = g->_mark.data_type = data_type; - g2->gen_valid = g->gen_valid = true; - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + g->_mark.data_type = data_type; + g->gen_valid = true; } if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c, @@ -545,13 +543,12 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, p.ptr.gen, g->mark.gen, (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { if (!p.ptr.cached) { - g2->_mark.gen = g->_mark.gen = p.ptr.gen; - g2->gen_valid = g->gen_valid = true; - g2->_mark.data_type = 0; - g2->_mark.dirty_sectors = 0; - g2->_mark.cached_sectors = 0; + g->_mark.gen = p.ptr.gen; + g->gen_valid = true; + g->_mark.data_type = 0; + g->_mark.dirty_sectors = 0; + g->_mark.cached_sectors = 0; set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); } else { do_update = true; } @@ -588,9 +585,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, bch2_data_types[data_type], 
(bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { if (data_type == BCH_DATA_btree) { - g2->_mark.data_type = g->_mark.data_type = data_type; - g2->gen_valid = g->gen_valid = true; - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + g->_mark.data_type = data_type; + g->gen_valid = true; } else { do_update = true; } @@ -691,10 +687,16 @@ found: } ret = bch2_journal_key_insert_take(c, btree_id, level, new); + if (ret) kfree(new); - else + else { + bch2_bkey_val_to_text(&PBUF(buf), c, *k); + bch_info(c, "updated %s", buf); + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(new)); + bch_info(c, "new key %s", buf); *k = bkey_i_to_s_c(new); + } } fsck_err: return ret; @@ -1145,13 +1147,14 @@ static int bch2_gc_done(struct bch_fs *c, unsigned i, dev; int ret = 0; + percpu_down_write(&c->mark_lock); + #define copy_field(_f, _msg, ...) \ if (dst->_f != src->_f) { \ if (verify) \ fsck_err(c, _msg ": got %llu, should be %llu" \ , ##__VA_ARGS__, dst->_f, src->_f); \ dst->_f = src->_f; \ - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_stripe_field(_f, _msg, ...) \ if (dst->_f != src->_f) { \ @@ -1161,18 +1164,6 @@ static int bch2_gc_done(struct bch_fs *c, iter.pos, ##__VA_ARGS__, \ dst->_f, src->_f); \ dst->_f = src->_f; \ - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ - } -#define copy_bucket_field(_f) \ - if (dst->b[b]._f != src->b[b]._f) { \ - if (verify) \ - fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ - ": got %u, should be %u", dev, b, \ - dst->b[b].mark.gen, \ - bch2_data_types[dst->b[b].mark.data_type],\ - dst->b[b]._f, src->b[b]._f); \ - dst->b[b]._f = src->b[b]._f; \ - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_dev_field(_f, _msg, ...) \ copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) @@ -1183,36 +1174,18 @@ static int bch2_gc_done(struct bch_fs *c, bch2_fs_usage_acc_to_base(c, i); for_each_member_device(ca, c, dev) { - struct bucket_array *dst = __bucket_array(ca, 0); - struct bucket_array *src = __bucket_array(ca, 1); - size_t b; + struct bch_dev_usage *dst = ca->usage_base; + struct bch_dev_usage *src = (void *) + bch2_acc_percpu_u64s((void *) ca->usage_gc, + dev_usage_u64s()); - for (b = 0; b < src->nbuckets; b++) { - copy_bucket_field(_mark.gen); - copy_bucket_field(_mark.data_type); - copy_bucket_field(_mark.stripe); - copy_bucket_field(_mark.dirty_sectors); - copy_bucket_field(_mark.cached_sectors); - copy_bucket_field(stripe_redundancy); - copy_bucket_field(stripe); + copy_dev_field(buckets_ec, "buckets_ec"); + copy_dev_field(buckets_unavailable, "buckets_unavailable"); - dst->b[b].oldest_gen = src->b[b].oldest_gen; - } - - { - struct bch_dev_usage *dst = ca->usage_base; - struct bch_dev_usage *src = (void *) - bch2_acc_percpu_u64s((void *) ca->usage_gc, - dev_usage_u64s()); - - copy_dev_field(buckets_ec, "buckets_ec"); - copy_dev_field(buckets_unavailable, "buckets_unavailable"); - - for (i = 0; i < BCH_DATA_NR; i++) { - copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); - copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); - copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); - } + for (i = 0; i < BCH_DATA_NR; i++) { + copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); + copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); + copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); } }; @@ -1254,7 +1227,6 @@ static int bch2_gc_done(struct bch_fs *c, #undef copy_fs_field #undef copy_dev_field -#undef copy_bucket_field #undef 
copy_stripe_field #undef copy_field fsck_err: @@ -1262,6 +1234,8 @@ fsck_err: percpu_ref_put(&ca->ref); if (ret) bch_err(c, "%s: ret %i", __func__, ret); + + percpu_up_write(&c->mark_lock); return ret; } @@ -1284,15 +1258,6 @@ static int bch2_gc_start(struct bch_fs *c, BUG_ON(ca->buckets[1]); BUG_ON(ca->usage_gc); - ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + - ca->mi.nbuckets * sizeof(struct bucket), - GFP_KERNEL|__GFP_ZERO); - if (!ca->buckets[1]) { - percpu_ref_put(&ca->ref); - bch_err(c, "error allocating ca->buckets[gc]"); - return -ENOMEM; - } - ca->usage_gc = alloc_percpu(struct bch_dev_usage); if (!ca->usage_gc) { bch_err(c, "error allocating ca->usage_gc"); @@ -1301,94 +1266,165 @@ static int bch2_gc_start(struct bch_fs *c, } } - percpu_down_write(&c->mark_lock); - - /* - * indicate to stripe code that we need to allocate for the gc stripes - * radix tree, too - */ - gc_pos_set(c, gc_phase(GC_PHASE_START)); - - for_each_member_device(ca, c, i) { - struct bucket_array *dst = __bucket_array(ca, 1); - struct bucket_array *src = __bucket_array(ca, 0); - size_t b; - - dst->first_bucket = src->first_bucket; - dst->nbuckets = src->nbuckets; - - for (b = 0; b < src->nbuckets; b++) { - struct bucket *d = &dst->b[b]; - struct bucket *s = &src->b[b]; - - d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; - d->gen_valid = s->gen_valid; - - if (metadata_only && - (s->mark.data_type == BCH_DATA_user || - s->mark.data_type == BCH_DATA_cached)) - d->_mark = s->mark; - } - }; - - percpu_up_write(&c->mark_lock); - return 0; } -static int bch2_gc_reflink_done_initial_fn(struct btree_trans *trans, - struct bkey_s_c k) +static int bch2_alloc_write_key(struct btree_trans *trans, + struct btree_iter *iter, + bool initial, bool metadata_only) { struct bch_fs *c = trans->c; - struct reflink_gc *r; - const __le64 *refcount = bkey_refcount_c(k); - char buf[200]; - int ret = 0; + struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); + struct bucket *g; + struct bkey_s_c k; + struct bkey_alloc_unpacked old_u, new_u, gc_u; + struct bkey_alloc_buf *a; + int ret; - if (!refcount) + /* + * For this to be correct at runtime, we'll need to figure out a way for + * it to actually lock the key in the btree key cache: + */ + + if (!initial) { + ret = bch2_btree_key_cache_flush(trans, + BTREE_ID_alloc, iter->pos); + if (ret) + return ret; + } + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + return ret; + + old_u = new_u = bch2_alloc_unpack(k); + + percpu_down_read(&c->mark_lock); + g = gc_bucket(ca, iter->pos.offset); + gc_u = (struct bkey_alloc_unpacked) { + .dev = iter->pos.inode, + .bucket = iter->pos.offset, + .gen = g->mark.gen, + .oldest_gen = g->oldest_gen, + .data_type = g->mark.data_type, + .dirty_sectors = g->mark.dirty_sectors, + .cached_sectors = g->mark.cached_sectors, + .read_time = g->io_time[READ], + .write_time = g->io_time[WRITE], + .stripe = g->stripe, + .stripe_redundancy = g->stripe_redundancy, + }; + percpu_up_read(&c->mark_lock); + + if (metadata_only && + gc_u.data_type != BCH_DATA_sb && + gc_u.data_type != BCH_DATA_journal && + gc_u.data_type != BCH_DATA_btree) return 0; - r = genradix_ptr(&c->reflink_gc_table, c->reflink_gc_idx++); - if (!r) - return -ENOMEM; + if (!bkey_alloc_unpacked_cmp(old_u, gc_u) || + gen_after(old_u.gen, gc_u.gen)) + return 0; - if (!r || - r->offset != k.k->p.offset || - r->size != k.k->size) { - bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); - return -EINVAL; - } +#define copy_bucket_field(_f) 
\ + if (fsck_err_on(new_u._f != gc_u._f, c, \ + "bucket %llu:%llu gen %u data type %s has wrong " #_f \ + ": got %u, should be %u", \ + iter->pos.inode, iter->pos.offset, \ + new_u.gen, \ + bch2_data_types[new_u.data_type], \ + new_u._f, gc_u._f)) \ + new_u._f = gc_u._f; \ - if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, - "reflink key has wrong refcount:\n" - " %s\n" - " should be %u", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), - r->refcount)) { - struct bkey_i *new; + copy_bucket_field(gen); + copy_bucket_field(data_type); + copy_bucket_field(stripe); + copy_bucket_field(dirty_sectors); + copy_bucket_field(cached_sectors); + copy_bucket_field(stripe_redundancy); + copy_bucket_field(stripe); +#undef copy_bucket_field - new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); - if (!new) { - ret = -ENOMEM; - goto fsck_err; - } + new_u.oldest_gen = gc_u.oldest_gen; - bkey_reassemble(new, k); + if (!bkey_alloc_unpacked_cmp(old_u, new_u)) + return 0; - if (!r->refcount) { - new->k.type = KEY_TYPE_deleted; - new->k.size = 0; - } else { - *bkey_refcount(new) = cpu_to_le64(r->refcount); - } + a = bch2_alloc_pack(trans, new_u); + if (IS_ERR(a)) + return PTR_ERR(a); - ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new); - kfree(new); - } + ret = initial + ? bch2_journal_key_insert(c, BTREE_ID_alloc, 0, &a->k) + : bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_NORUN); fsck_err: return ret; } +static int bch2_gc_alloc_done(struct bch_fs *c, bool initial, bool metadata_only) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_dev *ca; + unsigned i; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_member_device(ca, c, i) { + for_each_btree_key(&trans, iter, BTREE_ID_alloc, + POS(ca->dev_idx, ca->mi.first_bucket), + BTREE_ITER_SLOTS| + BTREE_ITER_PREFETCH, k, ret) { + if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) + break; + + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW, + bch2_alloc_write_key(&trans, &iter, + initial, metadata_only)); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) { + bch_err(c, "error writing alloc info: %i", ret); + percpu_ref_put(&ca->ref); + break; + } + } + + bch2_trans_exit(&trans); + return ret; +} + +static int bch2_gc_alloc_start(struct bch_fs *c, bool initial, bool metadata_only) +{ + struct bch_dev *ca; + unsigned i; + + for_each_member_device(ca, c, i) { + struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket), + GFP_KERNEL|__GFP_ZERO); + if (!buckets) { + percpu_ref_put(&ca->ref); + percpu_up_write(&c->mark_lock); + bch_err(c, "error allocating ca->buckets[gc]"); + return -ENOMEM; + } + + buckets->first_bucket = ca->mi.first_bucket; + buckets->nbuckets = ca->mi.nbuckets; + rcu_assign_pointer(ca->buckets[1], buckets); + }; + + return bch2_alloc_read(c, true, metadata_only); +} + static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, bool metadata_only) { @@ -1405,14 +1441,6 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, bch2_trans_init(&trans, c, 0, 0); - if (initial) { - c->reflink_gc_idx = 0; - - ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink, - bch2_gc_reflink_done_initial_fn); - goto out; - } - for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { const __le64 *refcount = bkey_refcount_c(k); @@ -1420,7 +1448,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, if (!refcount) 
continue; - r = genradix_ptr(&c->reflink_gc_table, idx); + r = genradix_ptr(&c->reflink_gc_table, idx++); if (!r || r->offset != k.k->p.offset || r->size != k.k->size) { @@ -1450,7 +1478,9 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, else *bkey_refcount(new) = cpu_to_le64(r->refcount); - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = initial + ? bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new) + : __bch2_trans_do(&trans, NULL, NULL, 0, __bch2_btree_insert(&trans, BTREE_ID_reflink, new)); kfree(new); @@ -1460,104 +1490,11 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, } fsck_err: bch2_trans_iter_exit(&trans, &iter); -out: c->reflink_gc_nr = 0; bch2_trans_exit(&trans); return ret; } -static int bch2_gc_stripes_done_initial_fn(struct btree_trans *trans, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct gc_stripe *m; - const struct bch_stripe *s; - char buf[200]; - unsigned i; - int ret = 0; - - if (k.k->type != KEY_TYPE_stripe) - return 0; - - s = bkey_s_c_to_stripe(k).v; - - m = genradix_ptr(&c->gc_stripes, k.k->p.offset); - - for (i = 0; i < s->nr_blocks; i++) - if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0)) - goto inconsistent; - return 0; -inconsistent: - if (fsck_err_on(true, c, - "stripe has wrong block sector count %u:\n" - " %s\n" - " should be %u", i, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), - m ? m->block_sectors[i] : 0)) { - struct bkey_i_stripe *new; - - new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); - if (!new) { - ret = -ENOMEM; - goto fsck_err; - } - - bkey_reassemble(&new->k_i, k); - - for (i = 0; i < new->v.nr_blocks; i++) - stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0); - - ret = bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i); - kfree(new); - } -fsck_err: - return ret; -} - -static int bch2_gc_stripes_done(struct bch_fs *c, bool initial, - bool metadata_only) -{ - struct btree_trans trans; - int ret = 0; - - if (metadata_only) - return 0; - - bch2_trans_init(&trans, c, 0, 0); - - if (initial) { - ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes, - bch2_gc_stripes_done_initial_fn); - } else { - BUG(); - } - - bch2_trans_exit(&trans); - return ret; -} - -static int bch2_gc_reflink_start_initial_fn(struct btree_trans *trans, - struct bkey_s_c k) -{ - - struct bch_fs *c = trans->c; - struct reflink_gc *r; - const __le64 *refcount = bkey_refcount_c(k); - - if (!refcount) - return 0; - - r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, - GFP_KERNEL); - if (!r) - return -ENOMEM; - - r->offset = k.k->p.offset; - r->size = k.k->size; - r->refcount = 0; - return 0; -} - static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, bool metadata_only) { @@ -1573,12 +1510,6 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, bch2_trans_init(&trans, c, 0, 0); c->reflink_gc_nr = 0; - if (initial) { - ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink, - bch2_gc_reflink_start_initial_fn); - goto out; - } - for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { const __le64 *refcount = bkey_refcount_c(k); @@ -1598,7 +1529,70 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, r->refcount = 0; } bch2_trans_iter_exit(&trans, &iter); -out: + + bch2_trans_exit(&trans); + return ret; +} + +static int bch2_gc_stripes_done(struct bch_fs *c, bool initial, + bool metadata_only) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct gc_stripe *m; +
const struct bch_stripe *s; + char buf[200]; + unsigned i; + int ret = 0; + + if (metadata_only) + return 0; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + if (k.k->type != KEY_TYPE_stripe) + continue; + + s = bkey_s_c_to_stripe(k).v; + m = genradix_ptr(&c->gc_stripes, k.k->p.offset); + + for (i = 0; i < s->nr_blocks; i++) + if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0)) + goto inconsistent; + continue; inconsistent: + if (fsck_err_on(true, c, + "stripe has wrong block sector count %u:\n" + " %s\n" + " should be %u", i, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), + m ? m->block_sectors[i] : 0)) { + struct bkey_i_stripe *new; + + new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); + if (!new) { + ret = -ENOMEM; + break; + } + + bkey_reassemble(&new->k_i, k); + + for (i = 0; i < new->v.nr_blocks; i++) + stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0); + + ret = initial + ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i) + : __bch2_trans_do(&trans, NULL, NULL, 0, + __bch2_btree_insert(&trans, BTREE_ID_stripes, &new->k_i)); + kfree(new); + } + } fsck_err: + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); return ret; } @@ -1638,10 +1632,13 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) !bch2_btree_interior_updates_nr_pending(c)); again: ret = bch2_gc_start(c, metadata_only) ?: + bch2_gc_alloc_start(c, initial, metadata_only) ?: bch2_gc_reflink_start(c, initial, metadata_only); if (ret) goto out; + gc_pos_set(c, gc_phase(GC_PHASE_START)); + bch2_mark_superblocks(c); if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) && @@ -1702,16 +1699,15 @@ out: if (!ret) { bch2_journal_block(&c->journal); - percpu_down_write(&c->mark_lock); - ret = bch2_gc_reflink_done(c, initial, metadata_only) ?: - bch2_gc_stripes_done(c, initial, metadata_only) ?: + ret = bch2_gc_stripes_done(c, initial, metadata_only) ?: + bch2_gc_reflink_done(c, initial, metadata_only) ?: + bch2_gc_alloc_done(c, initial, metadata_only) ?: bch2_gc_done(c, initial, metadata_only); bch2_journal_unblock(&c->journal); - } else { - percpu_down_write(&c->mark_lock); } + percpu_down_write(&c->mark_lock); /* Indicates that gc is no longer in progress: */ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 9ebb81d7..e8e0adac 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -12,6 +12,7 @@ #include "error.h" #include "extents.h" #include "journal.h" +#include "recovery.h" #include "replicas.h" #include "subvolume.h" @@ -1077,6 +1078,7 @@ static inline bool btree_path_advance_to_pos(struct btree_path *path, static void btree_path_verify_new_node(struct btree_trans *trans, struct btree_path *path, struct btree *b) { + struct bch_fs *c = trans->c; struct btree_path_level *l; unsigned plevel; bool parent_locked; @@ -1085,6 +1087,9 @@ static void btree_path_verify_new_node(struct btree_trans *trans, if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) return; + if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + return; + plevel = b->c.level + 1; if (!btree_path_node(path, plevel)) return; @@ -1105,7 +1110,7 @@ static void btree_path_verify_new_node(struct btree_trans *trans, char buf4[100]; struct bkey uk = bkey_unpack_key(b, k); - bch2_dump_btree_node(trans->c, l->b); + bch2_dump_btree_node(c, l->b); bch2_bpos_to_text(&PBUF(buf1), path->pos); bch2_bkey_to_text(&PBUF(buf2), &uk);
bch2_bpos_to_text(&PBUF(buf3), b->data->min_key); @@ -1296,6 +1301,41 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat return ret; } +static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path, + struct btree_and_journal_iter *jiter) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + struct bkey_buf tmp; + unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) + ? (path->level > 1 ? 0 : 2) + : (path->level > 1 ? 1 : 16); + bool was_locked = btree_node_locked(path, path->level); + int ret = 0; + + bch2_bkey_buf_init(&tmp); + + while (nr && !ret) { + if (!bch2_btree_node_relock(trans, path, path->level)) + break; + + bch2_btree_and_journal_iter_advance(jiter); + k = bch2_btree_and_journal_iter_peek(jiter); + if (!k.k) + break; + + bch2_bkey_buf_reassemble(&tmp, c, k); + ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id, + path->level - 1); + } + + if (!was_locked) + btree_node_unlock(path, path->level); + + bch2_bkey_buf_exit(&tmp, c); + return ret; +} + static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, struct btree_path *path, unsigned plevel, struct btree *b) @@ -1318,6 +1358,30 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, btree_node_unlock(path, plevel); } +static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, + struct btree_path *path, + unsigned flags, + struct bkey_buf *out) +{ + struct bch_fs *c = trans->c; + struct btree_path_level *l = path_l(path); + struct btree_and_journal_iter jiter; + struct bkey_s_c k; + int ret = 0; + + __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos); + + k = bch2_btree_and_journal_iter_peek(&jiter); + + bch2_bkey_buf_reassemble(out, c, k); + + if (flags & BTREE_ITER_PREFETCH) + ret = btree_path_prefetch_j(trans, path, &jiter); + + bch2_btree_and_journal_iter_exit(&jiter); + return ret; +} + static __always_inline int btree_path_down(struct btree_trans *trans, struct btree_path *path, unsigned flags, @@ -1328,14 +1392,28 @@ static __always_inline int btree_path_down(struct btree_trans *trans, struct btree *b; unsigned level = path->level - 1; enum six_lock_type lock_type = __btree_lock_want(path, level); + bool replay_done = test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); struct bkey_buf tmp; int ret; EBUG_ON(!btree_node_locked(path, path->level)); bch2_bkey_buf_init(&tmp); - bch2_bkey_buf_unpack(&tmp, c, l->b, - bch2_btree_node_iter_peek(&l->iter, l->b)); + + if (unlikely(!replay_done)) { + ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp); + if (ret) + goto err; + } else { + bch2_bkey_buf_unpack(&tmp, c, l->b, + bch2_btree_node_iter_peek(&l->iter, l->b)); + + if (flags & BTREE_ITER_PREFETCH) { + ret = btree_path_prefetch(trans, path); + if (ret) + goto err; + } + } b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip); ret = PTR_ERR_OR_ZERO(b); @@ -1345,13 +1423,10 @@ static __always_inline int btree_path_down(struct btree_trans *trans, mark_btree_node_locked(path, level, lock_type); btree_path_level_init(trans, path, b); - if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 && + if (likely(replay_done && tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && unlikely(b != btree_node_mem_ptr(tmp.k))) btree_node_mem_ptr_set(trans, path, level + 1, b); - if (flags & BTREE_ITER_PREFETCH) - ret = btree_path_prefetch(trans, path); - if (btree_node_read_locked(path, level + 1)) btree_node_unlock(path, level + 1); path->level = level; @@ -2107,6 +2182,59 @@ inline 
bool bch2_btree_iter_rewind(struct btree_iter *iter) return ret; } +static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, + enum btree_id btree_id, + struct bpos pos) +{ + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) + if ((cmp_int(btree_id, i->btree_id) ?: + bpos_cmp(pos, i->k->k.p)) <= 0) { + if (btree_id == i->btree_id) + return i->k; + break; + } + + return NULL; +} + +static noinline +struct bkey_i *__btree_trans_peek_journal(struct btree_trans *trans, + struct btree_path *path) +{ + struct journal_keys *keys = &trans->c->journal_keys; + size_t idx = bch2_journal_key_search(keys, path->btree_id, + path->level, path->pos); + + while (idx < keys->nr && keys->d[idx].overwritten) + idx++; + + return (idx < keys->nr && + keys->d[idx].btree_id == path->btree_id && + keys->d[idx].level == path->level) + ? keys->d[idx].k + : NULL; +} + +static noinline +struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bkey_i *next_journal = + __btree_trans_peek_journal(trans, iter->path); + + if (next_journal && + bpos_cmp(next_journal->k.p, + k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) { + iter->k = next_journal->k; + k = bkey_i_to_s_c(next_journal); + } + + return k; +} + /** * bch2_btree_iter_peek: returns first key greater than or equal to iterator's * current position @@ -2117,7 +2245,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) struct bpos search_key = btree_iter_search_key(iter); struct bkey_i *next_update; struct bkey_s_c k; - int ret, cmp; + int ret; EBUG_ON(iter->path->cached || iter->path->level); bch2_btree_iter_verify(iter); @@ -2136,19 +2264,14 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) goto out; } + k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); + + if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) + k = btree_trans_peek_journal(trans, iter, k); + next_update = iter->flags & BTREE_ITER_WITH_UPDATES ? btree_trans_peek_updates(trans, iter->btree_id, search_key) : NULL; - k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); - - /* * In the btree, deleted keys sort before non deleted: */ - if (k.k && bkey_deleted(k.k) && - (!next_update || - bpos_cmp(k.k->p, next_update->k.p) <= 0)) { - search_key = k.k->p; - continue; - } - if (next_update && bpos_cmp(next_update->k.p, k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) { @@ -2156,6 +2279,20 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) k = bkey_i_to_s_c(next_update); } + if (k.k && bkey_deleted(k.k)) { + /* + * If we've got a whiteout, and it's after the search + * key, advance the search key to the whiteout instead + * of just after the whiteout - it might be a btree + * whiteout, with a real key at the same position, since + * in the btree deleted keys sort before non deleted. + */ + search_key = bpos_cmp(search_key, k.k->p) + ? 
k.k->p + : bpos_successor(k.k->p); + continue; + } + if (likely(k.k)) { /* * We can never have a key in a leaf node at POS_MAX, so @@ -2199,14 +2336,10 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) iter->pos.snapshot = iter->snapshot; - cmp = bpos_cmp(k.k->p, iter->path->pos); - if (cmp) { - iter->path = bch2_btree_path_make_mut(trans, iter->path, - iter->flags & BTREE_ITER_INTENT, - btree_iter_ip_allocated(iter)); - iter->path->pos = k.k->p; - btree_path_check_sort(trans, iter->path, cmp); - } + iter->path = btree_path_set_pos(trans, iter->path, k.k->p, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + BUG_ON(!iter->path->nodes_locked); out: iter->path->should_be_locked = true; @@ -2247,6 +2380,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) EBUG_ON(iter->path->cached || iter->path->level); EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); + + if (iter->flags & BTREE_ITER_WITH_JOURNAL) + return bkey_s_c_err(-EIO); + bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); @@ -2397,17 +2534,24 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { struct bkey_i *next_update; - next_update = iter->flags & BTREE_ITER_WITH_UPDATES - ? btree_trans_peek_updates(trans, iter->btree_id, search_key) - : NULL; - - if (next_update && + if ((iter->flags & BTREE_ITER_WITH_UPDATES) && + (next_update = btree_trans_peek_updates(trans, + iter->btree_id, search_key)) && !bpos_cmp(next_update->k.p, iter->pos)) { iter->k = next_update->k; k = bkey_i_to_s_c(next_update); - } else { - k = bch2_btree_path_peek_slot(iter->path, &iter->k); + goto out; } + + if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && + (next_update = __btree_trans_peek_journal(trans, iter->path)) && + !bpos_cmp(next_update->k.p, iter->pos)) { + iter->k = next_update->k; + k = bkey_i_to_s_c(next_update); + goto out; + } + + k = bch2_btree_path_peek_slot(iter->path, &iter->k); } else { struct bpos next; @@ -2451,7 +2595,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) k = (struct bkey_s_c) { &iter->k, NULL }; } } - +out: iter->path->should_be_locked = true; bch2_btree_iter_verify_entry_exit(iter); @@ -2618,6 +2762,9 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, btree_type_has_snapshots(btree_id)) flags |= BTREE_ITER_FILTER_SNAPSHOTS; + if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags)) + flags |= BTREE_ITER_WITH_JOURNAL; + iter->trans = trans; iter->path = NULL; iter->btree_id = btree_id; diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h index 0768ef3c..b3d241b1 100644 --- a/libbcachefs/btree_key_cache.h +++ b/libbcachefs/btree_key_cache.h @@ -16,8 +16,7 @@ static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); size_t max_dirty = 4096 + (nr_keys * 3) / 4; - return nr_dirty > max_dirty && - test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); + return nr_dirty > max_dirty; } int bch2_btree_key_cache_journal_flush(struct journal *, diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 08c49ae3..1ace7604 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -207,10 +207,11 @@ struct btree_node_iter { #define BTREE_ITER_CACHED_NOFILL (1 << 8) #define BTREE_ITER_CACHED_NOCREATE (1 << 9) #define BTREE_ITER_WITH_UPDATES (1 << 10) -#define 
__BTREE_ITER_ALL_SNAPSHOTS (1 << 11) -#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) -#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 13) -#define BTREE_ITER_NOPRESERVE (1 << 14) +#define BTREE_ITER_WITH_JOURNAL (1 << 11) +#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 12) +#define BTREE_ITER_ALL_SNAPSHOTS (1 << 13) +#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 14) +#define BTREE_ITER_NOPRESERVE (1 << 15) enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index 16ebf1a2..90ea018d 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -135,21 +135,4 @@ static inline int bch2_trans_commit(struct btree_trans *trans, (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) -static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, - enum btree_id btree_id, - struct bpos pos) -{ - struct btree_insert_entry *i; - - trans_for_each_update(trans, i) - if ((cmp_int(btree_id, i->btree_id) ?: - bpos_cmp(pos, i->k->k.p)) <= 0) { - if (btree_id == i->btree_id) - return i->k; - break; - } - - return NULL; -} - #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 6872e56b..e1a5e34e 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -16,6 +16,7 @@ #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" +#include "recovery.h" #include "replicas.h" #include "super-io.h" @@ -44,7 +45,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) BUG_ON(!b->c.level); - if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) + if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) return; bch2_btree_node_iter_init_from_start(&iter, b); @@ -1146,6 +1147,9 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && !btree_ptr_sectors_written(insert)); + if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) + bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); + invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?: bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert)); if (invalid) { @@ -1847,9 +1851,6 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) { struct async_btree_rewrite *a; - if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) - return; - if (!percpu_ref_tryget(&c->writes)) return; diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 09dc585b..e2e878b8 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -206,9 +206,6 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; - EBUG_ON(!insert->level && - !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)); - if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b, &insert_l(insert)->iter, insert->k))) return false; @@ -476,13 +473,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, JOURNAL_RES_GET_NONBLOCK); if (ret) return ret; + + if (unlikely(trans->journal_transaction_names)) + journal_transaction_name(trans); } else { trans->journal_res.seq = c->journal.replay_journal_seq; } - if (unlikely(trans->journal_transaction_names)) - journal_transaction_name(trans); - if (unlikely(trans->extra_journal_entry_u64s)) { memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), 
trans->extra_journal_entries, diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 78d43997..c72fe777 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -1458,24 +1458,22 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); - struct bkey_i *update = btree_trans_peek_updates(trans, BTREE_ID_alloc, pos); + struct bkey_s_c k; int ret; - bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, + bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, + POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)), + BTREE_ITER_WITH_UPDATES| BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(iter); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); if (ret) { bch2_trans_iter_exit(trans, iter); return ret; } - *u = update && !bpos_cmp(update->k.p, pos) - ? bch2_alloc_unpack(bkey_i_to_s_c(update)) - : alloc_mem_to_key(c, iter); - + *u = bch2_alloc_unpack(k); return 0; } diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 9a1751d4..9b45640e 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -1558,50 +1558,48 @@ void bch2_stripes_heap_start(struct bch_fs *c) bch2_stripes_heap_insert(c, m, iter.pos); } -static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k) -{ - const struct bch_stripe *s; - struct bch_fs *c = trans->c; - struct stripe *m; - unsigned i; - int ret = 0; - - if (k.k->type != KEY_TYPE_stripe) - return 0; - - ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); - if (ret) - return ret; - - s = bkey_s_c_to_stripe(k).v; - - m = genradix_ptr(&c->stripes, k.k->p.offset); - m->alive = true; - m->sectors = le16_to_cpu(s->sectors); - m->algorithm = s->algorithm; - m->nr_blocks = s->nr_blocks; - m->nr_redundant = s->nr_redundant; - m->blocks_nonempty = 0; - - for (i = 0; i < s->nr_blocks; i++) - m->blocks_nonempty += !!stripe_blockcount_get(s, i); - - spin_lock(&c->ec_stripes_heap_lock); - bch2_stripes_heap_update(c, m, k.k->p.offset); - spin_unlock(&c->ec_stripes_heap_lock); - - return ret; -} - int bch2_stripes_read(struct bch_fs *c) { struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + const struct bch_stripe *s; + struct stripe *m; + unsigned i; int ret; bch2_trans_init(&trans, c, 0, 0); - ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes, - bch2_stripes_read_fn); + + for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + if (k.k->type != KEY_TYPE_stripe) + continue; + + ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); + if (ret) + break; + + s = bkey_s_c_to_stripe(k).v; + + m = genradix_ptr(&c->stripes, k.k->p.offset); + m->alive = true; + m->sectors = le16_to_cpu(s->sectors); + m->algorithm = s->algorithm; + m->nr_blocks = s->nr_blocks; + m->nr_redundant = s->nr_redundant; + m->blocks_nonempty = 0; + + for (i = 0; i < s->nr_blocks; i++) + m->blocks_nonempty += !!stripe_blockcount_get(s, i); + + spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_update(c, m, k.k->p.offset); + spin_unlock(&c->ec_stripes_heap_lock); + } + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + if (ret) bch_err(c, "error reading stripes: %i", ret); diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index ab9a6d96..52a3935c 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -489,9 +489,6 @@ static 
size_t journal_flush_pins(struct journal *j, u64 seq_to_flush, u64 seq; int err; - if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)) - return 0; - lockdep_assert_held(&j->reclaim_lock); while (1) { @@ -692,8 +689,6 @@ static int bch2_journal_reclaim_thread(void *arg) set_freezable(); - kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)); - j->last_flushed = jiffies; while (!ret && !kthread_should_stop()) { diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index 54cc69bd..d6d75121 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -148,7 +148,6 @@ enum journal_space_from { enum { JOURNAL_REPLAY_DONE, JOURNAL_STARTED, - JOURNAL_RECLAIM_STARTED, JOURNAL_NEED_WRITE, JOURNAL_MAY_GET_UNRESERVED, JOURNAL_MAY_SKIP_FLUSH, diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 52c0b56a..c6880654 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -330,9 +330,9 @@ enum opt_type { NO_SB_OPT, false, \ NULL, "Read all journal entries, not just dirty ones")\ x(journal_transaction_names, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH_SB_JOURNAL_TRANSACTION_NAMES, false, \ NULL, "Log transaction function names in journal") \ x(noexcl, u8, \ OPT_FS|OPT_MOUNT, \ diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 40e1e991..5da6b3b4 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -59,23 +59,21 @@ static void zero_out_btree_mem_ptr(struct journal_keys *keys) static int __journal_key_cmp(enum btree_id l_btree_id, unsigned l_level, struct bpos l_pos, - struct journal_key *r) + const struct journal_key *r) { return (cmp_int(l_btree_id, r->btree_id) ?: cmp_int(l_level, r->level) ?: bpos_cmp(l_pos, r->k->k.p)); } -static int journal_key_cmp(struct journal_key *l, struct journal_key *r) +static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) { - return (cmp_int(l->btree_id, r->btree_id) ?: - cmp_int(l->level, r->level) ?: - bpos_cmp(l->k->k.p, r->k->k.p)); + return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); } -static size_t journal_key_search(struct journal_keys *journal_keys, - enum btree_id id, unsigned level, - struct bpos pos) +size_t bch2_journal_key_search(struct journal_keys *journal_keys, + enum btree_id id, unsigned level, + struct bpos pos) { size_t l = 0, r = journal_keys->nr, m; @@ -116,11 +114,18 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, .btree_id = id, .level = level, .k = k, - .allocated = true + .allocated = true, + /* + * Ensure these keys are done last by journal replay, to unblock + * journal reclaim: + */ + .journal_seq = U32_MAX, }; struct journal_keys *keys = &c->journal_keys; struct journal_iter *iter; - unsigned idx = journal_key_search(keys, id, level, k->k.p); + size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); + + BUG_ON(test_bit(BCH_FS_RW, &c->flags)); if (idx < keys->nr && journal_key_cmp(&n, &keys->d[idx]) == 0) { @@ -157,6 +162,11 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, return 0; } +/* + * Can only be used from the recovery thread while we're still RO - can't be + * used once we've got RW, as journal_keys is at that point used by multiple + * threads: + */ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, unsigned level, struct bkey_i *k) { @@ -189,7 +199,7 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, unsigned level, struct bpos pos) { 
struct journal_keys *keys = &c->journal_keys; - size_t idx = journal_key_search(keys, btree, level, pos); + size_t idx = bch2_journal_key_search(keys, btree, level, pos); if (idx < keys->nr && keys->d[idx].btree_id == btree && @@ -200,15 +210,18 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) { - struct journal_key *k = iter->idx - iter->keys->nr - ? iter->keys->d + iter->idx : NULL; + struct journal_key *k = iter->keys->d + iter->idx; - if (k && - k->btree_id == iter->btree_id && - k->level == iter->level) - return k->k; + while (k < iter->keys->d + iter->keys->nr && + k->btree_id == iter->btree_id && + k->level == iter->level) { + if (!k->overwritten) + return k->k; + + iter->idx++; + k = iter->keys->d + iter->idx; + } - iter->idx = iter->keys->nr; return NULL; } @@ -231,8 +244,7 @@ static void bch2_journal_iter_init(struct bch_fs *c, iter->btree_id = id; iter->level = level; iter->keys = &c->journal_keys; - iter->idx = journal_key_search(&c->journal_keys, id, level, pos); - list_add(&iter->list, &c->journal_iters); + iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); } static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) @@ -318,106 +330,33 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) bch2_journal_iter_exit(&iter->journal); } -void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct bch_fs *c, - struct btree *b) +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, + struct bch_fs *c, + struct btree *b, + struct btree_node_iter node_iter, + struct bpos pos) { memset(iter, 0, sizeof(*iter)); iter->b = b; - bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); - bch2_journal_iter_init(c, &iter->journal, - b->c.btree_id, b->c.level, b->data->min_key); + iter->node_iter = node_iter; + bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); + INIT_LIST_HEAD(&iter->journal.list); } -/* Walk btree, overlaying keys from the journal: */ - -static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b, - struct btree_and_journal_iter iter) +/* + * this version is used by btree_gc before filesystem has gone RW and + * multithreaded, so uses the journal_iters list: + */ +void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, + struct bch_fs *c, + struct btree *b) { - unsigned i = 0, nr = b->c.level > 1 ? 
2 : 16; - struct bkey_s_c k; - struct bkey_buf tmp; + struct btree_node_iter node_iter; - BUG_ON(!b->c.level); - - bch2_bkey_buf_init(&tmp); - - while (i < nr && - (k = bch2_btree_and_journal_iter_peek(&iter)).k) { - bch2_bkey_buf_reassemble(&tmp, c, k); - - bch2_btree_node_prefetch(c, NULL, NULL, tmp.k, - b->c.btree_id, b->c.level - 1); - - bch2_btree_and_journal_iter_advance(&iter); - i++; - } - - bch2_bkey_buf_exit(&tmp, c); -} - -static int bch2_btree_and_journal_walk_recurse(struct btree_trans *trans, struct btree *b, - enum btree_id btree_id, - btree_walk_key_fn key_fn) -{ - struct bch_fs *c = trans->c; - struct btree_and_journal_iter iter; - struct bkey_s_c k; - struct bkey_buf tmp; - struct btree *child; - int ret = 0; - - bch2_bkey_buf_init(&tmp); - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); - - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - if (b->c.level) { - bch2_bkey_buf_reassemble(&tmp, c, k); - - child = bch2_btree_node_get_noiter(c, tmp.k, - b->c.btree_id, b->c.level - 1, - false); - - ret = PTR_ERR_OR_ZERO(child); - if (ret) - break; - - btree_and_journal_iter_prefetch(c, b, iter); - - ret = bch2_btree_and_journal_walk_recurse(trans, child, - btree_id, key_fn); - six_unlock_read(&child->c.lock); - } else { - ret = key_fn(trans, k); - } - - if (ret) - break; - - bch2_btree_and_journal_iter_advance(&iter); - } - - bch2_btree_and_journal_iter_exit(&iter); - bch2_bkey_buf_exit(&tmp, c); - return ret; -} - -int bch2_btree_and_journal_walk(struct btree_trans *trans, enum btree_id btree_id, - btree_walk_key_fn key_fn) -{ - struct bch_fs *c = trans->c; - struct btree *b = c->btree_roots[btree_id].b; - int ret = 0; - - if (btree_node_fake(b)) - return 0; - - six_lock_read(&b->c.lock, NULL, NULL); - ret = bch2_btree_and_journal_walk_recurse(trans, b, btree_id, key_fn); - six_unlock_read(&b->c.lock); - - return ret; + bch2_btree_node_iter_init_from_start(&node_iter, b); + __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); + list_add(&iter->journal.list, &c->journal_iters); } /* sort and dedup all keys in the journal: */ @@ -442,9 +381,7 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) const struct journal_key *l = _l; const struct journal_key *r = _r; - return cmp_int(l->btree_id, r->btree_id) ?: - cmp_int(l->level, r->level) ?: - bpos_cmp(l->k->k.p, r->k->k.p) ?: + return journal_key_cmp(l, r) ?: cmp_int(l->journal_seq, r->journal_seq) ?: cmp_int(l->journal_offset, r->journal_offset); } @@ -537,8 +474,8 @@ static void replay_now_at(struct journal *j, u64 seq) bch2_journal_pin_put(j, j->replay_journal_seq++); } -static int __bch2_journal_replay_key(struct btree_trans *trans, - struct journal_key *k) +static int bch2_journal_replay_key(struct btree_trans *trans, + struct journal_key *k) { struct btree_iter iter; unsigned iter_flags = @@ -546,45 +483,32 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, BTREE_ITER_NOT_EXTENTS; int ret; - /* Must be checked with btree locked: */ - if (k->overwritten) - return 0; - if (!k->level && k->btree_id == BTREE_ID_alloc) - iter_flags |= BTREE_ITER_CACHED|BTREE_ITER_CACHED_NOFILL; + iter_flags |= BTREE_ITER_CACHED; bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, k->level, iter_flags); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN); + ret = bch2_btree_iter_traverse(&iter); + if (ret) + goto out; + + /* Must be checked with btree locked: */ + if (k->overwritten) + 
goto out; + + ret = bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN); +out: bch2_trans_iter_exit(trans, &iter); return ret; } -static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k) -{ - unsigned commit_flags = - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_RESERVED; - - if (!k->allocated) - commit_flags |= BTREE_INSERT_JOURNAL_REPLAY; - - return bch2_trans_do(c, NULL, NULL, commit_flags, - __bch2_journal_replay_key(&trans, k)); -} - static int journal_sort_seq_cmp(const void *_l, const void *_r) { const struct journal_key *l = *((const struct journal_key **)_l); const struct journal_key *r = *((const struct journal_key **)_r); - return cmp_int(r->level, l->level) ?: - cmp_int(l->journal_seq, r->journal_seq) ?: - cmp_int(l->btree_id, r->btree_id) ?: - bpos_cmp(l->k->k.p, r->k->k.p); + return cmp_int(l->journal_seq, r->journal_seq); } static int bch2_journal_replay(struct bch_fs *c) @@ -592,10 +516,7 @@ static int bch2_journal_replay(struct bch_fs *c) struct journal_keys *keys = &c->journal_keys; struct journal_key **keys_sorted, *k; struct journal *j = &c->journal; - struct bch_dev *ca; - unsigned idx; size_t i; - u64 seq; int ret; keys_sorted = kmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); @@ -609,76 +530,30 @@ static int bch2_journal_replay(struct bch_fs *c) sizeof(keys_sorted[0]), journal_sort_seq_cmp, NULL); - if (keys->nr) + if (keys->nr) { + bch_verbose(c, "starting journal replay, %zu keys", keys->nr); replay_now_at(j, keys->journal_seq_base); - - seq = j->replay_journal_seq; - - /* - * First replay updates to the alloc btree - these will only update the - * btree key cache: - */ - for (i = 0; i < keys->nr; i++) { - k = keys_sorted[i]; - - cond_resched(); - - if (!k->level && k->btree_id == BTREE_ID_alloc) { - j->replay_journal_seq = keys->journal_seq_base + k->journal_seq; - ret = bch2_journal_replay_key(c, k); - if (ret) - goto err; - } } - /* Now we can start the allocator threads: */ - set_bit(BCH_FS_ALLOC_REPLAY_DONE, &c->flags); - for_each_member_device(ca, c, idx) - bch2_wake_allocator(ca); - - /* - * Next replay updates to interior btree nodes: - */ for (i = 0; i < keys->nr; i++) { k = keys_sorted[i]; cond_resched(); - if (k->level) { - j->replay_journal_seq = keys->journal_seq_base + k->journal_seq; - ret = bch2_journal_replay_key(c, k); - if (ret) - goto err; - } - } + if (!k->allocated) + replay_now_at(j, keys->journal_seq_base + k->journal_seq); - /* - * Now that the btree is in a consistent state, we can start journal - * reclaim (which will be flushing entries from the btree key cache back - * to the btree: - */ - set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); - set_bit(JOURNAL_RECLAIM_STARTED, &j->flags); - journal_reclaim_kick(j); - - j->replay_journal_seq = seq; - - /* - * Now replay leaf node updates: - */ - for (i = 0; i < keys->nr; i++) { - k = keys_sorted[i]; - - cond_resched(); - - if (k->level || k->btree_id == BTREE_ID_alloc) - continue; - - replay_now_at(j, keys->journal_seq_base + k->journal_seq); - - ret = bch2_journal_replay_key(c, k); - if (ret) + ret = bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_RESERVED| + (!k->allocated ? 
BTREE_INSERT_JOURNAL_REPLAY : 0), + bch2_journal_replay_key(&trans, k)); + if (ret) { + bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", + ret, bch2_btree_ids[k->btree_id], k->level); goto err; + } } replay_now_at(j, j->replay_journal_seq_end); @@ -686,14 +561,9 @@ static int bch2_journal_replay(struct bch_fs *c) bch2_journal_set_replay_done(j); bch2_journal_flush_all_pins(j); - kfree(keys_sorted); - - return bch2_journal_error(j); + ret = bch2_journal_error(j); err: - bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", - ret, bch2_btree_ids[k->btree_id], k->level); kfree(keys_sorted); - return ret; } @@ -1217,7 +1087,11 @@ use_clean: bch_verbose(c, "starting alloc read"); err = "error reading allocation information"; - ret = bch2_alloc_read(c); + + down_read(&c->gc_lock); + ret = bch2_alloc_read(c, false, false); + up_read(&c->gc_lock); + if (ret) goto err; bch_verbose(c, "alloc read done"); @@ -1231,6 +1105,13 @@ use_clean: set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + /* + * If we're not running fsck, this ensures bch2_fsck_err() calls are + * instead interpreted as bch2_inconsistent_err() calls: + */ + if (!c->opts.fsck) + set_bit(BCH_FS_FSCK_DONE, &c->flags); + if (c->opts.fsck || !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) || !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) || @@ -1265,24 +1146,8 @@ use_clean: ret = bch2_journal_replay(c); if (ret) goto err; - bch_verbose(c, "journal replay done"); - - if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) && - !c->opts.nochanges) { - /* - * note that even when filesystem was clean there might be work - * to do here, if we ran gc (because of fsck) which recalculated - * oldest_gen: - */ - bch_verbose(c, "writing allocation info"); - err = "error writing out alloc info"; - ret = bch2_alloc_write_all(c, BTREE_INSERT_LAZY_RW); - if (ret) { - bch_err(c, "error writing alloc info"); - goto err; - } - bch_verbose(c, "alloc write done"); - } + if (c->opts.verbose || !c->sb.clean) + bch_info(c, "journal replay done"); if (c->sb.version < bcachefs_metadata_version_snapshot_2) { bch2_fs_lazy_rw(c); @@ -1430,14 +1295,11 @@ int bch2_fs_initialize(struct bch_fs *c) set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + set_bit(BCH_FS_FSCK_DONE, &c->flags); for (i = 0; i < BTREE_ID_NR; i++) bch2_btree_root_alloc(c, i); - set_bit(BCH_FS_ALLOC_REPLAY_DONE, &c->flags); - set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); - set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); - err = "unable to allocate journal buckets"; for_each_online_member(ca, c, i) { ret = bch2_dev_journal_alloc(ca); diff --git a/libbcachefs/recovery.h b/libbcachefs/recovery.h index a7a9496a..21bdad9d 100644 --- a/libbcachefs/recovery.h +++ b/libbcachefs/recovery.h @@ -31,6 +31,9 @@ struct btree_and_journal_iter { } last; }; +size_t bch2_journal_key_search(struct journal_keys *, enum btree_id, + unsigned, struct bpos); + int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, unsigned, struct bkey_i *); int bch2_journal_key_insert(struct bch_fs *, enum btree_id, @@ -45,14 +48,13 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter * struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, + struct bch_fs *, struct btree *, + struct btree_node_iter, struct bpos); 
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, struct bch_fs *, struct btree *); -typedef int (*btree_walk_key_fn)(struct btree_trans *, struct bkey_s_c); - -int bch2_btree_and_journal_walk(struct btree_trans *, enum btree_id, btree_walk_key_fn); - void bch2_journal_keys_free(struct journal_keys *); void bch2_journal_entries_free(struct list_head *); diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index bbed24b7..8e28a13a 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -752,11 +752,24 @@ int bch2_write_super(struct bch_fs *c) closure_sync(cl); for_each_online_member(ca, c, i) { - if (!ca->sb_write_error && - ca->disk_sb.seq != - le64_to_cpu(ca->sb_read_scratch->seq)) { + if (ca->sb_write_error) + continue; + + if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) { bch2_fs_fatal_error(c, - "Superblock modified by another process"); + "Superblock write was silently dropped! (seq %llu expected %llu)", + le64_to_cpu(ca->sb_read_scratch->seq), + ca->disk_sb.seq); + percpu_ref_put(&ca->io_ref); + ret = -EROFS; + goto out; + } + + if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { + bch2_fs_fatal_error(c, + "Superblock modified by another process (seq %llu expected %llu)", + le64_to_cpu(ca->sb_read_scratch->seq), + ca->disk_sb.seq); percpu_ref_put(&ca->io_ref); ret = -EROFS; goto out;