From 47c554c31abd26a23906b43d756569e64ff60f8d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 5 Jan 2022 19:39:57 -0500 Subject: [PATCH] Update bcachefs sources to 5242db9aec bcachefs: Fix bch2_check_fix_ptrs() --- .bcachefs_revision | 2 +- libbcachefs/alloc_background.c | 237 ++++++++++++++----- libbcachefs/alloc_background.h | 42 +++- libbcachefs/bcachefs.h | 1 + libbcachefs/btree_gc.c | 406 ++++++++++++--------------------- libbcachefs/btree_iter.c | 17 -- libbcachefs/btree_update.h | 17 ++ libbcachefs/buckets.c | 16 +- libbcachefs/recovery.c | 23 +- 9 files changed, 393 insertions(+), 368 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index 79a03365..8226b3a6 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -50ac18afbb522a3103cecff9aaf9519d4eb5e908 +5242db9aec10220b6ee7162ba7bec173417348cf diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index df340ebb..688a53b4 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -38,6 +38,15 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { #undef x }; +struct bkey_alloc_buf { + struct bkey_i k; + struct bch_alloc_v3 v; + +#define x(_name, _bits) + _bits / 8 + u8 _pad[0 + BCH_ALLOC_FIELDS_V2()]; +#undef x +} __attribute__((packed, aligned(8))); + /* Persistent alloc info: */ static inline u64 alloc_field_v1_get(const struct bch_alloc *a, @@ -244,25 +253,24 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) return ret; } -struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *trans, - const struct bkey_alloc_unpacked src) +static void bch2_alloc_pack(struct bch_fs *c, + struct bkey_alloc_buf *dst, + const struct bkey_alloc_unpacked src) { - struct bkey_alloc_buf *dst; - - dst = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); - if (!IS_ERR(dst)) - bch2_alloc_pack_v3(dst, src); - - return dst; + bch2_alloc_pack_v3(dst, src); } int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter, struct bkey_alloc_unpacked *u, unsigned trigger_flags) { - struct bkey_alloc_buf *a = bch2_alloc_pack(trans, *u); + struct bkey_alloc_buf *a; - return PTR_ERR_OR_ZERO(a) ?: - bch2_trans_update(trans, iter, &a->k, trigger_flags); + a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); + if (IS_ERR(a)) + return PTR_ERR(a); + + bch2_alloc_pack(trans->c, a, *u); + return bch2_trans_update(trans, iter, &a->k, trigger_flags); } static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) @@ -332,7 +340,7 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, #undef x } -int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only) +int bch2_alloc_read(struct bch_fs *c) { struct btree_trans trans; struct btree_iter iter; @@ -343,43 +351,108 @@ int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only) int ret; bch2_trans_init(&trans, c, 0, 0); + down_read(&c->gc_lock); for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { + if (!bkey_is_alloc(k.k)) + continue; + ca = bch_dev_bkey_exists(c, k.k->p.inode); - g = __bucket(ca, k.k->p.offset, gc); + g = bucket(ca, k.k->p.offset); u = bch2_alloc_unpack(k); - if (!gc) - *bucket_gen(ca, k.k->p.offset) = u.gen; - + *bucket_gen(ca, k.k->p.offset) = u.gen; g->_mark.gen = u.gen; + g->_mark.data_type = u.data_type; + g->_mark.dirty_sectors = u.dirty_sectors; + g->_mark.cached_sectors = u.cached_sectors; + g->_mark.stripe = u.stripe != 0; + g->stripe = u.stripe; + g->stripe_redundancy = u.stripe_redundancy; 
g->io_time[READ] = u.read_time; g->io_time[WRITE] = u.write_time; - g->oldest_gen = !gc ? u.oldest_gen : u.gen; + g->oldest_gen = u.oldest_gen; g->gen_valid = 1; - - if (!gc || - (metadata_only && - (u.data_type == BCH_DATA_user || - u.data_type == BCH_DATA_cached || - u.data_type == BCH_DATA_parity))) { - g->_mark.data_type = u.data_type; - g->_mark.dirty_sectors = u.dirty_sectors; - g->_mark.cached_sectors = u.cached_sectors; - g->_mark.stripe = u.stripe != 0; - g->stripe = u.stripe; - g->stripe_redundancy = u.stripe_redundancy; - } - } bch2_trans_iter_exit(&trans, &iter); + up_read(&c->gc_lock); bch2_trans_exit(&trans); - if (ret) + if (ret) { bch_err(c, "error reading alloc info: %i", ret); + return ret; + } + return 0; +} + +static int bch2_alloc_write_key(struct btree_trans *trans, + struct btree_iter *iter, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + struct bkey_alloc_unpacked old_u, new_u; + int ret; +retry: + bch2_trans_begin(trans); + + ret = bch2_btree_key_cache_flush(trans, + BTREE_ID_alloc, iter->pos); + if (ret) + goto err; + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + old_u = bch2_alloc_unpack(k); + new_u = alloc_mem_to_key(c, iter); + + if (!bkey_alloc_unpacked_cmp(old_u, new_u)) + return 0; + + ret = bch2_alloc_write(trans, iter, &new_u, + BTREE_TRIGGER_NORUN) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL|flags); +err: + if (ret == -EINTR) + goto retry; + return ret; +} + +int bch2_alloc_write_all(struct bch_fs *c, unsigned flags) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bch_dev *ca; + unsigned i; + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + for_each_member_device(ca, c, i) { + bch2_btree_iter_set_pos(&iter, + POS(ca->dev_idx, ca->mi.first_bucket)); + + while (iter.pos.offset < ca->mi.nbuckets) { + ret = bch2_alloc_write_key(&trans, &iter, flags); + if (ret) { + percpu_ref_put(&ca->ref); + goto err; + } + bch2_btree_iter_advance(&iter); + } + } +err: + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); return ret; } @@ -390,20 +463,19 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, { struct bch_fs *c = trans->c; struct btree_iter iter; - struct bkey_s_c k; struct bkey_alloc_unpacked u; u64 *time, now; int ret = 0; bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr), BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); + ret = bch2_btree_iter_traverse(&iter); if (ret) goto out; - u = bch2_alloc_unpack(k); + u = alloc_mem_to_key(c, &iter); time = rw == READ ? 
&u.read_time : &u.write_time; now = atomic64_read(&c->io_clock[rw].now); @@ -586,34 +658,56 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) return nr; } +/* + * returns sequence number of most recent journal entry that updated this + * bucket: + */ +static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m) +{ + if (m.journal_seq_valid) { + u64 journal_seq = atomic64_read(&c->journal.seq); + u64 bucket_seq = journal_seq; + + bucket_seq &= ~((u64) U16_MAX); + bucket_seq |= m.journal_seq; + + if (bucket_seq > journal_seq) + bucket_seq -= 1 << 16; + + return bucket_seq; + } else { + return 0; + } +} + static int bucket_invalidate_btree(struct btree_trans *trans, - struct bch_dev *ca, u64 b, - struct bkey_alloc_unpacked *u) + struct bch_dev *ca, u64 b) { struct bch_fs *c = trans->c; + struct bkey_alloc_unpacked u; struct btree_iter iter; - struct bkey_s_c k; int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); + ret = bch2_btree_iter_traverse(&iter); if (ret) goto err; - *u = bch2_alloc_unpack(k); - u->gen++; - u->data_type = 0; - u->dirty_sectors = 0; - u->cached_sectors = 0; - u->read_time = atomic64_read(&c->io_clock[READ].now); - u->write_time = atomic64_read(&c->io_clock[WRITE].now); + u = alloc_mem_to_key(c, &iter); - ret = bch2_alloc_write(trans, &iter, u, + u.gen++; + u.data_type = 0; + u.dirty_sectors = 0; + u.cached_sectors = 0; + u.read_time = atomic64_read(&c->io_clock[READ].now); + u.write_time = atomic64_read(&c->io_clock[WRITE].now); + + ret = bch2_alloc_write(trans, &iter, &u, BTREE_TRIGGER_BUCKET_INVALIDATE); err: bch2_trans_iter_exit(trans, &iter); @@ -623,23 +717,21 @@ err: static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 *journal_seq, unsigned flags) { - struct bkey_alloc_unpacked u; + struct bucket *g; + struct bucket_mark m; size_t b; int ret = 0; - /* - * If the read-only path is trying to shut down, we can't be generating - * new btree updates: - */ - if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) - return 1; - BUG_ON(!ca->alloc_heap.used || !ca->alloc_heap.data[0].nr); b = ca->alloc_heap.data[0].bucket; /* first, put on free_inc and mark as owned by allocator: */ percpu_down_read(&c->mark_lock); + g = bucket(ca, b); + m = READ_ONCE(g->mark); + + BUG_ON(m.dirty_sectors); bch2_mark_alloc_bucket(c, ca, b, true); @@ -648,15 +740,38 @@ static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, BUG_ON(!fifo_push(&ca->free_inc, b)); spin_unlock(&c->freelist_lock); + /* + * If we're not invalidating cached data, we only increment the bucket + * gen in memory here, the incremented gen will be updated in the btree + * by bch2_trans_mark_pointer(): + */ + if (!m.cached_sectors && + !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) { + BUG_ON(m.data_type); + bucket_cmpxchg(g, m, m.gen++); + *bucket_gen(ca, b) = m.gen; + percpu_up_read(&c->mark_lock); + goto out; + } + percpu_up_read(&c->mark_lock); + /* + * If the read-only path is trying to shut down, we can't be generating + * new btree updates: + */ + if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) { + ret = 1; + goto out; + } + ret = bch2_trans_do(c, NULL, journal_seq, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| BTREE_INSERT_JOURNAL_RESERVED| flags, - bucket_invalidate_btree(&trans, ca, b, &u)); - + bucket_invalidate_btree(&trans, ca, b)); +out: if (!ret) { /* 
remove from alloc_heap: */ struct alloc_heap_entry e, *top = ca->alloc_heap.data; @@ -672,7 +787,7 @@ static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, * bucket (i.e. deleting the last reference) before writing to * this bucket again: */ - *journal_seq = max(*journal_seq, u.journal_seq); + *journal_seq = max(*journal_seq, bucket_journal_seq(c, m)); } else { size_t b2; diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index 98c7866e..86b64177 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -38,23 +38,40 @@ static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, ; } -struct bkey_alloc_buf { - struct bkey_i k; - struct bch_alloc_v3 v; - -#define x(_name, _bits) + _bits / 8 - u8 _pad[0 + BCH_ALLOC_FIELDS_V2()]; -#undef x -} __attribute__((packed, aligned(8))); - struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); -struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *, - const struct bkey_alloc_unpacked); int bch2_alloc_write(struct btree_trans *, struct btree_iter *, struct bkey_alloc_unpacked *, unsigned); int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); +static inline struct bkey_alloc_unpacked +alloc_mem_to_key(struct bch_fs *c, struct btree_iter *iter) +{ + struct bch_dev *ca; + struct bucket *g; + struct bkey_alloc_unpacked ret; + + percpu_down_read(&c->mark_lock); + ca = bch_dev_bkey_exists(c, iter->pos.inode); + g = bucket(ca, iter->pos.offset); + ret = (struct bkey_alloc_unpacked) { + .dev = iter->pos.inode, + .bucket = iter->pos.offset, + .gen = g->mark.gen, + .oldest_gen = g->oldest_gen, + .data_type = g->mark.data_type, + .dirty_sectors = g->mark.dirty_sectors, + .cached_sectors = g->mark.cached_sectors, + .read_time = g->io_time[READ], + .write_time = g->io_time[WRITE], + .stripe = g->stripe, + .stripe_redundancy = g->stripe_redundancy, + }; + percpu_up_read(&c->mark_lock); + + return ret; +} + #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c); @@ -84,7 +101,7 @@ static inline bool bkey_is_alloc(const struct bkey *k) k->type == KEY_TYPE_alloc_v3; } -int bch2_alloc_read(struct bch_fs *, bool, bool); +int bch2_alloc_read(struct bch_fs *); static inline void bch2_wake_allocator(struct bch_dev *ca) { @@ -122,6 +139,7 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_stop(struct bch_dev *); int bch2_dev_allocator_start(struct bch_dev *); +int bch2_alloc_write_all(struct bch_fs *, unsigned); void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 7b39a419..c64db2bf 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -534,6 +534,7 @@ enum { /* misc: */ BCH_FS_NEED_ANOTHER_GC, BCH_FS_DELETED_NODES, + BCH_FS_NEED_ALLOC_WRITE, BCH_FS_REBUILD_REPLICAS, BCH_FS_HOLD_BTREE_WRITES, }; diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 268ad74d..a201052e 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -9,7 +9,6 @@ #include "alloc_foreground.h" #include "bkey_methods.h" #include "bkey_buf.h" -#include "btree_key_cache.h" #include "btree_locking.h" #include "btree_update_interior.h" #include "btree_io.h" @@ -534,6 +533,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { struct 
bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + struct bucket *g2 = PTR_BUCKET(ca, &p.ptr); enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); if (fsck_err_on(!g->gen_valid, c, @@ -544,8 +544,9 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, p.ptr.gen, (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { if (!p.ptr.cached) { - g->_mark.gen = p.ptr.gen; - g->gen_valid = true; + g2->_mark.gen = g->_mark.gen = p.ptr.gen; + g2->gen_valid = g->gen_valid = true; + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); } else { do_update = true; } @@ -559,12 +560,13 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, p.ptr.gen, g->mark.gen, (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { if (!p.ptr.cached) { - g->_mark.gen = p.ptr.gen; - g->gen_valid = true; - g->_mark.data_type = 0; - g->_mark.dirty_sectors = 0; - g->_mark.cached_sectors = 0; + g2->_mark.gen = g->_mark.gen = p.ptr.gen; + g2->gen_valid = g->gen_valid = true; + g2->_mark.data_type = 0; + g2->_mark.dirty_sectors = 0; + g2->_mark.cached_sectors = 0; set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); } else { do_update = true; } @@ -601,8 +603,9 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, bch2_data_types[data_type], (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { if (data_type == BCH_DATA_btree) { - g->_mark.data_type = data_type; - set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + g2->_mark.data_type = g->_mark.data_type = data_type; + g2->gen_valid = g->gen_valid = true; + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); } else { do_update = true; } @@ -1166,14 +1169,13 @@ static int bch2_gc_done(struct bch_fs *c, unsigned i, dev; int ret = 0; - percpu_down_write(&c->mark_lock); - #define copy_field(_f, _msg, ...) \ if (dst->_f != src->_f) { \ if (verify) \ fsck_err(c, _msg ": got %llu, should be %llu" \ , ##__VA_ARGS__, dst->_f, src->_f); \ dst->_f = src->_f; \ + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_stripe_field(_f, _msg, ...) \ if (dst->_f != src->_f) { \ @@ -1183,6 +1185,18 @@ static int bch2_gc_done(struct bch_fs *c, iter.pos, ##__VA_ARGS__, \ dst->_f, src->_f); \ dst->_f = src->_f; \ + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ + } +#define copy_bucket_field(_f) \ + if (dst->b[b]._f != src->b[b]._f) { \ + if (verify) \ + fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ + ": got %u, should be %u", dev, b, \ + dst->b[b].mark.gen, \ + bch2_data_types[dst->b[b].mark.data_type],\ + dst->b[b]._f, src->b[b]._f); \ + dst->b[b]._f = src->b[b]._f; \ + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_dev_field(_f, _msg, ...) 
\ copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) @@ -1193,18 +1207,36 @@ static int bch2_gc_done(struct bch_fs *c, bch2_fs_usage_acc_to_base(c, i); for_each_member_device(ca, c, dev) { - struct bch_dev_usage *dst = ca->usage_base; - struct bch_dev_usage *src = (void *) - bch2_acc_percpu_u64s((void *) ca->usage_gc, - dev_usage_u64s()); + struct bucket_array *dst = __bucket_array(ca, 0); + struct bucket_array *src = __bucket_array(ca, 1); + size_t b; - copy_dev_field(buckets_ec, "buckets_ec"); - copy_dev_field(buckets_unavailable, "buckets_unavailable"); + for (b = 0; b < src->nbuckets; b++) { + copy_bucket_field(_mark.gen); + copy_bucket_field(_mark.data_type); + copy_bucket_field(_mark.stripe); + copy_bucket_field(_mark.dirty_sectors); + copy_bucket_field(_mark.cached_sectors); + copy_bucket_field(stripe_redundancy); + copy_bucket_field(stripe); - for (i = 0; i < BCH_DATA_NR; i++) { - copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); - copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); - copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); + dst->b[b].oldest_gen = src->b[b].oldest_gen; + } + + { + struct bch_dev_usage *dst = ca->usage_base; + struct bch_dev_usage *src = (void *) + bch2_acc_percpu_u64s((void *) ca->usage_gc, + dev_usage_u64s()); + + copy_dev_field(buckets_ec, "buckets_ec"); + copy_dev_field(buckets_unavailable, "buckets_unavailable"); + + for (i = 0; i < BCH_DATA_NR; i++) { + copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); + copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); + copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); + } } }; @@ -1246,6 +1278,7 @@ static int bch2_gc_done(struct bch_fs *c, #undef copy_fs_field #undef copy_dev_field +#undef copy_bucket_field #undef copy_stripe_field #undef copy_field fsck_err: @@ -1253,8 +1286,6 @@ fsck_err: percpu_ref_put(&ca->ref); if (ret) bch_err(c, "%s: ret %i", __func__, ret); - - percpu_up_write(&c->mark_lock); return ret; } @@ -1277,6 +1308,15 @@ static int bch2_gc_start(struct bch_fs *c, BUG_ON(ca->buckets[1]); BUG_ON(ca->usage_gc); + ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket), + GFP_KERNEL|__GFP_ZERO); + if (!ca->buckets[1]) { + percpu_ref_put(&ca->ref); + bch_err(c, "error allocating ca->buckets[gc]"); + return -ENOMEM; + } + ca->usage_gc = alloc_percpu(struct bch_dev_usage); if (!ca->usage_gc) { bch_err(c, "error allocating ca->usage_gc"); @@ -1285,184 +1325,39 @@ static int bch2_gc_start(struct bch_fs *c, } } - return 0; -} - -static int bch2_alloc_write_key(struct btree_trans *trans, - struct btree_iter *iter, - bool initial, bool metadata_only) -{ - struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); - struct bucket *g; - struct bkey_s_c k; - struct bkey_alloc_unpacked old_u, new_u, gc_u; - struct bkey_alloc_buf *a; - int ret; + percpu_down_write(&c->mark_lock); /* - * For this to be correct at runtime, we'll need to figure out a way for - * it to actually lock the key in the btree key cache: + * indicate to stripe code that we need to allocate for the gc stripes + * radix tree, too */ - - if (!initial) { - ret = bch2_btree_key_cache_flush(trans, - BTREE_ID_alloc, iter->pos); - if (ret) - return ret; - } - - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - return ret; - - old_u = new_u = bch2_alloc_unpack(k); - - percpu_down_read(&c->mark_lock); - g = gc_bucket(ca, iter->pos.offset); - gc_u = 
(struct bkey_alloc_unpacked) { - .dev = iter->pos.inode, - .bucket = iter->pos.offset, - .gen = g->mark.gen, - .oldest_gen = g->oldest_gen, - .data_type = g->mark.data_type, - .dirty_sectors = g->mark.dirty_sectors, - .cached_sectors = g->mark.cached_sectors, - .read_time = g->io_time[READ], - .write_time = g->io_time[WRITE], - .stripe = g->stripe, - .stripe_redundancy = g->stripe_redundancy, - }; - percpu_up_read(&c->mark_lock); - - if (metadata_only && - gc_u.data_type != BCH_DATA_sb && - gc_u.data_type != BCH_DATA_journal && - gc_u.data_type != BCH_DATA_btree) - return 0; - - if (!bkey_alloc_unpacked_cmp(old_u, gc_u) || - gen_after(old_u.gen, gc_u.gen)) - return 0; - -#define copy_bucket_field(_f) \ - if (fsck_err_on(new_u._f != gc_u._f, c, \ - "bucket %llu:%llu gen %u data type %s has wrong " #_f \ - ": got %u, should be %u", \ - iter->pos.inode, iter->pos.offset, \ - new_u.gen, \ - bch2_data_types[new_u.data_type], \ - new_u._f, gc_u._f)) \ - new_u._f = gc_u._f; \ - - copy_bucket_field(gen); - copy_bucket_field(data_type); - copy_bucket_field(stripe); - copy_bucket_field(dirty_sectors); - copy_bucket_field(cached_sectors); - copy_bucket_field(stripe_redundancy); - copy_bucket_field(stripe); -#undef copy_bucket_field - - new_u.oldest_gen = gc_u.oldest_gen; - - if (!bkey_alloc_unpacked_cmp(old_u, new_u)) - return 0; - - a = bch2_alloc_pack(trans, new_u); - if (IS_ERR(a)) - return PTR_ERR(a); - - ret = initial - ? bch2_journal_key_insert(c, BTREE_ID_alloc, 0, &a->k) - : bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_NORUN); -fsck_err: - return ret; -} - -static int bch2_gc_alloc_done(struct bch_fs *c, bool initial, bool metadata_only) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_dev *ca; - unsigned i; - int ret = 0; - - bch2_trans_init(&trans, c, 0, 0); + gc_pos_set(c, gc_phase(GC_PHASE_START)); for_each_member_device(ca, c, i) { - for_each_btree_key(&trans, iter, BTREE_ID_alloc, - POS(ca->dev_idx, ca->mi.first_bucket), - BTREE_ITER_SLOTS| - BTREE_ITER_PREFETCH, k, ret) { - if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) - break; + struct bucket_array *dst = __bucket_array(ca, 1); + struct bucket_array *src = __bucket_array(ca, 0); + size_t b; - ret = __bch2_trans_do(&trans, NULL, NULL, - BTREE_INSERT_LAZY_RW, - bch2_alloc_write_key(&trans, &iter, - initial, metadata_only)); - if (ret) - break; - } - bch2_trans_iter_exit(&trans, &iter); + dst->first_bucket = src->first_bucket; + dst->nbuckets = src->nbuckets; - if (ret) { - bch_err(c, "error writing alloc info: %i", ret); - percpu_ref_put(&ca->ref); - break; - } - } + for (b = 0; b < src->nbuckets; b++) { + struct bucket *d = &dst->b[b]; + struct bucket *s = &src->b[b]; - bch2_trans_exit(&trans); - return ret; -} + d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; + d->gen_valid = s->gen_valid; -static int bch2_gc_alloc_start(struct bch_fs *c, bool initial, bool metadata_only) -{ - struct bch_dev *ca; - unsigned i; - - for_each_member_device(ca, c, i) { - struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + - ca->mi.nbuckets * sizeof(struct bucket), - GFP_KERNEL|__GFP_ZERO); - if (!buckets) { - percpu_ref_put(&ca->ref); - percpu_up_write(&c->mark_lock); - bch_err(c, "error allocating ca->buckets[gc]"); - return -ENOMEM; - } - - buckets->first_bucket = ca->mi.first_bucket; - buckets->nbuckets = ca->mi.nbuckets; - rcu_assign_pointer(ca->buckets[1], buckets); - }; - - return bch2_alloc_read(c, true, metadata_only); -} - -static void 
bch2_gc_alloc_reset(struct bch_fs *c, bool initial, bool metadata_only) -{ - struct bch_dev *ca; - unsigned i; - - for_each_member_device(ca, c, i) { - struct bucket_array *buckets = __bucket_array(ca, true); - struct bucket *g; - - for_each_bucket(g, buckets) { if (metadata_only && - (g->mark.data_type == BCH_DATA_user || - g->mark.data_type == BCH_DATA_cached || - g->mark.data_type == BCH_DATA_parity)) - continue; - g->_mark.dirty_sectors = 0; - g->_mark.cached_sectors = 0; + (s->mark.data_type == BCH_DATA_user || + s->mark.data_type == BCH_DATA_cached)) + d->_mark = s->mark; } }; + + percpu_up_write(&c->mark_lock); + + return 0; } static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, @@ -1535,55 +1430,6 @@ fsck_err: return ret; } -static void bch2_gc_reflink_reset(struct bch_fs *c, bool initial, - bool metadata_only) -{ - struct genradix_iter iter; - struct reflink_gc *r; - - genradix_for_each(&c->reflink_gc_table, iter, r) - r->refcount = 0; -} - -static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, - bool metadata_only) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - struct reflink_gc *r; - int ret = 0; - - if (metadata_only) - return 0; - - bch2_trans_init(&trans, c, 0, 0); - c->reflink_gc_nr = 0; - - for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - const __le64 *refcount = bkey_refcount_c(k); - - if (!refcount) - continue; - - r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, - GFP_KERNEL); - if (!r) { - ret = -ENOMEM; - break; - } - - r->offset = k.k->p.offset; - r->size = k.k->size; - r->refcount = 0; - } - bch2_trans_iter_exit(&trans, &iter); - - bch2_trans_exit(&trans); - return ret; -} - static int bch2_gc_stripes_done(struct bch_fs *c, bool initial, bool metadata_only) { @@ -1647,10 +1493,43 @@ fsck_err: return ret; } -static void bch2_gc_stripes_reset(struct bch_fs *c, bool initial, - bool metadata_only) +static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, + bool metadata_only) { - genradix_free(&c->gc_stripes); + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct reflink_gc *r; + int ret = 0; + + if (metadata_only) + return 0; + + bch2_trans_init(&trans, c, 0, 0); + c->reflink_gc_nr = 0; + + for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + const __le64 *refcount = bkey_refcount_c(k); + + if (!refcount) + continue; + + r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, + GFP_KERNEL); + if (!r) { + ret = -ENOMEM; + break; + } + + r->offset = k.k->p.offset; + r->size = k.k->size; + r->refcount = 0; + } + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + return ret; } /** @@ -1686,14 +1565,11 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) /* flush interior btree updates: */ closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c)); - +again: ret = bch2_gc_start(c, metadata_only) ?: - bch2_gc_alloc_start(c, initial, metadata_only) ?: bch2_gc_reflink_start(c, initial, metadata_only); if (ret) goto out; -again: - gc_pos_set(c, gc_phase(GC_PHASE_START)); bch2_mark_superblocks(c); @@ -1731,40 +1607,40 @@ again: if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || (!iter && bch2_test_restart_gc)) { - if (iter++ > 2) { - bch_info(c, "Unable to fix bucket gens, looping"); - ret = -EINVAL; - goto out; - } - /* * XXX: make sure gens we fixed got saved */ - bch_info(c, "Second GC pass needed, 
restarting:"); - clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); - __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); + if (iter++ <= 2) { + bch_info(c, "Second GC pass needed, restarting:"); + clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); - bch2_gc_stripes_reset(c, initial, metadata_only); - bch2_gc_alloc_reset(c, initial, metadata_only); - bch2_gc_reflink_reset(c, initial, metadata_only); + percpu_down_write(&c->mark_lock); + bch2_gc_free(c); + percpu_up_write(&c->mark_lock); + /* flush fsck errors, reset counters */ + bch2_flush_fsck_errs(c); - /* flush fsck errors, reset counters */ - bch2_flush_fsck_errs(c); - goto again; + goto again; + } + + bch_info(c, "Unable to fix bucket gens, looping"); + ret = -EINVAL; } out: if (!ret) { bch2_journal_block(&c->journal); - ret = bch2_gc_stripes_done(c, initial, metadata_only) ?: - bch2_gc_reflink_done(c, initial, metadata_only) ?: - bch2_gc_alloc_done(c, initial, metadata_only) ?: + percpu_down_write(&c->mark_lock); + ret = bch2_gc_reflink_done(c, initial, metadata_only) ?: + bch2_gc_stripes_done(c, initial, metadata_only) ?: bch2_gc_done(c, initial, metadata_only); bch2_journal_unblock(&c->journal); + } else { + percpu_down_write(&c->mark_lock); } - percpu_down_write(&c->mark_lock); /* Indicates that gc is no longer in progress: */ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index db179013..2ae4e523 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -2182,23 +2182,6 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) return ret; } -static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, - enum btree_id btree_id, - struct bpos pos) -{ - struct btree_insert_entry *i; - - trans_for_each_update(trans, i) - if ((cmp_int(btree_id, i->btree_id) ?: - bpos_cmp(pos, i->k->k.p)) <= 0) { - if (btree_id == i->btree_id) - return i->k; - break; - } - - return NULL; -} - static noinline struct bkey_i *__btree_trans_peek_journal(struct btree_trans *trans, struct btree_path *path) diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index 90ea018d..16ebf1a2 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -135,4 +135,21 @@ static inline int bch2_trans_commit(struct btree_trans *trans, (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) +static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, + enum btree_id btree_id, + struct bpos pos) +{ + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) + if ((cmp_int(btree_id, i->btree_id) ?: + bpos_cmp(pos, i->k->k.p)) <= 0) { + if (btree_id == i->btree_id) + return i->k; + break; + } + + return NULL; +} + #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index fb0f64f0..895ff255 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -1459,22 +1459,24 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bkey_s_c k; + struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); + struct bkey_i *update = btree_trans_peek_updates(trans, BTREE_ID_alloc, pos); int ret; - bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, - POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)), - BTREE_ITER_WITH_UPDATES| + bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| 
BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); + ret = bch2_btree_iter_traverse(iter); if (ret) { bch2_trans_iter_exit(trans, iter); return ret; } - *u = bch2_alloc_unpack(k); + *u = update && !bpos_cmp(update->k.p, pos) + ? bch2_alloc_unpack(bkey_i_to_s_c(update)) + : alloc_mem_to_key(c, iter); + return 0; } diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 7e4400cc..b818093e 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -1095,11 +1095,7 @@ use_clean: bch_verbose(c, "starting alloc read"); err = "error reading allocation information"; - - down_read(&c->gc_lock); - ret = bch2_alloc_read(c, false, false); - up_read(&c->gc_lock); - + ret = bch2_alloc_read(c); if (ret) goto err; bch_verbose(c, "alloc read done"); @@ -1157,6 +1153,23 @@ use_clean: if (c->opts.verbose || !c->sb.clean) bch_info(c, "journal replay done"); + if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) && + !c->opts.nochanges) { + /* + * note that even when filesystem was clean there might be work + * to do here, if we ran gc (because of fsck) which recalculated + * oldest_gen: + */ + bch_verbose(c, "writing allocation info"); + err = "error writing out alloc info"; + ret = bch2_alloc_write_all(c, BTREE_INSERT_LAZY_RW); + if (ret) { + bch_err(c, "error writing alloc info"); + goto err; + } + bch_verbose(c, "alloc write done"); + } + if (c->sb.version < bcachefs_metadata_version_snapshot_2) { bch2_fs_lazy_rw(c);
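
Note on the allocator changes above: the new bucket_journal_seq() helper recovers a full 64-bit journal sequence from the 16-bit snapshot kept in the bucket mark by splicing those low bits into the current journal sequence and stepping back one 16-bit window when the result would land in the future. Below is a minimal standalone sketch of that reconstruction, assuming simplified stand-in types; struct mark_snapshot and the demo values are illustrative only, not bcachefs definitions.

    #include <stdint.h>
    #include <stdio.h>

    /* Simplified stand-in for the 16-bit sequence snapshot in a bucket mark. */
    struct mark_snapshot {
    	uint16_t journal_seq;	/* low 16 bits of the journal seq at last update */
    	int	 valid;		/* mirrors m.journal_seq_valid */
    };

    /*
     * Reconstruct the full sequence: take the current journal sequence,
     * substitute the stored low 16 bits, and if that lands ahead of the
     * current sequence it must belong to the previous 16-bit window, so
     * subtract 1 << 16.
     */
    static uint64_t bucket_journal_seq(uint64_t journal_seq, struct mark_snapshot m)
    {
    	if (!m.valid)
    		return 0;

    	uint64_t bucket_seq = journal_seq;

    	bucket_seq &= ~(uint64_t) UINT16_MAX;
    	bucket_seq |= m.journal_seq;

    	if (bucket_seq > journal_seq)
    		bucket_seq -= 1 << 16;

    	return bucket_seq;
    }

    int main(void)
    {
    	struct mark_snapshot m = { .journal_seq = 0x0005, .valid = 1 };

    	/*
    	 * Current seq 0x20003: the stored low bits 0x0005 would be "ahead",
    	 * so the update came from the previous window -> 0x10005.
    	 */
    	printf("%#llx\n", (unsigned long long) bucket_journal_seq(0x20003, m));
    	return 0;
    }

In the patch itself this value feeds the journal flush decision in bch2_invalidate_one_bucket(), replacing the journal_seq previously read back from the unpacked alloc key.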