diff --git a/.bcachefs_revision b/.bcachefs_revision index 71e83e28..4bc1040c 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -bf340e68c74cdb70c692698ef7367b9dc6f6e61f +b84661c042c7d5caaab3f79661d04789070bea78 diff --git a/cmd_migrate.c b/cmd_migrate.c index fc863f89..4772b3bd 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -328,7 +328,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst, bch2_bkey_append_ptr(&e->k_i, (struct bch_extent_ptr) { .offset = physical, .dev = 0, - .gen = bucket(ca, b)->mark.gen, + .gen = *bucket_gen(ca, b), }); ret = bch2_disk_reservation_get(c, &res, sectors, 1, diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 2bfbfadb..6d039ea3 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -60,6 +60,7 @@ #define unlikely(x) __builtin_expect(!!(x), 0) #define unreachable() __builtin_unreachable() #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) +#define fallthrough __attribute__((__fallthrough__)) #define ___PASTE(a,b) a##b #define __PASTE(a,b) ___PASTE(a,b) diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index 8f10d13b..36c4c884 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -658,6 +658,12 @@ DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, TP_ARGS(trans_fn, caller_ip) ); +DEFINE_EVENT(transaction_restart, trans_restart_key_cache_raced, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip), + TP_ARGS(trans_fn, caller_ip) +); + DECLARE_EVENT_CLASS(transaction_restart_iter, TP_PROTO(const char *trans_fn, unsigned long caller_ip, diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 7ad16c21..0a5ec99e 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -39,15 +39,6 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { #undef x }; -struct bkey_alloc_buf { - struct bkey_i k; - struct bch_alloc_v3 v; - -#define x(_name, _bits) + _bits / 8 - u8 _pad[0 + BCH_ALLOC_FIELDS_V2()]; -#undef x -} __attribute__((packed, aligned(8))); - /* Persistent alloc info: */ static inline u64 alloc_field_v1_get(const struct bch_alloc *a, @@ -254,24 +245,25 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) return ret; } -static void bch2_alloc_pack(struct bch_fs *c, - struct bkey_alloc_buf *dst, - const struct bkey_alloc_unpacked src) +struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *trans, + const struct bkey_alloc_unpacked src) { - bch2_alloc_pack_v3(dst, src); + struct bkey_alloc_buf *dst; + + dst = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); + if (!IS_ERR(dst)) + bch2_alloc_pack_v3(dst, src); + + return dst; } int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter, struct bkey_alloc_unpacked *u, unsigned trigger_flags) { - struct bkey_alloc_buf *a; + struct bkey_alloc_buf *a = bch2_alloc_pack(trans, *u); - a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); - if (IS_ERR(a)) - return PTR_ERR(a); - - bch2_alloc_pack(trans->c, a, *u); - return bch2_trans_update(trans, iter, &a->k, trigger_flags); + return PTR_ERR_OR_ZERO(a) ?: + bch2_trans_update(trans, iter, &a->k, trigger_flags); } static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) @@ -341,7 +333,7 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, #undef x } -int bch2_alloc_read(struct bch_fs *c) +int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only) { struct btree_trans 
trans; struct btree_iter iter; @@ -352,108 +344,43 @@ int bch2_alloc_read(struct bch_fs *c) int ret; bch2_trans_init(&trans, c, 0, 0); - down_read(&c->gc_lock); for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { - if (!bkey_is_alloc(k.k)) - continue; - ca = bch_dev_bkey_exists(c, k.k->p.inode); - g = bucket(ca, k.k->p.offset); + g = __bucket(ca, k.k->p.offset, gc); u = bch2_alloc_unpack(k); - *bucket_gen(ca, k.k->p.offset) = u.gen; + if (!gc) + *bucket_gen(ca, k.k->p.offset) = u.gen; + g->_mark.gen = u.gen; - g->_mark.data_type = u.data_type; - g->_mark.dirty_sectors = u.dirty_sectors; - g->_mark.cached_sectors = u.cached_sectors; - g->_mark.stripe = u.stripe != 0; - g->stripe = u.stripe; - g->stripe_redundancy = u.stripe_redundancy; g->io_time[READ] = u.read_time; g->io_time[WRITE] = u.write_time; - g->oldest_gen = u.oldest_gen; + g->oldest_gen = !gc ? u.oldest_gen : u.gen; g->gen_valid = 1; - } - bch2_trans_iter_exit(&trans, &iter); - up_read(&c->gc_lock); - bch2_trans_exit(&trans); - - if (ret) { - bch_err(c, "error reading alloc info: %i", ret); - return ret; - } - - return 0; -} - -static int bch2_alloc_write_key(struct btree_trans *trans, - struct btree_iter *iter, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c k; - struct bkey_alloc_unpacked old_u, new_u; - int ret; -retry: - bch2_trans_begin(trans); - - ret = bch2_btree_key_cache_flush(trans, - BTREE_ID_alloc, iter->pos); - if (ret) - goto err; - - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - goto err; - - old_u = bch2_alloc_unpack(k); - new_u = alloc_mem_to_key(c, iter); - - if (!bkey_alloc_unpacked_cmp(old_u, new_u)) - return 0; - - ret = bch2_alloc_write(trans, iter, &new_u, - BTREE_TRIGGER_NORUN) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL|flags); -err: - if (ret == -EINTR) - goto retry; - return ret; -} - -int bch2_alloc_write_all(struct bch_fs *c, unsigned flags) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bch_dev *ca; - unsigned i; - int ret = 0; - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - for_each_member_device(ca, c, i) { - bch2_btree_iter_set_pos(&iter, - POS(ca->dev_idx, ca->mi.first_bucket)); - - while (iter.pos.offset < ca->mi.nbuckets) { - ret = bch2_alloc_write_key(&trans, &iter, flags); - if (ret) { - percpu_ref_put(&ca->ref); - goto err; - } - bch2_btree_iter_advance(&iter); + if (!gc || + (metadata_only && + (u.data_type == BCH_DATA_user || + u.data_type == BCH_DATA_cached || + u.data_type == BCH_DATA_parity))) { + g->_mark.data_type = u.data_type; + g->_mark.dirty_sectors = u.dirty_sectors; + g->_mark.cached_sectors = u.cached_sectors; + g->_mark.stripe = u.stripe != 0; + g->stripe = u.stripe; + g->stripe_redundancy = u.stripe_redundancy; } + } -err: bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + + if (ret) + bch_err(c, "error reading alloc info: %i", ret); + return ret; } diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index 86b64177..98c7866e 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -38,40 +38,23 @@ static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, ; } +struct bkey_alloc_buf { + struct bkey_i k; + struct bch_alloc_v3 v; + +#define x(_name, _bits) + _bits / 8 + u8 _pad[0 + BCH_ALLOC_FIELDS_V2()]; +#undef x +} __attribute__((packed, aligned(8))); + struct 
bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); +struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *, + const struct bkey_alloc_unpacked); int bch2_alloc_write(struct btree_trans *, struct btree_iter *, struct bkey_alloc_unpacked *, unsigned); int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); -static inline struct bkey_alloc_unpacked -alloc_mem_to_key(struct bch_fs *c, struct btree_iter *iter) -{ - struct bch_dev *ca; - struct bucket *g; - struct bkey_alloc_unpacked ret; - - percpu_down_read(&c->mark_lock); - ca = bch_dev_bkey_exists(c, iter->pos.inode); - g = bucket(ca, iter->pos.offset); - ret = (struct bkey_alloc_unpacked) { - .dev = iter->pos.inode, - .bucket = iter->pos.offset, - .gen = g->mark.gen, - .oldest_gen = g->oldest_gen, - .data_type = g->mark.data_type, - .dirty_sectors = g->mark.dirty_sectors, - .cached_sectors = g->mark.cached_sectors, - .read_time = g->io_time[READ], - .write_time = g->io_time[WRITE], - .stripe = g->stripe, - .stripe_redundancy = g->stripe_redundancy, - }; - percpu_up_read(&c->mark_lock); - - return ret; -} - #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c); @@ -101,7 +84,7 @@ static inline bool bkey_is_alloc(const struct bkey *k) k->type == KEY_TYPE_alloc_v3; } -int bch2_alloc_read(struct bch_fs *); +int bch2_alloc_read(struct bch_fs *, bool, bool); static inline void bch2_wake_allocator(struct bch_dev *ca) { @@ -139,7 +122,6 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_stop(struct bch_dev *); int bch2_dev_allocator_start(struct bch_dev *); -int bch2_alloc_write_all(struct bch_fs *, unsigned); void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index a28ddcd5..eec02f8a 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -451,7 +451,8 @@ struct bch_dev { * Or rcu_read_lock(), but only for ptr_stale(): */ struct bucket_array __rcu *buckets[2]; - struct bucket_gens *bucket_gens; + struct bucket_gens __rcu *bucket_gens; + u8 *oldest_gen; unsigned long *buckets_nouse; struct rw_semaphore bucket_lock; @@ -536,7 +537,6 @@ enum { /* misc: */ BCH_FS_NEED_ANOTHER_GC, BCH_FS_DELETED_NODES, - BCH_FS_NEED_ALLOC_WRITE, BCH_FS_REBUILD_REPLICAS, BCH_FS_HOLD_BTREE_WRITES, }; @@ -716,6 +716,7 @@ struct bch_fs { bool btree_trans_barrier_initialized; struct btree_key_cache btree_key_cache; + unsigned btree_key_cache_btrees; struct workqueue_struct *btree_update_wq; struct workqueue_struct *btree_io_complete_wq; @@ -952,6 +953,11 @@ static inline size_t btree_sectors(const struct bch_fs *c) return c->opts.btree_node_size >> 9; } +static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree) +{ + return c->btree_key_cache_btrees & (1U << btree); +} + static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time) { struct timespec64 t; diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 809c9a76..7cab220c 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -9,6 +9,7 @@ #include "alloc_foreground.h" #include "bkey_methods.h" #include "bkey_buf.h" +#include "btree_key_cache.h" #include "btree_locking.h" #include "btree_update_interior.h" #include "btree_io.h" @@ -533,7 +534,6 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { 
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - struct bucket *g2 = PTR_BUCKET(ca, &p.ptr); enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); if (fsck_err_on(!g->gen_valid, c, @@ -544,9 +544,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, p.ptr.gen, (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { if (!p.ptr.cached) { - g2->_mark.gen = g->_mark.gen = p.ptr.gen; - g2->gen_valid = g->gen_valid = true; - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + g->_mark.gen = p.ptr.gen; + g->gen_valid = true; } else { do_update = true; } @@ -560,13 +559,12 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, p.ptr.gen, g->mark.gen, (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { if (!p.ptr.cached) { - g2->_mark.gen = g->_mark.gen = p.ptr.gen; - g2->gen_valid = g->gen_valid = true; - g2->_mark.data_type = 0; - g2->_mark.dirty_sectors = 0; - g2->_mark.cached_sectors = 0; + g->_mark.gen = p.ptr.gen; + g->gen_valid = true; + g->_mark.data_type = 0; + g->_mark.dirty_sectors = 0; + g->_mark.cached_sectors = 0; set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); } else { do_update = true; } @@ -603,8 +601,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, bch2_data_types[data_type], (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { if (data_type == BCH_DATA_btree) { - g2->_mark.data_type = g->_mark.data_type = data_type; - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + g->_mark.data_type = data_type; set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); } else { do_update = true; @@ -1169,13 +1166,14 @@ static int bch2_gc_done(struct bch_fs *c, unsigned i, dev; int ret = 0; + percpu_down_write(&c->mark_lock); + #define copy_field(_f, _msg, ...) \ if (dst->_f != src->_f) { \ if (verify) \ fsck_err(c, _msg ": got %llu, should be %llu" \ , ##__VA_ARGS__, dst->_f, src->_f); \ dst->_f = src->_f; \ - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_stripe_field(_f, _msg, ...) \ if (dst->_f != src->_f) { \ @@ -1185,18 +1183,6 @@ static int bch2_gc_done(struct bch_fs *c, iter.pos, ##__VA_ARGS__, \ dst->_f, src->_f); \ dst->_f = src->_f; \ - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ - } -#define copy_bucket_field(_f) \ - if (dst->b[b]._f != src->b[b]._f) { \ - if (verify) \ - fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ - ": got %u, should be %u", dev, b, \ - dst->b[b].mark.gen, \ - bch2_data_types[dst->b[b].mark.data_type],\ - dst->b[b]._f, src->b[b]._f); \ - dst->b[b]._f = src->b[b]._f; \ - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_dev_field(_f, _msg, ...) 
\ copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) @@ -1207,36 +1193,18 @@ static int bch2_gc_done(struct bch_fs *c, bch2_fs_usage_acc_to_base(c, i); for_each_member_device(ca, c, dev) { - struct bucket_array *dst = __bucket_array(ca, 0); - struct bucket_array *src = __bucket_array(ca, 1); - size_t b; + struct bch_dev_usage *dst = ca->usage_base; + struct bch_dev_usage *src = (void *) + bch2_acc_percpu_u64s((void *) ca->usage_gc, + dev_usage_u64s()); - for (b = 0; b < src->nbuckets; b++) { - copy_bucket_field(_mark.gen); - copy_bucket_field(_mark.data_type); - copy_bucket_field(_mark.stripe); - copy_bucket_field(_mark.dirty_sectors); - copy_bucket_field(_mark.cached_sectors); - copy_bucket_field(stripe_redundancy); - copy_bucket_field(stripe); + copy_dev_field(buckets_ec, "buckets_ec"); + copy_dev_field(buckets_unavailable, "buckets_unavailable"); - dst->b[b].oldest_gen = src->b[b].oldest_gen; - } - - { - struct bch_dev_usage *dst = ca->usage_base; - struct bch_dev_usage *src = (void *) - bch2_acc_percpu_u64s((void *) ca->usage_gc, - dev_usage_u64s()); - - copy_dev_field(buckets_ec, "buckets_ec"); - copy_dev_field(buckets_unavailable, "buckets_unavailable"); - - for (i = 0; i < BCH_DATA_NR; i++) { - copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); - copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); - copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); - } + for (i = 0; i < BCH_DATA_NR; i++) { + copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); + copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); + copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); } }; @@ -1278,7 +1246,6 @@ static int bch2_gc_done(struct bch_fs *c, #undef copy_fs_field #undef copy_dev_field -#undef copy_bucket_field #undef copy_stripe_field #undef copy_field fsck_err: @@ -1286,6 +1253,8 @@ fsck_err: percpu_ref_put(&ca->ref); if (ret) bch_err(c, "%s: ret %i", __func__, ret); + + percpu_up_write(&c->mark_lock); return ret; } @@ -1308,15 +1277,6 @@ static int bch2_gc_start(struct bch_fs *c, BUG_ON(ca->buckets[1]); BUG_ON(ca->usage_gc); - ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + - ca->mi.nbuckets * sizeof(struct bucket), - GFP_KERNEL|__GFP_ZERO); - if (!ca->buckets[1]) { - percpu_ref_put(&ca->ref); - bch_err(c, "error allocating ca->buckets[gc]"); - return -ENOMEM; - } - ca->usage_gc = alloc_percpu(struct bch_dev_usage); if (!ca->usage_gc) { bch_err(c, "error allocating ca->usage_gc"); @@ -1325,33 +1285,151 @@ static int bch2_gc_start(struct bch_fs *c, } } - percpu_down_write(&c->mark_lock); + return 0; +} + +static int bch2_alloc_write_key(struct btree_trans *trans, + struct btree_iter *iter, + bool initial, bool metadata_only) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); + struct bucket *g; + struct bkey_s_c k; + struct bkey_alloc_unpacked old_u, new_u, gc_u; + struct bkey_alloc_buf *a; + int ret; + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + return ret; + + old_u = new_u = bch2_alloc_unpack(k); + + percpu_down_read(&c->mark_lock); + g = gc_bucket(ca, iter->pos.offset); + gc_u = (struct bkey_alloc_unpacked) { + .dev = iter->pos.inode, + .bucket = iter->pos.offset, + .gen = g->mark.gen, + .oldest_gen = g->oldest_gen, + .data_type = g->mark.data_type, + .dirty_sectors = g->mark.dirty_sectors, + .cached_sectors = g->mark.cached_sectors, + .read_time = g->io_time[READ], + .write_time = g->io_time[WRITE], + .stripe = g->stripe, + 
.stripe_redundancy = g->stripe_redundancy, + }; + percpu_up_read(&c->mark_lock); + + if (metadata_only && + gc_u.data_type != BCH_DATA_sb && + gc_u.data_type != BCH_DATA_journal && + gc_u.data_type != BCH_DATA_btree) + return 0; + + if (!bkey_alloc_unpacked_cmp(old_u, gc_u) || + gen_after(old_u.gen, gc_u.gen)) + return 0; + +#define copy_bucket_field(_f) \ + if (fsck_err_on(new_u._f != gc_u._f, c, \ + "bucket %llu:%llu gen %u data type %s has wrong " #_f \ + ": got %u, should be %u", \ + iter->pos.inode, iter->pos.offset, \ + new_u.gen, \ + bch2_data_types[new_u.data_type], \ + new_u._f, gc_u._f)) \ + new_u._f = gc_u._f; \ + + copy_bucket_field(gen); + copy_bucket_field(data_type); + copy_bucket_field(stripe); + copy_bucket_field(dirty_sectors); + copy_bucket_field(cached_sectors); + copy_bucket_field(stripe_redundancy); + copy_bucket_field(stripe); +#undef copy_bucket_field + + new_u.oldest_gen = gc_u.oldest_gen; + + if (!bkey_alloc_unpacked_cmp(old_u, new_u)) + return 0; + + a = bch2_alloc_pack(trans, new_u); + if (IS_ERR(a)) + return PTR_ERR(a); + + ret = initial + ? bch2_journal_key_insert(c, BTREE_ID_alloc, 0, &a->k) + : bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_NORUN); +fsck_err: + return ret; +} + +static int bch2_gc_alloc_done(struct bch_fs *c, bool initial, bool metadata_only) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_dev *ca; + unsigned i; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); for_each_member_device(ca, c, i) { - struct bucket_array *dst = __bucket_array(ca, 1); - struct bucket_array *src = __bucket_array(ca, 0); - size_t b; + for_each_btree_key(&trans, iter, BTREE_ID_alloc, + POS(ca->dev_idx, ca->mi.first_bucket), + BTREE_ITER_SLOTS| + BTREE_ITER_PREFETCH, k, ret) { + if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) + break; - dst->first_bucket = src->first_bucket; - dst->nbuckets = src->nbuckets; - - for (b = 0; b < src->nbuckets; b++) { - struct bucket *d = &dst->b[b]; - struct bucket *s = &src->b[b]; - - d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; - d->gen_valid = s->gen_valid; - - if (metadata_only && - (s->mark.data_type == BCH_DATA_user || - s->mark.data_type == BCH_DATA_cached)) - d->_mark = s->mark; + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW, + bch2_alloc_write_key(&trans, &iter, + initial, metadata_only)); + if (ret) + break; } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) { + bch_err(c, "error writing alloc info: %i", ret); + percpu_ref_put(&ca->ref); + break; + } + } + + bch2_trans_exit(&trans); + return ret; +} + +static int bch2_gc_alloc_start(struct bch_fs *c, bool initial, bool metadata_only) +{ + struct bch_dev *ca; + unsigned i; + + for_each_member_device(ca, c, i) { + struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket), + GFP_KERNEL|__GFP_ZERO); + if (!buckets) { + percpu_ref_put(&ca->ref); + percpu_up_write(&c->mark_lock); + bch_err(c, "error allocating ca->buckets[gc]"); + return -ENOMEM; + } + + buckets->first_bucket = ca->mi.first_bucket; + buckets->nbuckets = ca->mi.nbuckets; + rcu_assign_pointer(ca->buckets[1], buckets); }; - percpu_up_write(&c->mark_lock); - - return 0; + return bch2_alloc_read(c, true, metadata_only); } static void bch2_gc_alloc_reset(struct bch_fs *c, bool initial, bool metadata_only) @@ -1423,10 +1501,18 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, bkey_reassemble(new, k); - if (!r->refcount) + if (!r->refcount) { new->k.type = 
KEY_TYPE_deleted; - else + /* + * XXX ugly: bch2_journal_key_insert() queues up + * the key for the journal replay code, which + * doesn't run the extent overwrite pass + */ + if (initial) + new->k.size = 0; + } else { *bkey_refcount(new) = cpu_to_le64(r->refcount); + } ret = initial ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, new) @@ -1598,6 +1684,7 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) !bch2_btree_interior_updates_nr_pending(c)); ret = bch2_gc_start(c, metadata_only) ?: + bch2_gc_alloc_start(c, initial, metadata_only) ?: bch2_gc_reflink_start(c, initial, metadata_only); if (ret) goto out; @@ -1665,16 +1752,15 @@ out: if (!ret) { bch2_journal_block(&c->journal); - percpu_down_write(&c->mark_lock); - ret = bch2_gc_reflink_done(c, initial, metadata_only) ?: - bch2_gc_stripes_done(c, initial, metadata_only) ?: + ret = bch2_gc_stripes_done(c, initial, metadata_only) ?: + bch2_gc_reflink_done(c, initial, metadata_only) ?: + bch2_gc_alloc_done(c, initial, metadata_only) ?: bch2_gc_done(c, initial, metadata_only); bch2_journal_unblock(&c->journal); - } else { - percpu_down_write(&c->mark_lock); } + percpu_down_write(&c->mark_lock); /* Indicates that gc is no longer in progress: */ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); @@ -1709,9 +1795,8 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) percpu_down_read(&c->mark_lock); bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr); - if (gen_after(g->mark.gen, ptr->gen) > 16) { + if (ptr_stale(ca, ptr) > 16) { percpu_up_read(&c->mark_lock); return true; } @@ -1719,10 +1804,10 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr); + u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; - if (gen_after(g->gc_gen, ptr->gen)) - g->gc_gen = ptr->gen; + if (gen_after(*gen, ptr->gen)) + *gen = ptr->gen; } percpu_up_read(&c->mark_lock); @@ -1733,23 +1818,22 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree * node pointers currently never have cached pointers that can become stale: */ -static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) +static int bch2_gc_btree_gens(struct btree_trans *trans, enum btree_id btree_id) { - struct btree_trans trans; + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; struct bkey_buf sk; int ret = 0, commit_err = 0; bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN, + bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN, BTREE_ITER_PREFETCH| BTREE_ITER_NOT_EXTENTS| BTREE_ITER_ALL_SNAPSHOTS); - while ((bch2_trans_begin(&trans), + while ((bch2_trans_begin(trans), k = bch2_btree_iter_peek(&iter)).k) { ret = bkey_err(k); @@ -1765,10 +1849,10 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) bch2_extent_normalize(c, bkey_i_to_s(sk.k)); commit_err = - bch2_trans_update(&trans, &iter, sk.k, 0) ?: - bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOWAIT| - BTREE_INSERT_NOFAIL); + bch2_trans_update(trans, &iter, sk.k, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOWAIT| + BTREE_INSERT_NOFAIL); if (commit_err == -EINTR) { commit_err = 0; continue; @@ -1777,20 +1861,42 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum 
btree_id btree_id) bch2_btree_iter_advance(&iter); } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); - bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); return ret; } +static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter) +{ + struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode); + struct bkey_s_c k; + struct bkey_alloc_unpacked u; + int ret; + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + return ret; + + u = bch2_alloc_unpack(k); + + if (u.oldest_gen == ca->oldest_gen[iter->pos.offset]) + return 0; + + u.oldest_gen = ca->oldest_gen[iter->pos.offset]; + + return bch2_alloc_write(trans, iter, &u, BTREE_TRIGGER_NORUN); +} + int bch2_gc_gens(struct bch_fs *c) { + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; struct bch_dev *ca; - struct bucket_array *buckets; - struct bucket *g; - u64 start_time = local_clock(); + u64 b, start_time = local_clock(); unsigned i; int ret; @@ -1800,21 +1906,32 @@ int bch2_gc_gens(struct bch_fs *c) * lock at the start of going RO, thus the gc thread may get stuck: */ down_read(&c->gc_lock); + bch2_trans_init(&trans, c, 0, 0); for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); + struct bucket_gens *gens; - for_each_bucket(g, buckets) - g->gc_gen = g->mark.gen; - up_read(&ca->bucket_lock); + BUG_ON(ca->oldest_gen); + + ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL); + if (!ca->oldest_gen) { + percpu_ref_put(&ca->ref); + ret = -ENOMEM; + goto err; + } + + gens = bucket_gens(ca); + + for (b = gens->first_bucket; + b < gens->nbuckets; b++) + ca->oldest_gen[b] = gens->b[b]; } for (i = 0; i < BTREE_ID_NR; i++) if ((1 << i) & BTREE_ID_HAS_PTRS) { c->gc_gens_btree = i; c->gc_gens_pos = POS_MIN; - ret = bch2_gc_btree_gens(c, i); + ret = bch2_gc_btree_gens(&trans, i); if (ret) { bch_err(c, "error recalculating oldest_gen: %i", ret); goto err; @@ -1822,12 +1939,28 @@ int bch2_gc_gens(struct bch_fs *c) } for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); + for_each_btree_key(&trans, iter, BTREE_ID_alloc, + POS(ca->dev_idx, ca->mi.first_bucket), + BTREE_ITER_SLOTS| + BTREE_ITER_PREFETCH, k, ret) { + if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) + break; - for_each_bucket(g, buckets) - g->oldest_gen = g->gc_gen; - up_read(&ca->bucket_lock); + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + bch2_alloc_write_oldest_gen(&trans, &iter)); + if (ret) { + bch_err(c, "error writing oldest_gen: %i", ret); + break; + } + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) { + percpu_ref_put(&ca->ref); + break; + } } c->gc_gens_btree = 0; @@ -1837,6 +1970,12 @@ int bch2_gc_gens(struct bch_fs *c) bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); err: + for_each_member_device(ca, c, i) { + kvfree(ca->oldest_gen); + ca->oldest_gen = NULL; + } + + bch2_trans_exit(&trans); up_read(&c->gc_lock); return ret; } diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index efe9b8cb..8505ad5c 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -1717,8 +1717,8 @@ bch2_btree_path_make_mut(struct btree_trans *trans, return path; } -static struct btree_path * __must_check -btree_path_set_pos(struct btree_trans *trans, +struct btree_path * __must_check +bch2_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos new_pos, bool intent, 
unsigned long ip) { @@ -1932,7 +1932,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, path_pos->btree_id == btree_id && path_pos->level == level) { __btree_path_get(path_pos, intent); - path = btree_path_set_pos(trans, path_pos, pos, intent, ip); + path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); } else { path = btree_path_alloc(trans, path_pos); path_pos = NULL; @@ -1983,13 +1983,13 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct struct bkey_s_c k; - BUG_ON(path->uptodate != BTREE_ITER_UPTODATE); - if (!path->cached) { struct btree_path_level *l = path_l(path); - struct bkey_packed *_k = - bch2_btree_node_iter_peek_all(&l->iter, l->b); + struct bkey_packed *_k; + EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); + + _k = bch2_btree_node_iter_peek_all(&l->iter, l->b); k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null; EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0); @@ -1999,12 +1999,15 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct } else { struct bkey_cached *ck = (void *) path->l[0].b; - EBUG_ON(path->btree_id != ck->key.btree_id || - bkey_cmp(path->pos, ck->key.pos)); + EBUG_ON(ck && + (path->btree_id != ck->key.btree_id || + bkey_cmp(path->pos, ck->key.pos))); - /* BTREE_ITER_CACHED_NOFILL? */ - if (unlikely(!ck->valid)) - goto hole; + /* BTREE_ITER_CACHED_NOFILL|BTREE_ITER_CACHED_NOCREATE? */ + if (unlikely(!ck || !ck->valid)) + return bkey_s_c_null; + + EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); k = bkey_i_to_s_c(ck->k); } @@ -2029,7 +2032,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter) { int ret; - iter->path = btree_path_set_pos(iter->trans, iter->path, + iter->path = bch2_btree_path_set_pos(iter->trans, iter->path, btree_iter_search_key(iter), iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); @@ -2066,7 +2069,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) bkey_init(&iter->k); iter->k.p = iter->pos = b->key.k.p; - iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p, + iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); iter->path->should_be_locked = true; @@ -2128,7 +2131,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) * the next child node */ path = iter->path = - btree_path_set_pos(trans, path, bpos_successor(iter->pos), + bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos), iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); @@ -2151,7 +2154,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) bkey_init(&iter->k); iter->k.p = iter->pos = b->key.k.p; - iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p, + iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); iter->path->should_be_locked = true; @@ -2247,18 +2250,52 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, return k; } +/* + * Checks btree key cache for key at iter->pos and returns it if present, or + * bkey_s_c_null: + */ +static noinline +struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) +{ + struct btree_trans *trans = iter->trans; + struct bch_fs *c = trans->c; + struct bkey u; + int ret; + + if (!bch2_btree_key_cache_find(c, iter->btree_id, pos)) + return bkey_s_c_null; + + if (!iter->key_cache_path) + iter->key_cache_path = bch2_path_get(trans, 
iter->btree_id, pos, + iter->flags & BTREE_ITER_INTENT, 0, + iter->flags|BTREE_ITER_CACHED, + _THIS_IP_); + + iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED); + if (unlikely(ret)) + return bkey_s_c_err(ret); + + iter->key_cache_path->should_be_locked = true; + + return bch2_btree_path_peek_slot(iter->key_cache_path, &u); +} + static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) { struct btree_trans *trans = iter->trans; struct bkey_i *next_update; - struct bkey_s_c k; + struct bkey_s_c k, k2; int ret; EBUG_ON(iter->path->cached || iter->path->level); bch2_btree_iter_verify(iter); while (1) { - iter->path = btree_path_set_pos(trans, iter->path, search_key, + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); @@ -2270,8 +2307,23 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp goto out; } + iter->path->should_be_locked = true; + k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); + if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && + k.k && + (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { + ret = bkey_err(k2); + if (ret) { + k = k2; + goto out; + } + + k = k2; + iter->k = *k.k; + } + if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) k = btree_trans_peek_journal(trans, iter, k); @@ -2368,7 +2420,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT); iter->update_path = iter->path; - iter->update_path = btree_path_set_pos(trans, + iter->update_path = bch2_btree_path_set_pos(trans, iter->update_path, pos, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); @@ -2407,7 +2459,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) iter->pos = bkey_start_pos(k.k); - iter->path = btree_path_set_pos(trans, iter->path, k.k->p, + iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); BUG_ON(!iter->path->nodes_locked); @@ -2471,7 +2523,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) search_key.snapshot = U32_MAX; while (1) { - iter->path = btree_path_set_pos(trans, iter->path, search_key, + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); @@ -2602,7 +2654,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } search_key = btree_iter_search_key(iter); - iter->path = btree_path_set_pos(trans, iter->path, search_key, + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); @@ -2631,6 +2683,13 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) goto out; } + if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && + (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { + if (!bkey_err(k)) + iter->k = *k.k; + goto out; + } + k = bch2_btree_path_peek_slot(iter->path, &iter->k); } else { struct bpos next; @@ -2820,8 +2879,12 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) if (iter->update_path) bch2_path_put(trans, iter->update_path, iter->flags & 
BTREE_ITER_INTENT); + if (iter->key_cache_path) + bch2_path_put(trans, iter->key_cache_path, + iter->flags & BTREE_ITER_INTENT); iter->path = NULL; iter->update_path = NULL; + iter->key_cache_path = NULL; } static void __bch2_trans_iter_init(struct btree_trans *trans, @@ -2849,9 +2912,16 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags)) flags |= BTREE_ITER_WITH_JOURNAL; + if (!btree_id_cached(trans->c, btree_id)) { + flags &= ~BTREE_ITER_CACHED; + flags &= ~BTREE_ITER_WITH_KEY_CACHE; + } else if (!(flags & BTREE_ITER_CACHED)) + flags |= BTREE_ITER_WITH_KEY_CACHE; + iter->trans = trans; iter->path = NULL; iter->update_path = NULL; + iter->key_cache_path = NULL; iter->btree_id = btree_id; iter->min_depth = depth; iter->flags = flags; @@ -2902,6 +2972,7 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT); if (src->update_path) __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT); + dst->key_cache_path = NULL; } void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 5205d53c..759c7b52 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -50,11 +50,6 @@ static inline struct btree *btree_node_parent(struct btree_path *path, return btree_path_node(path, b->c.level + 1); } -static inline int btree_iter_err(const struct btree_iter *iter) -{ - return iter->flags & BTREE_ITER_ERROR ? -EIO : 0; -} - /* Iterate over paths within a transaction: */ static inline struct btree_path * @@ -132,6 +127,9 @@ __trans_next_path_with_node(struct btree_trans *trans, struct btree *b, struct btree_path * __must_check bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, bool, unsigned long); +struct btree_path * __must_check +bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *, + struct bpos, bool, unsigned long); int __must_check bch2_btree_path_traverse(struct btree_trans *, struct btree_path *, unsigned); struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index faed51e7..df016c98 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -208,19 +208,21 @@ static int btree_key_cache_fill(struct btree_trans *trans, struct btree_path *ck_path, struct bkey_cached *ck) { - struct btree_iter iter; + struct btree_path *path; struct bkey_s_c k; unsigned new_u64s = 0; struct bkey_i *new_k = NULL; + struct bkey u; int ret; - bch2_trans_iter_init(trans, &iter, ck->key.btree_id, - ck->key.pos, BTREE_ITER_SLOTS); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); + path = bch2_path_get(trans, ck->key.btree_id, + ck->key.pos, 0, 0, 0, _THIS_IP_); + ret = bch2_btree_path_traverse(trans, path, 0); if (ret) goto err; + k = bch2_btree_path_peek_slot(path, &u); + if (!bch2_btree_node_relock(trans, ck_path, 0)) { trace_trans_restart_relock_key_cache_fill(trans->fn, _THIS_IP_, ck_path->btree_id, &ck_path->pos); @@ -261,9 +263,9 @@ static int btree_key_cache_fill(struct btree_trans *trans, bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); /* We're not likely to need this iterator again: */ - set_btree_iter_dontneed(&iter); + path->preserve = false; err: - bch2_trans_iter_exit(trans, &iter); + bch2_path_put(trans, path, 0); return ret; } @@ -384,21 +386,27 @@ static int 
btree_key_cache_flush_pos(struct btree_trans *trans, BTREE_ITER_CACHED_NOFILL| BTREE_ITER_CACHED_NOCREATE| BTREE_ITER_INTENT); + b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE; + ret = bch2_btree_iter_traverse(&c_iter); if (ret) goto out; ck = (void *) c_iter.path->l[0].b; - if (!ck || - (journal_seq && ck->journal.seq != journal_seq)) + if (!ck) goto out; if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - if (!evict) - goto out; - goto evict; + if (evict) + goto evict; + goto out; } + BUG_ON(!ck->valid); + + if (journal_seq && ck->journal.seq != journal_seq) + goto out; + /* * Since journal reclaim depends on us making progress here, and the * allocator/copygc depend on journal reclaim making progress, we need @@ -406,6 +414,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, * */ ret = bch2_btree_iter_traverse(&b_iter) ?: bch2_trans_update(trans, &b_iter, ck->k, + BTREE_UPDATE_KEY_CACHE_RECLAIM| BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| BTREE_TRIGGER_NORUN) ?: bch2_trans_commit(trans, NULL, NULL, diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 65f460e3..989129f9 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -202,10 +202,10 @@ struct btree_node_iter { */ #define BTREE_ITER_IS_EXTENTS (1 << 4) #define BTREE_ITER_NOT_EXTENTS (1 << 5) -#define BTREE_ITER_ERROR (1 << 6) -#define BTREE_ITER_CACHED (1 << 7) -#define BTREE_ITER_CACHED_NOFILL (1 << 8) -#define BTREE_ITER_CACHED_NOCREATE (1 << 9) +#define BTREE_ITER_CACHED (1 << 6) +#define BTREE_ITER_CACHED_NOFILL (1 << 7) +#define BTREE_ITER_CACHED_NOCREATE (1 << 8) +#define BTREE_ITER_WITH_KEY_CACHE (1 << 9) #define BTREE_ITER_WITH_UPDATES (1 << 10) #define BTREE_ITER_WITH_JOURNAL (1 << 11) #define __BTREE_ITER_ALL_SNAPSHOTS (1 << 12) @@ -277,6 +277,7 @@ struct btree_iter { struct btree_trans *trans; struct btree_path *path; struct btree_path *update_path; + struct btree_path *key_cache_path; enum btree_id btree_id:4; unsigned min_depth:4; @@ -636,6 +637,7 @@ static inline bool btree_type_has_snapshots(enum btree_id id) enum btree_update_flags { __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, + __BTREE_UPDATE_KEY_CACHE_RECLAIM, __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ @@ -648,6 +650,7 @@ enum btree_update_flags { }; #define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) +#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) #define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index 5e5a1b5e..d9a406a2 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -76,8 +76,6 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *, int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *, struct bkey_i *, enum btree_update_flags); -int __must_check bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, - struct bkey_i *, enum btree_update_flags); int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, enum btree_update_flags); diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 7b8ca115..a0f7a9f0 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -243,6 +243,8 @@ retry: bch2_alloc_sectors_done(c, wp); mem_alloc: b = bch2_btree_node_mem_alloc(c); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); /* we hold cannibalize_lock: */ BUG_ON(IS_ERR(b)); @@ -265,6 +267,9 @@ 
static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev b = as->prealloc_nodes[--as->nr_prealloc_nodes]; + six_lock_intent(&b->c.lock, NULL, NULL); + six_lock_write(&b->c.lock, NULL, NULL); + set_btree_node_accessed(b); set_btree_node_dirty(c, b); set_btree_node_need_write(b); @@ -378,7 +383,8 @@ static void bch2_btree_reserve_put(struct btree_update *as) while (as->nr_prealloc_nodes) { struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes]; - six_unlock_write(&b->c.lock); + six_lock_intent(&b->c.lock, NULL, NULL); + six_lock_write(&b->c.lock, NULL, NULL); if (c->btree_reserve_cache_nr < ARRAY_SIZE(c->btree_reserve_cache)) { @@ -392,10 +398,8 @@ static void bch2_btree_reserve_put(struct btree_update *as) bch2_open_buckets_put(c, &b->ob); } - btree_node_lock_type(c, b, SIX_LOCK_write); __btree_node_free(c, b); six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); } @@ -403,39 +407,52 @@ static void bch2_btree_reserve_put(struct btree_update *as) } static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes, - unsigned flags, struct closure *cl) + unsigned flags) { struct bch_fs *c = as->c; + struct closure cl; struct btree *b; int ret; + closure_init_stack(&cl); +retry: + BUG_ON(nr_nodes > BTREE_RESERVE_MAX); /* * Protects reaping from the btree node cache and using the btree node * open bucket reserve: + * + * BTREE_INSERT_NOWAIT only applies to btree node allocation, not + * blocking on this lock: */ - ret = bch2_btree_cache_cannibalize_lock(c, cl); + ret = bch2_btree_cache_cannibalize_lock(c, &cl); if (ret) - return ret; + goto err; while (as->nr_prealloc_nodes < nr_nodes) { b = __bch2_btree_node_alloc(c, &as->disk_res, flags & BTREE_INSERT_NOWAIT - ? NULL : cl, flags); + ? NULL : &cl, flags); if (IS_ERR(b)) { ret = PTR_ERR(b); - goto err_free; + goto err; } as->prealloc_nodes[as->nr_prealloc_nodes++] = b; } bch2_btree_cache_cannibalize_unlock(c); + closure_sync(&cl); return 0; -err_free: +err: bch2_btree_cache_cannibalize_unlock(c); - trace_btree_reserve_get_fail(c, nr_nodes, cl); + closure_sync(&cl); + + if (ret == -EAGAIN) + goto retry; + + trace_btree_reserve_get_fail(c, nr_nodes, &cl); return ret; } @@ -935,7 +952,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, { struct bch_fs *c = trans->c; struct btree_update *as; - struct closure cl; u64 start_time = local_clock(); int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) ? 
BCH_DISK_RESERVATION_NOFAIL : 0; @@ -946,9 +962,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (flags & BTREE_INSERT_JOURNAL_RESERVED) journal_flags |= JOURNAL_RES_GET_RESERVED; - - closure_init_stack(&cl); -retry: + if (flags & BTREE_INSERT_JOURNAL_RECLAIM) + journal_flags |= JOURNAL_RES_GET_NONBLOCK; /* * XXX: figure out how far we might need to split, @@ -1003,30 +1018,16 @@ retry: if (ret) goto err; + bch2_trans_unlock(trans); + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, BTREE_UPDATE_JOURNAL_RES, - journal_flags|JOURNAL_RES_GET_NONBLOCK); - if (ret == -EAGAIN) { - bch2_trans_unlock(trans); - - if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { - bch2_btree_update_free(as); - btree_trans_restart(trans); - return ERR_PTR(ret); - } - - ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, - BTREE_UPDATE_JOURNAL_RES, - journal_flags); - if (ret) { - trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_); - goto err; - } - - if (!bch2_trans_relock(trans)) { - ret = -EINTR; - goto err; - } + journal_flags); + if (ret) { + bch2_btree_update_free(as); + trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_); + btree_trans_restart(trans); + return ERR_PTR(ret); } ret = bch2_disk_reservation_get(c, &as->disk_res, @@ -1036,10 +1037,15 @@ retry: if (ret) goto err; - ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl); + ret = bch2_btree_reserve_get(as, nr_nodes, flags); if (ret) goto err; + if (!bch2_trans_relock(trans)) { + ret = -EINTR; + goto err; + } + bch2_journal_pin_add(&c->journal, atomic64_read(&c->journal.seq), &as->journal, NULL); @@ -1047,16 +1053,6 @@ retry: return as; err: bch2_btree_update_free(as); - - if (ret == -EAGAIN) { - bch2_trans_unlock(trans); - closure_sync(&cl); - ret = -EINTR; - } - - if (ret == -EINTR && bch2_trans_relock(trans)) - goto retry; - return ERR_PTR(ret); } diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 7186457d..9d954537 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -23,6 +23,10 @@ #include #include +static int __must_check +bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, + struct bkey_i *, enum btree_update_flags); + static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, const struct btree_insert_entry *r) { @@ -650,9 +654,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, char buf[200]; bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); - bch_err(c, "invalid bkey %s on insert from %s -> %ps: %s\n", - buf, trans->fn, (void *) i->ip_allocated, invalid); - bch2_fatal_error(c); + bch2_fs_fatal_error(c, "invalid bkey %s on insert from %s -> %ps: %s\n", + buf, trans->fn, (void *) i->ip_allocated, invalid); return -EINVAL; } btree_insert_entry_checks(trans, i); @@ -1358,8 +1361,9 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans, return ret; } -int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, - struct bkey_i *k, enum btree_update_flags flags) +static int __must_check +bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *k, enum btree_update_flags flags) { struct btree_insert_entry *i, n; @@ -1397,17 +1401,6 @@ int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btr !btree_insert_entry_cmp(&n, i)) { BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); - /* - * This is a hack to ensure that inode creates update 
the btree, - * not the key cache, which helps with cache coherency issues in - * other areas: - */ - if (n.cached && !i->cached) { - i->k = n.k; - i->flags = n.flags; - return 0; - } - bch2_path_put(trans, i->path, true); *i = n; } else @@ -1421,12 +1414,17 @@ int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btr int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, enum btree_update_flags flags) { + struct btree_path *path = iter->update_path ?: iter->path; + struct bkey_cached *ck; + int ret; + if (iter->flags & BTREE_ITER_IS_EXTENTS) return bch2_trans_update_extent(trans, iter, k, flags); if (bkey_deleted(&k->k) && + !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { - int ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); + ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); if (unlikely(ret < 0)) return ret; @@ -1434,8 +1432,45 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter k->k.type = KEY_TYPE_whiteout; } - return bch2_trans_update_by_path(trans, iter->update_path ?: iter->path, - k, flags); + if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && + !path->cached && + !path->level && + btree_id_cached(trans->c, path->btree_id)) { + if (!iter->key_cache_path || + !iter->key_cache_path->should_be_locked || + bpos_cmp(iter->key_cache_path->pos, k->k.p)) { + if (!iter->key_cache_path) + iter->key_cache_path = + bch2_path_get(trans, path->btree_id, path->pos, 1, 0, + BTREE_ITER_INTENT| + BTREE_ITER_CACHED, _THIS_IP_); + + iter->key_cache_path = + bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, + iter->flags & BTREE_ITER_INTENT, + _THIS_IP_); + + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL); + if (unlikely(ret)) + return ret; + + ck = (void *) iter->key_cache_path->l[0].b; + + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + trace_trans_restart_key_cache_raced(trans->fn, _RET_IP_); + btree_trans_restart(trans); + return -EINTR; + } + + iter->key_cache_path->should_be_locked = true; + } + + path = iter->key_cache_path; + } + + return bch2_trans_update_by_path(trans, path, k, flags); } void bch2_trans_commit_hook(struct btree_trans *trans, diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index bf5ad436..b9f09b82 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -520,6 +520,7 @@ static int bch2_mark_alloc(struct btree_trans *trans, !old_u.data_type != !new_u.data_type && new.k->type == KEY_TYPE_alloc_v3) { struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v; + u64 old_journal_seq = le64_to_cpu(v->journal_seq); BUG_ON(!journal_seq); @@ -529,7 +530,8 @@ static int bch2_mark_alloc(struct btree_trans *trans, * to wait on a journal flush before we can reuse the bucket: */ new_u.journal_seq = !new_u.data_type && - bch2_journal_noflush_seq(&c->journal, journal_seq) + (journal_seq == old_journal_seq || + bch2_journal_noflush_seq(&c->journal, old_journal_seq)) ? 
0 : journal_seq; v->journal_seq = cpu_to_le64(new_u.journal_seq); } @@ -2094,7 +2096,7 @@ static void buckets_free_rcu(struct rcu_head *rcu) container_of(rcu, struct bucket_array, rcu); kvpfree(buckets, - sizeof(struct bucket_array) + + sizeof(*buckets) + buckets->nbuckets * sizeof(struct bucket)); } @@ -2103,7 +2105,7 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu) struct bucket_gens *buckets = container_of(rcu, struct bucket_gens, rcu); - kvpfree(buckets, sizeof(struct bucket_array) + buckets->nbuckets); + kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets); } int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) @@ -2213,9 +2215,9 @@ err: kvpfree(buckets_nouse, BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); if (bucket_gens) - call_rcu(&old_buckets->rcu, bucket_gens_free_rcu); + call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); if (buckets) - call_rcu(&old_buckets->rcu, buckets_free_rcu); + call_rcu(&buckets->rcu, buckets_free_rcu); return ret; } @@ -2230,6 +2232,8 @@ void bch2_dev_buckets_free(struct bch_dev *ca) free_fifo(&ca->free[i]); kvpfree(ca->buckets_nouse, BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); + kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), + sizeof(struct bucket_gens) + ca->mi.nbuckets); kvpfree(rcu_dereference_protected(ca->buckets[0], 1), sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket)); diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index d35c96bc..7c6c59c7 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -97,12 +97,6 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, return sector_to_bucket(ca, ptr->offset); } -static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, - const struct bch_extent_ptr *ptr) -{ - return bucket(ca, PTR_BUCKET_NR(ca, ptr)); -} - static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 24139831..2c73dc60 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -30,7 +30,6 @@ struct bucket { u64 io_time[2]; u8 oldest_gen; - u8 gc_gen; unsigned gen_valid:1; u8 stripe_redundancy; u32 stripe; diff --git a/libbcachefs/buckets_waiting_for_journal.c b/libbcachefs/buckets_waiting_for_journal.c index 33ae6370..56b37b24 100644 --- a/libbcachefs/buckets_waiting_for_journal.c +++ b/libbcachefs/buckets_waiting_for_journal.c @@ -107,6 +107,10 @@ retry: victim = old; } + /* hashed to same slot 3 times: */ + if (!victim) + break; + /* Failed to find an empty slot: */ swap(new, *victim); last_evicted = victim; diff --git a/libbcachefs/error.c b/libbcachefs/error.c index 2cea6945..8279a9ba 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -15,7 +15,7 @@ bool bch2_inconsistent_error(struct bch_fs *c) return false; case BCH_ON_ERROR_ro: if (bch2_fs_emergency_read_only(c)) - bch_err(c, "emergency read only"); + bch_err(c, "inconsistency detected - emergency read only"); return true; case BCH_ON_ERROR_panic: panic(bch2_fmt(c, "panic after error")); @@ -35,7 +35,7 @@ void bch2_topology_error(struct bch_fs *c) void bch2_fatal_error(struct bch_fs *c) { if (bch2_fs_emergency_read_only(c)) - bch_err(c, "emergency read only"); + bch_err(c, "fatal error - emergency read only"); } void bch2_io_error_work(struct work_struct *work) diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 472c03d2..91fa1897 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -104,7 +104,7 @@ void 
bch2_inode_update_after_write(struct btree_trans *trans, bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum), - 0 && c->opts.inodes_use_key_cache); + c->opts.inodes_use_key_cache); set_nlink(&inode->v, bch2_inode_nlink_get(bi)); i_uid_write(&inode->v, bi->bi_uid); @@ -1471,7 +1471,7 @@ static void bch2_evict_inode(struct inode *vinode) KEY_TYPE_QUOTA_WARN); bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, KEY_TYPE_QUOTA_WARN); - bch2_inode_rm(c, inode_inum(inode), true); + bch2_inode_rm(c, inode_inum(inode)); } } diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 3a7c1468..78e2db6c 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -252,15 +252,13 @@ int bch2_inode_peek(struct btree_trans *trans, u32 snapshot; int ret; - if (0 && trans->c->opts.inodes_use_key_cache) - flags |= BTREE_ITER_CACHED; - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) return ret; bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, - SPOS(0, inum.inum, snapshot), flags); + SPOS(0, inum.inum, snapshot), + flags|BTREE_ITER_CACHED); k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) @@ -631,20 +629,16 @@ err: return ret; } -int bch2_inode_rm(struct bch_fs *c, subvol_inum inum, bool cached) +int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) { struct btree_trans trans; struct btree_iter iter = { NULL }; struct bkey_i_inode_generation delete; struct bch_inode_unpacked inode_u; struct bkey_s_c k; - unsigned iter_flags = BTREE_ITER_INTENT; u32 snapshot; int ret; - if (0 && cached && c->opts.inodes_use_key_cache) - iter_flags |= BTREE_ITER_CACHED; - bch2_trans_init(&trans, c, 0, 1024); /* @@ -668,7 +662,8 @@ retry: goto err; bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, - SPOS(0, inum.inum, snapshot), iter_flags); + SPOS(0, inum.inum, snapshot), + BTREE_ITER_INTENT|BTREE_ITER_CACHED); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index 723186d8..77957cc7 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -87,7 +87,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, int bch2_inode_create(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *, u32, u64); -int bch2_inode_rm(struct bch_fs *, subvol_inum, bool); +int bch2_inode_rm(struct bch_fs *, subvol_inum); int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index e566f851..651828b8 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -1677,6 +1677,6 @@ no_io: continue_at(cl, journal_write_done, c->io_complete_wq); return; err: - bch2_inconsistent_error(c); + bch2_fatal_error(c); continue_at(cl, journal_write_done, c->io_complete_wq); } diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 92f78907..c82ecff3 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -6,6 +6,7 @@ */ #include "bcachefs.h" +#include "alloc_background.h" #include "alloc_foreground.h" #include "btree_iter.h" #include "btree_update.h" @@ -137,18 +138,106 @@ static inline int fragmentation_cmp(copygc_heap *heap, return cmp_int(l.fragmentation, r.fragmentation); } +static int walk_buckets_to_copygc(struct bch_fs *c) +{ + copygc_heap *h = &c->copygc_heap; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_alloc_unpacked u; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, 
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 92f78907..c82ecff3 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -6,6 +6,7 @@
  */

 #include "bcachefs.h"
+#include "alloc_background.h"
 #include "alloc_foreground.h"
 #include "btree_iter.h"
 #include "btree_update.h"
@@ -137,18 +138,106 @@ static inline int fragmentation_cmp(copygc_heap *heap,
 	return cmp_int(l.fragmentation, r.fragmentation);
 }

+static int walk_buckets_to_copygc(struct bch_fs *c)
+{
+	copygc_heap *h = &c->copygc_heap;
+	struct btree_trans trans;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bkey_alloc_unpacked u;
+	int ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+			   BTREE_ITER_PREFETCH, k, ret) {
+		struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode);
+		struct copygc_heap_entry e;
+
+		u = bch2_alloc_unpack(k);
+
+		if (u.data_type != BCH_DATA_user ||
+		    u.dirty_sectors >= ca->mi.bucket_size ||
+		    bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset))
+			continue;
+
+		e = (struct copygc_heap_entry) {
+			.dev		= iter.pos.inode,
+			.gen		= u.gen,
+			.replicas	= 1 + u.stripe_redundancy,
+			.fragmentation	= u.dirty_sectors * (1U << 15)
+				/ ca->mi.bucket_size,
+			.sectors	= u.dirty_sectors,
+			.offset		= bucket_to_sector(ca, iter.pos.offset),
+		};
+		heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
+
+	}
+	bch2_trans_iter_exit(&trans, &iter);
+
+	bch2_trans_exit(&trans);
+	return ret;
+}
+
+static int bucket_inorder_cmp(const void *_l, const void *_r)
+{
+	const struct copygc_heap_entry *l = _l;
+	const struct copygc_heap_entry *r = _r;
+
+	return cmp_int(l->dev, r->dev) ?: cmp_int(l->offset, r->offset);
+}
+
+static int check_copygc_was_done(struct bch_fs *c,
+				 u64 *sectors_not_moved,
+				 u64 *buckets_not_moved)
+{
+	copygc_heap *h = &c->copygc_heap;
+	struct btree_trans trans;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bkey_alloc_unpacked u;
+	struct copygc_heap_entry *i;
+	int ret = 0;
+
+	sort(h->data, h->used, sizeof(h->data[0]), bucket_inorder_cmp, NULL);
+
+	bch2_trans_init(&trans, c, 0, 0);
+	bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, 0);
+
+	for (i = h->data; i < h->data + h->used; i++) {
+		struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev);
+
+		bch2_btree_iter_set_pos(&iter, POS(i->dev, sector_to_bucket(ca, i->offset)));
+
+		ret = lockrestart_do(&trans,
+				bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+		if (ret)
+			break;
+
+		u = bch2_alloc_unpack(k);
+
+		if (u.gen == i->gen && u.dirty_sectors) {
+			*sectors_not_moved += u.dirty_sectors;
+			*buckets_not_moved += 1;
+		}
+	}
+	bch2_trans_iter_exit(&trans, &iter);
+
+	bch2_trans_exit(&trans);
+	return ret;
+}
+
 static int bch2_copygc(struct bch_fs *c)
 {
 	copygc_heap *h = &c->copygc_heap;
 	struct copygc_heap_entry e, *i;
-	struct bucket_array *buckets;
 	struct bch_move_stats move_stats;
 	u64 sectors_to_move = 0, sectors_to_write = 0, sectors_not_moved = 0;
 	u64 sectors_reserved = 0;
 	u64 buckets_to_move, buckets_not_moved = 0;
 	struct bch_dev *ca;
 	unsigned dev_idx;
-	size_t b, heap_size = 0;
+	size_t heap_size = 0;
 	int ret;

 	bch_move_stats_init(&move_stats, "copygc");
@@ -178,34 +267,12 @@ static int bch2_copygc(struct bch_fs *c)
 		spin_lock(&ca->fs->freelist_lock);
 		sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size;
 		spin_unlock(&ca->fs->freelist_lock);
+	}

-		down_read(&ca->bucket_lock);
-		buckets = bucket_array(ca);
-
-		for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
-			struct bucket *g = buckets->b + b;
-			struct bucket_mark m = READ_ONCE(g->mark);
-			struct copygc_heap_entry e;
-
-			if (m.owned_by_allocator ||
-			    m.data_type != BCH_DATA_user ||
-			    m.dirty_sectors >= ca->mi.bucket_size)
-				continue;
-
-			WARN_ON(m.stripe && !g->stripe_redundancy);
-
-			e = (struct copygc_heap_entry) {
-				.dev		= dev_idx,
-				.gen		= m.gen,
-				.replicas	= 1 + g->stripe_redundancy,
-				.fragmentation	= m.dirty_sectors * (1U << 15)
-					/ ca->mi.bucket_size,
-				.sectors	= m.dirty_sectors,
-				.offset		= bucket_to_sector(ca, b),
-			};
-			heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
-		}
-		up_read(&ca->bucket_lock);
+	ret = walk_buckets_to_copygc(c);
+	if (ret) {
+		bch2_fs_fatal_error(c, "error walking buckets to copygc!");
+		return ret;
 	}

 	if (!h->used) {
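walk_buckets_to_copygc() above scores each candidate bucket with a fixed-point fill fraction, dirty_sectors * 2^15 / bucket_size: an empty bucket scores 0, and a completely full one would score 32768 (such buckets are skipped by the continue above). A small worked example of that scaling (illustrative helper, not part of the patch):

/* Sketch: same scaling as the .fragmentation field above. */
static unsigned example_fragmentation(unsigned dirty_sectors, unsigned bucket_size)
{
	/* e.g. 256 dirty sectors in a 512-sector bucket: 256 * 32768 / 512 = 16384 */
	return dirty_sectors * (1U << 15) / bucket_size;
}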
@@ -251,30 +318,18 @@ static int bch2_copygc(struct bch_fs *c)
 			writepoint_ptr(&c->copygc_write_point),
 			copygc_pred, NULL,
 			&move_stats);
-
-	for_each_rw_member(ca, c, dev_idx) {
-		down_read(&ca->bucket_lock);
-		buckets = bucket_array(ca);
-		for (i = h->data; i < h->data + h->used; i++) {
-			struct bucket_mark m;
-			size_t b;
-
-			if (i->dev != dev_idx)
-				continue;
-
-			b = sector_to_bucket(ca, i->offset);
-			m = READ_ONCE(buckets->b[b].mark);
-
-			if (i->gen == m.gen &&
-			    m.dirty_sectors) {
-				sectors_not_moved += m.dirty_sectors;
-				buckets_not_moved++;
-			}
-		}
-		up_read(&ca->bucket_lock);
+	if (ret) {
+		bch_err(c, "error %i from bch2_move_data() in copygc", ret);
+		return ret;
 	}

-	if (sectors_not_moved && !ret)
+	ret = check_copygc_was_done(c, &sectors_not_moved, &buckets_not_moved);
+	if (ret) {
+		bch_err(c, "error %i from check_copygc_was_done()", ret);
+		return ret;
+	}
+
+	if (sectors_not_moved)
 		bch_warn_ratelimited(c,
 			"copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)",
 			 sectors_not_moved, sectors_to_move,
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index b818093e..7e4400cc 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -1095,7 +1095,11 @@ use_clean:

 	bch_verbose(c, "starting alloc read");
 	err = "error reading allocation information";
-	ret = bch2_alloc_read(c);
+
+	down_read(&c->gc_lock);
+	ret = bch2_alloc_read(c, false, false);
+	up_read(&c->gc_lock);
+
 	if (ret)
 		goto err;
 	bch_verbose(c, "alloc read done");
@@ -1153,23 +1157,6 @@ use_clean:

 	if (c->opts.verbose || !c->sb.clean)
 		bch_info(c, "journal replay done");

-	if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) &&
-	    !c->opts.nochanges) {
-		/*
-		 * note that even when filesystem was clean there might be work
-		 * to do here, if we ran gc (because of fsck) which recalculated
-		 * oldest_gen:
-		 */
-		bch_verbose(c, "writing allocation info");
-		err = "error writing out alloc info";
-		ret = bch2_alloc_write_all(c, BTREE_INSERT_LAZY_RW);
-		if (ret) {
-			bch_err(c, "error writing alloc info");
-			goto err;
-		}
-		bch_verbose(c, "alloc write done");
-	}
-
 	if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
 		bch2_fs_lazy_rw(c);
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
index a08f1e08..96994b7a 100644
--- a/libbcachefs/replicas.c
+++ b/libbcachefs/replicas.c
@@ -414,18 +414,10 @@ err:
 	goto out;
 }

-static int __bch2_mark_replicas(struct bch_fs *c,
-				struct bch_replicas_entry *r,
-				bool check)
-{
-	return likely(bch2_replicas_marked(c, r)) ? 0
-		: check ? -1
-		: bch2_mark_replicas_slowpath(c, r);
-}
-
 int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
 {
-	return __bch2_mark_replicas(c, r, false);
+	return likely(bch2_replicas_marked(c, r))
+		? 0 : bch2_mark_replicas_slowpath(c, r);
 }

 /* replicas delta list: */
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 586ba60d..d8b72d8d 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -762,6 +762,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)

 	bch2_opts_apply(&c->opts, opts);

+	/* key cache currently disabled for inodes, because of snapshots: */
+	c->opts.inodes_use_key_cache = 0;
+
+	c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
+	if (c->opts.inodes_use_key_cache)
+		c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
+
 	c->block_bits		= ilog2(block_sectors(c));
 	c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);