From 3765483ff0cf9abd0243fcafe11aebd0f9beb03d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Mar 2022 19:21:13 -0400 Subject: [PATCH] Update bcachefs sources to f05b3c1af9 bcachefs: Improve bucket_alloc_fail tracepoint --- .bcachefs_revision | 2 +- include/trace/events/bcachefs.h | 27 +- libbcachefs/alloc_background.c | 1266 +++++++++++++++++-------------- libbcachefs/alloc_background.h | 90 ++- libbcachefs/alloc_foreground.c | 360 +++++++-- libbcachefs/alloc_foreground.h | 14 + libbcachefs/alloc_types.h | 22 - libbcachefs/bcachefs.h | 29 +- libbcachefs/bcachefs_format.h | 73 +- libbcachefs/bkey_methods.c | 28 + libbcachefs/btree_gc.c | 152 ++-- libbcachefs/btree_io.c | 2 +- libbcachefs/btree_types.h | 27 +- libbcachefs/buckets.c | 313 ++++---- libbcachefs/buckets.h | 125 ++- libbcachefs/buckets_types.h | 35 +- libbcachefs/extent_update.c | 13 +- libbcachefs/journal.c | 173 +++-- libbcachefs/journal_io.c | 4 + libbcachefs/journal_sb.c | 222 ++++++ libbcachefs/journal_sb.h | 24 + libbcachefs/lru.c | 203 +++++ libbcachefs/lru.h | 17 + libbcachefs/movinggc.c | 25 +- libbcachefs/opts.h | 2 +- libbcachefs/recovery.c | 35 +- libbcachefs/super-io.c | 87 +-- libbcachefs/super-io.h | 10 +- libbcachefs/super.c | 118 +-- libbcachefs/super_types.h | 1 + libbcachefs/sysfs.c | 47 +- 31 files changed, 2119 insertions(+), 1427 deletions(-) create mode 100644 libbcachefs/journal_sb.c create mode 100644 libbcachefs/journal_sb.h create mode 100644 libbcachefs/lru.c create mode 100644 libbcachefs/lru.h diff --git a/.bcachefs_revision b/.bcachefs_revision index 74f5970f..be0ed057 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -e48731a188639563444d475622782b7963df4b47 +f05b3c1af906802e46f9caca13fb6260d8293fdf diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index 05968879..832e9f19 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -491,9 +491,30 @@ DEFINE_EVENT(bucket_alloc, bucket_alloc, TP_ARGS(ca, reserve) ); -DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, - TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), - TP_ARGS(ca, reserve) +TRACE_EVENT(bucket_alloc_fail, + TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve, + u64 avail, u64 need_journal_commit), + TP_ARGS(ca, reserve, avail, need_journal_commit), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(enum alloc_reserve, reserve ) + __field(u64, avail ) + __field(u64, need_journal_commit ) + ), + + TP_fast_assign( + __entry->dev = ca->dev; + __entry->reserve = reserve; + __entry->avail = avail; + __entry->need_journal_commit = need_journal_commit; + ), + + TP_printk("%d,%d reserve %d avail %llu need_journal_commit %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->reserve, + __entry->avail, + __entry->need_journal_commit) ); DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail, diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 4afb2d45..0c334243 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -14,6 +14,7 @@ #include "debug.h" #include "ec.h" #include "error.h" +#include "lru.h" #include "recovery.h" #include "varint.h" @@ -26,19 +27,21 @@ #include #include -const char * const bch2_allocator_states[] = { -#define x(n) #n, - ALLOC_THREAD_STATES() -#undef x - NULL -}; - static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { #define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, BCH_ALLOC_FIELDS_V1() #undef x }; +const char * const bch2_bucket_states[] = { + 
"free", + "need gc gens", + "need discard", + "cached", + "dirty", + NULL +}; + /* Persistent alloc info: */ static inline u64 alloc_field_v1_get(const struct bch_alloc *a, @@ -161,6 +164,8 @@ static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, out->gen = a.v->gen; out->oldest_gen = a.v->oldest_gen; out->data_type = a.v->data_type; + out->need_discard = BCH_ALLOC_NEED_DISCARD(a.v); + out->need_inc_gen = BCH_ALLOC_NEED_INC_GEN(a.v); out->journal_seq = le64_to_cpu(a.v->journal_seq); #define x(_name, _bits) \ @@ -197,6 +202,8 @@ static void bch2_alloc_pack_v3(struct bkey_alloc_buf *dst, a->v.oldest_gen = src.oldest_gen; a->v.data_type = src.data_type; a->v.journal_seq = cpu_to_le64(src.journal_seq); + SET_BCH_ALLOC_NEED_DISCARD(&a->v, src.need_discard); + SET_BCH_ALLOC_NEED_INC_GEN(&a->v, src.need_inc_gen); #define x(_name, _bits) \ nr_fields++; \ @@ -325,22 +332,20 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); - pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu", + pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu need_discard %u", u.gen, u.oldest_gen, bch2_data_types[u.data_type], - u.journal_seq); + u.journal_seq, u.need_discard); #define x(_name, ...) pr_buf(out, " " #_name " %llu", (u64) u._name); BCH_ALLOC_FIELDS_V2() #undef x } -int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only) +int bch2_alloc_read(struct bch_fs *c) { struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; struct bch_dev *ca; - struct bucket *g; - struct bkey_alloc_unpacked u; int ret; bch2_trans_init(&trans, c, 0, 0); @@ -348,31 +353,8 @@ int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only) for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { ca = bch_dev_bkey_exists(c, k.k->p.inode); - g = __bucket(ca, k.k->p.offset, gc); - u = bch2_alloc_unpack(k); - - if (!gc) - *bucket_gen(ca, k.k->p.offset) = u.gen; - - g->_mark.gen = u.gen; - g->io_time[READ] = u.read_time; - g->io_time[WRITE] = u.write_time; - g->oldest_gen = !gc ? u.oldest_gen : u.gen; - g->gen_valid = 1; - - if (!gc || - (metadata_only && - (u.data_type == BCH_DATA_user || - u.data_type == BCH_DATA_cached || - u.data_type == BCH_DATA_parity))) { - g->_mark.data_type = u.data_type; - g->_mark.dirty_sectors = u.dirty_sectors; - g->_mark.cached_sectors = u.cached_sectors; - g->_mark.stripe = u.stripe != 0; - g->stripe = u.stripe; - g->stripe_redundancy = u.stripe_redundancy; - } + *bucket_gen(ca, k.k->p.offset) = bch2_alloc_unpack(k).gen; } bch2_trans_iter_exit(&trans, &iter); @@ -384,6 +366,677 @@ int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only) return ret; } +/* Free space/discard btree: */ + +static int bch2_bucket_do_index(struct btree_trans *trans, + struct bkey_s_c alloc_k, + struct bkey_alloc_unpacked a, + bool set) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, a.dev); + struct btree_iter iter; + struct bkey_s_c old; + struct bkey_i *k; + enum bucket_state state = bucket_state(a); + enum btree_id btree; + enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; + enum bch_bkey_type new_type = set ? 
KEY_TYPE_set : KEY_TYPE_deleted; + struct printbuf buf = PRINTBUF; + int ret; + + if (state != BUCKET_free && + state != BUCKET_need_discard) + return 0; + + k = bch2_trans_kmalloc(trans, sizeof(*k)); + if (IS_ERR(k)) + return PTR_ERR(k); + + bkey_init(&k->k); + k->k.type = new_type; + + switch (state) { + case BUCKET_free: + btree = BTREE_ID_freespace; + k->k.p = alloc_freespace_pos(a); + bch2_key_resize(&k->k, 1); + break; + case BUCKET_need_discard: + btree = BTREE_ID_need_discard; + k->k.p = POS(a.dev, a.bucket); + break; + default: + return 0; + } + + bch2_trans_iter_init(trans, &iter, btree, + bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + old = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(old); + if (ret) + goto err; + + if (ca->mi.freespace_initialized && + bch2_fs_inconsistent_on(old.k->type != old_type, c, + "incorrect key when %s %s btree (got %s should be %s)\n" + " for %s", + set ? "setting" : "clearing", + bch2_btree_ids[btree], + bch2_bkey_types[old.k->type], + bch2_bkey_types[old_type], + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + ret = -EIO; + goto err; + } + + ret = bch2_trans_update(trans, &iter, k, 0); +err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +} + +int bch2_trans_mark_alloc(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_alloc_unpacked old_u = bch2_alloc_unpack(old); + struct bkey_alloc_unpacked new_u = bch2_alloc_unpack(bkey_i_to_s_c(new)); + u64 old_lru, new_lru; + bool need_repack = false; + int ret = 0; + + if (new_u.dirty_sectors > old_u.dirty_sectors || + new_u.cached_sectors > old_u.cached_sectors) { + new_u.read_time = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + new_u.write_time = max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); + new_u.need_inc_gen = true; + new_u.need_discard = true; + need_repack = true; + } + + if (old_u.data_type && !new_u.data_type && + old_u.gen == new_u.gen && + !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) { + new_u.gen++; + new_u.need_inc_gen = false; + need_repack = true; + } + + if (bucket_state(old_u) != bucket_state(new_u) || + (bucket_state(new_u) == BUCKET_free && + alloc_freespace_genbits(old_u) != alloc_freespace_genbits(new_u))) { + ret = bch2_bucket_do_index(trans, old, old_u, false) ?: + bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_u, true); + if (ret) + return ret; + } + + old_lru = alloc_lru_idx(old_u); + new_lru = alloc_lru_idx(new_u); + + if (old_lru != new_lru) { + ret = bch2_lru_change(trans, new->k.p.inode, new->k.p.offset, + old_lru, &new_lru); + if (ret) + return ret; + + if (new_lru && new_u.read_time != new_lru) { + new_u.read_time = new_lru; + need_repack = true; + } + } + + if (need_repack && !bkey_deleted(&new->k)) + bch2_alloc_pack_v3((void *) new, new_u); + + return 0; +} + +static int bch2_check_alloc_key(struct btree_trans *trans, + struct btree_iter *alloc_iter) +{ + struct bch_fs *c = trans->c; + struct btree_iter discard_iter, freespace_iter, lru_iter; + struct bkey_alloc_unpacked a; + unsigned discard_key_type, freespace_key_type; + struct bkey_s_c alloc_k, k; + struct printbuf buf = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + int ret; + + alloc_k = bch2_btree_iter_peek(alloc_iter); + if (!alloc_k.k) + return 0; + + ret = bkey_err(alloc_k); + if (ret) + return ret; + + a = bch2_alloc_unpack(alloc_k); + discard_key_type = bucket_state(a) == BUCKET_need_discard + ? 
KEY_TYPE_set : 0;
+	freespace_key_type = bucket_state(a) == BUCKET_free
+		? KEY_TYPE_set : 0;
+
+	bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard,
+			     alloc_k.k->p, 0);
+	bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace,
+			     alloc_freespace_pos(a), 0);
+	bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru,
+			     POS(a.dev, a.read_time), 0);
+
+	k = bch2_btree_iter_peek_slot(&discard_iter);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	if (fsck_err_on(k.k->type != discard_key_type, c,
+			"incorrect key in need_discard btree (got %s should be %s)\n"
+			" %s",
+			bch2_bkey_types[k.k->type],
+			bch2_bkey_types[discard_key_type],
+			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+		struct bkey_i *update =
+			bch2_trans_kmalloc(trans, sizeof(*update));
+
+		ret = PTR_ERR_OR_ZERO(update);
+		if (ret)
+			goto err;
+
+		bkey_init(&update->k);
+		update->k.type = discard_key_type;
+		update->k.p = discard_iter.pos;
+
+		ret = bch2_trans_update(trans, &discard_iter, update, 0) ?:
+			bch2_trans_commit(trans, NULL, NULL, 0);
+		if (ret)
+			goto err;
+	}
+
+	k = bch2_btree_iter_peek_slot(&freespace_iter);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	if (fsck_err_on(k.k->type != freespace_key_type, c,
+			"incorrect key in freespace btree (got %s should be %s)\n"
+			" %s",
+			bch2_bkey_types[k.k->type],
+			bch2_bkey_types[freespace_key_type],
+			(printbuf_reset(&buf),
+			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+		struct bkey_i *update =
+			bch2_trans_kmalloc(trans, sizeof(*update));
+
+		ret = PTR_ERR_OR_ZERO(update);
+		if (ret)
+			goto err;
+
+		bkey_init(&update->k);
+		update->k.type = freespace_key_type;
+		update->k.p = freespace_iter.pos;
+		bch2_key_resize(&update->k, 1);
+
+		ret = bch2_trans_update(trans, &freespace_iter, update, 0) ?:
+			bch2_trans_commit(trans, NULL, NULL, 0);
+		if (ret)
+			goto err;
+	}
+
+	if (bucket_state(a) == BUCKET_cached) {
+		if (fsck_err_on(!a.read_time, c,
+				"cached bucket with read_time 0\n"
+				" %s",
+				(printbuf_reset(&buf),
+				 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+
+			a.read_time = atomic64_read(&c->io_clock[READ].now);
+
+			ret = bch2_lru_change(trans, a.dev, a.bucket,
+					      0, &a.read_time) ?:
+				bch2_alloc_write(trans, alloc_iter, &a, BTREE_TRIGGER_NORUN) ?:
+				bch2_trans_commit(trans, NULL, NULL, 0);
+			if (ret)
+				goto err;
+		}
+
+		k = bch2_btree_iter_peek_slot(&lru_iter);
+		ret = bkey_err(k);
+		if (ret)
+			goto err;
+
+		if (fsck_err_on(k.k->type != KEY_TYPE_lru ||
+				le64_to_cpu(bkey_s_c_to_lru(k).v->idx) != a.bucket, c,
+				"incorrect/missing lru entry\n"
+				" %s\n"
+				" %s",
+				(printbuf_reset(&buf),
+				 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
+				(bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) {
+			u64 read_time = a.read_time;
+
+			ret = bch2_lru_change(trans, a.dev, a.bucket,
+					      0, &a.read_time) ?:
+				(a.read_time != read_time
+				 ? 
bch2_alloc_write(trans, alloc_iter, &a, BTREE_TRIGGER_NORUN) + : 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + if (ret) + goto err; + } + } +err: +fsck_err: + bch2_trans_iter_exit(trans, &lru_iter); + bch2_trans_iter_exit(trans, &freespace_iter); + bch2_trans_iter_exit(trans, &discard_iter); + printbuf_exit(&buf2); + printbuf_exit(&buf); + return ret; +} + +static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) +{ + struct bch_dev *ca; + + if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode]) + return false; + + ca = bch_dev_bkey_exists(c, pos.inode); + return pos.offset >= ca->mi.first_bucket && + pos.offset < ca->mi.nbuckets; +} + +static int bch2_check_freespace_key(struct btree_trans *trans, + struct btree_iter *freespace_iter, + bool initial) +{ + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter; + struct bkey_s_c k, freespace_k; + struct bkey_alloc_unpacked a; + u64 genbits; + struct bpos pos; + struct bkey_i *update; + struct printbuf buf = PRINTBUF; + int ret; + + freespace_k = bch2_btree_iter_peek(freespace_iter); + if (!freespace_k.k) + return 1; + + ret = bkey_err(freespace_k); + if (ret) + return ret; + + pos = freespace_iter->pos; + pos.offset &= ~(~0ULL << 56); + genbits = freespace_iter->pos.offset & (~0ULL << 56); + + bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); + + if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, + "%llu:%llu set in freespace btree but device or bucket does not exist", + pos.inode, pos.offset)) + goto delete; + + k = bch2_btree_iter_peek_slot(&alloc_iter); + ret = bkey_err(k); + if (ret) + goto err; + + a = bch2_alloc_unpack(k); + + if (fsck_err_on(bucket_state(a) != BUCKET_free || + genbits != alloc_freespace_genbits(a), c, + "%s\n incorrectly set in freespace index (free %u, genbits %llu should be %llu)", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf), + bucket_state(a) == BUCKET_free, + genbits >> 56, alloc_freespace_genbits(a) >> 56)) + goto delete; +out: +err: +fsck_err: + bch2_trans_iter_exit(trans, &alloc_iter); + printbuf_exit(&buf); + return ret; +delete: + update = bch2_trans_kmalloc(trans, sizeof(*update)); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + bkey_init(&update->k); + update->k.p = freespace_iter->pos; + bch2_key_resize(&update->k, 1); + + ret = bch2_trans_update(trans, freespace_iter, update, 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + goto out; +} + +int bch2_check_alloc_info(struct bch_fs *c, bool initial) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_check_alloc_key(&trans, &iter)); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_freespace, POS_MIN, + BTREE_ITER_PREFETCH); + while (1) { + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_check_freespace_key(&trans, &iter, initial)); + if (ret) + break; + + bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos)); + } + bch2_trans_iter_exit(&trans, &iter); +err: + bch2_trans_exit(&trans); + return ret < 0 ? 
ret : 0; +} + +static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, + struct bch_dev *ca, bool *discard_done) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_alloc_unpacked a; + struct printbuf buf = PRINTBUF; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, pos, + BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto out; + + a = bch2_alloc_unpack(k); + + if (a.need_inc_gen) { + a.gen++; + a.need_inc_gen = false; + goto write; + } + + BUG_ON(a.journal_seq > c->journal.flushed_seq_ondisk); + + if (bch2_fs_inconsistent_on(!a.need_discard, c, + "%s\n incorrectly set in need_discard btree", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = -EIO; + goto out; + } + + if (!*discard_done && ca->mi.discard && !c->opts.nochanges) { + /* + * This works without any other locks because this is the only + * thread that removes items from the need_discard tree + */ + bch2_trans_unlock(trans); + blkdev_issue_discard(ca->disk_sb.bdev, + k.k->p.offset * ca->mi.bucket_size, + ca->mi.bucket_size, + GFP_KERNEL, 0); + *discard_done = true; + + ret = bch2_trans_relock(trans); + if (ret) + goto out; + } + + a.need_discard = false; +write: + ret = bch2_alloc_write(trans, &iter, &a, 0); +out: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +} + +static void bch2_do_discards_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, discard_work); + struct bch_dev *ca = NULL; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_need_discard, + POS_MIN, 0, k, ret) { + bool discard_done = false; + + if (ca && k.k->p.inode != ca->dev_idx) { + percpu_ref_put(&ca->io_ref); + ca = NULL; + } + + if (!ca) { + ca = bch_dev_bkey_exists(c, k.k->p.inode); + if (!percpu_ref_tryget(&ca->io_ref)) { + ca = NULL; + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + continue; + } + } + + if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, + k.k->p.inode, k.k->p.offset) || + bch2_bucket_is_open_safe(c, k.k->p.inode, k.k->p.offset)) + continue; + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_clear_need_discard(&trans, k.k->p, ca, &discard_done)); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); + + if (ca) + percpu_ref_put(&ca->io_ref); + + bch2_trans_exit(&trans); + percpu_ref_put(&c->writes); +} + +void bch2_do_discards(struct bch_fs *c) +{ + if (percpu_ref_tryget(&c->writes) && + !queue_work(system_long_wq, &c->discard_work)) + percpu_ref_put(&c->writes); +} + +static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca) +{ + struct bch_fs *c = trans->c; + struct btree_iter lru_iter, alloc_iter = { NULL }; + struct bkey_s_c k; + struct bkey_alloc_unpacked a; + u64 bucket, idx; + int ret; + + bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, + POS(ca->dev_idx, 0), 0); + k = bch2_btree_iter_peek(&lru_iter); + ret = bkey_err(k); + if (ret) + goto out; + + if (!k.k || k.k->p.inode != ca->dev_idx) + goto out; + + if (bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_lru, c, + "non lru key in lru btree")) + goto out; + + idx = k.k->p.offset; + bucket = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); + + bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, + POS(ca->dev_idx, bucket), + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + k = 
bch2_btree_iter_peek_slot(&alloc_iter);
+	ret = bkey_err(k);
+	if (ret)
+		goto out;
+
+	a = bch2_alloc_unpack(k);
+
+	if (bch2_fs_inconsistent_on(idx != alloc_lru_idx(a), c,
+			"invalidating bucket with wrong lru idx (got %llu should be %llu)",
+			idx, alloc_lru_idx(a)))
+		goto out;
+
+	a.gen++;
+	a.need_inc_gen = false;
+	a.data_type = 0;
+	a.dirty_sectors = 0;
+	a.cached_sectors = 0;
+	a.read_time = atomic64_read(&c->io_clock[READ].now);
+	a.write_time = atomic64_read(&c->io_clock[WRITE].now);
+
+	ret = bch2_alloc_write(trans, &alloc_iter, &a,
+			       BTREE_TRIGGER_BUCKET_INVALIDATE);
+out:
+	bch2_trans_iter_exit(trans, &alloc_iter);
+	bch2_trans_iter_exit(trans, &lru_iter);
+	return ret;
+}
+
+static void bch2_do_invalidates_work(struct work_struct *work)
+{
+	struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
+	struct bch_dev *ca;
+	struct btree_trans trans;
+	unsigned i;
+	int ret = 0;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for_each_member_device(ca, c, i)
+		while (!ret && should_invalidate_buckets(ca))
+			ret = __bch2_trans_do(&trans, NULL, NULL,
+					      BTREE_INSERT_NOFAIL,
+					      invalidate_one_bucket(&trans, ca));
+
+	bch2_trans_exit(&trans);
+	percpu_ref_put(&c->writes);
+}
+
+void bch2_do_invalidates(struct bch_fs *c)
+{
+	if (percpu_ref_tryget(&c->writes))
+		queue_work(system_long_wq, &c->invalidate_work);
+}
+
+static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct btree_trans trans;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bkey_alloc_unpacked a;
+	struct bch_member *m;
+	int ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_alloc,
+			   POS(ca->dev_idx, ca->mi.first_bucket),
+			   BTREE_ITER_SLOTS|
+			   BTREE_ITER_PREFETCH, k, ret) {
+		if (iter.pos.offset >= ca->mi.nbuckets)
+			break;
+
+		a = bch2_alloc_unpack(k);
+		ret = __bch2_trans_do(&trans, NULL, NULL,
+				      BTREE_INSERT_LAZY_RW,
+				      bch2_bucket_do_index(&trans, k, a, true));
+		if (ret)
+			break;
+	}
+	bch2_trans_iter_exit(&trans, &iter);
+
+	bch2_trans_exit(&trans);
+
+	if (ret) {
+		bch_err(ca, "error initializing free space: %i", ret);
+		return ret;
+	}
+
+	mutex_lock(&c->sb_lock);
+	m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx;
+	SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
+	mutex_unlock(&c->sb_lock);
+
+	return ret;
+}
+
+int bch2_fs_freespace_init(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	unsigned i;
+	int ret = 0;
+	bool doing_init = false;
+
+	/*
+	 * We can crash during the device add path, so we need to check this on
+	 * every mount:
+	 */
+
+	for_each_member_device(ca, c, i) {
+		if (ca->mi.freespace_initialized)
+			continue;
+
+		if (!doing_init) {
+			bch_info(c, "initializing freespace");
+			doing_init = true;
+		}
+
+		ret = bch2_dev_freespace_init(c, ca);
+		if (ret) {
+			percpu_ref_put(&ca->ref);
+			return ret;
+		}
+	}
+
+	if (doing_init) {
+		mutex_lock(&c->sb_lock);
+		bch2_write_super(c);
+		mutex_unlock(&c->sb_lock);
+
+		bch_verbose(c, "done initializing freespace");
+	}
+
+	return ret;
+}
+
 /* Bucket IO clocks: */
 
 int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
@@ -420,481 +1073,6 @@ out:
 	return ret;
 }
 
-/* Background allocator thread: */
-
-/*
- * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
- * (marking them as invalidated on disk), then optionally issues discard
- * commands to the newly free buckets, then puts them on the various freelists. 
- */ - -static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, - struct bucket_mark m) -{ - u8 gc_gen; - - if (!is_available_bucket(m)) - return false; - - if (m.owned_by_allocator) - return false; - - if (ca->buckets_nouse && - test_bit(b, ca->buckets_nouse)) - return false; - - if (ca->new_fs_bucket_idx) { - /* - * Device or filesystem is still being initialized, and we - * haven't fully marked superblocks & journal: - */ - if (is_superblock_bucket(ca, b)) - return false; - - if (b < ca->new_fs_bucket_idx) - return false; - } - - gc_gen = bucket_gc_gen(bucket(ca, b)); - - ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2; - ca->inc_gen_really_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX; - - return gc_gen < BUCKET_GC_GEN_MAX; -} - -/* - * Determines what order we're going to reuse buckets, smallest bucket_key() - * first. - */ - -static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m, - u64 now, u64 last_seq_ondisk) -{ - unsigned used = m.cached_sectors; - - if (used) { - /* - * Prefer to keep buckets that have been read more recently, and - * buckets that have more data in them: - */ - u64 last_read = max_t(s64, 0, now - g->io_time[READ]); - u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used)); - - return -last_read_scaled; - } else { - /* - * Prefer to use buckets with smaller gc_gen so that we don't - * have to walk the btree and recalculate oldest_gen - but shift - * off the low bits so that buckets will still have equal sort - * keys when there's only a small difference, so that we can - * keep sequential buckets together: - */ - return bucket_gc_gen(g) >> 4; - } -} - -static inline int bucket_alloc_cmp(alloc_heap *h, - struct alloc_heap_entry l, - struct alloc_heap_entry r) -{ - return cmp_int(l.key, r.key) ?: - cmp_int(r.nr, l.nr) ?: - cmp_int(l.bucket, r.bucket); -} - -static inline int bucket_idx_cmp(const void *_l, const void *_r) -{ - const struct alloc_heap_entry *l = _l, *r = _r; - - return cmp_int(l->bucket, r->bucket); -} - -static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) -{ - struct bucket_array *buckets; - struct alloc_heap_entry e = { 0 }; - u64 now, last_seq_ondisk; - size_t b, i, nr = 0; - - down_read(&ca->bucket_lock); - - buckets = bucket_array(ca); - ca->alloc_heap.used = 0; - now = atomic64_read(&c->io_clock[READ].now); - last_seq_ondisk = c->journal.flushed_seq_ondisk; - - /* - * Find buckets with lowest read priority, by building a maxheap sorted - * by read priority and repeatedly replacing the maximum element until - * all buckets have been visited. 
- */ - for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { - struct bucket *g = &buckets->b[b]; - struct bucket_mark m = READ_ONCE(g->mark); - unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk); - - cond_resched(); - - if (!bch2_can_invalidate_bucket(ca, b, m)) - continue; - - if (!m.data_type && - bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - last_seq_ondisk, - ca->dev_idx, b)) { - ca->buckets_waiting_on_journal++; - continue; - } - - if (e.nr && e.bucket + e.nr == b && e.key == key) { - e.nr++; - } else { - if (e.nr) - heap_add_or_replace(&ca->alloc_heap, e, - -bucket_alloc_cmp, NULL); - - e = (struct alloc_heap_entry) { - .bucket = b, - .nr = 1, - .key = key, - }; - } - } - - if (e.nr) - heap_add_or_replace(&ca->alloc_heap, e, - -bucket_alloc_cmp, NULL); - - for (i = 0; i < ca->alloc_heap.used; i++) - nr += ca->alloc_heap.data[i].nr; - - while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) { - nr -= ca->alloc_heap.data[0].nr; - heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL); - } - - up_read(&ca->bucket_lock); -} - -static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) -{ - size_t i, nr = 0; - - ca->inc_gen_needs_gc = 0; - ca->inc_gen_really_needs_gc = 0; - ca->buckets_waiting_on_journal = 0; - - find_reclaimable_buckets_lru(c, ca); - - heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL); - - for (i = 0; i < ca->alloc_heap.used; i++) - nr += ca->alloc_heap.data[i].nr; - - return nr; -} - -static int bucket_invalidate_btree(struct btree_trans *trans, - struct bch_dev *ca, u64 b, - struct bkey_alloc_unpacked *u) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - POS(ca->dev_idx, b), - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - *u = bch2_alloc_unpack(k); - u->gen++; - u->data_type = 0; - u->dirty_sectors = 0; - u->cached_sectors = 0; - u->read_time = atomic64_read(&c->io_clock[READ].now); - u->write_time = atomic64_read(&c->io_clock[WRITE].now); - - ret = bch2_alloc_write(trans, &iter, u, - BTREE_TRIGGER_BUCKET_INVALIDATE); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, - u64 *journal_seq, unsigned flags) -{ - struct bkey_alloc_unpacked u; - size_t b; - u64 commit_seq = 0; - int ret = 0; - - /* - * If the read-only path is trying to shut down, we can't be generating - * new btree updates: - */ - if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) - return 1; - - BUG_ON(!ca->alloc_heap.used || - !ca->alloc_heap.data[0].nr); - b = ca->alloc_heap.data[0].bucket; - - /* first, put on free_inc and mark as owned by allocator: */ - percpu_down_read(&c->mark_lock); - - bch2_mark_alloc_bucket(c, ca, b, true); - - spin_lock(&c->freelist_lock); - verify_not_on_freelist(c, ca, b); - BUG_ON(!fifo_push(&ca->free_inc, b)); - spin_unlock(&c->freelist_lock); - - percpu_up_read(&c->mark_lock); - - ret = bch2_trans_do(c, NULL, &commit_seq, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_RESERVED| - flags, - bucket_invalidate_btree(&trans, ca, b, &u)); - - if (!ret) { - /* remove from alloc_heap: */ - struct alloc_heap_entry e, *top = ca->alloc_heap.data; - - top->bucket++; - top->nr--; - - if (!top->nr) - heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); - - /* - * If we invalidating cached data then we need to wait on the - * 
journal commit: - */ - if (u.data_type) - *journal_seq = max(*journal_seq, commit_seq); - - /* - * We already waiting on u.alloc_seq when we filtered out - * buckets that need journal commit: - */ - BUG_ON(*journal_seq > u.journal_seq); - } else { - size_t b2; - - /* remove from free_inc: */ - percpu_down_read(&c->mark_lock); - spin_lock(&c->freelist_lock); - - bch2_mark_alloc_bucket(c, ca, b, false); - - BUG_ON(!fifo_pop_back(&ca->free_inc, b2)); - BUG_ON(b != b2); - - spin_unlock(&c->freelist_lock); - percpu_up_read(&c->mark_lock); - } - - return ret < 0 ? ret : 0; -} - -/* - * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc: - */ -static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) -{ - u64 journal_seq = 0; - int ret = 0; - - /* Only use nowait if we've already invalidated at least one bucket: */ - while (!ret && - !fifo_full(&ca->free_inc) && - ca->alloc_heap.used) { - if (kthread_should_stop()) { - ret = 1; - break; - } - - ret = bch2_invalidate_one_bucket(c, ca, &journal_seq, - (!fifo_empty(&ca->free_inc) - ? BTREE_INSERT_NOWAIT : 0)); - /* - * We only want to batch up invalidates when they're going to - * require flushing the journal: - */ - if (!journal_seq) - break; - } - - /* If we used NOWAIT, don't return the error: */ - if (!fifo_empty(&ca->free_inc)) - ret = 0; - if (ret < 0) - bch_err(ca, "error invalidating buckets: %i", ret); - if (ret) - return ret; - - if (journal_seq) - ret = bch2_journal_flush_seq(&c->journal, journal_seq); - if (ret) { - bch_err(ca, "journal error: %i", ret); - return ret; - } - - return 0; -} - -static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state) -{ - if (ca->allocator_state != new_state) { - ca->allocator_state = new_state; - closure_wake_up(&ca->fs->freelist_wait); - } -} - -static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) -{ - unsigned i; - int ret = 0; - - spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) { - /* - * Don't strand buckets on the copygc freelist until - * after recovery is finished: - */ - if (i == RESERVE_MOVINGGC && - !test_bit(BCH_FS_STARTED, &c->flags)) - continue; - - if (fifo_push(&ca->free[i], b)) { - fifo_pop(&ca->free_inc, b); - ret = 1; - break; - } - } - spin_unlock(&c->freelist_lock); - - ca->allocator_state = ret - ? ALLOCATOR_running - : ALLOCATOR_blocked_full; - closure_wake_up(&c->freelist_wait); - return ret; -} - -static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) -{ - if (!c->opts.nochanges && - ca->mi.discard && - blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) - blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b), - ca->mi.bucket_size, GFP_NOFS, 0); -} - -static bool allocator_thread_running(struct bch_dev *ca) -{ - unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw && - test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) - ? ALLOCATOR_running - : ALLOCATOR_stopped; - alloc_thread_set_state(ca, state); - return state == ALLOCATOR_running; -} - -static int buckets_available(struct bch_dev *ca, unsigned long gc_count) -{ - s64 available = dev_buckets_reclaimable(ca) - - (gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0); - bool ret = available > 0; - - alloc_thread_set_state(ca, ret - ? ALLOCATOR_running - : ALLOCATOR_blocked); - return ret; -} - -/** - * bch_allocator_thread - move buckets from free_inc to reserves - * - * The free_inc FIFO is populated by find_reclaimable_buckets(), and - * the reserves are depleted by bucket allocation. 
When we run out - * of free_inc, try to invalidate some buckets and write out - * prios and gens. - */ -static int bch2_allocator_thread(void *arg) -{ - struct bch_dev *ca = arg; - struct bch_fs *c = ca->fs; - unsigned long gc_count = c->gc_count; - size_t nr; - int ret; - - set_freezable(); - - while (1) { - ret = kthread_wait_freezable(allocator_thread_running(ca)); - if (ret) - goto stop; - - while (!ca->alloc_heap.used) { - cond_resched(); - - ret = kthread_wait_freezable(buckets_available(ca, gc_count)); - if (ret) - goto stop; - - gc_count = c->gc_count; - nr = find_reclaimable_buckets(c, ca); - - if (!nr && ca->buckets_waiting_on_journal) { - ret = bch2_journal_flush(&c->journal); - if (ret) - goto stop; - } else if (nr < (ca->mi.nbuckets >> 6) && - ca->buckets_waiting_on_journal >= nr / 2) { - bch2_journal_flush_async(&c->journal, NULL); - } - - if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || - ca->inc_gen_really_needs_gc) && - c->gc_thread) { - atomic_inc(&c->kick_gc); - wake_up_process(c->gc_thread); - } - - trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc, - ca->inc_gen_really_needs_gc); - } - - ret = bch2_invalidate_buckets(c, ca); - if (ret) - goto stop; - - while (!fifo_empty(&ca->free_inc)) { - u64 b = fifo_peek(&ca->free_inc); - - discard_one_bucket(c, ca, b); - - ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b)); - if (ret) - goto stop; - } - } -stop: - alloc_thread_set_state(ca, ALLOCATOR_stopped); - return 0; -} - /* Startup/shutdown (ro/rw): */ void bch2_recalc_capacity(struct bch_fs *c) @@ -903,7 +1081,7 @@ void bch2_recalc_capacity(struct bch_fs *c) u64 capacity = 0, reserved_sectors = 0, gc_reserve; unsigned bucket_size_max = 0; unsigned long ra_pages = 0; - unsigned i, j; + unsigned i; lockdep_assert_held(&c->state_lock); @@ -934,8 +1112,9 @@ void bch2_recalc_capacity(struct bch_fs *c) * allocations for foreground writes must wait - * not -ENOSPC calculations. */ - for (j = 0; j < RESERVE_NONE; j++) - dev_reserve += ca->free[j].size; + + dev_reserve += ca->nr_btree_reserve * 2; + dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */ dev_reserve += 1; /* btree write point */ dev_reserve += 1; /* copygc write point */ @@ -991,8 +1170,6 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) { unsigned i; - BUG_ON(ca->alloc_thread); - /* First, remove device from allocation groups: */ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) @@ -1066,62 +1243,9 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) set_bit(ca->dev_idx, c->rw_devs[i].d); } -void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca) -{ - if (ca->alloc_thread) - closure_wait_event(&c->freelist_wait, - ca->allocator_state != ALLOCATOR_running); -} - -/* stop allocator thread: */ -void bch2_dev_allocator_stop(struct bch_dev *ca) -{ - struct task_struct *p; - - p = rcu_dereference_protected(ca->alloc_thread, 1); - ca->alloc_thread = NULL; - - /* - * We need an rcu barrier between setting ca->alloc_thread = NULL and - * the thread shutting down to avoid bch2_wake_allocator() racing: - * - * XXX: it would be better to have the rcu barrier be asynchronous - * instead of blocking us here - */ - synchronize_rcu(); - - if (p) { - kthread_stop(p); - put_task_struct(p); - } -} - -/* start allocator thread: */ -int bch2_dev_allocator_start(struct bch_dev *ca) -{ - struct task_struct *p; - - /* - * allocator thread already started? 
- */ - if (ca->alloc_thread) - return 0; - - p = kthread_create(bch2_allocator_thread, ca, - "bch-alloc/%s", ca->name); - if (IS_ERR(p)) { - bch_err(ca->fs, "error creating allocator thread: %li", - PTR_ERR(p)); - return PTR_ERR(p); - } - - get_task_struct(p); - rcu_assign_pointer(ca->alloc_thread, p); - wake_up_process(p); - return 0; -} - void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); + INIT_WORK(&c->discard_work, bch2_do_discards_work); + INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); } diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index 3eaa6d20..06539e03 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -8,8 +8,6 @@ #include "debug.h" #include "super.h" -extern const char * const bch2_allocator_states[]; - struct bkey_alloc_unpacked { u64 journal_seq; u64 bucket; @@ -17,6 +15,8 @@ struct bkey_alloc_unpacked { u8 gen; u8 oldest_gen; u8 data_type; + bool need_discard:1; + bool need_inc_gen:1; #define x(_name, _bits) u##_bits _name; BCH_ALLOC_FIELDS_V2() #undef x @@ -25,6 +25,50 @@ struct bkey_alloc_unpacked { /* How out of date a pointer gen is allowed to be: */ #define BUCKET_GC_GEN_MAX 96U +static inline u8 alloc_gc_gen(struct bkey_alloc_unpacked a) +{ + return a.gen - a.oldest_gen; +} + +enum bucket_state { + BUCKET_free, + BUCKET_need_gc_gens, + BUCKET_need_discard, + BUCKET_cached, + BUCKET_dirty, +}; + +extern const char * const bch2_bucket_states[]; + +static inline enum bucket_state bucket_state(struct bkey_alloc_unpacked a) +{ + if (a.dirty_sectors || a.stripe) + return BUCKET_dirty; + if (a.cached_sectors) + return BUCKET_cached; + BUG_ON(a.data_type); + if (a.need_discard) + return BUCKET_need_discard; + if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) + return BUCKET_need_gc_gens; + return BUCKET_free; +} + +static inline u64 alloc_lru_idx(struct bkey_alloc_unpacked a) +{ + return bucket_state(a) == BUCKET_cached ? 
a.read_time : 0; +} + +static inline u64 alloc_freespace_genbits(struct bkey_alloc_unpacked a) +{ + return ((u64) alloc_gc_gen(a) >> 4) << 56; +} + +static inline struct bpos alloc_freespace_pos(struct bkey_alloc_unpacked a) +{ + return POS(a.dev, a.bucket | alloc_freespace_genbits(a)); +} + /* returns true if not equal */ static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, struct bkey_alloc_unpacked r) @@ -65,18 +109,21 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc (struct bkey_ops) { \ .key_invalid = bch2_alloc_v1_invalid, \ .val_to_text = bch2_alloc_to_text, \ + .trans_trigger = bch2_trans_mark_alloc, \ .atomic_trigger = bch2_mark_alloc, \ } #define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \ .key_invalid = bch2_alloc_v2_invalid, \ .val_to_text = bch2_alloc_to_text, \ + .trans_trigger = bch2_trans_mark_alloc, \ .atomic_trigger = bch2_mark_alloc, \ } #define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \ .key_invalid = bch2_alloc_v3_invalid, \ .val_to_text = bch2_alloc_to_text, \ + .trans_trigger = bch2_trans_mark_alloc, \ .atomic_trigger = bch2_mark_alloc, \ } @@ -87,44 +134,31 @@ static inline bool bkey_is_alloc(const struct bkey *k) k->type == KEY_TYPE_alloc_v3; } -int bch2_alloc_read(struct bch_fs *, bool, bool); +int bch2_alloc_read(struct bch_fs *); -static inline void bch2_wake_allocator(struct bch_dev *ca) +int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c, + struct bkey_i *, unsigned); +int bch2_check_alloc_info(struct bch_fs *, bool); +void bch2_do_discards(struct bch_fs *); + +static inline bool should_invalidate_buckets(struct bch_dev *ca) { - struct task_struct *p; + struct bch_dev_usage u = bch2_dev_usage_read(ca); - rcu_read_lock(); - p = rcu_dereference(ca->alloc_thread); - if (p) - wake_up_process(p); - rcu_read_unlock(); + return u.d[BCH_DATA_cached].buckets && + u.buckets_unavailable + u.d[BCH_DATA_cached].buckets < + ca->mi.nbuckets >> 7; } -static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, - size_t bucket) -{ - if (bch2_expensive_debug_checks) { - size_t iter; - long i; - unsigned j; +void bch2_do_invalidates(struct bch_fs *); - for (j = 0; j < RESERVE_NR; j++) - fifo_for_each_entry(i, &ca->free[j], iter) - BUG_ON(i == bucket); - fifo_for_each_entry(i, &ca->free_inc, iter) - BUG_ON(i == bucket); - } -} +int bch2_fs_freespace_init(struct bch_fs *); void bch2_recalc_capacity(struct bch_fs *); void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); -void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); -void bch2_dev_allocator_stop(struct bch_dev *); -int bch2_dev_allocator_start(struct bch_dev *); - void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index 9b81ed26..178d7c05 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -14,13 +14,18 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "btree_iter.h" +#include "btree_update.h" #include "btree_gc.h" #include "buckets.h" +#include "buckets_waiting_for_journal.h" #include "clock.h" #include "debug.h" #include "disk_groups.h" #include "ec.h" +#include "error.h" #include "io.h" +#include "journal.h" #include #include @@ -78,7 +83,6 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) 
percpu_down_read(&c->mark_lock); spin_lock(&ob->lock); - bch2_mark_alloc_bucket(c, ca, ob->bucket, false); ob->valid = false; ob->data_type = 0; @@ -178,39 +182,28 @@ static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) } } -/** - * bch_bucket_alloc - allocate a single bucket from a specific device - * - * Returns index of bucket on success, 0 on failure - * */ -struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, - enum alloc_reserve reserve, - bool may_alloc_partial, - struct closure *cl) +static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + enum alloc_reserve reserve, + struct bkey_alloc_unpacked a, + size_t *need_journal_commit, + struct closure *cl) { struct open_bucket *ob; - long b = 0; + + if (unlikely(ca->buckets_nouse && test_bit(a.bucket, ca->buckets_nouse))) + return NULL; + + if (bch2_bucket_is_open(c, ca->dev_idx, a.bucket)) + return NULL; + + if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, ca->dev_idx, a.bucket)) { + (*need_journal_commit)++; + return NULL; + } spin_lock(&c->freelist_lock); - if (may_alloc_partial) { - int i; - - for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) { - ob = c->open_buckets + ca->open_buckets_partial[i]; - - if (reserve <= ob->alloc_reserve) { - array_remove_item(ca->open_buckets_partial, - ca->open_buckets_partial_nr, - i); - ob->on_partial_list = false; - ob->alloc_reserve = reserve; - spin_unlock(&c->freelist_lock); - return ob; - } - } - } - if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { if (cl) closure_wait(&c->open_buckets_wait, cl); @@ -219,36 +212,17 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, c->blocked_allocate_open_bucket = local_clock(); spin_unlock(&c->freelist_lock); + trace_open_bucket_alloc_fail(ca, reserve); return ERR_PTR(-OPEN_BUCKETS_EMPTY); } - if (likely(fifo_pop(&ca->free[RESERVE_NONE], b))) - goto out; - - switch (reserve) { - case RESERVE_BTREE_MOVINGGC: - case RESERVE_MOVINGGC: - if (fifo_pop(&ca->free[RESERVE_MOVINGGC], b)) - goto out; - break; - default: - break; + /* Recheck under lock: */ + if (bch2_bucket_is_open(c, ca->dev_idx, a.bucket)) { + spin_unlock(&c->freelist_lock); + return NULL; } - if (cl) - closure_wait(&c->freelist_wait, cl); - - if (!c->blocked_allocate) - c->blocked_allocate = local_clock(); - - spin_unlock(&c->freelist_lock); - - trace_bucket_alloc_fail(ca, reserve); - return ERR_PTR(-FREELIST_EMPTY); -out: - verify_not_on_freelist(c, ca, b); - ob = bch2_open_bucket_alloc(c); spin_lock(&ob->lock); @@ -257,8 +231,8 @@ out: ob->sectors_free = ca->mi.bucket_size; ob->alloc_reserve = reserve; ob->dev = ca->dev_idx; - ob->gen = *bucket_gen(ca, b); - ob->bucket = b; + ob->gen = a.gen; + ob->bucket = a.bucket; spin_unlock(&ob->lock); ca->nr_open_buckets++; @@ -280,12 +254,246 @@ out: spin_unlock(&c->freelist_lock); - bch2_wake_allocator(ca); - trace_bucket_alloc(ca, reserve); return ob; } +static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, + enum alloc_reserve reserve, u64 free_entry, + size_t *need_journal_commit, + struct closure *cl) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct open_bucket *ob; + struct bkey_alloc_unpacked a; + u64 b = free_entry & ~(~0ULL << 56); + unsigned genbits = free_entry >> 56; + struct printbuf buf = PRINTBUF; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), 
BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) { + ob = ERR_PTR(ret); + goto err; + } + + a = bch2_alloc_unpack(k); + + if (bch2_fs_inconsistent_on(bucket_state(a) != BUCKET_free, c, + "non free bucket in freespace btree (state %s)\n" + " %s\n" + " at %llu (genbits %u)", + bch2_bucket_states[bucket_state(a)], + (bch2_bkey_val_to_text(&buf, c, k), buf.buf), + free_entry, genbits)) { + ob = ERR_PTR(-EIO); + goto err; + } + + if (bch2_fs_inconsistent_on(genbits != (alloc_freespace_genbits(a) >> 56), c, + "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" + " %s", + genbits, alloc_freespace_genbits(a) >> 56, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ob = ERR_PTR(-EIO); + goto err; + } + + if (bch2_fs_inconsistent_on(b < ca->mi.first_bucket || b >= ca->mi.nbuckets, c, + "freespace btree has bucket outside allowed range (got %llu, valid %u-%llu)", + b, ca->mi.first_bucket, ca->mi.nbuckets)) { + ob = ERR_PTR(-EIO); + goto err; + } + + ob = __try_alloc_bucket(c, ca, reserve, a, need_journal_commit, cl); +err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ob; +} + +static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch_dev *ca, + enum alloc_reserve reserve) +{ + struct open_bucket *ob; + int i; + + spin_lock(&c->freelist_lock); + + for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) { + ob = c->open_buckets + ca->open_buckets_partial[i]; + + if (reserve <= ob->alloc_reserve) { + array_remove_item(ca->open_buckets_partial, + ca->open_buckets_partial_nr, + i); + ob->on_partial_list = false; + ob->alloc_reserve = reserve; + spin_unlock(&c->freelist_lock); + return ob; + } + } + + spin_unlock(&c->freelist_lock); + return NULL; +} + +/* + * This path is for before the freespace btree is initialized: + * + * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock & + * journal buckets - journal buckets will be < ca->new_fs_bucket_idx + */ +static noinline struct open_bucket * +bch2_bucket_alloc_trans_early(struct btree_trans *trans, + struct bch_dev *ca, + enum alloc_reserve reserve, + u64 *b, + size_t *need_journal_commit, + struct closure *cl) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct open_bucket *ob = NULL; + int ret; + + *b = max_t(u64, *b, ca->mi.first_bucket); + *b = max_t(u64, *b, ca->new_fs_bucket_idx); + + for_each_btree_key(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, *b), + BTREE_ITER_SLOTS, k, ret) { + struct bkey_alloc_unpacked a; + + if (bkey_cmp(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) + break; + + if (ca->new_fs_bucket_idx && + is_superblock_bucket(ca, k.k->p.offset)) + continue; + + a = bch2_alloc_unpack(k); + + if (bucket_state(a) != BUCKET_free) + continue; + + ob = __try_alloc_bucket(trans->c, ca, reserve, a, + need_journal_commit, cl); + if (ob) + break; + } + bch2_trans_iter_exit(trans, &iter); + + *b = iter.pos.offset; + + return ob ?: ERR_PTR(ret ?: -FREELIST_EMPTY); +} + +static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, + struct bch_dev *ca, + enum alloc_reserve reserve, + u64 *b, + size_t *need_journal_commit, + struct closure *cl) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct open_bucket *ob = NULL; + int ret; + + if (unlikely(!ca->mi.freespace_initialized)) + return bch2_bucket_alloc_trans_early(trans, ca, reserve, b, + need_journal_commit, cl); + + BUG_ON(ca->new_fs_bucket_idx); + + for_each_btree_key(trans, iter, BTREE_ID_freespace, + POS(ca->dev_idx, *b), 0, k, 
ret) {
+		if (k.k->p.inode != ca->dev_idx)
+			break;
+
+		for (*b = max(*b, bkey_start_offset(k.k));
+		     *b != k.k->p.offset && !ob;
+		     (*b)++) {
+			if (btree_trans_too_many_iters(trans)) {
+				ob = ERR_PTR(-EINTR);
+				break;
+			}
+
+			ob = try_alloc_bucket(trans, ca, reserve, *b,
+					      need_journal_commit, cl);
+		}
+		if (ob)
+			break;
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
+	return ob ?: ERR_PTR(ret);
+}
+
+/**
+ * bch2_bucket_alloc - allocate a single bucket from a specific device
+ *
+ * Returns a pointer to an open_bucket on success, or an ERR_PTR() on failure
+ */
+struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
+				      enum alloc_reserve reserve,
+				      bool may_alloc_partial,
+				      struct closure *cl)
+{
+	struct open_bucket *ob = NULL;
+	size_t need_journal_commit = 0;
+	u64 avail = dev_buckets_available(ca, reserve);
+	u64 b = 0;
+	int ret;
+
+	if (may_alloc_partial) {
+		ob = try_alloc_partial_bucket(c, ca, reserve);
+		if (ob)
+			return ob;
+	}
+again:
+	if (!avail) {
+		if (cl) {
+			closure_wait(&c->freelist_wait, cl);
+			/* recheck after putting ourselves on the waitlist */
+			avail = dev_buckets_available(ca, reserve);
+			if (avail) {
+				closure_wake_up(&c->freelist_wait);
+				goto again;
+			}
+		}
+
+		if (!c->blocked_allocate)
+			c->blocked_allocate = local_clock();
+
+		ob = ERR_PTR(-FREELIST_EMPTY);
+		goto err;
+	}
+
+	ret = bch2_trans_do(c, NULL, NULL, 0,
+			PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans,
+							ca, reserve, &b,
+							&need_journal_commit, cl)));
+
+	if (need_journal_commit * 2 > avail)
+		bch2_journal_flush_async(&c->journal, NULL);
+err:
+	if (!ob)
+		ob = ERR_PTR(ret ?: -FREELIST_EMPTY);
+
+	if (ob == ERR_PTR(-FREELIST_EMPTY)) {
+		trace_bucket_alloc_fail(ca, reserve, avail, need_journal_commit);
+		atomic_long_inc(&c->bucket_alloc_fail);
+	}
+
+	return ob;
+}
+
 static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
 			    unsigned l, unsigned r)
 {
@@ -313,7 +521,7 @@ void bch2_dev_stripe_increment(struct bch_dev *ca,
 			       struct dev_stripe_state *stripe)
 {
 	u64 *v = stripe->next_alloc + ca->dev_idx;
-	u64 free_space = dev_buckets_available(ca);
+	u64 free_space = dev_buckets_available(ca, RESERVE_NONE);
 	u64 free_space_inv = free_space
 		? 
div64_u64(1ULL << 48, free_space) : 1ULL << 48; @@ -364,6 +572,7 @@ int bch2_bucket_alloc_set(struct bch_fs *c, { struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); + unsigned dev; struct bch_dev *ca; int ret = -INSUFFICIENT_DEVICES; unsigned i; @@ -373,30 +582,43 @@ int bch2_bucket_alloc_set(struct bch_fs *c, for (i = 0; i < devs_sorted.nr; i++) { struct open_bucket *ob; - ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); + dev = devs_sorted.devs[i]; + + rcu_read_lock(); + ca = rcu_dereference(c->devs[dev]); + if (ca) + percpu_ref_get(&ca->ref); + rcu_read_unlock(); + if (!ca) continue; - if (!ca->mi.durability && *have_cache) + if (!ca->mi.durability && *have_cache) { + percpu_ref_put(&ca->ref); continue; + } ob = bch2_bucket_alloc(c, ca, reserve, flags & BUCKET_MAY_ALLOC_PARTIAL, cl); + if (!IS_ERR(ob)) + bch2_dev_stripe_increment(ca, stripe); + percpu_ref_put(&ca->ref); + if (IS_ERR(ob)) { ret = PTR_ERR(ob); if (cl) - return ret; + break; continue; } add_new_bucket(c, ptrs, devs_may_alloc, nr_effective, have_cache, flags, ob); - bch2_dev_stripe_increment(ca, stripe); - - if (*nr_effective >= nr_replicas) - return 0; + if (*nr_effective >= nr_replicas) { + ret = 0; + break; + } } return ret; @@ -564,9 +786,6 @@ static int open_bucket_add_buckets(struct bch_fs *c, if (*nr_effective >= nr_replicas) return 0; - percpu_down_read(&c->mark_lock); - rcu_read_lock(); - retry_blocking: /* * Try nonblocking first, so that if one device is full we'll try from @@ -580,9 +799,6 @@ retry_blocking: goto retry_blocking; } - rcu_read_unlock(); - percpu_up_read(&c->mark_lock); - return ret; } @@ -863,7 +1079,7 @@ err: case -INSUFFICIENT_DEVICES: return ERR_PTR(-EROFS); default: - BUG(); + return ERR_PTR(ret); } } diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h index d466bda9..f51cec5e 100644 --- a/libbcachefs/alloc_foreground.h +++ b/libbcachefs/alloc_foreground.h @@ -115,6 +115,20 @@ static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucke return false; } +static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket) +{ + bool ret; + + if (bch2_bucket_is_open(c, dev, bucket)) + return true; + + spin_lock(&c->freelist_lock); + ret = bch2_bucket_is_open(c, dev, bucket); + spin_unlock(&c->freelist_lock); + + return ret; +} + int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, struct dev_stripe_state *, struct bch_devs_mask *, unsigned, unsigned *, bool *, enum alloc_reserve, diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h index 409232e3..22e1fbda 100644 --- a/libbcachefs/alloc_types.h +++ b/libbcachefs/alloc_types.h @@ -10,18 +10,6 @@ struct ec_bucket_buf; -#define ALLOC_THREAD_STATES() \ - x(stopped) \ - x(running) \ - x(blocked) \ - x(blocked_full) - -enum allocator_states { -#define x(n) ALLOCATOR_##n, - ALLOC_THREAD_STATES() -#undef x -}; - enum alloc_reserve { RESERVE_BTREE_MOVINGGC = -2, RESERVE_BTREE = -1, @@ -30,8 +18,6 @@ enum alloc_reserve { RESERVE_NR = 2, }; -typedef FIFO(long) alloc_fifo; - #define OPEN_BUCKETS_COUNT 1024 #define WRITE_POINT_HASH_NR 32 @@ -94,12 +80,4 @@ struct write_point_specifier { unsigned long v; }; -struct alloc_heap_entry { - size_t bucket; - size_t nr; - unsigned long key; -}; - -typedef HEAP(struct alloc_heap_entry) alloc_heap; - #endif /* _BCACHEFS_ALLOC_TYPES_H */ diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 211fd5ad..a4ef9aab 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -391,6 
+391,9 @@ enum gc_phase { GC_PHASE_BTREE_reflink, GC_PHASE_BTREE_subvolumes, GC_PHASE_BTREE_snapshots, + GC_PHASE_BTREE_lru, + GC_PHASE_BTREE_freespace, + GC_PHASE_BTREE_need_discard, GC_PHASE_PENDING_DELETE, }; @@ -447,7 +450,7 @@ struct bch_dev { * gc_lock, for device resize - holding any is sufficient for access: * Or rcu_read_lock(), but only for ptr_stale(): */ - struct bucket_array __rcu *buckets[2]; + struct bucket_array __rcu *buckets_gc; struct bucket_gens __rcu *bucket_gens; u8 *oldest_gen; unsigned long *buckets_nouse; @@ -459,34 +462,17 @@ struct bch_dev { /* Allocator: */ u64 new_fs_bucket_idx; - struct task_struct __rcu *alloc_thread; - /* - * free: Buckets that are ready to be used - * - * free_inc: Incoming buckets - these are buckets that currently have - * cached data in them, and we can't reuse them until after we write - * their new gen to disk. After prio_write() finishes writing the new - * gens/prios, they'll be moved to the free list (and possibly discarded - * in the process) - */ - alloc_fifo free[RESERVE_NR]; - alloc_fifo free_inc; unsigned nr_open_buckets; + unsigned nr_btree_reserve; open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; open_bucket_idx_t open_buckets_partial_nr; - size_t fifo_last_bucket; - size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; size_t buckets_waiting_on_journal; - enum allocator_states allocator_state; - - alloc_heap alloc_heap; - atomic64_t rebalance_work; struct journal_device journal; @@ -508,8 +494,6 @@ struct bch_dev { enum { /* startup: */ BCH_FS_ALLOC_CLEAN, - BCH_FS_ALLOCATOR_RUNNING, - BCH_FS_ALLOCATOR_STOPPING, BCH_FS_INITIAL_GC_DONE, BCH_FS_INITIAL_GC_UNFIXED, BCH_FS_TOPOLOGY_REPAIR_DONE, @@ -773,6 +757,8 @@ struct bch_fs { unsigned write_points_nr; struct buckets_waiting_for_journal buckets_waiting_for_journal; + struct work_struct discard_work; + struct work_struct invalidate_work; /* GARBAGE COLLECTION */ struct task_struct *gc_thread; @@ -911,6 +897,7 @@ struct bch_fs { atomic_long_t read_realloc_races; atomic_long_t extent_migrate_done; atomic_long_t extent_migrate_raced; + atomic_long_t bucket_alloc_fail; unsigned btree_gc_periodic:1; unsigned copy_gc_enabled:1; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 5153f0e4..bb54ac17 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -347,7 +347,9 @@ static inline void bkey_init(struct bkey *k) x(subvolume, 21) \ x(snapshot, 22) \ x(inode_v2, 23) \ - x(alloc_v3, 24) + x(alloc_v3, 24) \ + x(set, 25) \ + x(lru, 26) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -377,6 +379,10 @@ struct bch_hash_whiteout { struct bch_val v; }; +struct bch_set { + struct bch_val v; +}; + /* Extents */ /* @@ -877,8 +883,8 @@ struct bch_alloc_v2 { #define BCH_ALLOC_FIELDS_V2() \ x(read_time, 64) \ x(write_time, 64) \ - x(dirty_sectors, 16) \ - x(cached_sectors, 16) \ + x(dirty_sectors, 32) \ + x(cached_sectors, 32) \ x(stripe, 32) \ x(stripe_redundancy, 8) @@ -893,11 +899,13 @@ struct bch_alloc_v3 { __u8 data[]; } __attribute__((packed, aligned(8))); +LE32_BITMASK(BCH_ALLOC_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) +LE32_BITMASK(BCH_ALLOC_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) + enum { #define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, BCH_ALLOC_FIELDS_V1() #undef x - BCH_ALLOC_FIELD_NR }; /* Quotas: */ @@ -1015,6 +1023,15 @@ LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) /* True if a subvolume points to this snapshot node: */ LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct 
bch_snapshot, flags, 1, 2) +/* LRU btree: */ + +struct bch_lru { + struct bch_val v; + __le64 idx; +} __attribute__((packed, aligned(8))); + +#define LRU_ID_STRIPES (1U << 16) + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -1023,16 +1040,17 @@ struct bch_sb_field { __le32 type; }; -#define BCH_SB_FIELDS() \ - x(journal, 0) \ - x(members, 1) \ - x(crypt, 2) \ - x(replicas_v0, 3) \ - x(quota, 4) \ - x(disk_groups, 5) \ - x(clean, 6) \ - x(replicas, 7) \ - x(journal_seq_blacklist, 8) +#define BCH_SB_FIELDS() \ + x(journal, 0) \ + x(members, 1) \ + x(crypt, 2) \ + x(replicas_v0, 3) \ + x(quota, 4) \ + x(disk_groups, 5) \ + x(clean, 6) \ + x(replicas, 7) \ + x(journal_seq_blacklist, 8) \ + x(journal_v2, 9) enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -1041,6 +1059,14 @@ enum bch_sb_field_type { BCH_SB_FIELD_NR }; +/* + * Most superblock fields are replicated in all device's superblocks - a few are + * not: + */ +#define BCH_SINGLE_DEVICE_SB_FIELDS \ + ((1U << BCH_SB_FIELD_journal)| \ + (1U << BCH_SB_FIELD_journal_v2)) + /* BCH_SB_FIELD_journal: */ struct bch_sb_field_journal { @@ -1048,6 +1074,15 @@ struct bch_sb_field_journal { __le64 buckets[0]; }; +struct bch_sb_field_journal_v2 { + struct bch_sb_field field; + + struct bch_sb_field_journal_v2_entry { + __le64 start; + __le64 nr; + } d[0]; +}; + /* BCH_SB_FIELD_members: */ #define BCH_MIN_NR_NBUCKETS (1 << 6) @@ -1069,6 +1104,8 @@ LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) +LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, + struct bch_member, flags[0], 30, 31) #if 0 LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); @@ -1287,7 +1324,8 @@ enum bcachefs_metadata_version { bcachefs_metadata_version_reflink_p_fix = 16, bcachefs_metadata_version_subvol_dirent = 17, bcachefs_metadata_version_inode_v2 = 18, - bcachefs_metadata_version_max = 19, + bcachefs_metadata_version_freespace = 19, + bcachefs_metadata_version_max = 20, }; #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) @@ -1804,7 +1842,10 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); x(stripes, 6) \ x(reflink, 7) \ x(subvolumes, 8) \ - x(snapshots, 9) + x(snapshots, 9) \ + x(lru, 10) \ + x(freespace, 11) \ + x(need_discard, 12) enum btree_id { #define x(kwd, val) BTREE_ID_##kwd = val, diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index e83aeb68..3c1bf331 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -9,6 +9,7 @@ #include "error.h" #include "extents.h" #include "inode.h" +#include "lru.h" #include "quota.h" #include "reflink.h" #include "subvolume.h" @@ -85,6 +86,24 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, .val_to_text = key_type_inline_data_to_text, \ } +static const char *key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + if (bkey_val_bytes(k.k)) + return "nonempty value"; + return NULL; +} + +static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) +{ + bch2_key_resize(l.k, l.k->size + r.k->size); + return true; +} + +#define bch2_bkey_ops_set (struct bkey_ops) { \ + .key_invalid = key_type_set_invalid, \ + .key_merge = key_type_set_merge, \ +} + const struct bkey_ops bch2_bkey_ops[] = { #define 
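/*
 * The LE*_BITMASK() definitions above (BCH_ALLOC_NEED_DISCARD,
 * BCH_MEMBER_FREESPACE_INITIALIZED, ...) generate little-endian-safe
 * accessor pairs over on-disk flag words. A minimal sketch of what the
 * generated getter/setter do for bits 0-1 of bch_alloc_v3.flags; the
 * names below are illustrative, the real generated names are
 * BCH_ALLOC_NEED_DISCARD() and SET_BCH_ALLOC_NEED_DISCARD():
 */
static inline __u64 alloc_v3_need_discard_get(const struct bch_alloc_v3 *a)
{
	return (le32_to_cpu(a->flags) >> 0) & 1;
}

static inline void alloc_v3_need_discard_set(struct bch_alloc_v3 *a, __u64 v)
{
	__u64 f = le32_to_cpu(a->flags);

	f &= ~(1ULL << 0);	/* clear the field */
	f |= (v & 1) << 0;	/* set the new value */
	a->flags = cpu_to_le32(f);
}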
x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, BCH_BKEY_TYPES() @@ -147,6 +166,15 @@ static unsigned bch2_key_types_allowed[] = { [BKEY_TYPE_snapshots] = (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_snapshot), + [BKEY_TYPE_lru] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_lru), + [BKEY_TYPE_freespace] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_set), + [BKEY_TYPE_need_discard] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_set), [BKEY_TYPE_btree] = (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_btree_ptr)| diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 73b947a4..5c54a0ca 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -571,37 +571,37 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (!p.ptr.cached) { - g->_mark.gen = p.ptr.gen; g->gen_valid = true; + g->gen = p.ptr.gen; } else { do_update = true; } } - if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c, + if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen, g->mark.gen, + p.ptr.gen, g->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (!p.ptr.cached) { - g->_mark.gen = p.ptr.gen; g->gen_valid = true; - g->_mark.data_type = 0; - g->_mark.dirty_sectors = 0; - g->_mark.cached_sectors = 0; + g->gen = p.ptr.gen; + g->data_type = 0; + g->dirty_sectors = 0; + g->cached_sectors = 0; set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); } else { do_update = true; } } - if (fsck_err_on(gen_cmp(g->mark.gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c, + if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->mark.gen, + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, bch2_data_types[ptr_data_type(k->k, &p.ptr)], p.ptr.gen, (printbuf_reset(&buf), @@ -609,30 +609,30 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, do_update = true; if (fsck_err_on(!p.ptr.cached && - gen_cmp(p.ptr.gen, g->mark.gen) < 0, c, + gen_cmp(p.ptr.gen, g->gen) < 0, c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen, g->mark.gen, + p.ptr.gen, g->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) do_update = true; - if (data_type != BCH_DATA_btree && p.ptr.gen != g->mark.gen) + if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) continue; - if (fsck_err_on(g->mark.data_type && - g->mark.data_type != data_type, c, + if (fsck_err_on(g->data_type && + g->data_type != data_type, c, "bucket %u:%zu different types of data in same bucket: %s, %s\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[g->mark.data_type], + bch2_data_types[g->data_type], bch2_data_types[data_type], (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (data_type == BCH_DATA_btree) { - g->_mark.data_type = data_type; + g->data_type = data_type; set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); } else { do_update = true; @@ -692,7 +692,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bucket *g = PTR_GC_BUCKET(ca, ptr); - ptr->gen = g->mark.gen; 
+ ptr->gen = g->gen; } } else { bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ @@ -701,12 +701,12 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr); (ptr->cached && - (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) || + (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) || (!ptr->cached && - gen_cmp(ptr->gen, g->mark.gen) < 0) || - gen_cmp(g->mark.gen, ptr->gen) > BUCKET_GC_GEN_MAX || - (g->mark.data_type && - g->mark.data_type != data_type); + gen_cmp(ptr->gen, g->gen) < 0) || + gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX || + (g->data_type && + g->data_type != data_type); })); again: ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); @@ -1163,10 +1163,10 @@ static void bch2_gc_free(struct bch_fs *c) genradix_free(&c->gc_stripes); for_each_member_device(ca, c, i) { - kvpfree(rcu_dereference_protected(ca->buckets[1], 1), + kvpfree(rcu_dereference_protected(ca->buckets_gc, 1), sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket)); - ca->buckets[1] = NULL; + ca->buckets_gc = NULL; free_percpu(ca->usage_gc); ca->usage_gc = NULL; @@ -1295,7 +1295,7 @@ static int bch2_gc_start(struct bch_fs *c, } for_each_member_device(ca, c, i) { - BUG_ON(ca->buckets[1]); + BUG_ON(ca->buckets_gc); BUG_ON(ca->usage_gc); ca->usage_gc = alloc_percpu(struct bch_dev_usage); @@ -1315,9 +1315,9 @@ static int bch2_alloc_write_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); - struct bucket *g; + struct bucket gc; struct bkey_s_c k; - struct bkey_alloc_unpacked old_u, new_u, gc_u; + struct bkey_alloc_unpacked old_u, new_u; struct bkey_alloc_buf *a; int ret; @@ -1329,39 +1329,27 @@ static int bch2_alloc_write_key(struct btree_trans *trans, old_u = new_u = bch2_alloc_unpack(k); percpu_down_read(&c->mark_lock); - g = gc_bucket(ca, iter->pos.offset); - gc_u = (struct bkey_alloc_unpacked) { - .dev = iter->pos.inode, - .bucket = iter->pos.offset, - .gen = g->mark.gen, - .data_type = g->mark.data_type, - .dirty_sectors = g->mark.dirty_sectors, - .cached_sectors = g->mark.cached_sectors, - .read_time = g->io_time[READ], - .write_time = g->io_time[WRITE], - .stripe = g->stripe, - .stripe_redundancy = g->stripe_redundancy, - }; + gc = *gc_bucket(ca, iter->pos.offset); percpu_up_read(&c->mark_lock); if (metadata_only && - gc_u.data_type != BCH_DATA_sb && - gc_u.data_type != BCH_DATA_journal && - gc_u.data_type != BCH_DATA_btree) + gc.data_type != BCH_DATA_sb && + gc.data_type != BCH_DATA_journal && + gc.data_type != BCH_DATA_btree) return 0; - if (gen_after(old_u.gen, gc_u.gen)) + if (gen_after(old_u.gen, gc.gen)) return 0; #define copy_bucket_field(_f) \ - if (fsck_err_on(new_u._f != gc_u._f, c, \ + if (fsck_err_on(new_u._f != gc._f, c, \ "bucket %llu:%llu gen %u data type %s has wrong " #_f \ ": got %u, should be %u", \ iter->pos.inode, iter->pos.offset, \ - new_u.gen, \ - bch2_data_types[new_u.data_type], \ - new_u._f, gc_u._f)) \ - new_u._f = gc_u._f; \ + gc.gen, \ + bch2_data_types[gc.data_type], \ + new_u._f, gc._f)) \ + new_u._f = gc._f; \ copy_bucket_field(gen); copy_bucket_field(data_type); @@ -1379,7 +1367,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, if (IS_ERR(a)) return PTR_ERR(a); - ret = bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_NORUN); + ret = bch2_trans_update(trans, iter, &a->k, 0); fsck_err: return ret; } @@ -1426,7 +1414,13 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) static int 
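/*
 * The generation checks above compare with gen_cmp() rather than plain
 * less-than, because bucket gens are 8-bit counters that wrap. A sketch
 * of the wraparound comparison these fsck checks rely on (the real
 * helper lives in buckets.h): the difference is reduced mod 256 and
 * reinterpreted as signed, so gen_cmp_sketch(0, 255) > 0 -- one step
 * ahead despite the wrap.
 */
static inline int gen_cmp_sketch(u8 a, u8 b)
{
	return (s8) (a - b);
}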
bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) { struct bch_dev *ca; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bucket *g; + struct bkey_alloc_unpacked u; unsigned i; + int ret; for_each_member_device(ca, c, i) { struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + @@ -1434,17 +1428,45 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) GFP_KERNEL|__GFP_ZERO); if (!buckets) { percpu_ref_put(&ca->ref); - percpu_up_write(&c->mark_lock); bch_err(c, "error allocating ca->buckets[gc]"); return -ENOMEM; } buckets->first_bucket = ca->mi.first_bucket; buckets->nbuckets = ca->mi.nbuckets; - rcu_assign_pointer(ca->buckets[1], buckets); + rcu_assign_pointer(ca->buckets_gc, buckets); }; - return bch2_alloc_read(c, true, metadata_only); + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + ca = bch_dev_bkey_exists(c, k.k->p.inode); + g = gc_bucket(ca, k.k->p.offset); + u = bch2_alloc_unpack(k); + + g->gen_valid = 1; + g->gen = u.gen; + + if (metadata_only && + (u.data_type == BCH_DATA_user || + u.data_type == BCH_DATA_cached || + u.data_type == BCH_DATA_parity)) { + g->data_type = u.data_type; + g->dirty_sectors = u.dirty_sectors; + g->cached_sectors = u.cached_sectors; + g->stripe = u.stripe; + g->stripe_redundancy = u.stripe_redundancy; + } + } + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + + if (ret) + bch_err(c, "error reading alloc info at gc start: %i", ret); + + return ret; } static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) @@ -1453,17 +1475,17 @@ static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) unsigned i; for_each_member_device(ca, c, i) { - struct bucket_array *buckets = __bucket_array(ca, true); + struct bucket_array *buckets = gc_bucket_array(ca); struct bucket *g; for_each_bucket(g, buckets) { if (metadata_only && - (g->mark.data_type == BCH_DATA_user || - g->mark.data_type == BCH_DATA_cached || - g->mark.data_type == BCH_DATA_parity)) + (g->data_type == BCH_DATA_user || + g->data_type == BCH_DATA_cached || + g->data_type == BCH_DATA_parity)) continue; - g->_mark.dirty_sectors = 0; - g->_mark.cached_sectors = 0; + g->dirty_sectors = 0; + g->cached_sectors = 0; } }; } @@ -1673,9 +1695,8 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) */ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) { - struct bch_dev *ca; u64 start_time = local_clock(); - unsigned i, iter = 0; + unsigned iter = 0; int ret; lockdep_assert_held(&c->state_lock); @@ -1776,13 +1797,6 @@ out: trace_gc_end(c); bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); - /* - * Wake up allocator in case it was waiting for buckets - * because of not being able to inc gens - */ - for_each_member_device(ca, c, i) - bch2_wake_allocator(ca); - /* * At startup, allocations can happen directly instead of via the * allocator thread - issue wakeup in case they blocked on gc_lock: @@ -1891,7 +1905,7 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i u.oldest_gen = ca->oldest_gen[iter->pos.offset]; - return bch2_alloc_write(trans, iter, &u, BTREE_TRIGGER_NORUN); + return bch2_alloc_write(trans, iter, &u, 0); } int bch2_gc_gens(struct bch_fs *c) diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index e6cea4c6..1df454f2 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -930,7 +930,7 @@ int 
bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, "error decrypting btree node: %i", ret)) goto fsck_err; - btree_err_on(btree_node_is_extents(b) && + btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), BTREE_ERR_FATAL, c, NULL, b, NULL, "btree node does not have NEW_EXTENT_OVERWRITE set"); diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 575635b5..788b9811 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -596,24 +596,9 @@ static inline enum btree_node_type btree_node_type(struct btree *b) return __btree_node_type(b->c.level, b->c.btree_id); } -static inline bool btree_node_type_is_extents(enum btree_node_type type) -{ - switch (type) { - case BKEY_TYPE_extents: - case BKEY_TYPE_reflink: - return true; - default: - return false; - } -} - -static inline bool btree_node_is_extents(struct btree *b) -{ - return btree_node_type_is_extents(btree_node_type(b)); -} - #define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ ((1U << BKEY_TYPE_extents)| \ + (1U << BKEY_TYPE_alloc)| \ (1U << BKEY_TYPE_inodes)| \ (1U << BKEY_TYPE_stripes)| \ (1U << BKEY_TYPE_reflink)| \ @@ -629,6 +614,16 @@ static inline bool btree_node_is_extents(struct btree *b) (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) +#define BTREE_ID_IS_EXTENTS \ + ((1U << BTREE_ID_extents)| \ + (1U << BTREE_ID_reflink)| \ + (1U << BTREE_ID_freespace)) + +static inline bool btree_node_type_is_extents(enum btree_node_type type) +{ + return (1U << type) & BTREE_ID_IS_EXTENTS; +} + #define BTREE_ID_HAS_SNAPSHOTS \ ((1U << BTREE_ID_extents)| \ (1U << BTREE_ID_inodes)| \ diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index d5226375..fbce6cdf 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -279,29 +279,24 @@ bch2_fs_usage_read_short(struct bch_fs *c) return ret; } -static inline int is_unavailable_bucket(struct bucket_mark m) +static inline int is_unavailable_bucket(struct bkey_alloc_unpacked a) { - return !is_available_bucket(m); + return a.dirty_sectors || a.stripe; } static inline int bucket_sectors_fragmented(struct bch_dev *ca, - struct bucket_mark m) + struct bkey_alloc_unpacked a) { - return m.dirty_sectors - ? max(0, (int) ca->mi.bucket_size - (int) m.dirty_sectors) + return a.dirty_sectors + ? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors) : 0; } -static inline int is_stripe_data_bucket(struct bucket_mark m) +static inline enum bch_data_type bucket_type(struct bkey_alloc_unpacked a) { - return m.stripe && m.data_type != BCH_DATA_parity; -} - -static inline enum bch_data_type bucket_type(struct bucket_mark m) -{ - return m.cached_sectors && !m.dirty_sectors + return a.cached_sectors && !a.dirty_sectors ? 
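/*
 * Note on btree_node_type_is_extents() above: it masks an enum
 * btree_node_type against BTREE_ID_* bits. That works in this tree
 * because the leaf BKEY_TYPE_* values are generated from BCH_BTREE_IDS()
 * and so alias the corresponding BTREE_ID_* values. The new freespace
 * btree is included because its KEY_TYPE_set keys are sized extents of
 * free buckets. A sketch of the equivalent check phrased in btree ids:
 */
static inline bool btree_id_is_extents_sketch(enum btree_id id)
{
	return (1U << id) & BTREE_ID_IS_EXTENTS;
}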
BCH_DATA_cached - : m.data_type; + : a.data_type; } static inline void account_bucket(struct bch_fs_usage *fs_usage, @@ -316,7 +311,8 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage, } static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, - struct bucket_mark old, struct bucket_mark new, + struct bkey_alloc_unpacked old, + struct bkey_alloc_unpacked new, u64 journal_seq, bool gc) { struct bch_fs_usage *fs_usage; @@ -347,9 +343,28 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new); preempt_enable(); +} - if (!is_available_bucket(old) && is_available_bucket(new)) - bch2_wake_allocator(ca); +static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, + struct bucket old, struct bucket new, + u64 journal_seq, bool gc) +{ + struct bkey_alloc_unpacked old_a = { + .gen = old.gen, + .data_type = old.data_type, + .dirty_sectors = old.dirty_sectors, + .cached_sectors = old.cached_sectors, + .stripe = old.stripe, + }; + struct bkey_alloc_unpacked new_a = { + .gen = new.gen, + .data_type = new.data_type, + .dirty_sectors = new.dirty_sectors, + .cached_sectors = new.cached_sectors, + .stripe = new.stripe, + }; + + bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); } static inline int __update_replicas(struct bch_fs *c, @@ -484,19 +499,6 @@ static inline void update_cached_sectors_list(struct btree_trans *trans, update_replicas_list(trans, &r.e, sectors); } -void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, bool owned_by_allocator) -{ - struct bucket *g = bucket(ca, b); - struct bucket_mark old, new; - - old = bucket_cmpxchg(g, new, ({ - new.owned_by_allocator = owned_by_allocator; - })); - - BUG_ON(owned_by_allocator == old.owned_by_allocator); -} - int bch2_mark_alloc(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) @@ -507,8 +509,6 @@ int bch2_mark_alloc(struct btree_trans *trans, struct bkey_alloc_unpacked old_u = bch2_alloc_unpack(old); struct bkey_alloc_unpacked new_u = bch2_alloc_unpack(new); struct bch_dev *ca = bch_dev_bkey_exists(c, new_u.dev); - struct bucket *g; - struct bucket_mark old_m, m; int ret = 0; if (bch2_trans_inconsistent_on(new_u.bucket < ca->mi.first_bucket || @@ -555,28 +555,46 @@ int bch2_mark_alloc(struct btree_trans *trans, } } + if (!new_u.data_type && + (!new_u.journal_seq || new_u.journal_seq < c->journal.flushed_seq_ondisk)) + closure_wake_up(&c->freelist_wait); + + if ((flags & BTREE_TRIGGER_INSERT) && + new_u.need_discard && + !new_u.journal_seq) + bch2_do_discards(c); + + if (!old_u.data_type && + new_u.data_type && + should_invalidate_buckets(ca)) + bch2_do_invalidates(c); + + if (bucket_state(new_u) == BUCKET_need_gc_gens) { + atomic_inc(&c->kick_gc); + wake_up_process(c->gc_thread); + } + percpu_down_read(&c->mark_lock); if (!gc && new_u.gen != old_u.gen) *bucket_gen(ca, new_u.bucket) = new_u.gen; - g = __bucket(ca, new_u.bucket, gc); + bch2_dev_usage_update(c, ca, old_u, new_u, journal_seq, gc); - old_m = bucket_cmpxchg(g, m, ({ - m.gen = new_u.gen; - m.data_type = new_u.data_type; - m.dirty_sectors = new_u.dirty_sectors; - m.cached_sectors = new_u.cached_sectors; - m.stripe = new_u.stripe != 0; - })); + if (gc) { + struct bucket *g = gc_bucket(ca, new_u.bucket); - bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc); + bucket_lock(g); - g->io_time[READ] = new_u.read_time; - g->io_time[WRITE] = new_u.write_time; - g->oldest_gen = new_u.oldest_gen; 
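/*
 * bucket_state() used by the new alloc trigger classifies a bucket from
 * its unpacked alloc key and drives the discard/invalidate/gc-gens
 * kicks. A hedged sketch of the classification, matching the order of
 * bch2_bucket_states[] (the real helper is in alloc_background.h and
 * exact assertion/threshold details may differ):
 */
static inline enum bucket_state bucket_state_sketch(struct bkey_alloc_unpacked a)
{
	if (a.dirty_sectors || a.stripe)
		return BUCKET_dirty;
	if (a.cached_sectors)
		return BUCKET_cached;
	if (a.need_discard)
		return BUCKET_need_discard;
	if (a.gen - a.oldest_gen >= BUCKET_GC_GEN_MAX)
		return BUCKET_need_gc_gens;
	return BUCKET_free;
}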
- g->gen_valid = 1; - g->stripe = new_u.stripe; - g->stripe_redundancy = new_u.stripe_redundancy; + g->gen_valid = 1; + g->gen = new_u.gen; + g->data_type = new_u.data_type; + g->stripe = new_u.stripe; + g->stripe_redundancy = new_u.stripe_redundancy; + g->dirty_sectors = new_u.dirty_sectors; + g->cached_sectors = new_u.cached_sectors; + + bucket_unlock(g); + } percpu_up_read(&c->mark_lock); /* @@ -585,9 +603,9 @@ int bch2_mark_alloc(struct btree_trans *trans, */ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && - old_m.cached_sectors) { + old_u.cached_sectors) { ret = update_cached_sectors(c, new, ca->dev_idx, - -old_m.cached_sectors, + -old_u.cached_sectors, journal_seq, gc); if (ret) { bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); @@ -595,29 +613,18 @@ int bch2_mark_alloc(struct btree_trans *trans, } trace_invalidate(ca, bucket_to_sector(ca, new_u.bucket), - old_m.cached_sectors); + old_u.cached_sectors); } return 0; } -#define checked_add(a, b) \ -({ \ - unsigned _res = (unsigned) (a) + (b); \ - bool overflow = _res > U16_MAX; \ - if (overflow) \ - _res = U16_MAX; \ - (a) = _res; \ - overflow; \ -}) - void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, enum bch_data_type data_type, unsigned sectors, struct gc_pos pos, unsigned flags) { - struct bucket *g; - struct bucket_mark old, new; + struct bucket old, new, *g; bool overflow; BUG_ON(!(flags & BTREE_TRIGGER_GC)); @@ -632,10 +639,16 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, percpu_down_read(&c->mark_lock); g = gc_bucket(ca, b); - old = bucket_cmpxchg(g, new, ({ - new.data_type = data_type; - overflow = checked_add(new.dirty_sectors, sectors); - })); + + bucket_lock(g); + old = *g; + + g->data_type = data_type; + g->dirty_sectors += sectors; + overflow = g->dirty_sectors < sectors; + + new = *g; + bucket_unlock(g); bch2_fs_inconsistent_on(old.data_type && old.data_type != data_type, c, @@ -649,7 +662,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, bch2_data_types[old.data_type ?: data_type], old.dirty_sectors, sectors); - bch2_dev_usage_update(c, ca, old, new, 0, true); + bch2_dev_usage_update_m(c, ca, old, new, 0, true); percpu_up_read(&c->mark_lock); } @@ -669,7 +682,7 @@ static int check_bucket_ref(struct bch_fs *c, const struct bch_extent_ptr *ptr, s64 sectors, enum bch_data_type ptr_data_type, u8 b_gen, u8 bucket_data_type, - u16 dirty_sectors, u16 cached_sectors) + u32 dirty_sectors, u32 cached_sectors) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); @@ -737,7 +750,7 @@ static int check_bucket_ref(struct bch_fs *c, goto err; } - if ((unsigned) (bucket_sectors + sectors) > U16_MAX) { + if ((unsigned) (bucket_sectors + sectors) > U32_MAX) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" "while marking %s", @@ -768,8 +781,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, s64 sectors = parity ? 
le16_to_cpu(s->sectors) : 0; const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g; - struct bucket_mark new, old; + struct bucket old, new, *g; struct printbuf buf = PRINTBUF; int ret = 0; @@ -781,34 +793,38 @@ static int mark_stripe_bucket(struct btree_trans *trans, buf.atomic++; g = PTR_GC_BUCKET(ca, ptr); - if (g->mark.dirty_sectors || + if (g->dirty_sectors || (g->stripe && g->stripe != k.k->p.offset)) { bch2_fs_inconsistent(c, "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", - ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen, + ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ret = -EINVAL; goto err; } - old = bucket_cmpxchg(g, new, ({ - ret = check_bucket_ref(c, k, ptr, sectors, data_type, - new.gen, new.data_type, - new.dirty_sectors, new.cached_sectors); - if (ret) - goto err; + bucket_lock(g); + old = *g; - new.dirty_sectors += sectors; - if (data_type) - new.data_type = data_type; + ret = check_bucket_ref(c, k, ptr, sectors, data_type, + new.gen, new.data_type, + new.dirty_sectors, new.cached_sectors); + if (ret) { + bucket_unlock(g); + goto err; + } - new.stripe = true; - })); + new.dirty_sectors += sectors; + if (data_type) + new.data_type = data_type; g->stripe = k.k->p.offset; g->stripe_redundancy = s->nr_redundant; - bch2_dev_usage_update(c, ca, old, new, journal_seq, true); + new = *g; + bucket_unlock(g); + + bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); err: percpu_up_read(&c->mark_lock); printbuf_exit(&buf); @@ -820,9 +836,9 @@ static int __mark_pointer(struct btree_trans *trans, const struct bch_extent_ptr *ptr, s64 sectors, enum bch_data_type ptr_data_type, u8 bucket_gen, u8 *bucket_data_type, - u16 *dirty_sectors, u16 *cached_sectors) + u32 *dirty_sectors, u32 *cached_sectors) { - u16 *dst_sectors = !ptr->cached + u32 *dst_sectors = !ptr->cached ? 
dirty_sectors : cached_sectors; int ret = check_bucket_ref(trans->c, k, ptr, sectors, ptr_data_type, @@ -846,11 +862,9 @@ static int bch2_mark_pointer(struct btree_trans *trans, { u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; - struct bucket_mark old, new; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket *g; + struct bucket old, new, *g; u8 bucket_data_type; - u64 v; int ret = 0; BUG_ON(!(flags & BTREE_TRIGGER_GC)); @@ -858,30 +872,27 @@ static int bch2_mark_pointer(struct btree_trans *trans, percpu_down_read(&c->mark_lock); g = PTR_GC_BUCKET(ca, &p.ptr); - v = atomic64_read(&g->_mark.v); - do { - new.v.counter = old.v.counter = v; - bucket_data_type = new.data_type; + bucket_lock(g); + old = *g; - ret = __mark_pointer(trans, k, &p.ptr, sectors, - data_type, new.gen, - &bucket_data_type, - &new.dirty_sectors, - &new.cached_sectors); - if (ret) - goto err; + bucket_data_type = g->data_type; - new.data_type = bucket_data_type; + ret = __mark_pointer(trans, k, &p.ptr, sectors, + data_type, g->gen, + &bucket_data_type, + &g->dirty_sectors, + &g->cached_sectors); + if (ret) { + bucket_unlock(g); + goto err; + } - if (flags & BTREE_TRIGGER_NOATOMIC) { - g->_mark = new; - break; - } - } while ((v = atomic64_cmpxchg(&g->_mark.v, - old.v.counter, - new.v.counter)) != old.v.counter); + g->data_type = bucket_data_type; - bch2_dev_usage_update(c, ca, old, new, journal_seq, true); + new = *g; + bucket_unlock(g); + + bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); err: percpu_up_read(&c->mark_lock); @@ -2041,16 +2052,6 @@ recalculate: /* Startup/shutdown: */ -static void buckets_free_rcu(struct rcu_head *rcu) -{ - struct bucket_array *buckets = - container_of(rcu, struct bucket_array, rcu); - - kvpfree(buckets, - sizeof(*buckets) + - buckets->nbuckets * sizeof(struct bucket)); -} - static void bucket_gens_free_rcu(struct rcu_head *rcu) { struct bucket_gens *buckets = @@ -2061,46 +2062,19 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu) int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { - struct bucket_array *buckets = NULL, *old_buckets = NULL; struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; unsigned long *buckets_nouse = NULL; - alloc_fifo free[RESERVE_NR]; - alloc_fifo free_inc; - alloc_heap alloc_heap; - - size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, - ca->mi.bucket_size / btree_sectors(c)); - /* XXX: these should be tunable */ - size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); - size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6); - size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), - btree_reserve * 2); - bool resize = ca->buckets[0] != NULL; + bool resize = ca->bucket_gens != NULL; int ret = -ENOMEM; - unsigned i; - memset(&free, 0, sizeof(free)); - memset(&free_inc, 0, sizeof(free_inc)); - memset(&alloc_heap, 0, sizeof(alloc_heap)); - - if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + - nbuckets * sizeof(struct bucket), - GFP_KERNEL|__GFP_ZERO)) || - !(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, + if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, GFP_KERNEL|__GFP_ZERO)) || (c->opts.buckets_nouse && !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * sizeof(unsigned long), - GFP_KERNEL|__GFP_ZERO))) || - !init_fifo(&free[RESERVE_MOVINGGC], - copygc_reserve, GFP_KERNEL) || - !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || - !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) || - 
!init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL)) + GFP_KERNEL|__GFP_ZERO)))) goto err; - buckets->first_bucket = ca->mi.first_bucket; - buckets->nbuckets = nbuckets; bucket_gens->first_bucket = ca->mi.first_bucket; bucket_gens->nbuckets = nbuckets; @@ -2112,15 +2086,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) percpu_down_write(&c->mark_lock); } - old_buckets = bucket_array(ca); old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); if (resize) { - size_t n = min(buckets->nbuckets, old_buckets->nbuckets); + size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets); - memcpy(buckets->b, - old_buckets->b, - n * sizeof(struct bucket)); memcpy(bucket_gens->b, old_bucket_gens->b, n); @@ -2130,47 +2100,25 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) BITS_TO_LONGS(n) * sizeof(unsigned long)); } - rcu_assign_pointer(ca->buckets[0], buckets); rcu_assign_pointer(ca->bucket_gens, bucket_gens); - buckets = old_buckets; bucket_gens = old_bucket_gens; swap(ca->buckets_nouse, buckets_nouse); + nbuckets = ca->mi.nbuckets; + if (resize) { percpu_up_write(&c->mark_lock); + up_write(&ca->bucket_lock); up_write(&c->gc_lock); } - spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) { - fifo_move(&free[i], &ca->free[i]); - swap(ca->free[i], free[i]); - } - fifo_move(&free_inc, &ca->free_inc); - swap(ca->free_inc, free_inc); - spin_unlock(&c->freelist_lock); - - /* with gc lock held, alloc_heap can't be in use: */ - swap(ca->alloc_heap, alloc_heap); - - nbuckets = ca->mi.nbuckets; - - if (resize) - up_write(&ca->bucket_lock); - ret = 0; err: - free_heap(&alloc_heap); - free_fifo(&free_inc); - for (i = 0; i < RESERVE_NR; i++) - free_fifo(&free[i]); kvpfree(buckets_nouse, BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); if (bucket_gens) call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); - if (buckets) - call_rcu(&buckets->rcu, buckets_free_rcu); return ret; } @@ -2179,17 +2127,10 @@ void bch2_dev_buckets_free(struct bch_dev *ca) { unsigned i; - free_heap(&ca->alloc_heap); - free_fifo(&ca->free_inc); - for (i = 0; i < RESERVE_NR; i++) - free_fifo(&ca->free[i]); kvpfree(ca->buckets_nouse, BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), sizeof(struct bucket_gens) + ca->mi.nbuckets); - kvpfree(rcu_dereference_protected(ca->buckets[0], 1), - sizeof(struct bucket_array) + - ca->mi.nbuckets * sizeof(struct bucket)); for (i = 0; i < ARRAY_SIZE(ca->usage); i++) free_percpu(ca->usage[i]); diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 392e03d4..4a3d6bf1 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -15,54 +15,34 @@ for (_b = (_buckets)->b + (_buckets)->first_bucket; \ _b < (_buckets)->b + (_buckets)->nbuckets; _b++) -#define bucket_cmpxchg(g, new, expr) \ -({ \ - struct bucket *_g = g; \ - u64 _v = atomic64_read(&(g)->_mark.v); \ - struct bucket_mark _old; \ - \ - do { \ - (new).v.counter = _old.v.counter = _v; \ - expr; \ - } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v, \ - _old.v.counter, \ - (new).v.counter)) != _old.v.counter);\ - _old; \ -}) - -static inline struct bucket_array *__bucket_array(struct bch_dev *ca, - bool gc) +static inline void bucket_unlock(struct bucket *b) { - return rcu_dereference_check(ca->buckets[gc], + smp_store_release(&b->lock, 0); +} + +static inline void bucket_lock(struct bucket *b) +{ + while (xchg(&b->lock, 1)) + cpu_relax(); +} + +static inline struct bucket_array 
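/*
 * bucket_lock() above is a one-byte test-and-set spinlock: xchg() is a
 * full barrier, so it acquires; bucket_unlock() publishes updates with
 * smp_store_release(). A sketch of the pattern buckets.c now uses with
 * it -- snapshot old state, mutate under the lock, snapshot new state,
 * account the delta outside the lock. update_one_bucket() is a
 * hypothetical helper for illustration, not part of the patch:
 */
static void update_one_bucket(struct bch_fs *c, struct bch_dev *ca,
			      struct bucket *g, u32 sectors)
{
	struct bucket old, new;

	bucket_lock(g);
	old = *g;
	g->dirty_sectors += sectors;
	new = *g;
	bucket_unlock(g);

	bch2_dev_usage_update_m(c, ca, old, new, 0, true);
}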
*gc_bucket_array(struct bch_dev *ca) +{ + return rcu_dereference_check(ca->buckets_gc, !ca->fs || percpu_rwsem_is_held(&ca->fs->mark_lock) || lockdep_is_held(&ca->fs->gc_lock) || lockdep_is_held(&ca->bucket_lock)); } -static inline struct bucket_array *bucket_array(struct bch_dev *ca) +static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) { - return __bucket_array(ca, false); -} - -static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc) -{ - struct bucket_array *buckets = __bucket_array(ca, gc); + struct bucket_array *buckets = gc_bucket_array(ca); BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); return buckets->b + b; } -static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) -{ - return __bucket(ca, b, true); -} - -static inline struct bucket *bucket(struct bch_dev *ca, size_t b) -{ - return __bucket(ca, b, false); -} - static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) { return rcu_dereference_check(ca->bucket_gens, @@ -70,7 +50,6 @@ static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) percpu_rwsem_is_held(&ca->fs->mark_lock) || lockdep_is_held(&ca->fs->gc_lock) || lockdep_is_held(&ca->bucket_lock)); - } static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) @@ -81,16 +60,6 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) return gens->b + b; } -/* - * bucket_gc_gen() returns the difference between the bucket's current gen and - * the oldest gen of any pointer into that bucket in the btree. - */ - -static inline u8 bucket_gc_gen(struct bucket *g) -{ - return g->mark.gen - g->oldest_gen; -} - static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, const struct bch_extent_ptr *ptr) { @@ -141,62 +110,55 @@ static inline u8 ptr_stale(struct bch_dev *ca, return ret; } -/* bucket gc marks */ - -static inline bool is_available_bucket(struct bucket_mark mark) -{ - return !mark.dirty_sectors && !mark.stripe; -} - /* Device usage: */ struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); static inline u64 __dev_buckets_available(struct bch_dev *ca, - struct bch_dev_usage stats) + struct bch_dev_usage stats, + enum alloc_reserve reserve) { - u64 total = ca->mi.nbuckets - ca->mi.first_bucket; + s64 total = ca->mi.nbuckets - ca->mi.first_bucket; + s64 reserved = 0; + + switch (reserve) { + case RESERVE_NONE: + reserved += ca->mi.nbuckets >> 6; + fallthrough; + case RESERVE_MOVINGGC: + reserved += ca->nr_btree_reserve; + fallthrough; + case RESERVE_BTREE: + reserved += ca->nr_btree_reserve; + fallthrough; + case RESERVE_BTREE_MOVINGGC: + break; + default: + BUG(); + } if (WARN_ONCE(stats.buckets_unavailable > total, "buckets_unavailable overflow (%llu > %llu)\n", stats.buckets_unavailable, total)) return 0; - return total - stats.buckets_unavailable; + return max_t(s64, 0, + total - + stats.buckets_unavailable - + ca->nr_open_buckets - + reserved); } -static inline u64 dev_buckets_available(struct bch_dev *ca) +static inline u64 dev_buckets_available(struct bch_dev *ca, + enum alloc_reserve reserve) { - return __dev_buckets_available(ca, bch2_dev_usage_read(ca)); -} - -static inline u64 __dev_buckets_reclaimable(struct bch_dev *ca, - struct bch_dev_usage stats) -{ - struct bch_fs *c = ca->fs; - s64 available = __dev_buckets_available(ca, stats); - unsigned i; - - spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) - available -= fifo_used(&ca->free[i]); - available -= fifo_used(&ca->free_inc); - available -= ca->nr_open_buckets; - spin_unlock(&c->freelist_lock); - - return 
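/*
 * Worked example for the reserve cascade above: with nbuckets = 1024 and
 * nr_btree_reserve = 8, the fallthrough cases accumulate to
 *
 *	RESERVE_BTREE_MOVINGGC:	0 buckets held back
 *	RESERVE_BTREE:		8
 *	RESERVE_MOVINGGC:	8 + 8 = 16
 *	RESERVE_NONE:		16 + 1024/64 = 32
 *
 * so ordinary writes run out of buckets first while btree and copygc
 * allocations retain headroom. A hypothetical caller, for illustration:
 */
static u64 buckets_for_normal_writes(struct bch_dev *ca)
{
	return dev_buckets_available(ca, RESERVE_NONE);
}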
max(available, 0LL); -} - -static inline u64 dev_buckets_reclaimable(struct bch_dev *ca) -{ - return __dev_buckets_reclaimable(ca, bch2_dev_usage_read(ca)); + return __dev_buckets_available(ca, bch2_dev_usage_read(ca), reserve); } /* Filesystem usage: */ static inline unsigned fs_usage_u64s(struct bch_fs *c) { - return sizeof(struct bch_fs_usage) / sizeof(u64) + READ_ONCE(c->replicas.nr); } @@ -224,7 +186,6 @@ bch2_fs_usage_read_short(struct bch_fs *); void bch2_fs_usage_initialize(struct bch_fs *); -void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool); void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 2c73dc60..e79a3379 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -7,32 +7,15 @@ #define BUCKET_JOURNAL_SEQ_BITS 16 -struct bucket_mark { - union { - atomic64_t v; - - struct { - u8 gen; - u8 data_type:3, - owned_by_allocator:1, - stripe:1; - u16 dirty_sectors; - u16 cached_sectors; - }; - }; -}; - struct bucket { - union { - struct bucket_mark _mark; - const struct bucket_mark mark; - }; - - u64 io_time[2]; - u8 oldest_gen; - unsigned gen_valid:1; - u8 stripe_redundancy; - u32 stripe; + u8 lock; + u8 gen_valid:1; + u8 data_type:7; + u8 gen; + u8 stripe_redundancy; + u32 stripe; + u32 dirty_sectors; + u32 cached_sectors; }; struct bucket_array { @@ -111,7 +94,7 @@ struct copygc_heap_entry { u8 dev; u8 gen; u8 replicas; - u16 fragmentation; + u32 fragmentation; u32 sectors; u64 offset; }; diff --git a/libbcachefs/extent_update.c b/libbcachefs/extent_update.c index 58b2c96f..2fd5d967 100644 --- a/libbcachefs/extent_update.c +++ b/libbcachefs/extent_update.c @@ -15,17 +15,26 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; - unsigned ret = 0; + unsigned ret = 0, lru = 0; bkey_extent_entry_for_each(ptrs, entry) { switch (__extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: + /* Might also be updating LRU btree */ + if (entry->ptr.cached) + lru++; + + fallthrough; case BCH_EXTENT_ENTRY_stripe_ptr: ret++; } } - return ret; + /* + * Updating keys in the alloc btree may also update keys in the + * freespace or discard btrees: + */ + return lru + ret * 2; } static int count_iters_for_insert(struct btree_trans *trans, diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index eb556ecc..340f0bed 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -15,8 +15,8 @@ #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" +#include "journal_sb.h" #include "journal_seq_blacklist.h" -#include "super-io.h" #include @@ -767,86 +767,75 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, bool new_fs, struct closure *cl) { struct bch_fs *c = ca->fs; + struct journal *j = &c->journal; struct journal_device *ja = &ca->journal; - struct bch_sb_field_journal *journal_buckets; u64 *new_bucket_seq = NULL, *new_buckets = NULL; + struct open_bucket **ob = NULL; + long *bu = NULL; + unsigned i, nr_got = 0, nr_want = nr - ja->nr; + unsigned old_nr = ja->nr; + unsigned old_discard_idx = ja->discard_idx; + unsigned old_dirty_idx_ondisk = ja->dirty_idx_ondisk; + unsigned old_dirty_idx = ja->dirty_idx; + unsigned old_cur_idx = ja->cur_idx; int ret = 0; - /* don't handle reducing nr of buckets yet: */ - if (nr <= ja->nr) - return 0; + bch2_journal_block(j); + 
bch2_journal_flush_all_pins(j); + bu = kzalloc(nr_want * sizeof(*bu), GFP_KERNEL); + ob = kzalloc(nr_want * sizeof(*ob), GFP_KERNEL); new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); - if (!new_buckets || !new_bucket_seq) { + if (!bu || !ob || !new_buckets || !new_bucket_seq) { ret = -ENOMEM; - goto err; + goto err_unblock; } - journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, - nr + sizeof(*journal_buckets) / sizeof(u64)); - if (!journal_buckets) { - ret = -ENOSPC; - goto err; + for (nr_got = 0; nr_got < nr_want; nr_got++) { + if (new_fs) { + bu[nr_got] = bch2_bucket_alloc_new_fs(ca); + if (bu[nr_got] < 0) { + ret = -ENOSPC; + break; + } + } else { + ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_NONE, + false, cl); + if (IS_ERR(ob[nr_got])) { + ret = cl ? -EAGAIN : -ENOSPC; + break; + } + + bu[nr_got] = ob[nr_got]->bucket; + } } + if (!nr_got) + goto err_unblock; + /* * We may be called from the device add path, before the new device has * actually been added to the running filesystem: */ if (!new_fs) - spin_lock(&c->journal.lock); + spin_lock(&j->lock); memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); swap(new_buckets, ja->buckets); swap(new_bucket_seq, ja->bucket_seq); - if (!new_fs) - spin_unlock(&c->journal.lock); + for (i = 0; i < nr_got; i++) { + unsigned pos = ja->discard_idx ?: ja->nr; + long b = bu[i]; - while (ja->nr < nr) { - struct open_bucket *ob = NULL; - unsigned pos; - long b; - - if (new_fs) { - b = bch2_bucket_alloc_new_fs(ca); - if (b < 0) { - ret = -ENOSPC; - goto err; - } - } else { - rcu_read_lock(); - ob = bch2_bucket_alloc(c, ca, RESERVE_NONE, - false, cl); - rcu_read_unlock(); - if (IS_ERR(ob)) { - ret = cl ? 
-EAGAIN : -ENOSPC; - goto err; - } - - b = ob->bucket; - } - - if (c) - spin_lock(&c->journal.lock); - - /* - * XXX - * For resize at runtime, we should be writing the new - * superblock before inserting into the journal array - */ - - pos = ja->discard_idx ?: ja->nr; __array_insert_item(ja->buckets, ja->nr, pos); __array_insert_item(ja->bucket_seq, ja->nr, pos); - __array_insert_item(journal_buckets->buckets, ja->nr, pos); ja->nr++; ja->buckets[pos] = b; ja->bucket_seq[pos] = 0; - journal_buckets->buckets[pos] = cpu_to_le64(b); if (pos <= ja->discard_idx) ja->discard_idx = (ja->discard_idx + 1) % ja->nr; @@ -856,29 +845,54 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; if (pos <= ja->cur_idx) ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + } - if (c) - spin_unlock(&c->journal.lock); + ret = bch2_journal_buckets_to_sb(c, ca); + if (ret) { + /* Revert: */ + swap(new_buckets, ja->buckets); + swap(new_bucket_seq, ja->bucket_seq); + ja->nr = old_nr; + ja->discard_idx = old_discard_idx; + ja->dirty_idx_ondisk = old_dirty_idx_ondisk; + ja->dirty_idx = old_dirty_idx; + ja->cur_idx = old_cur_idx; + } - if (!new_fs) { + if (!new_fs) + spin_unlock(&j->lock); + + bch2_journal_unblock(j); + + if (ret) + goto err; + + if (!new_fs) { + for (i = 0; i < nr_got; i++) { ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, bch2_trans_mark_metadata_bucket(&trans, ca, - b, BCH_DATA_journal, + bu[i], BCH_DATA_journal, ca->mi.bucket_size)); - - bch2_open_bucket_put(c, ob); - - if (ret) + if (ret) { + bch2_fs_inconsistent(c, "error marking new journal buckets: %i", ret); goto err; + } } } err: - bch2_sb_resize_journal(&ca->disk_sb, - ja->nr + sizeof(*journal_buckets) / sizeof(u64)); + if (ob && !new_fs) + for (i = 0; i < nr_got; i++) + bch2_open_bucket_put(c, ob[i]); + kfree(new_bucket_seq); kfree(new_buckets); + kfree(ob); + kfree(bu); return ret; +err_unblock: + bch2_journal_unblock(j); + goto err; } /* @@ -891,11 +905,15 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, struct journal_device *ja = &ca->journal; struct closure cl; unsigned current_nr; - int ret; + int ret = 0; + + /* don't handle reducing nr of buckets yet: */ + if (nr < ja->nr) + return 0; closure_init_stack(&cl); - do { + while (ja->nr != nr && (ret == 0 || ret == -EAGAIN)) { struct disk_reservation disk_res = { 0, 0 }; closure_sync(&cl); @@ -923,7 +941,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, if (ja->nr != current_nr) bch2_write_super(c); mutex_unlock(&c->sb_lock); - } while (ret == -EAGAIN); + } return ret; } @@ -1092,9 +1110,20 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) struct journal_device *ja = &ca->journal; struct bch_sb_field_journal *journal_buckets = bch2_sb_get_journal(sb); + struct bch_sb_field_journal_v2 *journal_buckets_v2 = + bch2_sb_get_journal_v2(sb); unsigned i; - ja->nr = bch2_nr_journal_buckets(journal_buckets); + ja->nr = 0; + + if (journal_buckets_v2) { + unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); + + for (i = 0; i < nr; i++) + ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr); + } else if (journal_buckets) { + ja->nr = bch2_nr_journal_buckets(journal_buckets); + } ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->bucket_seq) @@ -1109,8 +1138,18 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) if (!ja->buckets) return -ENOMEM; - for (i = 0; i < ja->nr; i++) - ja->buckets[i] = 
le64_to_cpu(journal_buckets->buckets[i]); + if (journal_buckets_v2) { + unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); + unsigned j, dst = 0; + + for (i = 0; i < nr; i++) + for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) + ja->buckets[dst++] = + le64_to_cpu(journal_buckets_v2->d[i].start) + j; + } else if (journal_buckets) { + for (i = 0; i < ja->nr; i++) + ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); + } return 0; } diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index fb24ca21..bacb8058 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "alloc_background.h" #include "alloc_foreground.h" #include "btree_io.h" #include "btree_update_interior.h" @@ -1372,6 +1373,9 @@ static void journal_write_done(struct closure *cl) if (!JSET_NO_FLUSH(w->data)) { j->flushed_seq_ondisk = seq; j->last_seq_ondisk = w->last_seq; + + bch2_do_discards(c); + closure_wake_up(&c->freelist_wait); } } else if (!j->err_seq || seq < j->err_seq) j->err_seq = seq; diff --git a/libbcachefs/journal_sb.c b/libbcachefs/journal_sb.c new file mode 100644 index 00000000..0a8a0077 --- /dev/null +++ b/libbcachefs/journal_sb.c @@ -0,0 +1,222 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "journal_sb.h" + +#include + +/* BCH_SB_FIELD_journal: */ + +static int u64_cmp(const void *_l, const void *_r) +{ + const u64 *l = _l; + const u64 *r = _r; + + return cmp_int(*l, *r); +} + +static int bch2_sb_journal_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_journal *journal = field_to_type(f, journal); + struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; + int ret = -EINVAL; + unsigned nr; + unsigned i; + u64 *b; + + nr = bch2_nr_journal_buckets(journal); + if (!nr) + return 0; + + b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); + if (!b) + return -ENOMEM; + + for (i = 0; i < nr; i++) + b[i] = le64_to_cpu(journal->buckets[i]); + + sort(b, nr, sizeof(u64), u64_cmp, NULL); + + if (!b[0]) { + pr_buf(err, "journal bucket at sector 0"); + goto err; + } + + if (b[0] < le16_to_cpu(m->first_bucket)) { + pr_buf(err, "journal bucket %llu before first bucket %u", + b[0], le16_to_cpu(m->first_bucket)); + goto err; + } + + if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) { + pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)", + b[nr - 1], le64_to_cpu(m->nbuckets)); + goto err; + } + + for (i = 0; i + 1 < nr; i++) + if (b[i] == b[i + 1]) { + pr_buf(err, "duplicate journal buckets %llu", b[i]); + goto err; + } + + ret = 0; +err: + kfree(b); + return ret; +} + +static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_journal *journal = field_to_type(f, journal); + unsigned i, nr = bch2_nr_journal_buckets(journal); + + pr_buf(out, "Buckets: "); + for (i = 0; i < nr; i++) + pr_buf(out, " %llu", le64_to_cpu(journal->buckets[i])); + pr_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_journal = { + .validate = bch2_sb_journal_validate, + .to_text = bch2_sb_journal_to_text, +}; + +struct u64_range { + u64 start; + u64 end; +}; + +static int u64_range_cmp(const void *_l, const void *_r) +{ + const struct u64_range *l = _l; + const struct u64_range *r = _r; + + return cmp_int(l->start, r->start); +} + +static int bch2_sb_journal_v2_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct 
printbuf *err)
+{
+	struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
+	struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
+	int ret = -EINVAL;
+	unsigned nr;
+	unsigned i;
+	struct u64_range *b;
+
+	nr = bch2_sb_field_journal_v2_nr_entries(journal);
+	if (!nr)
+		return 0;
+
+	b = kmalloc_array(sizeof(*b), nr, GFP_KERNEL);
+	if (!b)
+		return -ENOMEM;
+
+	for (i = 0; i < nr; i++) {
+		b[i].start = le64_to_cpu(journal->d[i].start);
+		b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr);
+	}
+
+	sort(b, nr, sizeof(*b), u64_range_cmp, NULL);
+
+	if (!b[0].start) {
+		pr_buf(err, "journal bucket at sector 0");
+		goto err;
+	}
+
+	if (b[0].start < le16_to_cpu(m->first_bucket)) {
+		pr_buf(err, "journal bucket %llu before first bucket %u",
+		       b[0].start, le16_to_cpu(m->first_bucket));
+		goto err;
+	}
+
+	if (b[nr - 1].end > le64_to_cpu(m->nbuckets)) {
+		pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)",
+		       b[nr - 1].end, le64_to_cpu(m->nbuckets));
+		goto err;
+	}
+
+	for (i = 0; i + 1 < nr; i++) {
+		if (b[i].end == b[i + 1].start) {
+			pr_buf(err, "contiguous journal buckets ranges %llu-%llu, %llu-%llu",
+			       b[i].start, b[i].end, b[i + 1].start, b[i + 1].end);
+			goto err;
+		}
+
+		if (b[i].end > b[i + 1].start) {
+			pr_buf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu",
+			       b[i].start, b[i].end, b[i + 1].start, b[i + 1].end);
+			goto err;
+		}
+	}
+
+	ret = 0;
+err:
+	kfree(b);
+	return ret;
+}
+
+static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb,
+				       struct bch_sb_field *f)
+{
+	struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
+	unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal);
+
+	pr_buf(out, "Buckets: ");
+	for (i = 0; i < nr; i++)
+		pr_buf(out, " %llu-%llu",
+		       le64_to_cpu(journal->d[i].start),
+		       le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr));
+	pr_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = {
+	.validate	= bch2_sb_journal_v2_validate,
+	.to_text	= bch2_sb_journal_v2_to_text,
+};
+
+int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct journal_device *ja = &ca->journal;
+	struct bch_sb_field_journal_v2 *j;
+	unsigned i, dst = 0, nr = 1;
+
+	lockdep_assert_held(&c->sb_lock);
+
+	if (!ja->nr) {
+		bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
+		bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2);
+		return 0;
+	}
+
+	for (i = 0; i + 1 < ja->nr; i++)
+		if (ja->buckets[i] + 1 != ja->buckets[i + 1])
+			nr++;
+
+	j = bch2_sb_resize_journal_v2(&ca->disk_sb,
+				      (sizeof(*j) + sizeof(j->d[0]) * nr) / sizeof(u64));
+	if (!j)
+		return -ENOSPC;
+
+	bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
+
+	j->d[dst].start = cpu_to_le64(ja->buckets[0]);
+	j->d[dst].nr = cpu_to_le64(1);
+
+	for (i = 1; i < ja->nr; i++) {
+		if (ja->buckets[i] == ja->buckets[i - 1] + 1) {
+			le64_add_cpu(&j->d[dst].nr, 1);
+		} else {
+			dst++;
+			j->d[dst].start = cpu_to_le64(ja->buckets[i]);
+			j->d[dst].nr = cpu_to_le64(1);
+		}
+	}
+
+	return 0;
+}
diff --git a/libbcachefs/journal_sb.h b/libbcachefs/journal_sb.h
new file mode 100644
index 00000000..a39192e9
--- /dev/null
+++ b/libbcachefs/journal_sb.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include "super-io.h"
+#include "vstructs.h"
+
+static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
+{
+	return j
+		?
(__le64 *) vstruct_end(&j->field) - j->buckets + : 0; +} + +static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j) +{ + if (!j) + return 0; + + return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0]; +} + +extern const struct bch_sb_field_ops bch_sb_field_ops_journal; +extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2; + +int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *); diff --git a/libbcachefs/lru.c b/libbcachefs/lru.c new file mode 100644 index 00000000..1772ccb2 --- /dev/null +++ b/libbcachefs/lru.c @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "alloc_background.h" +#include "btree_iter.h" +#include "btree_update.h" +#include "error.h" +#include "lru.h" +#include "recovery.h" + +const char *bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + const struct bch_lru *lru = bkey_s_c_to_lru(k).v; + + if (bkey_val_bytes(k.k) < sizeof(*lru)) + return "incorrect value size"; + + return NULL; +} + +void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + const struct bch_lru *lru = bkey_s_c_to_lru(k).v; + + pr_buf(out, "idx %llu", le64_to_cpu(lru->idx)); +} + +static int lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + u64 existing_idx; + int ret = 0; + + if (!time) + return 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, + POS(id, time), + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_lru) { + bch2_fs_inconsistent(c, + "pointer to nonexistent lru %llu:%llu", + id, time); + ret = -EIO; + goto err; + } + + existing_idx = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); + if (existing_idx != idx) { + bch2_fs_inconsistent(c, + "lru %llu:%llu with wrong backpointer: got %llu, should be %llu", + id, time, existing_idx, idx); + ret = -EIO; + goto err; + } + + ret = bch2_btree_delete_at(trans, &iter, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int lru_set(struct btree_trans *trans, u64 lru_id, u64 idx, u64 *time) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_lru *lru; + int ret = 0; + + if (!*time) + return 0; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_lru, + POS(lru_id, *time), + BTREE_ITER_SLOTS| + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES, k, ret) + if (bkey_deleted(k.k)) + break; + + if (ret) + goto err; + + BUG_ON(iter.pos.inode != lru_id); + *time = iter.pos.offset; + + lru = bch2_trans_kmalloc(trans, sizeof(*lru)); + ret = PTR_ERR_OR_ZERO(lru); + if (ret) + goto err; + + bkey_lru_init(&lru->k_i); + lru->k.p = iter.pos; + lru->v.idx = cpu_to_le64(idx); + + ret = bch2_trans_update(trans, &iter, &lru->k_i, 0); + if (ret) + goto err; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_lru_change(struct btree_trans *trans, u64 id, u64 idx, + u64 old_time, u64 *new_time) +{ + if (old_time == *new_time) + return 0; + + return lru_delete(trans, id, idx, old_time) ?: + lru_set(trans, id, idx, new_time); +} + +static int bch2_check_lru_key(struct btree_trans *trans, + struct btree_iter *lru_iter, bool initial) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c lru_k, k; + struct bkey_alloc_unpacked a; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + u64 idx; + int ret; + + lru_k 
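/*
 * A hedged usage sketch for bch2_lru_change() above: LRU keys live at
 * POS(lru id, time) and carry the bucket index as their value; lru_set()
 * scans forward from the requested time for an unused slot and writes
 * the slot it actually used back through *time, which is why the final
 * argument is a pointer. In the alloc update path this looks roughly
 * like the following (illustrative, not the exact call site):
 */
static int requeue_cached_bucket(struct btree_trans *trans,
				 struct bkey_alloc_unpacked *old,
				 struct bkey_alloc_unpacked *new)
{
	/* move the bucket's entry from its old read_time to the new one */
	return bch2_lru_change(trans, new->dev, new->bucket,
			       old->read_time, &new->read_time);
}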
= bch2_btree_iter_peek(lru_iter); + if (!lru_k.k) + return 0; + + ret = bkey_err(lru_k); + if (ret) + return ret; + + idx = le64_to_cpu(bkey_s_c_to_lru(lru_k).v->idx); + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + POS(lru_k.k->p.inode, idx), 0); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + a = bch2_alloc_unpack(k); + + if (fsck_err_on(bucket_state(a) != BUCKET_cached || + a.read_time != lru_k.k->p.offset, c, + "incorrect lru entry %s\n" + " for %s", + (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), + (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { + struct bkey_i *update = + bch2_trans_kmalloc(trans, sizeof(*update)); + + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + bkey_init(&update->k); + update->k.p = lru_iter->pos; + + ret = bch2_trans_update(trans, lru_iter, update, 0); + if (ret) + goto err; + } +err: +fsck_err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf2); + printbuf_exit(&buf1); + return ret; +} + +int bch2_check_lrus(struct bch_fs *c, bool initial) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_lru, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_check_lru_key(&trans, &iter, initial)); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + return ret; + +} diff --git a/libbcachefs/lru.h b/libbcachefs/lru.h new file mode 100644 index 00000000..4db6a839 --- /dev/null +++ b/libbcachefs/lru.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_LRU_H +#define _BCACHEFS_LRU_H + +const char *bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_lru (struct bkey_ops) { \ + .key_invalid = bch2_lru_invalid, \ + .val_to_text = bch2_lru_to_text, \ +} + +int bch2_lru_change(struct btree_trans *, u64, u64, u64, u64 *); + +int bch2_check_lrus(struct bch_fs *, bool); + +#endif /* _BCACHEFS_LRU_H */ diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index c82ecff3..466975a3 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -119,18 +119,6 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, return DATA_SKIP; } -static bool have_copygc_reserve(struct bch_dev *ca) -{ - bool ret; - - spin_lock(&ca->fs->freelist_lock); - ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || - ca->allocator_state != ALLOCATOR_running; - spin_unlock(&ca->fs->freelist_lock); - - return ret; -} - static inline int fragmentation_cmp(copygc_heap *heap, struct copygc_heap_entry l, struct copygc_heap_entry r) @@ -165,7 +153,7 @@ static int walk_buckets_to_copygc(struct bch_fs *c) .dev = iter.pos.inode, .gen = u.gen, .replicas = 1 + u.stripe_redundancy, - .fragmentation = u.dirty_sectors * (1U << 15) + .fragmentation = (u64) u.dirty_sectors * (1ULL << 31) / ca->mi.bucket_size, .sectors = u.dirty_sectors, .offset = bucket_to_sector(ca, iter.pos.offset), @@ -262,11 +250,10 @@ static int bch2_copygc(struct bch_fs *c) } for_each_rw_member(ca, c, dev_idx) { - closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); + s64 avail = min(dev_buckets_available(ca, RESERVE_MOVINGGC), + ca->mi.nbuckets >> 6); - spin_lock(&ca->fs->freelist_lock); - sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size; - spin_unlock(&ca->fs->freelist_lock); + 
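/*
 * The copygc heap weight above is a fixed-point fraction: dirty_sectors
 * is scaled by 2^31 and divided by the bucket size, so an empty bucket
 * maps to 0 and a completely dirty one to 1U << 31, which fits the
 * fragmentation field now that it is 32 bits wide. A sketch of the
 * scaling:
 */
static inline u32 bucket_fullness_sketch(u32 dirty_sectors, unsigned bucket_size)
{
	return (u64) dirty_sectors * (1ULL << 31) / bucket_size;
}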
sectors_reserved += avail * ca->mi.bucket_size; } ret = walk_buckets_to_copygc(c); @@ -367,8 +354,8 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) for_each_rw_member(ca, c, dev_idx) { struct bch_dev_usage usage = bch2_dev_usage_read(ca); - fragmented_allowed = ((__dev_buckets_reclaimable(ca, usage) * - ca->mi.bucket_size) >> 1); + fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_NONE) * + ca->mi.bucket_size) >> 1); fragmented = usage.d[BCH_DATA_user].fragmented; wait = min(wait, max(0LL, fragmented_allowed - fragmented)); diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 033115f7..70b507fb 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -265,7 +265,7 @@ enum opt_type { x(discard, u8, \ OPT_FS|OPT_MOUNT|OPT_DEVICE, \ OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, true, \ NULL, "Enable discard/TRIM support") \ x(verbose, u8, \ OPT_FS|OPT_MOUNT, \ diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 88797155..fe2c5cb6 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -16,6 +16,7 @@ #include "journal_io.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" +#include "lru.h" #include "move.h" #include "quota.h" #include "recovery.h" @@ -1027,8 +1028,8 @@ int bch2_fs_recovery(struct bch_fs *c) bch_info(c, "filesystem version is prior to subvol_dirent - upgrading"); c->opts.version_upgrade = true; c->opts.fsck = true; - } else if (c->sb.version < bcachefs_metadata_version_inode_v2) { - bch_info(c, "filesystem version is prior to inode_v2 - upgrading"); + } else if (c->sb.version < bcachefs_metadata_version_freespace) { + bch_info(c, "filesystem version is prior to freespace - upgrading"); c->opts.version_upgrade = true; } } @@ -1137,7 +1138,7 @@ use_clean: err = "error reading allocation information"; down_read(&c->gc_lock); - ret = bch2_alloc_read(c, false, false); + ret = bch2_alloc_read(c); up_read(&c->gc_lock); if (ret) @@ -1165,13 +1166,27 @@ use_clean: bool metadata_only = c->opts.norecovery; bch_info(c, "checking allocations"); - err = "error in mark and sweep"; + err = "error checking allocations"; ret = bch2_gc(c, true, metadata_only); if (ret) goto err; bch_verbose(c, "done checking allocations"); } + if (c->opts.fsck && + c->sb.version >= bcachefs_metadata_version_freespace) { + bch_info(c, "checking need_discard and freespace btrees"); + err = "error checking need_discard and freespace btrees"; + ret = bch2_check_alloc_info(c, true); + if (ret) + goto err; + + ret = bch2_check_lrus(c, true); + if (ret) + goto err; + bch_verbose(c, "done checking need_discard and freespace btrees"); + } + bch2_stripes_heap_start(c); clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); @@ -1196,6 +1211,11 @@ use_clean: if (c->opts.verbose || !c->sb.clean) bch_info(c, "journal replay done"); + err = "error initializing freespace"; + ret = bch2_fs_freespace_init(c); + if (ret) + goto err; + if (c->sb.version < bcachefs_metadata_version_snapshot_2) { bch2_fs_lazy_rw(c); @@ -1368,6 +1388,7 @@ int bch2_fs_initialize(struct bch_fs *c) * Write out the superblock and journal buckets, now that we can do * btree updates */ + bch_verbose(c, "marking superblocks"); err = "error marking superblock and journal"; for_each_member_device(ca, c, i) { ret = bch2_trans_mark_dev_sb(c, ca); @@ -1379,6 +1400,12 @@ int bch2_fs_initialize(struct bch_fs *c) ca->new_fs_bucket_idx = 0; } + bch_verbose(c, "initializing freespace"); + err = "error initializing freespace"; + ret = bch2_fs_freespace_init(c); + if (ret) + goto err; + 
err = "error creating root snapshot node"; ret = bch2_fs_initialize_subvolumes(c); if (ret) diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index e17ce91c..95af515a 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -10,6 +10,7 @@ #include "io.h" #include "journal.h" #include "journal_io.h" +#include "journal_sb.h" #include "journal_seq_blacklist.h" #include "replicas.h" #include "quota.h" @@ -424,7 +425,7 @@ static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) memcpy(dst->compat, src->compat, sizeof(dst->compat)); for (i = 0; i < BCH_SB_FIELD_NR; i++) { - if (i == BCH_SB_FIELD_journal) + if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS) continue; src_f = bch2_sb_field_get(src, i); @@ -898,85 +899,6 @@ void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) mutex_unlock(&c->sb_lock); } -/* BCH_SB_FIELD_journal: */ - -static int u64_cmp(const void *_l, const void *_r) -{ - u64 l = *((const u64 *) _l), r = *((const u64 *) _r); - - return l < r ? -1 : l > r ? 1 : 0; -} - -static int bch2_sb_journal_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) -{ - struct bch_sb_field_journal *journal = field_to_type(f, journal); - struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; - int ret = -EINVAL; - unsigned nr; - unsigned i; - u64 *b; - - nr = bch2_nr_journal_buckets(journal); - if (!nr) - return 0; - - b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); - if (!b) - return -ENOMEM; - - for (i = 0; i < nr; i++) - b[i] = le64_to_cpu(journal->buckets[i]); - - sort(b, nr, sizeof(u64), u64_cmp, NULL); - - if (!b[0]) { - pr_buf(err, "journal bucket at sector 0"); - goto err; - } - - if (b[0] < le16_to_cpu(m->first_bucket)) { - pr_buf(err, "journal bucket %llu before first bucket %u", - b[0], le16_to_cpu(m->first_bucket)); - goto err; - } - - if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) { - pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)", - b[nr - 1], le64_to_cpu(m->nbuckets)); - goto err; - } - - for (i = 0; i + 1 < nr; i++) - if (b[i] == b[i + 1]) { - pr_buf(err, "duplicate journal buckets %llu", b[i]); - goto err; - } - - ret = 0; -err: - kfree(b); - return ret; -} - -static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_journal *journal = field_to_type(f, journal); - unsigned i, nr = bch2_nr_journal_buckets(journal); - - pr_buf(out, "Buckets: "); - for (i = 0; i < nr; i++) - pr_buf(out, " %llu", le64_to_cpu(journal->buckets[i])); - pr_newline(out); -} - -static const struct bch_sb_field_ops bch_sb_field_ops_journal = { - .validate = bch2_sb_journal_validate, - .to_text = bch2_sb_journal_to_text, -}; - /* BCH_SB_FIELD_members: */ static int bch2_sb_members_validate(struct bch_sb *sb, @@ -1130,6 +1052,11 @@ static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, pr_buf(out, "%llu", BCH_MEMBER_DISCARD(m)); pr_newline(out); + pr_buf(out, "Freespace initialized:"); + pr_tab(out); + pr_buf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m)); + pr_newline(out); + pr_indent_pop(out, 2); } } diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h index 50f31a3b..14a25f6f 100644 --- a/libbcachefs/super-io.h +++ b/libbcachefs/super-io.h @@ -75,15 +75,6 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) __bch2_check_set_feature(c, feat); } -/* BCH_SB_FIELD_journal: */ - -static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) -{ - return j - ? 
(__le64 *) vstruct_end(&j->field) - j->buckets - : 0; -} - /* BCH_SB_FIELD_members: */ static inline bool bch2_member_exists(struct bch_member *m) @@ -112,6 +103,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) .durability = BCH_MEMBER_DURABILITY(mi) ? BCH_MEMBER_DURABILITY(mi) - 1 : 1, + .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), .valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)), }; } diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 46947163..6464e8c0 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -199,17 +199,9 @@ static void __bch2_fs_read_only(struct bch_fs *c) */ bch2_journal_flush_all_pins(&c->journal); - /* - * If the allocator threads didn't all start up, the btree updates to - * write out alloc info aren't going to work: - */ - if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) - goto nowrote_alloc; - bch_verbose(c, "flushing journal and stopping allocators"); bch2_journal_flush_all_pins(&c->journal); - set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); do { clean_passes++; @@ -234,17 +226,11 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch_verbose(c, "flushing journal and stopping allocators complete"); set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); -nowrote_alloc: + closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c)); flush_work(&c->btree_interior_update_work); - for_each_member_device(ca, c, i) - bch2_dev_allocator_stop(ca); - - clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); - clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); - bch2_fs_journal_stop(&c->journal); /* @@ -280,10 +266,6 @@ void bch2_fs_read_only(struct bch_fs *c) /* * Block new foreground-end write operations from starting - any new * writes will return -EROFS: - * - * (This is really blocking new _allocations_, writes to previously - * allocated space can still happen until stopping the allocator in - * bch2_dev_allocator_stop()). 
*/ percpu_ref_kill(&c->writes); @@ -412,19 +394,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - for_each_rw_member(ca, c, i) { - ret = bch2_dev_allocator_start(ca); - if (ret) { - bch_err(c, "error starting allocator threads"); - percpu_ref_put(&ca->io_ref); - goto err; - } - } - - set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); - - for_each_rw_member(ca, c, i) - bch2_wake_allocator(ca); + bch2_do_discards(c); if (!early) { ret = bch2_fs_read_write_late(c); @@ -941,20 +911,6 @@ int bch2_fs_start(struct bch_fs *c) set_bit(BCH_FS_STARTED, &c->flags); - /* - * Allocator threads don't start filling copygc reserve until after we - * set BCH_FS_STARTED - wake them now: - * - * XXX ugly hack: - * Need to set ca->allocator_state here instead of relying on the - * allocator threads to do it to avoid racing with the copygc threads - * checking it and thinking they have no alloc reserve: - */ - for_each_online_member(ca, c, i) { - ca->allocator_state = ALLOCATOR_running; - bch2_wake_allocator(ca); - } - if (c->opts.read_only || c->opts.nochanges) { bch2_fs_read_only(c); } else { @@ -1046,8 +1002,6 @@ static void bch2_dev_release(struct kobject *kobj) static void bch2_dev_free(struct bch_dev *ca) { - bch2_dev_allocator_stop(ca); - cancel_work_sync(&ca->io_error_work); if (ca->kobj.state_in_sysfs && @@ -1162,6 +1116,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, ca->mi = bch2_mi_to_cpu(member); ca->uuid = member->uuid; + ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, + ca->mi.bucket_size / btree_sectors(c)); + if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL) || percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, @@ -1211,12 +1168,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ca->fs = c; - if (ca->mi.state == BCH_MEMBER_STATE_rw && - bch2_dev_allocator_start(ca)) { - bch2_dev_free(ca); - goto err; - } - bch2_dev_attach(c, ca, dev_idx); out: pr_verbose_init(c->opts, "ret %i", ret); @@ -1402,14 +1353,13 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) /* * The allocator thread itself allocates btree nodes, so stop it first: */ - bch2_dev_allocator_stop(ca); bch2_dev_allocator_remove(c, ca); bch2_dev_journal_stop(&c->journal, ca); bch2_copygc_start(c); } -static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) +static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) { lockdep_assert_held(&c->state_lock); @@ -1417,8 +1367,6 @@ static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - - return bch2_dev_allocator_start(ca); } int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, @@ -1445,7 +1393,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, mutex_unlock(&c->sb_lock); if (new_state == BCH_MEMBER_STATE_rw) - ret = __bch2_dev_read_write(c, ca); + __bch2_dev_read_write(c, ca); rebalance_wakeup(c); @@ -1468,30 +1416,20 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) { - struct btree_trans trans; - size_t i; + struct bpos start = POS(ca->dev_idx, 0); + struct bpos end = POS(ca->dev_idx, U64_MAX); int ret; - bch2_trans_init(&trans, c, 0, 0); - - for (i = 0; i < ca->mi.nbuckets; i++) { - ret = lockrestart_do(&trans, - bch2_btree_key_cache_flush(&trans, - BTREE_ID_alloc, POS(ca->dev_idx, i))); - if (ret) - break; - } - 
bch2_trans_exit(&trans); - - if (ret) { + ret = bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, + BTREE_TRIGGER_NORUN, NULL); + if (ret) bch_err(c, "error %i removing dev alloc info", ret); - return ret; - } - return bch2_btree_delete_range(c, BTREE_ID_alloc, - POS(ca->dev_idx, 0), - POS(ca->dev_idx + 1, 0), - 0, NULL); + return ret; } int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) @@ -1709,15 +1647,16 @@ have_slot: goto err_late; } + ret = bch2_fs_freespace_init(c); + if (ret) { + bch_err(c, "device add error: error initializing free space: %i", ret); + goto err_late; + } + ca->new_fs_bucket_idx = 0; - if (ca->mi.state == BCH_MEMBER_STATE_rw) { - ret = __bch2_dev_read_write(c, ca); - if (ret) { - bch_err(c, "device add error: error going RW on new device: %i", ret); - goto err_late; - } - } + if (ca->mi.state == BCH_MEMBER_STATE_rw) + __bch2_dev_read_write(c, ca); up_write(&c->state_lock); return 0; @@ -1777,11 +1716,8 @@ int bch2_dev_online(struct bch_fs *c, const char *path) goto err; } - if (ca->mi.state == BCH_MEMBER_STATE_rw) { - ret = __bch2_dev_read_write(c, ca); - if (ret) - goto err; - } + if (ca->mi.state == BCH_MEMBER_STATE_rw) + __bch2_dev_read_write(c, ca); mutex_lock(&c->sb_lock); mi = bch2_sb_get_members(c->disk_sb.sb); diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h index d8b159a5..89419fc7 100644 --- a/libbcachefs/super_types.h +++ b/libbcachefs/super_types.h @@ -32,6 +32,7 @@ struct bch_member_cpu { u8 discard; u8 data_allowed; u8 durability; + u8 freespace_initialized; u8 valid; }; diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 3d6ece51..bed48afb 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -170,7 +170,6 @@ read_attribute(congested); read_attribute(btree_avg_write_size); -read_attribute(reserve_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); read_attribute(journal_debug); @@ -185,11 +184,11 @@ read_attribute(internal_uuid); read_attribute(has_data); read_attribute(alloc_debug); -write_attribute(wake_allocator); read_attribute(read_realloc_races); read_attribute(extent_migrate_done); read_attribute(extent_migrate_raced); +read_attribute(bucket_alloc_fail); rw_attribute(discard); rw_attribute(label); @@ -376,6 +375,8 @@ SHOW(bch2_fs) atomic_long_read(&c->extent_migrate_done)); sysfs_print(extent_migrate_raced, atomic_long_read(&c->extent_migrate_raced)); + sysfs_print(bucket_alloc_fail, + atomic_long_read(&c->bucket_alloc_fail)); sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); @@ -572,6 +573,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_read_realloc_races, &sysfs_extent_migrate_done, &sysfs_extent_migrate_raced, + &sysfs_bucket_alloc_fail, &sysfs_gc_gens_pos, @@ -698,24 +700,6 @@ struct attribute *bch2_fs_time_stats_files[] = { NULL }; -static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca) -{ - enum alloc_reserve i; - - spin_lock(&ca->fs->freelist_lock); - - pr_buf(out, "free_inc:\t%zu\t%zu\n", - fifo_used(&ca->free_inc), - ca->free_inc.size); - - for (i = 0; i < RESERVE_NR; i++) - pr_buf(out, "free[%u]:\t%zu\t%zu\n", i, - fifo_used(&ca->free[i]), - ca->free[i].size); - - spin_unlock(&ca->fs->freelist_lock); -} - static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) { struct bch_fs *c = ca->fs; @@ -741,9 
+725,6 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) "ec\t%16llu\n" "available%15llu\n" "\n" - "free_inc\t\t%zu/%zu\n" - "free[RESERVE_MOVINGGC]\t%zu/%zu\n" - "free[RESERVE_NONE]\t%zu/%zu\n" "freelist_wait\t\t%s\n" "open buckets allocated\t%u\n" "open buckets this dev\t%u\n" @@ -751,13 +732,9 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) "open_buckets_wait\t%s\n" "open_buckets_btree\t%u\n" "open_buckets_user\t%u\n" - "btree reserve cache\t%u\n" - "thread state:\t\t%s\n", + "btree reserve cache\t%u\n", stats.buckets_ec, - __dev_buckets_available(ca, stats), - fifo_used(&ca->free_inc), ca->free_inc.size, - fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, - fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, + __dev_buckets_available(ca, stats, RESERVE_NONE), c->freelist_wait.list.first ? "waiting" : "empty", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free, ca->nr_open_buckets, @@ -765,8 +742,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) c->open_buckets_wait.list.first ? "waiting" : "empty", nr[BCH_DATA_btree], nr[BCH_DATA_user], - c->btree_reserve_cache_nr, - bch2_allocator_states[ca->allocator_state]); + c->btree_reserve_cache_nr); } static const char * const bch2_rw[] = { @@ -841,9 +817,6 @@ SHOW(bch2_dev) clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) * 100 / CONGESTED_MAX); - if (attr == &sysfs_reserve_stats) - reserve_stats_to_text(out, ca); - if (attr == &sysfs_alloc_debug) dev_alloc_debug_to_text(out, ca); @@ -883,9 +856,6 @@ STORE(bch2_dev) return ret; } - if (attr == &sysfs_wake_allocator) - bch2_wake_allocator(ca); - return size; } SYSFS_OPS(bch2_dev); @@ -911,11 +881,8 @@ struct attribute *bch2_dev_files[] = { &sysfs_io_latency_stats_write, &sysfs_congested, - &sysfs_reserve_stats, - /* debug: */ &sysfs_alloc_debug, - &sysfs_wake_allocator, NULL };
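
Notes on selected hunks follow. Each sketch is standalone user-space C written to illustrate a mechanism in the patch; any name, type, or number not present in the patch itself is made up for illustration.

The bch2_nr_journal_buckets() helper removed from super-io.h (and its journal_v2 counterpart in the new journal_sb.h) derive an entry count purely from pointer arithmetic against the end of a variable-length superblock field. A minimal model of that calculation, with simplified stand-in types: here u64s counts the whole field, header included, which is this model's convention; the real bch_sb_field layout and vstruct_end() macro differ in detail.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct sb_field {
	uint64_t	u64s;		/* total field size in u64s, header included */
	uint64_t	type;
};

struct sb_field_journal {
	struct sb_field	field;
	uint64_t	buckets[];	/* __le64 in the real struct */
};

static void *vstruct_end(struct sb_field *f)
{
	return (uint64_t *) f + f->u64s;
}

static unsigned nr_journal_buckets(struct sb_field_journal *j)
{
	return j
		? (unsigned) ((uint64_t *) vstruct_end(&j->field) - j->buckets)
		: 0;
}

int main(void)
{
	unsigned nr = 3;
	struct sb_field_journal *j = calloc(1, sizeof(*j) + nr * sizeof(uint64_t));

	j->field.u64s = 2 + nr;	/* header (2 u64s in this model) + 3 entries */
	printf("nr buckets: %u\n", nr_journal_buckets(j));	/* prints 3 */
	free(j);
	return 0;
}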
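The lru.c hunks maintain the new lru btree as (lru id, time) -> bucket idx entries: repositioning a bucket is lru_delete() of the old entry, which also verifies the backpointer, followed by lru_set(), which probes forward from the requested time to the first unoccupied slot so equal timestamps never collide. A minimal user-space model of that shape, using a flat table in place of the btree and idx == 0 as the stand-in for a deleted key; it uses the GNU C "a ?: b" chaining that bch2_lru_change() itself uses, so it needs gcc or clang.

#include <stdint.h>
#include <stdio.h>

#define NR_SLOTS 16

static uint64_t lru[NR_SLOTS];	/* slot number = time, value = bucket idx */

static int lru_delete(uint64_t idx, uint64_t time)
{
	if (!time)
		return 0;
	if (time >= NR_SLOTS || lru[time] != idx)
		return -1;	/* bad backpointer: the real code flags the fs inconsistent */
	lru[time] = 0;
	return 0;
}

static int lru_set(uint64_t idx, uint64_t *time)
{
	uint64_t t;

	if (!*time)
		return 0;
	for (t = *time; t < NR_SLOTS; t++)	/* probe for the first free slot */
		if (!lru[t]) {
			lru[t] = idx;
			*time = t;	/* tell the caller which slot was used */
			return 0;
		}
	return -1;	/* table full; the real btree is effectively unbounded */
}

static int lru_change(uint64_t idx, uint64_t old_time, uint64_t *new_time)
{
	if (old_time == *new_time)
		return 0;
	return lru_delete(idx, old_time) ?: lru_set(idx, new_time);
}

int main(void)
{
	uint64_t t1 = 3, t2 = 3;

	lru_change(42, 0, &t1);	/* fresh insert at time 3 */
	lru_change(43, 0, &t2);	/* same time: probes forward to slot 4 */
	printf("bucket 42 -> time %llu, bucket 43 -> time %llu\n",
	       (unsigned long long) t1, (unsigned long long) t2);
	return 0;
}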
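In walk_buckets_to_copygc(), the heap weight changes from a 15-bit to a 31-bit fixed-point fraction of bucket fullness, computed in 64-bit arithmetic. The extra precision matters once bucket_size exceeds 2^15 sectors: with a 32M bucket the old expression cannot distinguish buckets whose dirty_sectors differ by one, so the fragmentation heap cannot order them. A small demo of both expressions, with made-up values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t bucket_size = 65536;	/* 32M bucket in 512-byte sectors */
	uint32_t a = 60000, b = 60001;	/* two nearly equally full buckets */

	/* old: 15 fractional bits, 32-bit math: both come out 30000 */
	printf("15-bit weights: %u %u\n",
	       a * (1U << 15) / bucket_size,
	       b * (1U << 15) / bucket_size);

	/* new: 31 fractional bits, 64-bit math: distinct weights */
	printf("31-bit weights: %llu %llu\n",
	       (unsigned long long) ((uint64_t) a * (1ULL << 31) / bucket_size),
	       (unsigned long long) ((uint64_t) b * (1ULL << 31) / bucket_size));
	return 0;
}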
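bch2_copygc() no longer blocks in have_copygc_reserve(): with the allocator threads and their RESERVE_MOVINGGC fifo gone, it estimates its reserve directly as the device's available buckets clamped to 1/64th of the device. A tiny demo of the clamp; the device geometry and availability numbers are invented:

#include <stdint.h>
#include <stdio.h>

static inline int64_t min_i64(int64_t a, int64_t b) { return a < b ? a : b; }

int main(void)
{
	uint64_t nbuckets = 1ULL << 20;	/* 2^20 buckets of 2048 sectors: ~1T */
	uint64_t bucket_size = 2048;
	int64_t available = 300000;	/* from bch2_dev_usage_read() in the real code */
	int64_t avail = min_i64(available, nbuckets >> 6);	/* clamped to 16384 */

	printf("sectors_reserved: %lld\n", (long long) (avail * (int64_t) bucket_size));
	return 0;
}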
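bch2_dev_remove_alloc() now deletes the device's keys from the alloc, freespace, and need_discard btrees as three whole ranges rather than flushing the key cache bucket by bucket, chaining the calls with GNU C's "a ?: b": the expression yields a when a is nonzero (an error) and only then skips b, so the first failure short-circuits the remaining deletions. A tiny standalone demo of the idiom; step1/step2/step3 are made up:

#include <stdio.h>

static int step1(void) { return 0; }
static int step2(void) { return -5; }	/* fails, so step3 is never evaluated */
static int step3(void) { puts("step3 ran"); return 0; }

int main(void)
{
	int ret = step1() ?: step2() ?: step3();

	printf("ret %d\n", ret);	/* -5; "step3 ran" is not printed */
	return 0;
}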