From fc06a0ea5e552663e9e47de941fbc7e621d4ca46 Mon Sep 17 00:00:00 2001 From: Kent Overstreet <kent.overstreet@linux.dev> Date: Mon, 17 Jun 2024 11:31:26 -0400 Subject: [PATCH] Update bcachefs sources to c56e1ec97dfd bcachefs: Fix bch2_sb_downgrade_update() Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev> --- .bcachefs_revision | 2 +- c_src/libbcachefs.h | 1 + include/linux/srcu.h | 2 + libbcachefs/alloc_background.c | 46 ++++- libbcachefs/alloc_background.h | 8 +- libbcachefs/bcachefs.h | 11 +- libbcachefs/bcachefs_format.h | 8 +- libbcachefs/bkey.c | 2 +- libbcachefs/bkey.h | 1 - libbcachefs/bkey_methods.c | 6 +- libbcachefs/bkey_methods.h | 3 +- libbcachefs/btree_cache.c | 17 +- libbcachefs/btree_cache.h | 2 + libbcachefs/btree_gc.c | 37 +++- libbcachefs/btree_gc.h | 11 +- libbcachefs/btree_gc_types.h | 13 +- libbcachefs/btree_io.c | 8 +- libbcachefs/btree_iter.c | 38 ++-- libbcachefs/btree_iter.h | 5 +- libbcachefs/btree_key_cache.c | 83 ++++---- libbcachefs/btree_locking.h | 1 + libbcachefs/btree_node_scan.c | 9 +- libbcachefs/btree_trans_commit.c | 2 +- libbcachefs/btree_update_interior.c | 18 ++ libbcachefs/btree_update_interior.h | 1 + libbcachefs/buckets.c | 304 ++++++++++++++++------------ libbcachefs/buckets.h | 17 +- libbcachefs/buckets_types.h | 2 + libbcachefs/chardev.c | 2 +- libbcachefs/data_update.c | 3 +- libbcachefs/disk_accounting.c | 250 ++++++++++++----------- libbcachefs/disk_accounting.h | 46 +++-- libbcachefs/disk_accounting_types.h | 11 +- libbcachefs/ec.c | 26 ++- libbcachefs/error.c | 2 + libbcachefs/error.h | 2 + libbcachefs/extents.c | 9 +- libbcachefs/eytzinger.h | 11 + libbcachefs/fs-common.h | 1 - libbcachefs/fs-io-buffered.c | 9 +- libbcachefs/fs-ioctl.c | 17 +- libbcachefs/fs.c | 7 +- libbcachefs/fsck.c | 3 + libbcachefs/inode.h | 1 - libbcachefs/io_read.c | 39 ++-- libbcachefs/io_write.c | 19 +- libbcachefs/journal.c | 3 + libbcachefs/lru.h | 3 - libbcachefs/movinggc.c | 7 +- libbcachefs/recovery.c | 8 +- libbcachefs/replicas.c | 15 +- libbcachefs/sb-downgrade.c | 2 +- libbcachefs/sb-errors_format.h | 4 +- libbcachefs/snapshot.c | 7 - libbcachefs/super-io.c | 7 +- libbcachefs/super.c | 13 +- libbcachefs/util.h | 17 +- 57 files changed, 728 insertions(+), 474 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index ee1ada75..3b47f3a1 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -4f69e2d39ad3d12f724e00fafb910f85af79a7cc +c56e1ec97dfdbb888691d2fc6ebb06d7df25e8dc diff --git a/c_src/libbcachefs.h b/c_src/libbcachefs.h index 60332bb8..fc6eb8bf 100644 --- a/c_src/libbcachefs.h +++ b/c_src/libbcachefs.h @@ -4,6 +4,7 @@ #include <linux/uuid.h> #include <stdbool.h> +#include "libbcachefs/bcachefs.h" #include "libbcachefs/bcachefs_format.h" #include "libbcachefs/bcachefs_ioctl.h" #include "libbcachefs/inode.h" diff --git a/include/linux/srcu.h b/include/linux/srcu.h index e667df22..1c816804 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -26,6 +26,8 @@ static inline unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp) return 0; } +static inline void synchronize_srcu_expedited(struct srcu_struct *ssp) {} + static inline void cleanup_srcu_struct(struct srcu_struct *ssp) {} static inline int init_srcu_struct(struct srcu_struct *ssp) diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index c220e9c9..e5f856f4 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -260,6 +260,14 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, "invalid data type (got %u should be %u)", a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); + for (unsigned i = 0; i < 2; i++) + bkey_fsck_err_on(a.v->io_time[i] > LRU_TIME_MAX, + c, err, + alloc_key_io_time_bad, + "invalid io_time[%s]: %llu, max %llu", + i == READ ? "read" : "write", + a.v->io_time[i], LRU_TIME_MAX); + unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(a.v) * sizeof(u64) > offsetof(struct bch_alloc_v4, stripe_sectors) ? a.v->stripe_sectors @@ -810,6 +818,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; int ret = 0; struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p); @@ -824,8 +833,8 @@ int bch2_trigger_alloc(struct btree_trans *trans, alloc_data_type_set(new_a, new_a->data_type); if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) { - new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); + new_a->io_time[READ] = bch2_current_io_time(c, READ); + new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE); SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); } @@ -848,7 +857,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (new_a->data_type == BCH_DATA_cached && !new_a->io_time[READ]) - new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + new_a->io_time[READ] = bch2_current_io_time(c, READ); u64 old_lru = alloc_lru_idx_read(*old_a); u64 new_lru = alloc_lru_idx_read(*new_a); @@ -925,9 +934,14 @@ int bch2_trigger_alloc(struct btree_trans *trans, } if (new_a->gen != old_a->gen) { - percpu_down_read(&c->mark_lock); - *bucket_gen(ca, new.k->p.offset) = new_a->gen; - percpu_up_read(&c->mark_lock); + rcu_read_lock(); + u8 *gen = bucket_gen(ca, new.k->p.offset); + if (unlikely(!gen)) { + rcu_read_unlock(); + goto invalid_bucket; + } + *gen = new_a->gen; + rcu_read_unlock(); } #define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) @@ -939,7 +953,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, closure_wake_up(&c->freelist_wait); if (statechange(a->data_type == BCH_DATA_need_discard) && - !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && + !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) && bucket_flushed(new_a)) bch2_discard_one_bucket_fast(c, new.k->p); @@ -955,13 +969,23 @@ int bch2_trigger_alloc(struct btree_trans *trans, if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) { rcu_read_lock(); struct bucket *g = gc_bucket(ca, new.k->p.offset); + if (unlikely(!g)) { + rcu_read_unlock(); + goto invalid_bucket; + } g->gen_valid = 1; g->gen = new_a->gen; rcu_read_unlock(); } err: + printbuf_exit(&buf); bch2_dev_put(ca); return ret; +invalid_bucket: + bch2_fs_inconsistent(c, "reference to invalid bucket\n %s", + (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)); + ret = -EIO; + goto err; } /* @@ -1609,7 +1633,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, if (ret) goto err; - a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); + a_mut->v.io_time[READ] = bch2_current_io_time(c, READ); ret = bch2_trans_update(trans, alloc_iter, &a_mut->k_i, BTREE_TRIGGER_norun); if (ret) @@ -2006,8 +2030,8 @@ static int invalidate_one_bucket(struct btree_trans *trans, a->v.dirty_sectors = 0; a->v.stripe_sectors = 0; a->v.cached_sectors = 0; - a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); - a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); + a->v.io_time[READ] = bch2_current_io_time(c, READ); + a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE); ret = bch2_trans_commit(trans, NULL, NULL, BCH_WATERMARK_btree| @@ -2235,7 +2259,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, if (ret) return ret; - now = atomic64_read(&c->io_clock[rw].now); + now = bch2_current_io_time(c, rw); if (a->v.io_time[rw] == now) goto out; diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index 59c08c79..dcf58c38 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -162,7 +162,13 @@ static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, !bch2_bucket_sectors_fragmented(ca, a)) return 0; - u64 d = bch2_bucket_sectors_dirty(a); + /* + * avoid overflowing LRU_TIME_BITS on a corrupted fs, when + * bucket_sectors_dirty is (much) bigger than bucket_size + */ + u64 d = min(bch2_bucket_sectors_dirty(a), + ca->mi.bucket_size); + return div_u64(d * (1ULL << 31), ca->mi.bucket_size); } diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 1650446a..5b7ee707 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -731,7 +731,7 @@ struct bch_fs { struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; - struct bch_accounting_mem accounting[2]; + struct bch_accounting_mem accounting; struct bch_replicas_cpu replicas; struct bch_replicas_cpu replicas_gc; @@ -790,7 +790,8 @@ struct bch_fs { /* BTREE CACHE */ struct bio_set btree_bio; - struct workqueue_struct *io_complete_wq; + struct workqueue_struct *btree_read_complete_wq; + struct workqueue_struct *btree_write_submit_wq; struct btree_root btree_roots_known[BTREE_ID_NR]; DARRAY(struct btree_root) btree_roots_extra; @@ -937,6 +938,7 @@ struct bch_fs { * The allocation code needs gc_mark in struct bucket to be correct, but * it's not while a gc is in progress. */ + struct rw_semaphore gc_lock; struct mutex gc_gens_lock; /* IO PATH */ @@ -1206,6 +1208,11 @@ static inline s64 bch2_current_time(const struct bch_fs *c) return timespec_to_bch2_time(c, now); } +static inline u64 bch2_current_io_time(const struct bch_fs *c, int rw) +{ + return max(1ULL, (u64) atomic64_read(&c->io_clock[rw].now) & LRU_TIME_MAX); +} + static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c) { struct stdio_redirect *stdio = c->stdio; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 0da82744..66ba8fb4 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -477,6 +477,9 @@ struct bch_lru { #define LRU_ID_STRIPES (1U << 16) +#define LRU_TIME_BITS 48 +#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -1347,9 +1350,10 @@ enum btree_id { /* * Maximum number of btrees that we will _ever_ have under the current scheme, - * where we refer to them with bitfields + * where we refer to them with 64 bit bitfields - and we also need a bit for + * the interior btree node type: */ -#define BTREE_ID_NR_MAX 64 +#define BTREE_ID_NR_MAX 63 static inline bool btree_id_is_alloc(enum btree_id id) { diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c index f46978e5..94a1d198 100644 --- a/libbcachefs/bkey.c +++ b/libbcachefs/bkey.c @@ -1064,7 +1064,7 @@ void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) { const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current; u8 *l = k->key_start; - u8 *h = (u8 *) (k->_data + f->key_u64s) - 1; + u8 *h = (u8 *) ((u64 *) k->_data + f->key_u64s) - 1; while (l < h) { swap(*l, *h); diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index 3dc4cf69..fcd43915 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -8,7 +8,6 @@ #include "btree_types.h" #include "util.h" #include "vstructs.h" -#include "bcachefs.h" enum bch_validate_flags { BCH_VALIDATE_write = (1U << 0), diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index e838ac8f..5f07cf85 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -399,8 +399,12 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, for (i = 0; i < nr_compat; i++) switch (!write ? i : nr_compat - 1 - i) { case 0: - if (big_endian != CPU_BIG_ENDIAN) + if (big_endian != CPU_BIG_ENDIAN) { bch2_bkey_swab_key(f, k); + } else if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + bch2_bkey_swab_key(f, k); + bch2_bkey_swab_key(f, k); + } break; case 1: if (version < bcachefs_metadata_version_bkey_renumber) diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h index 726ef748..baef0722 100644 --- a/libbcachefs/bkey_methods.h +++ b/libbcachefs/bkey_methods.h @@ -129,7 +129,8 @@ static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id, struct bkey_packed *k) { if (version < bcachefs_metadata_version_current || - big_endian != CPU_BIG_ENDIAN) + big_endian != CPU_BIG_ENDIAN || + IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) __bch2_bkey_compat(level, btree_id, version, big_endian, write, f, k); diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 6017e95d..f5d85b50 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -91,10 +91,11 @@ static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, } static const struct rhashtable_params bch_btree_cache_params = { - .head_offset = offsetof(struct btree, hash), - .key_offset = offsetof(struct btree, hash_val), - .key_len = sizeof(u64), - .obj_cmpfn = bch2_btree_cache_cmp_fn, + .head_offset = offsetof(struct btree, hash), + .key_offset = offsetof(struct btree, hash_val), + .key_len = sizeof(u64), + .obj_cmpfn = bch2_btree_cache_cmp_fn, + .automatic_shrinking = true, }; static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) @@ -1256,6 +1257,14 @@ const char *bch2_btree_id_str(enum btree_id btree) return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)"; } +void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree) +{ + if (btree < BTREE_ID_NR) + prt_str(out, __bch2_btree_ids[btree]); + else + prt_printf(out, "(unknown btree %u)", btree); +} + void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) { prt_printf(out, "%s level %u/%u\n ", diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index fed35de3..c0eb87a0 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -132,6 +132,8 @@ static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) } const char *bch2_btree_id_str(enum btree_id); +void bch2_btree_id_to_text(struct printbuf *, enum btree_id); + void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *); void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *); diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 517c2ca2..2e9ccb20 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -45,6 +45,22 @@ #define DROP_PREV_NODE 11 #define DID_FILL_FROM_SCAN 12 +static const char * const bch2_gc_phase_strs[] = { +#define x(n) #n, + GC_PHASES() +#undef x + NULL +}; + +void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p) +{ + prt_str(out, bch2_gc_phase_strs[p->phase]); + prt_char(out, ' '); + bch2_btree_id_to_text(out, p->btree); + prt_printf(out, " l=%u ", p->level); + bch2_bpos_to_text(out, p->pos); +} + static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k) { return (struct bkey_s) {{{ @@ -721,7 +737,7 @@ static int bch2_mark_superblocks(struct bch_fs *c) static void bch2_gc_free(struct bch_fs *c) { - bch2_accounting_free(&c->accounting[1]); + bch2_accounting_gc_free(c); genradix_free(&c->reflink_gc_table); genradix_free(&c->gc_stripes); @@ -770,6 +786,9 @@ static int bch2_alloc_write_key(struct btree_trans *trans, const struct bch_alloc_v4 *old; int ret; + if (!bucket_valid(ca, k.k->p.offset)) + return 0; + old = bch2_alloc_to_v4(k, &old_convert); gc = new = *old; @@ -894,6 +913,8 @@ static int bch2_gc_alloc_start(struct bch_fs *c) buckets->first_bucket = ca->mi.first_bucket; buckets->nbuckets = ca->mi.nbuckets; + buckets->nbuckets_minus_first = + buckets->nbuckets - buckets->first_bucket; rcu_assign_pointer(ca->buckets_gc, buckets); } @@ -1085,9 +1106,12 @@ int bch2_check_allocations(struct bch_fs *c) lockdep_assert_held(&c->state_lock); + down_write(&c->gc_lock); + bch2_btree_interior_updates_flush(c); - ret = bch2_gc_start(c) ?: + ret = bch2_gc_accounting_start(c) ?: + bch2_gc_start(c) ?: bch2_gc_alloc_start(c) ?: bch2_gc_reflink_start(c); if (ret) @@ -1107,7 +1131,7 @@ int bch2_check_allocations(struct bch_fs *c) c->gc_count++; ret = bch2_gc_alloc_done(c) ?: - bch2_accounting_gc_done(c) ?: + bch2_gc_accounting_done(c) ?: bch2_gc_stripes_done(c) ?: bch2_gc_reflink_done(c); out: @@ -1118,6 +1142,13 @@ out: bch2_gc_free(c); percpu_up_write(&c->mark_lock); + up_write(&c->gc_lock); + + /* + * At startup, allocations can happen directly instead of via the + * allocator thread - issue wakeup in case they blocked on gc_lock: + */ + closure_wake_up(&c->freelist_wait); bch_err_fn(c, ret); return ret; } diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h index 1bdf841d..8a47e8bd 100644 --- a/libbcachefs/btree_gc.h +++ b/libbcachefs/btree_gc.h @@ -47,15 +47,6 @@ static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level, }; } -/* - * GC position of the pointers within a btree node: note, _not_ for &b->key - * itself, that lives in the parent node: - */ -static inline struct gc_pos gc_pos_btree_node(struct btree *b) -{ - return gc_pos_btree(b->c.btree_id, b->c.level, b->key.k.p); -} - static inline int gc_btree_order(enum btree_id btree) { if (btree == BTREE_ID_alloc) @@ -87,6 +78,8 @@ static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) return ret; } +void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *); + int bch2_gc_gens(struct bch_fs *); void bch2_gc_gens_async(struct bch_fs *); void bch2_fs_gc_init(struct bch_fs *); diff --git a/libbcachefs/btree_gc_types.h b/libbcachefs/btree_gc_types.h index b82c24bc..c24dd6ed 100644 --- a/libbcachefs/btree_gc_types.h +++ b/libbcachefs/btree_gc_types.h @@ -4,11 +4,16 @@ #include <linux/generic-radix-tree.h> +#define GC_PHASES() \ + x(not_running) \ + x(start) \ + x(sb) \ + x(btree) + enum gc_phase { - GC_PHASE_not_running, - GC_PHASE_start, - GC_PHASE_sb, - GC_PHASE_btree, +#define x(n) GC_PHASE_##n, + GC_PHASES() +#undef x }; struct gc_pos { diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index b79dfcd2..e092f541 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1392,7 +1392,7 @@ static void btree_node_read_endio(struct bio *bio) bch2_latency_acct(ca, rb->start_time, READ); } - queue_work(c->io_complete_wq, &rb->work); + queue_work(c->btree_read_complete_wq, &rb->work); } struct btree_node_read_all { @@ -1659,7 +1659,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool btree_node_read_all_replicas_done(&ra->cl.work); } else { continue_at(&ra->cl, btree_node_read_all_replicas_done, - c->io_complete_wq); + c->btree_read_complete_wq); } return 0; @@ -1740,7 +1740,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, if (sync) btree_node_read_work(&rb->work); else - queue_work(c->io_complete_wq, &rb->work); + queue_work(c->btree_read_complete_wq, &rb->work); } } @@ -2234,7 +2234,7 @@ do_write: atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes); INIT_WORK(&wbio->work, btree_write_submit); - queue_work(c->io_complete_wq, &wbio->work); + queue_work(c->btree_write_submit_wq, &wbio->work); return; err: set_btree_node_noevict(b); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index bd6bd073..c68cc714 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -221,11 +221,8 @@ static void bch2_btree_path_verify(struct btree_trans *trans, struct btree_path *path) { struct bch_fs *c = trans->c; - unsigned i; - EBUG_ON(path->btree_id >= BTREE_ID_NR); - - for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) { + for (unsigned i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) { if (!path->l[i].b) { BUG_ON(!path->cached && bch2_btree_id_root(c, path->btree_id)->b->c.level > i); @@ -251,8 +248,6 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; - BUG_ON(iter->btree_id >= BTREE_ID_NR); - BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached); BUG_ON((iter->flags & BTREE_ITER_is_extents) && @@ -330,7 +325,7 @@ out: } void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, - struct bpos pos, bool key_cache) + struct bpos pos) { bch2_trans_verify_not_unlocked(trans); @@ -341,19 +336,12 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, btree_trans_sort_paths(trans); trans_for_each_path_inorder(trans, path, iter) { - int cmp = cmp_int(path->btree_id, id) ?: - cmp_int(path->cached, key_cache); - - if (cmp > 0) - break; - if (cmp < 0) - continue; - - if (!btree_node_locked(path, 0) || + if (path->btree_id != id || + !btree_node_locked(path, 0) || !path->should_be_locked) continue; - if (!key_cache) { + if (!path->cached) { if (bkey_ge(pos, path->l[0].b->data->min_key) && bkey_le(pos, path->l[0].b->key.k.p)) return; @@ -366,9 +354,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, bch2_dump_trans_paths_updates(trans); bch2_bpos_to_text(&buf, pos); - panic("not locked: %s %s%s\n", - bch2_btree_id_str(id), buf.buf, - key_cache ? " cached" : ""); + panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf); } #else @@ -1488,6 +1474,14 @@ static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_tra path->level); bch2_bpos_to_text(out, path->pos); + if (!path->cached && btree_node_locked(path, path->level)) { + prt_char(out, ' '); + struct btree *b = path_l(path)->b; + bch2_bpos_to_text(out, b->data->min_key); + prt_char(out, '-'); + bch2_bpos_to_text(out, b->key.k.p); + } + #ifdef TRACK_PATH_ALLOCATED prt_printf(out, " %pS", (void *) path->ip_allocated); #endif @@ -3413,8 +3407,10 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) bch2_time_stats_exit(&s->lock_hold_times); } - if (c->btree_trans_barrier_initialized) + if (c->btree_trans_barrier_initialized) { + synchronize_srcu_expedited(&c->btree_trans_barrier); cleanup_srcu_struct(&c->btree_trans_barrier); + } mempool_exit(&c->btree_trans_mem_pool); mempool_exit(&c->btree_trans_pool); } diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index bdb3cd2e..c7725865 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -268,12 +268,11 @@ static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex #ifdef CONFIG_BCACHEFS_DEBUG void bch2_trans_verify_paths(struct btree_trans *); -void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, - struct bpos, bool); +void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos); #else static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, - struct bpos pos, bool key_cache) {} + struct bpos pos) {} #endif void bch2_btree_path_fix_key_modified(struct btree_trans *trans, diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index 34056aae..8b2fd0ae 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -32,12 +32,22 @@ static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, } static const struct rhashtable_params bch2_btree_key_cache_params = { - .head_offset = offsetof(struct bkey_cached, hash), - .key_offset = offsetof(struct bkey_cached, key), - .key_len = sizeof(struct bkey_cached_key), - .obj_cmpfn = bch2_btree_key_cache_cmp_fn, + .head_offset = offsetof(struct bkey_cached, hash), + .key_offset = offsetof(struct bkey_cached, key), + .key_len = sizeof(struct bkey_cached_key), + .obj_cmpfn = bch2_btree_key_cache_cmp_fn, + .automatic_shrinking = true, }; +static inline void btree_path_cached_set(struct btree_trans *trans, struct btree_path *path, + struct bkey_cached *ck, + enum btree_node_locked_type lock_held) +{ + path->l[0].lock_seq = six_lock_seq(&ck->c.lock); + path->l[0].b = (void *) ck; + mark_btree_node_locked(trans, path, 0, lock_held); +} + __flatten inline struct bkey_cached * bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) @@ -258,9 +268,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, return ERR_PTR(ret); } - path->l[0].b = (void *) ck; - path->l[0].lock_seq = six_lock_seq(&ck->c.lock); - mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); + btree_path_cached_set(trans, path, ck, BTREE_NODE_INTENT_LOCKED); ret = bch2_btree_node_lock_write(trans, path, &ck->c); if (unlikely(ret)) { @@ -488,7 +496,7 @@ retry: if (!ck) goto retry; - mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); + btree_path_cached_set(trans, path, ck, BTREE_NODE_INTENT_LOCKED); path->locks_want = 1; } else { enum six_lock_type lock_want = __btree_lock_want(path, 0); @@ -506,12 +514,8 @@ retry: goto retry; } - mark_btree_node_locked(trans, path, 0, - (enum btree_node_locked_type) lock_want); + btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); } - - path->l[0].lock_seq = six_lock_seq(&ck->c.lock); - path->l[0].b = (void *) ck; fill: path->uptodate = BTREE_ITER_UPTODATE; @@ -558,30 +562,25 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path } retry: ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); - if (!ck) { + if (!ck) return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); - } else { - enum six_lock_type lock_want = __btree_lock_want(path, 0); - ret = btree_node_lock(trans, path, (void *) ck, 0, - lock_want, _THIS_IP_); - EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)); + enum six_lock_type lock_want = __btree_lock_want(path, 0); - if (ret) - return ret; + ret = btree_node_lock(trans, path, (void *) ck, 0, + lock_want, _THIS_IP_); + EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)); - if (ck->key.btree_id != path->btree_id || - !bpos_eq(ck->key.pos, path->pos)) { - six_unlock_type(&ck->c.lock, lock_want); - goto retry; - } + if (ret) + return ret; - mark_btree_node_locked(trans, path, 0, - (enum btree_node_locked_type) lock_want); + if (ck->key.btree_id != path->btree_id || + !bpos_eq(ck->key.pos, path->pos)) { + six_unlock_type(&ck->c.lock, lock_want); + goto retry; } - path->l[0].lock_seq = six_lock_seq(&ck->c.lock); - path->l[0].b = (void *) ck; + btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); fill: if (!ck->valid) return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); @@ -840,7 +839,6 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); - freed++; bc->nr_freed_nonpcpu--; bc->freed++; } @@ -854,7 +852,6 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); - freed++; bc->nr_freed_pcpu--; bc->freed++; } @@ -876,23 +873,22 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { bc->skipped_dirty++; - goto next; } else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) { clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); bc->skipped_accessed++; - goto next; - } else if (bkey_cached_lock_for_evict(ck)) { + } else if (!bkey_cached_lock_for_evict(ck)) { + bc->skipped_lock_fail++; + } else { bkey_cached_evict(bc, ck); bkey_cached_free(bc, ck); bc->moved_to_freelist++; - } else { - bc->skipped_lock_fail++; + freed++; } scanned++; if (scanned >= nr) break; -next: + pos = next; } @@ -917,6 +913,14 @@ static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, long nr = atomic_long_read(&bc->nr_keys) - atomic_long_read(&bc->nr_dirty); + /* + * Avoid hammering our shrinker too much if it's nearly empty - the + * shrinker code doesn't take into account how big our cache is, if it's + * mostly empty but the system is under memory pressure it causes nasty + * lock contention: + */ + nr -= 128; + return max(0L, nr); } @@ -1025,9 +1029,10 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) if (!shrink) return -BCH_ERR_ENOMEM_fs_btree_cache_init; bc->shrink = shrink; - shrink->seeks = 0; shrink->count_objects = bch2_btree_key_cache_count; shrink->scan_objects = bch2_btree_key_cache_scan; + shrink->batch = 1 << 14; + shrink->seeks = 0; shrink->private_data = c; shrinker_register(shrink); return 0; diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index 7f41545b..2530fdb2 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -136,6 +136,7 @@ static inline void btree_node_unlock(struct btree_trans *trans, int lock_type = btree_node_locked_type(path, level); EBUG_ON(level >= BTREE_MAX_DEPTH); + EBUG_ON(lock_type == BTREE_NODE_WRITE_LOCKED); if (lock_type != BTREE_NODE_UNLOCKED) { six_unlock_type(&path->l[level].b->c.lock, lock_type); diff --git a/libbcachefs/btree_node_scan.c b/libbcachefs/btree_node_scan.c index 45cb8149..2cb0442f 100644 --- a/libbcachefs/btree_node_scan.c +++ b/libbcachefs/btree_node_scan.c @@ -72,10 +72,11 @@ static bool found_btree_node_is_readable(struct btree_trans *trans, struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false); bool ret = !IS_ERR_OR_NULL(b); - if (ret) { - f->sectors_written = b->written; - six_unlock_read(&b->c.lock); - } + if (!ret) + return ret; + + f->sectors_written = b->written; + six_unlock_read(&b->c.lock); /* * We might update this node's range; if that happens, we need the node diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c index 843558d9..8ab85f21 100644 --- a/libbcachefs/btree_trans_commit.c +++ b/libbcachefs/btree_trans_commit.c @@ -599,7 +599,7 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) { trans_for_each_update(trans, i) if (btree_node_type_has_triggers(i->bkey_type) && - gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) { + gc_visited(trans->c, gc_pos_btree(i->btree_id, i->level, i->k->k.p))) { int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc); if (ret) return ret; diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index a4c92753..9575fb65 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -565,6 +565,10 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans * { struct bch_fs *c = as->c; + if (as->took_gc_lock) + up_read(&c->gc_lock); + as->took_gc_lock = false; + bch2_journal_pin_drop(&c->journal, &as->journal); bch2_journal_pin_flush(&c->journal, &as->journal); bch2_disk_reservation_put(c, &as->disk_res); @@ -1113,6 +1117,10 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * BUG_ON(as->mode == BTREE_UPDATE_none); + if (as->took_gc_lock) + up_read(&as->c->gc_lock); + as->took_gc_lock = false; + bch2_btree_reserve_put(as, trans); continue_at(&as->cl, btree_update_set_nodes_written, @@ -1184,6 +1192,14 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); } + if (!down_read_trylock(&c->gc_lock)) { + ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0)); + if (ret) { + up_read(&c->gc_lock); + return ERR_PTR(ret); + } + } + as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS); memset(as, 0, sizeof(*as)); closure_init(&as->cl, NULL); @@ -1192,6 +1208,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, as->ip_started = _RET_IP_; as->mode = BTREE_UPDATE_none; as->flags = flags; + as->took_gc_lock = true; as->btree_id = path->btree_id; as->update_level_start = level_start; as->update_level_end = level_end; @@ -1760,6 +1777,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t int live_u64s_added, u64s_added; int ret; + lockdep_assert_held(&c->gc_lock); BUG_ON(!btree_node_intent_locked(path, b->c.level)); BUG_ON(!b->c.level); BUG_ON(!as || as->b); diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index 9caf45b2..b5b76ce0 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -54,6 +54,7 @@ struct btree_update { enum btree_update_mode mode; enum bch_trans_commit_flags flags; unsigned nodes_written:1; + unsigned took_gc_lock:1; enum btree_id btree_id; unsigned update_level_start; diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 7c6cf94d..a426b28e 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -84,6 +84,156 @@ void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage) } } +static int bch2_check_fix_ptr(struct btree_trans *trans, + struct bkey_s_c k, + struct extent_ptr_decoded p, + const union bch_extent_entry *entry, + bool *do_update) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + int ret = 0; + + struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); + if (!ca) { + if (fsck_err(trans, ptr_to_invalid_device, + "pointer to missing device %u\n" + "while marking %s", + p.ptr.dev, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; + return 0; + } + + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + if (!g) { + if (fsck_err(trans, ptr_to_invalid_device, + "pointer to invalid bucket on device %u\n" + "while marking %s", + p.ptr.dev, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; + goto out; + } + + enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); + + if (fsck_err_on(!g->gen_valid, + trans, ptr_to_missing_alloc_key, + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + if (!p.ptr.cached) { + g->gen_valid = true; + g->gen = p.ptr.gen; + } else { + *do_update = true; + } + } + + if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, + trans, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + if (!p.ptr.cached && + (g->data_type != BCH_DATA_btree || + data_type == BCH_DATA_btree)) { + g->gen_valid = true; + g->gen = p.ptr.gen; + g->data_type = 0; + g->stripe_sectors = 0; + g->dirty_sectors = 0; + g->cached_sectors = 0; + } else { + *do_update = true; + } + } + + if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, + trans, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; + + if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0, + trans, stale_dirty_ptr, + "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; + + if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) + goto out; + + if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type), + trans, ptr_bucket_data_type_mismatch, + "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_type_str(g->data_type), + bch2_data_type_str(data_type), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + if (data_type == BCH_DATA_btree) { + g->gen_valid = true; + g->gen = p.ptr.gen; + g->data_type = data_type; + g->stripe_sectors = 0; + g->dirty_sectors = 0; + g->cached_sectors = 0; + } else { + *do_update = true; + } + } + + if (p.has_ec) { + struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); + + if (fsck_err_on(!m || !m->alive, + trans, ptr_to_missing_stripe, + "pointer to nonexistent stripe %llu\n" + "while marking %s", + (u64) p.ec.idx, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; + + if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), + trans, ptr_to_incorrect_stripe, + "pointer does not match stripe %llu\n" + "while marking %s", + (u64) p.ec.idx, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; + } +out: +fsck_err: + bch2_dev_put(ca); + printbuf_exit(&buf); + return ret; +} + int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c k, enum btree_iter_update_trigger_flags flags) @@ -99,130 +249,9 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, percpu_down_read(&c->mark_lock); bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) { - struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); - if (!ca) { - if (fsck_err(trans, ptr_to_invalid_device, - "pointer to missing device %u\n" - "while marking %s", - p.ptr.dev, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - do_update = true; - continue; - } - - struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry_c); - - if (fsck_err_on(!g->gen_valid, - trans, ptr_to_missing_alloc_key, - "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - if (!p.ptr.cached) { - g->gen_valid = true; - g->gen = p.ptr.gen; - } else { - do_update = true; - } - } - - if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, - trans, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - if (!p.ptr.cached && - (g->data_type != BCH_DATA_btree || - data_type == BCH_DATA_btree)) { - g->gen_valid = true; - g->gen = p.ptr.gen; - g->data_type = 0; - g->stripe_sectors = 0; - g->dirty_sectors = 0; - g->cached_sectors = 0; - } else { - do_update = true; - } - } - - if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, - trans, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - do_update = true; - - if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0, - trans, stale_dirty_ptr, - "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - do_update = true; - - if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) - goto next; - - if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type), - trans, ptr_bucket_data_type_mismatch, - "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_type_str(g->data_type), - bch2_data_type_str(data_type), - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - if (data_type == BCH_DATA_btree) { - g->gen_valid = true; - g->gen = p.ptr.gen; - g->data_type = data_type; - g->stripe_sectors = 0; - g->dirty_sectors = 0; - g->cached_sectors = 0; - } else { - do_update = true; - } - } - - if (p.has_ec) { - struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); - - if (fsck_err_on(!m || !m->alive, - trans, ptr_to_missing_stripe, - "pointer to nonexistent stripe %llu\n" - "while marking %s", - (u64) p.ec.idx, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - do_update = true; - - if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), - trans, ptr_to_incorrect_stripe, - "pointer does not match stripe %llu\n" - "while marking %s", - (u64) p.ec.idx, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - do_update = true; - } -next: - bch2_dev_put(ca); + ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update); + if (ret) + goto err; } if (do_update) { @@ -337,7 +366,6 @@ found: bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); } err: -fsck_err: percpu_up_read(&c->mark_lock); printbuf_exit(&buf); return ret; @@ -524,6 +552,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, enum btree_iter_update_trigger_flags flags) { bool insert = !(flags & BTREE_TRIGGER_overwrite); + struct printbuf buf = PRINTBUF; int ret = 0; struct bch_fs *c = trans->c; @@ -556,11 +585,19 @@ static int bch2_trigger_pointer(struct btree_trans *trans, if (flags & BTREE_TRIGGER_gc) { percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, bucket.offset); + if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", + p.ptr.dev, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = -EIO; + goto err_unlock; + } + bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &new); alloc_to_bucket(g, new); bucket_unlock(g); +err_unlock: percpu_up_read(&c->mark_lock); if (!ret) @@ -568,6 +605,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, } err: bch2_dev_put(ca); + printbuf_exit(&buf); return ret; } @@ -901,6 +939,9 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev * percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, b); + if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s", + ca->dev_idx, bch2_data_type_str(data_type))) + goto err_unlock; bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g); @@ -909,19 +950,15 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev * g->data_type != data_type, c, "different types of data in same bucket: %s, %s", bch2_data_type_str(g->data_type), - bch2_data_type_str(data_type))) { - ret = -EIO; + bch2_data_type_str(data_type))) goto err; - } if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size", ca->dev_idx, b, g->gen, bch2_data_type_str(g->data_type ?: data_type), - g->dirty_sectors, sectors)) { - ret = -EIO; + g->dirty_sectors, sectors)) goto err; - } g->data_type = data_type; g->dirty_sectors += sectors; @@ -932,8 +969,9 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev * return ret; err: bucket_unlock(g); +err_unlock: percpu_up_read(&c->mark_lock); - return ret; + return -EIO; } int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, @@ -1175,6 +1213,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bucket_gens->first_bucket = ca->mi.first_bucket; bucket_gens->nbuckets = nbuckets; + bucket_gens->nbuckets_minus_first = + bucket_gens->nbuckets - bucket_gens->first_bucket; if (resize) { down_write(&ca->bucket_lock); diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index ff813dc6..4a14741b 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -93,7 +93,8 @@ static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) { struct bucket_array *buckets = gc_bucket_array(ca); - BUG_ON(!bucket_valid(ca, b)); + if (b - buckets->first_bucket >= buckets->nbuckets_minus_first) + return NULL; return buckets->b + b; } @@ -110,7 +111,8 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) { struct bucket_gens *gens = bucket_gens(ca); - BUG_ON(!bucket_valid(ca, b)); + if (b - gens->first_bucket >= gens->nbuckets_minus_first) + return NULL; return gens->b + b; } @@ -170,19 +172,22 @@ static inline int gen_after(u8 a, u8 b) return r > 0 ? r : 0; } -static inline u8 dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr) +static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { - return gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen); + u8 *gen = bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)); + if (!gen) + return -1; + return gen_after(*gen, ptr->gen); } /** * dev_ptr_stale() - check if a pointer points into a bucket that has been * invalidated. */ -static inline u8 dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr) +static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { rcu_read_lock(); - u8 ret = dev_ptr_stale_rcu(ca, ptr); + int ret = dev_ptr_stale_rcu(ca, ptr); rcu_read_unlock(); return ret; diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 405c8533..c9698cdf 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -23,6 +23,7 @@ struct bucket_array { struct rcu_head rcu; u16 first_bucket; size_t nbuckets; + size_t nbuckets_minus_first; struct bucket b[]; }; @@ -30,6 +31,7 @@ struct bucket_gens { struct rcu_head rcu; u16 first_bucket; size_t nbuckets; + size_t nbuckets_minus_first; u8 b[]; }; diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index 70db3a73..4248c251 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -515,7 +515,7 @@ static long bch2_ioctl_data(struct bch_fs *c, static long bch2_ioctl_fs_usage(struct bch_fs *c, struct bch_ioctl_fs_usage __user *user_arg) { - struct bch_ioctl_fs_usage arg; + struct bch_ioctl_fs_usage arg = {}; darray_char replicas = {}; u32 replica_entries_bytes; int ret = 0; diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c index 0d807c2c..1a0072ee 100644 --- a/libbcachefs/data_update.c +++ b/libbcachefs/data_update.c @@ -202,9 +202,8 @@ restart_drop_conflicting_replicas: bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i)); /* Now, drop excess replicas: */ -restart_drop_extra_replicas: - rcu_read_lock(); +restart_drop_extra_replicas: bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); diff --git a/libbcachefs/disk_accounting.c b/libbcachefs/disk_accounting.c index b9296195..bd7a0c77 100644 --- a/libbcachefs/disk_accounting.c +++ b/libbcachefs/disk_accounting.c @@ -218,7 +218,46 @@ int bch2_accounting_update_sb(struct btree_trans *trans) return 0; } -static int __bch2_accounting_mem_mod_slowpath(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) +static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a) +{ + struct bch_accounting_mem *acc = &c->accounting; + + /* raced with another insert, already present: */ + if (eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &a.k->p) < acc->k.nr) + return 0; + + struct accounting_mem_entry n = { + .pos = a.k->p, + .version = a.k->version, + .nr_counters = bch2_accounting_counters(a.k), + .v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64), + sizeof(u64), GFP_KERNEL), + }; + + if (!n.v[0]) + goto err; + + if (acc->gc_running) { + n.v[1] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64), + sizeof(u64), GFP_KERNEL); + if (!n.v[1]) + goto err; + } + + if (darray_push(&acc->k, n)) + goto err; + + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); + return 0; +err: + free_percpu(n.v[1]); + free_percpu(n.v[0]); + return -BCH_ERR_ENOMEM_disk_accounting; +} + +int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) { struct bch_replicas_padded r; @@ -226,52 +265,46 @@ static int __bch2_accounting_mem_mod_slowpath(struct bch_fs *c, struct bkey_s_c_ !bch2_replicas_marked_locked(c, &r.e)) return -BCH_ERR_btree_insert_need_mark_replicas; - struct bch_accounting_mem *acc = &c->accounting[gc]; - unsigned new_nr_counters = acc->nr_counters + bch2_accounting_counters(a.k); - - u64 __percpu *new_counters = __alloc_percpu_gfp(new_nr_counters * sizeof(u64), - sizeof(u64), GFP_KERNEL); - if (!new_counters) - return -BCH_ERR_ENOMEM_disk_accounting; - - preempt_disable(); - memcpy(this_cpu_ptr(new_counters), - bch2_acc_percpu_u64s(acc->v, acc->nr_counters), - acc->nr_counters * sizeof(u64)); - preempt_enable(); - - struct accounting_pos_offset n = { - .pos = a.k->p, - .version = a.k->version, - .offset = acc->nr_counters, - .nr_counters = bch2_accounting_counters(a.k), - }; - if (darray_push(&acc->k, n)) { - free_percpu(new_counters); - return -BCH_ERR_ENOMEM_disk_accounting; - } - - eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, NULL); - - free_percpu(acc->v); - acc->v = new_counters; - acc->nr_counters = new_nr_counters; - - for (unsigned i = 0; i < n.nr_counters; i++) - this_cpu_add(acc->v[n.offset + i], a.v->d[i]); - return 0; -} - -int bch2_accounting_mem_mod_slowpath(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) -{ percpu_up_read(&c->mark_lock); percpu_down_write(&c->mark_lock); - int ret = __bch2_accounting_mem_mod_slowpath(c, a, gc); + int ret = __bch2_accounting_mem_insert(c, a); percpu_up_write(&c->mark_lock); percpu_down_read(&c->mark_lock); return ret; } +static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e) +{ + for (unsigned i = 0; i < e->nr_counters; i++) + if (percpu_u64_get(e->v[0] + i) || + (e->v[1] && + percpu_u64_get(e->v[1] + i))) + return false; + return true; +} + +void bch2_accounting_mem_gc(struct bch_fs *c) +{ + struct bch_accounting_mem *acc = &c->accounting; + + percpu_down_write(&c->mark_lock); + struct accounting_mem_entry *dst = acc->k.data; + + darray_for_each(acc->k, src) { + if (accounting_mem_entry_is_zero(src)) { + free_percpu(src->v[0]); + free_percpu(src->v[1]); + } else { + *dst++ = *src; + } + } + + acc->k.nr = dst - acc->k.data; + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); + percpu_up_write(&c->mark_lock); +} + /* * Read out accounting keys for replicas entries, as an array of * bch_replicas_usage entries. @@ -282,7 +315,7 @@ int bch2_accounting_mem_mod_slowpath(struct bch_fs *c, struct bkey_s_c_accountin */ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) { - struct bch_accounting_mem *acc = &c->accounting[0]; + struct bch_accounting_mem *acc = &c->accounting; int ret = 0; darray_init(usage); @@ -297,7 +330,7 @@ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) if (!accounting_to_replicas(&u.r.r, i->pos)) continue; - bch2_accounting_mem_read(c, i->pos, &u.r.sectors, 1); + bch2_accounting_mem_read_counters(acc, i - acc->k.data, &u.r.sectors, 1, false); ret = darray_make_room(usage, replicas_usage_bytes(&u.r)); if (ret) @@ -316,7 +349,7 @@ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned accounting_types_mask) { - struct bch_accounting_mem *acc = &c->accounting[0]; + struct bch_accounting_mem *acc = &c->accounting; int ret = 0; darray_init(out_buf); @@ -338,7 +371,8 @@ int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned acc bkey_accounting_init((void *) &darray_top(*out_buf)); set_bkey_val_u64s(&a_out->k, i->nr_counters); a_out->k.p = i->pos; - bch2_accounting_mem_read(c, i->pos, a_out->v.d, i->nr_counters); + bch2_accounting_mem_read_counters(acc, i - acc->k.data, + a_out->v.d, i->nr_counters, false); if (!bch2_accounting_key_is_zero(accounting_i_to_s_c(a_out))) out_buf->nr += bkey_bytes(&a_out->k); @@ -353,7 +387,7 @@ int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned acc void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c) { - struct bch_accounting_mem *acc = &c->accounting[0]; + struct bch_accounting_mem *acc = &c->accounting; percpu_down_read(&c->mark_lock); out->atomic++; @@ -365,7 +399,7 @@ void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c) bch2_accounting_key_to_text(out, &acc_k); u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(c, i, v, ARRAY_SIZE(v), false); + bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); prt_str(out, ":"); for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++) @@ -377,81 +411,64 @@ void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c) percpu_up_read(&c->mark_lock); } -/* Ensures all counters in @src exist in @dst: */ -static int copy_counters(struct bch_accounting_mem *dst, - struct bch_accounting_mem *src) +static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc) { - unsigned orig_dst_k_nr = dst->k.nr; - unsigned dst_counters = dst->nr_counters; - - darray_for_each(src->k, i) - if (eytzinger0_find(dst->k.data, orig_dst_k_nr, sizeof(dst->k.data[0]), - accounting_pos_cmp, &i->pos) >= orig_dst_k_nr) { - if (darray_push(&dst->k, ((struct accounting_pos_offset) { - .pos = i->pos, - .offset = dst_counters, - .nr_counters = i->nr_counters }))) - goto err; - - dst_counters += i->nr_counters; - } - - if (dst->k.nr == orig_dst_k_nr) - return 0; - - u64 __percpu *new_counters = __alloc_percpu_gfp(dst_counters * sizeof(u64), - sizeof(u64), GFP_KERNEL); - if (!new_counters) - goto err; - - preempt_disable(); - memcpy(this_cpu_ptr(new_counters), - bch2_acc_percpu_u64s(dst->v, dst->nr_counters), - dst->nr_counters * sizeof(u64)); - preempt_enable(); - - free_percpu(dst->v); - dst->v = new_counters; - dst->nr_counters = dst_counters; - - eytzinger0_sort(dst->k.data, dst->k.nr, sizeof(dst->k.data[0]), accounting_pos_cmp, NULL); - - return 0; -err: - dst->k.nr = orig_dst_k_nr; - return -BCH_ERR_ENOMEM_disk_accounting; + darray_for_each(acc->k, e) { + free_percpu(e->v[gc]); + e->v[gc] = NULL; + } } -int bch2_accounting_gc_done(struct bch_fs *c) +int bch2_gc_accounting_start(struct bch_fs *c) { - struct bch_accounting_mem *dst = &c->accounting[0]; - struct bch_accounting_mem *src = &c->accounting[1]; - struct btree_trans *trans = bch2_trans_get(c); - struct printbuf buf = PRINTBUF; + struct bch_accounting_mem *acc = &c->accounting; int ret = 0; percpu_down_write(&c->mark_lock); + darray_for_each(acc->k, e) { + e->v[1] = __alloc_percpu_gfp(e->nr_counters * sizeof(u64), + sizeof(u64), GFP_KERNEL); + if (!e->v[1]) { + bch2_accounting_free_counters(acc, true); + ret = -BCH_ERR_ENOMEM_disk_accounting; + break; + } + } - ret = copy_counters(dst, src) ?: - copy_counters(src, dst); - if (ret) - goto err; + acc->gc_running = !ret; + percpu_up_write(&c->mark_lock); - BUG_ON(dst->k.nr != src->k.nr); + return ret; +} - for (unsigned i = 0; i < src->k.nr; i++) { - BUG_ON(src->k.data[i].nr_counters != dst->k.data[i].nr_counters); - BUG_ON(!bpos_eq(dst->k.data[i].pos, src->k.data[i].pos)); +int bch2_gc_accounting_done(struct bch_fs *c) +{ + struct bch_accounting_mem *acc = &c->accounting; + struct btree_trans *trans = bch2_trans_get(c); + struct printbuf buf = PRINTBUF; + struct bpos pos = POS_MIN; + int ret = 0; + + percpu_down_write(&c->mark_lock); + while (1) { + unsigned idx = eytzinger0_find_ge(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &pos); + + if (idx >= acc->k.nr) + break; + + struct accounting_mem_entry *e = acc->k.data + idx; + pos = bpos_successor(e->pos); struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, src->k.data[i].pos); + bpos_to_disk_accounting_pos(&acc_k, e->pos); - unsigned nr = src->k.data[i].nr_counters; u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS]; u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(c, i, dst_v, nr, false); - bch2_accounting_mem_read_counters(c, i, src_v, nr, true); + unsigned nr = e->nr_counters; + bch2_accounting_mem_read_counters(acc, idx, dst_v, nr, false); + bch2_accounting_mem_read_counters(acc, idx, src_v, nr, true); if (memcmp(dst_v, src_v, nr * sizeof(u64))) { printbuf_reset(&buf); @@ -470,8 +487,10 @@ int bch2_accounting_gc_done(struct bch_fs *c) src_v[j] -= dst_v[j]; if (fsck_err(trans, accounting_mismatch, "%s", buf.buf)) { + percpu_up_write(&c->mark_lock); ret = commit_do(trans, NULL, NULL, 0, bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false)); + percpu_down_write(&c->mark_lock); if (ret) goto err; @@ -535,7 +554,7 @@ fsck_err: */ int bch2_accounting_read(struct bch_fs *c) { - struct bch_accounting_mem *acc = &c->accounting[0]; + struct bch_accounting_mem *acc = &c->accounting; int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, @@ -595,7 +614,7 @@ int bch2_accounting_read(struct bch_fs *c) bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos); u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(c, i, v, ARRAY_SIZE(v), false); + bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); switch (k.type) { case BCH_DISK_ACCOUNTING_persistent_reserved: @@ -748,15 +767,20 @@ void bch2_verify_accounting_clean(struct bch_fs *c) WARN_ON(mismatch); } -void bch2_accounting_free(struct bch_accounting_mem *acc) +void bch2_accounting_gc_free(struct bch_fs *c) { - darray_exit(&acc->k); - free_percpu(acc->v); - acc->v = NULL; - acc->nr_counters = 0; + lockdep_assert_held(&c->mark_lock); + + struct bch_accounting_mem *acc = &c->accounting; + + bch2_accounting_free_counters(acc, true); + acc->gc_running = false; } void bch2_fs_accounting_exit(struct bch_fs *c) { - bch2_accounting_free(&c->accounting[0]); + struct bch_accounting_mem *acc = &c->accounting; + + bch2_accounting_free_counters(acc, false); + darray_exit(&acc->k); } diff --git a/libbcachefs/disk_accounting.h b/libbcachefs/disk_accounting.h index ab1f74cb..3d3f25e0 100644 --- a/libbcachefs/disk_accounting.h +++ b/libbcachefs/disk_accounting.h @@ -104,22 +104,29 @@ static inline int accounting_pos_cmp(const void *_l, const void *_r) return bpos_cmp(*l, *r); } -int bch2_accounting_mem_mod_slowpath(struct bch_fs *, struct bkey_s_c_accounting, bool); +int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, bool); +void bch2_accounting_mem_gc(struct bch_fs *); static inline int __bch2_accounting_mem_mod(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) { - struct bch_accounting_mem *acc = &c->accounting[gc]; - unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &a.k->p); - if (unlikely(idx >= acc->k.nr)) - return bch2_accounting_mem_mod_slowpath(c, a, gc); + struct bch_accounting_mem *acc = &c->accounting; + unsigned idx; - unsigned offset = acc->k.data[idx].offset; + EBUG_ON(gc && !acc->gc_running); - EBUG_ON(bch2_accounting_counters(a.k) != acc->k.data[idx].nr_counters); + while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &a.k->p)) >= acc->k.nr) { + int ret = bch2_accounting_mem_insert(c, a, gc); + if (ret) + return ret; + } + + struct accounting_mem_entry *e = &acc->k.data[idx]; + + EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters); for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) - this_cpu_add(acc->v[offset + i], a.v->d[i]); + this_cpu_add(e->v[gc][i], a.v->d[i]); return 0; } @@ -166,37 +173,38 @@ static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey return ret; } -static inline void bch2_accounting_mem_read_counters(struct bch_fs *c, unsigned idx, - u64 *v, unsigned nr, bool gc) +static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem *acc, + unsigned idx, u64 *v, unsigned nr, bool gc) { memset(v, 0, sizeof(*v) * nr); - struct bch_accounting_mem *acc = &c->accounting[gc]; if (unlikely(idx >= acc->k.nr)) return; - unsigned offset = acc->k.data[idx].offset; - nr = min_t(unsigned, nr, acc->k.data[idx].nr_counters); + struct accounting_mem_entry *e = &acc->k.data[idx]; + + nr = min_t(unsigned, nr, e->nr_counters); for (unsigned i = 0; i < nr; i++) - v[i] = percpu_u64_get(acc->v + offset + i); + v[i] = percpu_u64_get(e->v[gc] + i); } static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p, u64 *v, unsigned nr) { - struct bch_accounting_mem *acc = &c->accounting[0]; + struct bch_accounting_mem *acc = &c->accounting; unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &p); - bch2_accounting_mem_read_counters(c, idx, v, nr, false); + bch2_accounting_mem_read_counters(acc, idx, v, nr, false); } int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *); int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned); void bch2_fs_accounting_to_text(struct printbuf *, struct bch_fs *); -int bch2_accounting_gc_done(struct bch_fs *); +int bch2_gc_accounting_start(struct bch_fs *); +int bch2_gc_accounting_done(struct bch_fs *); int bch2_accounting_read(struct bch_fs *); @@ -205,7 +213,7 @@ int bch2_dev_usage_init(struct bch_dev *, bool); void bch2_verify_accounting_clean(struct bch_fs *c); -void bch2_accounting_free(struct bch_accounting_mem *); +void bch2_accounting_gc_free(struct bch_fs *); void bch2_fs_accounting_exit(struct bch_fs *); #endif /* _BCACHEFS_DISK_ACCOUNTING_H */ diff --git a/libbcachefs/disk_accounting_types.h b/libbcachefs/disk_accounting_types.h index 5656ac54..1687a451 100644 --- a/libbcachefs/disk_accounting_types.h +++ b/libbcachefs/disk_accounting_types.h @@ -4,17 +4,16 @@ #include "darray.h" -struct accounting_pos_offset { +struct accounting_mem_entry { struct bpos pos; struct bversion version; - u32 offset:24, - nr_counters:8; + unsigned nr_counters; + u64 __percpu *v[2]; }; struct bch_accounting_mem { - DARRAY(struct accounting_pos_offset) k; - u64 __percpu *v; - unsigned nr_counters; + DARRAY(struct accounting_mem_entry) k; + bool gc_running; }; #endif /* _BCACHEFS_DISK_ACCOUNTING_TYPES_H */ diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index f254a53c..3c3a2a7e 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -269,6 +269,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, { struct bch_fs *c = trans->c; const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx; + struct printbuf buf = PRINTBUF; int ret = 0; struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev); @@ -290,17 +291,26 @@ static int mark_stripe_bucket(struct btree_trans *trans, if (flags & BTREE_TRIGGER_gc) { percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, bucket.offset); + if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", + ptr->dev, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + ret = -EIO; + goto err_unlock; + } + bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); alloc_to_bucket(g, new); bucket_unlock(g); +err_unlock: percpu_up_read(&c->mark_lock); if (!ret) ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); } err: bch2_dev_put(ca); + printbuf_exit(&buf); return ret; } @@ -705,10 +715,12 @@ static void ec_block_endio(struct bio *bio) bch2_blk_status_to_str(bio->bi_status))) clear_bit(ec_bio->idx, ec_bio->buf->valid); - if (dev_ptr_stale(ca, ptr)) { + int stale = dev_ptr_stale(ca, ptr); + if (stale) { bch_err_ratelimited(ca->fs, - "error %s stripe: stale pointer after io", - bio_data_dir(bio) == READ ? "reading from" : "writing to"); + "error %s stripe: stale/invalid pointer (%i) after io", + bio_data_dir(bio) == READ ? "reading from" : "writing to", + stale); clear_bit(ec_bio->idx, ec_bio->buf->valid); } @@ -734,10 +746,12 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, return; } - if (dev_ptr_stale(ca, ptr)) { + int stale = dev_ptr_stale(ca, ptr); + if (stale) { bch_err_ratelimited(c, - "error %s stripe: stale pointer", - rw == READ ? "reading from" : "writing to"); + "error %s stripe: stale pointer (%i)", + rw == READ ? "reading from" : "writing to", + stale); clear_bit(idx, buf->valid); return; } diff --git a/libbcachefs/error.c b/libbcachefs/error.c index 285b7251..9d7cc79e 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -224,6 +224,8 @@ int __bch2_fsck_err(struct bch_fs *c, int ret = -BCH_ERR_fsck_ignore; const char *action_orig = "fix?", *action = action_orig; + might_sleep(); + if (!c) c = trans->c; diff --git a/libbcachefs/error.h b/libbcachefs/error.h index 157ec3fa..ead36936 100644 --- a/libbcachefs/error.h +++ b/libbcachefs/error.h @@ -147,6 +147,8 @@ void bch2_flush_fsck_errs(struct bch_fs *); #define __fsck_err_on(cond, c, _flags, _err_type, ...) \ ({ \ + might_sleep(); \ + \ if (type_is(c, struct bch_fs *)) \ WARN_ON(bch2_current_has_btree_trans((struct bch_fs *) c));\ \ diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 46903792..410b8bd8 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -137,7 +137,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); - if (p.ptr.cached && (!ca || dev_ptr_stale(ca, &p.ptr))) + if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) continue; f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; @@ -999,7 +999,7 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) bch2_bkey_drop_ptrs(k, ptr, ptr->cached && (ca = bch2_dev_rcu(c, ptr->dev)) && - dev_ptr_stale_rcu(ca, ptr)); + dev_ptr_stale_rcu(ca, ptr) > 0); rcu_read_unlock(); return bkey_deleted(k.k); @@ -1024,8 +1024,11 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc prt_str(out, " cached"); if (ptr->unwritten) prt_str(out, " unwritten"); - if (bucket_valid(ca, b) && dev_ptr_stale_rcu(ca, ptr)) + int stale = dev_ptr_stale_rcu(ca, ptr); + if (stale > 0) prt_printf(out, " stale"); + else if (stale) + prt_printf(out, " invalid"); } rcu_read_unlock(); --out->atomic; diff --git a/libbcachefs/eytzinger.h b/libbcachefs/eytzinger.h index 24840aee..c6e078f2 100644 --- a/libbcachefs/eytzinger.h +++ b/libbcachefs/eytzinger.h @@ -284,6 +284,17 @@ static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size, return eytzinger0_next(idx, nr); } +static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size, + cmp_func_t cmp, const void *search) +{ + ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); + + if (idx < nr && !cmp(base + idx * size, search)) + return idx; + + return eytzinger0_next(idx, nr); +} + #define eytzinger0_find(base, nr, size, _cmp, search) \ ({ \ void *_base = (base); \ diff --git a/libbcachefs/fs-common.h b/libbcachefs/fs-common.h index 2064ef5b..dde23785 100644 --- a/libbcachefs/fs-common.h +++ b/libbcachefs/fs-common.h @@ -2,7 +2,6 @@ #ifndef _BCACHEFS_FS_COMMON_H #define _BCACHEFS_FS_COMMON_H -#include "libbcachefs/dirent.h" struct posix_acl; #define BCH_CREATE_TMPFILE (1U << 0) diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c index 54873ecc..1355d618 100644 --- a/libbcachefs/fs-io-buffered.c +++ b/libbcachefs/fs-io-buffered.c @@ -678,8 +678,8 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, bch2_pagecache_add_get(inode); folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, - FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, - mapping_gfp_mask(mapping)); + FGP_WRITEBEGIN | fgf_set_order(len), + mapping_gfp_mask(mapping)); if (IS_ERR_OR_NULL(folio)) goto err_unlock; @@ -820,9 +820,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, darray_init(&fs); ret = bch2_filemap_get_contig_folios_d(mapping, pos, end, - FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT, - mapping_gfp_mask(mapping), - &fs); + FGP_WRITEBEGIN | fgf_set_order(len), + mapping_gfp_mask(mapping), &fs); if (ret) goto out; diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index 23427a87..aea8132d 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -372,8 +372,8 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) return ret; } -static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, - struct bch_ioctl_subvolume arg) +static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) { struct inode *dir; struct bch_inode_info *inode; @@ -470,9 +470,12 @@ retry: !arg.src_ptr) snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol; + down_write(&c->snapshot_create_lock); inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir), dst_dentry, arg.mode|S_IFDIR, 0, snapshot_src, create_flags); + up_write(&c->snapshot_create_lock); + error = PTR_ERR_OR_ZERO(inode); if (error) goto err3; @@ -493,16 +496,6 @@ err1: return error; } -static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, - struct bch_ioctl_subvolume arg) -{ - down_write(&c->snapshot_create_lock); - long ret = __bch2_ioctl_subvolume_create(c, filp, arg); - up_write(&c->snapshot_create_lock); - - return ret; -} - static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, struct bch_ioctl_subvolume arg) { diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 514bf83e..4a3e9f42 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -58,9 +58,7 @@ void bch2_inode_update_after_write(struct btree_trans *trans, BUG_ON(bi->bi_inum != inode->v.i_ino); - bch2_assert_pos_locked(trans, BTREE_ID_inodes, - POS(0, bi->bi_inum), - c->opts.inodes_use_key_cache); + bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum)); set_nlink(&inode->v, bch2_inode_nlink_get(bi)); i_uid_write(&inode->v, bi->bi_uid); @@ -229,7 +227,9 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c) mutex_init(&inode->ei_update_lock); two_state_lock_init(&inode->ei_pagecache_lock); INIT_LIST_HEAD(&inode->ei_vfs_inode_list); + inode->ei_flags = 0; mutex_init(&inode->ei_quota_lock); + memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); inode->v.i_state = 0; if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) { @@ -1953,6 +1953,7 @@ got_sb: sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); sb->s_uuid = c->sb.user_uuid; + sb->s_shrink->seeks = 0; c->vfs_sb = sb; strscpy(sb->s_id, c->name, sizeof(sb->s_id)); diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 037b64c1..cc4f0963 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -1728,6 +1728,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) trans_was_restarted(trans, restart_count); } +noinline_for_stack static int check_dirent_inode_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c_dirent d, @@ -1824,6 +1825,7 @@ out_noiter: return ret; } +noinline_for_stack static int check_dirent_target(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c_dirent d, @@ -1898,6 +1900,7 @@ found: return ret; } +noinline_for_stack static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c_dirent d) { diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index b62111bf..679f5f5e 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -5,7 +5,6 @@ #include "bkey.h" #include "bkey_methods.h" #include "opts.h" -#include "subvolume_types.h" enum bch_validate_flags; extern const char * const bch2_inode_opts[]; diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c index f5748679..c97fa700 100644 --- a/libbcachefs/io_read.c +++ b/libbcachefs/io_read.c @@ -84,9 +84,10 @@ struct promote_op { }; static const struct rhashtable_params bch_promote_params = { - .head_offset = offsetof(struct promote_op, hash), - .key_offset = offsetof(struct promote_op, pos), - .key_len = sizeof(struct bpos), + .head_offset = offsetof(struct promote_op, hash), + .key_offset = offsetof(struct promote_op, pos), + .key_len = sizeof(struct bpos), + .automatic_shrinking = true, }; static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, @@ -776,18 +777,32 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, PTR_BUCKET_POS(ca, &ptr), BTREE_ITER_cached); - prt_printf(&buf, "Attempting to read from stale dirty pointer:\n"); - printbuf_indent_add(&buf, 2); + u8 *gen = bucket_gen(ca, iter.pos.offset); + if (gen) { - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); + prt_printf(&buf, "Attempting to read from stale dirty pointer:\n"); + printbuf_indent_add(&buf, 2); - prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); - - ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); - if (!ret) { - prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + + prt_printf(&buf, "memory gen: %u", *gen); + + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + if (!ret) { + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, k); + } + } else { + prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n", + iter.pos.inode, iter.pos.offset); + printbuf_indent_add(&buf, 2); + + prt_printf(&buf, "first bucket %u nbuckets %llu\n", + ca->mi.first_bucket, ca->mi.nbuckets); + + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); } bch2_fs_inconsistent(c, "%s", buf.buf); diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c index db9cda7b..c6197e6a 100644 --- a/libbcachefs/io_write.c +++ b/libbcachefs/io_write.c @@ -1219,7 +1219,7 @@ static void bch2_nocow_write(struct bch_write_op *op) DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets; u32 snapshot; struct bucket_to_lock *stale_at; - int ret; + int stale, ret; if (op->flags & BCH_WRITE_MOVE) return; @@ -1298,7 +1298,8 @@ retry: BUCKET_NOCOW_LOCK_UPDATE); rcu_read_lock(); - bool stale = gen_after(*bucket_gen(ca, i->b.offset), i->gen); + u8 *gen = bucket_gen(ca, i->b.offset); + stale = !gen ? -1 : gen_after(*gen, i->gen); rcu_read_unlock(); if (unlikely(stale)) { @@ -1379,8 +1380,18 @@ err_bucket_stale: break; } - /* We can retry this: */ - ret = -BCH_ERR_transaction_restart; + struct printbuf buf = PRINTBUF; + if (bch2_fs_inconsistent_on(stale < 0, c, + "pointer to invalid bucket in nocow path on device %llu\n %s", + stale_at->b.inode, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = -EIO; + } else { + /* We can retry this: */ + ret = -BCH_ERR_transaction_restart; + } + printbuf_exit(&buf); + goto err_get_ioref; } diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index b9577161..d5a9f3ad 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -1166,6 +1166,9 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) void bch2_fs_journal_stop(struct journal *j) { + if (!test_bit(JOURNAL_running, &j->flags)) + return; + bch2_journal_reclaim_stop(j); bch2_journal_flush_all_pins(j); diff --git a/libbcachefs/lru.h b/libbcachefs/lru.h index fb11ab0d..bd71ba77 100644 --- a/libbcachefs/lru.h +++ b/libbcachefs/lru.h @@ -2,9 +2,6 @@ #ifndef _BCACHEFS_LRU_H #define _BCACHEFS_LRU_H -#define LRU_TIME_BITS 48 -#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) - static inline u64 lru_pos_id(struct bpos pos) { return pos.inode >> LRU_TIME_BITS; diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 10bfb31c..eb49dd04 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -35,9 +35,10 @@ struct buckets_in_flight { }; static const struct rhashtable_params bch_move_bucket_params = { - .head_offset = offsetof(struct move_bucket_in_flight, hash), - .key_offset = offsetof(struct move_bucket_in_flight, bucket.k), - .key_len = sizeof(struct move_bucket_key), + .head_offset = offsetof(struct move_bucket_in_flight, hash), + .key_offset = offsetof(struct move_bucket_in_flight, bucket.k), + .key_len = sizeof(struct move_bucket_key), + .automatic_shrinking = true, }; static struct move_bucket_in_flight * diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 3ebffd04..097ef7d1 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -398,6 +398,12 @@ static int journal_replay_entry_early(struct bch_fs *c, case BCH_JSET_ENTRY_btree_root: { struct btree_root *r; + if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX, + c, invalid_btree_id, + "invalid btree id %u (max %u)", + entry->btree_id, BTREE_ID_NR_MAX)) + return 0; + while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) { ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL }); if (ret) @@ -452,7 +458,7 @@ static int journal_replay_entry_early(struct bch_fs *c, atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time)); } } - +fsck_err: return ret; } diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index 06f6d48f..10c96cb2 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -420,10 +420,10 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) int bch2_replicas_gc2(struct bch_fs *c) { struct bch_replicas_cpu new = { 0 }; - unsigned i, nr; + unsigned nr; int ret = 0; - bch2_journal_meta(&c->journal); + bch2_accounting_mem_gc(c); retry: nr = READ_ONCE(c->replicas.nr); new.entry_size = READ_ONCE(c->replicas.entry_size); @@ -444,7 +444,7 @@ retry: goto retry; } - for (i = 0; i < c->replicas.nr; i++) { + for (unsigned i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); @@ -454,10 +454,13 @@ retry: memcpy(&k.replicas, e, replicas_entry_bytes(e)); - u64 v = 0; - bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&k), &v, 1); + struct bpos p = disk_accounting_pos_to_bpos(&k); - if (e->data_type == BCH_DATA_journal || v) + struct bch_accounting_mem *acc = &c->accounting; + bool kill = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &p) >= acc->k.nr; + + if (e->data_type == BCH_DATA_journal || !kill) memcpy(cpu_replicas_entry(&new, new.nr++), e, new.entry_size); } diff --git a/libbcachefs/sb-downgrade.c b/libbcachefs/sb-downgrade.c index 739919e2..511e80b3 100644 --- a/libbcachefs/sb-downgrade.c +++ b/libbcachefs/sb-downgrade.c @@ -303,7 +303,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c) dst = (void *) &darray_top(table); dst->version = cpu_to_le16(src->version); - dst->recovery_passes[0] = cpu_to_le64(src->recovery_passes); + dst->recovery_passes[0] = cpu_to_le64(bch2_recovery_passes_to_stable(src->recovery_passes)); dst->recovery_passes[1] = 0; dst->nr_errors = cpu_to_le16(src->nr_errors); for (unsigned i = 0; i < src->nr_errors; i++) diff --git a/libbcachefs/sb-errors_format.h b/libbcachefs/sb-errors_format.h index ced43190..9dd2b7ae 100644 --- a/libbcachefs/sb-errors_format.h +++ b/libbcachefs/sb-errors_format.h @@ -276,7 +276,9 @@ x(subvol_inode_bad, 270) \ x(alloc_key_stripe_sectors_wrong, 271) \ x(accounting_mismatch, 272) \ - x(accounting_replicas_not_marked, 273) + x(accounting_replicas_not_marked, 273) \ + x(invalid_btree_id, 274) \ + x(alloc_key_io_time_bad, 275) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, diff --git a/libbcachefs/snapshot.c b/libbcachefs/snapshot.c index 169fd458..fa7ad586 100644 --- a/libbcachefs/snapshot.c +++ b/libbcachefs/snapshot.c @@ -1567,13 +1567,6 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) return 0; - if (!test_bit(BCH_FS_started, &c->flags)) { - ret = bch2_fs_read_write_early(c); - bch_err_msg(c, ret, "deleting dead snapshots: error going rw"); - if (ret) - return ret; - } - trans = bch2_trans_get(c); /* diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 055478d2..b156fc85 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -649,9 +649,10 @@ reread: bytes = vstruct_bytes(sb->sb); - if (bytes > 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits)) { - prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", - bytes, 512UL << sb->sb->layout.sb_max_size_bits); + u64 sb_size = 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits); + if (bytes > sb_size) { + prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %llu)", + bytes, sb_size); return -BCH_ERR_invalid_sb_too_big; } diff --git a/libbcachefs/super.c b/libbcachefs/super.c index a7f0245f..bfdec48e 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -583,8 +583,10 @@ static void __bch2_fs_free(struct bch_fs *c) if (c->write_ref_wq) destroy_workqueue(c->write_ref_wq); - if (c->io_complete_wq) - destroy_workqueue(c->io_complete_wq); + if (c->btree_write_submit_wq) + destroy_workqueue(c->btree_write_submit_wq); + if (c->btree_read_complete_wq) + destroy_workqueue(c->btree_read_complete_wq); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); if (c->btree_io_complete_wq) @@ -763,6 +765,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) init_waitqueue_head(&c->ro_ref_wait); sema_init(&c->online_fsck_mutex, 1); + init_rwsem(&c->gc_lock); mutex_init(&c->gc_gens_lock); atomic_set(&c->journal_keys.ref, 1); c->journal_keys.initial_ref_held = true; @@ -876,8 +879,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || - !(c->io_complete_wq = alloc_workqueue("bcachefs_io", + !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || + !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", WQ_FREEZABLE, 0)) || #ifndef BCH_WRITE_REF_DEBUG @@ -906,9 +911,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_io_clock_init(&c->io_clock[READ]) ?: bch2_io_clock_init(&c->io_clock[WRITE]) ?: bch2_fs_journal_init(&c->journal) ?: + bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_cache_init(c) ?: bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: - bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_interior_update_init(c) ?: bch2_fs_buckets_waiting_for_journal_init(c) ?: bch2_fs_btree_write_buffer_init(c) ?: diff --git a/libbcachefs/util.h b/libbcachefs/util.h index f4dd09c4..76ffe08e 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -697,14 +697,19 @@ do { \ } \ } while (0) +#define per_cpu_sum(_p) \ +({ \ + typeof(*_p) _ret = 0; \ + \ + int cpu; \ + for_each_possible_cpu(cpu) \ + _ret += *per_cpu_ptr(_p, cpu); \ + _ret; \ +}) + static inline u64 percpu_u64_get(u64 __percpu *src) { - u64 ret = 0; - int cpu; - - for_each_possible_cpu(cpu) - ret += *per_cpu_ptr(src, cpu); - return ret; + return per_cpu_sum(src); } static inline void percpu_u64_set(u64 __percpu *dst, u64 src)