From 9fce394ca6d0082ced3612a627cd16e06d84244a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 18 Jan 2021 23:38:05 -0500 Subject: [PATCH] Update bcachefs sources to 313b24b652 bcachefs: Fix an assertion --- .bcachefs_revision | 2 +- libbcachefs/alloc_background.c | 123 +++++++--------- libbcachefs/alloc_background.h | 1 - libbcachefs/alloc_foreground.c | 40 ++--- libbcachefs/bkey.h | 5 + libbcachefs/btree_gc.c | 102 ++++++------- libbcachefs/btree_gc.h | 2 +- libbcachefs/btree_io.c | 17 --- libbcachefs/btree_io.h | 1 - libbcachefs/btree_update_leaf.c | 26 +++- libbcachefs/buckets.c | 253 +++++++++++++++++++++++--------- libbcachefs/buckets.h | 26 ++-- libbcachefs/buckets_types.h | 13 +- libbcachefs/chardev.c | 6 +- libbcachefs/ec.c | 219 ++++++++++++++++----------- libbcachefs/ec.h | 6 +- libbcachefs/ec_types.h | 3 +- libbcachefs/extent_update.c | 15 +- libbcachefs/fs-io.c | 30 +++- libbcachefs/io.c | 3 - libbcachefs/journal.c | 21 ++- libbcachefs/journal.h | 5 - libbcachefs/journal_io.c | 109 ++++++++------ libbcachefs/journal_types.h | 1 + libbcachefs/move.c | 2 +- libbcachefs/movinggc.c | 2 +- libbcachefs/recovery.c | 18 +-- libbcachefs/replicas.c | 16 +- libbcachefs/super.c | 28 ++-- libbcachefs/sysfs.c | 91 +++++------- 30 files changed, 665 insertions(+), 521 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index 79c81a65..ee5b7e5c 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -7d57e9b703cf8bda52c3894b5a18e74329914823 +313b24b652d521c6ba4a965f7033c73575923a91 diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 60c2c38b..896ec023 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -54,10 +54,10 @@ static void pd_controllers_update(struct work_struct *work) * reclaimed by copy GC */ fragmented += max_t(s64, 0, (bucket_to_sector(ca, - stats.buckets[BCH_DATA_user] + - stats.buckets[BCH_DATA_cached]) - - (stats.sectors[BCH_DATA_user] + - stats.sectors[BCH_DATA_cached])) << 9); + stats.d[BCH_DATA_user].buckets + + stats.d[BCH_DATA_cached].buckets) - + (stats.d[BCH_DATA_user].sectors + + stats.d[BCH_DATA_cached].sectors)) << 9); } bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1); @@ -217,7 +217,7 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, return 0; ca = bch_dev_bkey_exists(c, k.k->p.inode); - g = __bucket(ca, k.k->p.offset, 0); + g = bucket(ca, k.k->p.offset); u = bch2_alloc_unpack(k); g->_mark.gen = u.gen; @@ -278,7 +278,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bkey_s_c k; struct bch_dev *ca; - struct bucket_array *ba; struct bucket *g; struct bucket_mark m; struct bkey_alloc_unpacked old_u, new_u; @@ -302,9 +301,7 @@ retry: percpu_down_read(&c->mark_lock); ca = bch_dev_bkey_exists(c, iter->pos.inode); - ba = bucket_array(ca); - - g = &ba->b[iter->pos.offset]; + g = bucket(ca, iter->pos.offset); m = READ_ONCE(g->mark); new_u = alloc_mem_to_key(g, m); percpu_up_read(&c->mark_lock); @@ -326,54 +323,36 @@ err: return ret; } -int bch2_dev_alloc_write(struct bch_fs *c, struct bch_dev *ca, unsigned flags) +int bch2_alloc_write(struct bch_fs *c, unsigned flags) { struct btree_trans trans; struct btree_iter *iter; - u64 first_bucket, nbuckets; - int ret = 0; - - percpu_down_read(&c->mark_lock); - first_bucket = bucket_array(ca)->first_bucket; - nbuckets = bucket_array(ca)->nbuckets; - percpu_up_read(&c->mark_lock); - - BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); - - bch2_trans_init(&trans, c, 
BTREE_ITER_MAX, 0); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, - POS(ca->dev_idx, first_bucket), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - while (iter->pos.offset < nbuckets) { - bch2_trans_cond_resched(&trans); - - ret = bch2_alloc_write_key(&trans, iter, flags); - if (ret) - break; - bch2_btree_iter_next_slot(iter); - } - - bch2_trans_exit(&trans); - - return ret; -} - -int bch2_alloc_write(struct bch_fs *c, unsigned flags) -{ struct bch_dev *ca; unsigned i; int ret = 0; + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + for_each_member_device(ca, c, i) { - bch2_dev_alloc_write(c, ca, flags); - if (ret) { - percpu_ref_put(&ca->io_ref); - break; + bch2_btree_iter_set_pos(iter, + POS(ca->dev_idx, ca->mi.first_bucket)); + + while (iter->pos.offset < ca->mi.nbuckets) { + bch2_trans_cond_resched(&trans); + + ret = bch2_alloc_write_key(&trans, iter, flags); + if (ret) { + percpu_ref_put(&ca->io_ref); + goto err; + } + bch2_btree_iter_next_slot(iter); } } - +err: + bch2_trans_exit(&trans); return ret; } @@ -552,7 +531,8 @@ out: static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) { unsigned long gc_count = c->gc_count; - u64 available; + s64 available; + unsigned i; int ret = 0; ca->allocator_state = ALLOCATOR_BLOCKED; @@ -568,8 +548,15 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) if (gc_count != c->gc_count) ca->inc_gen_really_needs_gc = 0; - available = max_t(s64, 0, dev_buckets_available(ca) - - ca->inc_gen_really_needs_gc); + available = dev_buckets_available(ca); + available -= ca->inc_gen_really_needs_gc; + + spin_lock(&c->freelist_lock); + for (i = 0; i < RESERVE_NR; i++) + available -= fifo_used(&ca->free[i]); + spin_unlock(&c->freelist_lock); + + available = max(available, 0LL); if (available > fifo_free(&ca->free_inc) || (available && @@ -598,6 +585,9 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, if (!is_available_bucket(mark)) return false; + if (mark.owned_by_allocator) + return false; + if (ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse)) return false; @@ -894,33 +884,32 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, /* first, put on free_inc and mark as owned by allocator: */ percpu_down_read(&c->mark_lock); - spin_lock(&c->freelist_lock); - - verify_not_on_freelist(c, ca, b); - - BUG_ON(!fifo_push(&ca->free_inc, b)); - g = bucket(ca, b); m = READ_ONCE(g->mark); - invalidating_cached_data = m.cached_sectors != 0; + BUG_ON(m.dirty_sectors); + + bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); + + spin_lock(&c->freelist_lock); + verify_not_on_freelist(c, ca, b); + BUG_ON(!fifo_push(&ca->free_inc, b)); + spin_unlock(&c->freelist_lock); /* * If we're not invalidating cached data, we only increment the bucket * gen in memory here, the incremented gen will be updated in the btree * by bch2_trans_mark_pointer(): */ - - if (!invalidating_cached_data) - bch2_invalidate_bucket(c, ca, b, &m); - else - bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); - - spin_unlock(&c->freelist_lock); - percpu_up_read(&c->mark_lock); - - if (!invalidating_cached_data) + if (!m.cached_sectors && + !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) { + BUG_ON(m.data_type); + bucket_cmpxchg(g, m, m.gen++); + percpu_up_read(&c->mark_lock); goto out; + } + + percpu_up_read(&c->mark_lock); /* * If the read-only path is trying to shut down, we can't be generating 
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index d10ff56e..f60fcebf 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -98,7 +98,6 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_stop(struct bch_dev *); int bch2_dev_allocator_start(struct bch_dev *); -int bch2_dev_alloc_write(struct bch_fs *, struct bch_dev *, unsigned); int bch2_alloc_write(struct bch_fs *, unsigned); void bch2_fs_allocator_background_init(struct bch_fs *); diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index dcbe0404..8f0b94f5 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -192,8 +192,9 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) rcu_read_lock(); buckets = bucket_array(ca); - for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) - if (is_available_bucket(buckets->b[b].mark)) + for (b = buckets->first_bucket; b < buckets->nbuckets; b++) + if (is_available_bucket(buckets->b[b].mark) && + !buckets->b[b].mark.owned_by_allocator) goto success; b = -1; success: @@ -224,9 +225,8 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, bool may_alloc_partial, struct closure *cl) { - struct bucket_array *buckets; struct open_bucket *ob; - long bucket = 0; + long b = 0; spin_lock(&c->freelist_lock); @@ -260,13 +260,13 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, return ERR_PTR(-OPEN_BUCKETS_EMPTY); } - if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket))) + if (likely(fifo_pop(&ca->free[RESERVE_NONE], b))) goto out; switch (reserve) { case RESERVE_BTREE_MOVINGGC: case RESERVE_MOVINGGC: - if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket)) + if (fifo_pop(&ca->free[RESERVE_MOVINGGC], b)) goto out; break; default: @@ -284,20 +284,19 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, trace_bucket_alloc_fail(ca, reserve); return ERR_PTR(-FREELIST_EMPTY); out: - verify_not_on_freelist(c, ca, bucket); + verify_not_on_freelist(c, ca, b); ob = bch2_open_bucket_alloc(c); spin_lock(&ob->lock); - buckets = bucket_array(ca); ob->valid = true; ob->sectors_free = ca->mi.bucket_size; ob->alloc_reserve = reserve; ob->ptr = (struct bch_extent_ptr) { .type = 1 << BCH_EXTENT_ENTRY_ptr, - .gen = buckets->b[bucket].mark.gen, - .offset = bucket_to_sector(ca, bucket), + .gen = bucket(ca, b)->mark.gen, + .offset = bucket_to_sector(ca, b), .dev = ca->dev_idx, }; @@ -489,16 +488,20 @@ bucket_alloc_from_stripe(struct bch_fs *c, devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); for (i = 0; i < devs_sorted.nr; i++) - open_bucket_for_each(c, &h->s->blocks, ob, ec_idx) + for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { + if (!h->s->blocks[ec_idx]) + continue; + + ob = c->open_buckets + h->s->blocks[ec_idx]; if (ob->ptr.dev == devs_sorted.devs[i] && - !test_and_set_bit(h->s->data_block_idx[ec_idx], - h->s->blocks_allocated)) + !test_and_set_bit(ec_idx, h->s->blocks_allocated)) goto got_bucket; + } goto out_put_head; got_bucket: ca = bch_dev_bkey_exists(c, ob->ptr.dev); - ob->ec_idx = h->s->data_block_idx[ec_idx]; + ob->ec_idx = ec_idx; ob->ec = h->s; add_new_bucket(c, ptrs, devs_may_alloc, @@ -636,10 +639,13 @@ void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, if (!drop && ob->ec) { mutex_lock(&ob->ec->lock); - open_bucket_for_each(c, &ob->ec->blocks, ob2, j) - drop |= ob2->ptr.dev == ca->dev_idx; - open_bucket_for_each(c, &ob->ec->parity, ob2, j) + 
for (j = 0; j < ob->ec->new_stripe.key.v.nr_blocks; j++) { + if (!ob->ec->blocks[j]) + continue; + + ob2 = c->open_buckets + ob->ec->blocks[j]; drop |= ob2->ptr.dev == ca->dev_idx; + } mutex_unlock(&ob->ec->lock); } diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index 2d2c6403..2c3b73a6 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -170,6 +170,11 @@ static inline struct bpos bpos_min(struct bpos l, struct bpos r) return bkey_cmp(l, r) < 0 ? l : r; } +static inline struct bpos bpos_max(struct bpos l, struct bpos r) +{ + return bkey_cmp(l, r) > 0 ? l : r; +} + void bch2_bpos_swab(struct bpos *); void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index d0635a08..efeaec3d 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -205,13 +205,12 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, } static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, - bool initial, bool metadata_only) + bool initial) { struct btree_trans trans; struct btree_iter *iter; struct btree *b; - unsigned depth = metadata_only ? 1 - : bch2_expensive_debug_checks ? 0 + unsigned depth = bch2_expensive_debug_checks ? 0 : !btree_node_type_needs_gc(btree_id) ? 1 : 0; u8 max_stale = 0; @@ -326,13 +325,11 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, static int bch2_gc_btree_init(struct bch_fs *c, struct journal_keys *journal_keys, - enum btree_id btree_id, - bool metadata_only) + enum btree_id btree_id) { struct btree *b; - unsigned target_depth = metadata_only ? 1 - : bch2_expensive_debug_checks ? 0 - : !btree_node_type_needs_gc(btree_id) ? 1 + unsigned target_depth = bch2_expensive_debug_checks ? 0 + : !btree_node_type_needs_gc(btree_id) ? 1 : 0; u8 max_stale = 0; int ret = 0; @@ -377,7 +374,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) } static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, - bool initial, bool metadata_only) + bool initial) { enum btree_id ids[BTREE_ID_NR]; unsigned i; @@ -390,8 +387,8 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, enum btree_id id = ids[i]; int ret = initial ? bch2_gc_btree_init(c, journal_keys, - id, metadata_only) - : bch2_gc_btree(c, id, initial, metadata_only); + id) + : bch2_gc_btree(c, id, initial); if (ret) return ret; } @@ -558,12 +555,11 @@ static void bch2_gc_free(struct bch_fs *c) } static int bch2_gc_done(struct bch_fs *c, - bool initial, bool metadata_only) + bool initial) { struct bch_dev *ca; - bool verify = !metadata_only && - (!initial || - (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); + bool verify = (!initial || + (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); unsigned i; int ret = 0; @@ -580,10 +576,9 @@ static int bch2_gc_done(struct bch_fs *c, if (verify) \ fsck_err(c, "stripe %zu has wrong "_msg \ ": got %u, should be %u", \ - dst_iter.pos, ##__VA_ARGS__, \ + iter.pos, ##__VA_ARGS__, \ dst->_f, src->_f); \ dst->_f = src->_f; \ - dst->dirty = true; \ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_bucket_field(_f) \ @@ -602,29 +597,32 @@ static int bch2_gc_done(struct bch_fs *c, #define copy_fs_field(_f, _msg, ...) 
\ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) - if (!metadata_only) { - struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); - struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); + { + struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0); struct stripe *dst, *src; - while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) && - (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) { - BUG_ON(src_iter.pos != dst_iter.pos); + while ((src = genradix_iter_peek(&iter, &c->stripes[1]))) { + dst = genradix_ptr_alloc(&c->stripes[0], iter.pos, GFP_KERNEL); - copy_stripe_field(alive, "alive"); - copy_stripe_field(sectors, "sectors"); - copy_stripe_field(algorithm, "algorithm"); - copy_stripe_field(nr_blocks, "nr_blocks"); - copy_stripe_field(nr_redundant, "nr_redundant"); - copy_stripe_field(blocks_nonempty, - "blocks_nonempty"); + if (dst->alive != src->alive || + dst->sectors != src->sectors || + dst->algorithm != src->algorithm || + dst->nr_blocks != src->nr_blocks || + dst->nr_redundant != src->nr_redundant) { + bch_err(c, "unexpected stripe inconsistency at bch2_gc_done, confused"); + ret = -EINVAL; + goto fsck_err; + } for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++) copy_stripe_field(block_sectors[i], "block_sectors[%u]", i); - genradix_iter_advance(&dst_iter, &c->stripes[0]); - genradix_iter_advance(&src_iter, &c->stripes[1]); + dst->blocks_nonempty = 0; + for (i = 0; i < dst->nr_blocks; i++) + dst->blocks_nonempty += dst->block_sectors[i] != 0; + + genradix_iter_advance(&iter, &c->stripes[1]); } } @@ -658,28 +656,20 @@ static int bch2_gc_done(struct bch_fs *c, copy_fs_field(hidden, "hidden"); copy_fs_field(btree, "btree"); + copy_fs_field(data, "data"); + copy_fs_field(cached, "cached"); + copy_fs_field(reserved, "reserved"); + copy_fs_field(nr_inodes,"nr_inodes"); - if (!metadata_only) { - copy_fs_field(data, "data"); - copy_fs_field(cached, "cached"); - copy_fs_field(reserved, "reserved"); - copy_fs_field(nr_inodes,"nr_inodes"); - - for (i = 0; i < BCH_REPLICAS_MAX; i++) - copy_fs_field(persistent_reserved[i], - "persistent_reserved[%i]", i); - } + for (i = 0; i < BCH_REPLICAS_MAX; i++) + copy_fs_field(persistent_reserved[i], + "persistent_reserved[%i]", i); for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); char buf[80]; - if (metadata_only && - (e->data_type == BCH_DATA_user || - e->data_type == BCH_DATA_cached)) - continue; - bch2_replicas_entry_to_text(&PBUF(buf), e); copy_fs_field(replicas[i], "%s", buf); @@ -695,8 +685,7 @@ fsck_err: return ret; } -static int bch2_gc_start(struct bch_fs *c, - bool metadata_only) +static int bch2_gc_start(struct bch_fs *c) { struct bch_dev *ca; unsigned i; @@ -760,13 +749,6 @@ static int bch2_gc_start(struct bch_fs *c, d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; d->gen_valid = s->gen_valid; - - if (metadata_only && - (s->mark.data_type == BCH_DATA_user || - s->mark.data_type == BCH_DATA_cached)) { - d->_mark = s->mark; - d->_mark.owned_by_allocator = 0; - } } }; @@ -794,7 +776,7 @@ static int bch2_gc_start(struct bch_fs *c, * uses, GC could skip past them */ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, - bool initial, bool metadata_only) + bool initial) { struct bch_dev *ca; u64 start_time = local_clock(); @@ -810,13 +792,13 @@ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c)); again: - ret = 
bch2_gc_start(c, metadata_only); + ret = bch2_gc_start(c); if (ret) goto out; bch2_mark_superblocks(c); - ret = bch2_gc_btrees(c, journal_keys, initial, metadata_only); + ret = bch2_gc_btrees(c, journal_keys, initial); if (ret) goto out; @@ -855,7 +837,7 @@ out: bch2_journal_block(&c->journal); percpu_down_write(&c->mark_lock); - ret = bch2_gc_done(c, initial, metadata_only); + ret = bch2_gc_done(c, initial); bch2_journal_unblock(&c->journal); } else { diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h index 3694a3df..f0435a58 100644 --- a/libbcachefs/btree_gc.h +++ b/libbcachefs/btree_gc.h @@ -7,7 +7,7 @@ void bch2_coalesce(struct bch_fs *); struct journal_keys; -int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool); +int bch2_gc(struct bch_fs *, struct journal_keys *, bool); int bch2_gc_gens(struct bch_fs *); void bch2_gc_thread_stop(struct bch_fs *); int bch2_gc_thread_start(struct bch_fs *); diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index b94f0807..65f7e366 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1828,23 +1828,6 @@ void bch2_btree_flush_all_writes(struct bch_fs *c) __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); } -void bch2_btree_verify_flushed(struct bch_fs *c) -{ - struct bucket_table *tbl; - struct rhash_head *pos; - struct btree *b; - unsigned i; - - rcu_read_lock(); - for_each_cached_btree(b, c, tbl, i, pos) { - unsigned long flags = READ_ONCE(b->flags); - - BUG_ON((flags & (1 << BTREE_NODE_dirty)) || - (flags & (1 << BTREE_NODE_write_in_flight))); - } - rcu_read_unlock(); -} - void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c) { struct bucket_table *tbl; diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index 1a4b11e9..3b61555e 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -185,7 +185,6 @@ do { \ void bch2_btree_flush_all_reads(struct bch_fs *); void bch2_btree_flush_all_writes(struct bch_fs *); -void bch2_btree_verify_flushed(struct bch_fs *); void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *); static inline void compat_bformat(unsigned level, enum btree_id btree_id, diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index c490df47..967e1e4d 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -836,7 +836,7 @@ int __bch2_trans_commit(struct btree_trans *trans) int ret = 0; if (!trans->nr_updates) - goto out_noupdates; + goto out_reset; if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) lockdep_assert_held(&trans->c->gc_lock); @@ -850,7 +850,7 @@ int __bch2_trans_commit(struct btree_trans *trans) unlikely(!percpu_ref_tryget(&trans->c->writes))) { ret = bch2_trans_commit_get_rw_cold(trans); if (ret) - return ret; + goto out_reset; } #ifdef CONFIG_BCACHEFS_DEBUG @@ -962,7 +962,7 @@ out: if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) percpu_ref_put(&trans->c->writes); -out_noupdates: +out_reset: bch2_trans_reset(trans, !ret ? TRANS_RESET_NOTRAVERSE : 0); return ret; @@ -981,10 +981,22 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, .trigger_flags = flags, .iter = iter, .k = k }; - EBUG_ON(bkey_cmp(iter->pos, - (iter->flags & BTREE_ITER_IS_EXTENTS) - ? bkey_start_pos(&k->k) - : k->k.p)); +#ifdef CONFIG_BCACHEFS_DEBUG + BUG_ON(bkey_cmp(iter->pos, + (iter->flags & BTREE_ITER_IS_EXTENTS) + ? bkey_start_pos(&k->k) + : k->k.p)); + + trans_for_each_update(trans, i) { + BUG_ON(bkey_cmp(i->iter->pos, + (i->iter->flags & BTREE_ITER_IS_EXTENTS) + ? 
bkey_start_pos(&i->k->k) + : i->k->k.p)); + + BUG_ON(i != trans->updates && + btree_iter_pos_cmp(i[-1].iter, i[0].iter) >= 0); + } +#endif iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index ed07dfee..cb0f0e09 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -376,15 +376,12 @@ static inline int is_unavailable_bucket(struct bucket_mark m) return !is_available_bucket(m); } -static inline int is_fragmented_bucket(struct bucket_mark m, - struct bch_dev *ca) +static inline int bucket_sectors_fragmented(struct bch_dev *ca, + struct bucket_mark m) { - if (!m.owned_by_allocator && - m.data_type == BCH_DATA_user && - bucket_sectors_used(m)) - return max_t(int, 0, (int) ca->mi.bucket_size - - bucket_sectors_used(m)); - return 0; + return bucket_sectors_used(m) + ? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m)) + : 0; } static inline int is_stripe_data_bucket(struct bucket_mark m) @@ -392,11 +389,6 @@ static inline int is_stripe_data_bucket(struct bucket_mark m) return m.stripe && m.data_type != BCH_DATA_parity; } -static inline int bucket_stripe_sectors(struct bucket_mark m) -{ - return is_stripe_data_bucket(m) ? m.dirty_sectors : 0; -} - static inline enum bch_data_type bucket_type(struct bucket_mark m) { return m.cached_sectors && !m.dirty_sectors @@ -456,7 +448,7 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage, if (type == BCH_DATA_sb || type == BCH_DATA_journal) fs_usage->hidden += size; - dev_usage->buckets[type] += nr; + dev_usage->d[type].buckets += nr; } static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, @@ -481,19 +473,18 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, u->buckets_alloc += (int) new.owned_by_allocator - (int) old.owned_by_allocator; + u->buckets_ec += (int) new.stripe - (int) old.stripe; u->buckets_unavailable += is_unavailable_bucket(new) - is_unavailable_bucket(old); - u->buckets_ec += (int) new.stripe - (int) old.stripe; - u->sectors_ec += bucket_stripe_sectors(new) - - bucket_stripe_sectors(old); - - u->sectors[old.data_type] -= old.dirty_sectors; - u->sectors[new.data_type] += new.dirty_sectors; - u->sectors[BCH_DATA_cached] += + u->d[old.data_type].sectors -= old.dirty_sectors; + u->d[new.data_type].sectors += new.dirty_sectors; + u->d[BCH_DATA_cached].sectors += (int) new.cached_sectors - (int) old.cached_sectors; - u->sectors_fragmented += - is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca); + + u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old); + u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new); + preempt_enable(); if (!is_available_bucket(old) && is_available_bucket(new)) @@ -650,46 +641,6 @@ unwind: ret; \ }) -static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct bucket_mark *ret, - bool gc) -{ - struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); - struct bucket *g = __bucket(ca, b, gc); - struct bucket_mark old, new; - - old = bucket_cmpxchg(g, new, ({ - BUG_ON(!is_available_bucket(new)); - - new.owned_by_allocator = true; - new.data_type = 0; - new.cached_sectors = 0; - new.dirty_sectors = 0; - new.gen++; - })); - - bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); - - if (old.cached_sectors) - update_cached_sectors(c, fs_usage, ca->dev_idx, - -((s64) old.cached_sectors)); - - if (!gc) - *ret = old; - return 0; -} - -void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct 
bucket_mark *old)
-{
-	do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0,
-		   ca, b, old);
-
-	if (!old->owned_by_allocator && old->cached_sectors)
-		trace_invalidate(ca, bucket_to_sector(ca, b),
-				 old->cached_sectors);
-}
-
 static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
 				    size_t b, bool owned_by_allocator,
 				    bool gc)
@@ -1269,9 +1220,15 @@ static int bch2_mark_stripe(struct bch_fs *c,
 		m->blocks_nonempty = 0;
 
 		for (i = 0; i < new_s->nr_blocks; i++) {
-			m->block_sectors[i] =
-				stripe_blockcount_get(new_s, i);
-			m->blocks_nonempty += !!m->block_sectors[i];
+			unsigned s = stripe_blockcount_get(new_s, i);
+
+			/*
+			 * gc recalculates this field from stripe ptr
+			 * references:
+			 */
+			if (!gc)
+				m->block_sectors[i] = s;
+			m->blocks_nonempty += !!s;
 		}
 
 		if (gc && old_s)
@@ -2100,6 +2057,168 @@ int bch2_trans_mark_update(struct btree_trans *trans,
 	return ret;
 }
 
+static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
+					     struct bch_dev *ca, size_t b,
+					     enum bch_data_type type,
+					     unsigned sectors)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter *iter;
+	struct bkey_alloc_unpacked u;
+	struct bkey_i_alloc *a;
+	struct bch_extent_ptr ptr = {
+		.dev = ca->dev_idx,
+		.offset = bucket_to_sector(ca, b),
+	};
+	int ret = 0;
+
+	a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
+	ret = PTR_ERR_OR_ZERO(a);
+	if (ret)
+		return ret;
+
+	ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
+	if (ret)
+		return ret;
+
+	if (u.data_type && u.data_type != type) {
+		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+			"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
+			"while marking %s",
+			iter->pos.inode, iter->pos.offset, u.gen,
+			bch2_data_types[u.data_type],
+			bch2_data_types[type],
+			bch2_data_types[type]);
+		ret = -EIO;
+		goto out;
+	}
+
+	if ((unsigned) (u.dirty_sectors + sectors) > ca->mi.bucket_size) {
+		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+			"bucket %llu:%llu gen %u data type %s sector count overflow: %u + %u > %u\n"
+			"while marking %s",
+			iter->pos.inode, iter->pos.offset, u.gen,
+			bch2_data_types[u.data_type ?: type],
+			u.dirty_sectors, sectors, ca->mi.bucket_size,
+			bch2_data_types[type]);
+		ret = -EIO;
+		goto out;
+	}
+
+	if (u.data_type == type &&
+	    u.dirty_sectors == sectors)
+		goto out;
+
+	u.data_type	= type;
+	u.dirty_sectors	= sectors;
+
+	bkey_alloc_init(&a->k_i);
+	a->k.p = iter->pos;
+	bch2_alloc_pack(a, u);
+	bch2_trans_update(trans, iter, &a->k_i, 0);
+out:
+	bch2_trans_iter_put(trans, iter);
+	return ret;
+}
+
+int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
+				    struct disk_reservation *res,
+				    struct bch_dev *ca, size_t b,
+				    enum bch_data_type type,
+				    unsigned sectors)
+{
+	return __bch2_trans_do(trans, res, NULL, 0,
+			__bch2_trans_mark_metadata_bucket(trans, ca, b,
+							  type, sectors));
+
+}
+
+static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
+					    struct disk_reservation *res,
+					    struct bch_dev *ca,
+					    u64 start, u64 end,
+					    enum bch_data_type type,
+					    u64 *bucket, unsigned *bucket_sectors)
+{
+	int ret = 0;
+
+	do {
+		u64 b = sector_to_bucket(ca, start);
+		unsigned sectors =
+			min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
+
+		if (b != *bucket) {
+			if (*bucket_sectors) {
+				ret = bch2_trans_mark_metadata_bucket(trans, res, ca,
+						*bucket, type, *bucket_sectors);
+				if (ret)
+					return ret;
+			}
+
+			*bucket = b;
+			*bucket_sectors = 0;
+		}
+
+		*bucket_sectors += sectors;
+		start += sectors;
+	} while (!ret && start < end);
+
+	return 0;
+}
+
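+/*
+ * Mark all metadata buckets for @ca in the btree: every superblock in
+ * the layout (plus the sectors before BCH_SB_SECTOR, for the initial
+ * copy) is accounted bucket by bucket as BCH_DATA_sb, and each journal
+ * bucket as BCH_DATA_journal, via bch2_trans_mark_metadata_bucket().
+ */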
+static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, + struct disk_reservation *res, + struct bch_dev *ca) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + u64 bucket = 0; + unsigned i, bucket_sectors = 0; + int ret; + + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + + if (offset == BCH_SB_SECTOR) { + ret = bch2_trans_mark_metadata_sectors(trans, res, ca, + 0, BCH_SB_SECTOR, + BCH_DATA_sb, &bucket, &bucket_sectors); + if (ret) + return ret; + } + + ret = bch2_trans_mark_metadata_sectors(trans, res, ca, offset, + offset + (1 << layout->sb_max_size_bits), + BCH_DATA_sb, &bucket, &bucket_sectors); + if (ret) + return ret; + } + + if (bucket_sectors) { + ret = bch2_trans_mark_metadata_bucket(trans, res, ca, + bucket, BCH_DATA_sb, bucket_sectors); + if (ret) + return ret; + } + + for (i = 0; i < ca->journal.nr; i++) { + ret = bch2_trans_mark_metadata_bucket(trans, res, ca, + ca->journal.buckets[i], + BCH_DATA_journal, ca->mi.bucket_size); + if (ret) + return ret; + } + + return 0; +} + +int bch2_trans_mark_dev_sb(struct bch_fs *c, + struct disk_reservation *res, + struct bch_dev *ca) +{ + return bch2_trans_do(c, res, NULL, 0, + __bch2_trans_mark_dev_sb(&trans, res, ca)); +} + /* Disk reservations: */ void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) @@ -2115,7 +2234,7 @@ void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) #define SECTORS_CACHE 1024 int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, - unsigned sectors, int flags) + u64 sectors, int flags) { struct bch_fs_pcpu *pcpu; u64 old, v, get; diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 3a5ed1fc..37346240 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -153,18 +153,9 @@ static inline unsigned bucket_sectors_used(struct bucket_mark mark) return mark.dirty_sectors + mark.cached_sectors; } -static inline bool bucket_unused(struct bucket_mark mark) -{ - return !mark.owned_by_allocator && - !mark.data_type && - !bucket_sectors_used(mark); -} - static inline bool is_available_bucket(struct bucket_mark mark) { - return (!mark.owned_by_allocator && - !mark.dirty_sectors && - !mark.stripe); + return !mark.dirty_sectors && !mark.stripe; } static inline bool bucket_needs_journal_commit(struct bucket_mark m, @@ -245,8 +236,6 @@ bch2_fs_usage_read_short(struct bch_fs *); void bch2_bucket_seq_cleanup(struct bch_fs *); void bch2_fs_usage_initialize(struct bch_fs *); -void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, - size_t, struct bucket_mark *); void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool, struct gc_pos, unsigned); void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, @@ -270,6 +259,12 @@ int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, struct bkey_i *insert, unsigned); void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); +int bch2_trans_mark_metadata_bucket(struct btree_trans *, + struct disk_reservation *, struct bch_dev *, + size_t, enum bch_data_type, unsigned); +int bch2_trans_mark_dev_sb(struct bch_fs *, struct disk_reservation *, + struct bch_dev *); + /* disk reservations: */ void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); @@ -284,8 +279,8 @@ static inline void bch2_disk_reservation_put(struct bch_fs *c, #define BCH_DISK_RESERVATION_NOFAIL (1 << 0) int bch2_disk_reservation_add(struct bch_fs *, - struct 
disk_reservation *, - unsigned, int); + struct disk_reservation *, + u64, int); static inline struct disk_reservation bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) @@ -302,8 +297,7 @@ bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) static inline int bch2_disk_reservation_get(struct bch_fs *c, struct disk_reservation *res, - unsigned sectors, - unsigned nr_replicas, + u64 sectors, unsigned nr_replicas, int flags) { *res = bch2_disk_reservation_init(c, nr_replicas); diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index d6057d22..5fbe940a 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -52,16 +52,15 @@ struct bucket_array { }; struct bch_dev_usage { - u64 buckets[BCH_DATA_NR]; u64 buckets_alloc; + u64 buckets_ec; u64 buckets_unavailable; - /* _compressed_ sectors: */ - u64 sectors[BCH_DATA_NR]; - u64 sectors_fragmented; - - u64 buckets_ec; - u64 sectors_ec; + struct { + u64 buckets; + u64 sectors; /* _compressed_ sectors: */ + u64 fragmented; + } d[BCH_DATA_NR]; }; struct bch_fs_usage { diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index e7c8969a..49842ec8 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -477,11 +477,11 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; arg.available_buckets = arg.nr_buckets - src.buckets_unavailable; arg.ec_buckets = src.buckets_ec; - arg.ec_sectors = src.sectors_ec; + arg.ec_sectors = 0; for (i = 0; i < BCH_DATA_NR; i++) { - arg.buckets[i] = src.buckets[i]; - arg.sectors[i] = src.sectors[i]; + arg.buckets[i] = src.d[i].buckets; + arg.sectors[i] = src.d[i].sectors; } percpu_ref_put(&ca->ref); diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 75f39e99..9c7cc788 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -684,13 +684,14 @@ static void ec_stripe_delete_work(struct work_struct *work) /* stripe creation: */ static int ec_stripe_bkey_insert(struct bch_fs *c, - struct ec_stripe_new *s, - struct bkey_i_stripe *stripe) + struct bkey_i_stripe *stripe, + struct disk_reservation *res) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - struct bpos start_pos = POS(0, c->ec_stripe_hint); + struct bpos min_pos = POS(0, 1); + struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); int ret; bch2_trans_init(&trans, c, 0, 0); @@ -701,7 +702,7 @@ retry: BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { if (start_pos.offset) { - start_pos = POS_MIN; + start_pos = min_pos; bch2_btree_iter_set_pos(iter, start_pos); continue; } @@ -726,7 +727,7 @@ found_slot: bch2_trans_update(&trans, iter, &stripe->k_i, 0); - ret = bch2_trans_commit(&trans, &s->res, NULL, + ret = bch2_trans_commit(&trans, res, NULL, BTREE_INSERT_NOFAIL); err: bch2_trans_iter_put(&trans, iter); @@ -740,6 +741,47 @@ err: return ret; } +static int ec_stripe_bkey_update(struct btree_trans *trans, + struct bkey_i_stripe *new) +{ + struct bch_fs *c = trans->c; + struct btree_iter *iter; + struct bkey_s_c k; + const struct bch_stripe *existing; + unsigned i; + int ret; + + iter = bch2_trans_get_iter(trans, BTREE_ID_EC, + new->k.p, BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k || k.k->type != KEY_TYPE_stripe) { + bch_err(c, "error updating stripe: not found"); + ret = -ENOENT; + goto err; + } + + existing = bkey_s_c_to_stripe(k).v; + + if (existing->nr_blocks != new->v.nr_blocks) { 
+ bch_err(c, "error updating stripe: nr_blocks does not match"); + ret = -EINVAL; + goto err; + } + + for (i = 0; i < new->v.nr_blocks; i++) + stripe_blockcount_set(&new->v, i, + stripe_blockcount_get(existing, i)); + + bch2_trans_update(trans, iter, &new->k_i, 0); +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + static void extent_stripe_ptr_add(struct bkey_s_extent e, struct ec_stripe_buf *s, struct bch_extent_ptr *ptr, @@ -866,9 +908,6 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (!percpu_ref_tryget(&c->writes)) goto err; - BUG_ON(bitmap_weight(s->blocks_allocated, - s->blocks.nr) != s->blocks.nr); - ec_generate_ec(&s->new_stripe); ec_generate_checksums(&s->new_stripe); @@ -884,9 +923,9 @@ static void ec_stripe_create(struct ec_stripe_new *s) } ret = s->have_existing_stripe - ? bch2_btree_insert(c, BTREE_ID_EC, &s->new_stripe.key.k_i, - &s->res, NULL, BTREE_INSERT_NOFAIL) - : ec_stripe_bkey_insert(c, s, &s->new_stripe.key); + ? bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL, + ec_stripe_bkey_update(&trans, &s->new_stripe.key)) + : ec_stripe_bkey_insert(c, &s->new_stripe.key, &s->res); if (ret) { bch_err(c, "error creating stripe: error creating stripe key"); goto err_put_writes; @@ -902,11 +941,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) spin_lock(&c->ec_stripes_heap_lock); m = genradix_ptr(&c->stripes[0], s->new_stripe.key.k.p.offset); -#if 0 - pr_info("created a %s stripe %llu", - s->have_existing_stripe ? "existing" : "new", - s->stripe.key.k.p.offset); -#endif + BUG_ON(m->on_heap); bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset); spin_unlock(&c->ec_stripes_heap_lock); @@ -915,12 +950,17 @@ err_put_writes: err: bch2_disk_reservation_put(c, &s->res); - open_bucket_for_each(c, &s->blocks, ob, i) { - ob->ec = NULL; - __bch2_open_bucket_put(c, ob); - } + for (i = 0; i < v->nr_blocks; i++) + if (s->blocks[i]) { + ob = c->open_buckets + s->blocks[i]; - bch2_open_buckets_put(c, &s->parity); + if (i < nr_data) { + ob->ec = NULL; + __bch2_open_bucket_put(c, ob); + } else { + bch2_open_bucket_put(c, ob); + } + } bch2_keylist_free(&s->keys, s->inline_keys); @@ -1179,7 +1219,7 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) if (h->s && h->s->allocated && bitmap_weight(h->s->blocks_allocated, - h->s->blocks.nr) == h->s->blocks.nr) + h->s->nr_data) == h->s->nr_data) ec_stripe_set_pending(c, h); mutex_unlock(&h->lock); @@ -1216,64 +1256,82 @@ static enum bucket_alloc_ret new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, struct closure *cl) { - struct bch_devs_mask devs; + struct bch_devs_mask devs = h->devs; struct open_bucket *ob; - unsigned i, nr_have, nr_data = - min_t(unsigned, h->nr_active_devs, - BCH_BKEY_PTRS_MAX) - h->redundancy; + struct open_buckets buckets; + unsigned i, j, nr_have_parity = 0, nr_have_data = 0; bool have_cache = true; enum bucket_alloc_ret ret = ALLOC_SUCCESS; - devs = h->devs; - - for_each_set_bit(i, h->s->blocks_allocated, BCH_BKEY_PTRS_MAX) { - __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d); - --nr_data; + for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { + if (test_bit(i, h->s->blocks_gotten)) { + __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d); + if (i < h->s->nr_data) + nr_have_data++; + else + nr_have_parity++; + } } - BUG_ON(h->s->blocks.nr > nr_data); - BUG_ON(h->s->parity.nr > h->redundancy); - - open_bucket_for_each(c, &h->s->parity, ob, i) - __clear_bit(ob->ptr.dev, devs.d); - open_bucket_for_each(c, &h->s->blocks, ob, i) - 
__clear_bit(ob->ptr.dev, devs.d); + BUG_ON(nr_have_data > h->s->nr_data); + BUG_ON(nr_have_parity > h->s->nr_parity); percpu_down_read(&c->mark_lock); rcu_read_lock(); - if (h->s->parity.nr < h->redundancy) { - nr_have = h->s->parity.nr; - - ret = bch2_bucket_alloc_set(c, &h->s->parity, + buckets.nr = 0; + if (nr_have_parity < h->s->nr_parity) { + ret = bch2_bucket_alloc_set(c, &buckets, &h->parity_stripe, &devs, - h->redundancy, - &nr_have, + h->s->nr_parity, + &nr_have_parity, &have_cache, h->copygc ? RESERVE_MOVINGGC : RESERVE_NONE, 0, cl); + + open_bucket_for_each(c, &buckets, ob, i) { + j = find_next_zero_bit(h->s->blocks_gotten, + h->s->nr_data + h->s->nr_parity, + h->s->nr_data); + BUG_ON(j >= h->s->nr_data + h->s->nr_parity); + + h->s->blocks[j] = buckets.v[i]; + h->s->new_stripe.key.v.ptrs[j] = ob->ptr; + __set_bit(j, h->s->blocks_gotten); + } + if (ret) goto err; } - if (h->s->blocks.nr < nr_data) { - nr_have = h->s->blocks.nr; - - ret = bch2_bucket_alloc_set(c, &h->s->blocks, + buckets.nr = 0; + if (nr_have_data < h->s->nr_data) { + ret = bch2_bucket_alloc_set(c, &buckets, &h->block_stripe, &devs, - nr_data, - &nr_have, + h->s->nr_data, + &nr_have_data, &have_cache, h->copygc ? RESERVE_MOVINGGC : RESERVE_NONE, 0, cl); + + open_bucket_for_each(c, &buckets, ob, i) { + j = find_next_zero_bit(h->s->blocks_gotten, + h->s->nr_data, 0); + BUG_ON(j >= h->s->nr_data); + + h->s->blocks[j] = buckets.v[i]; + h->s->new_stripe.key.v.ptrs[j] = ob->ptr; + __set_bit(j, h->s->blocks_gotten); + } + if (ret) goto err; } @@ -1325,8 +1383,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, struct closure *cl) { struct ec_stripe_head *h; - struct open_bucket *ob; - unsigned i, data_idx = 0; + unsigned i; s64 idx; int ret; @@ -1361,9 +1418,14 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, BUG(); } + BUG_ON(h->s->existing_stripe.size != h->blocksize); + BUG_ON(h->s->existing_stripe.size != h->s->existing_stripe.key.v.sectors); + for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) { - if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) + if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) { + __set_bit(i, h->s->blocks_gotten); __set_bit(i, h->s->blocks_allocated); + } ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); } @@ -1401,20 +1463,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, goto out; } - open_bucket_for_each(c, &h->s->blocks, ob, i) { - data_idx = find_next_zero_bit(h->s->blocks_allocated, - h->s->nr_data, data_idx); - BUG_ON(data_idx >= h->s->nr_data); - - h->s->new_stripe.key.v.ptrs[data_idx] = ob->ptr; - h->s->data_block_idx[i] = data_idx; - data_idx++; - } - - open_bucket_for_each(c, &h->s->parity, ob, i) - h->s->new_stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr; - - //pr_info("new stripe, blocks_allocated %lx", h->s->blocks_allocated[0]); h->s->allocated = true; } out: @@ -1434,12 +1482,14 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) if (!h->s) goto unlock; - open_bucket_for_each(c, &h->s->blocks, ob, i) - if (ob->ptr.dev == ca->dev_idx) - goto found; - open_bucket_for_each(c, &h->s->parity, ob, i) + for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { + if (!h->s->blocks[i]) + continue; + + ob = c->open_buckets + h->s->blocks[i]; if (ob->ptr.dev == ca->dev_idx) goto found; + } goto unlock; found: h->s->err = -EROFS; @@ -1466,7 +1516,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, size_t idx, struct bkey_i_stripe *new_key) { - struct bch_fs *c = 
trans->c; + const struct bch_stripe *v; struct bkey_s_c k; unsigned i; int ret; @@ -1481,16 +1531,17 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, if (k.k->type != KEY_TYPE_stripe) return -EIO; + v = bkey_s_c_to_stripe(k).v; + for (i = 0; i < v->nr_blocks; i++) + if (m->block_sectors[i] != stripe_blockcount_get(v, i)) + goto write; + return 0; +write: bkey_reassemble(&new_key->k_i, k); - spin_lock(&c->ec_stripes_heap_lock); - for (i = 0; i < new_key->v.nr_blocks; i++) stripe_blockcount_set(&new_key->v, i, m->block_sectors[i]); - m->dirty = false; - - spin_unlock(&c->ec_stripes_heap_lock); bch2_trans_update(trans, iter, &new_key->k_i, 0); return 0; @@ -1514,7 +1565,7 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) BTREE_ITER_SLOTS|BTREE_ITER_INTENT); genradix_for_each(&c->stripes[0], giter, m) { - if (!m->dirty) + if (!m->alive) continue; ret = __bch2_trans_do(&trans, NULL, NULL, @@ -1624,19 +1675,17 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) h->target, h->algo, h->redundancy); if (h->s) - pr_buf(out, "\tpending: blocks %u allocated %u\n", - h->s->blocks.nr, + pr_buf(out, "\tpending: blocks %u+%u allocated %u\n", + h->s->nr_data, h->s->nr_parity, bitmap_weight(h->s->blocks_allocated, - h->s->blocks.nr)); + h->s->nr_data)); } mutex_unlock(&c->ec_stripe_head_lock); mutex_lock(&c->ec_stripe_new_lock); list_for_each_entry(s, &c->ec_stripe_new_list, list) { - pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n", - s->blocks.nr, - bitmap_weight(s->blocks_allocated, - s->blocks.nr), + pr_buf(out, "\tin flight: blocks %u+%u pin %u\n", + s->nr_data, s->nr_parity, atomic_read(&s->pin)); } mutex_unlock(&c->ec_stripe_new_lock); diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h index f124582f..765baa9d 100644 --- a/libbcachefs/ec.h +++ b/libbcachefs/ec.h @@ -143,11 +143,9 @@ struct ec_stripe_new { bool pending; bool have_existing_stripe; + unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; - - struct open_buckets blocks; - u8 data_block_idx[BCH_BKEY_PTRS_MAX]; - struct open_buckets parity; + open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX]; struct disk_reservation res; struct keylist keys; diff --git a/libbcachefs/ec_types.h b/libbcachefs/ec_types.h index 5b688b43..84777016 100644 --- a/libbcachefs/ec_types.h +++ b/libbcachefs/ec_types.h @@ -18,8 +18,7 @@ struct stripe { u8 nr_blocks; u8 nr_redundant; - unsigned alive:1; - unsigned dirty:1; + unsigned alive:1; /* does a corresponding key exist in stripes btree? 
*/ unsigned on_heap:1; u8 blocks_nonempty; u16 block_sectors[BCH_BKEY_PTRS_MAX]; diff --git a/libbcachefs/extent_update.c b/libbcachefs/extent_update.c index 1faca4bc..5c43678e 100644 --- a/libbcachefs/extent_update.c +++ b/libbcachefs/extent_update.c @@ -192,18 +192,13 @@ bch2_extent_can_insert(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *insert) { - struct btree_iter_level *l = &iter->l[0]; - struct btree_node_iter node_iter = l->iter; - struct bkey_packed *_k; struct bkey_s_c k; - struct bkey unpacked; - int sectors; + int ret, sectors; - _k = bch2_btree_node_iter_peek(&node_iter, l->b); - if (!_k) - return BTREE_INSERT_OK; - - k = bkey_disassemble(l->b, _k, &unpacked); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + return ret; /* Check if we're splitting a compressed extent: */ diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 959eff4c..af7f8791 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -84,6 +84,7 @@ struct dio_read { struct closure cl; struct kiocb *req; long ret; + bool should_dirty; struct bch_read_bio rbio; }; @@ -1619,12 +1620,22 @@ again: /* O_DIRECT reads */ +static void bio_check_or_release(struct bio *bio, bool check_dirty) +{ + if (check_dirty) { + bio_check_pages_dirty(bio); + } else { + bio_release_pages(bio, false); + bio_put(bio); + } +} + static void bch2_dio_read_complete(struct closure *cl) { struct dio_read *dio = container_of(cl, struct dio_read, cl); dio->req->ki_complete(dio->req, dio->ret, 0); - bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ + bio_check_or_release(&dio->rbio.bio, dio->should_dirty); } static void bch2_direct_IO_read_endio(struct bio *bio) @@ -1639,8 +1650,11 @@ static void bch2_direct_IO_read_endio(struct bio *bio) static void bch2_direct_IO_read_split_endio(struct bio *bio) { + struct dio_read *dio = bio->bi_private; + bool should_dirty = dio->should_dirty; + bch2_direct_IO_read_endio(bio); - bio_check_pages_dirty(bio); /* transfers ownership */ + bio_check_or_release(bio, should_dirty); } static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) @@ -1694,6 +1708,12 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) dio->req = req; dio->ret = ret; + /* + * This is one of the sketchier things I've encountered: we have to skip + * the dirtying of requests that are internal from the kernel (i.e. from + * loopback), because we'll deadlock on page_lock. 
+ */ + dio->should_dirty = iter_is_iovec(iter); goto start; while (iter->count) { @@ -1715,7 +1735,9 @@ start: } offset += bio->bi_iter.bi_size; - bio_set_pages_dirty(bio); + + if (dio->should_dirty) + bio_set_pages_dirty(bio); if (iter->count) closure_get(&dio->cl); @@ -1729,7 +1751,7 @@ start: closure_sync(&dio->cl); closure_debug_destroy(&dio->cl); ret = dio->ret; - bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ + bio_check_or_release(&dio->rbio.bio, dio->should_dirty); return ret; } else { return -EIOCBQUEUED; diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 4c4ba07c..5f74583f 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -499,9 +499,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, n->submit_time = local_clock(); n->bio.bi_iter.bi_sector = ptr->offset; - if (!journal_flushes_device(ca)) - n->bio.bi_opf |= REQ_FUA; - if (likely(n->have_ioref)) { this_cpu_add(ca->io_done->sectors[WRITE][type], bio_sectors(&n->bio)); diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 04c94e57..d6273c8d 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -9,6 +9,7 @@ #include "alloc_foreground.h" #include "bkey_methods.h" #include "btree_gc.h" +#include "btree_update.h" #include "buckets.h" #include "journal.h" #include "journal_io.h" @@ -82,6 +83,7 @@ static void bch2_journal_buf_init(struct journal *j) bkey_extent_init(&buf->key); buf->noflush = false; buf->must_flush = false; + buf->separate_flush = false; memset(buf->has_inode, 0, sizeof(buf->has_inode)); @@ -823,18 +825,28 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (pos <= ja->cur_idx) ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal, - ca->mi.bucket_size, - gc_phase(GC_PHASE_SB), - 0); + if (!c || new_fs) + bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal, + ca->mi.bucket_size, + gc_phase(GC_PHASE_SB), + 0); if (c) { spin_unlock(&c->journal.lock); percpu_up_read(&c->mark_lock); } + if (c && !new_fs) + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_trans_mark_metadata_bucket(&trans, NULL, ca, + bucket, BCH_DATA_journal, + ca->mi.bucket_size)); + if (!new_fs) bch2_open_bucket_put(c, ob); + + if (ret) + goto err; } err: bch2_sb_resize_journal(&ca->disk_sb, @@ -953,6 +965,7 @@ void bch2_fs_journal_stop(struct journal *j) journal_quiesce(j); BUG_ON(!bch2_journal_error(j) && + test_bit(JOURNAL_REPLAY_DONE, &j->flags) && (journal_entry_is_open(j) || j->last_empty_seq + 1 != journal_cur_seq(j))); diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 1db1f190..bda8cb97 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -494,11 +494,6 @@ static inline int bch2_journal_error(struct journal *j) struct bch_dev; -static inline bool journal_flushes_device(struct bch_dev *ca) -{ - return true; -} - static inline void bch2_journal_set_replay_done(struct journal *j) { BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 385cb4d5..750f6fab 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -1189,6 +1189,53 @@ static void journal_write_endio(struct bio *bio) percpu_ref_put(&ca->io_ref); } +static void do_journal_write(struct closure *cl) +{ + struct journal *j = container_of(cl, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + struct journal_buf *w = journal_last_unwritten_buf(j); + struct 
bch_extent_ptr *ptr; + struct bio *bio; + unsigned sectors = vstruct_sectors(w->data, c->block_bits); + + extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { + ca = bch_dev_bkey_exists(c, ptr->dev); + if (!percpu_ref_tryget(&ca->io_ref)) { + /* XXX: fix this */ + bch_err(c, "missing device for journal write\n"); + continue; + } + + this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], + sectors); + + bio = ca->journal.bio; + bio_reset(bio); + bio_set_dev(bio, ca->disk_sb.bdev); + bio->bi_iter.bi_sector = ptr->offset; + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META; + + if (!JSET_NO_FLUSH(w->data)) + bio->bi_opf |= REQ_FUA; + if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) + bio->bi_opf |= REQ_PREFLUSH; + + bch2_bio_map(bio, w->data, sectors << 9); + + trace_journal_write(bio); + closure_bio_submit(bio, cl); + + ca->journal.bucket_seq[ca->journal.cur_idx] = + le64_to_cpu(w->data->seq); + } + + continue_at(cl, journal_write_done, system_highpri_wq); + return; +} + void bch2_journal_write(struct closure *cl) { struct journal *j = container_of(cl, struct journal, io); @@ -1198,9 +1245,8 @@ void bch2_journal_write(struct closure *cl) struct jset_entry *start, *end; struct jset *jset; struct bio *bio; - struct bch_extent_ptr *ptr; bool validate_before_checksum = false; - unsigned i, sectors, bytes, u64s; + unsigned i, sectors, bytes, u64s, nr_rw_members = 0; int ret; BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); @@ -1330,49 +1376,30 @@ retry_alloc: if (c->opts.nochanges) goto no_io; - extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); - if (!percpu_ref_tryget(&ca->io_ref)) { - /* XXX: fix this */ - bch_err(c, "missing device for journal write\n"); - continue; + for_each_rw_member(ca, c, i) + nr_rw_members++; + + if (nr_rw_members > 1) + w->separate_flush = true; + + if (!JSET_NO_FLUSH(jset) && w->separate_flush) { + for_each_rw_member(ca, c, i) { + percpu_ref_get(&ca->io_ref); + + bio = ca->journal.bio; + bio_reset(bio); + bio_set_dev(bio, ca->disk_sb.bdev); + bio->bi_opf = REQ_OP_FLUSH; + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + closure_bio_submit(bio, cl); } - - this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], - sectors); - - bio = ca->journal.bio; - bio_reset(bio); - bio_set_dev(bio, ca->disk_sb.bdev); - bio->bi_iter.bi_sector = ptr->offset; - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META; - if (!JSET_NO_FLUSH(jset)) - bio->bi_opf |= REQ_PREFLUSH|REQ_FUA; - bch2_bio_map(bio, jset, sectors << 9); - - trace_journal_write(bio); - closure_bio_submit(bio, cl); - - ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq); } - if (!JSET_NO_FLUSH(jset)) { - for_each_rw_member(ca, c, i) - if (journal_flushes_device(ca) && - !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) { - percpu_ref_get(&ca->io_ref); + bch2_bucket_seq_cleanup(c); - bio = ca->journal.bio; - bio_reset(bio); - bio_set_dev(bio, ca->disk_sb.bdev); - bio->bi_opf = REQ_OP_FLUSH; - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - closure_bio_submit(bio, cl); - } - } + continue_at(cl, do_journal_write, system_highpri_wq); + return; no_io: bch2_bucket_seq_cleanup(c); diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index 9953663e..d17a1ff8 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -31,6 +31,7 @@ struct journal_buf { unsigned u64s_reserved; bool 
noflush; /* write has already been kicked off, and was noflush */ bool must_flush; /* something wants a flush */ + bool separate_flush; /* bloom filter: */ unsigned long has_inode[1024 / sizeof(unsigned long)]; }; diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 9505eab9..b4c315cf 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -154,7 +154,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) if (ret) goto err; - if (disk_sectors_delta > (s64) &op->res.sectors) { + if (disk_sectors_delta > (s64) op->res.sectors) { ret = bch2_disk_reservation_add(c, &op->res, disk_sectors_delta - op->res.sectors, !should_check_enospc diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index efa7f38e..d0acc1ee 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -291,7 +291,7 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) fragmented_allowed += ((__dev_buckets_available(ca, usage) * ca->mi.bucket_size) >> 1); - fragmented += usage.sectors_fragmented; + fragmented += usage.d[BCH_DATA_user].fragmented; } return max_t(s64, 0, fragmented_allowed - fragmented); diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 8c67f146..422f2fbe 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -1099,27 +1099,13 @@ use_clean: set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); - if ((c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) && - !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA))) { - /* - * interior btree node updates aren't consistent with the - * journal; after an unclean shutdown we have to walk all - * pointers to metadata: - */ - bch_info(c, "starting metadata mark and sweep"); - err = "error in mark and sweep"; - ret = bch2_gc(c, &c->journal_keys, true, true); - if (ret) - goto err; - bch_verbose(c, "mark and sweep done"); - } - if (c->opts.fsck || !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || + !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA)) || test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { bch_info(c, "starting mark and sweep"); err = "error in mark and sweep"; - ret = bch2_gc(c, &c->journal_keys, true, false); + ret = bch2_gc(c, &c->journal_keys, true); if (ret) goto err; bch_verbose(c, "mark and sweep done"); diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index b1d8db67..ce8b7355 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -159,7 +159,7 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old, BUG_ON(!new_entry->data_type); verify_replicas_entry(new_entry); - new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO); + new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL); if (!new.entries) return new; @@ -282,13 +282,13 @@ static int replicas_table_update(struct bch_fs *c, for (i = 0; i < ARRAY_SIZE(new_usage); i++) if (!(new_usage[i] = __alloc_percpu_gfp(bytes, - sizeof(u64), GFP_NOIO))) + sizeof(u64), GFP_KERNEL))) goto err; - if (!(new_base = kzalloc(bytes, GFP_NOIO)) || - !(new_scratch = kmalloc(bytes, GFP_NOIO)) || + if (!(new_base = kzalloc(bytes, GFP_KERNEL)) || + !(new_scratch = kmalloc(bytes, GFP_KERNEL)) || (c->usage_gc && - !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) + !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL)))) goto err; for (i = 0; i < ARRAY_SIZE(new_usage); i++) @@ -548,7 +548,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, c->replicas_gc.entry_size, - GFP_NOIO); + GFP_KERNEL); if (!c->replicas_gc.entries) { 
mutex_unlock(&c->sb_lock); bch_err(c, "error allocating c->replicas_gc"); @@ -671,7 +671,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, nr++; } - cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); + cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); if (!cpu_r->entries) return -ENOMEM; @@ -703,7 +703,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, entry_size += sizeof(struct bch_replicas_entry) - sizeof(struct bch_replicas_entry_v0); - cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); + cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); if (!cpu_r->entries) return -ENOMEM; diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 651fbc5d..00681533 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -235,10 +235,7 @@ nowrote_alloc: * the journal kicks off btree writes via reclaim - wait for in flight * writes after stopping journal: */ - if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) - bch2_btree_flush_all_writes(c); - else - bch2_btree_verify_flushed(c); + bch2_btree_flush_all_writes(c); /* * After stopping journal: @@ -1222,13 +1219,6 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) if (ret) return ret; - if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) && - !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_sb])) { - mutex_lock(&c->sb_lock); - bch2_mark_dev_superblock(ca->fs, ca, 0); - mutex_unlock(&c->sb_lock); - } - bch2_dev_sysfs_online(c, ca); if (c->sb.nr_devices == 1) @@ -1602,7 +1592,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) * allocate the journal, reset all the marks, then remark after we * attach... */ - bch2_mark_dev_superblock(ca->fs, ca, 0); + bch2_mark_dev_superblock(NULL, ca, 0); err = "journal alloc failed"; ret = bch2_dev_journal_alloc(ca); @@ -1661,15 +1651,13 @@ have_slot: ca->disk_sb.sb->dev_idx = dev_idx; bch2_dev_attach(c, ca, dev_idx); - bch2_mark_dev_superblock(c, ca, 0); - bch2_write_super(c); mutex_unlock(&c->sb_lock); - err = "alloc write failed"; - ret = bch2_dev_alloc_write(c, ca, 0); + err = "error marking superblock"; + ret = bch2_trans_mark_dev_sb(c, NULL, ca); if (ret) - goto err; + goto err_late; if (ca->mi.state == BCH_MEMBER_STATE_RW) { err = __bch2_dev_read_write(c, ca); @@ -1690,6 +1678,7 @@ err: bch_err(c, "Unable to add device: %s", err); return ret; err_late: + up_write(&c->state_lock); bch_err(c, "Error going rw after adding device: %s", err); return -EINVAL; } @@ -1724,6 +1713,11 @@ int bch2_dev_online(struct bch_fs *c, const char *path) goto err; } + if (bch2_trans_mark_dev_sb(c, NULL, ca)) { + err = "bch2_trans_mark_dev_sb() error"; + goto err; + } + ca = bch_dev_locked(c, dev_idx); if (ca->mi.state == BCH_MEMBER_STATE_RW) { err = __bch2_dev_read_write(c, ca); diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index bfae0d71..4fc5777e 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -797,61 +797,42 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) nr[c->open_buckets[i].type]++; pr_buf(out, - "free_inc: %zu/%zu\n" - "free[RESERVE_MOVINGGC]: %zu/%zu\n" - "free[RESERVE_NONE]: %zu/%zu\n" - "buckets:\n" - " capacity: %llu\n" - " alloc: %llu\n" - " sb: %llu\n" - " journal: %llu\n" - " meta: %llu\n" - " user: %llu\n" - " cached: %llu\n" - " erasure coded: %llu\n" - " available: %lli\n" - "sectors:\n" - " sb: %llu\n" - " journal: %llu\n" - " meta: %llu\n" - " user: %llu\n" - " cached: %llu\n" - " erasure coded: %llu\n" - " fragmented: %llu\n" - " copygc threshold: %llu\n" - 
"freelist_wait: %s\n" - "open buckets: %u/%u (reserved %u)\n" - "open_buckets_wait: %s\n" - "open_buckets_btree: %u\n" - "open_buckets_user: %u\n" - "btree reserve cache: %u\n", - fifo_used(&ca->free_inc), ca->free_inc.size, - fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, - fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, - ca->mi.nbuckets - ca->mi.first_bucket, - stats.buckets_alloc, - stats.buckets[BCH_DATA_sb], - stats.buckets[BCH_DATA_journal], - stats.buckets[BCH_DATA_btree], - stats.buckets[BCH_DATA_user], - stats.buckets[BCH_DATA_cached], - stats.buckets_ec, - __dev_buckets_available(ca, stats), - stats.sectors[BCH_DATA_sb], - stats.sectors[BCH_DATA_journal], - stats.sectors[BCH_DATA_btree], - stats.sectors[BCH_DATA_user], - stats.sectors[BCH_DATA_cached], - stats.sectors_ec, - stats.sectors_fragmented, - c->copygc_threshold, - c->freelist_wait.list.first ? "waiting" : "empty", - c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, - BTREE_NODE_OPEN_BUCKET_RESERVE, - c->open_buckets_wait.list.first ? "waiting" : "empty", - nr[BCH_DATA_btree], - nr[BCH_DATA_user], - c->btree_reserve_cache_nr); + "\t\t buckets\t sectors fragmented\n" + "capacity%16llu\n", + ca->mi.nbuckets - ca->mi.first_bucket); + + for (i = 1; i < BCH_DATA_NR; i++) + pr_buf(out, "%-8s%16llu%16llu%16llu\n", + bch2_data_types[i], stats.d[i].buckets, + stats.d[i].sectors, stats.d[i].fragmented); + + pr_buf(out, + "ec\t%16llu\n" + "available%15llu\n" + "alloc\t%16llu\n" + "\n" + "free_inc\t\t%zu/%zu\n" + "free[RESERVE_MOVINGGC]\t%zu/%zu\n" + "free[RESERVE_NONE]\t%zu/%zu\n" + "freelist_wait\t\t%s\n" + "open buckets\t\t%u/%u (reserved %u)\n" + "open_buckets_wait\t%s\n" + "open_buckets_btree\t%u\n" + "open_buckets_user\t%u\n" + "btree reserve cache\t%u\n", + stats.buckets_ec, + __dev_buckets_available(ca, stats), + stats.buckets_alloc, + fifo_used(&ca->free_inc), ca->free_inc.size, + fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, + fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, + c->freelist_wait.list.first ? "waiting" : "empty", + c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, + BTREE_NODE_OPEN_BUCKET_RESERVE, + c->open_buckets_wait.list.first ? "waiting" : "empty", + nr[BCH_DATA_btree], + nr[BCH_DATA_user], + c->btree_reserve_cache_nr); } static const char * const bch2_rw[] = {