From 8bcd38555cb224b7da722bcdeca5c2d15f3ef284 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Wed, 25 Mar 2020 15:56:38 -0400
Subject: [PATCH] Update bcachefs sources to fd637ebda0 bcachefs: Journal
 updates to interior nodes

---
 .bcachefs_revision                  |   2 +-
 libbcachefs/bcachefs_format.h       |   3 +-
 libbcachefs/bkey.h                  |   3 +-
 libbcachefs/bkey_methods.c          |   2 +-
 libbcachefs/bkey_sort.c             |  64 ++---
 libbcachefs/btree_cache.c           |  81 ++++++-
 libbcachefs/btree_cache.h           |   3 +
 libbcachefs/btree_gc.c              | 113 ++++++---
 libbcachefs/btree_io.c              |  11 +-
 libbcachefs/btree_io.h              |   9 +-
 libbcachefs/btree_iter.c            |   9 +-
 libbcachefs/btree_types.h           |   8 +-
 libbcachefs/btree_update.h          |   1 +
 libbcachefs/btree_update_interior.c | 354 ++++++++++------------------
 libbcachefs/btree_update_interior.h |  16 +-
 libbcachefs/btree_update_leaf.c     |  45 ++--
 libbcachefs/extent_update.c         |   4 +
 libbcachefs/fsck.c                  |   3 +-
 libbcachefs/recovery.c              | 183 +++++++++-----
 libbcachefs/recovery.h              |  21 +-
 libbcachefs/super-io.c              |   2 +
 21 files changed, 491 insertions(+), 446 deletions(-)

diff --git a/.bcachefs_revision b/.bcachefs_revision
index e6246606..330c6bdd 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-3592e42edfaed6a66470fb6a456a5895243ef2f4
+fd637ebda030609b15a473f01f1ef54bbe818f27

diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 798f5c9e..a78988e3 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -1312,7 +1312,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE,	struct bch_sb, flags[3], 0, 16);
 	x(new_extent_overwrite,		9)	\
 	x(incompressible,		10)	\
 	x(btree_ptr_v2,			11)	\
-	x(extents_above_btree_updates,	12)
+	x(extents_above_btree_updates,	12)	\
+	x(btree_updates_journalled,	13)
 
 #define BCH_SB_FEATURES_ALL				\
 	((1ULL << BCH_FEATURE_new_siphash)|		\

diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index 9106bea9..cbcfbd26 100644
--- a/libbcachefs/bkey.h
+++ b/libbcachefs/bkey.h
@@ -400,9 +400,8 @@ static inline int bch2_compile_bkey_format(const struct bkey_format *format,
 static inline void bkey_reassemble(struct bkey_i *dst,
 				   struct bkey_s_c src)
 {
-	BUG_ON(bkey_packed(src.k));
 	dst->k = *src.k;
-	memcpy_u64s(&dst->v, src.v, bkey_val_u64s(src.k));
+	memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k));
 }
 
 #define bkey_s_null		((struct bkey_s)   { .k = NULL })

diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index c064cf46..0aa3d3b9 100644
--- a/libbcachefs/bkey_methods.c
+++ b/libbcachefs/bkey_methods.c
@@ -134,7 +134,7 @@ const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
 
 const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
 {
-	if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
+	if (bkey_cmp(k.k->p, b->data->min_key) < 0)
 		return "key before start of btree node";
 
 	if (bkey_cmp(k.k->p, b->data->max_key) > 0)
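bkey_reassemble() above switches from memcpy_u64s() to memcpy_u64s_small(), and the same helper shows up in the bkey_sort.c changes below. It is the word-at-a-time copy used when the count is known to be small (a key's value is at most a few u64s). A minimal sketch of what such a helper looks like — in the tree it lives in util.h; the body here is reconstructed from memory, so treat the details as an assumption:

	/* copy u64s one word at a time; for the handful of u64s in a
	 * bkey value this beats a general-purpose memcpy: */
	static inline void memcpy_u64s_small(void *dst, const void *src,
					     unsigned u64s)
	{
		u64 *d = dst;
		const u64 *s = src;

		while (u64s--)
			*d++ = *s++;
	}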
diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c
index 68965a0f..839e78d1 100644
--- a/libbcachefs/bkey_sort.c
+++ b/libbcachefs/bkey_sort.c
@@ -130,44 +130,21 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
 	return nr;
 }
 
-static void extent_sort_advance_prev(struct bkey_format *f,
-				     struct btree_nr_keys *nr,
-				     struct bkey_packed *start,
-				     struct bkey_packed **prev)
-{
-	if (*prev) {
-		bch2_bkey_pack(*prev, (void *) *prev, f);
-
-		btree_keys_account_key_add(nr, 0, *prev);
-		*prev = bkey_next(*prev);
-	} else {
-		*prev = start;
-	}
-}
-
 static void extent_sort_append(struct bch_fs *c,
 			       struct bkey_format *f,
 			       struct btree_nr_keys *nr,
-			       struct bkey_packed *start,
-			       struct bkey_packed **prev,
+			       struct bkey_packed **out,
 			       struct bkey_s k)
 {
-	if (bkey_whiteout(k.k))
-		return;
+	if (!bkey_whiteout(k.k)) {
+		if (!bch2_bkey_pack_key(*out, k.k, f))
+			memcpy_u64s_small(*out, k.k, BKEY_U64s);
 
-	/*
-	 * prev is always unpacked, for key merging - until right before we
-	 * advance it:
-	 */
+		memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k));
 
-	if (*prev &&
-	    bch2_bkey_merge(c, bkey_i_to_s((void *) *prev), k) ==
-	    BCH_MERGE_MERGE)
-		return;
-
-	extent_sort_advance_prev(f, nr, start, prev);
-
-	bkey_reassemble((void *) *prev, k.s_c);
+		btree_keys_account_key_add(nr, 0, *out);
+		*out = bkey_next(*out);
+	}
 }
 
 /* Sort + repack in a new format: */
@@ -201,7 +178,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src,
 	return nr;
 }
 
-/* Sort, repack, and merge: */
+/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */
 struct btree_nr_keys
 bch2_sort_repack_merge(struct bch_fs *c,
 		       struct bset *dst, struct btree *src,
@@ -209,7 +186,7 @@ bch2_sort_repack_merge(struct bch_fs *c,
 		       struct bkey_format *out_f,
 		       bool filter_whiteouts)
 {
-	struct bkey_packed *prev = NULL, *k_packed;
+	struct bkey_packed *out = vstruct_last(dst), *k_packed;
 	struct bkey_on_stack k;
 	struct btree_nr_keys nr;
 
@@ -234,13 +211,10 @@ bch2_sort_repack_merge(struct bch_fs *c,
 		    bch2_bkey_normalize(c, bkey_i_to_s(k.k)))
 			continue;
 
-		extent_sort_append(c, out_f, &nr, vstruct_last(dst),
-				   &prev, bkey_i_to_s(k.k));
+		extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k));
 	}
 
-	extent_sort_advance_prev(out_f, &nr, vstruct_last(dst), &prev);
-
-	dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
+	dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
 
 	bkey_on_stack_exit(&k, c);
 	return nr;
 }
@@ -337,7 +311,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
 	struct btree *b = iter->b;
 	struct bkey_format *f = &b->format;
 	struct sort_iter_set *_l = iter->data, *_r = iter->data + 1;
-	struct bkey_packed *prev = NULL;
+	struct bkey_packed *out = dst->start;
 	struct bkey l_unpacked, r_unpacked;
 	struct bkey_s l, r;
 	struct btree_nr_keys nr;
@@ -360,7 +334,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
 		l = __bkey_disassemble(b, _l->k, &l_unpacked);
 
 		if (iter->used == 1) {
-			extent_sort_append(c, f, &nr, dst->start, &prev, l);
+			extent_sort_append(c, f, &nr, &out, l);
 			extent_iter_advance(iter, 0);
 			continue;
 		}
@@ -369,7 +343,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
 
 		/* If current key and next key don't overlap, just append */
 		if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
-			extent_sort_append(c, f, &nr, dst->start, &prev, l);
+			extent_sort_append(c, f, &nr, &out, l);
 			extent_iter_advance(iter, 0);
 			continue;
 		}
@@ -414,17 +388,15 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
 
 			__sort_iter_sift(iter, 0,
 					 extent_sort_fix_overlapping_cmp);
 
-			extent_sort_append(c, f, &nr, dst->start,
-					   &prev, bkey_i_to_s(split.k));
+			extent_sort_append(c, f, &nr, &out,
+					   bkey_i_to_s(split.k));
 		} else {
 			bch2_cut_back_s(bkey_start_pos(r.k), l);
 			extent_save(b, _l->k, l.k);
 		}
 	}
 
-	extent_sort_advance_prev(f, &nr, dst->start, &prev);
-
-	dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
+	dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
 
 	bkey_on_stack_exit(&split, c);
 	return nr;
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index e9df7e82..5c3e7e16 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -588,6 +588,7 @@ err:
 static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
 				struct btree_iter *iter,
 				const struct bkey_i *k,
+				enum btree_id btree_id,
 				unsigned level,
 				enum six_lock_type lock_type,
 				bool sync)
@@ -600,7 +601,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
 	 * Parent node must be locked, else we could read in a btree node that's
 	 * been freed:
 	 */
-	if (!bch2_btree_node_relock(iter, level + 1))
+	if (iter && !bch2_btree_node_relock(iter, level + 1))
 		return ERR_PTR(-EINTR);
 
 	b = bch2_btree_node_mem_alloc(c);
@@ -608,7 +609,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
 		return b;
 
 	bkey_copy(&b->key, k);
-	if (bch2_btree_node_hash_insert(bc, b, level, iter->btree_id)) {
+	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
 		/* raced with another fill: */
 
 		/* mark as unhashed... */
@@ -628,7 +629,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
 	 *
 	 * XXX: ideally should be dropping all btree node locks here
 	 */
-	if (btree_node_read_locked(iter, level + 1))
+	if (iter && btree_node_read_locked(iter, level + 1))
 		btree_node_unlock(iter, level + 1);
 
 	bch2_btree_node_read(c, b, sync);
@@ -676,7 +677,8 @@ retry:
 		 * else we could read in a btree node from disk that's been
 		 * freed:
 		 */
-		b = bch2_btree_node_fill(c, iter, k, level, lock_type, true);
+		b = bch2_btree_node_fill(c, iter, k, iter->btree_id,
+					 level, lock_type, true);
 
 		/* We raced and found the btree node in the cache */
 		if (!b)
@@ -762,6 +764,74 @@ lock_node:
 	return b;
 }
 
+struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
+					 const struct bkey_i *k,
+					 enum btree_id btree_id,
+					 unsigned level)
+{
+	struct btree_cache *bc = &c->btree_cache;
+	struct btree *b;
+	struct bset_tree *t;
+
+	EBUG_ON(level >= BTREE_MAX_DEPTH);
+
+	b = btree_node_mem_ptr(k);
+	if (b)
+		goto lock_node;
+retry:
+	b = btree_cache_find(bc, k);
+	if (unlikely(!b)) {
+		b = bch2_btree_node_fill(c, NULL, k, btree_id,
+					 level, SIX_LOCK_read, true);
+
+		/* We raced and found the btree node in the cache */
+		if (!b)
+			goto retry;
+
+		if (IS_ERR(b))
+			return b;
+	} else {
+lock_node:
+		six_lock_read(&b->lock);
+
+		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
+			     b->btree_id != btree_id ||
+			     b->level != level)) {
+			six_unlock_read(&b->lock);
+			goto retry;
+		}
+	}
+
+	/* XXX: waiting on IO with btree locks held: */
+	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+		       TASK_UNINTERRUPTIBLE);
+
+	prefetch(b->aux_data);
+
+	for_each_bset(b, t) {
+		void *p = (u64 *) b->aux_data + t->aux_data_offset;
+
+		prefetch(p + L1_CACHE_BYTES * 0);
+		prefetch(p + L1_CACHE_BYTES * 1);
+		prefetch(p + L1_CACHE_BYTES * 2);
+	}
+
+	/* avoid atomic set bit if it's not needed: */
+	if (!btree_node_accessed(b))
+		set_btree_node_accessed(b);
+
+	if (unlikely(btree_node_read_error(b))) {
+		six_unlock_read(&b->lock);
+		return ERR_PTR(-EIO);
+	}
+
+	EBUG_ON(b->btree_id != btree_id ||
+		BTREE_NODE_LEVEL(b->data) != level ||
+		bkey_cmp(b->data->max_key, k->k.p));
+
+	return b;
+}
+
 struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
 					  struct btree_iter *iter,
 					  struct btree *b,
@@ -876,7 +946,8 @@ void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
 	if (b)
 		return;
 
-	bch2_btree_node_fill(c, iter, k, level, SIX_LOCK_read, false);
+	bch2_btree_node_fill(c, iter, k, iter->btree_id,
+			     level, SIX_LOCK_read, false);
 }
 
 void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,

diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h
index bc24d926..132cc95a 100644
--- a/libbcachefs/btree_cache.h
+++ b/libbcachefs/btree_cache.h
@@ -25,6 +25,9 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
 				  const struct bkey_i *, unsigned,
 				  enum six_lock_type);
 
+struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
+					 enum btree_id, unsigned);
+
 struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
 					  struct btree *,
 					  enum btree_node_sibling);
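bch2_btree_node_get_noiter() is the piece that lets the new GC code read btree nodes without holding a btree_iter: an unlocked cache lookup, take the node lock, then re-check that the node still matches the key (the node can be evicted and reused between lookup and lock), retrying on mismatch. The shape of that dance reduced to a hypothetical sketch — cache_find(), fill() and the lock helpers stand in for the real btree cache API:

	struct node *get_node(struct cache *bc, struct key *k)
	{
		struct node *n;
	retry:
		n = cache_find(bc, k);		/* unlocked hash lookup */
		if (!n)
			return fill(bc, k);	/* read from disk; may race */

		read_lock(&n->lock);
		if (n->hash_val != hash(k)) {	/* reused while we slept */
			read_unlock(&n->lock);
			goto retry;
		}
		return n;			/* locked and validated */
	}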
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index c5a0c0ed..7c89a6dd 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -184,16 +184,8 @@ fsck_err:
 	return ret;
 }
 
-static bool pos_in_journal_keys(struct journal_keys *journal_keys,
-				enum btree_id id, struct bpos pos)
-{
-	struct journal_key *k = journal_key_search(journal_keys, id, pos);
-
-	return k && k->btree_id == id && !bkey_cmp(k->k->k.p, pos);
-}
-
 static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
-			      struct journal_keys *journal_keys, bool initial)
+			      bool initial)
 {
 	struct btree_node_iter iter;
 	struct bkey unpacked;
@@ -207,10 +199,6 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
 
 	for_each_btree_node_key_unpack(b, k, &iter,
 				       &unpacked) {
-		if (!b->level && journal_keys &&
-		    pos_in_journal_keys(journal_keys, b->btree_id, k.k->p))
-			continue;
-
 		bch2_bkey_debugcheck(c, b, k);
 
 		ret = bch2_gc_mark_key(c, k, max_stale, initial);
@@ -222,7 +210,6 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
 }
 
 static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
-			 struct journal_keys *journal_keys,
 			 bool initial, bool metadata_only)
 {
 	struct btree_trans trans;
@@ -250,8 +237,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
 
 		gc_pos_set(c, gc_pos_btree_node(b));
 
-		ret = btree_gc_mark_node(c, b, &max_stale,
-					 journal_keys, initial);
+		ret = btree_gc_mark_node(c, b, &max_stale, initial);
 		if (ret)
 			break;
 
@@ -287,6 +273,78 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
 	return ret;
 }
 
+static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
+				      struct journal_keys *journal_keys,
+				      unsigned target_depth)
+{
+	struct btree_and_journal_iter iter;
+	struct bkey_s_c k;
+	u8 max_stale = 0;
+	int ret = 0;
+
+	bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b);
+
+	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+		bch2_bkey_debugcheck(c, b, k);
+
+		ret = bch2_gc_mark_key(c, k, &max_stale, true);
+		if (ret)
+			break;
+
+		if (b->level > target_depth) {
+			struct btree *child;
+			BKEY_PADDED(k) tmp;
+
+			bkey_reassemble(&tmp.k, k);
+
+			child = bch2_btree_node_get_noiter(c, &tmp.k,
+						b->btree_id, b->level - 1);
+			ret = PTR_ERR_OR_ZERO(child);
+			if (ret)
+				break;
+
+			bch2_gc_btree_init_recurse(c, child,
+					journal_keys, target_depth);
+			six_unlock_read(&child->lock);
+		}
+
+		bch2_btree_and_journal_iter_advance(&iter);
+	}
+
+	return ret;
+}
+
+static int bch2_gc_btree_init(struct bch_fs *c,
+			      struct journal_keys *journal_keys,
+			      enum btree_id btree_id,
+			      bool metadata_only)
+{
+	struct btree *b;
+	unsigned target_depth = metadata_only		? 1
+		: expensive_debug_checks(c)		? 0
+		: !btree_node_type_needs_gc(btree_id)	? 1
+		: 0;
+	u8 max_stale = 0;
+	int ret = 0;
+
+	b = c->btree_roots[btree_id].b;
+
+	if (btree_node_fake(b))
+		return 0;
+
+	six_lock_read(&b->lock);
+	if (b->level >= target_depth)
+		ret = bch2_gc_btree_init_recurse(c, b,
+					journal_keys, target_depth);
+
+	if (!ret)
+		ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
+				       &max_stale, true);
+	six_unlock_read(&b->lock);
+
+	return ret;
+}
+
 static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
 {
 	return  (int) btree_id_to_gc_phase(l) -
@@ -305,27 +363,12 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
 
 	for (i = 0; i < BTREE_ID_NR; i++) {
 		enum btree_id id = ids[i];
-		enum btree_node_type type = __btree_node_type(0, id);
-
-		int ret = bch2_gc_btree(c, id, journal_keys,
-					initial, metadata_only);
+		int ret = initial
+			? bch2_gc_btree_init(c, journal_keys,
+					     id, metadata_only)
+			: bch2_gc_btree(c, id, initial, metadata_only);
 		if (ret)
 			return ret;
-
-		if (journal_keys && !metadata_only &&
-		    btree_node_type_needs_gc(type)) {
-			struct journal_key *j;
-			u8 max_stale;
-			int ret;
-
-			for_each_journal_key(*journal_keys, j)
-				if (j->btree_id == id) {
-					ret = bch2_gc_mark_key(c, bkey_i_to_s_c(j->k),
-							       &max_stale, initial);
-					if (ret)
-						return ret;
-				}
-		}
 	}
 
 	return 0;
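The chained conditional expressions computing target_depth in bch2_gc_btree_init() are compact but easy to misread; the same logic unrolled (a restatement of the hunk above, not new behaviour):

	unsigned target_depth;

	if (metadata_only)
		target_depth = 1;	/* interior nodes only */
	else if (expensive_debug_checks(c))
		target_depth = 0;	/* debug: walk every level */
	else if (!btree_node_type_needs_gc(btree_id))
		target_depth = 1;	/* leaf keys carry nothing to mark */
	else
		target_depth = 0;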
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 3f7c1042..b48d48b8 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1261,7 +1261,6 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
 		closure_put(&((struct btree_update *) new)->cl);
 
 	bch2_journal_pin_drop(&c->journal, &w->journal);
-	closure_wake_up(&w->wait);
 }
 
 static void btree_node_write_done(struct bch_fs *c, struct btree *b)
@@ -1618,9 +1617,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 	wbio->wbio.bio.bi_end_io	= btree_node_write_endio;
 	wbio->wbio.bio.bi_private	= b;
 
-	if (b->level || !b->written)
-		wbio->wbio.bio.bi_opf |= REQ_FUA;
-
 	bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
 
 	/*
@@ -1794,12 +1790,11 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
 	rcu_read_lock();
 	for_each_cached_btree(b, c, tbl, i, pos) {
 		unsigned long flags = READ_ONCE(b->flags);
-		unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0;
 
 		if (!(flags & (1 << BTREE_NODE_dirty)))
 			continue;
 
-		pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu c %u p %u\n",
+		pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu\n",
 		       b,
 		       (flags & (1 << BTREE_NODE_dirty)) != 0,
 		       (flags & (1 << BTREE_NODE_need_write)) != 0,
@@ -1807,9 +1802,7 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
 		       b->written,
 		       !list_empty_careful(&b->write_blocked),
 		       b->will_make_reachable != 0,
-		       b->will_make_reachable & 1,
-		       b->writes[ idx].wait.list.first != NULL,
-		       b->writes[!idx].wait.list.first != NULL);
+		       b->will_make_reachable & 1);
 	}
 	rcu_read_unlock();
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index e90e89ee..fd719dda 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -102,19 +102,20 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
 void bch2_btree_node_write(struct bch_fs *, struct btree *,
 			   enum six_lock_type);
 
-static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b)
+static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
+					    enum six_lock_type lock_held)
 {
 	while (b->written &&
 	       btree_node_need_write(b) &&
 	       btree_node_may_write(b)) {
 		if (!btree_node_write_in_flight(b)) {
-			bch2_btree_node_write(c, b, SIX_LOCK_read);
+			bch2_btree_node_write(c, b, lock_held);
 			break;
 		}
 
 		six_unlock_read(&b->lock);
 		btree_node_wait_on_io(b);
-		btree_node_lock_type(c, b, SIX_LOCK_read);
+		btree_node_lock_type(c, b, lock_held);
 	}
 }
 
@@ -131,7 +132,7 @@ do {							\
 		new |= (1 << BTREE_NODE_need_write);	\
 	} while ((v = cmpxchg(&(_b)->flags, old, new)) != old);	\
 							\
-	btree_node_write_if_need(_c, _b);		\
+	btree_node_write_if_need(_c, _b, SIX_LOCK_read);\
 } while (0)
 
 void bch2_btree_flush_all_reads(struct bch_fs *);

diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 2819b9a4..6ed688cd 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -1068,7 +1068,14 @@ retry_all:
 		goto retry_all;
 	}
 
-	ret = hweight64(trans->iters_live) > 1 ? -EINTR : 0;
+	if (hweight64(trans->iters_live) > 1)
+		ret = -EINTR;
+	else
+		trans_for_each_iter(trans, iter)
+			if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) {
+				ret = -EINTR;
+				break;
+			}
 out:
 	bch2_btree_cache_cannibalize_unlock(c);
 	return ret;

diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 51d579a4..31a5c215 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -53,7 +53,6 @@ struct bset_tree {
 
 struct btree_write {
 	struct journal_entry_pin	journal;
-	struct closure_waitlist		wait;
 };
 
 struct btree_alloc {
@@ -261,6 +260,11 @@ static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter)
 	return iter->flags & BTREE_ITER_TYPE;
 }
 
+static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
+{
+	return iter->l + iter->level;
+}
+
 struct btree_insert_entry {
 	unsigned		trigger_flags;
 	unsigned		trans_triggers_run:1;
@@ -539,8 +543,6 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type)
 struct btree_root {
 	struct btree		*b;
 
-	struct btree_update	*as;
-
 	/* On disk root - see async splits: */
 	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
 	u8			level;

diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index 9f58d47e..11f7d02d 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -12,6 +12,7 @@ void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *,
 				     struct btree_iter *);
 bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
 				struct btree_node_iter *, struct bkey_i *);
+void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
 
 enum btree_insert_flags {
 	__BTREE_INSERT_NOUNLOCK,
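The interior update code below sizes its journal reservation by summing jset_u64s(k->k.u64s) over the keys it will log. jset_u64s() converts a key's size in u64s into the space the whole journal entry occupies, i.e. payload plus the struct jset_entry header; reproduced here from memory, so verify against the tree:

	/* space a journal entry takes, in u64s: the key payload plus
	 * the jset_entry header that precedes it */
	static inline unsigned jset_u64s(unsigned u64s)
	{
		return u64s + sizeof(struct jset_entry) / sizeof(u64);
	}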
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 02f19146..bc7749c8 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -24,7 +24,6 @@ static void btree_node_will_make_reachable(struct btree_update *,
 					   struct btree *);
 static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
-static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int);
 
 /* Debug code: */
 
@@ -260,16 +259,17 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
 }
 
 static void bch2_btree_node_free_ondisk(struct bch_fs *c,
-					struct pending_btree_node_free *pending)
+					struct pending_btree_node_free *pending,
+					u64 journal_seq)
 {
 	BUG_ON(!pending->index_update_done);
 
 	bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
-		      0, 0, NULL, 0, BTREE_TRIGGER_OVERWRITE);
+		      0, 0, NULL, journal_seq, BTREE_TRIGGER_OVERWRITE);
 
 	if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE)))
 		bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
-			      0, 0, NULL, 0,
+			      0, 0, NULL, journal_seq,
 			      BTREE_TRIGGER_OVERWRITE|
 			      BTREE_TRIGGER_GC);
 }
@@ -585,10 +585,13 @@ static void bch2_btree_update_free(struct btree_update *as)
 {
 	struct bch_fs *c = as->c;
 
+	bch2_journal_preres_put(&c->journal, &as->journal_preres);
+
+	bch2_journal_pin_drop(&c->journal, &as->journal);
 	bch2_journal_pin_flush(&c->journal, &as->journal);
 
-	BUG_ON(as->nr_new_nodes);
-	BUG_ON(as->nr_pending);
+	BUG_ON((as->nr_new_nodes || as->nr_pending) &&
+	       !bch2_journal_error(&c->journal));
 
 	if (as->reserve)
 		bch2_btree_reserve_put(c, as->reserve);
@@ -603,13 +606,10 @@ static void bch2_btree_update_free(struct btree_update *as)
 	mutex_unlock(&c->btree_interior_update_lock);
 }
 
-static void btree_update_nodes_reachable(struct closure *cl)
+static void btree_update_nodes_reachable(struct btree_update *as, u64 seq)
 {
-	struct btree_update *as = container_of(cl, struct btree_update, cl);
 	struct bch_fs *c = as->c;
 
-	bch2_journal_pin_drop(&c->journal, &as->journal);
-
 	mutex_lock(&c->btree_interior_update_lock);
 
 	while (as->nr_new_nodes) {
@@ -630,39 +630,22 @@ static void btree_update_nodes_reachable(struct closure *cl)
 	}
 
 	while (as->nr_pending)
-		bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]);
+		bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending],
+					    seq);
 
 	mutex_unlock(&c->btree_interior_update_lock);
-
-	closure_wake_up(&as->wait);
-
-	bch2_btree_update_free(as);
-}
-
-static void btree_update_wait_on_journal(struct closure *cl)
-{
-	struct btree_update *as = container_of(cl, struct btree_update, cl);
-	struct bch_fs *c = as->c;
-	int ret;
-
-	ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
-	if (ret == -EAGAIN) {
-		continue_at(cl, btree_update_wait_on_journal, system_wq);
-		return;
-	}
-	if (ret < 0)
-		goto err;
-
-	bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
-err:
-	continue_at(cl, btree_update_nodes_reachable, system_wq);
 }
 
 static void btree_update_nodes_written(struct closure *cl)
 {
 	struct btree_update *as = container_of(cl, struct btree_update, cl);
+	struct journal_res res = { 0 };
 	struct bch_fs *c = as->c;
 	struct btree *b;
+	struct bset *i;
+	struct bkey_i *k;
+	unsigned journal_u64s = 0;
+	int ret;
 
 	/*
 	 * We did an update to a parent node where the pointers we added pointed
@@ -671,7 +654,7 @@ static void btree_update_nodes_written(struct closure *cl)
 	 */
 	mutex_lock(&c->btree_interior_update_lock);
 	as->nodes_written = true;
-retry:
+again:
 	as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
 				      struct btree_update, unwritten_list);
 	if (!as || !as->nodes_written) {
@@ -679,31 +662,53 @@ retry:
 		return;
 	}
 
+	b = as->b;
+	if (b && !six_trylock_intent(&b->lock)) {
+		mutex_unlock(&c->btree_interior_update_lock);
+		btree_node_lock_type(c, b, SIX_LOCK_intent);
+		six_unlock_intent(&b->lock);
+		goto out;
+	}
+
+	journal_u64s = 0;
+
+	if (as->mode != BTREE_INTERIOR_UPDATING_ROOT)
+		for_each_keylist_key(&as->parent_keys, k)
+			journal_u64s += jset_u64s(k->k.u64s);
+
+	ret = bch2_journal_res_get(&c->journal, &res, journal_u64s,
+				   JOURNAL_RES_GET_RESERVED);
+	if (ret) {
+		BUG_ON(!bch2_journal_error(&c->journal));
+		/* can't unblock btree writes */
+		goto free_update;
+	}
+
+	if (as->mode != BTREE_INTERIOR_UPDATING_ROOT)
+		for_each_keylist_key(&as->parent_keys, k)
+			bch2_journal_add_entry(&c->journal, &res,
+					       BCH_JSET_ENTRY_btree_keys,
+					       as->btree_id,
+					       as->level,
+					       k, k->k.u64s);
+
 	switch (as->mode) {
 	case BTREE_INTERIOR_NO_UPDATE:
 		BUG();
 	case BTREE_INTERIOR_UPDATING_NODE:
-		/* The usual case: */
-		b = READ_ONCE(as->b);
-
-		if (!six_trylock_read(&b->lock)) {
-			mutex_unlock(&c->btree_interior_update_lock);
-			btree_node_lock_type(c, b, SIX_LOCK_read);
-			six_unlock_read(&b->lock);
-			mutex_lock(&c->btree_interior_update_lock);
-			goto retry;
-		}
-
-		BUG_ON(!btree_node_dirty(b));
-		closure_wait(&btree_current_write(b)->wait, &as->cl);
+		/* @b is the node we did the final insert into: */
+		BUG_ON(!res.ref);
+		six_lock_write(&b->lock);
 
 		list_del(&as->write_blocked_list);
 
-		/*
-		 * for flush_held_btree_writes() waiting on updates to flush or
-		 * nodes to be writeable:
-		 */
-		closure_wake_up(&c->btree_interior_update_wait);
+		i = btree_bset_last(b);
+		i->journal_seq = cpu_to_le64(
+			max(res.seq,
+			    le64_to_cpu(i->journal_seq)));
+
+		bch2_btree_add_journal_pin(c, b, res.seq);
+		six_unlock_write(&b->lock);
 
 		list_del(&as->unwritten_list);
 		mutex_unlock(&c->btree_interior_update_lock);
@@ -712,82 +717,51 @@ retry:
 		 * b->write_blocked prevented it from being written, so
 		 * write it now if it needs to be written:
 		 */
-		bch2_btree_node_write_cond(c, b, true);
-		six_unlock_read(&b->lock);
-
-		continue_at(&as->cl, btree_update_nodes_reachable, system_wq);
+		btree_node_write_if_need(c, b, SIX_LOCK_intent);
+		six_unlock_intent(&b->lock);
 		break;
 
 	case BTREE_INTERIOR_UPDATING_AS:
-		/*
-		 * The btree node we originally updated has been freed and is
-		 * being rewritten - so we need to write anything here, we just
-		 * need to signal to that btree_update that it's ok to make the
-		 * new replacement node visible:
-		 */
-		closure_put(&as->parent_as->cl);
-
-		/*
-		 * and then we have to wait on that btree_update to finish:
-		 */
-		closure_wait(&as->parent_as->wait, &as->cl);
+		BUG_ON(b);
 
 		list_del(&as->unwritten_list);
 		mutex_unlock(&c->btree_interior_update_lock);
-
-		continue_at(&as->cl, btree_update_nodes_reachable, system_wq);
 		break;
 
-	case BTREE_INTERIOR_UPDATING_ROOT:
-		/* b is the new btree root: */
-		b = READ_ONCE(as->b);
+	case BTREE_INTERIOR_UPDATING_ROOT: {
+		struct btree_root *r = &c->btree_roots[as->btree_id];
 
-		if (!six_trylock_read(&b->lock)) {
-			mutex_unlock(&c->btree_interior_update_lock);
-			btree_node_lock_type(c, b, SIX_LOCK_read);
-			six_unlock_read(&b->lock);
-			mutex_lock(&c->btree_interior_update_lock);
-			goto retry;
-		}
+		BUG_ON(b);
 
-		BUG_ON(c->btree_roots[b->btree_id].as != as);
-		c->btree_roots[b->btree_id].as = NULL;
-
-		bch2_btree_set_root_ondisk(c, b, WRITE);
-
-		/*
-		 * We don't have to wait anything anything here (before
-		 * btree_update_nodes_reachable frees the old nodes
-		 * ondisk) - we've ensured that the very next journal write will
-		 * have the pointer to the new root, and before the allocator
-		 * can reuse the old nodes it'll have to do a journal commit:
-		 */
-		six_unlock_read(&b->lock);
+		mutex_lock(&c->btree_root_lock);
+		bkey_copy(&r->key, as->parent_keys.keys);
+		r->level = as->level;
+		r->alive = true;
+		c->btree_roots_dirty = true;
+		mutex_unlock(&c->btree_root_lock);
 
 		list_del(&as->unwritten_list);
 		mutex_unlock(&c->btree_interior_update_lock);
-
-		/*
-		 * Bit of funny circularity going on here we have to break:
-		 *
-		 * We have to drop our journal pin before writing the journal
-		 * entry that points to the new btree root: else, we could
-		 * deadlock if the journal currently happens to be full.
-		 *
-		 * This mean we're dropping the journal pin _before_ the new
-		 * nodes are technically reachable - but this is safe, because
-		 * after the bch2_btree_set_root_ondisk() call above they will
-		 * be reachable as of the very next journal write:
-		 */
-		bch2_journal_pin_drop(&c->journal, &as->journal);
-
-		as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal);
-
-		btree_update_wait_on_journal(&as->cl);
 		break;
 	}
+	}
 
+	bch2_journal_pin_drop(&c->journal, &as->journal);
+
+	bch2_journal_res_put(&c->journal, &res);
+	bch2_journal_preres_put(&c->journal, &as->journal_preres);
+
+	btree_update_nodes_reachable(as, res.seq);
+free_update:
+	bch2_btree_update_free(as);
+
+	/*
+	 * for flush_held_btree_writes() waiting on updates to flush or
+	 * nodes to be writeable:
+	 */
+	closure_wake_up(&c->btree_interior_update_wait);
+out:
 	mutex_lock(&c->btree_interior_update_lock);
-	goto retry;
+	goto again;
 }
 
 /*
@@ -804,48 +778,12 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
 	BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
 	BUG_ON(!btree_node_dirty(b));
 
-	as->mode = BTREE_INTERIOR_UPDATING_NODE;
-	as->b = b;
+	as->mode	= BTREE_INTERIOR_UPDATING_NODE;
+	as->b		= b;
+	as->level	= b->level;
 	list_add(&as->write_blocked_list, &b->write_blocked);
 
 	mutex_unlock(&c->btree_interior_update_lock);
-
-	/*
-	 * In general, when you're staging things in a journal that will later
-	 * be written elsewhere, and you also want to guarantee ordering: that
-	 * is, if you have updates a, b, c, after a crash you should never see c
-	 * and not a or b - there's a problem:
-	 *
-	 * If the final destination of the update(s) (i.e. btree node) can be
-	 * written/flushed _before_ the relevant journal entry - oops, that
-	 * breaks ordering, since the various leaf nodes can be written in any
-	 * order.
-	 *
-	 * Normally we use bset->journal_seq to deal with this - if during
-	 * recovery we find a btree node write that's newer than the newest
-	 * journal entry, we just ignore it - we don't need it, anything we're
-	 * supposed to have (that we reported as completed via fsync()) will
-	 * still be in the journal, and as far as the state of the journal is
-	 * concerned that btree node write never happened.
-	 *
-	 * That breaks when we're rewriting/splitting/merging nodes, since we're
-	 * mixing btree node writes that haven't happened yet with previously
-	 * written data that has been reported as completed to the journal.
-	 *
-	 * Thus, before making the new nodes reachable, we have to wait the
-	 * newest journal sequence number we have data for to be written (if it
-	 * hasn't been yet).
-	 */
-	bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
-}
-
-static void interior_update_flush(struct journal *j,
-				  struct journal_entry_pin *pin, u64 seq)
-{
-	struct btree_update *as =
-		container_of(pin, struct btree_update, journal);
-
-	bch2_journal_flush_seq_async(j, as->journal_seq, NULL);
 }
 
 static void btree_update_reparent(struct btree_update *as,
@@ -853,10 +791,10 @@ static void btree_update_reparent(struct btree_update *as,
 				  struct btree_update *child)
 {
 	struct bch_fs *c = as->c;
 
+	lockdep_assert_held(&c->btree_interior_update_lock);
+
 	child->b = NULL;
 	child->mode = BTREE_INTERIOR_UPDATING_AS;
-	child->parent_as = as;
-	closure_get(&as->cl);
 
 	/*
 	 * When we write a new btree root, we have to drop our journal pin
@@ -867,46 +805,24 @@ static void btree_update_reparent(struct btree_update *as,
 	 * just transfer the journal pin to the new interior update so
 	 * btree_update_nodes_written() can drop it.
 	 */
-	bch2_journal_pin_copy(&c->journal, &as->journal,
-			      &child->journal, interior_update_flush);
+	bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
 	bch2_journal_pin_drop(&c->journal, &child->journal);
-
-	as->journal_seq = max(as->journal_seq, child->journal_seq);
 }
 
-static void btree_update_updated_root(struct btree_update *as)
+static void btree_update_updated_root(struct btree_update *as, struct btree *b)
 {
 	struct bch_fs *c = as->c;
-	struct btree_root *r = &c->btree_roots[as->btree_id];
+
+	BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+	BUG_ON(!bch2_keylist_empty(&as->parent_keys));
 
 	mutex_lock(&c->btree_interior_update_lock);
 	list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
 
-	BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
-
-	/*
-	 * Old root might not be persistent yet - if so, redirect its
-	 * btree_update operation to point to us:
-	 */
-	if (r->as)
-		btree_update_reparent(as, r->as);
-
-	as->mode = BTREE_INTERIOR_UPDATING_ROOT;
-	as->b = r->b;
-	r->as = as;
-
+	as->mode	= BTREE_INTERIOR_UPDATING_ROOT;
+	as->level	= b->level;
+	bch2_keylist_add(&as->parent_keys, &b->key);
 	mutex_unlock(&c->btree_interior_update_lock);
-
-	/*
-	 * When we're rewriting nodes and updating interior nodes, there's an
-	 * issue with updates that haven't been written in the journal getting
-	 * mixed together with older data - see btree_update_updated_node()
-	 * for the explanation.
-	 *
-	 * However, this doesn't affect us when we're writing a new btree root -
-	 * because to make that new root reachable we have to write out a new
-	 * journal entry, which must necessarily be newer than as->journal_seq.
-	 */
 }
 
 static void btree_node_will_make_reachable(struct btree_update *as,
@@ -983,10 +899,8 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
 					       struct btree *b)
 {
 	struct bch_fs *c = as->c;
-	struct closure *cl, *cl_n;
 	struct btree_update *p, *n;
 	struct btree_write *w;
-	struct bset_tree *t;
 
 	set_btree_node_dying(b);
 
@@ -995,18 +909,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
 
 	btree_interior_update_add_node_reference(as, b);
 
-	/*
-	 * Does this node have data that hasn't been written in the journal?
-	 *
-	 * If so, we have to wait for the corresponding journal entry to be
-	 * written before making the new nodes reachable - we can't just carry
-	 * over the bset->journal_seq tracking, since we'll be mixing those keys
-	 * in with keys that aren't in the journal anymore:
-	 */
-	for_each_bset(b, t)
-		as->journal_seq = max(as->journal_seq,
-				      le64_to_cpu(bset(b, t)->journal_seq));
-
 	mutex_lock(&c->btree_interior_update_lock);
 
 	/*
@@ -1030,16 +932,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
 	clear_btree_node_dirty(b);
 	clear_btree_node_need_write(b);
 
-	w = btree_current_write(b);
-
-	/*
-	 * Does this node have any btree_update operations waiting on this node
-	 * to be written?
-	 *
-	 * If so, wake them up when this btree_update operation is reachable:
-	 */
-	llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list)
-		llist_add(&cl->list, &as->wait.list);
 
 	/*
 	 * Does this node have unwritten data that has a pin on the journal?
@@ -1049,13 +941,12 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
 	 * oldest pin of any of the nodes we're freeing. We'll release the pin
 	 * when the new nodes are persistent and reachable on disk:
 	 */
-	bch2_journal_pin_copy(&c->journal, &as->journal,
-			      &w->journal, interior_update_flush);
+	w = btree_current_write(b);
+	bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
 	bch2_journal_pin_drop(&c->journal, &w->journal);
 
 	w = btree_prev_write(b);
-	bch2_journal_pin_copy(&c->journal, &as->journal,
-			      &w->journal, interior_update_flush);
+	bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
 	bch2_journal_pin_drop(&c->journal, &w->journal);
 
 	mutex_unlock(&c->btree_interior_update_lock);
@@ -1078,6 +969,7 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
 {
 	struct btree_reserve *reserve;
 	struct btree_update *as;
+	int ret;
 
 	reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl);
 	if (IS_ERR(reserve))
@@ -1094,6 +986,15 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
 
 	bch2_keylist_init(&as->parent_keys, as->inline_keys);
 
+	ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
+				      jset_u64s(BKEY_BTREE_PTR_U64s_MAX) * 3, 0);
+	if (ret) {
+		bch2_btree_reserve_put(c, reserve);
+		closure_debug_destroy(&as->cl);
+		mempool_free(as, &c->btree_interior_update_pool);
+		return ERR_PTR(ret);
+	}
+
 	mutex_lock(&c->btree_interior_update_lock);
 	list_add_tail(&as->list, &c->btree_interior_update_list);
 	mutex_unlock(&c->btree_interior_update_lock);
@@ -1153,22 +1054,6 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
 	mutex_unlock(&c->btree_interior_update_lock);
 }
 
-static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
-{
-	struct btree_root *r = &c->btree_roots[b->btree_id];
-
-	mutex_lock(&c->btree_root_lock);
-
-	BUG_ON(b != r->b);
-	bkey_copy(&r->key, &b->key);
-	r->level = b->level;
-	r->alive = true;
-	if (rw == WRITE)
-		c->btree_roots_dirty = true;
-
-	mutex_unlock(&c->btree_root_lock);
-}
-
 /**
  * bch_btree_set_root - update the root in memory and on disk
  *
@@ -1201,7 +1086,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
 
 	bch2_btree_set_root_inmem(as, b);
 
-	btree_update_updated_root(as);
+	btree_update_updated_root(as, b);
 
 	/*
 	 * Unlock old root after new root is visible:
@@ -1471,7 +1356,8 @@ static void btree_split(struct btree_update *as, struct btree *b,
 		bch2_btree_build_aux_trees(n1);
 		six_unlock_write(&n1->lock);
 
-		bch2_keylist_add(&as->parent_keys, &n1->key);
+		if (parent)
+			bch2_keylist_add(&as->parent_keys, &n1->key);
 	}
 
 	bch2_btree_node_write(c, n1, SIX_LOCK_intent);
@@ -1545,12 +1431,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
 	       (bkey_cmp_packed(b, k, &insert->k) >= 0))
 		;
 
-	while (!bch2_keylist_empty(keys)) {
-		insert = bch2_keylist_front(keys);
-
+	for_each_keylist_key(keys, insert)
 		bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter);
-		bch2_keylist_pop_front(keys);
-	}
 
 	btree_update_updated_node(as, b);
 
@@ -1630,7 +1512,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
 			  unsigned flags)
 {
 	struct btree_trans *trans = iter->trans;
-	struct btree *b = iter->l[0].b;
+	struct btree *b = iter_l(iter)->b;
 	struct btree_update *as;
 	struct closure cl;
 	int ret = 0;
@@ -2107,7 +1989,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
 		bkey_copy(&b->key, new_key);
 	}
 
-	btree_update_updated_root(as);
+	btree_update_updated_root(as, b);
 	bch2_btree_node_unlock_write(b, iter);
 }
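Note the ordering that makes the error handling above sound: bch2_btree_update_start() takes a journal pre-reservation (sized for three worst-case btree pointer keys) before the update commits to anything, and btree_update_nodes_written() later converts it into a real reservation with JOURNAL_RES_GET_RESERVED — which can then only fail if the journal is already in an error state. A hypothetical sketch of that reserve-early/commit-late shape (names invented for illustration):

	int start_update(struct journal *j, struct update *u)
	{
		/* worst case: three full btree pointer keys */
		return preres_get(j, &u->preres, 3 * jset_u64s(MAX_PTR_U64S));
	}

	void finish_update(struct journal *j, struct update *u, unsigned u64s)
	{
		struct res res;

		/* can only fail if the journal is dead: the space was set
		 * aside by preres_get() before we modified anything */
		if (res_get(j, &res, u64s, RES_GET_RESERVED))
			return;		/* journal error; writes stay blocked */

		add_entries(j, &res, u->parent_keys);
		res_put(j, &res);
		preres_put(j, &u->preres);
	}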
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index c90fcd48..0ac95dd8 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -69,8 +69,10 @@ struct btree_update {
 	unsigned			nodes_written:1;
 
 	enum btree_id			btree_id;
+	u8				level;
 
 	struct btree_reserve		*reserve;
+	struct journal_preres		journal_preres;
 
 	/*
 	 * BTREE_INTERIOR_UPDATING_NODE:
@@ -83,18 +85,6 @@ struct btree_update {
 	struct btree			*b;
 	struct list_head		write_blocked_list;
 
-	/*
-	 * BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now
-	 * we're now blocking another btree_update
-	 * @parent_as - btree_update that's waiting on our nodes to finish
-	 * writing, before it can make new nodes visible on disk
-	 * @wait - list of child btree_updates that are waiting on this
-	 * btree_update to make all the new nodes visible before they can free
-	 * their old btree nodes
-	 */
-	struct btree_update		*parent_as;
-	struct closure_waitlist		wait;
-
 	/*
 	 * We may be freeing nodes that were dirty, and thus had journal entries
 	 * pinned: we need to transfer the oldest of those pins to the
@@ -103,8 +93,6 @@ struct btree_update {
 	 */
 	struct journal_entry_pin	journal;
 
-	u64				journal_seq;
-
 	/*
 	 * Nodes being freed:
 	 * Protected by c->btree_node_pending_free_lock
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 7c2f72a3..f94bc6a0 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -24,7 +24,7 @@ static inline bool same_leaf_as_prev(struct btree_trans *trans,
 				     struct btree_insert_entry *i)
 {
 	return i != trans->updates2 &&
-		i[0].iter->l[0].b == i[-1].iter->l[0].b;
+		iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b;
 }
 
 inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
@@ -145,6 +145,17 @@ static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin,
 	return __btree_node_flush(j, pin, 1, seq);
 }
 
+inline void bch2_btree_add_journal_pin(struct bch_fs *c,
+				       struct btree *b, u64 seq)
+{
+	struct btree_write *w = btree_current_write(b);
+
+	bch2_journal_pin_add(&c->journal, seq, &w->journal,
+			     btree_node_write_idx(b) == 0
+			     ? btree_node_flush0
+			     : btree_node_flush1);
+}
+
 static inline void __btree_journal_key(struct btree_trans *trans,
 				       enum btree_id btree_id,
 				       struct bkey_i *insert)
@@ -172,13 +183,8 @@ static void bch2_btree_journal_key(struct btree_trans *trans,
 {
 	struct bch_fs *c = trans->c;
 	struct journal *j = &c->journal;
-	struct btree *b = iter->l[0].b;
-	struct btree_write *w = btree_current_write(b);
-	u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
-		? trans->journal_res.seq
-		: j->replay_journal_seq;
+	struct btree *b = iter_l(iter)->b;
 
-	EBUG_ON(iter->level || b->level);
 	EBUG_ON(trans->journal_res.ref !=
 		!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
 
@@ -188,10 +194,10 @@ static void bch2_btree_journal_key(struct btree_trans *trans,
 			cpu_to_le64(trans->journal_res.seq);
 	}
 
-	bch2_journal_pin_add(j, seq, &w->journal,
-			     btree_node_write_idx(b) == 0
-			     ? btree_node_flush0
-			     : btree_node_flush1);
+	bch2_btree_add_journal_pin(c, b,
+		likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+			? trans->journal_res.seq
+			: j->replay_journal_seq);
 
 	if (unlikely(!btree_node_dirty(b)))
 		set_btree_node_dirty(b);
@@ -205,17 +211,15 @@ static void btree_insert_key_leaf(struct btree_trans *trans,
 				  struct bkey_i *insert)
 {
 	struct bch_fs *c = trans->c;
-	struct btree *b = iter->l[0].b;
+	struct btree *b = iter_l(iter)->b;
 	struct bset_tree *t = bset_tree_last(b);
 	int old_u64s = bset_u64s(t);
 	int old_live_u64s = b->nr.live_u64s;
 	int live_u64s_added, u64s_added;
 
-	EBUG_ON(iter->level);
-
 	insert->k.needs_whiteout = false;
 
-	if (likely(bch2_btree_bset_insert_key(iter, b, &iter->l[0].iter, insert)))
+	if (likely(bch2_btree_bset_insert_key(iter, b, &iter_l(iter)->iter, insert)))
 		bch2_btree_journal_key(trans, iter, insert);
 
 	live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
@@ -241,7 +245,6 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
 {
 	struct bch_fs *c = trans->c;
 
-	BUG_ON(iter->level);
 	BUG_ON(bkey_cmp(insert->k.p, iter->pos));
 	BUG_ON(debug_check_bkeys(c) &&
 	       bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id));
@@ -290,7 +293,7 @@ btree_key_can_insert(struct btree_trans *trans,
 		     unsigned *u64s)
 {
 	struct bch_fs *c = trans->c;
-	struct btree *b = iter->l[0].b;
+	struct btree *b = iter_l(iter)->b;
 	static enum btree_insert_ret ret;
 
 	if (unlikely(btree_node_fake(b)))
@@ -345,7 +348,7 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
 	struct btree_insert_entry *i;
 
 	trans_for_each_update(trans, i)
-		if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
+		if (gc_visited(c, gc_pos_btree_node(iter_l(i->iter)->b)))
 			bch2_mark_update(trans, i->iter, i->k, NULL,
 					 i->trigger_flags|BTREE_TRIGGER_GC);
 }
@@ -461,7 +464,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
 	int ret;
 
 	trans_for_each_update2(trans, i)
-		BUG_ON(!btree_node_intent_locked(i->iter, 0));
+		BUG_ON(!btree_node_intent_locked(i->iter, i->iter->level));
 
 	ret = bch2_journal_preres_get(&trans->c->journal,
 			&trans->journal_preres, trans->journal_preres_u64s,
@@ -495,13 +498,13 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
 	trans_for_each_update2(trans, i)
 		if (!same_leaf_as_prev(trans, i))
 			bch2_btree_node_lock_for_insert(trans->c,
-					i->iter->l[0].b, i->iter);
+					iter_l(i->iter)->b, i->iter);
 
 	ret = bch2_trans_commit_write_locked(trans, stopped_at);
 
 	trans_for_each_update2(trans, i)
 		if (!same_leaf_as_prev(trans, i))
-			bch2_btree_node_unlock_write_inlined(i->iter->l[0].b,
+			bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b,
 							     i->iter);
 
 	/*

diff --git a/libbcachefs/extent_update.c b/libbcachefs/extent_update.c
index beb3b694..8e5070d5 100644
--- a/libbcachefs/extent_update.c
+++ b/libbcachefs/extent_update.c
@@ -44,6 +44,10 @@ static int count_iters_for_insert(struct btree_trans *trans,
 	 * extent we're inserting and overwriting:
 	 */
 	*nr_iters += 1;
+	if (*nr_iters >= max_iters) {
+		*end = bpos_min(*end, k.k->p);
+		ret = 1;
+	}
 
 	switch (k.k->type) {
 	case KEY_TYPE_extent:

diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 902c8da9..936e6366 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -478,7 +478,8 @@ static int check_extents(struct bch_fs *c)
 	bch_verbose(c, "checking extents");
 
 	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
-				   POS(BCACHEFS_ROOT_INO, 0), 0);
+				   POS(BCACHEFS_ROOT_INO, 0),
+				   BTREE_ITER_INTENT);
retry:
 	for_each_btree_key_continue(iter, 0, k, ret) {
 		if (bkey_cmp(prev.p, bkey_start_pos(k.k)) > 0) {
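bch2_btree_add_journal_pin(), factored out above so the interior update path can reuse it, picks a flush callback based on which of the node's two write buffers the keys landed in — each btree node double-buffers unwritten keys, and each buffer pins the journal until that buffer reaches disk. A toy model of the idea (hypothetical types, not the real btree_write machinery):

	struct toy_write { u64 pinned_seq; };		/* 0 = no pin */
	struct toy_node  { struct toy_write writes[2]; unsigned write_idx; };

	static void toy_add_pin(struct toy_node *n, u64 seq)
	{
		struct toy_write *w = &n->writes[n->write_idx & 1];

		/* pin the oldest journal seq this buffer contains */
		if (!w->pinned_seq || seq < w->pinned_seq)
			w->pinned_seq = seq;
	}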
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 02b381cb..2b428ee7 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -27,30 +27,78 @@
 
 /* iterate over keys read from the journal: */
 
-struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
+static struct journal_key *journal_key_search(struct journal_keys *journal_keys,
+					      enum btree_id id, unsigned level,
+					      struct bpos pos)
 {
-	while (iter->k) {
-		if (iter->k->btree_id == iter->btree_id)
-			return bkey_i_to_s_c(iter->k->k);
+	size_t l = 0, r = journal_keys->nr, m;
 
-		iter->k++;
-		if (iter->k == iter->keys->d + iter->keys->nr)
-			iter->k = NULL;
+	while (l < r) {
+		m = l + ((r - l) >> 1);
+		if ((cmp_int(id, journal_keys->d[m].btree_id) ?:
+		     cmp_int(level, journal_keys->d[m].level) ?:
+		     bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0)
+			l = m + 1;
+		else
+			r = m;
 	}
 
-	return bkey_s_c_null;
+	BUG_ON(l < journal_keys->nr &&
+	       (cmp_int(id, journal_keys->d[l].btree_id) ?:
+		cmp_int(level, journal_keys->d[l].level) ?:
+		bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0);
+
+	BUG_ON(l &&
+	       (cmp_int(id, journal_keys->d[l - 1].btree_id) ?:
+		cmp_int(level, journal_keys->d[l - 1].level) ?:
+		bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0);
+
+	return l < journal_keys->nr ? journal_keys->d + l : NULL;
 }
 
-struct bkey_s_c bch2_journal_iter_next(struct journal_iter *iter)
+static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
 {
-	if (!iter->k)
-		return bkey_s_c_null;
+	if (iter->k &&
+	    iter->k < iter->keys->d + iter->keys->nr &&
+	    iter->k->btree_id == iter->btree_id &&
+	    iter->k->level == iter->level)
+		return iter->k->k;
 
-	iter->k++;
-	if (iter->k == iter->keys->d + iter->keys->nr)
-		iter->k = NULL;
+	iter->k = NULL;
+	return NULL;
+}
 
-	return bch2_journal_iter_peek(iter);
+static void bch2_journal_iter_advance(struct journal_iter *iter)
+{
+	if (iter->k)
+		iter->k++;
+}
+
+static void bch2_journal_iter_init(struct journal_iter *iter,
+				   struct journal_keys *journal_keys,
+				   enum btree_id id, unsigned level,
+				   struct bpos pos)
+{
+	iter->btree_id	= id;
+	iter->level	= level;
+	iter->keys	= journal_keys;
+	iter->k		= journal_key_search(journal_keys, id, level, pos);
+}
+
+static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
+{
+	return iter->btree
+		? bch2_btree_iter_peek(iter->btree)
+		: bch2_btree_node_iter_peek_unpack(&iter->node_iter,
+						   iter->b, &iter->unpacked);
+}
+
+static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
+{
+	if (iter->btree)
+		bch2_btree_iter_next(iter->btree);
+	else
+		bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
 }
 
 void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
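journal_key_search() is a textbook lower-bound binary search over the (btree_id, level, pos) triple, and the two BUG_ON()s simply assert the lower-bound postcondition. The same loop over plain integers, for reference (self-contained illustration, not from the tree):

	#include <stddef.h>

	/* first index whose element is >= key; n if there is none.
	 * journal_key_search() is this loop with a composite
	 * comparison: btree_id, then level, then position. */
	static size_t lower_bound(const int *d, size_t n, int key)
	{
		size_t l = 0, r = n, m;

		while (l < r) {
			m = l + ((r - l) >> 1);
			if (d[m] < key)		/* cmp chain > 0 in the patch */
				l = m + 1;
			else
				r = m;
		}
		return l;			/* d[l-1] < key <= d[l] */
	}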
@@ -59,10 +107,10 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
 {
 	switch (iter->last) {
 	case none:
 		break;
 	case btree:
-		bch2_btree_iter_next(iter->btree);
+		bch2_journal_iter_advance_btree(iter);
 		break;
 	case journal:
-		bch2_journal_iter_next(&iter->journal);
+		bch2_journal_iter_advance(&iter->journal);
 		break;
 	}
 
@@ -74,14 +122,16 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
 	struct bkey_s_c ret;
 
 	while (1) {
-		struct bkey_s_c btree_k = bch2_btree_iter_peek(iter->btree);
-		struct bkey_s_c journal_k = bch2_journal_iter_peek(&iter->journal);
+		struct bkey_s_c btree_k =
+			bch2_journal_iter_peek_btree(iter);
+		struct bkey_s_c journal_k =
+			bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal));
 
 		if (btree_k.k && journal_k.k) {
 			int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p);
 
 			if (!cmp)
-				bch2_btree_iter_next(iter->btree);
+				bch2_journal_iter_advance_btree(iter);
 
 			iter->last = cmp < 0 ? btree : journal;
 		} else if (btree_k.k) {
@@ -94,6 +144,14 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
 		}
 
 		ret = iter->last == journal ? journal_k : btree_k;
+
+		if (iter->b &&
+		    bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) {
+			iter->journal.k = NULL;
+			iter->last = none;
+			return bkey_s_c_null;
+		}
+
 		if (!bkey_deleted(ret.k))
 			break;
 
@@ -110,41 +168,32 @@ struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *
 	return bch2_btree_and_journal_iter_peek(iter);
 }
 
-struct journal_key *journal_key_search(struct journal_keys *journal_keys,
-				       enum btree_id id, struct bpos pos)
-{
-	size_t l = 0, r = journal_keys->nr, m;
-
-	while (l < r) {
-		m = l + ((r - l) >> 1);
-		if ((cmp_int(id, journal_keys->d[m].btree_id) ?:
-		     bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0)
-			l = m + 1;
-		else
-			r = m;
-	}
-
-	BUG_ON(l < journal_keys->nr &&
-	       (cmp_int(id, journal_keys->d[l].btree_id) ?:
-		bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0);
-
-	BUG_ON(l &&
-	       (cmp_int(id, journal_keys->d[l - 1].btree_id) ?:
-		bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0);
-
-	return l < journal_keys->nr ? journal_keys->d + l : NULL;
-}
-
 void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter,
 				      struct btree_trans *trans,
 				      struct journal_keys *journal_keys,
 				      enum btree_id id, struct bpos pos)
 {
-	iter->journal.keys = journal_keys;
-	iter->journal.k = journal_key_search(journal_keys, id, pos);
-	iter->journal.btree_id = id;
+	memset(iter, 0, sizeof(*iter));
 
 	iter->btree = bch2_trans_get_iter(trans, id, pos, 0);
+	bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos);
+}
+
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+						struct journal_keys *journal_keys,
+						struct btree *b)
+{
+	struct bpos start = b->data->min_key;
+
+	if (btree_node_type_is_extents(b->btree_id))
+		start = bkey_successor(start);
+
+	memset(iter, 0, sizeof(*iter));
+
+	iter->b = b;
+	bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b);
+	bch2_journal_iter_init(&iter->journal, journal_keys,
+			       b->btree_id, b->level, start);
 }
 
 /* sort and dedup all keys in the journal: */
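bch2_btree_and_journal_iter_peek() is a two-way merge of sorted streams in which the journal key wins ties — on equal positions the btree side is advanced past the shadowed key. The merge rule in isolation, as pseudo-C (have_*/peek_*/next_* are hypothetical helpers):

	while (have_btree(b) || have_journal(j)) {
		if (have_btree(b) && have_journal(j)) {
			int cmp = compare(peek_btree(b), peek_journal(j));

			if (cmp == 0)
				advance_btree(b);	/* journal overrides */
			emit(cmp < 0 ? next_btree(b) : next_journal(j));
		} else if (have_btree(b)) {
			emit(next_btree(b));
		} else {
			emit(next_journal(j));
		}
	}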
@@ -169,7 +218,8 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
 	const struct journal_key *l = _l;
 	const struct journal_key *r = _r;
 
-	return cmp_int(l->btree_id, r->btree_id) ?:
+	return  cmp_int(l->btree_id, r->btree_id) ?:
+		cmp_int(l->level, r->level) ?:
 		bkey_cmp(l->k->k.p, r->k->k.p) ?:
 		cmp_int(l->journal_seq, r->journal_seq) ?:
 		cmp_int(l->journal_offset, r->journal_offset);
@@ -180,9 +230,10 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
 	const struct journal_key *l = _l;
 	const struct journal_key *r = _r;
 
-	return cmp_int(l->journal_seq, r->journal_seq) ?:
-		cmp_int(l->btree_id, r->btree_id) ?:
-		bkey_cmp(l->k->k.p, r->k->k.p);
+	return  cmp_int(r->level, l->level) ?:
+		cmp_int(l->journal_seq, r->journal_seq) ?:
+		cmp_int(l->btree_id, r->btree_id) ?:
+		bkey_cmp(l->k->k.p, r->k->k.p);
 }
 
 static void journal_keys_free(struct journal_keys *keys)
@@ -218,6 +269,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
 		for_each_jset_key(k, _n, entry, &p->j)
 			keys.d[keys.nr++] = (struct journal_key) {
 				.btree_id	= entry->btree_id,
+				.level		= entry->level,
 				.k		= k,
 				.journal_seq	= le64_to_cpu(p->j.seq) -
 					keys.journal_seq_base,
@@ -229,7 +281,8 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
 	src = dst = keys.d;
 	while (src < keys.d + keys.nr) {
 		while (src + 1 < keys.d + keys.nr &&
-		       src[0].btree_id == src[1].btree_id &&
+		       src[0].btree_id	== src[1].btree_id &&
+		       src[0].level	== src[1].level &&
 		       !bkey_cmp(src[0].k->k.p, src[1].k->k.p))
 			src++;
 
@@ -351,12 +404,15 @@ err:
 }
 
 static int __bch2_journal_replay_key(struct btree_trans *trans,
-				     enum btree_id id, struct bkey_i *k)
+				     enum btree_id id, unsigned level,
+				     struct bkey_i *k)
 {
 	struct btree_iter *iter;
 	int ret;
 
-	iter = bch2_trans_get_iter(trans, id, k->k.p, BTREE_ITER_INTENT);
+	iter = bch2_trans_get_node_iter(trans, id, k->k.p,
+					BTREE_MAX_DEPTH, level,
+					BTREE_ITER_INTENT);
 	if (IS_ERR(iter))
 		return PTR_ERR(iter);
 
@@ -375,13 +431,13 @@ static int __bch2_journal_replay_key(struct btree_trans *trans,
 }
 
 static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id,
-				   struct bkey_i *k)
+				   unsigned level, struct bkey_i *k)
 {
 	return bch2_trans_do(c, NULL, NULL,
 			     BTREE_INSERT_NOFAIL|
 			     BTREE_INSERT_LAZY_RW|
 			     BTREE_INSERT_JOURNAL_REPLAY,
-			     __bch2_journal_replay_key(&trans, id, k));
+			     __bch2_journal_replay_key(&trans, id, level, k));
 }
 
 static int bch2_journal_replay(struct bch_fs *c,
@@ -393,15 +449,20 @@ static int bch2_journal_replay(struct bch_fs *c,
 
 	sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL);
 
-	for_each_journal_key(keys, i) {
-		replay_now_at(j, keys.journal_seq_base + i->journal_seq);
+	replay_now_at(j, keys.journal_seq_base);
 
+	for_each_journal_key(keys, i) {
+		if (!i->level)
+			replay_now_at(j, keys.journal_seq_base + i->journal_seq);
+
+		if (i->level)
+			ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
 		if (i->btree_id == BTREE_ID_ALLOC)
 			ret = bch2_alloc_replay_key(c, i->k);
 		else if (i->k->k.size)
 			ret = bch2_extent_replay_key(c, i->btree_id, i->k);
 		else
-			ret = bch2_journal_replay_key(c, i->btree_id, i->k);
+			ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
 
 		if (ret) {
 			bch_err(c, "journal replay: error %d while replaying key",
@@ -864,7 +925,7 @@ int bch2_fs_recovery(struct bch_fs *c)
 	 */
 	bch_info(c, "starting metadata mark and sweep");
 	err = "error in mark and sweep";
-	ret = bch2_gc(c, NULL, true, true);
+	ret = bch2_gc(c, &journal_keys, true, true);
 	if (ret)
 		goto err;
 	bch_verbose(c, "mark and sweep done");
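The new journal_sort_seq_cmp() sorts by descending level first, so every interior-node key is replayed before any leaf key, while leaf keys still replay in journal-sequence order (which is why replay_now_at() is only advanced for level-0 keys). A toy demonstration of the comparator — hypothetical struct, and note the GNU ?: extension used throughout the tree:

	#include <stdio.h>
	#include <stdlib.h>

	struct toy_key { unsigned level, seq; };

	#define cmp_int(l, r)	(((l) > (r)) - ((l) < (r)))

	static int seq_cmp(const void *_l, const void *_r)
	{
		const struct toy_key *l = _l, *r = _r;

		/* descending level, then ascending journal seq */
		return cmp_int(r->level, l->level) ?: cmp_int(l->seq, r->seq);
	}

	int main(void)
	{
		struct toy_key d[] = { {0, 1}, {1, 2}, {0, 3}, {2, 1} };

		qsort(d, 4, sizeof(d[0]), seq_cmp);

		for (size_t i = 0; i < 4; i++)	/* levels 2, 1 first, then leaves */
			printf("level %u seq %u\n", d[i].level, d[i].seq);
		return 0;
	}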
diff --git a/libbcachefs/recovery.h b/libbcachefs/recovery.h
index c9130930..fa1f2818 100644
--- a/libbcachefs/recovery.h
+++ b/libbcachefs/recovery.h
@@ -5,6 +5,7 @@
 struct journal_keys {
 	struct journal_key {
 		enum btree_id	btree_id:8;
+		unsigned	level:8;
 		struct bkey_i	*k;
 		u32		journal_seq;
 		u32		journal_offset;
@@ -17,15 +18,23 @@ struct journal_keys {
 	for (i = (keys).d; i < (keys).d + (keys).nr; (i)++)
 
 struct journal_iter {
+	enum btree_id		btree_id;
+	unsigned		level;
 	struct journal_keys	*keys;
 	struct journal_key	*k;
-	enum btree_id		btree_id;
 };
 
-struct btree_and_journal_iter {
-	enum btree_id		btree_id;
+/*
+ * Iterate over keys in the btree, with keys from the journal overlaid on top:
+ */
+struct btree_and_journal_iter {
 	struct btree_iter	*btree;
+
+	struct btree		*b;
+	struct btree_node_iter	node_iter;
+	struct bkey		unpacked;
+
 	struct journal_iter	journal;
 
 	enum last_key_returned {
@@ -38,12 +47,14 @@ struct btree_and_journal_iter {
 void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
 struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
 struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *);
-struct journal_key *journal_key_search(struct journal_keys *,
-				       enum btree_id, struct bpos);
+
 void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *,
 				      struct btree_trans *,
 				      struct journal_keys *,
 				      enum btree_id, struct bpos);
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
+						struct journal_keys *,
+						struct btree *);
 
 int bch2_fs_recovery(struct bch_fs *);
 int bch2_fs_initialize(struct bch_fs *);

diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index e3cb08d8..6596764c 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -958,6 +958,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
 	c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA);
 	c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
 	c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;
+	c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled;
 	ret = bch2_write_super(c);
 	mutex_unlock(&c->sb_lock);
 
@@ -1089,6 +1090,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
 	c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
 	c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA;
 	c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates);
+	c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled);
 
 	u64s = sizeof(*sb_clean) / sizeof(u64) +
 		c->journal.entry_u64s_reserved;
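The superblock hunks advertise the new feature only while it can matter: bch2_fs_mark_dirty() sets BCH_FEATURE_btree_updates_journalled before the journal can accumulate interior-node entries, and bch2_fs_mark_clean() clears it again — presumably so the bit only gates implementations that would have to replay such a journal. The manipulation itself is the usual mask idiom:

	u64 features = 0;

	features |=   1ULL << BCH_FEATURE_btree_updates_journalled;	/* dirty */
	features &= ~(1ULL << BCH_FEATURE_btree_updates_journalled);	/* clean */

	if (features & (1ULL << BCH_FEATURE_btree_updates_journalled))
		;	/* journal may contain interior-node updates */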