From 2286a79ccb086d8e850cb22df9d02d71d3aacc43 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 15 Mar 2020 20:24:57 -0400 Subject: [PATCH] Update bcachefs sources to d9bb516b2d bcachefs: Move extent overwrite handling out of core btree code --- .bcachefs_revision | 2 +- libbcachefs/bcachefs_format.h | 6 +- libbcachefs/bset.c | 6 +- libbcachefs/btree_gc.c | 57 ++- libbcachefs/btree_io.c | 17 +- libbcachefs/btree_iter.c | 521 +++++++++++++++------------- libbcachefs/btree_iter.h | 17 +- libbcachefs/btree_types.h | 6 +- libbcachefs/btree_update.h | 5 + libbcachefs/btree_update_interior.c | 4 +- libbcachefs/btree_update_interior.h | 23 +- libbcachefs/btree_update_leaf.c | 228 ++++++++++-- libbcachefs/buckets.c | 21 +- libbcachefs/buckets.h | 2 +- libbcachefs/extent_update.c | 414 ++-------------------- libbcachefs/extent_update.h | 5 +- libbcachefs/fsck.c | 56 +++ libbcachefs/recovery.c | 149 +++----- libbcachefs/recovery.h | 2 - 19 files changed, 688 insertions(+), 853 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index 70ff03f8..cc273031 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -7e07e19c29a31d2c719eb16f7e9061ad5aba5099 +d9bb516b2d79fd1ad412e2c8e828182fe6e851c1 diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 42dd1022..798f5c9e 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -1311,12 +1311,14 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); x(inline_data, 8) \ x(new_extent_overwrite, 9) \ x(incompressible, 10) \ - x(btree_ptr_v2, 11) + x(btree_ptr_v2, 11) \ + x(extents_above_btree_updates, 12) #define BCH_SB_FEATURES_ALL \ ((1ULL << BCH_FEATURE_new_siphash)| \ (1ULL << BCH_FEATURE_new_extent_overwrite)| \ - (1ULL << BCH_FEATURE_btree_ptr_v2)) + (1ULL << BCH_FEATURE_btree_ptr_v2)| \ + (1ULL << BCH_FEATURE_extents_above_btree_updates)) enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index 1efb7fb3..6360b2e8 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -1682,7 +1682,8 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct bset_tree *t; unsigned end = 0; - bch2_btree_node_iter_verify(iter, b); + if (btree_keys_expensive_checks(b)) + bch2_btree_node_iter_verify(iter, b); for_each_bset(b, t) { k = bch2_bkey_prev_all(b, t, @@ -1717,7 +1718,8 @@ found: iter->data[0].k = __btree_node_key_to_offset(b, prev); iter->data[0].end = end; - bch2_btree_node_iter_verify(iter, b); + if (btree_keys_expensive_checks(b)) + bch2_btree_node_iter_verify(iter, b); return prev; } diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 3705c41f..c5a0c0ed 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -184,8 +184,16 @@ fsck_err: return ret; } -static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, - u8 *max_stale, bool initial) +static bool pos_in_journal_keys(struct journal_keys *journal_keys, + enum btree_id id, struct bpos pos) +{ + struct journal_key *k = journal_key_search(journal_keys, id, pos); + + return k && k->btree_id == id && !bkey_cmp(k->k->k.p, pos); +} + +static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, + struct journal_keys *journal_keys, bool initial) { struct btree_node_iter iter; struct bkey unpacked; @@ -199,6 +207,10 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { + if (!b->level && journal_keys && + 
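/*
 * Initial gc now skips leaf keys superseded by entries in the in-memory
 * journal keys list: pos_in_journal_keys() (defined above) does a
 * journal_key_search() and treats a key as present only on an exact
 * btree_id and position match. Those keys are marked directly from
 * journal_keys in bch2_gc_btrees() instead, replacing the old
 * mark_journal_key() walk over the btree.
 */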
pos_in_journal_keys(journal_keys, b->btree_id, k.k->p)) + continue; + bch2_bkey_debugcheck(c, b, k); ret = bch2_gc_mark_key(c, k, max_stale, initial); @@ -210,6 +222,7 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, } static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + struct journal_keys *journal_keys, bool initial, bool metadata_only) { struct btree_trans trans; @@ -237,7 +250,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, gc_pos_set(c, gc_pos_btree_node(b)); - ret = btree_gc_mark_node(c, b, &max_stale, initial); + ret = btree_gc_mark_node(c, b, &max_stale, + journal_keys, initial); if (ret) break; @@ -279,36 +293,6 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) (int) btree_id_to_gc_phase(r); } -static int mark_journal_key(struct bch_fs *c, enum btree_id id, - struct bkey_i *insert) -{ - struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; - u8 max_stale; - int ret = 0; - - ret = bch2_gc_mark_key(c, bkey_i_to_s_c(insert), &max_stale, true); - if (ret) - return ret; - - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, id, bkey_start_pos(&insert->k), - BTREE_ITER_SLOTS, k, ret) { - percpu_down_read(&c->mark_lock); - ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL, - BTREE_TRIGGER_GC| - BTREE_TRIGGER_NOATOMIC); - percpu_up_read(&c->mark_lock); - - if (!ret) - break; - } - - return bch2_trans_exit(&trans) ?: ret; -} - static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, bool initial, bool metadata_only) { @@ -323,18 +307,21 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, enum btree_id id = ids[i]; enum btree_node_type type = __btree_node_type(0, id); - int ret = bch2_gc_btree(c, id, initial, metadata_only); + int ret = bch2_gc_btree(c, id, journal_keys, + initial, metadata_only); if (ret) return ret; if (journal_keys && !metadata_only && btree_node_type_needs_gc(type)) { struct journal_key *j; + u8 max_stale; int ret; for_each_journal_key(*journal_keys, j) if (j->btree_id == id) { - ret = mark_journal_key(c, id, j->k); + ret = bch2_gc_mark_key(c, bkey_i_to_s_c(j->k), + &max_stale, initial); if (ret) return ret; } diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 0370c782..3f7c1042 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -709,9 +709,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, unsigned *whiteout_u64s, int write, bool have_retry) { - struct bkey_packed *k; - struct bkey prev = KEY(0, 0, 0); - struct bpos prev_data = POS_MIN; + struct bkey_packed *k, *prev = NULL; bool seen_non_whiteout = false; unsigned version; const char *err; @@ -853,15 +851,15 @@ static int validate_bset(struct bch_fs *c, struct btree *b, if (!seen_non_whiteout && (!bkey_whiteout(k) || - (bkey_cmp(prev.p, bkey_start_pos(u.k)) > 0))) { + (prev && bkey_iter_cmp(b, prev, k) > 0))) { *whiteout_u64s = k->_data - i->_data; seen_non_whiteout = true; - } else if (bkey_cmp(prev_data, bkey_start_pos(u.k)) > 0 || - bkey_cmp(prev.p, u.k->p) > 0) { + } else if (prev && bkey_iter_cmp(b, prev, k) > 0) { char buf1[80]; char buf2[80]; + struct bkey up = bkey_unpack_key(b, prev); - bch2_bkey_to_text(&PBUF(buf1), &prev); + bch2_bkey_to_text(&PBUF(buf1), &up); bch2_bkey_to_text(&PBUF(buf2), u.k); bch2_dump_bset(b, i, 0); @@ -871,10 +869,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, /* XXX: repair this */ } - if (!bkey_deleted(u.k)) - prev_data = u.k->p; - prev 
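/*
 * The ordering checks in validate_bset() now compare packed keys
 * directly with bkey_iter_cmp(), so the unpacked prev/prev_data copies
 * tracked here are replaced by a plain bkey_packed pointer into the
 * bset, unpacked only on the error path for printing.
 */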
= *u.k; - + prev = k; k = bkey_next_skip_noops(k, vstruct_last(i)); } diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 024a931b..2819b9a4 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -5,6 +5,7 @@ #include "btree_cache.h" #include "btree_iter.h" #include "btree_locking.h" +#include "btree_update.h" #include "debug.h" #include "extents.h" @@ -261,7 +262,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, /* Btree iterator locking: */ #ifdef CONFIG_BCACHEFS_DEBUG -void bch2_btree_iter_verify_locks(struct btree_iter *iter) +static void bch2_btree_iter_verify_locks(struct btree_iter *iter) { unsigned l; @@ -282,6 +283,8 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans) trans_for_each_iter(trans, iter) bch2_btree_iter_verify_locks(iter); } +#else +static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} #endif __flatten @@ -405,23 +408,43 @@ void bch2_trans_unlock(struct btree_trans *trans) #ifdef CONFIG_BCACHEFS_DEBUG -static void __bch2_btree_iter_verify(struct btree_iter *iter, - struct btree *b) +static void bch2_btree_iter_verify_level(struct btree_iter *iter, + unsigned level) { struct bpos pos = btree_iter_search_key(iter); - struct btree_iter_level *l = &iter->l[b->level]; + struct btree_iter_level *l = &iter->l[level]; struct btree_node_iter tmp = l->iter; - struct bkey_packed *k; + bool locked = btree_node_locked(iter, level); + struct bkey_packed *p, *k; + char buf1[100], buf2[100]; + const char *msg; if (!debug_check_iterators(iter->trans->c)) return; - if (iter->uptodate > BTREE_ITER_NEED_PEEK) + BUG_ON(iter->level < iter->min_depth); + + if (!btree_iter_node(iter, level)) return; - BUG_ON(!btree_iter_pos_in_node(iter, b)); + if (!bch2_btree_node_relock(iter, level)) + return; - bch2_btree_node_iter_verify(&l->iter, b); + /* + * Ideally this invariant would always be true, and hopefully in the + * future it will be, but for now set_pos_same_leaf() breaks it: + */ + BUG_ON(iter->uptodate < BTREE_ITER_NEED_TRAVERSE && + !btree_iter_pos_in_node(iter, l->b)); + + /* + * node iterators don't use leaf node iterator: + */ + if (btree_iter_type(iter) == BTREE_ITER_NODES && + level <= iter->min_depth) + goto unlock; + + bch2_btree_node_iter_verify(&l->iter, l->b); /* * For interior nodes, the iterator will have skipped past @@ -430,46 +453,73 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, * For extents, the iterator may have skipped past deleted keys (but not * whiteouts) */ - k = b->level || btree_node_type_is_extents(iter->btree_id) - ? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard) - : bch2_btree_node_iter_prev_all(&tmp, b); - if (k && bkey_iter_pos_cmp(b, k, &pos) >= 0) { - char buf[100]; - struct bkey uk = bkey_unpack_key(b, k); + p = level || btree_node_type_is_extents(iter->btree_id) + ? 
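/*
 * The rewritten verify checks that the node iterator sits between its
 * neighbours relative to the search key: the previous key (fetched with
 * prev_filter on interior/extent nodes, where deleted keys may
 * legitimately be skipped) must compare strictly before the search
 * position, and the current key must compare at or after it; on
 * failure both neighbours are printed.
 */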
bch2_btree_node_iter_prev_filter(&tmp, l->b, KEY_TYPE_discard) + : bch2_btree_node_iter_prev_all(&tmp, l->b); + k = bch2_btree_node_iter_peek_all(&l->iter, l->b); - bch2_bkey_to_text(&PBUF(buf), &uk); - panic("iterator should be before prev key:\n%s\n%llu:%llu\n", - buf, iter->pos.inode, iter->pos.offset); + if (p && bkey_iter_pos_cmp(l->b, p, &pos) >= 0) { + msg = "before"; + goto err; } - k = bch2_btree_node_iter_peek_all(&l->iter, b); - if (k && bkey_iter_pos_cmp(b, k, &pos) < 0) { - char buf[100]; - struct bkey uk = bkey_unpack_key(b, k); - - bch2_bkey_to_text(&PBUF(buf), &uk); - panic("iter should be after current key:\n" - "iter pos %llu:%llu\n" - "cur key %s\n", - iter->pos.inode, iter->pos.offset, buf); + if (k && bkey_iter_pos_cmp(l->b, k, &pos) < 0) { + msg = "after"; + goto err; } +unlock: + if (!locked) + btree_node_unlock(iter, level); + return; +err: + strcpy(buf1, "(none)"); + strcpy(buf2, "(none)"); + + if (p) { + struct bkey uk = bkey_unpack_key(l->b, p); + bch2_bkey_to_text(&PBUF(buf1), &uk); + } + + if (k) { + struct bkey uk = bkey_unpack_key(l->b, k); + bch2_bkey_to_text(&PBUF(buf2), &uk); + } + + panic("iterator should be %s key at level %u:\n" + "iter pos %s %llu:%llu\n" + "prev key %s\n" + "cur key %s\n", + msg, level, + iter->flags & BTREE_ITER_IS_EXTENTS ? ">" : "=>", + iter->pos.inode, iter->pos.offset, + buf1, buf2); } -void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b) +static void bch2_btree_iter_verify(struct btree_iter *iter) { - struct btree_iter *linked; + unsigned i; - if (!debug_check_iterators(iter->trans->c)) + bch2_btree_trans_verify_locks(iter->trans); + + for (i = 0; i < BTREE_MAX_DEPTH; i++) + bch2_btree_iter_verify_level(iter, i); +} + +void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) +{ + struct btree_iter *iter; + + if (!debug_check_iterators(trans->c)) return; - trans_for_each_iter_with_node(iter->trans, b, linked) - __bch2_btree_iter_verify(linked, b); + trans_for_each_iter_with_node(trans, b, iter) + bch2_btree_iter_verify_level(iter, b->level); } #else -static inline void __bch2_btree_iter_verify(struct btree_iter *iter, - struct btree *b) {} +static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {} +static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} #endif @@ -514,7 +564,7 @@ void bch2_btree_iter_fix_key_modified(struct btree_iter *iter, trans_for_each_iter_with_node(iter->trans, b, linked) { __bch2_btree_iter_fix_key_modified(linked, b, where); - __bch2_btree_iter_verify(linked, b); + bch2_btree_iter_verify_level(linked, b->level); } } @@ -613,19 +663,8 @@ fixup_done: if (!b->level && node_iter == &iter->l[0].iter && - iter_current_key_modified) { - struct bkey_packed *k = - bch2_btree_node_iter_peek_all(node_iter, b); - - if (likely(k)) { - bkey_disassemble(b, k, &iter->k); - } else { - /* XXX: for extents, calculate size of hole? 
*/ - iter->k.type = KEY_TYPE_deleted; - } - + iter_current_key_modified) btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); - } } void bch2_btree_node_iter_fix(struct btree_iter *iter, @@ -641,14 +680,16 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, if (node_iter != &iter->l[b->level].iter) { __bch2_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, new_u64s); - bch2_btree_node_iter_verify(node_iter, b); + + if (debug_check_iterators(iter->trans->c)) + bch2_btree_node_iter_verify(node_iter, b); } trans_for_each_iter_with_node(iter->trans, b, linked) { __bch2_btree_node_iter_fix(linked, b, &linked->l[b->level].iter, t, where, clobber_u64s, new_u64s); - __bch2_btree_iter_verify(linked, b); + bch2_btree_iter_verify_level(linked, b->level); } } @@ -1134,9 +1175,7 @@ static int btree_iter_traverse_one(struct btree_iter *iter) iter->uptodate = BTREE_ITER_NEED_PEEK; - bch2_btree_trans_verify_locks(iter->trans); - if (btree_iter_node(iter, iter->level)) - __bch2_btree_iter_verify(iter, iter->l[iter->level].b); + bch2_btree_iter_verify(iter); return 0; } @@ -1156,12 +1195,14 @@ static inline void bch2_btree_iter_checks(struct btree_iter *iter, enum btree_iter_type type) { EBUG_ON(iter->btree_id >= BTREE_ID_NR); - EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != - (btree_node_type_is_extents(iter->btree_id) && - type != BTREE_ITER_NODES)); EBUG_ON(btree_iter_type(iter) != type); - bch2_btree_trans_verify_locks(iter->trans); + BUG_ON(type == BTREE_ITER_KEYS && + (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || + bkey_cmp(iter->pos, iter->k.p) > 0)); + + bch2_btree_iter_verify_locks(iter); + bch2_btree_iter_verify_level(iter, iter->level); } /* Iterate across nodes (leaf and interior nodes) */ @@ -1189,10 +1230,12 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) iter->pos = b->key.k.p; iter->uptodate = BTREE_ITER_UPTODATE; + bch2_btree_iter_verify(iter); + return b; } -struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) +struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) { struct btree *b; int ret; @@ -1238,7 +1281,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) iter->pos = iter->btree_id == BTREE_ID_INODES ? 
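/*
 * Instead of taking the depth as an argument,
 * bch2_btree_iter_next_node() now resumes at iter->min_depth, the
 * depth the node iterator was created at; the position advances to the
 * successor of the node's max key before re-traversing.
 */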
btree_type_successor(iter->btree_id, iter->pos) : bkey_successor(iter->pos); - iter->level = depth; + iter->level = iter->min_depth; btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ret = bch2_btree_iter_traverse(iter); @@ -1251,6 +1294,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) iter->pos = b->key.k.p; iter->uptodate = BTREE_ITER_UPTODATE; + bch2_btree_iter_verify(iter); + return b; } @@ -1265,7 +1310,8 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_ EBUG_ON(!btree_node_locked(iter, 0)); EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0); - iter->pos = new_pos; + bkey_init(&iter->k); + iter->k.p = iter->pos = new_pos; btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); btree_iter_advance_to_pos(iter, l, -1); @@ -1275,9 +1321,14 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); } -static unsigned btree_iter_pos_changed(struct btree_iter *iter, int cmp) +static void btree_iter_pos_changed(struct btree_iter *iter, int cmp) { - unsigned l = btree_iter_up_until_good_node(iter, cmp); + unsigned l = iter->level; + + if (!cmp) + goto out; + + l = btree_iter_up_until_good_node(iter, cmp); if (btree_iter_node(iter, l)) { /* @@ -1294,85 +1345,71 @@ static unsigned btree_iter_pos_changed(struct btree_iter *iter, int cmp) if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED) btree_node_unlock(iter, l); } - - return l; +out: + if (l != iter->level) + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + else + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos, bool strictly_greater) { struct bpos old = btree_iter_search_key(iter); - unsigned l; int cmp; iter->flags &= ~BTREE_ITER_IS_EXTENTS; iter->flags |= strictly_greater ? 
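/*
 * Setting a new position now reinitializes iter->k as well, so that
 * iter->k.p always reflects the search position even before a peek;
 * bch2_btree_iter_next() and friends use iter->k.p as the point to
 * resume from.
 */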
BTREE_ITER_IS_EXTENTS : 0; - iter->pos = new_pos; + + bkey_init(&iter->k); + iter->k.p = iter->pos = new_pos; cmp = bkey_cmp(btree_iter_search_key(iter), old); - if (!cmp) - return; - l = btree_iter_pos_changed(iter, cmp); - - if (l != iter->level) - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - else - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + btree_iter_pos_changed(iter, cmp); } void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { int cmp = bkey_cmp(new_pos, iter->pos); - unsigned l; - if (!cmp) - return; + bkey_init(&iter->k); + iter->k.p = iter->pos = new_pos; - iter->pos = new_pos; - - l = btree_iter_pos_changed(iter, cmp); - - if (l != iter->level) - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - else - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + btree_iter_pos_changed(iter, cmp); } static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; + bool ret; - iter->pos = l->b->key.k.p; - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + bkey_init(&iter->k); + iter->k.p = iter->pos = l->b->key.k.p; - if (!bkey_cmp(iter->pos, POS_MAX)) { - bkey_init(&iter->k); - iter->k.p = POS_MAX; - return false; - } + ret = bkey_cmp(iter->pos, POS_MAX) != 0; + if (ret) + iter->k.p = iter->pos = btree_type_successor(iter->btree_id, iter->pos); - iter->pos = btree_type_successor(iter->btree_id, iter->pos); btree_iter_pos_changed(iter, 1); - return true; + return ret; } static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; + bool ret; - iter->pos = l->b->data->min_key; + bkey_init(&iter->k); + iter->k.p = iter->pos = l->b->data->min_key; iter->uptodate = BTREE_ITER_NEED_TRAVERSE; - if (!bkey_cmp(iter->pos, POS_MIN)) { - bkey_init(&iter->k); - iter->k.p = POS_MIN; - return false; - } + ret = bkey_cmp(iter->pos, POS_MIN) != 0; + if (ret) + iter->k.p = iter->pos = btree_type_predecessor(iter->btree_id, iter->pos); - iter->pos = btree_type_predecessor(iter->btree_id, iter->pos); btree_iter_pos_changed(iter, -1); - return true; + return ret; } /** @@ -1441,6 +1478,8 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) iter->pos = bkey_start_pos(k.k); iter->uptodate = BTREE_ITER_UPTODATE; + + bch2_btree_iter_verify_level(iter, 0); return k; } @@ -1449,61 +1488,105 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) * position */ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) +{ + if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) + return bkey_s_c_null; + + bch2_btree_iter_set_pos(iter, + btree_type_successor(iter->btree_id, iter->k.p)); + + return bch2_btree_iter_peek(iter); +} + +static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter) +{ + struct bpos pos = btree_iter_search_key(iter); + struct btree_trans *trans = iter->trans; + struct btree_insert_entry *i; + + trans_for_each_update2(trans, i) + if ((cmp_int(iter->btree_id, i->iter->btree_id) ?: + bkey_cmp(pos, i->k->k.p)) <= 0) + break; + + return i < trans->updates2 + trans->nr_updates2 && + iter->btree_id == i->iter->btree_id + ? 
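/*
 * updates2 is kept sorted by (btree_id, pos), so the scan above stops
 * at the first entry at or after the search key; peek_with_updates()
 * then returns whichever of the btree key and the pending update sorts
 * first. E.g. with a pending update ending at 5:20 and a btree key at
 * 5:15, a peek from 5:0 returns the btree key; from 5:16 (with no
 * further btree keys before it) it returns the update.
 */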
bkey_i_to_s_c(i->k) + : bkey_s_c_null; +} + +static struct bkey_s_c __bch2_btree_iter_peek_with_updates(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; - struct bkey_packed *p; + struct bkey_s_c k = __btree_iter_peek(iter, l); + struct bkey_s_c u = __btree_trans_updates_peek(iter); + + if (k.k && (!u.k || bkey_cmp(k.k->p, u.k->p) < 0)) + return k; + if (u.k && bkey_cmp(u.k->p, l->b->key.k.p) <= 0) { + iter->k = *u.k; + return u; + } + return bkey_s_c_null; +} + +struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) +{ struct bkey_s_c k; + int ret; bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); - if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { - if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) + while (1) { + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + + k = __bch2_btree_iter_peek_with_updates(iter); + + if (k.k && bkey_deleted(k.k)) { + bch2_btree_iter_set_pos(iter, + btree_type_successor(iter->btree_id, iter->k.p)); + continue; + } + + if (likely(k.k)) + break; + + if (!btree_iter_set_pos_to_next_leaf(iter)) return bkey_s_c_null; - - /* - * XXX: when we just need to relock we should be able to avoid - * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK - * for that to work - */ - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; - - bch2_btree_iter_set_pos(iter, - btree_type_successor(iter->btree_id, iter->k.p)); - - return bch2_btree_iter_peek(iter); } - if (unlikely(bkey_deleted(&iter->k))) { - /* - * we're currently pointed at a hole, because previously we were - * iterating over slots: - */ - return bch2_btree_iter_peek(iter); - } + /* + * iter->pos should always be equal to the key we just + * returned - except extents can straddle iter->pos: + */ + if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || + bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + iter->pos = bkey_start_pos(k.k); - do { - bch2_btree_node_iter_advance(&l->iter, l->b); - p = bch2_btree_node_iter_peek_all(&l->iter, l->b); - } while (likely(p) && bkey_whiteout(p)); - - if (unlikely(!p)) - return btree_iter_set_pos_to_next_leaf(iter) - ? 
bch2_btree_iter_peek(iter) - : bkey_s_c_null; - - k = __btree_iter_unpack(iter, l, &iter->k, p); - - EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) < 0); - iter->pos = bkey_start_pos(k.k); + iter->uptodate = BTREE_ITER_UPTODATE; return k; } +struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) +{ + if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) + return bkey_s_c_null; + + bch2_btree_iter_set_pos(iter, + btree_type_successor(iter->btree_id, iter->k.p)); + + return bch2_btree_iter_peek_with_updates(iter); +} + /** * bch2_btree_iter_peek_prev: returns first key less than or equal to * iterator's current position */ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) { + struct bpos pos = iter->pos; struct btree_iter_level *l = &iter->l[0]; struct bkey_s_c k; int ret; @@ -1520,8 +1603,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) return bkey_s_c_err(ret); k = __btree_iter_peek(iter, l); - if (!k.k || - bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + if (!k.k || bkey_cmp(bkey_start_pos(k.k), pos) > 0) k = __btree_iter_prev(iter, l); if (likely(k.k)) @@ -1531,7 +1613,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) return bkey_s_c_null; } - EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0); + EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0); iter->pos = bkey_start_pos(k.k); iter->uptodate = BTREE_ITER_UPTODATE; return k; @@ -1543,33 +1625,16 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) */ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) { - struct btree_iter_level *l = &iter->l[0]; - struct bkey_s_c k; + struct bpos pos = bkey_start_pos(&iter->k); bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); - if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { - /* - * XXX: when we just need to relock we should be able to avoid - * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK - * for that to work - */ - iter->pos = btree_type_predecessor(iter->btree_id, - iter->pos); - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + if (unlikely(!bkey_cmp(pos, POS_MIN))) + return bkey_s_c_null; - return bch2_btree_iter_peek_prev(iter); - } + bch2_btree_iter_set_pos(iter, bkey_predecessor(pos)); - k = __btree_iter_prev(iter, l); - if (unlikely(!k.k)) - return btree_iter_set_pos_to_prev_leaf(iter) - ? 
bch2_btree_iter_peek(iter) - : bkey_s_c_null; - - EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0); - iter->pos = bkey_start_pos(k.k); - return k; + return bch2_btree_iter_peek_prev(iter); } static inline struct bkey_s_c @@ -1581,8 +1646,17 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) struct bkey n; int ret; -recheck: - btree_iter_advance_to_pos(iter, l, -1); + /* keys & holes can't span inode numbers: */ + if (iter->pos.offset == KEY_OFFSET_MAX) { + if (iter->pos.inode == KEY_INODE_MAX) + return bkey_s_c_null; + + bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos)); + + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + } /* * iterator is now at the correct position for inserting at iter->pos, @@ -1596,47 +1670,17 @@ recheck: if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { /* - * If there wasn't actually a hole, want the iterator to be - * pointed at the key we found: - * - * XXX: actually, we shouldn't be changing the iterator here: - * the iterator needs to be correct for inserting at iter->pos, - * and there may be whiteouts between iter->pos and what this - * iterator points at: + * We're not setting iter->uptodate because the node iterator + * doesn't necessarily point at the key we're returning: */ - l->iter = node_iter; EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0); - iter->uptodate = BTREE_ITER_UPTODATE; - - __bch2_btree_iter_verify(iter, l->b); + bch2_btree_iter_verify_level(iter, 0); return k; } - /* - * If we got to the end of the node, check if we need to traverse to the - * next node: - */ - if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) { - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); - - goto recheck; - } - /* hole */ - /* holes can't span inode numbers: */ - if (iter->pos.offset == KEY_OFFSET_MAX) { - if (iter->pos.inode == KEY_INODE_MAX) - return bkey_s_c_null; - - iter->pos = bkey_successor(iter->pos); - goto recheck; - } - if (!k.k) k.k = &l->b->key.k; @@ -1654,15 +1698,24 @@ recheck: iter->k = n; iter->uptodate = BTREE_ITER_UPTODATE; - __bch2_btree_iter_verify(iter, l->b); + bch2_btree_iter_verify_level(iter, 0); return (struct bkey_s_c) { &iter->k, NULL }; } -static inline struct bkey_s_c -__bch2_btree_iter_peek_slot(struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; struct bkey_s_c k; + int ret; + + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + + if (iter->uptodate == BTREE_ITER_UPTODATE) + return btree_iter_peek_uptodate(iter); + + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); if (iter->flags & BTREE_ITER_IS_EXTENTS) return __bch2_btree_iter_peek_slot_extents(iter); @@ -1679,52 +1732,19 @@ __bch2_btree_iter_peek_slot(struct btree_iter *iter) } iter->uptodate = BTREE_ITER_UPTODATE; - __bch2_btree_iter_verify(iter, l->b); + bch2_btree_iter_verify_level(iter, 0); return k; } -struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) -{ - int ret; - - bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); - - if (iter->uptodate == BTREE_ITER_UPTODATE) - return btree_iter_peek_uptodate(iter); - - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); - - return __bch2_btree_iter_peek_slot(iter); -} - struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) { - bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + if 
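/*
 * bch2_btree_iter_next_slot() reduces to "advance the position, then
 * peek": the old manual node-iterator advance and the NEED_TRAVERSE
 * special cases are gone now that bch2_btree_iter_set_pos() correctly
 * invalidates the iterator.
 */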
(unlikely(!bkey_cmp(iter->k.p, POS_MAX))) + return bkey_s_c_null; - /* XXX directly setting iter->pos is wrong */ - iter->pos = btree_type_successor(iter->btree_id, iter->k.p); + bch2_btree_iter_set_pos(iter, + btree_type_successor(iter->btree_id, iter->k.p)); - if (unlikely(btree_iter_pos_after_node(iter, iter->l[0].b))) - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - - if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { - /* - * XXX: when we just need to relock we should be able to avoid - * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK - * for that to work - */ - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - - return bch2_btree_iter_peek_slot(iter); - } - - btree_iter_advance_to_pos(iter, &iter->l[0], -1); - - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); - - return __bch2_btree_iter_peek_slot(iter); + return bch2_btree_iter_peek_slot(iter); } static inline void bch2_btree_iter_init(struct btree_trans *trans, @@ -1746,6 +1766,7 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, iter->uptodate = BTREE_ITER_NEED_TRAVERSE; iter->btree_id = btree_id; iter->level = 0; + iter->min_depth = 0; iter->locks_want = flags & BTREE_ITER_INTENT ? 1 : 0; iter->nodes_locked = 0; iter->nodes_intent_locked = 0; @@ -1800,7 +1821,7 @@ int bch2_trans_iter_free(struct btree_trans *trans, static int bch2_trans_realloc_iters(struct btree_trans *trans, unsigned new_size) { - void *new_iters, *new_updates; + void *p, *new_iters, *new_updates, *new_updates2; size_t iters_bytes; size_t updates_bytes; @@ -1818,21 +1839,27 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans, iters_bytes = sizeof(struct btree_iter) * new_size; updates_bytes = sizeof(struct btree_insert_entry) * new_size; - new_iters = kmalloc(iters_bytes + updates_bytes, GFP_NOFS); - if (new_iters) + p = kmalloc(iters_bytes + + updates_bytes + + updates_bytes, GFP_NOFS); + if (p) goto success; - new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); + p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); new_size = BTREE_ITER_MAX; trans->used_mempool = true; success: - new_updates = new_iters + iters_bytes; + new_iters = p; p += iters_bytes; + new_updates = p; p += updates_bytes; + new_updates2 = p; p += updates_bytes; memcpy(new_iters, trans->iters, sizeof(struct btree_iter) * trans->nr_iters); memcpy(new_updates, trans->updates, sizeof(struct btree_insert_entry) * trans->nr_updates); + memcpy(new_updates2, trans->updates2, + sizeof(struct btree_insert_entry) * trans->nr_updates2); if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) memset(trans->iters, POISON_FREE, @@ -1844,6 +1871,7 @@ success: trans->iters = new_iters; trans->updates = new_updates; + trans->updates2 = new_updates2; trans->size = new_size; if (trans->iters_live) { @@ -2011,6 +2039,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, iter->locks_want = locks_want; iter->level = depth; + iter->min_depth = depth; for (i = 0; i < ARRAY_SIZE(iter->l); i++) iter->l[i].b = NULL; @@ -2104,6 +2133,7 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) trans->need_reset = 0; trans->nr_updates = 0; + trans->nr_updates2 = 0; trans->mem_top = 0; if (trans->fs_usage_deltas) { @@ -2134,6 +2164,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, trans->size = ARRAY_SIZE(trans->iters_onstack); trans->iters = trans->iters_onstack; trans->updates = trans->updates_onstack; + trans->updates2 = trans->updates2_onstack; trans->fs_usage_deltas = NULL; if (expected_nr_iters > 
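/*
 * Transactions now carry a second array of insert entries: updates
 * holds what callers queued (overlapping extents included), updates2
 * holds the flattened, non-overlapping form built at commit time by
 * extent_handle_overwrites()/extent_update_to_keys(). The on-stack
 * array and the btree_iters_pool sizing grow to match.
 */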
trans->size) @@ -2171,5 +2202,5 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, sizeof(struct btree_iter) * nr + sizeof(struct btree_insert_entry) * nr + - sizeof(u8) * nr); + sizeof(struct btree_insert_entry) * nr); } diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index a95d0f13..6f51ef35 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -96,11 +96,11 @@ __trans_next_iter_with_node(struct btree_trans *trans, struct btree *b, (_iter)->idx + 1)) #ifdef CONFIG_BCACHEFS_DEBUG -void bch2_btree_iter_verify(struct btree_iter *, struct btree *); +void bch2_btree_trans_verify_iters(struct btree_trans *, struct btree *); void bch2_btree_trans_verify_locks(struct btree_trans *); #else -static inline void bch2_btree_iter_verify(struct btree_iter *iter, - struct btree *b) {} +static inline void bch2_btree_trans_verify_iters(struct btree_trans *trans, + struct btree *b) {} static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {} #endif @@ -154,11 +154,14 @@ bch2_btree_iter_traverse(struct btree_iter *iter) int bch2_btree_iter_traverse_all(struct btree_trans *); struct btree *bch2_btree_iter_peek_node(struct btree_iter *); -struct btree *bch2_btree_iter_next_node(struct btree_iter *, unsigned); +struct btree *bch2_btree_iter_next_node(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *); + struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); @@ -199,9 +202,7 @@ static inline int __btree_iter_cmp(enum btree_id id, struct bpos pos, const struct btree_iter *r) { - if (id != r->btree_id) - return id < r->btree_id ? 
-1 : 1; - return bkey_cmp(pos, r->pos); + return cmp_int(id, r->btree_id) ?: bkey_cmp(pos, r->pos); } static inline int btree_iter_cmp(const struct btree_iter *l, @@ -231,7 +232,7 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans) _start, _locks_want, _depth, _flags), \ _b = bch2_btree_iter_peek_node(_iter); \ (_b); \ - (_b) = bch2_btree_iter_next_node(_iter, _depth)) + (_b) = bch2_btree_iter_next_node(_iter)) #define for_each_btree_node(_trans, _iter, _btree_id, _start, \ _flags, _b) \ diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 20757d0c..51d579a4 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -235,9 +235,10 @@ struct btree_iter { u16 flags; u8 idx; - enum btree_iter_uptodate uptodate:4; enum btree_id btree_id:4; + enum btree_iter_uptodate uptodate:4; unsigned level:4, + min_depth:4, locks_want:4, nodes_locked:4, nodes_intent_locked:4; @@ -279,6 +280,7 @@ struct btree_trans { u8 nr_iters; u8 nr_updates; + u8 nr_updates2; u8 size; unsigned used_mempool:1; unsigned error:1; @@ -291,6 +293,7 @@ struct btree_trans { struct btree_iter *iters; struct btree_insert_entry *updates; + struct btree_insert_entry *updates2; /* update path: */ struct journal_res journal_res; @@ -304,6 +307,7 @@ struct btree_trans { struct btree_iter iters_onstack[2]; struct btree_insert_entry updates_onstack[2]; + struct btree_insert_entry updates2_onstack[2]; }; #define BTREE_FLAG(flag) \ diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index d1cd839a..12127a33 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -132,4 +132,9 @@ static inline int bch2_trans_commit(struct btree_trans *trans, (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) +#define trans_for_each_update2(_trans, _i) \ + for ((_i) = (_trans)->updates2; \ + (_i) < (_trans)->updates2 + (_trans)->nr_updates2; \ + (_i)++) + #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 68924ad0..02f19146 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -1557,7 +1557,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, trans_for_each_iter_with_node(iter->trans, b, linked) bch2_btree_node_iter_peek(&linked->l[b->level].iter, b); - bch2_btree_iter_verify(iter, b); + bch2_btree_trans_verify_iters(iter->trans, b); } /** @@ -1827,7 +1827,7 @@ retry: bch2_btree_iter_node_replace(iter, n); - bch2_btree_iter_verify(iter, n); + bch2_btree_trans_verify_iters(trans, n); bch2_btree_node_free_inmem(c, b, iter); bch2_btree_node_free_inmem(c, m, iter); diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index 657b3d31..c90fcd48 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -303,18 +303,23 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, } static inline void push_whiteout(struct bch_fs *c, struct btree *b, - struct bkey_packed *k) + struct bpos pos) { - unsigned u64s = bkeyp_key_u64s(&b->format, k); - struct bkey_packed *dst; + struct bkey_packed k; - BUG_ON(u64s > bch_btree_keys_u64s_remaining(c, b)); + BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); - b->whiteout_u64s += bkeyp_key_u64s(&b->format, k); - dst = unwritten_whiteouts_start(c, b); - memcpy_u64s(dst, k, u64s); - dst->u64s = u64s; - dst->type = KEY_TYPE_deleted; + if (!bkey_pack_pos(&k, pos, b)) { + struct bkey *u = (void *) &k; + + 
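/*
 * If the position doesn't pack into this node's key format, fall back
 * to writing an unpacked bkey in place: bkey_init() sets the current
 * (unpacked) key format, so the k.u64s accounting and bkey_copy()
 * below still do the right thing.
 */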
bkey_init(u); + u->p = pos; + } + + k.needs_whiteout = true; + + b->whiteout_u64s += k.u64s; + bkey_copy(unwritten_whiteouts_start(c, b), &k); } /* diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 59a1175c..389b5ee2 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -23,11 +23,10 @@ static inline bool same_leaf_as_prev(struct btree_trans *trans, struct btree_insert_entry *i) { - return i != trans->updates && + return i != trans->updates2 && i[0].iter->l[0].b == i[-1].iter->l[0].b; } - inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, struct btree_iter *iter) { @@ -61,6 +60,9 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 || bkey_cmp(insert->k.p, b->data->max_key) > 0); + EBUG_ON(insert->k.u64s > + bch_btree_keys_u64s_remaining(iter->trans->c, b)); + EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); k = bch2_btree_node_iter_peek_all(node_iter, b); if (k && bkey_cmp_packed(b, k, &insert->k)) @@ -79,7 +81,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, k->type = KEY_TYPE_deleted; if (k->needs_whiteout) - push_whiteout(iter->trans->c, b, k); + push_whiteout(iter->trans->c, b, insert->k.p); k->needs_whiteout = false; if (k >= btree_bset_last(b)->start) { @@ -195,20 +197,6 @@ void bch2_btree_journal_key(struct btree_trans *trans, set_btree_node_dirty(b); } -static void bch2_insert_fixup_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) -{ - struct btree_iter_level *l = &iter->l[0]; - - EBUG_ON(iter->level); - EBUG_ON(insert->k.u64s > - bch_btree_keys_u64s_remaining(trans->c, l->b)); - - if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter, insert))) - bch2_btree_journal_key(trans, iter, insert); -} - /** * btree_insert_key - insert a key one key into a leaf node */ @@ -223,12 +211,12 @@ static void btree_insert_key_leaf(struct btree_trans *trans, int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; + EBUG_ON(iter->level); + insert->k.needs_whiteout = false; - if (!btree_node_is_extents(b)) - bch2_insert_fixup_key(trans, iter, insert); - else - bch2_insert_fixup_extent(trans, iter, insert); + if (likely(bch2_btree_bset_insert_key(iter, b, &iter->l[0].iter, insert))) + bch2_btree_journal_key(trans, iter, insert); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) bset_u64s(t) - old_u64s; @@ -254,12 +242,8 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, struct bch_fs *c = trans->c; BUG_ON(iter->level); - BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), iter->pos)); - EBUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && - bkey_cmp(insert->k.p, iter->l[0].b->key.k.p) > 0); - + BUG_ON(bkey_cmp(insert->k.p, iter->pos)); BUG_ON(debug_check_bkeys(c) && - !bkey_deleted(&insert->k) && bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id)); } @@ -312,9 +296,16 @@ btree_key_can_insert(struct btree_trans *trans, if (unlikely(btree_node_fake(b))) return BTREE_INSERT_BTREE_NODE_FULL; + /* + * old bch2_extent_sort_fix_overlapping() algorithm won't work with new + * style extent updates: + */ + if (unlikely(btree_node_old_extent_overwrite(b))) + return BTREE_INSERT_BTREE_NODE_FULL; + ret = !btree_node_is_extents(b) ? 
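/*
 * Nodes still in the old extent-overwrite format can't accept
 * new-style updates, so btree_key_can_insert() reports them as full,
 * forcing a split (and hence a rewrite into the new format);
 * bch2_extent_can_insert() no longer computes a u64s estimate, only
 * the disk reservation needed when splitting a compressed extent.
 */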
BTREE_INSERT_OK - : bch2_extent_can_insert(trans, iter, insert, u64s); + : bch2_extent_can_insert(trans, iter, insert); if (ret) return ret; @@ -383,7 +374,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, prefetch(&trans->c->journal.flags); - trans_for_each_update(trans, i) { + trans_for_each_update2(trans, i) { /* Multiple inserts might go to same leaf: */ if (!same_leaf_as_prev(trans, i)) u64s = 0; @@ -422,10 +413,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { if (journal_seq_verify(c)) - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) i->k->k.version.lo = trans->journal_res.seq; else if (inject_invalid_keys(c)) - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) i->k->k.version = MAX_VERSION; } @@ -448,7 +439,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (unlikely(c->gc_pos.phase)) bch2_trans_mark_gc(trans); - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) do_btree_insert_one(trans, i->iter, i->k); err: if (marking) { @@ -469,7 +460,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, struct btree_iter *iter; int ret; - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) BUG_ON(!btree_node_intent_locked(i->iter, 0)); ret = bch2_journal_preres_get(&trans->c->journal, @@ -497,18 +488,18 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, } if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) btree_insert_entry_checks(trans, i->iter, i->k); bch2_btree_trans_verify_locks(trans); - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) if (!same_leaf_as_prev(trans, i)) bch2_btree_node_lock_for_insert(trans->c, i->iter->l[0].b, i->iter); ret = bch2_trans_commit_write_locked(trans, stopped_at); - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) if (!same_leaf_as_prev(trans, i)) bch2_btree_node_unlock_write_inlined(i->iter->l[0].b, i->iter); @@ -525,14 +516,14 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (trans->flags & BTREE_INSERT_NOUNLOCK) trans->nounlock = true; - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) if (!same_leaf_as_prev(trans, i)) bch2_foreground_maybe_merge(trans->c, i->iter, 0, trans->flags); trans->nounlock = false; - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) bch2_btree_iter_downgrade(i->iter); return 0; @@ -655,6 +646,135 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) return 0; } +static void bch2_trans_update2(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert) +{ + struct btree_insert_entry *i, n = (struct btree_insert_entry) { + .iter = iter, .k = insert + }; + + btree_insert_entry_checks(trans, n.iter, n.k); + + BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + + EBUG_ON(trans->nr_updates2 >= trans->nr_iters); + + iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + + trans_for_each_update2(trans, i) { + if (btree_iter_cmp(n.iter, i->iter) == 0) { + *i = n; + return; + } + + if (btree_iter_cmp(n.iter, i->iter) <= 0) + break; + } + + array_insert_item(trans->updates2, trans->nr_updates2, + i - trans->updates2, n); +} + +static int extent_update_to_keys(struct btree_trans *trans, + struct btree_iter *orig_iter, + struct bkey_i *insert) +{ + struct btree_iter *iter; + + if (bkey_deleted(&insert->k)) + return 0; + + iter = bch2_trans_copy_iter(trans, orig_iter); + if (IS_ERR(iter)) 
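/*
 * Whiteouts were dropped by the early return above because
 * extent_handle_overwrites() already emitted whatever overwrites were
 * needed; live extents are re-issued through a copied iterator
 * positioned at the extent's end pos, where non-extent keys are
 * indexed.
 */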
+ return PTR_ERR(iter); + + iter->flags |= BTREE_ITER_INTENT; + __bch2_btree_iter_set_pos(iter, insert->k.p, false); + bch2_trans_update2(trans, iter, insert); + bch2_trans_iter_put(trans, iter); + return 0; +} + +static int extent_handle_overwrites(struct btree_trans *trans, + enum btree_id btree_id, + struct bpos start, struct bpos end) +{ + struct btree_iter *iter = NULL, *update_iter; + struct bkey_i *update; + struct bkey_s_c k; + int ret = 0; + + iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT); + ret = PTR_ERR_OR_ZERO(iter); + if (ret) + return ret; + + k = bch2_btree_iter_peek_with_updates(iter); + + while (k.k && !(ret = bkey_err(k))) { + if (bkey_cmp(end, bkey_start_pos(k.k)) <= 0) + break; + + if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { + update_iter = bch2_trans_copy_iter(trans, iter); + if ((ret = PTR_ERR_OR_ZERO(update_iter))) + goto err; + + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + bkey_reassemble(update, k); + bch2_cut_back(start, update); + + __bch2_btree_iter_set_pos(update_iter, update->k.p, false); + bch2_trans_update2(trans, update_iter, update); + bch2_trans_iter_put(trans, update_iter); + } + + if (bkey_cmp(k.k->p, end) > 0) { + update_iter = bch2_trans_copy_iter(trans, iter); + if ((ret = PTR_ERR_OR_ZERO(update_iter))) + goto err; + + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + bkey_reassemble(update, k); + bch2_cut_front(end, update); + + __bch2_btree_iter_set_pos(update_iter, update->k.p, false); + bch2_trans_update2(trans, update_iter, update); + bch2_trans_iter_put(trans, update_iter); + } else { + update_iter = bch2_trans_copy_iter(trans, iter); + if ((ret = PTR_ERR_OR_ZERO(update_iter))) + goto err; + + update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + update->k = *k.k; + set_bkey_val_u64s(&update->k, 0); + update->k.type = KEY_TYPE_deleted; + update->k.size = 0; + + __bch2_btree_iter_set_pos(update_iter, update->k.p, false); + bch2_trans_update2(trans, update_iter, update); + bch2_trans_iter_put(trans, update_iter); + } + + k = bch2_btree_iter_next_with_updates(iter); + } +err: + if (!IS_ERR_OR_NULL(iter)) + bch2_trans_iter_put(trans, iter); + return ret; +} + int __bch2_trans_commit(struct btree_trans *trans) { struct btree_insert_entry *i = NULL; @@ -724,7 +844,36 @@ int __bch2_trans_commit(struct btree_trans *trans) } } while (trans_trigger_run); + /* Turn extents updates into keys: */ + trans_for_each_update(trans, i) + if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { + struct bpos start = bkey_start_pos(&i->k->k); + + while (i + 1 < trans->updates + trans->nr_updates && + i[0].iter->btree_id == i[1].iter->btree_id && + !bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k))) + i++; + + ret = extent_handle_overwrites(trans, i->iter->btree_id, + start, i->k->k.p); + if (ret) + goto out; + } + trans_for_each_update(trans, i) { + if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { + ret = extent_update_to_keys(trans, i->iter, i->k); + if (ret) + goto out; + } else { + bch2_trans_update2(trans, i->iter, i->k); + } + } + + trans_for_each_update2(trans, i) { + BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK); + BUG_ON(i->iter->locks_want < 1); + u64s = jset_u64s(i->k->k.u64s); if (0) trans->journal_preres_u64s += u64s; @@ -773,7 +922,10 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, .trigger_flags = flags, .iter = iter, .k = k }; - 
EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&k->k))); + EBUG_ON(bkey_cmp(iter->pos, + (iter->flags & BTREE_ITER_IS_EXTENTS) + ? bkey_start_pos(&k->k) + : k->k.p)); iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 61831a49..4497c984 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -1254,21 +1254,21 @@ inline int bch2_mark_overwrite(struct btree_trans *trans, struct bkey_s_c old, struct bkey_i *new, struct bch_fs_usage *fs_usage, - unsigned flags) + unsigned flags, + bool is_extents) { struct bch_fs *c = trans->c; - struct btree *b = iter->l[0].b; unsigned offset = 0; - s64 sectors = 0; + s64 sectors = -((s64) old.k->size); flags |= BTREE_TRIGGER_OVERWRITE; - if (btree_node_is_extents(b) + if (is_extents ? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0 : bkey_cmp(new->k.p, old.k->p)) return 0; - if (btree_node_is_extents(b)) { + if (is_extents) { switch (bch2_extent_overlap(&new->k, old.k)) { case BCH_EXTENT_OVERLAP_ALL: offset = 0; @@ -1341,7 +1341,8 @@ int bch2_mark_update(struct btree_trans *trans, struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); ret = bch2_mark_overwrite(trans, iter, k, insert, - fs_usage, flags); + fs_usage, flags, + btree_node_type_is_extents(iter->btree_id)); if (ret <= 0) break; @@ -1671,8 +1672,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, k.k->p.offset > idx + sectors)) goto out; - bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); - BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + sectors = k.k->p.offset - idx; r_v = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ret = PTR_ERR_OR_ZERO(r_v); @@ -1689,9 +1689,12 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, set_bkey_val_u64s(&r_v->k, 0); } + bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); + BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + bch2_trans_update(trans, iter, &r_v->k_i, 0); out: - ret = k.k->p.offset - idx; + ret = sectors; err: bch2_trans_iter_put(trans, iter); return ret; diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index c1cc63af..765650ce 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -268,7 +268,7 @@ int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, struct bkey_s_c, struct bkey_i *, - struct bch_fs_usage *, unsigned); + struct bch_fs_usage *, unsigned, bool); int bch2_mark_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, struct bch_fs_usage *, unsigned); diff --git a/libbcachefs/extent_update.c b/libbcachefs/extent_update.c index 846d77dc..fa6c0698 100644 --- a/libbcachefs/extent_update.c +++ b/libbcachefs/extent_update.c @@ -39,6 +39,12 @@ static int count_iters_for_insert(struct btree_trans *trans, { int ret = 0; + /* + * The extent update path requires an _additional_ iterator for each + * extent we're inserting and overwriting: + */ + *nr_iters += 1; + switch (k.k->type) { case KEY_TYPE_extent: case KEY_TYPE_reflink_v: @@ -167,402 +173,40 @@ int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) enum btree_insert_ret bch2_extent_can_insert(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *insert, - unsigned *u64s) + struct bkey_i *insert) { struct btree_iter_level *l = &iter->l[0]; struct btree_node_iter node_iter = l->iter; struct bkey_packed *_k; + struct bkey_s_c k; struct bkey unpacked; int sectors; - while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, - KEY_TYPE_discard))) { - struct 
bkey_s_c k = bkey_disassemble(l->b, _k, &unpacked); - enum bch_extent_overlap overlap = - bch2_extent_overlap(&insert->k, k.k); + _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, + KEY_TYPE_discard); + if (!_k) + return BTREE_INSERT_OK; - if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) + k = bkey_disassemble(l->b, _k, &unpacked); + + /* Check if we're splitting a compressed extent: */ + + if (bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k)) > 0 && + bkey_cmp(insert->k.p, k.k->p) < 0 && + (sectors = bch2_bkey_sectors_compressed(k))) { + int flags = trans->flags & BTREE_INSERT_NOFAIL + ? BCH_DISK_RESERVATION_NOFAIL : 0; + + switch (bch2_disk_reservation_add(trans->c, trans->disk_res, + sectors, flags)) { + case 0: break; - - overlap = bch2_extent_overlap(&insert->k, k.k); - - /* - * If we're overwriting an existing extent, we may need to emit - * a whiteout - unless we're inserting a new extent at the same - * position: - */ - if (k.k->needs_whiteout && - (!bkey_whiteout(&insert->k) || - bkey_cmp(k.k->p, insert->k.p))) - *u64s += BKEY_U64s; - - /* - * If we're partially overwriting an existing extent which has - * been written out to disk, we'll need to emit a new version of - * that extent: - */ - if (bkey_written(l->b, _k) && - overlap != BCH_EXTENT_OVERLAP_ALL) - *u64s += _k->u64s; - - /* And we may be splitting an existing extent: */ - if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) - *u64s += _k->u64s; - - if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && - (sectors = bch2_bkey_sectors_compressed(k))) { - int flags = trans->flags & BTREE_INSERT_NOFAIL - ? BCH_DISK_RESERVATION_NOFAIL : 0; - - switch (bch2_disk_reservation_add(trans->c, - trans->disk_res, - sectors, flags)) { - case 0: - break; - case -ENOSPC: - return BTREE_INSERT_ENOSPC; - default: - BUG(); - } + case -ENOSPC: + return BTREE_INSERT_ENOSPC; + default: + BUG(); } - - if (overlap == BCH_EXTENT_OVERLAP_FRONT || - overlap == BCH_EXTENT_OVERLAP_MIDDLE) - break; - - bch2_btree_node_iter_advance(&node_iter, l->b); } return BTREE_INSERT_OK; } - -static void verify_extent_nonoverlapping(struct bch_fs *c, - struct btree *b, - struct btree_node_iter *_iter, - struct bkey_i *insert) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct btree_node_iter iter; - struct bkey_packed *k; - struct bkey uk; - - if (!expensive_debug_checks(c)) - return; - - iter = *_iter; - k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); - BUG_ON(k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); - - iter = *_iter; - k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); -#if 0 - BUG_ON(k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); -#else - if (k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { - char buf1[100]; - char buf2[100]; - - bch2_bkey_to_text(&PBUF(buf1), &insert->k); - bch2_bkey_to_text(&PBUF(buf2), &uk); - - bch2_dump_btree_node(b); - panic("insert > next :\n" - "insert %s\n" - "next %s\n", - buf1, buf2); - } -#endif - -#endif -} - -static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, - struct bkey_i *insert) -{ - struct btree_iter_level *l = &iter->l[0]; - struct bkey_packed *k = - bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); - - BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); - - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - verify_extent_nonoverlapping(c, l->b, &l->iter, insert); - - if (debug_check_bkeys(c)) - 
bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert)); - - bch2_bset_insert(l->b, &l->iter, k, insert, 0); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); -} - -static void pack_push_whiteout(struct bch_fs *c, struct btree *b, - struct bpos pos) -{ - struct bkey_packed k; - - if (!bkey_pack_pos(&k, pos, b)) { - struct bkey_i tmp; - - bkey_init(&tmp.k); - tmp.k.p = pos; - bkey_copy(&k, &tmp); - } - - k.needs_whiteout = true; - push_whiteout(c, b, &k); -} - -static void -extent_drop(struct bch_fs *c, struct btree_iter *iter, - struct bkey_packed *_k, struct bkey_s k) -{ - struct btree_iter_level *l = &iter->l[0]; - - if (!bkey_whiteout(k.k)) - btree_account_key_drop(l->b, _k); - - k.k->size = 0; - k.k->type = KEY_TYPE_deleted; - - if (!btree_node_old_extent_overwrite(l->b) && - k.k->needs_whiteout) { - pack_push_whiteout(c, l->b, k.k->p); - k.k->needs_whiteout = false; - } - - if (_k >= btree_bset_last(l->b)->start) { - unsigned u64s = _k->u64s; - - bch2_bset_delete(l->b, _k, _k->u64s); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, u64s, 0); - } else { - extent_save(l->b, _k, k.k); - bch2_btree_iter_fix_key_modified(iter, l->b, _k); - } -} - -static void -extent_squash(struct bch_fs *c, struct btree_iter *iter, - struct bkey_i *insert, - struct bkey_packed *_k, struct bkey_s k, - enum bch_extent_overlap overlap) -{ - struct btree_iter_level *l = &iter->l[0]; - struct bkey_on_stack tmp, split; - - bkey_on_stack_init(&tmp); - bkey_on_stack_init(&split); - - if (!btree_node_old_extent_overwrite(l->b)) { - if (!bkey_whiteout(&insert->k) && - !bkey_cmp(k.k->p, insert->k.p)) { - insert->k.needs_whiteout = k.k->needs_whiteout; - k.k->needs_whiteout = false; - } - } else { - insert->k.needs_whiteout |= k.k->needs_whiteout; - } - - switch (overlap) { - case BCH_EXTENT_OVERLAP_FRONT: - if (bkey_written(l->b, _k)) { - bkey_on_stack_reassemble(&tmp, c, k.s_c); - bch2_cut_front(insert->k.p, tmp.k); - - /* - * needs_whiteout was propagated to new version of @k, - * @tmp: - */ - if (!btree_node_old_extent_overwrite(l->b)) - k.k->needs_whiteout = false; - - extent_drop(c, iter, _k, k); - extent_bset_insert(c, iter, tmp.k); - } else { - btree_keys_account_val_delta(l->b, _k, - bch2_cut_front_s(insert->k.p, k)); - - extent_save(l->b, _k, k.k); - /* - * No need to call bset_fix_invalidated_key, start of - * extent changed but extents are indexed by where they - * end - */ - bch2_btree_iter_fix_key_modified(iter, l->b, _k); - } - break; - case BCH_EXTENT_OVERLAP_BACK: - if (bkey_written(l->b, _k)) { - bkey_on_stack_reassemble(&tmp, c, k.s_c); - bch2_cut_back(bkey_start_pos(&insert->k), tmp.k); - - /* - * @tmp has different position than @k, needs_whiteout - * should not be propagated: - */ - if (!btree_node_old_extent_overwrite(l->b)) - tmp.k->k.needs_whiteout = false; - - extent_drop(c, iter, _k, k); - extent_bset_insert(c, iter, tmp.k); - } else { - /* - * position of @k is changing, emit a whiteout if - * needs_whiteout is set: - */ - if (!btree_node_old_extent_overwrite(l->b) && - k.k->needs_whiteout) { - pack_push_whiteout(c, l->b, k.k->p); - k.k->needs_whiteout = false; - } - - btree_keys_account_val_delta(l->b, _k, - bch2_cut_back_s(bkey_start_pos(&insert->k), k)); - extent_save(l->b, _k, k.k); - - bch2_bset_fix_invalidated_key(l->b, _k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); - } - break; - case BCH_EXTENT_OVERLAP_ALL: - extent_drop(c, iter, _k, k); - break; - case BCH_EXTENT_OVERLAP_MIDDLE: - bkey_on_stack_reassemble(&split, c, k.s_c); - 
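/*
 * All of this leaf-level squashing is going away: MIDDLE overlaps
 * emitted a separate front fragment (split, via cut_back) and trimmed
 * the original down to the back fragment (via cut_front). With
 * extents_above_btree_updates, overlaps are resolved in
 * extent_handle_overwrites() before keys ever reach the leaf.
 */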
-
-static void
-extent_squash(struct bch_fs *c, struct btree_iter *iter,
-	      struct bkey_i *insert,
-	      struct bkey_packed *_k, struct bkey_s k,
-	      enum bch_extent_overlap overlap)
-{
-	struct btree_iter_level *l = &iter->l[0];
-	struct bkey_on_stack tmp, split;
-
-	bkey_on_stack_init(&tmp);
-	bkey_on_stack_init(&split);
-
-	if (!btree_node_old_extent_overwrite(l->b)) {
-		if (!bkey_whiteout(&insert->k) &&
-		    !bkey_cmp(k.k->p, insert->k.p)) {
-			insert->k.needs_whiteout = k.k->needs_whiteout;
-			k.k->needs_whiteout = false;
-		}
-	} else {
-		insert->k.needs_whiteout |= k.k->needs_whiteout;
-	}
-
-	switch (overlap) {
-	case BCH_EXTENT_OVERLAP_FRONT:
-		if (bkey_written(l->b, _k)) {
-			bkey_on_stack_reassemble(&tmp, c, k.s_c);
-			bch2_cut_front(insert->k.p, tmp.k);
-
-			/*
-			 * needs_whiteout was propagated to new version of @k,
-			 * @tmp:
-			 */
-			if (!btree_node_old_extent_overwrite(l->b))
-				k.k->needs_whiteout = false;
-
-			extent_drop(c, iter, _k, k);
-			extent_bset_insert(c, iter, tmp.k);
-		} else {
-			btree_keys_account_val_delta(l->b, _k,
-				bch2_cut_front_s(insert->k.p, k));
-
-			extent_save(l->b, _k, k.k);
-			/*
-			 * No need to call bset_fix_invalidated_key, start of
-			 * extent changed but extents are indexed by where they
-			 * end
-			 */
-			bch2_btree_iter_fix_key_modified(iter, l->b, _k);
-		}
-		break;
-	case BCH_EXTENT_OVERLAP_BACK:
-		if (bkey_written(l->b, _k)) {
-			bkey_on_stack_reassemble(&tmp, c, k.s_c);
-			bch2_cut_back(bkey_start_pos(&insert->k), tmp.k);
-
-			/*
-			 * @tmp has different position than @k, needs_whiteout
-			 * should not be propagated:
-			 */
-			if (!btree_node_old_extent_overwrite(l->b))
-				tmp.k->k.needs_whiteout = false;
-
-			extent_drop(c, iter, _k, k);
-			extent_bset_insert(c, iter, tmp.k);
-		} else {
-			/*
-			 * position of @k is changing, emit a whiteout if
-			 * needs_whiteout is set:
-			 */
-			if (!btree_node_old_extent_overwrite(l->b) &&
-			    k.k->needs_whiteout) {
-				pack_push_whiteout(c, l->b, k.k->p);
-				k.k->needs_whiteout = false;
-			}
-
-			btree_keys_account_val_delta(l->b, _k,
-				bch2_cut_back_s(bkey_start_pos(&insert->k), k));
-			extent_save(l->b, _k, k.k);
-
-			bch2_bset_fix_invalidated_key(l->b, _k);
-			bch2_btree_node_iter_fix(iter, l->b, &l->iter,
-						 _k, _k->u64s, _k->u64s);
-		}
-		break;
-	case BCH_EXTENT_OVERLAP_ALL:
-		extent_drop(c, iter, _k, k);
-		break;
-	case BCH_EXTENT_OVERLAP_MIDDLE:
-		bkey_on_stack_reassemble(&split, c, k.s_c);
-		bch2_cut_back(bkey_start_pos(&insert->k), split.k);
-
-		if (!btree_node_old_extent_overwrite(l->b))
-			split.k->k.needs_whiteout = false;
-
-		/* this is identical to BCH_EXTENT_OVERLAP_FRONT: */
-		if (bkey_written(l->b, _k)) {
-			bkey_on_stack_reassemble(&tmp, c, k.s_c);
-			bch2_cut_front(insert->k.p, tmp.k);
-
-			if (!btree_node_old_extent_overwrite(l->b))
-				k.k->needs_whiteout = false;
-
-			extent_drop(c, iter, _k, k);
-			extent_bset_insert(c, iter, tmp.k);
-		} else {
-			btree_keys_account_val_delta(l->b, _k,
-				bch2_cut_front_s(insert->k.p, k));
-
-			extent_save(l->b, _k, k.k);
-			bch2_btree_iter_fix_key_modified(iter, l->b, _k);
-		}
-
-		extent_bset_insert(c, iter, split.k);
-		break;
-	}
-
-	bkey_on_stack_exit(&split, c);
-	bkey_on_stack_exit(&tmp, c);
-}
-
-/**
- * bch_extent_insert_fixup - insert a new extent and deal with overlaps
- *
- * this may result in not actually doing the insert, or inserting some subset
- * of the insert key. For cmpxchg operations this is where that logic lives.
- *
- * All subsets of @insert that need to be inserted are inserted using
- * bch2_btree_insert_and_journal(). If @b or @res fills up, this function
- * returns false, setting @iter->pos for the prefix of @insert that actually got
- * inserted.
- *
- * BSET INVARIANTS: this function is responsible for maintaining all the
- * invariants for bsets of extents in memory. things get really hairy with 0
- * size extents
- *
- * within one bset:
- *
- * bkey_start_pos(bkey_next(k)) >= k
- * or bkey_start_offset(bkey_next(k)) >= k->offset
- *
- * i.e. strict ordering, no overlapping extents.
- *
- * multiple bsets (i.e. full btree node):
- *
- * ∀ k, j
- *   k.size != 0 ∧ j.size != 0 →
- *   ¬ (k > bkey_start_pos(j) ∧ k < j)
- *
- * i.e. no two overlapping keys _of nonzero size_
- *
- * We can't realistically maintain this invariant for zero size keys because of
- * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j
- * there may be another 0 size key between them in another bset, and it will
- * thus overlap with the merged key.
- *
- * In addition, the end of iter->pos indicates how much has been processed.
- * If the end of iter->pos is not the same as the end of insert, then
- * key insertion needs to continue/be retried.
- */
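The deleted comment above is the clearest statement of the in-memory invariant
this patch retires from the core btree code: within a node, extents of nonzero
size must not overlap, while zero-size keys are exempt. Restated as a checkable
predicate over a sorted array (an illustrative sketch only, with invented
types, not anything from the patch):

	#include <stdbool.h>
	#include <assert.h>

	struct ext { unsigned start, end; };	/* start == end: zero-size key */

	/* Extents sorted by end position; zero-size keys are exempt. */
	static bool extents_nonoverlapping(const struct ext *e, unsigned nr)
	{
		unsigned last_end = 0;

		for (unsigned i = 0; i < nr; i++) {
			if (e[i].start == e[i].end)	/* zero size: skip */
				continue;
			if (e[i].start < last_end)	/* overlaps previous */
				return false;
			last_end = e[i].end;
		}
		return true;
	}

	int main(void)
	{
		struct ext ok[]	 = { { 0, 10 }, { 10, 10 }, { 10, 20 } };
		struct ext bad[] = { { 0, 15 }, { 10, 20 } };

		assert(extents_nonoverlapping(ok, 3));
		assert(!extents_nonoverlapping(bad, 2));
		return 0;
	}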
-void bch2_insert_fixup_extent(struct btree_trans *trans,
-			      struct btree_iter *iter,
-			      struct bkey_i *insert)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter_level *l = &iter->l[0];
-	struct btree_node_iter node_iter = l->iter;
-	bool do_update = !bkey_whiteout(&insert->k);
-	struct bkey_packed *_k;
-	struct bkey unpacked;
-
-	EBUG_ON(iter->level);
-	EBUG_ON(!insert->k.size);
-	EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
-
-	while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b,
-						      KEY_TYPE_discard))) {
-		struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked);
-		enum bch_extent_overlap overlap =
-			bch2_extent_overlap(&insert->k, k.k);
-
-		if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
-			break;
-
-		if (!bkey_whiteout(k.k))
-			do_update = true;
-
-		if (!do_update) {
-			struct bpos cur_end = bpos_min(insert->k.p, k.k->p);
-
-			bch2_cut_front(cur_end, insert);
-			bch2_btree_iter_set_pos_same_leaf(iter, cur_end);
-		} else {
-			extent_squash(c, iter, insert, _k, k, overlap);
-		}
-
-		node_iter = l->iter;
-
-		if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
-		    overlap == BCH_EXTENT_OVERLAP_MIDDLE)
-			break;
-	}
-
-	l->iter = node_iter;
-	bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p);
-
-	if (do_update) {
-		if (insert->k.type == KEY_TYPE_deleted)
-			insert->k.type = KEY_TYPE_discard;
-
-		if (!bkey_whiteout(&insert->k) ||
-		    btree_node_old_extent_overwrite(l->b))
-			extent_bset_insert(c, iter, insert);
-
-		bch2_btree_journal_key(trans, iter, insert);
-	}
-
-	bch2_cut_front(insert->k.p, insert);
-}
diff --git a/libbcachefs/extent_update.h b/libbcachefs/extent_update.h
index e9dc8091..38dc0846 100644
--- a/libbcachefs/extent_update.h
+++ b/libbcachefs/extent_update.h
@@ -11,9 +11,6 @@ int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *);
 
 enum btree_insert_ret
 bch2_extent_can_insert(struct btree_trans *, struct btree_iter *,
-		       struct bkey_i *, unsigned *);
-void bch2_insert_fixup_extent(struct btree_trans *,
-			      struct btree_iter *,
-			      struct bkey_i *);
+		       struct bkey_i *);
 
 #endif /* _BCACHEFS_EXTENT_UPDATE_H */
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index eca72312..902c8da9 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -422,6 +422,42 @@ static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size)
 				POS(inode_nr + 1, 0), NULL);
 }
 
+static int bch2_fix_overlapping_extent(struct btree_trans *trans,
+				       struct btree_iter *iter,
+				       struct bkey_s_c k, struct bpos cut_at)
+{
+	struct btree_iter *u_iter;
+	struct bkey_i *u;
+	int ret;
+
+	u = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+	ret = PTR_ERR_OR_ZERO(u);
+	if (ret)
+		return ret;
+
+	bkey_reassemble(u, k);
+	bch2_cut_front(cut_at, u);
+
+	u_iter = bch2_trans_copy_iter(trans, iter);
+	ret = PTR_ERR_OR_ZERO(u_iter);
+	if (ret)
+		return ret;
+
+	/*
+	 * We don't want to go through the
+	 * extent_handle_overwrites path:
+	 */
+	__bch2_btree_iter_set_pos(u_iter, u->k.p, false);
+
+	/*
+	 * XXX: this is going to leave disk space
+	 * accounting slightly wrong
+	 */
+	ret = bch2_trans_update(trans, u_iter, u, 0);
+	bch2_trans_iter_put(trans, u_iter);
+	return ret;
+}
+
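In miniature, what bch2_fix_overlapping_extent() above does: when fsck finds
two extents that overlap, it trims the front of the second one so it begins
where the first one ends (bch2_cut_front()), then writes the trimmed copy back
through a separate iterator. A simplified model of the trim, with hypothetical
types standing in for bkeys:

	#include <assert.h>

	struct ext { unsigned start, end; };

	/* Model of bch2_cut_front(): drop everything before pos. */
	static void cut_front(unsigned pos, struct ext *e)
	{
		if (e->start < pos)
			e->start = pos < e->end ? pos : e->end;
	}

	int main(void)
	{
		struct ext prev = { 0, 100 };
		struct ext k	= { 50, 150 };	/* overlaps prev by 50 sectors */

		cut_front(prev.end, &k);	/* fsck trims the newer extent */
		assert(k.start == 100 && k.end == 150);
		return 0;
	}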
 /*
  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
  * that i_size and i_sectors are consistent
@@ -433,6 +469,7 @@ static int check_extents(struct bch_fs *c)
 	struct btree_trans trans;
 	struct btree_iter *iter;
 	struct bkey_s_c k;
+	struct bkey prev = KEY(0, 0, 0);
 	u64 i_sectors;
 	int ret = 0;
 
@@ -444,6 +481,25 @@ static int check_extents(struct bch_fs *c)
 			   POS(BCACHEFS_ROOT_INO, 0), 0);
 retry:
 	for_each_btree_key_continue(iter, 0, k, ret) {
+		if (bkey_cmp(prev.p, bkey_start_pos(k.k)) > 0) {
+			char buf1[100];
+			char buf2[100];
+
+			bch2_bkey_to_text(&PBUF(buf1), &prev);
+			bch2_bkey_to_text(&PBUF(buf2), k.k);
+
+			if (fsck_err(c, "overlapping extents: %s, %s", buf1, buf2)) {
+				ret = __bch2_trans_do(&trans, NULL, NULL,
+						      BTREE_INSERT_NOFAIL|
+						      BTREE_INSERT_LAZY_RW,
+						bch2_fix_overlapping_extent(&trans,
+								iter, k, prev.p));
+				if (ret)
+					goto err;
+			}
+		}
+		prev = *k.k;
+
 		ret = walk_inode(&trans, &w, k.k->p.inode);
 		if (ret)
 			break;
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 712a6b1f..27378cc9 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -161,13 +161,16 @@ static void journal_entries_free(struct list_head *list)
 	}
 }
 
+/*
+ * When keys compare equal, oldest compares first:
+ */
 static int journal_sort_key_cmp(const void *_l, const void *_r)
 {
 	const struct journal_key *l = _l;
 	const struct journal_key *r = _r;
 
 	return  cmp_int(l->btree_id, r->btree_id) ?:
-		bkey_cmp(l->pos, r->pos) ?:
+		bkey_cmp(l->k->k.p, r->k->k.p) ?:
 		cmp_int(l->journal_seq, r->journal_seq) ?:
 		cmp_int(l->journal_offset, r->journal_offset);
 }
 
@@ -179,25 +182,11 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
 
 	return  cmp_int(l->journal_seq, r->journal_seq) ?:
 		cmp_int(l->btree_id, r->btree_id) ?:
-		bkey_cmp(l->pos, r->pos);
-}
-
-static void journal_keys_sift(struct journal_keys *keys, struct journal_key *i)
-{
-	while (i + 1 < keys->d + keys->nr &&
-	       journal_sort_key_cmp(i, i + 1) > 0) {
-		swap(i[0], i[1]);
-		i++;
-	}
+		bkey_cmp(l->k->k.p, r->k->k.p);
 }
 
 static void journal_keys_free(struct journal_keys *keys)
 {
-	struct journal_key *i;
-
-	for_each_journal_key(*keys, i)
-		if (i->allocated)
-			kfree(i->k);
 	kvfree(keys->d);
 	keys->d = NULL;
 	keys->nr = 0;
@@ -208,15 +197,15 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
 	struct journal_replay *p;
 	struct jset_entry *entry;
 	struct bkey_i *k, *_n;
-	struct journal_keys keys = { NULL }, keys_deduped = { NULL };
-	struct journal_key *i;
+	struct journal_keys keys = { NULL };
+	struct journal_key *src, *dst;
 	size_t nr_keys = 0;
 
 	list_for_each_entry(p, journal_entries, list)
 		for_each_jset_key(k, _n, entry, &p->j)
 			nr_keys++;
 
-	keys.journal_seq_base = keys_deduped.journal_seq_base =
+	keys.journal_seq_base =
 		le64_to_cpu(list_first_entry(journal_entries,
 				struct journal_replay, list)->j.seq);
 
@@ -225,91 +214,31 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
 	if (!keys.d)
 		goto err;
 
-	keys_deduped.d = kvmalloc(sizeof(keys.d[0]) * nr_keys * 2, GFP_KERNEL);
-	if (!keys_deduped.d)
-		goto err;
-
 	list_for_each_entry(p, journal_entries, list)
 		for_each_jset_key(k, _n, entry, &p->j)
 			keys.d[keys.nr++] = (struct journal_key) {
 				.btree_id	= entry->btree_id,
-				.pos		= bkey_start_pos(&k->k),
 				.k		= k,
 				.journal_seq	= le64_to_cpu(p->j.seq) -
 					keys.journal_seq_base,
 				.journal_offset	= k->_data - p->j._data,
 			};
 
-	sort(keys.d, nr_keys, sizeof(keys.d[0]), journal_sort_key_cmp, NULL);
+	sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL);
 
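The comparator's tie-break (oldest first) is what makes the deduplication that
follows correct: after sorting, equal positions form runs whose last element is
the newest version, so skipping to the end of each run before copying keeps
exactly one, newest, key. A compact model of that sort-then-dedup pattern
(plain ints standing in for journal keys; illustrative only, not patch code):

	#include <assert.h>
	#include <stdlib.h>

	struct jkey { int pos, seq; };	/* older entries have smaller seq */

	/* When positions compare equal, oldest (smallest seq) compares first. */
	static int jkey_cmp(const void *_l, const void *_r)
	{
		const struct jkey *l = _l, *r = _r;

		return l->pos != r->pos ? (l->pos > r->pos) - (l->pos < r->pos)
					: (l->seq > r->seq) - (l->seq < r->seq);
	}

	/* Keep only the newest version of each position; returns new count. */
	static size_t dedup_newest(struct jkey *d, size_t nr)
	{
		struct jkey *src = d, *dst = d;

		while (src < d + nr) {
			while (src + 1 < d + nr && src[0].pos == src[1].pos)
				src++;		/* skip to newest duplicate */
			*dst++ = *src++;
		}
		return dst - d;
	}

	int main(void)
	{
		struct jkey d[] = { { 1, 9 }, { 2, 3 }, { 1, 4 }, { 2, 7 } };
		size_t nr = sizeof(d) / sizeof(d[0]);

		qsort(d, nr, sizeof(d[0]), jkey_cmp);
		nr = dedup_newest(d, nr);

		assert(nr == 2);
		assert(d[0].pos == 1 && d[0].seq == 9);	/* newest survives */
		assert(d[1].pos == 2 && d[1].seq == 7);
		return 0;
	}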
-	i = keys.d;
-	while (i < keys.d + keys.nr) {
-		if (i + 1 < keys.d + keys.nr &&
-		    i[0].btree_id == i[1].btree_id &&
-		    !bkey_cmp(i[0].pos, i[1].pos)) {
-			if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) {
-				i++;
-			} else {
-				bch2_cut_front(i[1].k->k.p, i[0].k);
-				i[0].pos = i[1].k->k.p;
-				journal_keys_sift(&keys, i);
-			}
-			continue;
-		}
+	src = dst = keys.d;
+	while (src < keys.d + keys.nr) {
+		while (src + 1 < keys.d + keys.nr &&
+		       src[0].btree_id == src[1].btree_id &&
+		       !bkey_cmp(src[0].k->k.p, src[1].k->k.p))
+			src++;
 
-		if (i + 1 < keys.d + keys.nr &&
-		    i[0].btree_id == i[1].btree_id &&
-		    bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k)) > 0) {
-			if ((cmp_int(i[0].journal_seq, i[1].journal_seq) ?:
-			     cmp_int(i[0].journal_offset, i[1].journal_offset)) < 0) {
-				if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) {
-					bch2_cut_back(bkey_start_pos(&i[1].k->k), i[0].k);
-				} else {
-					struct bkey_i *split =
-						kmalloc(bkey_bytes(i[0].k), GFP_KERNEL);
-
-					if (!split)
-						goto err;
-
-					bkey_copy(split, i[0].k);
-					bch2_cut_back(bkey_start_pos(&i[1].k->k), split);
-					keys_deduped.d[keys_deduped.nr++] = (struct journal_key) {
-						.btree_id	= i[0].btree_id,
-						.allocated	= true,
-						.pos		= bkey_start_pos(&split->k),
-						.k		= split,
-						.journal_seq	= i[0].journal_seq,
-						.journal_offset	= i[0].journal_offset,
-					};
-
-					bch2_cut_front(i[1].k->k.p, i[0].k);
-					i[0].pos = i[1].k->k.p;
-					journal_keys_sift(&keys, i);
-					continue;
-				}
-			} else {
-				if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) >= 0) {
-					i[1] = i[0];
-					i++;
-					continue;
-				} else {
-					bch2_cut_front(i[0].k->k.p, i[1].k);
-					i[1].pos = i[0].k->k.p;
-					journal_keys_sift(&keys, i + 1);
-					continue;
-				}
-			}
-		}
-
-		keys_deduped.d[keys_deduped.nr++] = *i++;
+		*dst++ = *src++;
 	}
 
-	kvfree(keys.d);
-	return keys_deduped;
+	keys.nr = dst - keys.d;
 err:
-	journal_keys_free(&keys_deduped);
-	kvfree(keys.d);
-	return (struct journal_keys) { NULL };
+	return keys;
 }
 
 /* journal replay: */
 
@@ -360,11 +289,6 @@ retry:
 
 		atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p);
 
-		split_iter = bch2_trans_copy_iter(&trans, iter);
-		ret = PTR_ERR_OR_ZERO(split_iter);
-		if (ret)
-			goto err;
-
 		split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k));
 		ret = PTR_ERR_OR_ZERO(split);
 		if (ret)
@@ -383,12 +307,25 @@ retry:
 		}
 
 		bkey_copy(split, k);
-		bch2_cut_front(split_iter->pos, split);
+		bch2_cut_front(iter->pos, split);
 		bch2_cut_back(atomic_end, split);
 
+		split_iter = bch2_trans_copy_iter(&trans, iter);
+		ret = PTR_ERR_OR_ZERO(split_iter);
+		if (ret)
+			goto err;
+
+		/*
+		 * It's important that we don't go through the
+		 * extent_handle_overwrites() and extent_update_to_keys() path
+		 * here: journal replay is supposed to treat extents like
+		 * regular keys
+		 */
+		__bch2_btree_iter_set_pos(split_iter, split->k.p, false);
 		bch2_trans_update(&trans, split_iter, split, !remark
 				  ? BTREE_TRIGGER_NORUN
 				  : BTREE_TRIGGER_NOOVERWRITES);
+		bch2_btree_iter_set_pos(iter, split->k.p);
 	} while (bkey_cmp(iter->pos, k->k.p) < 0);
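The do/while loop above replays one journal extent as a sequence of smaller
updates, each clipped to the leaf node the iterator currently points at:
atomic_end = bpos_min(k->k.p, node end), then bch2_cut_front()/bch2_cut_back()
trim the split copy to [iter->pos, atomic_end), and the loop advances until the
whole key is covered. The clipping in miniature (hypothetical helpers; node
boundaries supplied as an array, not how bcachefs actually finds them):

	#include <assert.h>

	struct ext { unsigned start, end; };

	static unsigned min_u(unsigned a, unsigned b) { return a < b ? a : b; }

	/*
	 * Split [k.start, k.end) at the given node boundaries, the way the
	 * replay loop does with atomic_end. Returns the number of pieces.
	 */
	static unsigned split_at_boundaries(struct ext k, const unsigned *node_end,
					    unsigned nr_nodes, struct ext *out)
	{
		unsigned n = 0, pos = k.start;

		for (unsigned i = 0; i < nr_nodes && pos < k.end; i++) {
			if (node_end[i] <= pos)
				continue;
			unsigned atomic_end = min_u(k.end, node_end[i]);
			out[n++] = (struct ext) { pos, atomic_end };
			pos = atomic_end;	/* advance, as set_pos() does */
		}
		return n;
	}

	int main(void)
	{
		unsigned node_end[] = { 64, 128, 192 };
		struct ext out[3];

		unsigned n = split_at_boundaries((struct ext) { 50, 150 },
						 node_end, 3, out);
		assert(n == 3);
		assert(out[0].start == 50  && out[0].end == 64);
		assert(out[1].start == 64  && out[1].end == 128);
		assert(out[2].start == 128 && out[2].end == 150);
		return 0;
	}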
@@ -419,11 +356,18 @@ static int __bch2_journal_replay_key(struct btree_trans *trans,
 	struct btree_iter *iter;
 	int ret;
 
-	iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k),
-				   BTREE_ITER_INTENT);
+	iter = bch2_trans_get_iter(trans, id, k->k.p, BTREE_ITER_INTENT);
 	if (IS_ERR(iter))
 		return PTR_ERR(iter);
 
+	/*
+	 * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run
+	 * extent_handle_overwrites() and extent_update_to_keys() - but we don't
+	 * want that here, journal replay is supposed to treat extents like
+	 * regular keys:
+	 */
+	__bch2_btree_iter_set_pos(iter, k->k.p, false);
+
 	ret   = bch2_btree_iter_traverse(iter) ?:
 		bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
 	bch2_trans_iter_put(trans, iter);
@@ -454,7 +398,7 @@ static int bch2_journal_replay(struct bch_fs *c,
 
 		if (i->btree_id == BTREE_ID_ALLOC)
 			ret = bch2_alloc_replay_key(c, i->k);
-		else if (btree_node_type_is_extents(i->btree_id))
+		else if (i->k->k.size)
 			ret = bch2_extent_replay_key(c, i->btree_id, i->k);
 		else
 			ret = bch2_journal_replay_key(c, i->btree_id, i->k);
@@ -854,6 +798,15 @@ int bch2_fs_recovery(struct bch_fs *c)
 		journal_seq = le64_to_cpu(clean->journal_seq) + 1;
 	}
 
+	if (!c->sb.clean &&
+	    !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
+		bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
+		ret = -EINVAL;
+		goto err;
+	}
+
+	c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;
+
 	ret = journal_replay_early(c, clean, &journal_entries);
 	if (ret)
 		goto err;
diff --git a/libbcachefs/recovery.h b/libbcachefs/recovery.h
index ccd84a8f..c9130930 100644
--- a/libbcachefs/recovery.h
+++ b/libbcachefs/recovery.h
@@ -5,8 +5,6 @@
 struct journal_keys {
 	struct journal_key {
 		enum btree_id	btree_id:8;
-		unsigned	allocated:1;
-		struct bpos	pos;
 		struct bkey_i	*k;
 		u32		journal_seq;
 		u32		journal_offset;
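The recovery hunk above is the usual dance for an incompatible on-disk change:
a filesystem that shut down unclean without the new feature bit may still have
overlapping extents in its journal, so recovery refuses and defers to the older
tools; once recovery does run, the bit is set so older code can never replay
this journal. The gating pattern reduced to its essentials (hypothetical flag
and struct, not bcachefs's actual superblock layout):

	#include <stdbool.h>
	#include <stdio.h>

	#define FEATURE_EXTENTS_ABOVE_BTREE_UPDATES (1ULL << 12)

	struct sb { unsigned long long features; bool clean; };

	/* Returns 0 on success, -1 if older tools must run first. */
	static int recovery_gate(struct sb *sb)
	{
		if (!sb->clean &&
		    !(sb->features & FEATURE_EXTENTS_ABOVE_BTREE_UPDATES)) {
			fprintf(stderr, "needs recovery from older version\n");
			return -1;
		}

		/* Safe to continue: claim the feature so old code can't replay us. */
		sb->features |= FEATURE_EXTENTS_ABOVE_BTREE_UPDATES;
		return 0;
	}

	int main(void)
	{
		struct sb old_dirty = { .features = 0, .clean = false };
		struct sb clean_old = { .features = 0, .clean = true };

		(void)recovery_gate(&old_dirty);  /* refused, like -EINVAL above */
		return recovery_gate(&clean_old); /* clean filesystems just opt in */
	}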