From 6c42566c6204bb5dcd6af3b97257e548b9d2db67 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 5 Aug 2021 13:11:04 -0400 Subject: [PATCH] Update bcachefs sources to 60fbf06f49 bcachefs: Fix an unhandled transaction restart --- .bcachefs_revision | 2 +- libbcachefs/btree_iter.c | 226 +++++++++++++++++++++++++++----- libbcachefs/btree_iter.h | 37 ++++-- libbcachefs/btree_types.h | 3 + libbcachefs/btree_update_leaf.c | 14 +- libbcachefs/fs-io.c | 9 ++ libbcachefs/inode.c | 23 +++- libbcachefs/inode.h | 2 + libbcachefs/io.c | 9 ++ libbcachefs/recovery.c | 16 +++ 10 files changed, 291 insertions(+), 50 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index f6620a21..ab237af5 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -b964c6cba873eb5d2ebd0174876b664730e69a73 +60fbf06f49679fdb2b37e1e863c321dfddfc3a4a diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 3af00e24..fe710d19 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -18,10 +18,21 @@ #include static void btree_iter_set_search_pos(struct btree_iter *, struct bpos); +static void btree_trans_sort_iters(struct btree_trans *); +static void btree_iter_check_sort(struct btree_trans *, struct btree_iter *); static struct btree_iter *btree_iter_child_alloc(struct btree_iter *, unsigned long); -static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *); +static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *, + struct btree_iter *); static void btree_iter_copy(struct btree_iter *, struct btree_iter *); +static inline int btree_iter_cmp(const struct btree_iter *l, + const struct btree_iter *r) +{ + return cmp_int(l->btree_id, r->btree_id) ?: + -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?: + bkey_cmp(l->real_pos, r->real_pos); +} + static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) { EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES); @@ -1260,8 +1271,7 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret, { struct bch_fs *c = trans->c; struct btree_iter *iter; - u8 sorted[BTREE_ITER_MAX]; - int i, nr_sorted = 0; + int i; if (trans->in_traverse_all) return -EINTR; @@ -1270,22 +1280,14 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret, retry_all: trans->restarted = false; - nr_sorted = 0; - - trans_for_each_iter(trans, iter) { - sorted[nr_sorted++] = iter->idx; + trans_for_each_iter(trans, iter) iter->should_be_locked = false; - } -#define btree_iter_cmp_by_idx(_l, _r) \ - btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r]) + btree_trans_sort_iters(trans); - bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); -#undef btree_iter_cmp_by_idx - - for (i = nr_sorted - 2; i >= 0; --i) { - struct btree_iter *iter1 = trans->iters + sorted[i]; - struct btree_iter *iter2 = trans->iters + sorted[i + 1]; + for (i = trans->nr_sorted - 2; i >= 0; --i) { + struct btree_iter *iter1 = trans->iters + trans->sorted[i]; + struct btree_iter *iter2 = trans->iters + trans->sorted[i + 1]; if (iter1->btree_id == iter2->btree_id && iter1->locks_want < iter2->locks_want) @@ -1316,20 +1318,18 @@ retry_all: BUG_ON(ret && ret != -EINTR); /* Now, redo traversals in correct order: */ - for (i = 0; i < nr_sorted; i++) { - unsigned idx = sorted[i]; + trans_for_each_iter_inorder(trans, iter) { + EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); - /* - * sucessfully traversing one iterator can cause another to be - * unlinked, in btree_key_cache_fill() - */ - if (!(trans->iters_linked & (1ULL << idx))) - continue; - - ret = btree_iter_traverse_one(&trans->iters[idx], _THIS_IP_); + ret = btree_iter_traverse_one(iter, _THIS_IP_); if (ret) goto retry_all; + + EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); } + + trans_for_each_iter(trans, iter) + BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); out: bch2_btree_cache_cannibalize_unlock(c); @@ -1613,6 +1613,8 @@ static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_p iter->real_pos = new_pos; iter->should_be_locked = false; + btree_iter_check_sort(iter->trans, iter); + if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) { btree_node_unlock(iter, 0); iter->l[0].b = BTREE_ITER_NO_NODE_CACHED; @@ -2018,6 +2020,151 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, /* new transactional stuff: */ +static inline void btree_iter_verify_sorted_ref(struct btree_trans *trans, + struct btree_iter *iter) +{ + EBUG_ON(iter->sorted_idx >= trans->nr_sorted); + EBUG_ON(trans->sorted[iter->sorted_idx] != iter->idx); + EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); +} + +static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + unsigned i; + + for (i = 0; i < trans->nr_sorted; i++) + btree_iter_verify_sorted_ref(trans, trans->iters + trans->sorted[i]); +#endif +} + +static inline void btree_trans_verify_sorted(struct btree_trans *trans) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct btree_iter *iter, *prev = NULL; + + trans_for_each_iter_inorder(trans, iter) + BUG_ON(prev && btree_iter_cmp(prev, iter) > 0); +#endif +} + +static inline void btree_iter_swap(struct btree_trans *trans, + struct btree_iter *l, struct btree_iter *r) +{ + swap(l->sorted_idx, r->sorted_idx); + swap(trans->sorted[l->sorted_idx], + trans->sorted[r->sorted_idx]); + + btree_iter_verify_sorted_ref(trans, l); + btree_iter_verify_sorted_ref(trans, r); +} + +static void btree_trans_sort_iters(struct btree_trans *trans) +{ + bool swapped = false; + int i, l = 0, r = trans->nr_sorted; + + while (1) { + for (i = l; i + 1 < r; i++) { + if (btree_iter_cmp(trans->iters + trans->sorted[i], + trans->iters + trans->sorted[i + 1]) > 0) { + swap(trans->sorted[i], trans->sorted[i + 1]); + trans->iters[trans->sorted[i]].sorted_idx = i; + trans->iters[trans->sorted[i + 1]].sorted_idx = i + 1; + swapped = true; + } + } + + if (!swapped) + break; + + r--; + swapped = false; + + for (i = r - 2; i >= l; --i) { + if (btree_iter_cmp(trans->iters + trans->sorted[i], + trans->iters + trans->sorted[i + 1]) > 0) { + swap(trans->sorted[i], + trans->sorted[i + 1]); + trans->iters[trans->sorted[i]].sorted_idx = i; + trans->iters[trans->sorted[i + 1]].sorted_idx = i + 1; + swapped = true; + } + } + + if (!swapped) + break; + + l++; + swapped = false; + } + + btree_trans_verify_sorted_refs(trans); + btree_trans_verify_sorted(trans); +} + +static void btree_iter_check_sort(struct btree_trans *trans, struct btree_iter *iter) +{ + struct btree_iter *n; + + EBUG_ON(iter->sorted_idx == U8_MAX); + + n = next_btree_iter(trans, iter); + if (n && btree_iter_cmp(iter, n) > 0) { + do { + btree_iter_swap(trans, iter, n); + n = next_btree_iter(trans, iter); + } while (n && btree_iter_cmp(iter, n) > 0); + + return; + } + + n = prev_btree_iter(trans, iter); + if (n && btree_iter_cmp(n, iter) > 0) { + do { + btree_iter_swap(trans, n, iter); + n = prev_btree_iter(trans, iter); + } while (n && btree_iter_cmp(n, iter) > 0); + } + + btree_trans_verify_sorted(trans); +} + +static inline void btree_iter_list_remove(struct btree_trans *trans, + struct btree_iter *iter) +{ + unsigned i; + + EBUG_ON(iter->sorted_idx >= trans->nr_sorted); + + array_remove_item(trans->sorted, trans->nr_sorted, iter->sorted_idx); + + for (i = iter->sorted_idx; i < trans->nr_sorted; i++) + trans->iters[trans->sorted[i]].sorted_idx = i; + + iter->sorted_idx = U8_MAX; + + btree_trans_verify_sorted_refs(trans); +} + +static inline void btree_iter_list_add(struct btree_trans *trans, + struct btree_iter *pos, + struct btree_iter *iter) +{ + unsigned i; + + btree_trans_verify_sorted_refs(trans); + + iter->sorted_idx = pos ? pos->sorted_idx : trans->nr_sorted; + + array_insert_item(trans->sorted, trans->nr_sorted, iter->sorted_idx, iter->idx); + + for (i = iter->sorted_idx; i < trans->nr_sorted; i++) + trans->iters[trans->sorted[i]].sorted_idx = i; + + btree_trans_verify_sorted_refs(trans); +} + static void btree_iter_child_free(struct btree_iter *iter) { struct btree_iter *child = btree_iter_child(iter); @@ -2035,7 +2182,7 @@ static struct btree_iter *btree_iter_child_alloc(struct btree_iter *iter, struct btree_iter *child = btree_iter_child(iter); if (!child) { - child = btree_trans_iter_alloc(trans); + child = btree_trans_iter_alloc(trans, iter); child->ip_allocated = ip; iter->child_idx = child->idx; @@ -2051,6 +2198,8 @@ static inline void __bch2_trans_iter_free(struct btree_trans *trans, { btree_iter_child_free(&trans->iters[idx]); + btree_iter_list_remove(trans, &trans->iters[idx]); + __bch2_btree_iter_unlock(&trans->iters[idx]); trans->iters_linked &= ~(1ULL << idx); trans->iters_live &= ~(1ULL << idx); @@ -2097,10 +2246,12 @@ static void btree_trans_iter_alloc_fail(struct btree_trans *trans) struct btree_insert_entry *i; char buf[100]; - trans_for_each_iter(trans, iter) + btree_trans_sort_iters(trans); + + trans_for_each_iter_inorder(trans, iter) printk(KERN_ERR "iter: btree %s pos %s%s%s%s %pS\n", bch2_btree_ids[iter->btree_id], - (bch2_bpos_to_text(&PBUF(buf), iter->pos), buf), + (bch2_bpos_to_text(&PBUF(buf), iter->real_pos), buf), btree_iter_live(trans, iter) ? " live" : "", (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "", @@ -2116,7 +2267,8 @@ static void btree_trans_iter_alloc_fail(struct btree_trans *trans) panic("trans iter oveflow\n"); } -static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) +static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans, + struct btree_iter *pos) { struct btree_iter *iter; unsigned idx; @@ -2131,10 +2283,13 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) iter->trans = trans; iter->idx = idx; iter->child_idx = U8_MAX; + iter->sorted_idx = U8_MAX; iter->flags = 0; iter->nodes_locked = 0; iter->nodes_intent_locked = 0; trans->iters_linked |= 1ULL << idx; + + btree_iter_list_add(trans, pos, iter); return iter; } @@ -2155,6 +2310,8 @@ static void btree_iter_copy(struct btree_iter *dst, struct btree_iter *src) dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT; + + btree_iter_check_sort(dst->trans, dst); } struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, @@ -2208,10 +2365,10 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, } if (!best) { - iter = btree_trans_iter_alloc(trans); + iter = btree_trans_iter_alloc(trans, NULL); bch2_btree_iter_init(trans, iter, btree_id); } else if (btree_iter_keep(trans, best)) { - iter = btree_trans_iter_alloc(trans); + iter = btree_trans_iter_alloc(trans, best); btree_iter_copy(iter, best); } else { iter = best; @@ -2292,7 +2449,7 @@ struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, { struct btree_iter *iter; - iter = btree_trans_iter_alloc(trans); + iter = btree_trans_iter_alloc(trans, src); btree_iter_copy(iter, src); trans->iters_live |= 1ULL << iter->idx; @@ -2407,6 +2564,7 @@ static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) { size_t iters_bytes = sizeof(struct btree_iter) * BTREE_ITER_MAX; size_t updates_bytes = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX; + size_t sorted_bytes = sizeof(u8) * BTREE_ITER_MAX; void *p = NULL; BUG_ON(trans->used_mempool); @@ -2419,6 +2577,7 @@ static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) trans->iters = p; p += iters_bytes; trans->updates = p; p += updates_bytes; + trans->sorted = p; p += sorted_bytes; } void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, @@ -2621,6 +2780,7 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) return init_srcu_struct(&c->btree_trans_barrier) ?: mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, + sizeof(u8) * nr + sizeof(struct btree_iter) * nr + sizeof(struct btree_insert_entry) * nr) ?: mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index aeabc07d..39124e68 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -71,6 +71,30 @@ __trans_next_iter(struct btree_trans *trans, unsigned idx) (_iter); \ _iter = __trans_next_iter((_trans), (_iter)->idx + 1)) +static inline struct btree_iter *next_btree_iter(struct btree_trans *trans, struct btree_iter *iter) +{ + unsigned idx = iter ? iter->sorted_idx + 1 : 0; + + EBUG_ON(idx > trans->nr_sorted); + + return idx < trans->nr_sorted + ? trans->iters + trans->sorted[idx] + : NULL; +} + +static inline struct btree_iter *prev_btree_iter(struct btree_trans *trans, struct btree_iter *iter) +{ + EBUG_ON(iter->sorted_idx >= trans->nr_sorted); + return iter->sorted_idx + ? trans->iters + trans->sorted[iter->sorted_idx - 1] + : NULL; +} + +#define trans_for_each_iter_inorder(_trans, _iter) \ + for (_iter = next_btree_iter(trans, NULL); \ + (_iter); \ + _iter = next_btree_iter((_trans), (_iter))) + static inline bool __iter_has_node(const struct btree_iter *iter, const struct btree *b) { @@ -191,19 +215,14 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it iter->pos = bkey_start_pos(&iter->k); } -static inline struct btree_iter *btree_iter_child(struct btree_iter *iter) +static inline struct btree_iter *idx_to_btree_iter(struct btree_trans *trans, unsigned idx) { - return iter->child_idx == U8_MAX ? NULL - : iter->trans->iters + iter->child_idx; + return idx != U8_MAX ? trans->iters + idx : NULL; } -/* Sort order for locking btree iterators: */ -static inline int btree_iter_lock_cmp(const struct btree_iter *l, - const struct btree_iter *r) +static inline struct btree_iter *btree_iter_child(struct btree_iter *iter) { - return cmp_int(l->btree_id, r->btree_id) ?: - -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?: - bkey_cmp(l->real_pos, r->real_pos); + return idx_to_btree_iter(iter->trans, iter->child_idx); } /* diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 6882873d..a1e5debf 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -246,6 +246,7 @@ struct btree_iter { u8 idx; u8 child_idx; + u8 sorted_idx; /* btree_iter_copy starts here: */ u16 flags; @@ -379,6 +380,7 @@ struct btree_trans { unsigned long ip; int srcu_idx; + u8 nr_sorted; u8 nr_updates; bool used_mempool:1; bool error:1; @@ -398,6 +400,7 @@ struct btree_trans { unsigned mem_bytes; void *mem; + u8 *sorted; struct btree_iter *iters; struct btree_insert_entry *updates; diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index e9e54226..7e9909e2 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -1041,7 +1041,19 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, if (i < trans->updates + trans->nr_updates && !btree_insert_entry_cmp(&n, i)) { BUG_ON(i->trans_triggers_run); - *i = n; + + /* + * This is a hack to ensure that inode creates update the btree, + * not the key cache, which helps with cache coherency issues in + * other areas: + */ + if (btree_iter_type(n.iter) == BTREE_ITER_CACHED && + btree_iter_type(i->iter) != BTREE_ITER_CACHED) { + i->k = n.k; + i->flags = n.flags; + } else { + *i = n; + } } else array_insert_item(trans->updates, trans->nr_updates, i - trans->updates, n); diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 1ac99f37..3333f616 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -808,6 +808,15 @@ retry: unsigned bytes, sectors, offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; + /* + * read_extent -> io_time_reset may cause a transaction restart + * without returning an error, we need to check for that here: + */ + if (!bch2_trans_relock(trans)) { + ret = -EINTR; + break; + } + bch2_btree_iter_set_pos(iter, POS(inum, rbio->bio.bi_iter.bi_sector)); diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 25607b58..3b671082 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -371,6 +371,22 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) return NULL; } +static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) +{ + pr_buf(out, "mode %o flags %x ", inode->bi_mode, inode->bi_flags); + +#define x(_name, _bits) \ + pr_buf(out, #_name " %llu ", (u64) inode->_name); + BCH_INODE_FIELDS() +#undef x +} + +void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) +{ + pr_buf(out, "inum: %llu ", inode->bi_inum); + __bch2_inode_unpacked_to_text(out, inode); +} + void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { @@ -382,12 +398,7 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, return; } - pr_buf(out, "mode: %o ", unpacked.bi_mode); - -#define x(_name, _bits) \ - pr_buf(out, #_name ": %llu ", (u64) unpacked._name); - BCH_INODE_FIELDS() -#undef x + __bch2_inode_unpacked_to_text(out, &unpacked); } const char *bch2_inode_generation_invalid(const struct bch_fs *c, diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index 2cb081ae..d67af4f5 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -55,6 +55,8 @@ void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *, const struct bch_inode_unpacked *); int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); +void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); + struct btree_iter *bch2_inode_peek(struct btree_trans *, struct bch_inode_unpacked *, u64, unsigned); int bch2_inode_write(struct btree_trans *, struct btree_iter *, diff --git a/libbcachefs/io.c b/libbcachefs/io.c index a59b291a..4585a403 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -2284,6 +2284,15 @@ retry: unsigned bytes, sectors, offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; + /* + * read_extent -> io_time_reset may cause a transaction restart + * without returning an error, we need to check for that here: + */ + if (!bch2_trans_relock(&trans)) { + ret = -EINTR; + break; + } + bch2_btree_iter_set_pos(iter, POS(inode, bvec_iter.bi_sector)); diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 84e224fb..afb72648 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -39,6 +39,20 @@ static void drop_alloc_keys(struct journal_keys *keys) keys->nr = dst; } +/* + * Btree node pointers have a field to stack a pointer to the in memory btree + * node; we need to zero out this field when reading in btree nodes, or when + * reading in keys from the journal: + */ +static void zero_out_btree_mem_ptr(struct journal_keys *keys) +{ + struct journal_key *i; + + for (i = keys->d; i < keys->d + keys->nr; i++) + if (i->k->k.type == KEY_TYPE_btree_ptr_v2) + bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; +} + /* iterate over keys read from the journal: */ static int __journal_key_cmp(enum btree_id l_btree_id, @@ -1072,6 +1086,8 @@ use_clean: drop_alloc_keys(&c->journal_keys); } + zero_out_btree_mem_ptr(&c->journal_keys); + ret = journal_replay_early(c, clean, &c->journal_entries); if (ret) goto err;