From 40e14938eeebf74830c870a067cca0e8c73feed7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 29 Mar 2021 00:21:38 -0400 Subject: [PATCH] Update bcachefs sources to 9922afc8b6 bcachefs: Add repair code for out of order keys in a btree node. --- .bcachefs_revision | 2 +- libbcachefs/btree_io.c | 36 ++++++--------- libbcachefs/btree_key_cache.h | 9 ++++ libbcachefs/btree_update_leaf.c | 2 +- libbcachefs/fs-common.c | 6 ++- libbcachefs/fsck.c | 1 + libbcachefs/inode.c | 78 +++++++++++++++++++++++---------- libbcachefs/inode.h | 2 +- libbcachefs/journal_reclaim.c | 2 +- libbcachefs/move.c | 42 +++++++++++++++--- 10 files changed, 124 insertions(+), 56 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index 385c19f6..2e71c6c8 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -18686af68412ebfad9c2adc6ee976ffdb9e1b886 +9922afc8b6d6227f4193feef6442f8c3d881f78c diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index b43d4468..7fbacd9e 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -578,6 +578,10 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, mutex_unlock(&c->sb_lock); } + btree_err_on(BSET_SEPARATE_WHITEOUTS(i), + BTREE_ERR_FATAL, c, ca, b, i, + "BSET_SEPARATE_WHITEOUTS no longer supported"); + if (btree_err_on(b->written + sectors > c->opts.btree_node_size, BTREE_ERR_FIXABLE, c, ca, b, i, "bset past end of btree node")) { @@ -660,14 +664,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, { unsigned version = le16_to_cpu(i->version); struct bkey_packed *k, *prev = NULL; - bool seen_non_whiteout = false; int ret = 0; - if (!BSET_SEPARATE_WHITEOUTS(i)) { - seen_non_whiteout = true; - *whiteout_u64s = 0; - } - for (k = i->start; k != vstruct_last(i);) { struct bkey_s u; @@ -719,18 +717,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, BSET_BIG_ENDIAN(i), write, &b->format, k); - /* - * with the separate whiteouts thing (used for extents), the - * second set of keys actually can have whiteouts too, so we - * can't solely go off bkey_deleted()... - */ - - if (!seen_non_whiteout && - (!bkey_deleted(k) || - (prev && bkey_iter_cmp(b, prev, k) > 0))) { - *whiteout_u64s = k->_data - i->_data; - seen_non_whiteout = true; - } else if (prev && bkey_iter_cmp(b, prev, k) > 0) { + if (prev && bkey_iter_cmp(b, prev, k) > 0) { char buf1[80]; char buf2[80]; struct bkey up = bkey_unpack_key(b, prev); @@ -739,10 +726,15 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, bch2_bkey_to_text(&PBUF(buf2), u.k); bch2_dump_bset(c, b, i, 0); - btree_err(BTREE_ERR_FATAL, c, NULL, b, i, - "keys out of order: %s > %s", - buf1, buf2); - /* XXX: repair this */ + + if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, + "keys out of order: %s > %s", + buf1, buf2)) { + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); + continue; + } } prev = k; diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h index 02715cd2..4e1e5a9c 100644 --- a/libbcachefs/btree_key_cache.h +++ b/libbcachefs/btree_key_cache.h @@ -1,6 +1,15 @@ #ifndef _BCACHEFS_BTREE_KEY_CACHE_H #define _BCACHEFS_BTREE_KEY_CACHE_H +static inline size_t bch2_nr_btree_keys_want_flush(struct bch_fs *c) +{ + size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); + size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); + size_t max_dirty = nr_keys / 4; + + return max_t(ssize_t, 0, nr_dirty - max_dirty); +} + static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) { size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 67a2c65b..221a6004 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -1188,7 +1188,7 @@ retry: goto retry; } - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_free(trans, iter); return ret; } diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c index 83c2168c..281a6135 100644 --- a/libbcachefs/fs-common.c +++ b/libbcachefs/fs-common.c @@ -36,7 +36,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, if (!name) new_inode->bi_flags |= BCH_INODE_UNLINKED; - inode_iter = bch2_inode_create(trans, new_inode); + inode_iter = bch2_inode_create(trans, new_inode, U32_MAX); ret = PTR_ERR_OR_ZERO(inode_iter); if (ret) goto err; @@ -80,6 +80,10 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, new_inode->bi_dir_offset = dir_offset; } + /* XXX use bch2_btree_iter_set_snapshot() */ + inode_iter->snapshot = U32_MAX; + bch2_btree_iter_set_pos(inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX)); + ret = bch2_inode_write(trans, inode_iter, new_inode); err: bch2_trans_iter_put(trans, inode_iter); diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 62788ae1..acf128f0 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -1361,6 +1361,7 @@ static int check_inode(struct btree_trans *trans, struct bkey_inode_buf p; bch2_inode_pack(c, &p, &u); + p.inode.k.p = iter->pos; ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index f1665ca8..d4c32839 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -471,12 +471,13 @@ static inline u32 bkey_generation(struct bkey_s_c k) } struct btree_iter *bch2_inode_create(struct btree_trans *trans, - struct bch_inode_unpacked *inode_u) + struct bch_inode_unpacked *inode_u, + u32 snapshot) { struct bch_fs *c = trans->c; struct btree_iter *iter = NULL; struct bkey_s_c k; - u64 min, max, start, *hint; + u64 min, max, start, pos, *hint; int ret; u64 cpu = raw_smp_processor_id(); @@ -493,39 +494,70 @@ struct btree_iter *bch2_inode_create(struct btree_trans *trans, if (start >= max || start < min) start = min; + + pos = start; + iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, pos), + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_INTENT); again: - for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, start), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (bkey_cmp(iter->pos, POS(0, max)) > 0) - break; + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k)) && + bkey_cmp(k.k->p, POS(0, max)) < 0) { + while (pos < iter->pos.offset) { + if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos))) + goto found_slot; + + pos++; + } + + if (k.k->p.snapshot == snapshot && + k.k->type != KEY_TYPE_inode && + !bch2_btree_key_cache_find(c, BTREE_ID_inodes, SPOS(0, pos, snapshot))) { + bch2_btree_iter_next(iter); + continue; + } /* - * There's a potential cache coherency issue with the btree key - * cache code here - we're iterating over the btree, skipping - * that cache. We should never see an empty slot that isn't - * actually empty due to a pending update in the key cache - * because the update that creates the inode isn't done with a - * cached iterator, but - better safe than sorry, check the - * cache before using a slot: + * We don't need to iterate over keys in every snapshot once + * we've found just one: */ - if (k.k->type != KEY_TYPE_inode && - !bch2_btree_key_cache_find(c, BTREE_ID_inodes, iter->pos)) + pos = iter->pos.offset + 1; + bch2_btree_iter_set_pos(iter, POS(0, pos)); + } + + while (!ret && pos < max) { + if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos))) goto found_slot; + + pos++; } - bch2_trans_iter_put(trans, iter); + if (!ret && start == min) + ret = -ENOSPC; - if (ret) + if (ret) { + bch2_trans_iter_put(trans, iter); return ERR_PTR(ret); - - if (start != min) { - /* Retry from start */ - start = min; - goto again; } - return ERR_PTR(-ENOSPC); + /* Retry from start */ + pos = start = min; + bch2_btree_iter_set_pos(iter, POS(0, pos)); + goto again; found_slot: + bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) { + bch2_trans_iter_put(trans, iter); + return ERR_PTR(ret); + } + + /* We may have raced while the iterator wasn't pointing at pos: */ + if (k.k->type == KEY_TYPE_inode || + bch2_btree_key_cache_find(c, BTREE_ID_inodes, k.k->p)) + goto again; + *hint = k.k->p.offset; inode_u->bi_inum = k.k->p.offset; inode_u->bi_generation = bkey_generation(k); diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index 6bad6dfb..23c322d9 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -70,7 +70,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, struct bch_inode_unpacked *); struct btree_iter *bch2_inode_create(struct btree_trans *, - struct bch_inode_unpacked *); + struct bch_inode_unpacked *, u32); int bch2_inode_rm(struct bch_fs *, u64, bool); diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 4a5b50ed..93b5e07e 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -602,7 +602,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) if (fifo_free(&j->pin) <= 32) min_nr = 1; - min_nr = max(min_nr, bch2_nr_btree_keys_need_flush(c)); + min_nr = max(min_nr, bch2_nr_btree_keys_want_flush(c)); trace_journal_reclaim_start(c, min_nr, diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 732e2dbb..c9e18491 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -509,6 +509,32 @@ err: return ret; } +static int lookup_inode(struct btree_trans *trans, struct bpos pos, + struct bch_inode_unpacked *inode) +{ + struct btree_iter *iter; + struct bkey_s_c k; + int ret; + + iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, pos, + BTREE_ITER_ALL_SNAPSHOTS); + k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); + if (ret) + goto err; + + ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO; + if (ret) + goto err; + + ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); + if (ret) + goto err; +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + static int __bch2_move_data(struct bch_fs *c, struct moving_context *ctxt, struct bch_ratelimit *rate, @@ -566,7 +592,7 @@ static int __bch2_move_data(struct bch_fs *c, try_to_freeze(); } } while (delay); -peek: + k = bch2_btree_iter_peek(iter); stats->pos = iter->pos; @@ -586,14 +612,18 @@ peek: cur_inum != k.k->p.inode) { struct bch_inode_unpacked inode; - /* don't hold btree locks while looking up inode: */ - bch2_trans_unlock(&trans); - io_opts = bch2_opts_to_inode_opts(c->opts); - if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode)) + + ret = lookup_inode(&trans, + SPOS(0, k.k->p.inode, k.k->p.snapshot), + &inode); + if (ret == -EINTR) + continue; + + if (!ret) bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode)); + cur_inum = k.k->p.inode; - goto peek; } switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) {