diff --git a/.bcachefs_revision b/.bcachefs_revision index 06ebd7da..46d09322 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -400c2f8d960ac55105bd22905a6ea1a40daa7f4f +787de128a5caf209845e5a8d0f14f24e1a42492c diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f60972c7..35082ae3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -105,6 +105,7 @@ struct super_block { #define DT_LNK 10 #define DT_SOCK 12 #define DT_WHT 14 +#define DT_MAX 16 #endif /* diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 2324b81c..fff85c17 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -130,7 +130,7 @@ static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, #define x(_name, _bits) \ if (fieldnr < a.v->nr_fields) { \ - ret = bch2_varint_decode(in, end, &v); \ + ret = bch2_varint_decode_fast(in, end, &v); \ if (ret < 0) \ return ret; \ in += ret; \ @@ -166,7 +166,7 @@ static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst, nr_fields++; \ \ if (src._name) { \ - out += bch2_varint_encode(out, src._name); \ + out += bch2_varint_encode_fast(out, src._name); \ \ last_nonzero_field = out; \ last_nonzero_fieldnr = nr_fields; \ @@ -1232,3 +1232,22 @@ void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); } + +void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct open_bucket *ob; + + for (ob = c->open_buckets; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); + ob++) { + spin_lock(&ob->lock); + if (ob->valid && !ob->on_partial_list) { + pr_buf(out, "%zu ref %u type %s\n", + ob - c->open_buckets, + atomic_read(&ob->pin), + bch2_data_types[ob->type]); + } + spin_unlock(&ob->lock); + } + +} diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index 9cadfdb5..a4f6bf56 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -132,4 +132,6 @@ int 
bch2_dev_allocator_start(struct bch_dev *); int bch2_alloc_write(struct bch_fs *, unsigned); void bch2_fs_allocator_background_init(struct bch_fs *); +void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); + #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 94273d51..8a89ab0d 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -1346,6 +1346,7 @@ LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); +LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); /* * Features: diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 73bfd01f..1aacd271 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -233,7 +233,7 @@ wait_on_io: if (bch2_verify_btree_ondisk) bch2_btree_node_write(c, b, SIX_LOCK_intent); else - __bch2_btree_node_write(c, b); + __bch2_btree_node_write(c, b, false); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -691,7 +691,9 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, * currently fails for iterators that aren't pointed at a valid btree * node */ - if (iter && !bch2_trans_relock(iter->trans)) + if (iter && + (!bch2_trans_relock(iter->trans) || + !bch2_btree_iter_relock(iter, _THIS_IP_))) return ERR_PTR(-EINTR); if (!six_relock_type(&b->c.lock, lock_type, seq)) @@ -851,7 +853,9 @@ lock_node: * currently fails for iterators that aren't pointed at a valid * btree node */ - if (iter && !bch2_trans_relock(iter->trans)) + if (iter && + (!bch2_trans_relock(iter->trans) || + !bch2_btree_iter_relock(iter, _THIS_IP_))) return ERR_PTR(-EINTR); if (!six_relock_type(&b->c.lock, lock_type, seq)) @@ -1002,7 +1006,7 @@ wait_on_io: 
six_lock_write(&b->c.lock, NULL, NULL); if (btree_node_dirty(b)) { - __bch2_btree_node_write(c, b); + __bch2_btree_node_write(c, b, false); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); goto wait_on_io; diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 12894f89..957a6a9a 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1566,9 +1566,47 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, static void btree_node_write_done(struct bch_fs *c, struct btree *b) { struct btree_write *w = btree_prev_write(b); + unsigned long old, new, v; bch2_btree_complete_write(c, b, w); - bch2_btree_node_io_unlock(b); + + v = READ_ONCE(b->flags); + do { + old = new = v; + + if (old & (1U << BTREE_NODE_need_write)) + goto do_write; + + new &= ~(1U << BTREE_NODE_write_in_flight); + } while ((v = cmpxchg(&b->flags, old, new)) != old); + + wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); + return; + +do_write: + six_lock_read(&b->c.lock, NULL, NULL); + v = READ_ONCE(b->flags); + do { + old = new = v; + + if ((old & (1U << BTREE_NODE_dirty)) && + (old & (1U << BTREE_NODE_need_write)) && + !(old & (1U << BTREE_NODE_never_write)) && + btree_node_may_write(b)) { + new &= ~(1U << BTREE_NODE_dirty); + new &= ~(1U << BTREE_NODE_need_write); + new |= (1U << BTREE_NODE_write_in_flight); + new |= (1U << BTREE_NODE_just_written); + new ^= (1U << BTREE_NODE_write_idx); + } else { + new &= ~(1U << BTREE_NODE_write_in_flight); + } + } while ((v = cmpxchg(&b->flags, old, new)) != old); + + if (new & (1U << BTREE_NODE_write_in_flight)) + __bch2_btree_node_write(c, b, true); + + six_unlock_read(&b->c.lock); } static void bch2_btree_node_write_error(struct bch_fs *c, @@ -1733,7 +1771,7 @@ static void btree_write_submit(struct work_struct *work) bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &wbio->key); } -void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) +void __bch2_btree_node_write(struct bch_fs *c, 
struct btree *b, bool already_started) { struct btree_write_bio *wbio; struct bset_tree *t; @@ -1750,7 +1788,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) bool validate_before_checksum = false; void *data; - BUG_ON(btree_node_write_in_flight(b)); + if (already_started) + goto do_write; if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) return; @@ -1774,14 +1813,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) if (old & (1 << BTREE_NODE_never_write)) return; - if (old & (1 << BTREE_NODE_write_in_flight)) { - /* - * XXX waiting on btree writes with btree locks held - - * this can deadlock, and we hit the write error path - */ - bch2_btree_node_wait_on_write(b); - continue; - } + BUG_ON(old & (1 << BTREE_NODE_write_in_flight)); new &= ~(1 << BTREE_NODE_dirty); new &= ~(1 << BTREE_NODE_need_write); @@ -1790,6 +1822,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) new ^= (1 << BTREE_NODE_write_idx); } while (cmpxchg_acquire(&b->flags, old, new) != old); + if (new & (1U << BTREE_NODE_need_write)) + return; +do_write: atomic_dec(&c->btree_cache.dirty); BUG_ON(btree_node_fake(b)); @@ -2044,7 +2079,7 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, if (lock_type_held == SIX_LOCK_intent || (lock_type_held == SIX_LOCK_read && six_lock_tryupgrade(&b->c.lock))) { - __bch2_btree_node_write(c, b); + __bch2_btree_node_write(c, b, false); /* don't cycle lock unnecessarily: */ if (btree_node_just_written(b) && @@ -2056,7 +2091,7 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, if (lock_type_held == SIX_LOCK_read) six_lock_downgrade(&b->c.lock); } else { - __bch2_btree_node_write(c, b); + __bch2_btree_node_write(c, b, false); if (lock_type_held == SIX_LOCK_write && btree_node_just_written(b)) bch2_btree_post_write_cleanup(c, b); diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index 89fd4aba..3732d135 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -139,7 
+139,7 @@ void bch2_btree_complete_write(struct bch_fs *, struct btree *, struct btree_write *); void bch2_btree_write_error_work(struct work_struct *); -void __bch2_btree_node_write(struct bch_fs *, struct btree *); +void __bch2_btree_node_write(struct bch_fs *, struct btree *, bool); bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); void bch2_btree_node_write(struct bch_fs *, struct btree *, @@ -148,18 +148,11 @@ void bch2_btree_node_write(struct bch_fs *, struct btree *, static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, enum six_lock_type lock_held) { - while (b->written && - btree_node_need_write(b) && - btree_node_may_write(b)) { - if (!btree_node_write_in_flight(b)) { - bch2_btree_node_write(c, b, lock_held); - break; - } - - six_unlock_type(&b->c.lock, lock_held); - bch2_btree_node_wait_on_write(b); - btree_node_lock_type(c, b, lock_held); - } + if (b->written && + btree_node_need_write(b) && + btree_node_may_write(b) && + !btree_node_write_in_flight(b)) + bch2_btree_node_write(c, b, lock_held); } #define bch2_btree_node_write_cond(_c, _b, cond) \ diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 0444dbd1..24d7422c 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -347,6 +347,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, #ifdef CONFIG_BCACHEFS_DEBUG static void bch2_btree_iter_verify_locks(struct btree_iter *iter) { + struct bch_fs *c = iter->trans->c; unsigned l; if (!(iter->trans->iters_linked & (1ULL << iter->idx))) { @@ -354,7 +355,7 @@ static void bch2_btree_iter_verify_locks(struct btree_iter *iter) return; } - for (l = 0; is_btree_node(iter, l); l++) { + for (l = 0; btree_iter_node(iter, l); l++) { if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && !btree_node_locked(iter, l)) continue; @@ -376,7 +377,7 @@ static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} #endif __flatten -static bool bch2_btree_iter_relock(struct 
btree_iter *iter, unsigned long trace_ip) +bool bch2_btree_iter_relock(struct btree_iter *iter, unsigned long trace_ip) { return btree_iter_get_locks(iter, false, trace_ip); } @@ -602,6 +603,8 @@ err: static void bch2_btree_iter_verify(struct btree_iter *iter) { + struct btree_trans *trans = iter->trans; + struct bch_fs *c = trans->c; enum btree_iter_type type = btree_iter_type(iter); unsigned i; @@ -620,10 +623,16 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && !btree_type_has_snapshots(iter->btree_id)); - bch2_btree_iter_verify_locks(iter); + for (i = 0; i < BTREE_MAX_DEPTH; i++) { + if (!iter->l[i].b) { + BUG_ON(c->btree_roots[iter->btree_id].b->c.level > i); + break; + } - for (i = 0; i < BTREE_MAX_DEPTH; i++) bch2_btree_iter_verify_level(iter, i); + } + + bch2_btree_iter_verify_locks(iter); } static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) @@ -1345,30 +1354,30 @@ static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, static int btree_iter_traverse_one(struct btree_iter *iter, unsigned long trace_ip) { - unsigned depth_want = iter->level; + unsigned l, depth_want = iter->level; int ret = 0; - /* - * if we need interior nodes locked, call btree_iter_relock() to make - * sure we walk back up enough that we lock them: - */ - if (iter->uptodate == BTREE_ITER_NEED_RELOCK || - iter->locks_want > 1) - bch2_btree_iter_relock(iter, _THIS_IP_); - if (btree_iter_type(iter) == BTREE_ITER_CACHED) { ret = bch2_btree_iter_traverse_cached(iter); goto out; } - if (iter->uptodate < BTREE_ITER_NEED_RELOCK) - goto out; - if (unlikely(iter->level >= BTREE_MAX_DEPTH)) goto out; iter->level = btree_iter_up_until_good_node(iter, 0); + /* If we need intent locks, take them too: */ + for (l = iter->level + 1; + l < iter->locks_want && btree_iter_node(iter, l); + l++) + if (!bch2_btree_node_relock(iter, l)) + while (iter->level <= l) { + btree_node_unlock(iter, iter->level); + 
iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP; + iter->level++; + } + /* * Note: iter->nodes[iter->level] may be temporarily NULL here - that * would indicate to other code that we got to the end of the btree, @@ -1389,6 +1398,7 @@ static int btree_iter_traverse_one(struct btree_iter *iter, goto out; } + __bch2_btree_iter_unlock(iter); iter->level = depth_want; if (ret == -EIO) { diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 31175cf0..58f15b71 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -111,6 +111,8 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, struct btree_node_iter *, struct bkey_packed *, unsigned, unsigned); +bool bch2_btree_iter_relock(struct btree_iter *, unsigned long); + bool bch2_trans_relock(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index dfaf5e6d..7f47ef33 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -270,7 +270,9 @@ int bch2_btree_iter_traverse_cached(struct btree_iter *iter) BUG_ON(iter->level); - if (btree_node_locked(iter, 0)) { + iter->l[1].b = NULL; + + if (bch2_btree_node_relock(iter, 0)) { ck = (void *) iter->l[0].b; goto fill; } diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 6b55a410..0b4e4056 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -948,13 +948,6 @@ bch2_btree_update_start(struct btree_iter *iter, unsigned level, closure_init_stack(&cl); retry: - /* - * This check isn't necessary for correctness - it's just to potentially - * prevent us from doing a lot of work that'll end up being wasted: - */ - ret = bch2_journal_error(&c->journal); - if (ret) - return ERR_PTR(ret); /* * XXX: figure out how far we might need to split, @@ -995,6 +988,22 @@ retry: bch2_keylist_init(&as->new_keys, as->_new_keys); bch2_keylist_init(&as->parent_keys, 
as->inline_keys); + mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->list, &c->btree_interior_update_list); + mutex_unlock(&c->btree_interior_update_lock); + + /* + * We don't want to allocate if we're in an error state, that can cause + * deadlock on emergency shutdown due to open buckets getting stuck in + * the btree_reserve_cache after allocator shutdown has cleared it out. + * This check needs to come after adding us to the btree_interior_update + * list but before calling bch2_btree_reserve_get, to synchronize with + * __bch2_fs_read_only(). + */ + ret = bch2_journal_error(&c->journal); + if (ret) + goto err; + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, BTREE_UPDATE_JOURNAL_RES, journal_flags|JOURNAL_RES_GET_NONBLOCK); @@ -1046,10 +1055,6 @@ retry: atomic64_read(&c->journal.seq), &as->journal, NULL); - mutex_lock(&c->btree_interior_update_lock); - list_add_tail(&as->list, &c->btree_interior_update_list); - mutex_unlock(&c->btree_interior_update_lock); - return as; err: bch2_btree_update_free(as); diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index d5883ab7..a95165b8 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -112,7 +112,10 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, bch_scnmemcpy(out, d.v->d_name, bch2_dirent_name_bytes(d)); - pr_buf(out, " -> %llu type %s", d.v->d_inum, bch2_d_types[d.v->d_type]); + pr_buf(out, " -> %llu type %s", d.v->d_inum, + d.v->d_type < DT_MAX + ? 
bch2_d_types[d.v->d_type] + : "(bad d_type)"); } static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 2795b37b..ae55453b 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -1870,8 +1870,6 @@ static long bch2_dio_write_loop(struct dio_write *dio) * bio_iov_iter_get_pages was only able to get < * blocksize worth of pages: */ - bio_for_each_segment_all(bv, bio, iter) - put_page(bv->bv_page); ret = -EFAULT; goto err; } @@ -1939,6 +1937,7 @@ loop: if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) bio_for_each_segment_all(bv, bio, iter) put_page(bv->bv_page); + bio->bi_vcnt = 0; if (dio->op.error) { set_bit(EI_INODE_ERROR, &inode->ei_flags); @@ -1961,6 +1960,9 @@ err: if (dio->free_iov) kfree(dio->iter.iov); + if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) + bio_for_each_segment_all(bv, bio, iter) + put_page(bv->bv_page); bio_put(bio); /* inode->i_dio_count is our ref on inode and thus bch_fs */ diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 7ea1a41a..bedfd348 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -267,11 +267,11 @@ static struct inode_walker inode_walker_init(void) }; } -static int walk_inode(struct btree_trans *trans, - struct inode_walker *w, u64 inum) +static int __walk_inode(struct btree_trans *trans, + struct inode_walker *w, u64 inum) { if (inum != w->cur_inum) { - int ret = lookup_inode(trans, inum, &w->inode, &w->snapshot); + int ret = __lookup_inode(trans, inum, &w->inode, &w->snapshot); if (ret && ret != -ENOENT) return ret; @@ -286,6 +286,12 @@ static int walk_inode(struct btree_trans *trans, return 0; } +static int walk_inode(struct btree_trans *trans, + struct inode_walker *w, u64 inum) +{ + return lockrestart_do(trans, __walk_inode(trans, w, inum)); +} + static int hash_redo_key(struct btree_trans *trans, const struct bch_hash_desc desc, struct bch_hash_info *hash_info, @@ -704,6 +710,177 @@ fsck_err: return bch2_trans_exit(&trans) 
?: ret; } +static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + struct bch_hash_info *hash_info, + struct inode_walker *w, unsigned *nr_subdirs) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + struct bkey_s_c_dirent d; + struct bch_inode_unpacked target; + u32 target_snapshot; + bool have_target; + bool backpointer_exists = true; + u64 d_inum; + char buf[200]; + int ret; + + k = bch2_btree_iter_peek(iter); + if (!k.k) + return 1; + + ret = bkey_err(k); + if (ret) + return ret; + + if (w->have_inode && + w->cur_inum != k.k->p.inode && + fsck_err_on(w->inode.bi_nlink != *nr_subdirs, c, + "directory %llu with wrong i_nlink: got %u, should be %u", + w->inode.bi_inum, w->inode.bi_nlink, *nr_subdirs)) { + w->inode.bi_nlink = *nr_subdirs; + ret = write_inode(trans, &w->inode, w->snapshot); + return ret ?: -EINTR; + } + + ret = __walk_inode(trans, w, k.k->p.inode); + if (ret) + return ret; + + if (w->first_this_inode) + *nr_subdirs = 0; + + if (fsck_err_on(!w->have_inode, c, + "dirent in nonexisting directory:\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)) || + fsck_err_on(!S_ISDIR(w->inode.bi_mode), c, + "dirent in non directory inode type %u:\n%s", + mode_to_type(w->inode.bi_mode), + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) + return __bch2_trans_do(trans, NULL, NULL, 0, + bch2_btree_delete_at(trans, iter, 0)); + + if (!w->have_inode) + return 0; + + if (w->first_this_inode) + *hash_info = bch2_hash_info_init(c, &w->inode); + + ret = hash_check_key(trans, bch2_dirent_hash_desc, + hash_info, iter, k); + if (ret < 0) + return ret; + if (ret) /* dirent has been deleted */ + return 0; + + if (k.k->type != KEY_TYPE_dirent) + return 0; + + d = bkey_s_c_to_dirent(k); + d_inum = le64_to_cpu(d.v->d_inum); + + ret = __lookup_inode(trans, d_inum, &target, &target_snapshot); + if (ret && ret != -ENOENT) + return ret; + + have_target = !ret; + ret = 0; + + if (fsck_err_on(!have_target, c, + "dirent points to missing inode:\n%s", + 
(bch2_bkey_val_to_text(&PBUF(buf), c, + k), buf))) + return remove_dirent(trans, d.k->p); + + if (!have_target) + return 0; + + if (!target.bi_dir && + !target.bi_dir_offset) { + target.bi_dir = k.k->p.inode; + target.bi_dir_offset = k.k->p.offset; + + ret = __write_inode(trans, &target, target_snapshot) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOUNLOCK); + if (ret) + return ret; + return -EINTR; + } + + if (!inode_backpointer_matches(d, &target)) { + ret = inode_backpointer_exists(trans, &target); + if (ret < 0) + return ret; + + backpointer_exists = ret; + ret = 0; + + if (fsck_err_on(S_ISDIR(target.bi_mode) && + backpointer_exists, c, + "directory %llu with multiple links", + target.bi_inum)) + return remove_dirent(trans, d.k->p); + + if (fsck_err_on(backpointer_exists && + !target.bi_nlink, c, + "inode %llu has multiple links but i_nlink 0", + d_inum)) { + target.bi_nlink++; + target.bi_flags &= ~BCH_INODE_UNLINKED; + + ret = write_inode(trans, &target, target_snapshot); + return ret ?: -EINTR; + } + + if (fsck_err_on(!backpointer_exists, c, + "inode %llu has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + d_inum, + target.bi_dir, + target.bi_dir_offset, + k.k->p.inode, + k.k->p.offset)) { + target.bi_dir = k.k->p.inode; + target.bi_dir_offset = k.k->p.offset; + + ret = write_inode(trans, &target, target_snapshot); + return ret ?: -EINTR; + } + } + + if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c, + "incorrect d_type: should be %u:\n%s", + mode_to_type(target.bi_mode), + (bch2_bkey_val_to_text(&PBUF(buf), c, + k), buf))) { + struct bkey_i_dirent *n; + + n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); + if (!n) + return -ENOMEM; + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_type = mode_to_type(target.bi_mode); + + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_btree_iter_traverse(iter) ?: + bch2_trans_update(trans, iter, 
&n->k_i, 0)); + kfree(n); + return ret ?: -EINTR; + } + + *nr_subdirs += d.v->d_type == DT_DIR; + return 0; +fsck_err: + return ret; +} + /* * Walk dirents: verify that they all have a corresponding S_ISDIR inode, * validate d_type @@ -715,8 +892,6 @@ static int check_dirents(struct bch_fs *c) struct bch_hash_info hash_info; struct btree_trans trans; struct btree_iter *iter; - struct bkey_s_c k; - char buf[200]; unsigned nr_subdirs = 0; int ret = 0; @@ -728,186 +903,22 @@ static int check_dirents(struct bch_fs *c) POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_INTENT| BTREE_ITER_PREFETCH); -retry: - while ((k = bch2_btree_iter_peek(iter)).k && - !(ret = bkey_err(k))) { - struct bkey_s_c_dirent d; - struct bch_inode_unpacked target; - u32 target_snapshot; - bool have_target; - bool backpointer_exists = true; - u64 d_inum; - if (w.have_inode && - w.cur_inum != k.k->p.inode && - fsck_err_on(w.inode.bi_nlink != nr_subdirs, c, - "directory %llu with wrong i_nlink: got %u, should be %u", - w.inode.bi_inum, w.inode.bi_nlink, nr_subdirs)) { - w.inode.bi_nlink = nr_subdirs; - ret = write_inode(&trans, &w.inode, w.snapshot); - if (ret) - break; + while (1) { + ret = lockrestart_do(&trans, + check_dirent(&trans, iter, &hash_info, &w, &nr_subdirs)); + if (ret == 1) { + /* at end */ + ret = 0; + break; } - - ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) break; - if (w.first_this_inode) - nr_subdirs = 0; - - if (fsck_err_on(!w.have_inode, c, - "dirent in nonexisting directory:\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, - k), buf)) || - fsck_err_on(!S_ISDIR(w.inode.bi_mode), c, - "dirent in non directory inode type %u:\n%s", - mode_to_type(w.inode.bi_mode), - (bch2_bkey_val_to_text(&PBUF(buf), c, - k), buf))) { - ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_btree_delete_at(&trans, iter, 0)); - if (ret) - goto err; - goto next; - } - - if (!w.have_inode) - goto next; - - if (w.first_this_inode) - hash_info = bch2_hash_info_init(c, &w.inode); - - ret = 
hash_check_key(&trans, bch2_dirent_hash_desc, - &hash_info, iter, k); - if (ret > 0) { - ret = 0; - goto next; - } - if (ret) - goto fsck_err; - - if (k.k->type != KEY_TYPE_dirent) - goto next; - - d = bkey_s_c_to_dirent(k); - d_inum = le64_to_cpu(d.v->d_inum); - - ret = lookup_inode(&trans, d_inum, &target, &target_snapshot); - if (ret && ret != -ENOENT) - break; - - have_target = !ret; - ret = 0; - - if (fsck_err_on(!have_target, c, - "dirent points to missing inode:\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, - k), buf))) { - ret = remove_dirent(&trans, d.k->p); - if (ret) - goto err; - goto next; - } - - if (!have_target) - goto next; - - if (!target.bi_dir && - !target.bi_dir_offset) { - target.bi_dir = k.k->p.inode; - target.bi_dir_offset = k.k->p.offset; - - ret = write_inode(&trans, &target, target_snapshot); - if (ret) - goto err; - } - - if (!inode_backpointer_matches(d, &target)) { - ret = inode_backpointer_exists(&trans, &target); - if (ret < 0) - goto err; - - backpointer_exists = ret; - ret = 0; - - if (fsck_err_on(S_ISDIR(target.bi_mode) && - backpointer_exists, c, - "directory %llu with multiple links", - target.bi_inum)) { - ret = remove_dirent(&trans, d.k->p); - if (ret) - goto err; - continue; - } - - if (fsck_err_on(backpointer_exists && - !target.bi_nlink, c, - "inode %llu has multiple links but i_nlink 0", - d_inum)) { - target.bi_nlink++; - target.bi_flags &= ~BCH_INODE_UNLINKED; - - ret = write_inode(&trans, &target, target_snapshot); - if (ret) - goto err; - } - - if (fsck_err_on(!backpointer_exists, c, - "inode %llu has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - d_inum, - target.bi_dir, - target.bi_dir_offset, - k.k->p.inode, - k.k->p.offset)) { - target.bi_dir = k.k->p.inode; - target.bi_dir_offset = k.k->p.offset; - - ret = write_inode(&trans, &target, target_snapshot); - if (ret) - goto err; - } - } - - if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c, - "incorrect d_type: should be %u:\n%s", - 
mode_to_type(target.bi_mode), - (bch2_bkey_val_to_text(&PBUF(buf), c, - k), buf))) { - struct bkey_i_dirent *n; - - n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); - if (!n) { - ret = -ENOMEM; - goto err; - } - - bkey_reassemble(&n->k_i, d.s_c); - n->v.d_type = mode_to_type(target.bi_mode); - - ret = __bch2_trans_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(&trans, iter, &n->k_i, 0)); - kfree(n); - if (ret) - goto err; - - } - - nr_subdirs += d.v->d_type == DT_DIR; -next: bch2_btree_iter_advance(iter); } -err: -fsck_err: - if (ret == -EINTR) - goto retry; - bch2_trans_iter_put(&trans, iter); + return bch2_trans_exit(&trans) ?: ret; } diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 59edb4ce..67983ff4 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -137,7 +137,7 @@ static void bch2_inode_pack_v2(struct bkey_inode_buf *packed, nr_fields++; \ \ if (inode->_name) { \ - ret = bch2_varint_encode(out, inode->_name); \ + ret = bch2_varint_encode_fast(out, inode->_name); \ out += ret; \ \ if (_bits > 64) \ @@ -246,13 +246,13 @@ static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode, #define x(_name, _bits) \ if (fieldnr < INODE_NR_FIELDS(inode.v)) { \ - ret = bch2_varint_decode(in, end, &v[0]); \ + ret = bch2_varint_decode_fast(in, end, &v[0]); \ if (ret < 0) \ return ret; \ in += ret; \ \ if (_bits > 64) { \ - ret = bch2_varint_decode(in, end, &v[1]); \ + ret = bch2_varint_decode_fast(in, end, &v[1]); \ if (ret < 0) \ return ret; \ in += ret; \ @@ -300,8 +300,10 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans, struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inum), - BTREE_ITER_CACHED|flags); + if (trans->c->opts.inodes_use_key_cache) + flags |= BTREE_ITER_CACHED; + + iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inum), flags); k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) @@ -577,8 
+579,12 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) struct bpos end = POS(inode_nr + 1, 0); struct bch_inode_unpacked inode_u; struct bkey_s_c k; + unsigned iter_flags = BTREE_ITER_INTENT; int ret; + if (cached && c->opts.inodes_use_key_cache) + iter_flags |= BTREE_ITER_CACHED; + bch2_trans_init(&trans, c, 0, 1024); /* @@ -600,11 +606,8 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) retry: bch2_trans_begin(&trans); - iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr), - (cached - ? BTREE_ITER_CACHED - : BTREE_ITER_SLOTS)| - BTREE_ITER_INTENT); + iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, + POS(0, inode_nr), iter_flags); k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index fd3f7cdd..5de29607 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -63,7 +63,7 @@ const char * const bch2_member_states[] = { #undef x -const char * const bch2_d_types[] = { +const char * const bch2_d_types[DT_MAX] = { [DT_UNKNOWN] = "unknown", [DT_FIFO] = "fifo", [DT_CHR] = "chr", diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index c331535b..ed505857 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -173,6 +173,11 @@ enum opt_type { OPT_BOOL(), \ BCH_SB_SHARD_INUMS, false, \ NULL, "Shard new inode numbers by CPU id") \ + x(inodes_use_key_cache, u8, \ + OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_INODES_USE_KEY_CACHE, true, \ + NULL, "Use the btree key cache for the inodes btree") \ x(gc_reserve_percent, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(5, 21), \ diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 84a7acb0..9b1ffbf9 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -171,6 +171,7 @@ read_attribute(btree_cache); read_attribute(btree_key_cache); read_attribute(btree_transactions); read_attribute(stripes_heap); +read_attribute(open_buckets); read_attribute(internal_uuid); @@ -409,6 +410,11 @@ 
SHOW(bch2_fs) return out.pos - buf; } + if (attr == &sysfs_open_buckets) { + bch2_open_buckets_to_text(&out, c); + return out.pos - buf; + } + if (attr == &sysfs_compression_stats) { bch2_compression_stats_to_text(&out, c); return out.pos - buf; @@ -567,6 +573,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_btree_key_cache, &sysfs_btree_transactions, &sysfs_stripes_heap, + &sysfs_open_buckets, &sysfs_read_realloc_races, &sysfs_extent_migrate_done, diff --git a/libbcachefs/varint.c b/libbcachefs/varint.c index a3d252c7..e6a04154 100644 --- a/libbcachefs/varint.c +++ b/libbcachefs/varint.c @@ -1,15 +1,80 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include "varint.h" +/** + * bch2_varint_encode - encode a variable length integer + * @out - destination to encode to + * @v - unsigned integer to encode + * + * Returns the size in bytes of the encoded integer - at most 9 bytes + */ int bch2_varint_encode(u8 *out, u64 v) { unsigned bits = fls64(v|1); unsigned bytes = DIV_ROUND_UP(bits, 7); + if (likely(bytes < 9)) { + v <<= bytes; + v |= ~(~0 << (bytes - 1)); + v = cpu_to_le64(v); + memcpy(out, &v, bytes); + } else { + *out++ = 255; + bytes = 9; + put_unaligned_le64(v, out); + } + + return bytes; +} + +/** + * bch2_varint_decode - decode a variable length integer + * @in - varint to decode + * @end - end of buffer to decode from + * @out - on success, decoded integer + * + * Returns the size in bytes of the decoded integer - or -1 on failure (would + * have read past the end of the buffer) + */ +int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) +{ + unsigned bytes = likely(in < end) + ? 
ffz(*in & 255) + 1 + : 1; + u64 v; + + if (unlikely(in + bytes > end)) + return -1; + + if (likely(bytes < 9)) { + v = 0; + memcpy(&v, in, bytes); + v = le64_to_cpu(v); + v >>= bytes; + } else { + v = get_unaligned_le64(++in); + } + + *out = v; + return bytes; +} + +/** + * bch2_varint_encode_fast - fast version of bch2_varint_encode + * + * This version assumes it's always safe to write 8 bytes to @out, even if the + * encoded integer would be smaller. + */ +int bch2_varint_encode_fast(u8 *out, u64 v) +{ + unsigned bits = fls64(v|1); + unsigned bytes = DIV_ROUND_UP(bits, 7); + if (likely(bytes < 9)) { v <<= bytes; v |= ~(~0 << (bytes - 1)); @@ -22,7 +87,13 @@ int bch2_varint_encode(u8 *out, u64 v) return bytes; } -int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) +/** + * bch2_varint_decode_fast - fast version of bch2_varint_decode + * + * This version assumes that it is safe to read at most 8 bytes past the end of + * @end (we still return an error if the varint extends past @end). + */ +int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out) { u64 v = get_unaligned_le64(in); unsigned bytes = ffz(v & 255) + 1; diff --git a/libbcachefs/varint.h b/libbcachefs/varint.h index 8daf8135..92a182fb 100644 --- a/libbcachefs/varint.h +++ b/libbcachefs/varint.h @@ -5,4 +5,7 @@ int bch2_varint_encode(u8 *, u64); int bch2_varint_decode(const u8 *, const u8 *, u64 *); +int bch2_varint_encode_fast(u8 *, u64); +int bch2_varint_decode_fast(const u8 *, const u8 *, u64 *); + #endif /* _BCACHEFS_VARINT_H */