From 6016d33b801a5fe13e86e5be3abf68ed166c0796 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 4 Nov 2019 12:53:59 -0500 Subject: [PATCH] Update bcachefs sources to 9e76e8d98c bcachefs: Fix uninitialized field in hash_check_init() --- .bcachefs_revision | 2 +- libbcachefs/bcachefs.h | 2 +- libbcachefs/bkey.c | 2 +- libbcachefs/bkey.h | 4 +- libbcachefs/bset.c | 18 +- libbcachefs/btree_cache.c | 3 - libbcachefs/btree_io.c | 5 +- libbcachefs/btree_io.h | 6 +- libbcachefs/btree_iter.c | 51 ++- libbcachefs/btree_iter.h | 7 + libbcachefs/btree_locking.h | 18 + libbcachefs/btree_types.h | 35 +- libbcachefs/btree_update.h | 31 +- libbcachefs/btree_update_interior.c | 1 + libbcachefs/btree_update_interior.h | 6 +- libbcachefs/btree_update_leaf.c | 527 +++++++++++++--------------- libbcachefs/buckets.c | 62 ++-- libbcachefs/buckets.h | 6 +- libbcachefs/buckets_types.h | 6 +- libbcachefs/extents.c | 465 +++++++++--------------- libbcachefs/extents.h | 2 + libbcachefs/fs-io.c | 186 +++++----- libbcachefs/fsck.c | 1 + libbcachefs/inode.c | 11 +- libbcachefs/io.c | 96 +++-- libbcachefs/io.h | 3 +- libbcachefs/io_types.h | 3 +- libbcachefs/journal.h | 2 +- libbcachefs/journal_io.c | 2 - libbcachefs/recovery.c | 2 + libbcachefs/reflink.c | 5 + libbcachefs/util.h | 28 ++ 32 files changed, 802 insertions(+), 796 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index a589122a..9676940a 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -fc4f1d59cf9330bbb27cd12c459706aa5e7fe33c +9e76e8d98c52c128641b0f916a1990a37d60d22e diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index a186aa52..323b663d 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -299,7 +299,6 @@ do { \ x(btree_node_sort) \ x(btree_node_read) \ x(btree_gc) \ - x(btree_update) \ x(btree_lock_contended_read) \ x(btree_lock_contended_intent) \ x(btree_lock_contended_write) \ @@ -498,6 +497,7 @@ enum { /* misc: */ BCH_FS_BDEV_MOUNTED, BCH_FS_FIXED_GENS, + BCH_FS_ALLOC_WRITTEN, BCH_FS_REBUILD_REPLICAS, BCH_FS_HOLD_BTREE_WRITES, }; diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c index 0f9dfe37..ed7ca5b0 100644 --- a/libbcachefs/bkey.c +++ b/libbcachefs/bkey.c @@ -327,7 +327,7 @@ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, const struct bkey_packed *src) { - dst->k = bkey_unpack_key(b, src); + __bkey_unpack_key(b, &dst->k, src); memcpy_u64s(&dst->v, bkeyp_val(&b->format, src), diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index 5ef66aed..b26f4934 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -87,8 +87,8 @@ do { \ (u64 *) (_dst) < (u64 *) (_src) + \ ((struct bkey *) (_src))->u64s); \ \ - __memmove_u64s_down((_dst), (_src), \ - ((struct bkey *) (_src))->u64s); \ + memcpy_u64s_small((_dst), (_src), \ + ((struct bkey *) (_src))->u64s); \ } while (0) struct btree; diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index 32436ed5..19f13b7e 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -1565,11 +1565,13 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, * So we've got to search for start_of_range, then after the lookup iterate * past any extents that compare equal to the position we searched for. 
*/ +__flatten void bch2_btree_node_iter_init(struct btree_node_iter *iter, struct btree *b, struct bpos *search) { struct bset_tree *t; struct bkey_packed p, *packed_search = NULL; + struct btree_node_iter_set *pos = iter->data; EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0); bset_aux_tree_verify(b); @@ -1588,11 +1590,17 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, return; } - for_each_bset(b, t) - __bch2_btree_node_iter_push(iter, b, - bch2_bset_search(b, t, search, - packed_search, &p), - btree_bkey_last(b, t)); + for_each_bset(b, t) { + struct bkey_packed *k = bch2_bset_search(b, t, search, + packed_search, &p); + struct bkey_packed *end = btree_bkey_last(b, t); + + if (k != end) + *pos++ = (struct btree_node_iter_set) { + __btree_node_key_to_offset(b, k), + __btree_node_key_to_offset(b, end) + }; + } bch2_btree_node_iter_sort(iter, b); } diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 2d8f6379..41694951 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -83,9 +83,6 @@ static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp)) goto err; - memset(&b->data->csum, 0, sizeof b->data->csum); - b->data->flags = 0; - bc->used++; list_move(&b->list, &bc->freeable); return; diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index b6e286c3..591980d2 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1500,10 +1500,13 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, wbio->data = data; wbio->wbio.order = order; wbio->wbio.used_mempool = used_mempool; - wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META|REQ_FUA; + wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META; wbio->wbio.bio.bi_end_io = btree_node_write_endio; wbio->wbio.bio.bi_private = b; + if (b->level || !b->written) + wbio->wbio.bio.bi_opf |= REQ_FUA; + bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); /* diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index c817aeed..955a80ca 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -62,10 +62,10 @@ bool __bch2_compact_whiteouts(struct bch_fs *, struct btree *, enum compact_mode static inline unsigned should_compact_bset_lazy(struct btree *b, struct bset_tree *t) { - unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s); - unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set]; + unsigned total_u64s = bset_u64s(t); + unsigned dead_u64s = total_u64s - b->nr.bset_u64s[t - b->set]; - return dead_u64s > 128 && dead_u64s * 3 > bset_u64s; + return dead_u64s > 64 && dead_u64s * 3 > total_u64s; } static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 492f5e5b..5d4a2cb8 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -64,21 +64,9 @@ static inline int btree_iter_pos_cmp(struct btree_iter *iter, /* Btree node locking: */ -/* - * Updates the saved lock sequence number, so that bch2_btree_node_relock() will - * succeed: - */ void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) { - struct btree_iter *linked; - - EBUG_ON(iter->l[b->level].b != b); - EBUG_ON(iter->l[b->level].lock_seq + 1 != b->lock.state.seq); - - trans_for_each_iter_with_node(iter->trans, b, linked) - linked->l[b->level].lock_seq += 2; - - six_unlock_write(&b->lock); + bch2_btree_node_unlock_write_inlined(b, iter); } void __bch2_btree_node_lock_write(struct btree *b, struct 
btree_iter *iter) @@ -306,9 +294,7 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans) __flatten static bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) { - return iter->uptodate >= BTREE_ITER_NEED_RELOCK - ? btree_iter_get_locks(iter, false, trace) - : true; + return btree_iter_get_locks(iter, false, trace); } bool __bch2_btree_iter_upgrade(struct btree_iter *iter, @@ -513,6 +499,30 @@ static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); } +static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, + struct btree *b, + struct bkey_packed *where) +{ + struct btree_node_iter *node_iter = &iter->l[0].iter; + + if (where == bch2_btree_node_iter_peek_all(node_iter, b)) { + bkey_disassemble(b, where, &iter->k); + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + } +} + +void bch2_btree_iter_fix_key_modified(struct btree_iter *iter, + struct btree *b, + struct bkey_packed *where) +{ + struct btree_iter *linked; + + trans_for_each_iter_with_node(iter->trans, b, linked) { + __bch2_btree_iter_fix_key_modified(linked, b, where); + __bch2_btree_iter_verify(linked, b); + } +} + static void __bch2_btree_node_iter_fix(struct btree_iter *iter, struct btree *b, struct btree_node_iter *node_iter, @@ -939,7 +949,7 @@ static void btree_iter_prefetch(struct btree_iter *iter) btree_node_unlock(iter, iter->level); } -static inline int btree_iter_down(struct btree_iter *iter) +static __always_inline int btree_iter_down(struct btree_iter *iter) { struct bch_fs *c = iter->trans->c; struct btree_iter_level *l = &iter->l[iter->level]; @@ -948,7 +958,7 @@ static inline int btree_iter_down(struct btree_iter *iter) enum six_lock_type lock_type = __btree_lock_want(iter, level); BKEY_PADDED(k) tmp; - BUG_ON(!btree_node_locked(iter, iter->level)); + EBUG_ON(!btree_node_locked(iter, iter->level)); bch2_bkey_unpack(l->b, &tmp.k, bch2_btree_node_iter_peek(&l->iter, l->b)); @@ -1086,7 +1096,10 @@ static int btree_iter_traverse_one(struct btree_iter *iter) if (unlikely(iter->level >= BTREE_MAX_DEPTH)) return 0; - if (bch2_btree_iter_relock(iter, false)) + if (iter->uptodate == BTREE_ITER_NEED_RELOCK) + bch2_btree_iter_relock(iter, false); + + if (iter->uptodate < BTREE_ITER_NEED_RELOCK) return 0; /* diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 321baab5..4c503222 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -48,6 +48,11 @@ static inline int btree_iter_err(const struct btree_iter *iter) /* Iterate over iters within a transaction: */ +#define trans_for_each_iter_all(_trans, _iter) \ + for (_iter = (_trans)->iters; \ + _iter < (_trans)->iters + (_trans)->nr_iters; \ + _iter++) + static inline struct btree_iter * __trans_next_iter(struct btree_trans *trans, unsigned idx) { @@ -99,6 +104,8 @@ static inline void bch2_btree_iter_verify(struct btree_iter *iter, static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {} #endif +void bch2_btree_iter_fix_key_modified(struct btree_iter *, struct btree *, + struct bkey_packed *); void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, struct btree_node_iter *, struct bkey_packed *, unsigned, unsigned); diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index 592c3b4e..aaad2d28 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -203,6 +203,24 @@ static inline bool bch2_btree_node_relock(struct btree_iter *iter, __bch2_btree_node_relock(iter, 
level); } +/* + * Updates the saved lock sequence number, so that bch2_btree_node_relock() will + * succeed: + */ +static inline void +bch2_btree_node_unlock_write_inlined(struct btree *b, struct btree_iter *iter) +{ + struct btree_iter *linked; + + EBUG_ON(iter->l[b->level].b != b); + EBUG_ON(iter->l[b->level].lock_seq + 1 != b->lock.state.seq); + + trans_for_each_iter_with_node(iter->trans, b, linked) + linked->l[b->level].lock_seq += 2; + + six_unlock_write(&b->lock); +} + void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *); void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 96be31f8..efa68bb5 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -252,7 +252,6 @@ struct btree_insert_entry { struct btree_trans { struct bch_fs *c; unsigned long ip; - u64 commit_start; u64 iters_linked; u64 iters_live; @@ -280,12 +279,11 @@ struct btree_trans { struct disk_reservation *disk_res; unsigned flags; unsigned journal_u64s; + struct replicas_delta_list *fs_usage_deltas; struct btree_iter iters_onstack[2]; struct btree_insert_entry updates_onstack[6]; u8 updates_sorted_onstack[6]; - - struct replicas_delta_list *fs_usage_deltas; }; #define BTREE_FLAG(flag) \ @@ -417,6 +415,12 @@ static inline unsigned btree_bkey_first_offset(const struct bset_tree *t) __btree_node_offset_to_key(_b, (_t)->end_offset); \ }) +static inline unsigned bset_u64s(struct bset_tree *t) +{ + return t->end_offset - t->data_offset - + sizeof(struct bset) / sizeof(u64); +} + static inline unsigned bset_byte_offset(struct btree *b, void *i) { return i - (void *) b->data; @@ -457,19 +461,22 @@ static inline bool btree_node_is_extents(struct btree *b) return btree_node_type_is_extents(btree_node_type(b)); } +#define BTREE_NODE_TYPE_HAS_TRIGGERS \ + ((1U << BKEY_TYPE_EXTENTS)| \ + (1U << BKEY_TYPE_ALLOC)| \ + (1U << BKEY_TYPE_INODES)| \ + (1U << BKEY_TYPE_REFLINK)| \ + (1U << BKEY_TYPE_EC)| \ + (1U << BKEY_TYPE_BTREE)) + +#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ + ((1U << BKEY_TYPE_EXTENTS)| \ + (1U << BKEY_TYPE_INODES)| \ + (1U << BKEY_TYPE_REFLINK)) + static inline bool btree_node_type_needs_gc(enum btree_node_type type) { - switch (type) { - case BKEY_TYPE_ALLOC: - case BKEY_TYPE_BTREE: - case BKEY_TYPE_EXTENTS: - case BKEY_TYPE_INODES: - case BKEY_TYPE_EC: - case BKEY_TYPE_REFLINK: - return true; - default: - return false; - } + return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); } struct btree_root { diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index 49f4d24d..ad8cbf3f 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -27,7 +27,6 @@ enum { __BTREE_INSERT_JOURNAL_RESERVED, __BTREE_INSERT_NOMARK_OVERWRITES, __BTREE_INSERT_NOMARK, - __BTREE_INSERT_MARK_INMEM, __BTREE_INSERT_NO_CLEAR_REPLICAS, __BTREE_INSERT_BUCKET_INVALIDATE, __BTREE_INSERT_NOWAIT, @@ -68,9 +67,6 @@ enum { /* Don't call mark new key at all: */ #define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK) -/* Don't mark transactionally: */ -#define BTREE_INSERT_MARK_INMEM (1 << __BTREE_INSERT_MARK_INMEM) - #define BTREE_INSERT_NO_CLEAR_REPLICAS (1 << __BTREE_INSERT_NO_CLEAR_REPLICAS) #define BTREE_INSERT_BUCKET_INVALIDATE (1 << __BTREE_INSERT_BUCKET_INVALIDATE) @@ -97,9 +93,30 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, struct btree *, struct bkey_i_btree_ptr *); -int bch2_trans_commit(struct 
btree_trans *,
-		      struct disk_reservation *,
-		      u64 *, unsigned);
+int __bch2_trans_commit(struct btree_trans *);
+
+/**
+ * bch2_trans_commit - insert keys at given iterator positions
+ *
+ * This is the main entry point for btree updates.
+ *
+ * Return values:
+ * -EINTR: locking changed, this function should be called again. Only returned
+ *  if passed BTREE_INSERT_ATOMIC.
+ * -EROFS: filesystem read only
+ * -EIO: journal or btree node IO error
+ */
+static inline int bch2_trans_commit(struct btree_trans *trans,
+				    struct disk_reservation *disk_res,
+				    u64 *journal_seq,
+				    unsigned flags)
+{
+	trans->disk_res		= disk_res;
+	trans->journal_seq	= journal_seq;
+	trans->flags		= flags;
+
+	return __bch2_trans_commit(trans);
+}
 
 static inline void bch2_trans_update(struct btree_trans *trans,
 				     struct btree_iter *iter,
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index ec0de33f..9d5687ec 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -2187,6 +2187,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
 	bch2_bset_init_first(b, &b->data->keys);
 	bch2_btree_build_aux_trees(b);
 
+	b->data->flags = 0;
 	b->data->min_key = POS_MIN;
 	b->data->max_key = POS_MAX;
 	b->data->format = bch2_btree_calc_format(b);
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index e5156e90..c5a0ab5d 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -284,17 +284,17 @@ static inline unsigned btree_write_set_buffer(struct btree *b)
 static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
 						     struct btree *b)
 {
-	struct bset *i = btree_bset_last(b);
+	struct bset_tree *t = bset_tree_last(b);
 	struct btree_node_entry *bne = max(write_block(b),
 			(void *) btree_bkey_last(b, bset_tree_last(b)));
 	ssize_t remaining_space =
 		__bch_btree_u64s_remaining(c, b, &bne->keys.start[0]);
 
-	if (unlikely(bset_written(b, i))) {
+	if (unlikely(bset_written(b, bset(b, t)))) {
 		if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
 			return bne;
 	} else {
-		if (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
+		if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) &&
 		    remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
 			return bne;
 	}
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 2ee65a3e..051368cd 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -16,20 +16,16 @@
 #include "keylist.h"
 #include "replicas.h"
 
+#include <linux/prefetch.h>
 #include <linux/sort.h>
 #include <trace/events/bcachefs.h>
 
 static inline bool same_leaf_as_prev(struct btree_trans *trans,
-				     unsigned sorted_idx)
+				     unsigned idx)
 {
-	struct btree_insert_entry *i = trans->updates +
-		trans->updates_sorted[sorted_idx];
-	struct btree_insert_entry *prev = sorted_idx
-		? 
trans->updates + trans->updates_sorted[sorted_idx - 1] - : NULL; - - return prev && - i->iter->l[0].b == prev->iter->l[0].b; + return idx && + trans->updates[trans->updates_sorted[idx]].iter->l[0].b == + trans->updates[trans->updates_sorted[idx - 1]].iter->l[0].b; } #define trans_for_each_update_sorted(_trans, _i, _iter) \ @@ -55,23 +51,6 @@ inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, bch2_btree_init_next(c, b, iter); } -static void btree_trans_lock_write(struct btree_trans *trans, bool lock) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - unsigned iter; - - trans_for_each_update_sorted(trans, i, iter) { - if (same_leaf_as_prev(trans, iter)) - continue; - - if (lock) - bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter); - else - bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter); - } -} - static inline void btree_trans_sort_updates(struct btree_trans *trans) { struct btree_insert_entry *l, *r; @@ -92,8 +71,6 @@ static inline void btree_trans_sort_updates(struct btree_trans *trans) trans->updates_sorted[pos] = l - trans->updates; nr++; } - - BUG_ON(nr != trans->nr_updates); } /* Inserting into a given leaf node (last stage of insert): */ @@ -266,8 +243,8 @@ static void bch2_insert_fixup_key(struct btree_trans *trans, EBUG_ON(insert->k->k.u64s > bch_btree_keys_u64s_remaining(trans->c, l->b)); - if (bch2_btree_bset_insert_key(iter, l->b, &l->iter, - insert->k)) + if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter, + insert->k))) bch2_btree_journal_key(trans, iter, insert->k); } @@ -280,7 +257,8 @@ static void btree_insert_key_leaf(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter *iter = insert->iter; struct btree *b = iter->l[0].b; - int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); + struct bset_tree *t = bset_tree_last(b); + int old_u64s = bset_u64s(t); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; @@ -290,7 +268,7 @@ static void btree_insert_key_leaf(struct btree_trans *trans, bch2_insert_fixup_extent(trans, insert); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; - u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; + u64s_added = (int) bset_u64s(t) - old_u64s; if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); @@ -323,26 +301,12 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->iter->btree_id)); } -static int bch2_trans_journal_preres_get(struct btree_trans *trans) +static noinline int +bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - unsigned u64s = 0; int ret; - trans_for_each_update(trans, i) - if (0) - u64s += jset_u64s(i->k->k.u64s); - - if (!u64s) - return 0; - - ret = bch2_journal_preres_get(&c->journal, - &trans->journal_preres, u64s, - JOURNAL_RES_GET_NONBLOCK); - if (ret != -EAGAIN) - return ret; - bch2_trans_unlock(trans); ret = bch2_journal_preres_get(&c->journal, @@ -358,8 +322,8 @@ static int bch2_trans_journal_preres_get(struct btree_trans *trans) return 0; } -static int bch2_trans_journal_res_get(struct btree_trans *trans, - unsigned flags) +static inline int bch2_trans_journal_res_get(struct btree_trans *trans, + unsigned flags) { struct bch_fs *c = trans->c; int ret; @@ -397,13 +361,73 @@ btree_key_can_insert(struct btree_trans *trans, return BTREE_INSERT_OK; } -static int 
btree_trans_check_can_insert(struct btree_trans *trans, - struct btree_insert_entry **stopped_at) +static inline void do_btree_insert_one(struct btree_trans *trans, + struct btree_insert_entry *insert) { + btree_insert_key_leaf(trans, insert); +} + +static inline bool update_has_trans_triggers(struct btree_insert_entry *i) +{ + return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->iter->btree_id); +} + +static inline bool update_has_nontrans_triggers(struct btree_insert_entry *i) +{ + return (BTREE_NODE_TYPE_HAS_TRIGGERS & + ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS) & + (1U << i->iter->btree_id); +} + +static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter) +{ + __bch2_btree_iter_unlock(iter); +} + +static noinline void bch2_trans_mark_gc(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; struct btree_insert_entry *i; + unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE + ? BCH_BUCKET_MARK_BUCKET_INVALIDATE + : 0; + + if (unlikely(trans->flags & BTREE_INSERT_NOMARK)) + return; + + trans_for_each_update(trans, i) + if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) + bch2_mark_update(trans, i, NULL, + mark_flags|BCH_BUCKET_MARK_GC); +} + +static inline int +bch2_trans_commit_write_locked(struct btree_trans *trans, + struct btree_insert_entry **stopped_at) +{ + struct bch_fs *c = trans->c; + struct bch_fs_usage *fs_usage = NULL; + struct btree_insert_entry *i; + unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE + ? BCH_BUCKET_MARK_BUCKET_INVALIDATE + : 0; unsigned iter, u64s = 0; + bool marking = false; int ret; + if (race_fault()) { + trace_trans_restart_fault_inject(trans->ip); + return -EINTR; + } + + /* + * Check if the insert will fit in the leaf node with the write lock + * held, otherwise another thread could write the node changing the + * amount of space available: + */ + + prefetch(&trans->c->journal.flags); + trans_for_each_update_sorted(trans, i, iter) { /* Multiple inserts might go to same leaf: */ if (!same_leaf_as_prev(trans, iter)) @@ -415,70 +439,132 @@ static int btree_trans_check_can_insert(struct btree_trans *trans, *stopped_at = i; return ret; } + + if (btree_node_type_needs_gc(i->iter->btree_id)) + marking = true; } - return 0; -} + if (marking) { + percpu_down_read(&c->mark_lock); + fs_usage = bch2_fs_usage_scratch_get(c); + } -static inline void do_btree_insert_one(struct btree_trans *trans, - struct btree_insert_entry *insert) -{ - btree_insert_key_leaf(trans, insert); -} + /* + * Don't get journal reservation until after we know insert will + * succeed: + */ + if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + ret = bch2_trans_journal_res_get(trans, + JOURNAL_RES_GET_NONBLOCK); + if (ret) + goto err; + } -static inline bool update_triggers_transactional(struct btree_trans *trans, - struct btree_insert_entry *i) -{ - return likely(!(trans->flags & BTREE_INSERT_MARK_INMEM)) && - (i->iter->btree_id == BTREE_ID_EXTENTS || - i->iter->btree_id == BTREE_ID_INODES || - i->iter->btree_id == BTREE_ID_REFLINK); -} + /* + * Not allowed to fail after we've gotten our journal reservation - we + * have to use it: + */ -static inline bool update_has_triggers(struct btree_trans *trans, - struct btree_insert_entry *i) -{ - return likely(!(trans->flags & BTREE_INSERT_NOMARK)) && - btree_node_type_needs_gc(i->iter->btree_id); + if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { + if (journal_seq_verify(c)) + trans_for_each_update(trans, i) + i->k->k.version.lo = trans->journal_res.seq; + else if 
(inject_invalid_keys(c)) + trans_for_each_update(trans, i) + i->k->k.version = MAX_VERSION; + } + + /* Must be called under mark_lock: */ + if (marking && trans->fs_usage_deltas && + bch2_replicas_delta_list_apply(c, fs_usage, + trans->fs_usage_deltas)) { + ret = BTREE_INSERT_NEED_MARK_REPLICAS; + goto err; + } + + trans_for_each_update(trans, i) + if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && + update_has_nontrans_triggers(i)) + bch2_mark_update(trans, i, fs_usage, mark_flags); + + if (marking) + bch2_trans_fs_usage_apply(trans, fs_usage); + + if (unlikely(c->gc_pos.phase)) + bch2_trans_mark_gc(trans); + + trans_for_each_update(trans, i) + do_btree_insert_one(trans, i); +err: + if (marking) { + bch2_fs_usage_scratch_put(c, fs_usage); + percpu_up_read(&c->mark_lock); + } + + return ret; } /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ -static inline int do_btree_insert_at(struct btree_trans *trans, - struct btree_insert_entry **stopped_at) +static inline int do_bch2_trans_commit(struct btree_trans *trans, + struct btree_insert_entry **stopped_at) { - struct bch_fs *c = trans->c; - struct bch_fs_usage *fs_usage = NULL; struct btree_insert_entry *i; struct btree_iter *iter; - unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE - ? BCH_BUCKET_MARK_BUCKET_INVALIDATE - : 0; + unsigned idx, u64s, journal_preres_u64s = 0; int ret; - trans_for_each_update(trans, i) - BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); - /* * note: running triggers will append more updates to the list of * updates as we're walking it: */ - trans_for_each_update(trans, i) - if (update_has_triggers(trans, i) && - update_triggers_transactional(trans, i)) { - ret = bch2_trans_mark_update(trans, i->iter, i->k); - if (ret == -EINTR) - trace_trans_restart_mark(trans->ip); - if (ret) - goto out_clear_replicas; + trans_for_each_update(trans, i) { + /* we know trans->nounlock won't be set here: */ + if (unlikely(!(i->iter->locks_want < 1 + ? 
__bch2_btree_iter_upgrade(i->iter, 1) + : i->iter->uptodate <= BTREE_ITER_NEED_PEEK))) { + trace_trans_restart_upgrade(trans->ip); + return -EINTR; } - trans_for_each_iter(trans, iter) { + if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && + update_has_trans_triggers(i)) { + ret = bch2_trans_mark_update(trans, i->iter, i->k); + if (unlikely(ret)) { + if (ret == -EINTR) + trace_trans_restart_mark(trans->ip); + return ret; + } + } + + u64s = jset_u64s(i->k->k.u64s); + if (0) + journal_preres_u64s += u64s; + trans->journal_u64s += u64s; + } + + ret = bch2_journal_preres_get(&trans->c->journal, + &trans->journal_preres, journal_preres_u64s, + JOURNAL_RES_GET_NONBLOCK); + if (unlikely(ret == -EAGAIN)) + ret = bch2_trans_journal_preres_get_cold(trans, + journal_preres_u64s); + if (unlikely(ret)) + return ret; + + /* + * Can't be holding any read locks when we go to take write locks: + * + * note - this must be done after bch2_trans_journal_preres_get_cold() + * or anything else that might call bch2_trans_relock(), since that + * would just retake the read locks: + */ + trans_for_each_iter_all(trans, iter) { if (iter->nodes_locked != iter->nodes_intent_locked) { - BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); - BUG_ON(trans->iters_live & (1ULL << iter->idx)); - __bch2_btree_iter_unlock(iter); + EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); + EBUG_ON(trans->iters_live & (1ULL << iter->idx)); + bch2_btree_iter_unlock_noinline(iter); } } @@ -493,106 +579,41 @@ static inline int do_btree_insert_at(struct btree_trans *trans, */ btree_trans_sort_updates(trans); - btree_trans_lock_write(trans, true); + trans_for_each_update_sorted(trans, i, idx) + if (!same_leaf_as_prev(trans, idx)) + bch2_btree_node_lock_for_insert(trans->c, + i->iter->l[0].b, i->iter); - if (race_fault()) { - ret = -EINTR; - trace_trans_restart_fault_inject(trans->ip); - goto out; - } + ret = bch2_trans_commit_write_locked(trans, stopped_at); + + trans_for_each_update_sorted(trans, i, idx) + if (!same_leaf_as_prev(trans, idx)) + bch2_btree_node_unlock_write_inlined(i->iter->l[0].b, + i->iter); /* - * Check if the insert will fit in the leaf node with the write lock - * held, otherwise another thread could write the node changing the - * amount of space available: + * Drop journal reservation after dropping write locks, since dropping + * the journal reservation may kick off a journal write: */ - ret = btree_trans_check_can_insert(trans, stopped_at); - if (ret) - goto out; + bch2_journal_res_put(&trans->c->journal, &trans->journal_res); - trans_for_each_update(trans, i) { - if (!btree_node_type_needs_gc(i->iter->btree_id)) - continue; + if (unlikely(ret)) + return ret; - if (!fs_usage) { - percpu_down_read(&c->mark_lock); - fs_usage = bch2_fs_usage_scratch_get(c); - } + if (trans->flags & BTREE_INSERT_NOUNLOCK) + trans->nounlock = true; - if (!bch2_bkey_replicas_marked_locked(c, - bkey_i_to_s_c(i->k), true)) { - ret = BTREE_INSERT_NEED_MARK_REPLICAS; - goto out; - } - } + trans_for_each_update_sorted(trans, i, idx) + if (!same_leaf_as_prev(trans, idx)) + bch2_foreground_maybe_merge(trans->c, i->iter, + 0, trans->flags); - /* - * Don't get journal reservation until after we know insert will - * succeed: - */ - if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { - trans->journal_u64s = 0; - - trans_for_each_update(trans, i) - trans->journal_u64s += jset_u64s(i->k->k.u64s); - - ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_NONBLOCK); - if (ret) - goto out; - } - - if (!(trans->flags & 
BTREE_INSERT_JOURNAL_REPLAY)) { - if (journal_seq_verify(c)) - trans_for_each_update(trans, i) - i->k->k.version.lo = trans->journal_res.seq; - else if (inject_invalid_keys(c)) - trans_for_each_update(trans, i) - i->k->k.version = MAX_VERSION; - } + trans->nounlock = false; trans_for_each_update(trans, i) - if (update_has_triggers(trans, i) && - !update_triggers_transactional(trans, i)) - bch2_mark_update(trans, i, fs_usage, mark_flags); + bch2_btree_iter_downgrade(i->iter); - if (fs_usage && trans->fs_usage_deltas) - bch2_replicas_delta_list_apply(c, fs_usage, - trans->fs_usage_deltas); - - if (fs_usage) - bch2_trans_fs_usage_apply(trans, fs_usage); - - if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && - unlikely(c->gc_pos.phase)) - trans_for_each_update(trans, i) - if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) - bch2_mark_update(trans, i, NULL, - mark_flags| - BCH_BUCKET_MARK_GC); - - trans_for_each_update(trans, i) - do_btree_insert_one(trans, i); -out: - BUG_ON(ret && - (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) && - trans->journal_res.ref); - - btree_trans_lock_write(trans, false); - - if (fs_usage) { - bch2_fs_usage_scratch_put(c, fs_usage); - percpu_up_read(&c->mark_lock); - } - - bch2_journal_res_put(&c->journal, &trans->journal_res); -out_clear_replicas: - if (trans->fs_usage_deltas) { - memset(&trans->fs_usage_deltas->fs_usage, 0, - sizeof(trans->fs_usage_deltas->fs_usage)); - trans->fs_usage_deltas->used = 0; - } - - return ret; + return 0; } static noinline @@ -700,66 +721,27 @@ int bch2_trans_commit_error(struct btree_trans *trans, return ret; } -/** - * __bch_btree_insert_at - insert keys at given iterator positions - * - * This is main entry point for btree updates. - * - * Return values: - * -EINTR: locking changed, this function should be called again. Only returned - * if passed BTREE_INSERT_ATOMIC. 
- * -EROFS: filesystem read only - * -EIO: journal or btree node IO error - */ -static int __bch2_trans_commit(struct btree_trans *trans, - struct btree_insert_entry **stopped_at) +static noinline int +bch2_trans_commit_get_rw_cold(struct btree_trans *trans) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - unsigned iter; int ret; - trans_for_each_update(trans, i) { - if (!bch2_btree_iter_upgrade(i->iter, 1)) { - trace_trans_restart_upgrade(trans->ip); - ret = -EINTR; - goto err; - } + if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) + return -EROFS; - ret = btree_iter_err(i->iter); - if (ret) - goto err; - } + bch2_trans_unlock(trans); - ret = do_btree_insert_at(trans, stopped_at); - if (unlikely(ret)) - goto err; + ret = bch2_fs_read_write_early(c); + if (ret) + return ret; - if (trans->flags & BTREE_INSERT_NOUNLOCK) - trans->nounlock = true; - - trans_for_each_update_sorted(trans, i, iter) - if (!same_leaf_as_prev(trans, iter)) - bch2_foreground_maybe_merge(c, i->iter, - 0, trans->flags); - - trans->nounlock = false; - - trans_for_each_update(trans, i) - bch2_btree_iter_downgrade(i->iter); -err: - /* make sure we didn't drop or screw up locks: */ - bch2_btree_trans_verify_locks(trans); - - return ret; + percpu_ref_get(&c->writes); + return 0; } -int bch2_trans_commit(struct btree_trans *trans, - struct disk_reservation *disk_res, - u64 *journal_seq, - unsigned flags) +int __bch2_trans_commit(struct btree_trans *trans) { - struct bch_fs *c = trans->c; struct btree_insert_entry *i = NULL; struct btree_iter *iter; unsigned orig_nr_updates = trans->nr_updates; @@ -770,61 +752,46 @@ int bch2_trans_commit(struct btree_trans *trans, goto out_noupdates; /* for the sake of sanity: */ - BUG_ON(trans->nr_updates > 1 && !(flags & BTREE_INSERT_ATOMIC)); + EBUG_ON(trans->nr_updates > 1 && !(trans->flags & BTREE_INSERT_ATOMIC)); - if (flags & BTREE_INSERT_GC_LOCK_HELD) - lockdep_assert_held(&c->gc_lock); + if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) + lockdep_assert_held(&trans->c->gc_lock); - if (!trans->commit_start) - trans->commit_start = local_clock(); - - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); - trans->disk_res = disk_res; - trans->journal_seq = journal_seq; - trans->flags = flags; - if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) && - !percpu_ref_tryget(&c->writes))) { - if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) - return -EROFS; - - bch2_trans_unlock(trans); - - ret = bch2_fs_read_write_early(c); + if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && + unlikely(!percpu_ref_tryget(&trans->c->writes))) { + ret = bch2_trans_commit_get_rw_cold(trans); if (ret) return ret; - - percpu_ref_get(&c->writes); - - if (!bch2_trans_relock(trans)) { - ret = -EINTR; - goto err; - } } retry: - ret = bch2_trans_journal_preres_get(trans); - if (ret) - goto err; + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + trans->journal_u64s = 0; + + ret = do_bch2_trans_commit(trans, &i); + + if (trans->fs_usage_deltas) { + trans->fs_usage_deltas->used = 0; + memset(&trans->fs_usage_deltas->memset_start, 0, + (void *) &trans->fs_usage_deltas->memset_end - + (void *) &trans->fs_usage_deltas->memset_start); + } + + /* make sure we didn't drop or screw up locks: */ + bch2_btree_trans_verify_locks(trans); - ret = __bch2_trans_commit(trans, &i); if (ret) goto err; out: - bch2_journal_preres_put(&c->journal, &trans->journal_preres); + bch2_journal_preres_put(&trans->c->journal, 
&trans->journal_preres); - if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) - percpu_ref_put(&c->writes); + if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) + percpu_ref_put(&trans->c->writes); out_noupdates: - if (!ret && trans->commit_start) { - bch2_time_stats_update(&c->times[BCH_TIME_btree_update], - trans->commit_start); - trans->commit_start = 0; - } + EBUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); - BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); - - trans_for_each_iter(trans, iter) + trans_for_each_iter_all(trans, iter) iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; if (!ret) { @@ -838,18 +805,16 @@ out_noupdates: err: ret = bch2_trans_commit_error(trans, i, ret); - /* free updates and memory used by triggers, they'll be reexecuted: */ - trans->nr_updates = orig_nr_updates; - trans->mem_top = orig_mem_top; - /* can't loop if it was passed in and we changed it: */ if (unlikely(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS) && !ret) ret = -EINTR; + if (ret) + goto out; - if (!ret) - goto retry; - - goto out; + /* free updates and memory used by triggers, they'll be reexecuted: */ + trans->nr_updates = orig_nr_updates; + trans->mem_top = orig_mem_top; + goto retry; } /** diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 8481c707..c4183982 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -499,14 +499,18 @@ void bch2_dev_usage_from_buckets(struct bch_fs *c) } } -static inline void update_replicas(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - struct bch_replicas_entry *r, - s64 sectors) +static inline int update_replicas(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + struct bch_replicas_entry *r, + s64 sectors) { int idx = bch2_replicas_entry_idx(c, r); - BUG_ON(idx < 0); + if (idx < 0) + return -1; + + if (!fs_usage) + return 0; switch (r->data_type) { case BCH_DATA_BTREE: @@ -520,6 +524,7 @@ static inline void update_replicas(struct bch_fs *c, break; } fs_usage->replicas[idx] += sectors; + return 0; } static inline void update_cached_sectors(struct bch_fs *c, @@ -579,23 +584,41 @@ static inline void update_cached_sectors_list(struct btree_trans *trans, update_replicas_list(trans, &r.e, sectors); } -void bch2_replicas_delta_list_apply(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - struct replicas_delta_list *r) +static inline struct replicas_delta * +replicas_delta_next(struct replicas_delta *d) +{ + return (void *) d + replicas_entry_bytes(&d->r) + 8; +} + +int bch2_replicas_delta_list_apply(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + struct replicas_delta_list *r) { struct replicas_delta *d = r->d; struct replicas_delta *top = (void *) r->d + r->used; + unsigned i; - acc_u64s((u64 *) fs_usage, - (u64 *) &r->fs_usage, sizeof(*fs_usage) / sizeof(u64)); + for (d = r->d; d != top; d = replicas_delta_next(d)) + if (update_replicas(c, fs_usage, &d->r, d->delta)) { + top = d; + goto unwind; + } - while (d != top) { - BUG_ON((void *) d > (void *) top); + if (!fs_usage) + return 0; - update_replicas(c, fs_usage, &d->r, d->delta); + fs_usage->nr_inodes += r->nr_inodes; - d = (void *) d + replicas_entry_bytes(&d->r) + 8; + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + fs_usage->reserved += r->persistent_reserved[i]; + fs_usage->persistent_reserved[i] += r->persistent_reserved[i]; } + + return 0; +unwind: + for (d = r->d; d != top; d = replicas_delta_next(d)) + update_replicas(c, fs_usage, &d->r, -d->delta); + return -1; } #define do_mark_fn(fn, c, pos, flags, ...) 
\ @@ -1451,7 +1474,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, if (ret < 0) return ret; - if (!ret) { + if (!ret && unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags))) { /* * During journal replay, and if gc repairs alloc info at * runtime, the alloc info in the btree might not be up to date @@ -1739,9 +1762,9 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, d = replicas_deltas_realloc(trans, 0); if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) - d->fs_usage.nr_inodes++; + d->nr_inodes++; else - d->fs_usage.nr_inodes--; + d->nr_inodes--; return 0; case KEY_TYPE_reservation: { unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; @@ -1750,10 +1773,9 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, sectors *= replicas; replicas = clamp_t(unsigned, replicas, 1, - ARRAY_SIZE(d->fs_usage.persistent_reserved)); + ARRAY_SIZE(d->persistent_reserved)); - d->fs_usage.reserved += sectors; - d->fs_usage.persistent_reserved[replicas - 1] += sectors; + d->persistent_reserved[replicas - 1] += sectors; return 0; } case KEY_TYPE_reflink_p: diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 8ab18b55..ad6f731b 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -279,9 +279,9 @@ int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *, struct bch_fs_usage *, unsigned); -void bch2_replicas_delta_list_apply(struct bch_fs *, - struct bch_fs_usage *, - struct replicas_delta_list *); +int bch2_replicas_delta_list_apply(struct bch_fs *, + struct bch_fs_usage *, + struct replicas_delta_list *); int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, unsigned, s64, unsigned); int bch2_trans_mark_update(struct btree_trans *, diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 94bd9da3..f3ff4a18 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -102,7 +102,11 @@ struct replicas_delta { struct replicas_delta_list { unsigned size; unsigned used; - struct bch_fs_usage fs_usage; + + struct {} memset_start; + u64 nr_inodes; + u64 persistent_reserved[BCH_REPLICAS_MAX]; + struct {} memset_end; struct replicas_delta d[0]; }; diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 34eb70ce..4cc2a4b1 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -806,119 +806,6 @@ bool bch2_cut_back(struct bpos where, struct bkey *k) return true; } -static bool extent_i_save(struct btree *b, struct bkey_packed *dst, - struct bkey_i *src) -{ - struct bkey_format *f = &b->format; - struct bkey_i *dst_unpacked; - struct bkey_packed tmp; - - if ((dst_unpacked = packed_to_bkey(dst))) - dst_unpacked->k = src->k; - else if (bch2_bkey_pack_key(&tmp, &src->k, f)) - memcpy_u64s(dst, &tmp, f->key_u64s); - else - return false; - - memcpy_u64s(bkeyp_val(f, dst), &src->v, bkey_val_u64s(&src->k)); - return true; -} - -static bool bch2_extent_merge_inline(struct bch_fs *, - struct btree_iter *, - struct bkey_packed *, - struct bkey_packed *, - bool); - -static void verify_extent_nonoverlapping(struct bch_fs *c, - struct btree *b, - struct btree_node_iter *_iter, - struct bkey_i *insert) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct btree_node_iter iter; - struct bkey_packed *k; - struct bkey uk; - - if (!expensive_debug_checks(c)) - return; - - iter = *_iter; - k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); - BUG_ON(k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(uk.p, 
bkey_start_pos(&insert->k)) > 0)); - - iter = *_iter; - k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); -#if 0 - BUG_ON(k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); -#else - if (k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { - char buf1[100]; - char buf2[100]; - - bch2_bkey_to_text(&PBUF(buf1), &insert->k); - bch2_bkey_to_text(&PBUF(buf2), &uk); - - bch2_dump_btree_node(b); - panic("insert > next :\n" - "insert %s\n" - "next %s\n", - buf1, buf2); - } -#endif - -#endif -} - -static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, - struct bkey_i *insert) -{ - struct btree_iter_level *l = &iter->l[0]; - struct btree_node_iter node_iter; - struct bkey_packed *k; - - BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); - - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - verify_extent_nonoverlapping(c, l->b, &l->iter, insert); - - if (debug_check_bkeys(c)) - bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert)); - - node_iter = l->iter; - k = bch2_btree_node_iter_prev_filter(&node_iter, l->b, KEY_TYPE_discard); - if (k && !bkey_written(l->b, k) && - bch2_extent_merge_inline(c, iter, k, bkey_to_packed(insert), true)) - return; - - node_iter = l->iter; - k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, KEY_TYPE_discard); - if (k && !bkey_written(l->b, k) && - bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), k, false)) - return; - - /* - * may have skipped past some deleted extents greater than the insert - * key, before we got to a non deleted extent and knew we could bail out - * rewind the iterator a bit if necessary: - */ - node_iter = l->iter; - while ((k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) && - bkey_cmp_left_packed(l->b, k, &insert->k.p) > 0) - l->iter = node_iter; - - k = bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); - - bch2_bset_insert(l->b, &l->iter, k, insert, 0); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); -} - static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -1126,6 +1013,71 @@ bch2_extent_can_insert(struct btree_trans *trans, return BTREE_INSERT_OK; } +static void verify_extent_nonoverlapping(struct bch_fs *c, + struct btree *b, + struct btree_node_iter *_iter, + struct bkey_i *insert) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct btree_node_iter iter; + struct bkey_packed *k; + struct bkey uk; + + if (!expensive_debug_checks(c)) + return; + + iter = *_iter; + k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); + BUG_ON(k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); + + iter = *_iter; + k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); +#if 0 + BUG_ON(k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); +#else + if (k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { + char buf1[100]; + char buf2[100]; + + bch2_bkey_to_text(&PBUF(buf1), &insert->k); + bch2_bkey_to_text(&PBUF(buf2), &uk); + + bch2_dump_btree_node(b); + panic("insert > next :\n" + "insert %s\n" + "next %s\n", + buf1, buf2); + } +#endif + +#endif +} + +static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, + struct bkey_i *insert) +{ + struct btree_iter_level *l = &iter->l[0]; + struct bkey_packed *k = + bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); + + 
BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); + + EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); + verify_extent_nonoverlapping(c, l->b, &l->iter, insert); + + if (debug_check_bkeys(c)) + bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert)); + + bch2_bset_insert(l->b, &l->iter, k, insert, 0); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); +} + static void extent_squash(struct bch_fs *c, struct btree_iter *iter, struct bkey_i *insert, @@ -1140,8 +1092,7 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, __bch2_cut_front(insert->k.p, k); EBUG_ON(bkey_deleted(k.k)); extent_save(l->b, _k, k.k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); + bch2_btree_iter_fix_key_modified(iter, l->b, _k); break; case BCH_EXTENT_OVERLAP_BACK: @@ -1176,8 +1127,7 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, _k, u64s, 0); } else { extent_save(l->b, _k, k.k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); + bch2_btree_iter_fix_key_modified(iter, l->b, _k); } break; @@ -1207,8 +1157,7 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, __bch2_cut_front(insert->k.p, k); BUG_ON(bkey_deleted(k.k)); extent_save(l->b, _k, k.k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); + bch2_btree_iter_fix_key_modified(iter, l->b, _k); extent_bset_insert(c, iter, &split.k); break; @@ -1216,85 +1165,6 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, } } -struct extent_insert_state { - struct bkey_i whiteout; - bool update_journal; - bool update_btree; - bool deleting; -}; - -static void __bch2_insert_fixup_extent(struct bch_fs *c, - struct btree_iter *iter, - struct bkey_i *insert, - struct extent_insert_state *s) -{ - struct btree_iter_level *l = &iter->l[0]; - struct bkey_packed *_k; - struct bkey unpacked; - - while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, - KEY_TYPE_discard))) { - struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); - struct bpos cur_end = bpos_min(insert->k.p, k.k->p); - enum bch_extent_overlap overlap = - bch2_extent_overlap(&insert->k, k.k); - - if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) - break; - - if (!bkey_whiteout(k.k)) - s->update_journal = true; - - if (!s->update_journal) { - bch2_cut_front(cur_end, insert); - bch2_cut_front(cur_end, &s->whiteout); - bch2_btree_iter_set_pos_same_leaf(iter, cur_end); - goto next; - } - - /* - * When deleting, if possible just do it by switching the type - * of the key we're deleting, instead of creating and inserting - * a new whiteout: - */ - if (s->deleting && - !s->update_btree && - !bkey_cmp(insert->k.p, k.k->p) && - !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { - if (!bkey_whiteout(k.k)) { - btree_account_key_drop(l->b, _k); - _k->type = KEY_TYPE_discard; - reserve_whiteout(l->b, _k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); - } - break; - } - - if (k.k->needs_whiteout || bkey_written(l->b, _k)) { - insert->k.needs_whiteout = true; - s->update_btree = true; - } - - if (s->update_btree && - overlap == BCH_EXTENT_OVERLAP_ALL && - bkey_whiteout(k.k) && - k.k->needs_whiteout) { - unreserve_whiteout(l->b, _k); - _k->needs_whiteout = false; - } - - extent_squash(c, iter, insert, _k, k, overlap); - - if (!s->update_btree) - bch2_cut_front(cur_end, insert); -next: - if (overlap == BCH_EXTENT_OVERLAP_FRONT || - overlap == BCH_EXTENT_OVERLAP_MIDDLE) - break; - } -} - /** * bch_extent_insert_fixup - insert a new extent 
and deal with overlaps * @@ -1335,30 +1205,96 @@ next: * key insertion needs to continue/be retried. */ void bch2_insert_fixup_extent(struct btree_trans *trans, - struct btree_insert_entry *insert) + struct btree_insert_entry *insert_entry) { struct bch_fs *c = trans->c; - struct btree_iter *iter = insert->iter; - struct extent_insert_state s = { - .whiteout = *insert->k, - .update_journal = !bkey_whiteout(&insert->k->k), - .update_btree = !bkey_whiteout(&insert->k->k), - .deleting = bkey_whiteout(&insert->k->k), - }; + struct btree_iter *iter = insert_entry->iter; + struct bkey_i *insert = insert_entry->k; + struct btree_iter_level *l = &iter->l[0]; + struct btree_node_iter node_iter = l->iter; + bool deleting = bkey_whiteout(&insert->k); + bool update_journal = !deleting; + bool update_btree = !deleting; + struct bkey_i whiteout = *insert; + struct bkey_packed *_k; + struct bkey unpacked; BKEY_PADDED(k) tmp; EBUG_ON(iter->level); - EBUG_ON(!insert->k->k.size); - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); + EBUG_ON(!insert->k.size); + EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); - __bch2_insert_fixup_extent(c, iter, insert->k, &s); + while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, + KEY_TYPE_discard))) { + struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); + struct bpos cur_end = bpos_min(insert->k.p, k.k->p); + enum bch_extent_overlap overlap = + bch2_extent_overlap(&insert->k, k.k); - bch2_btree_iter_set_pos_same_leaf(iter, insert->k->k.p); + if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) + break; - if (s.update_btree) { - bkey_copy(&tmp.k, insert->k); + if (!bkey_whiteout(k.k)) + update_journal = true; - if (s.deleting) + if (!update_journal) { + bch2_cut_front(cur_end, insert); + bch2_cut_front(cur_end, &whiteout); + bch2_btree_iter_set_pos_same_leaf(iter, cur_end); + goto next; + } + + /* + * When deleting, if possible just do it by switching the type + * of the key we're deleting, instead of creating and inserting + * a new whiteout: + */ + if (deleting && + !update_btree && + !bkey_cmp(insert->k.p, k.k->p) && + !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { + if (!bkey_whiteout(k.k)) { + btree_account_key_drop(l->b, _k); + _k->type = KEY_TYPE_discard; + reserve_whiteout(l->b, _k); + bch2_btree_iter_fix_key_modified(iter, + l->b, _k); + } + break; + } + + if (k.k->needs_whiteout || bkey_written(l->b, _k)) { + insert->k.needs_whiteout = true; + update_btree = true; + } + + if (update_btree && + overlap == BCH_EXTENT_OVERLAP_ALL && + bkey_whiteout(k.k) && + k.k->needs_whiteout) { + unreserve_whiteout(l->b, _k); + _k->needs_whiteout = false; + } + + extent_squash(c, iter, insert, _k, k, overlap); + + if (!update_btree) + bch2_cut_front(cur_end, insert); +next: + node_iter = l->iter; + + if (overlap == BCH_EXTENT_OVERLAP_FRONT || + overlap == BCH_EXTENT_OVERLAP_MIDDLE) + break; + } + + l->iter = node_iter; + bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p); + + if (update_btree) { + bkey_copy(&tmp.k, insert); + + if (deleting) tmp.k.k.type = KEY_TYPE_discard; EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); @@ -1366,10 +1302,10 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, extent_bset_insert(c, iter, &tmp.k); } - if (s.update_journal) { - bkey_copy(&tmp.k, !s.deleting ? insert->k : &s.whiteout); + if (update_journal) { + bkey_copy(&tmp.k, !deleting ? 
insert : &whiteout); - if (s.deleting) + if (deleting) tmp.k.k.type = KEY_TYPE_discard; EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); @@ -1377,7 +1313,7 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, bch2_btree_journal_key(trans, iter, &tmp.k); } - bch2_cut_front(insert->k->k.p, insert->k); + bch2_cut_front(insert->k.p, insert); } const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) @@ -1485,8 +1421,8 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst, #undef set_common_fields } -static void bch2_extent_crc_append(struct bkey_i *k, - struct bch_extent_crc_unpacked new) +void bch2_extent_crc_append(struct bkey_i *k, + struct bch_extent_crc_unpacked new) { struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); union bch_extent_crc *crc = (void *) ptrs.end; @@ -1519,8 +1455,8 @@ static inline void __extent_entry_insert(struct bkey_i *k, { union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); - memmove_u64s_up((u64 *) dst + extent_entry_u64s(new), - dst, (u64 *) end - (u64 *) dst); + memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), + dst, (u64 *) end - (u64 *) dst); k->k.u64s += extent_entry_u64s(new); memcpy(dst, new, extent_entry_bytes(new)); } @@ -1717,93 +1653,6 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, return BCH_MERGE_MERGE; } -/* - * When merging an extent that we're inserting into a btree node, the new merged - * extent could overlap with an existing 0 size extent - if we don't fix that, - * it'll break the btree node iterator so this code finds those 0 size extents - * and shifts them out of the way. - * - * Also unpacks and repacks. - */ -static bool bch2_extent_merge_inline(struct bch_fs *c, - struct btree_iter *iter, - struct bkey_packed *l, - struct bkey_packed *r, - bool back_merge) -{ - struct btree *b = iter->l[0].b; - struct btree_node_iter *node_iter = &iter->l[0].iter; - BKEY_PADDED(k) li, ri; - struct bkey_packed *m = back_merge ? l : r; - struct bkey_i *mi = back_merge ? 
&li.k : &ri.k; - struct bset_tree *t = bch2_bkey_to_bset(b, m); - enum merge_result ret; - - EBUG_ON(bkey_written(b, m)); - - if (bkey_val_u64s(l) > BKEY_EXTENT_VAL_U64s_MAX || - bkey_val_u64s(r) > BKEY_EXTENT_VAL_U64s_MAX) - return BCH_MERGE_NOMERGE; - - /* - * We need to save copies of both l and r, because we might get a - * partial merge (which modifies both) and then fails to repack - */ - bch2_bkey_unpack(b, &li.k, l); - bch2_bkey_unpack(b, &ri.k, r); - - ret = bch2_bkey_merge(c, - bkey_i_to_s(&li.k), - bkey_i_to_s(&ri.k)); - if (ret == BCH_MERGE_NOMERGE) - return false; - - if (debug_check_bkeys(c)) - bch2_bkey_debugcheck(c, b, bkey_i_to_s_c(&li.k)); - if (debug_check_bkeys(c) && - ret == BCH_MERGE_PARTIAL) - bch2_bkey_debugcheck(c, b, bkey_i_to_s_c(&ri.k)); - - /* - * check if we overlap with deleted extents - would break the sort - * order: - */ - if (back_merge) { - struct bkey_packed *n = bkey_next(m); - - if (n != btree_bkey_last(b, t) && - bkey_cmp_left_packed(b, n, &li.k.k.p) <= 0 && - bkey_deleted(n)) - return false; - } else if (ret == BCH_MERGE_MERGE) { - struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); - - if (prev && - bkey_cmp_left_packed_byval(b, prev, - bkey_start_pos(&li.k.k)) > 0) - return false; - } - - if (ret == BCH_MERGE_PARTIAL) { - if (!extent_i_save(b, m, mi)) - return false; - - if (!back_merge) - bkey_copy(packed_to_bkey(l), &li.k); - else - bkey_copy(packed_to_bkey(r), &ri.k); - } else { - if (!extent_i_save(b, m, &li.k)) - return false; - } - - bch2_bset_fix_invalidated_key(b, m); - bch2_btree_node_iter_fix(iter, b, node_iter, - m, m->u64s, m->u64s); - - return ret == BCH_MERGE_MERGE; -} - bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, unsigned nr_replicas) { diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 67abc3c8..cc7ee906 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -508,6 +508,8 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k) __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ extent_entry_last(_e), _ptr, _entry) +void bch2_extent_crc_append(struct bkey_i *, + struct bch_extent_crc_unpacked); void bch2_extent_ptr_decoded_append(struct bkey_i *, struct extent_ptr_decoded *); diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 770fed19..90a9bfa4 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -45,7 +45,7 @@ struct bch_writepage_io { }; struct dio_write { - struct closure cl; + struct completion done; struct kiocb *req; struct mm_struct *mm; unsigned loop:1, @@ -483,7 +483,12 @@ static void bch2_set_page_dirty(struct bch_fs *c, unsigned sectors = sectors_to_reserve(&s->s[i], res->disk.nr_replicas); - BUG_ON(sectors > res->disk.sectors); + /* + * This can happen if we race with the error path in + * bch2_writepage_io_done(): + */ + sectors = min_t(unsigned, sectors, res->disk.sectors); + s->s[i].replicas_reserved += sectors; res->disk.sectors -= sectors; @@ -1204,6 +1209,7 @@ do_io: if (w->io && (w->io->op.res.nr_replicas != nr_replicas_this_write || bio_full(&w->io->op.wbio.bio) || + w->io->op.wbio.bio.bi_iter.bi_size >= (256U << 20) || bio_end_sector(&w->io->op.wbio.bio) != sector)) bch2_writepage_do_io(w); @@ -1726,8 +1732,6 @@ start: /* O_DIRECT writes */ -static void bch2_dio_write_loop_async(struct closure *); - static long bch2_dio_write_loop(struct dio_write *dio) { bool kthread = (current->flags & PF_KTHREAD) != 0; @@ -1747,9 +1751,6 @@ static long bch2_dio_write_loop(struct dio_write *dio) if (dio->loop) goto loop; - 
inode_dio_begin(&inode->v); - __pagecache_block_get(&mapping->add_lock); - /* Write and invalidate pagecache range that we're writing to: */ offset = req->ki_pos + (dio->op.written << 9); ret = write_invalidate_inode_pages_range(mapping, @@ -1801,8 +1802,6 @@ static long bch2_dio_write_loop(struct dio_write *dio) task_io_account_write(bio->bi_iter.bi_size); - closure_call(&dio->op.cl, bch2_write, NULL, &dio->cl); - if (!dio->sync && !dio->loop && dio->iter.count) { struct iovec *iov = dio->inline_vecs; @@ -1810,8 +1809,8 @@ static long bch2_dio_write_loop(struct dio_write *dio) iov = kmalloc(dio->iter.nr_segs * sizeof(*iov), GFP_KERNEL); if (unlikely(!iov)) { - dio->op.error = -ENOMEM; - goto err_wait_io; + dio->sync = true; + goto do_io; } dio->free_iov = true; @@ -1820,15 +1819,14 @@ static long bch2_dio_write_loop(struct dio_write *dio) memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov)); dio->iter.iov = iov; } -err_wait_io: +do_io: dio->loop = true; + closure_call(&dio->op.cl, bch2_write, NULL, NULL); - if (!dio->sync) { - continue_at(&dio->cl, bch2_dio_write_loop_async, NULL); + if (dio->sync) + wait_for_completion(&dio->done); + else return -EIOCBQUEUED; - } - - closure_sync(&dio->cl); loop: i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); @@ -1845,7 +1843,9 @@ loop: put_page(bv->bv_page); if (!dio->iter.count || dio->op.error) break; + bio_reset(bio); + reinit_completion(&dio->done); } ret = dio->op.error ?: ((long) dio->op.written << 9); @@ -1857,8 +1857,6 @@ err: if (dio->free_iov) kfree(dio->iter.iov); - closure_debug_destroy(&dio->cl); - sync = dio->sync; bio_put(bio); @@ -1872,48 +1870,75 @@ err: return ret; } -static void bch2_dio_write_loop_async(struct closure *cl) +static void bch2_dio_write_loop_async(struct bch_write_op *op) { - struct dio_write *dio = container_of(cl, struct dio_write, cl); + struct dio_write *dio = container_of(op, struct dio_write, op); - bch2_dio_write_loop(dio); + if (dio->sync) + complete(&dio->done); + else + bch2_dio_write_loop(dio); } -static int bch2_direct_IO_write(struct kiocb *req, - struct iov_iter *iter, - bool swap) +static noinline +ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) { struct file *file = req->ki_filp; + struct address_space *mapping = file->f_mapping; struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct dio_write *dio; struct bio *bio; + bool locked = true, extending; ssize_t ret; - lockdep_assert_held(&inode->v.i_rwsem); + prefetch(&c->opts); + prefetch((void *) &c->opts + 64); + prefetch(&inode->ei_inode); + prefetch((void *) &inode->ei_inode + 64); - if (unlikely(!iter->count)) - return 0; + inode_lock(&inode->v); + + ret = generic_write_checks(req, iter); + if (unlikely(ret <= 0)) + goto err; + + ret = file_remove_privs(file); + if (unlikely(ret)) + goto err; + + ret = file_update_time(file); + if (unlikely(ret)) + goto err; if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) - return -EINVAL; + goto err; + + inode_dio_begin(&inode->v); + __pagecache_block_get(&mapping->add_lock); + + extending = req->ki_pos + iter->count > inode->v.i_size; + if (!extending) { + inode_unlock(&inode->v); + locked = false; + } bio = bio_alloc_bioset(GFP_KERNEL, iov_iter_npages(iter, BIO_MAX_PAGES), &c->dio_write_bioset); dio = container_of(bio, struct dio_write, op.wbio.bio); - closure_init(&dio->cl, NULL); + init_completion(&dio->done); dio->req = req; dio->mm = 
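Side note on the -ENOMEM handling above: when the inline iovec copy cannot be allocated, the loop no longer fails the write; it flips the request to synchronous mode, where the submitter's iovec array stays valid for the whole I/O, and retries. A userspace analogue of that degrade-instead-of-fail step (hypothetical names):

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>

struct req_sketch {
	const struct iovec	*user_iov;	/* owned by the caller */
	struct iovec		*iov_copy;	/* ours, for async completion */
	unsigned		nr_segs;
	bool			sync;
};

/*
 * Async completion outlives the submitting call, so it needs its own copy of
 * the iovec array; if that copy can't be allocated, degrade to a synchronous
 * request (the caller's array stays valid while we block) instead of failing.
 */
static void req_sketch_prep(struct req_sketch *r)
{
	if (r->sync)
		return;

	r->iov_copy = malloc(r->nr_segs * sizeof(*r->iov_copy));
	if (!r->iov_copy) {
		r->sync = true;		/* the -ENOMEM never reaches the caller */
		return;
	}
	memcpy(r->iov_copy, r->user_iov, r->nr_segs * sizeof(*r->iov_copy));
}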
current->mm; dio->loop = false; - dio->sync = is_sync_kiocb(req) || - req->ki_pos + iter->count > inode->v.i_size; + dio->sync = is_sync_kiocb(req) || extending; dio->free_iov = false; dio->quota_res.sectors = 0; dio->iter = *iter; bch2_write_op_init(&dio->op, c, opts); + dio->op.end_io = bch2_dio_write_loop_async; dio->op.target = opts.foreground_target; op_journal_seq_set(&dio->op, &inode->ei_journal_seq); dio->op.write_point = writepoint_hashed((unsigned long) current); @@ -1926,7 +1951,7 @@ static int bch2_direct_IO_write(struct kiocb *req, ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, iter->count >> 9, true); if (unlikely(ret)) - goto err; + goto err_put_bio; dio->op.nr_replicas = dio->op.opts.data_replicas; @@ -1937,15 +1962,22 @@ static int bch2_direct_IO_write(struct kiocb *req, req->ki_pos >> 9), iter->count >> 9, dio->op.opts.data_replicas)) - goto err; + goto err_put_bio; - return bch2_dio_write_loop(dio); + ret = bch2_dio_write_loop(dio); err: + if (locked) + inode_unlock(&inode->v); + if (ret > 0) + req->ki_pos += ret; + return ret; +err_put_bio: + __pagecache_block_put(&mapping->add_lock); bch2_disk_reservation_put(c, &dio->op.res); bch2_quota_reservation_put(c, inode, &dio->quota_res); - closure_debug_destroy(&dio->cl); bio_put(bio); - return ret; + inode_dio_end(&inode->v); + goto err; } ssize_t bch2_direct_IO(struct kiocb *req, struct iov_iter *iter) @@ -1953,61 +1985,49 @@ ssize_t bch2_direct_IO(struct kiocb *req, struct iov_iter *iter) struct blk_plug plug; ssize_t ret; + if (iov_iter_rw(iter) == WRITE) + return -EINVAL; + blk_start_plug(&plug); - ret = iov_iter_rw(iter) == WRITE - ? bch2_direct_IO_write(req, iter, false) - : bch2_direct_IO_read(req, iter); + ret = bch2_direct_IO_read(req, iter); blk_finish_plug(&plug); return ret; } -static ssize_t -bch2_direct_write(struct kiocb *iocb, struct iov_iter *iter) -{ - return bch2_direct_IO_write(iocb, iter, true); -} - -static ssize_t __bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) +ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct bch_inode_info *inode = file_bch_inode(file); - ssize_t ret; + ssize_t ret; + + if (iocb->ki_flags & IOCB_DIRECT) + return bch2_direct_write(iocb, from); /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(&inode->v); + inode_lock(&inode->v); + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto unlock; + ret = file_remove_privs(file); if (ret) - goto out; + goto unlock; ret = file_update_time(file); if (ret) - goto out; - - ret = iocb->ki_flags & IOCB_DIRECT - ? 
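Side note on the locking in bch2_direct_write() above: the inode lock is taken for the VFS checks, then dropped early when the write does not extend i_size, and the 'locked' flag tells the common exit path whether it still owns the lock; the separate err_put_bio label unwinds the later acquisitions before joining that path. A reduced kernel-style sketch of the shape (direct_write_sketch is hypothetical; the unwind label is elided):

#include <linux/fs.h>
#include <linux/uio.h>

static ssize_t direct_write_sketch(struct kiocb *req, struct iov_iter *iter)
{
	struct inode *inode = req->ki_filp->f_mapping->host;
	bool locked = true;
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(req, iter);
	if (ret <= 0)
		goto err;

	if (req->ki_pos + iov_iter_count(iter) <= i_size_read(inode)) {
		/* not extending i_size: safe to run the I/O unlocked */
		inode_unlock(inode);
		locked = false;
	}

	ret = 0 /* ... reserve, build the bio, run the write loop ... */;
err:
	if (locked)
		inode_unlock(inode);
	if (ret > 0)
		req->ki_pos += ret;
	return ret;
}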
bch2_direct_write(iocb, from) - : bch2_buffered_write(iocb, from); + goto unlock; + ret = bch2_buffered_write(iocb, from); if (likely(ret > 0)) iocb->ki_pos += ret; -out: - current->backing_dev_info = NULL; - return ret; -} - -ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct bch_inode_info *inode = file_bch_inode(iocb->ki_filp); - bool direct = iocb->ki_flags & IOCB_DIRECT; - ssize_t ret; - - inode_lock(&inode->v); - ret = generic_write_checks(iocb, from); - if (ret > 0) - ret = __bch2_write_iter(iocb, from); +unlock: inode_unlock(&inode->v); + current->backing_dev_info = NULL; - if (ret > 0 && !direct) + if (ret > 0) ret = generic_write_sync(iocb, ret); return ret; @@ -2726,20 +2746,26 @@ long bch2_fallocate_dispatch(struct file *file, int mode, loff_t offset, loff_t len) { struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + long ret; + + if (!percpu_ref_tryget(&c->writes)) + return -EROFS; if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) - return bchfs_fallocate(inode, mode, offset, len); + ret = bchfs_fallocate(inode, mode, offset, len); + else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) + ret = bchfs_fpunch(inode, offset, len); + else if (mode == FALLOC_FL_INSERT_RANGE) + ret = bchfs_fcollapse_finsert(inode, offset, len, true); + else if (mode == FALLOC_FL_COLLAPSE_RANGE) + ret = bchfs_fcollapse_finsert(inode, offset, len, false); + else + ret = -EOPNOTSUPP; - if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) - return bchfs_fpunch(inode, offset, len); + percpu_ref_put(&c->writes); - if (mode == FALLOC_FL_INSERT_RANGE) - return bchfs_fcollapse_finsert(inode, offset, len, true); - - if (mode == FALLOC_FL_COLLAPSE_RANGE) - return bchfs_fcollapse_finsert(inode, offset, len, false); - - return -EOPNOTSUPP; + return ret; } static void mark_range_unallocated(struct bch_inode_info *inode, diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 5acf1fb6..3cced2b9 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -147,6 +147,7 @@ struct hash_check { static void hash_check_init(struct hash_check *h) { h->chain = NULL; + h->chain_end = 0; } static void hash_stop_chain(struct btree_trans *trans, diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 59653b97..c0642ff4 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -509,7 +509,7 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, { struct btree_iter *iter; struct bkey_s_c k; - int ret = -ENOENT; + int ret; iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inode_nr, 0), BTREE_ITER_SLOTS); @@ -517,8 +517,13 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, return PTR_ERR(iter); k = bch2_btree_iter_peek_slot(iter); - if (k.k->type == KEY_TYPE_inode) - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); + ret = bkey_err(k); + if (ret) + return ret; + + ret = k.k->type == KEY_TYPE_inode + ? 
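Side note on the fallocate dispatch above (bch2_remap_range() gets the same treatment later in this patch): the whole operation is now bracketed by percpu_ref_tryget(&c->writes) / percpu_ref_put(), returning -EROFS when the ref is dying because the filesystem is going read-only. The guard, reduced to its skeleton (guarded_op_sketch is hypothetical):

#include <linux/errno.h>
#include <linux/percpu-refcount.h>

static long guarded_op_sketch(struct percpu_ref *writes)
{
	long ret;

	if (!percpu_ref_tryget(writes))
		return -EROFS;	/* ref already dying: fs is going read-only */

	ret = 0 /* ... the fallocate/remap work ... */;

	percpu_ref_put(writes);
	return ret;
}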
bch2_inode_unpack(bkey_s_c_to_inode(k), inode) + : -ENOENT; bch2_trans_iter_put(trans, iter); diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 79003dff..836004b1 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -300,6 +300,7 @@ int bch2_extent_update(struct btree_trans *trans, bch2_trans_update(trans, iter, k); ret = bch2_trans_commit(trans, disk_res, journal_seq, + BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| BTREE_INSERT_ATOMIC| BTREE_INSERT_USE_RESERVE); @@ -496,7 +497,12 @@ static void bch2_write_done(struct closure *cl) bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); - closure_return(cl); + if (op->end_io) + op->end_io(op); + if (cl->parent) + closure_return(cl); + else + closure_debug_destroy(cl); } /** @@ -605,8 +611,10 @@ static void bch2_write_endio(struct bio *bio) if (parent) bio_endio(&parent->bio); - else + else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT)) closure_put(cl); + else + continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op)); } static void init_append_extent(struct bch_write_op *op, @@ -615,27 +623,36 @@ static void init_append_extent(struct bch_write_op *op, struct bch_extent_crc_unpacked crc) { struct bch_fs *c = op->c; - struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top); - struct extent_ptr_decoded p = { .crc = crc }; + struct bkey_i_extent *e; struct open_bucket *ob; unsigned i; + BUG_ON(crc.compressed_size > wp->sectors_free); + wp->sectors_free -= crc.compressed_size; op->pos.offset += crc.uncompressed_size; + + e = bkey_extent_init(op->insert_keys.top); e->k.p = op->pos; e->k.size = crc.uncompressed_size; e->k.version = version; - BUG_ON(crc.compressed_size > wp->sectors_free); - wp->sectors_free -= crc.compressed_size; + if (crc.csum_type || + crc.compression_type || + crc.nonce) + bch2_extent_crc_append(&e->k_i, crc); open_bucket_for_each(c, &wp->ptrs, ob, i) { struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + union bch_extent_entry *end = + bkey_val_end(bkey_i_to_s(&e->k_i)); - p.ptr = ob->ptr; - p.ptr.cached = !ca->mi.durability || + end->ptr = ob->ptr; + end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + end->ptr.cached = !ca->mi.durability || (op->flags & BCH_WRITE_CACHED) != 0; - p.ptr.offset += ca->mi.bucket_size - ob->sectors_free; - bch2_extent_ptr_decoded_append(&e->k_i, &p); + end->ptr.offset += ca->mi.bucket_size - ob->sectors_free; + + e->k.u64s++; BUG_ON(crc.compressed_size > ob->sectors_free); ob->sectors_free -= crc.compressed_size; @@ -816,15 +833,14 @@ static enum prep_encoded_ret { return PREP_ENCODED_OK; } -static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) +static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + struct bio **_dst) { struct bch_fs *c = op->c; struct bio *src = &op->wbio.bio, *dst = src; struct bvec_iter saved_iter; - struct bkey_i *key_to_write; void *ec_buf; - unsigned key_to_write_offset = op->insert_keys.top_p - - op->insert_keys.keys_p; + struct bpos ec_pos = op->pos; unsigned total_output = 0, total_input = 0; bool bounce = false; bool page_alloc_failed = false; @@ -843,6 +859,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) case PREP_ENCODED_CHECKSUM_ERR: goto csum_err; case PREP_ENCODED_DO_WRITE: + /* XXX look for bug here */ if (ec_buf) { dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed, @@ -992,21 +1009,9 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) dst->bi_iter.bi_size = total_output; do_write: /* might have done a 
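Side note on init_append_extent() above: the crc entry is now appended only when the extent actually carries a checksum, compression type, or nonce, and each pointer is then written in place at bkey_val_end() with k.u64s bumped per entry. A toy userspace model of that append-in-place scheme — here the key header is pretended to be a single u64, unlike the real struct bkey:

#include <assert.h>
#include <stdint.h>

struct sk_key  { uint8_t u64s; };			/* total size in u64s, header included */
struct sk_bkey { struct sk_key k; uint64_t v[14]; };	/* value storage */

static uint64_t *sk_val_end(struct sk_bkey *k)
{
	return k->v + (k->k.u64s - 1);	/* header pretends to be 1 u64 */
}

static void sk_append_ptr(struct sk_bkey *k, uint64_t ptr)
{
	*sk_val_end(k) = ptr;	/* write at the current end of the value... */
	k->k.u64s++;		/* ...then grow the key to cover it */
}

int main(void)
{
	struct sk_bkey k = { .k = { .u64s = 1 } };	/* empty value */

	sk_append_ptr(&k, 0xabc);
	sk_append_ptr(&k, 0xdef);
	assert(k.k.u64s == 3 && k.v[0] == 0xabc && k.v[1] == 0xdef);
	return 0;
}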
realloc... */ + bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9); - key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); - - bch2_ec_add_backpointer(c, wp, - bkey_start_pos(&key_to_write->k), - total_input >> 9); - - dst->bi_end_io = bch2_write_endio; - dst->bi_private = &op->cl; - bio_set_op_attrs(dst, REQ_OP_WRITE, 0); - - closure_get(dst->bi_private); - - bch2_submit_wbio_replicas(to_wbio(dst), c, BCH_DATA_USER, - key_to_write); + *_dst = dst; return more; csum_err: bch_err(c, "error verifying existing checksum while " @@ -1026,11 +1031,17 @@ static void __bch2_write(struct closure *cl) struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; struct write_point *wp; + struct bio *bio; + bool skip_put = true; int ret; again: memset(&op->failed, 0, sizeof(op->failed)); do { + struct bkey_i *key_to_write; + unsigned key_to_write_offset = op->insert_keys.top_p - + op->insert_keys.keys_p; + /* +1 for possible cache device: */ if (op->open_buckets.nr + op->nr_replicas + 1 > ARRAY_SIZE(op->open_buckets.v)) @@ -1063,23 +1074,39 @@ again: goto flush_io; } - ret = bch2_write_extent(op, wp); - bch2_open_bucket_get(c, wp, &op->open_buckets); + ret = bch2_write_extent(op, wp, &bio); bch2_alloc_sectors_done(c, wp); if (ret < 0) goto err; + + if (ret) + skip_put = false; + + bio->bi_end_io = bch2_write_endio; + bio->bi_private = &op->cl; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + if (!skip_put) + closure_get(bio->bi_private); + else + op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT; + + key_to_write = (void *) (op->insert_keys.keys_p + + key_to_write_offset); + + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER, + key_to_write); } while (ret); - continue_at(cl, bch2_write_index, index_update_wq(op)); + if (!skip_put) + continue_at(cl, bch2_write_index, index_update_wq(op)); return; err: op->error = ret; - continue_at(cl, !bch2_keylist_empty(&op->insert_keys) - ? 
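Side note on the skip_put logic in __bch2_write() above: while the loop expects more bios it takes a closure ref per bio, but when the whole write fits in a single bio it sets BCH_WRITE_SKIP_CLOSURE_PUT and the endio continues straight into bch2_write_index() instead of dropping a ref. A loose userspace analogue using a C11 atomic counter in place of the closure refcount:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct op_sketch {
	atomic_int	refs;		/* stands in for the closure refcount */
	bool		skip_put;
};

static void op_bio_done(struct op_sketch *op)
{
	if (op->skip_put)
		puts("single-bio case: continue straight to the index update");
	else if (atomic_fetch_sub(&op->refs, 1) == 1)
		puts("last ref dropped: run the index update");
}

int main(void)
{
	struct op_sketch op = { .refs = 1, .skip_put = true };	/* submitter's base ref */
	int nr_bios = 3;

	for (int i = 0; i < nr_bios; i++) {
		if (nr_bios > 1) {		/* more I/O coming: take a ref per bio */
			op.skip_put = false;
			atomic_fetch_add(&op.refs, 1);
		}
		op_bio_done(&op);		/* simulate the bio completing */
	}

	if (!op.skip_put && atomic_fetch_sub(&op.refs, 1) == 1)
		puts("submitter drops its base ref: run the index update");
	return 0;
}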
bch2_write_index - : bch2_write_done, index_update_wq(op)); + continue_at(cl, bch2_write_index, index_update_wq(op)); return; flush_io: closure_sync(cl); @@ -1434,6 +1461,7 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio int ret; flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; bch2_trans_init(&trans, c, 0, 0); diff --git a/libbcachefs/io.h b/libbcachefs/io.h index a72c7ccd..91aaa58f 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -33,6 +33,7 @@ enum bch_write_flags { /* Internal: */ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 8), + BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 9), }; static inline u64 *op_journal_seq(struct bch_write_op *op) @@ -67,7 +68,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, struct bch_io_opts opts) { op->c = c; - op->io_wq = index_update_wq(op); + op->end_io = NULL; op->flags = 0; op->written = 0; op->error = 0; diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h index c2c2cce0..c37b7d74 100644 --- a/libbcachefs/io_types.h +++ b/libbcachefs/io_types.h @@ -93,7 +93,7 @@ struct bch_write_bio { struct bch_write_op { struct closure cl; struct bch_fs *c; - struct workqueue_struct *io_wq; + void (*end_io)(struct bch_write_op *); u64 start_time; unsigned written; /* sectors */ @@ -109,7 +109,6 @@ struct bch_write_op { struct bch_devs_list devs_have; u16 target; u16 nonce; - struct bch_io_opts opts; struct bpos pos; diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index ec5ba2b9..ec61137d 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -269,7 +269,7 @@ static inline void bch2_journal_res_put(struct journal *j, if (!res->ref) return; - lock_release(&j->res_map, 0, _RET_IP_); + lock_release(&j->res_map, 0, _THIS_IP_); while (res->u64s) bch2_journal_add_entry(j, res, diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index a6a4dda5..387377da 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -934,8 +934,6 @@ out: /* also must come before signalling write completion: */ closure_debug_destroy(cl); - DEBUG_MEMORY_FREED(w->data, w->buf_size); - BUG_ON(!j->reservations.prev_buf_unwritten); atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v, &j->reservations.counter); diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 23f3ed54..2c441a27 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -864,6 +864,8 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } bch_verbose(c, "alloc write done"); + + set_bit(BCH_FS_ALLOC_WRITTEN, &c->flags); } if (!c->sb.clean) { diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c index 4a4b17f9..6d45ae24 100644 --- a/libbcachefs/reflink.c +++ b/libbcachefs/reflink.c @@ -166,6 +166,9 @@ s64 bch2_remap_range(struct bch_fs *c, u64 src_done, dst_done; int ret = 0, ret2 = 0; + if (!percpu_ref_tryget(&c->writes)) + return -EROFS; + if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) { mutex_lock(&c->sb_lock); if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) { @@ -295,5 +298,7 @@ err: ret = bch2_trans_exit(&trans) ?: ret; + percpu_ref_put(&c->writes); + return dst_done ?: ret ?: ret2; } diff --git a/libbcachefs/util.h b/libbcachefs/util.h index 05027761..8e704b4a 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -550,6 +550,16 @@ size_t bch2_rand_range(size_t); void memcpy_to_bio(struct bio *, struct bvec_iter, void *); void memcpy_from_bio(void *, struct bio *, struct bvec_iter); +static inline void memcpy_u64s_small(void *dst, 
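Side note on the io_types.h hunk above: the write op's workqueue pointer becomes an end_io callback, and bch2_write_done() earlier in io.c now invokes it and then either does closure_return() (when the closure has a parent) or closure_debug_destroy() (when the op was run standalone, as the dio path now does). A loose userspace model of that completion fan-out (hypothetical names):

#include <stdbool.h>
#include <stdio.h>

struct wop_sketch {
	void (*end_io)(struct wop_sketch *);	/* replaces the io_wq pointer */
	bool has_parent;			/* stands in for cl->parent */
};

static void dio_end_io_sketch(struct wop_sketch *op)
{
	(void) op;
	puts("end_io fired: complete() the waiter or re-enter the write loop");
}

static void wop_done_sketch(struct wop_sketch *op)
{
	if (op->end_io)
		op->end_io(op);		/* notify the owner directly */

	if (op->has_parent)
		puts("closure_return(): hand the ref back to the parent");
	else
		puts("closure_debug_destroy(): op was run standalone");
}

int main(void)
{
	struct wop_sketch op = { .end_io = dio_end_io_sketch, .has_parent = false };

	wop_done_sketch(&op);	/* the closure_call(..., NULL, NULL) case */
	return 0;
}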
const void *src, + unsigned u64s) +{ + u64 *d = dst; + const u64 *s = src; + + while (u64s--) + *d++ = *s++; +} + static inline void __memcpy_u64s(void *dst, const void *src, unsigned u64s) { @@ -591,6 +601,24 @@ static inline void memmove_u64s_down(void *dst, const void *src, __memmove_u64s_down(dst, src, u64s); } +static inline void __memmove_u64s_up_small(void *_dst, const void *_src, + unsigned u64s) +{ + u64 *dst = (u64 *) _dst + u64s; + u64 *src = (u64 *) _src + u64s; + + while (u64s--) + *--dst = *--src; +} + +static inline void memmove_u64s_up_small(void *dst, const void *src, + unsigned u64s) +{ + EBUG_ON(dst < src); + + __memmove_u64s_up_small(dst, src, u64s); +} + static inline void __memmove_u64s_up(void *_dst, const void *_src, unsigned u64s) {
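Side note on the new *_small helpers above: for the few-word copies bkeys typically need, a bare u64 loop beats the size dispatch inside memcpy()/memmove(), and the up-variant walks from the top so overlapping moves toward higher addresses stay correct. A standalone userspace check of that property:

#include <assert.h>
#include <stdint.h>

static void move_u64s_up_small(void *_dst, const void *_src, unsigned u64s)
{
	uint64_t *dst = (uint64_t *) _dst + u64s;
	uint64_t *src = (uint64_t *) _src + u64s;

	while (u64s--)
		*--dst = *--src;	/* walk backwards: safe when dst >= src */
}

int main(void)
{
	uint64_t buf[6] = { 1, 2, 3, 4 };

	move_u64s_up_small(buf + 2, buf, 4);	/* shift 4 u64s up by 2, overlapping */
	assert(buf[2] == 1 && buf[3] == 2 && buf[4] == 3 && buf[5] == 4);
	return 0;
}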