diff --git a/.bcachefs_revision b/.bcachefs_revision index def87375..713b4346 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -454bd4f82d85bb42a86b8eb0172b13e86e5788a7 +f38382c5747090ac9160e6d5fa1386954cb1f23c diff --git a/cmd_debug.c b/cmd_debug.c index 637da1c5..808226d9 100644 --- a/cmd_debug.c +++ b/cmd_debug.c @@ -63,7 +63,7 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd) struct btree_iter *iter; struct btree *b; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_node(&trans, iter, i, POS_MIN, 0, b) { struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); @@ -160,7 +160,7 @@ static void list_keys(struct bch_fs *c, enum btree_id btree_id, char buf[512]; int ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, btree_id, start, BTREE_ITER_PREFETCH, k, ret) { @@ -181,7 +181,7 @@ static void list_btree_formats(struct bch_fs *c, enum btree_id btree_id, struct btree *b; char buf[4096]; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_node(&trans, iter, btree_id, start, 0, b) { if (bkey_cmp(b->key.k.p, end) > 0) @@ -204,7 +204,7 @@ static void list_nodes_keys(struct bch_fs *c, enum btree_id btree_id, struct btree *b; char buf[4096]; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_node(&trans, iter, btree_id, start, 0, b) { if (bkey_cmp(b->key.k.p, end) > 0) diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index 5fa570a5..76673d9a 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -499,16 +499,14 @@ TRACE_EVENT(copygc, ); DECLARE_EVENT_CLASS(transaction_restart, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip), + TP_PROTO(unsigned long ip), + TP_ARGS(ip), TP_STRUCT__entry( - __array(char, name, 16) __field(unsigned long, ip ) ), TP_fast_assign( - memcpy(__entry->name, c->name, 16); __entry->ip = ip; ), @@ -516,58 +514,130 @@ DECLARE_EVENT_CLASS(transaction_restart, ); DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) + TP_PROTO(unsigned long ip), + TP_ARGS(ip) ); DEFINE_EVENT(transaction_restart, trans_restart_would_deadlock, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) + TP_PROTO(unsigned long ip), + TP_ARGS(ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_iters_realloced, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) +TRACE_EVENT(trans_restart_iters_realloced, + TP_PROTO(unsigned long ip, unsigned nr), + TP_ARGS(ip, nr), + + TP_STRUCT__entry( + __field(unsigned long, ip ) + __field(unsigned, nr ) + ), + + TP_fast_assign( + __entry->ip = ip; + __entry->nr = nr; + ), + + TP_printk("%pf nr %u", (void *) __entry->ip, __entry->nr) ); -DEFINE_EVENT(transaction_restart, trans_restart_mem_realloced, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) +TRACE_EVENT(trans_restart_mem_realloced, + TP_PROTO(unsigned long ip, unsigned long bytes), + TP_ARGS(ip, bytes), + + TP_STRUCT__entry( + __field(unsigned long, ip ) + __field(unsigned long, bytes ) + ), + + TP_fast_assign( + __entry->ip = ip; + __entry->bytes = bytes; + ), + + TP_printk("%pf bytes %lu", (void *) __entry->ip, __entry->bytes) ); DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) + TP_PROTO(unsigned long ip), + TP_ARGS(ip) ); 
DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) + TP_PROTO(unsigned long ip), + TP_ARGS(ip) ); DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) + TP_PROTO(unsigned long ip), + TP_ARGS(ip) ); DEFINE_EVENT(transaction_restart, trans_restart_fault_inject, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) + TP_PROTO(unsigned long ip), + TP_ARGS(ip) ); DEFINE_EVENT(transaction_restart, trans_restart_btree_node_split, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_mark, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_upgrade, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_iter_upgrade, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) ); DEFINE_EVENT(transaction_restart, trans_restart_traverse, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) + TP_PROTO(unsigned long ip), + TP_ARGS(ip) ); DEFINE_EVENT(transaction_restart, trans_restart_atomic, - TP_PROTO(struct bch_fs *c, unsigned long ip), - TP_ARGS(c, ip) + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +); + +DECLARE_EVENT_CLASS(node_lock_fail, + TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), + TP_ARGS(level, iter_seq, node, node_seq), + + TP_STRUCT__entry( + __field(u32, level) + __field(u32, iter_seq) + __field(u32, node) + __field(u32, node_seq) + ), + + TP_fast_assign( + __entry->level = level; + __entry->iter_seq = iter_seq; + __entry->node = node; + __entry->node_seq = node_seq; + ), + + TP_printk("level %u iter seq %u node %u node seq %u", + __entry->level, __entry->iter_seq, + __entry->node, __entry->node_seq) +); + +DEFINE_EVENT(node_lock_fail, node_upgrade_fail, + TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), + TP_ARGS(level, iter_seq, node, node_seq) +); + +DEFINE_EVENT(node_lock_fail, node_relock_fail, + TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), + TP_ARGS(level, iter_seq, node, node_seq) ); #endif /* _TRACE_BCACHE_H */ diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c index e1c7b87d..cdcccaad 100644 --- a/libbcachefs/acl.c +++ b/libbcachefs/acl.c @@ -220,7 +220,7 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type) struct bkey_s_c_xattr xattr; struct posix_acl *acl = NULL; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); @@ -301,7 +301,7 @@ int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type) int ret; mutex_lock(&inode->ei_update_lock); - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); if (type == ACL_TYPE_ACCESS && acl) { ret = posix_acl_update_mode(&inode->v, &mode, &acl); diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index a61b25cc..744addb0 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -228,10 +228,12 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) unsigned i; int ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret) - bch2_mark_key(c, k, true, 0, NULL, 0, 0); + bch2_mark_key(c, k, 0, NULL, 0, + BCH_BUCKET_MARK_ALLOC_READ| + BCH_BUCKET_MARK_NOATOMIC); ret = bch2_trans_exit(&trans) ?: 
ret; if (ret) { @@ -241,8 +243,9 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) for_each_journal_key(*journal_keys, j) if (j->btree_id == BTREE_ID_ALLOC) - bch2_mark_key(c, bkey_i_to_s_c(j->k), - true, 0, NULL, 0, 0); + bch2_mark_key(c, bkey_i_to_s_c(j->k), 0, NULL, 0, + BCH_BUCKET_MARK_ALLOC_READ| + BCH_BUCKET_MARK_NOATOMIC); percpu_down_write(&c->mark_lock); bch2_dev_usage_from_buckets(c); @@ -283,7 +286,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) if (k->k.p.offset >= ca->mi.nbuckets) return 0; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, k->k.p, BTREE_ITER_INTENT); @@ -328,7 +331,7 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); @@ -948,6 +951,7 @@ retry: BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| BTREE_INSERT_USE_ALLOC_RESERVE| + BTREE_INSERT_BUCKET_INVALIDATE| flags); if (ret == -EINTR) goto retry; @@ -1027,7 +1031,7 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) u64 journal_seq = 0; int ret = 0; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index d6dc3bd4..09afbed9 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -842,4 +842,9 @@ static inline s64 bch2_current_time(struct bch_fs *c) return timespec_to_bch2_time(c, now); } +static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) +{ + return dev < c->sb.nr_devices && c->devs[dev]; +} + #endif /* _BCACHEFS_H */ diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index 48c86e52..711bc88f 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -201,15 +201,20 @@ enum merge_result bch2_bkey_merge(struct bch_fs *c, struct bkey_i *l, struct bkey_i *r) { const struct bkey_ops *ops = &bch2_bkey_ops[l->k.type]; + enum merge_result ret; - if (!key_merging_disabled(c) && - ops->key_merge && - l->k.type == r->k.type && - !bversion_cmp(l->k.version, r->k.version) && - !bkey_cmp(l->k.p, bkey_start_pos(&r->k))) - return ops->key_merge(c, l, r); + if (key_merging_disabled(c) || + !ops->key_merge || + l->k.type != r->k.type || + bversion_cmp(l->k.version, r->k.version) || + bkey_cmp(l->k.p, bkey_start_pos(&r->k))) + return BCH_MERGE_NOMERGE; - return BCH_MERGE_NOMERGE; + ret = ops->key_merge(c, l, r); + + if (ret != BCH_MERGE_NOMERGE) + l->k.needs_whiteout |= r->k.needs_whiteout; + return ret; } static const struct old_bkey_type { diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 2a20bdef..587a04f5 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -652,8 +652,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, */ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, const struct bkey_i *k, unsigned level, - enum six_lock_type lock_type, - bool may_drop_locks) + enum six_lock_type lock_type) { struct btree_cache *bc = &c->btree_cache; struct btree *b; @@ -720,8 +719,7 @@ retry: if (btree_node_read_locked(iter, level + 1)) btree_node_unlock(iter, level + 1); - if (!btree_node_lock(b, k->k.p, level, iter, - lock_type, may_drop_locks)) + if (!btree_node_lock(b, k->k.p, 
level, iter, lock_type)) return ERR_PTR(-EINTR); if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) || @@ -731,9 +729,7 @@ retry: if (bch2_btree_node_relock(iter, level + 1)) goto retry; - trans_restart(); - trace_trans_restart_btree_node_reused(c, - iter->trans->ip); + trace_trans_restart_btree_node_reused(iter->trans->ip); return ERR_PTR(-EINTR); } } @@ -770,9 +766,9 @@ retry: struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, struct btree_iter *iter, struct btree *b, - bool may_drop_locks, enum btree_node_sibling sib) { + struct btree_trans *trans = iter->trans; struct btree *parent; struct btree_node_iter node_iter; struct bkey_packed *k; @@ -784,8 +780,10 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, if (!parent) return NULL; - if (!bch2_btree_node_relock(iter, level + 1)) - goto out_upgrade; + if (!bch2_btree_node_relock(iter, level + 1)) { + ret = ERR_PTR(-EINTR); + goto out; + } node_iter = iter->l[parent->level].iter; @@ -802,19 +800,19 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, bch2_bkey_unpack(parent, &tmp.k, k); ret = bch2_btree_node_get(c, iter, &tmp.k, level, - SIX_LOCK_intent, may_drop_locks); + SIX_LOCK_intent); - if (PTR_ERR_OR_ZERO(ret) == -EINTR && may_drop_locks) { + if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) { struct btree_iter *linked; if (!bch2_btree_node_relock(iter, level + 1)) - goto out_upgrade; + goto out; /* * We might have got -EINTR because trylock failed, and we're * holding other locks that would cause us to deadlock: */ - trans_for_each_iter(iter->trans, linked) + trans_for_each_iter(trans, linked) if (btree_iter_cmp(iter, linked) < 0) __bch2_btree_iter_unlock(linked); @@ -822,7 +820,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, btree_node_unlock(iter, level); ret = bch2_btree_node_get(c, iter, &tmp.k, level, - SIX_LOCK_intent, may_drop_locks); + SIX_LOCK_intent); /* * before btree_iter_relock() calls btree_iter_verify_locks(): @@ -839,17 +837,16 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, } } - bch2_btree_trans_relock(iter->trans); + bch2_trans_relock(trans); } out: if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) btree_node_unlock(iter, level + 1); - bch2_btree_trans_verify_locks(iter->trans); + if (PTR_ERR_OR_ZERO(ret) == -EINTR) + bch2_btree_iter_upgrade(iter, level + 2); - BUG_ON((!may_drop_locks || !IS_ERR(ret)) && - (iter->uptodate >= BTREE_ITER_NEED_RELOCK || - !btree_node_locked(iter, level))); + BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level)); if (!IS_ERR_OR_NULL(ret)) { struct btree *n1 = ret, *n2 = b; @@ -862,12 +859,9 @@ out: n2->data->min_key)); } + bch2_btree_trans_verify_locks(trans); + return ret; -out_upgrade: - if (may_drop_locks) - bch2_btree_iter_upgrade(iter, level + 2, true); - ret = ERR_PTR(-EINTR); - goto out; } void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index 08e6f2a6..19e14d32 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -22,11 +22,10 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, const struct bkey_i *, unsigned, - enum six_lock_type, bool); + enum six_lock_type); struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, - struct btree *, bool, - enum btree_node_sibling); + struct btree *, enum btree_node_sibling); void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, const 
struct bkey_i *, unsigned); diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 9f0de5cd..c2b893a9 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -170,7 +170,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, *max_stale = max(*max_stale, ptr_stale(ca, ptr)); } - bch2_mark_key(c, k, true, k.k->size, NULL, 0, flags); + bch2_mark_key(c, k, k.k->size, NULL, 0, flags); fsck_err: return ret; } @@ -214,7 +214,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, u8 max_stale; int ret = 0; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); @@ -283,7 +283,7 @@ static int mark_journal_key(struct bch_fs *c, enum btree_id id, if (ret) return ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, id, bkey_start_pos(&insert->k), BTREE_ITER_SLOTS, k, ret) { @@ -422,8 +422,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) for_each_pending_btree_node_free(c, as, d) if (d->index_update_done) - bch2_mark_key(c, bkey_i_to_s_c(&d->key), - true, 0, NULL, 0, + bch2_mark_key(c, bkey_i_to_s_c(&d->key), 0, NULL, 0, BCH_BUCKET_MARK_GC); mutex_unlock(&c->btree_interior_update_lock); @@ -1057,7 +1056,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) struct btree *merge[GC_MERGE_NODES]; u32 lock_seq[GC_MERGE_NODES]; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); /* * XXX: We don't have a good way of positively matching on sibling nodes diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index fe888c57..8b7e05ed 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1151,7 +1151,7 @@ static void bch2_btree_node_write_error(struct bch_fs *c, struct btree_iter *iter; int ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_node_iter(&trans, b->btree_id, b->key.k.p, BTREE_MAX_DEPTH, b->level, 0); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 5631f98f..e78c6cad 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -14,13 +14,18 @@ static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *, struct btree_iter_level *, struct bkey *); -#define BTREE_ITER_NOT_END ((struct btree *) 1) +#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) +#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2) +#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3) +#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4) +#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5) +#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6) +#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) static inline bool is_btree_node(struct btree_iter *iter, unsigned l) { return l < BTREE_MAX_DEPTH && - iter->l[l].b && - iter->l[l].b != BTREE_ITER_NOT_END; + (unsigned long) iter->l[l].b >= 128; } /* Returns < 0 if @k is before iter pos, > 0 if @k is after */ @@ -105,19 +110,20 @@ bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) struct btree *b = btree_iter_node(iter, level); int want = __btree_lock_want(iter, level); - if (!b || b == BTREE_ITER_NOT_END) + if (!is_btree_node(iter, level)) return false; if (race_fault()) return false; - if (!six_relock_type(&b->lock, want, iter->l[level].lock_seq) && - !(iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1 && - btree_node_lock_increment(iter, b, level, want))) + if (six_relock_type(&b->lock, want, iter->l[level].lock_seq) || + 
(btree_node_lock_seq_matches(iter, b, level) && + btree_node_lock_increment(iter, b, level, want))) { + mark_btree_node_locked(iter, level, want); + return true; + } else { return false; - - mark_btree_node_locked(iter, level, want); - return true; + } } static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) @@ -140,7 +146,7 @@ static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) : six_relock_type(&b->lock, SIX_LOCK_intent, iter->l[level].lock_seq)) goto success; - if (iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1 && + if (btree_node_lock_seq_matches(iter, b, level) && btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) { btree_node_unlock(iter, level); goto success; @@ -153,7 +159,7 @@ success: } static inline bool btree_iter_get_locks(struct btree_iter *iter, - bool upgrade) + bool upgrade, bool trace) { unsigned l = iter->level; int fail_idx = -1; @@ -165,6 +171,17 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, if (!(upgrade ? bch2_btree_node_upgrade(iter, l) : bch2_btree_node_relock(iter, l))) { + if (trace) + (upgrade + ? trace_node_upgrade_fail + : trace_node_relock_fail)(l, iter->l[l].lock_seq, + is_btree_node(iter, l) + ? 0 + : (unsigned long) iter->l[l].b, + is_btree_node(iter, l) + ? iter->l[l].b->lock.state.seq + : 0); + fail_idx = l; btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); } @@ -179,7 +196,7 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, */ while (fail_idx >= 0) { btree_node_unlock(iter, fail_idx); - iter->l[fail_idx].b = BTREE_ITER_NOT_END; + iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; --fail_idx; } @@ -195,8 +212,7 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, unsigned level, struct btree_iter *iter, - enum six_lock_type type, - bool may_drop_locks) + enum six_lock_type type) { struct btree_iter *linked; bool ret = true; @@ -224,11 +240,11 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, */ if (type == SIX_LOCK_intent && linked->nodes_locked != linked->nodes_intent_locked) { - if (may_drop_locks) { + if (!(iter->trans->nounlock)) { linked->locks_want = max_t(unsigned, linked->locks_want, __fls(linked->nodes_locked) + 1); - btree_iter_get_locks(linked, true); + btree_iter_get_locks(linked, true, false); } ret = false; } @@ -240,21 +256,19 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, */ if (linked->btree_id == iter->btree_id && level > __fls(linked->nodes_locked)) { - if (may_drop_locks) { + if (!(iter->trans->nounlock)) { linked->locks_want = max(level + 1, max_t(unsigned, linked->locks_want, iter->locks_want)); - btree_iter_get_locks(linked, true); + btree_iter_get_locks(linked, true, false); } ret = false; } } if (unlikely(!ret)) { - trans_restart(); - trace_trans_restart_would_deadlock(iter->trans->c, - iter->trans->ip); + trace_trans_restart_would_deadlock(iter->trans->ip); return false; } @@ -269,9 +283,6 @@ void bch2_btree_iter_verify_locks(struct btree_iter *iter) { unsigned l; - BUG_ON((iter->flags & BTREE_ITER_NOUNLOCK) && - !btree_node_locked(iter, 0)); - for (l = 0; btree_iter_node(iter, l); l++) { if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && !btree_node_locked(iter, l)) @@ -292,10 +303,10 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans) #endif __flatten -static bool bch2_btree_iter_relock(struct btree_iter *iter) +static bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) { return 
iter->uptodate >= BTREE_ITER_NEED_RELOCK - ? btree_iter_get_locks(iter, false) + ? btree_iter_get_locks(iter, false, trace) : true; } @@ -308,7 +319,7 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, iter->locks_want = new_locks_want; - if (btree_iter_get_locks(iter, true)) + if (btree_iter_get_locks(iter, true, true)) return true; /* @@ -319,10 +330,9 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, trans_for_each_iter(iter->trans, linked) if (linked != iter && linked->btree_id == iter->btree_id && - btree_iter_cmp(linked, iter) <= 0 && linked->locks_want < new_locks_want) { linked->locks_want = new_locks_want; - btree_iter_get_locks(linked, true); + btree_iter_get_locks(linked, true, false); } return false; @@ -389,28 +399,21 @@ void __bch2_btree_iter_downgrade(struct btree_iter *iter, bch2_btree_trans_verify_locks(iter->trans); } -int bch2_btree_iter_unlock(struct btree_iter *iter) -{ - struct btree_iter *linked; +/* Btree transaction locking: */ - trans_for_each_iter(iter->trans, linked) - __bch2_btree_iter_unlock(linked); - - return btree_iter_err(iter); -} - -bool bch2_btree_trans_relock(struct btree_trans *trans) +bool bch2_trans_relock(struct btree_trans *trans) { struct btree_iter *iter; bool ret = true; trans_for_each_iter(trans, iter) - ret &= bch2_btree_iter_relock(iter); + if (iter->uptodate == BTREE_ITER_NEED_RELOCK) + ret &= bch2_btree_iter_relock(iter, true); return ret; } -void bch2_btree_trans_unlock(struct btree_trans *trans) +void bch2_trans_unlock(struct btree_trans *trans) { struct btree_iter *iter; @@ -418,8 +421,6 @@ void bch2_btree_trans_unlock(struct btree_trans *trans) __bch2_btree_iter_unlock(iter); } -/* Btree transaction locking: */ - /* Btree iterator: */ #ifdef CONFIG_BCACHEFS_DEBUG @@ -824,7 +825,7 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) trans_for_each_iter(iter->trans, linked) if (linked->l[level].b == b) { __btree_node_unlock(linked, level); - linked->l[level].b = BTREE_ITER_NOT_END; + linked->l[level].b = BTREE_ITER_NO_NODE_DROP; } } @@ -862,26 +863,28 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, * that depth */ iter->level = depth_want; - iter->l[iter->level].b = NULL; + for (i = iter->level; i < BTREE_MAX_DEPTH; i++) + iter->l[i].b = NULL; return 1; } lock_type = __btree_lock_want(iter, iter->level); if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, - iter, lock_type, true))) + iter, lock_type))) return -EINTR; if (likely(b == c->btree_roots[iter->btree_id].b && b->level == iter->level && !race_fault())) { for (i = 0; i < iter->level; i++) - iter->l[i].b = BTREE_ITER_NOT_END; + iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; iter->l[iter->level].b = b; + for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++) + iter->l[i].b = NULL; mark_btree_node_locked(iter, iter->level, lock_type); btree_iter_node_set(iter, b); return 0; - } six_unlock_type(&b->lock, lock_type); @@ -932,7 +935,7 @@ static inline int btree_iter_down(struct btree_iter *iter) bch2_bkey_unpack(l->b, &tmp.k, bch2_btree_node_iter_peek(&l->iter, l->b)); - b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type, true); + b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type); if (unlikely(IS_ERR(b))) return PTR_ERR(b); @@ -971,7 +974,7 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, #undef btree_iter_cmp_by_idx retry_all: - bch2_btree_trans_unlock(trans); + bch2_trans_unlock(trans); if (unlikely(ret == -ENOMEM)) { struct closure cl; @@ -987,7 +990,7 @@ retry_all: if (unlikely(ret == -EIO)) { 
trans->error = true; iter->flags |= BTREE_ITER_ERROR; - iter->l[iter->level].b = BTREE_ITER_NOT_END; + iter->l[iter->level].b = BTREE_ITER_NO_NODE_ERROR; goto out; } @@ -1022,12 +1025,12 @@ static unsigned btree_iter_up_until_locked(struct btree_iter *iter, unsigned l = iter->level; while (btree_iter_node(iter, l) && - !(is_btree_node(iter, l) && - bch2_btree_node_relock(iter, l) && - (!check_pos || - btree_iter_pos_in_node(iter, iter->l[l].b)))) { + (!is_btree_node(iter, l) || + !bch2_btree_node_relock(iter, l) || + (check_pos && + !btree_iter_pos_in_node(iter, iter->l[l].b)))) { btree_node_unlock(iter, l); - iter->l[l].b = BTREE_ITER_NOT_END; + iter->l[l].b = BTREE_ITER_NO_NODE_UP; l++; } @@ -1041,7 +1044,7 @@ static unsigned btree_iter_up_until_locked(struct btree_iter *iter, * Returns 0 on success, -EIO on error (error reading in a btree node). * * On error, caller (peek_node()/peek_key()) must return NULL; the error is - * stashed in the iterator and returned from bch2_btree_iter_unlock(). + * stashed in the iterator and returned from bch2_trans_exit(). */ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) { @@ -1050,7 +1053,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) if (unlikely(iter->level >= BTREE_MAX_DEPTH)) return 0; - if (bch2_btree_iter_relock(iter)) + if (bch2_btree_iter_relock(iter, false)) return 0; /* @@ -1083,7 +1086,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) return 0; iter->level = depth_want; - iter->l[iter->level].b = BTREE_ITER_NOT_END; + iter->l[iter->level].b = BTREE_ITER_NO_NODE_DOWN; return ret; } } @@ -1099,7 +1102,8 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *iter) { int ret; - ret = __bch2_btree_iter_traverse(iter); + ret = bch2_trans_cond_resched(iter->trans) ?: + __bch2_btree_iter_traverse(iter); if (unlikely(ret)) ret = __btree_iter_traverse_all(iter->trans, iter, ret); @@ -1111,7 +1115,7 @@ static inline void bch2_btree_iter_checks(struct btree_iter *iter, { EBUG_ON(iter->btree_id >= BTREE_ID_NR); EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != - (iter->btree_id == BTREE_ID_EXTENTS && + (btree_node_type_is_extents(iter->btree_id) && type != BTREE_ITER_NODES)); bch2_btree_trans_verify_locks(iter->trans); @@ -1291,9 +1295,11 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) return btree_iter_peek_uptodate(iter); while (1) { - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); + if (iter->uptodate >= BTREE_ITER_NEED_RELOCK) { + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + } k = __btree_iter_peek(iter, l); if (likely(k.k)) @@ -1345,10 +1351,17 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + iter->pos = btree_type_successor(iter->btree_id, iter->k.p); + if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { - k = bch2_btree_iter_peek(iter); - if (IS_ERR_OR_NULL(k.k)) - return k; + /* + * XXX: when we just need to relock we should be able to avoid + * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK + * for that to work + */ + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + + return bch2_btree_iter_peek(iter); } do { @@ -1548,9 +1561,11 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (iter->uptodate == BTREE_ITER_UPTODATE) return btree_iter_peek_uptodate(iter); - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); + if 
(iter->uptodate >= BTREE_ITER_NEED_RELOCK) { + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + } return __bch2_btree_iter_peek_slot(iter); } @@ -1587,7 +1602,7 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, struct bch_fs *c = trans->c; unsigned i; - if (btree_id == BTREE_ID_EXTENTS && + if (btree_node_type_is_extents(btree_id) && !(flags & BTREE_ITER_NODES)) flags |= BTREE_ITER_IS_EXTENTS; @@ -1604,7 +1619,7 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, iter->nodes_intent_locked = 0; for (i = 0; i < ARRAY_SIZE(iter->l); i++) iter->l[i].b = NULL; - iter->l[iter->level].b = BTREE_ITER_NOT_END; + iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; prefetch(c->btree_roots[btree_id].b); } @@ -1649,11 +1664,13 @@ int bch2_trans_iter_free_on_commit(struct btree_trans *trans, return ret; } -static int btree_trans_realloc_iters(struct btree_trans *trans, - unsigned new_size) +static int bch2_trans_realloc_iters(struct btree_trans *trans, + unsigned new_size) { void *new_iters, *new_updates; + new_size = roundup_pow_of_two(new_size); + BUG_ON(new_size > BTREE_ITER_MAX); if (new_size <= trans->size) @@ -1694,19 +1711,13 @@ success: trans->size = new_size; if (trans->iters_live) { - trans_restart(); - trace_trans_restart_iters_realloced(trans->c, trans->ip); + trace_trans_restart_iters_realloced(trans->ip, trans->size); return -EINTR; } return 0; } -void bch2_trans_preload_iters(struct btree_trans *trans) -{ - btree_trans_realloc_iters(trans, BTREE_ITER_MAX); -} - static int btree_trans_iter_alloc(struct btree_trans *trans) { unsigned idx = __ffs64(~trans->iters_linked); @@ -1715,7 +1726,7 @@ static int btree_trans_iter_alloc(struct btree_trans *trans) goto got_slot; if (trans->nr_iters == trans->size) { - int ret = btree_trans_realloc_iters(trans, trans->size * 2); + int ret = bch2_trans_realloc_iters(trans, trans->size * 2); if (ret) return ret; } @@ -1812,7 +1823,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, for (i = 0; i < ARRAY_SIZE(iter->l); i++) iter->l[i].b = NULL; - iter->l[iter->level].b = BTREE_ITER_NOT_END; + iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; return iter; } @@ -1845,50 +1856,40 @@ struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans, return &trans->iters[idx]; } -void *bch2_trans_kmalloc(struct btree_trans *trans, - size_t size) +static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size) { - void *ret; - - if (trans->mem_top + size > trans->mem_bytes) { + if (size > trans->mem_bytes) { size_t old_bytes = trans->mem_bytes; - size_t new_bytes = roundup_pow_of_two(trans->mem_top + size); + size_t new_bytes = roundup_pow_of_two(size); void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); if (!new_mem) - return ERR_PTR(-ENOMEM); + return -ENOMEM; trans->mem = new_mem; trans->mem_bytes = new_bytes; if (old_bytes) { - trans_restart(); - trace_trans_restart_mem_realloced(trans->c, trans->ip); - return ERR_PTR(-EINTR); + trace_trans_restart_mem_realloced(trans->ip, new_bytes); + return -EINTR; } } - ret = trans->mem + trans->mem_top; - trans->mem_top += size; - return ret; + return 0; } -int bch2_trans_unlock(struct btree_trans *trans) +void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) { - u64 iters = trans->iters_linked; - int ret = 0; + void *p; + int ret; - while (iters) { - unsigned idx = __ffs64(iters); - struct btree_iter *iter = &trans->iters[idx]; + ret = bch2_trans_preload_mem(trans, trans->mem_top + size); + if 
(ret) + return ERR_PTR(ret); - ret = ret ?: btree_iter_err(iter); - - __bch2_btree_iter_unlock(iter); - iters ^= 1ULL << idx; - } - - return ret; + p = trans->mem + trans->mem_top; + trans->mem_top += size; + return p; } inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters) @@ -1904,7 +1905,7 @@ inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters) } } -void __bch2_trans_begin(struct btree_trans *trans) +void bch2_trans_begin(struct btree_trans *trans) { u64 iters_to_unlink; @@ -1935,7 +1936,9 @@ void __bch2_trans_begin(struct btree_trans *trans) bch2_btree_iter_traverse_all(trans); } -void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c) +void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, + unsigned expected_nr_iters, + size_t expected_mem_bytes) { memset(trans, 0, offsetof(struct btree_trans, iters_onstack)); @@ -1944,12 +1947,20 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c) trans->size = ARRAY_SIZE(trans->iters_onstack); trans->iters = trans->iters_onstack; trans->updates = trans->updates_onstack; + trans->fs_usage_deltas = NULL; + + if (expected_nr_iters > trans->size) + bch2_trans_realloc_iters(trans, expected_nr_iters); + + if (expected_mem_bytes) + bch2_trans_preload_mem(trans, expected_mem_bytes); } int bch2_trans_exit(struct btree_trans *trans) { bch2_trans_unlock(trans); + kfree(trans->fs_usage_deltas); kfree(trans->mem); if (trans->used_mempool) mempool_free(trans->iters, &trans->c->btree_iters_pool); diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index a46a6a4e..177cc314 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -18,6 +18,19 @@ static inline struct btree *btree_iter_node(struct btree_iter *iter, return level < BTREE_MAX_DEPTH ? iter->l[level].b : NULL; } +static inline bool btree_node_lock_seq_matches(const struct btree_iter *iter, + const struct btree *b, unsigned level) +{ + /* + * We don't compare the low bits of the lock sequence numbers because + * @iter might have taken a write lock on @b, and we don't want to skip + * the linked iterator if the sequence numbers were equal before taking + * that write lock. The lock sequence number is incremented by taking + * and releasing write locks and is even when unlocked: + */ + return iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1; +} + static inline struct btree *btree_node_parent(struct btree_iter *iter, struct btree *b) { @@ -56,30 +69,20 @@ __trans_next_iter(struct btree_trans *trans, unsigned idx) static inline bool __iter_has_node(const struct btree_iter *iter, const struct btree *b) { - /* - * We don't compare the low bits of the lock sequence numbers because - * @iter might have taken a write lock on @b, and we don't want to skip - * the linked iterator if the sequence numbers were equal before taking - * that write lock. 
The lock sequence number is incremented by taking - * and releasing write locks and is even when unlocked: - */ - return iter->l[b->level].b == b && - iter->l[b->level].lock_seq >> 1 == b->lock.state.seq >> 1; + btree_node_lock_seq_matches(iter, b, b->level); } static inline struct btree_iter * __trans_next_iter_with_node(struct btree_trans *trans, struct btree *b, unsigned idx) { - EBUG_ON(idx < trans->nr_iters && trans->iters[idx].idx != idx); + struct btree_iter *iter = __trans_next_iter(trans, idx); - for (; idx < trans->nr_iters; idx++) - if ((trans->iters_linked & (1ULL << idx)) && - __iter_has_node(&trans->iters[idx], b)) - return &trans->iters[idx]; + while (iter && !__iter_has_node(iter, b)) + iter = __trans_next_iter(trans, iter->idx + 1); - return NULL; + return iter; } #define trans_for_each_iter_with_node(_trans, _b, _iter) \ @@ -101,22 +104,19 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, struct btree_node_iter *, struct bkey_packed *, unsigned, unsigned); -int bch2_btree_iter_unlock(struct btree_iter *); - -bool bch2_btree_trans_relock(struct btree_trans *); -void bch2_btree_trans_unlock(struct btree_trans *); +bool bch2_trans_relock(struct btree_trans *); +void bch2_trans_unlock(struct btree_trans *); bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned); static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, - unsigned new_locks_want, - bool may_drop_locks) + unsigned new_locks_want) { new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); return iter->locks_want < new_locks_want - ? (may_drop_locks + ? (!iter->trans->nounlock ? __bch2_btree_iter_upgrade(iter, new_locks_want) : __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want)) : iter->uptodate <= BTREE_ITER_NEED_PEEK; @@ -157,7 +157,7 @@ static inline struct bpos btree_type_successor(enum btree_id id, if (id == BTREE_ID_INODES) { pos.inode++; pos.offset = 0; - } else if (id != BTREE_ID_EXTENTS) { + } else if (!btree_node_type_is_extents(id)) { pos = bkey_successor(pos); } @@ -170,7 +170,7 @@ static inline struct bpos btree_type_predecessor(enum btree_id id, if (id == BTREE_ID_INODES) { --pos.inode; pos.offset = 0; - } else /* if (id != BTREE_ID_EXTENTS) */ { + } else { pos = bkey_predecessor(pos); } @@ -192,19 +192,18 @@ static inline int btree_iter_cmp(const struct btree_iter *l, return __btree_iter_cmp(l->btree_id, l->pos, r); } -int bch2_trans_unlock(struct btree_trans *); - /* * Unlocks before scheduling * Note: does not revalidate iterator */ -static inline void bch2_trans_cond_resched(struct btree_trans *trans) +static inline int bch2_trans_cond_resched(struct btree_trans *trans) { - if (need_resched()) { + if (need_resched() || race_fault()) { bch2_trans_unlock(trans); schedule(); - } else if (race_fault()) { - bch2_trans_unlock(trans); + return bch2_trans_relock(trans) ? 0 : -EINTR; + } else { + return 0; } } @@ -232,8 +231,6 @@ static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, unsigned flags) { - bch2_trans_cond_resched(iter->trans); - return flags & BTREE_ITER_SLOTS ? 
bch2_btree_iter_next_slot(iter) : bch2_btree_iter_next(iter); @@ -262,7 +259,6 @@ static inline int bkey_err(struct bkey_s_c k) /* new multiple iterator interface: */ -void bch2_trans_preload_iters(struct btree_trans *); int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); int bch2_trans_iter_free_on_commit(struct btree_trans *, struct btree_iter *); @@ -297,7 +293,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, enum btree_id, struct bpos, unsigned, unsigned, unsigned); -void __bch2_trans_begin(struct btree_trans *); +void bch2_trans_begin(struct btree_trans *); static inline void bch2_trans_begin_updates(struct btree_trans *trans) { @@ -305,27 +301,7 @@ static inline void bch2_trans_begin_updates(struct btree_trans *trans) } void *bch2_trans_kmalloc(struct btree_trans *, size_t); -void bch2_trans_init(struct btree_trans *, struct bch_fs *); +void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); int bch2_trans_exit(struct btree_trans *); -#ifdef TRACE_TRANSACTION_RESTARTS -#define bch2_trans_begin(_trans) \ -do { \ - if (is_power_of_2((_trans)->nr_restarts) && \ - (_trans)->nr_restarts >= 8) \ - pr_info("nr restarts: %zu", (_trans)->nr_restarts); \ - \ - (_trans)->nr_restarts++; \ - __bch2_trans_begin(_trans); \ -} while (0) -#else -#define bch2_trans_begin(_trans) __bch2_trans_begin(_trans) -#endif - -#ifdef TRACE_TRANSACTION_RESTARTS_ALL -#define trans_restart(...) pr_info("transaction restart" __VA_ARGS__) -#else -#define trans_restart(...) no_printk("transaction restart" __VA_ARGS__) -#endif - #endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index e9686197..35289b0c 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -107,7 +107,7 @@ static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level) static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) { - BUG_ON(!level && iter->flags & BTREE_ITER_NOUNLOCK); + EBUG_ON(!level && iter->trans->nounlock); __btree_node_unlock(iter, level); } @@ -175,20 +175,18 @@ static inline bool btree_node_lock_increment(struct btree_iter *iter, } bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, - struct btree_iter *, enum six_lock_type, bool); + struct btree_iter *, enum six_lock_type); static inline bool btree_node_lock(struct btree *b, struct bpos pos, unsigned level, struct btree_iter *iter, - enum six_lock_type type, - bool may_drop_locks) + enum six_lock_type type) { EBUG_ON(level >= BTREE_MAX_DEPTH); return likely(six_trylock_type(&b->lock, type)) || btree_node_lock_increment(iter, b, level, type) || - __bch2_btree_node_lock(b, pos, level, iter, - type, may_drop_locks); + __bch2_btree_node_lock(b, pos, level, iter, type); } bool __bch2_btree_node_relock(struct btree_iter *, unsigned); diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 57ef5014..f2641d56 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -193,7 +193,6 @@ enum btree_iter_type { */ #define BTREE_ITER_IS_EXTENTS (1 << 4) #define BTREE_ITER_ERROR (1 << 5) -#define BTREE_ITER_NOUNLOCK (1 << 6) enum btree_iter_uptodate { BTREE_ITER_UPTODATE = 0, @@ -269,7 +268,6 @@ struct btree_insert_entry { struct btree_trans { struct bch_fs *c; unsigned long ip; - size_t nr_restarts; u64 commit_start; u64 iters_linked; @@ -283,6 +281,7 @@ struct btree_trans { u8 size; unsigned 
used_mempool:1; unsigned error:1; + unsigned nounlock:1; unsigned mem_top; unsigned mem_bytes; @@ -297,11 +296,12 @@ struct btree_trans { u64 *journal_seq; struct disk_reservation *disk_res; unsigned flags; + unsigned journal_u64s; struct btree_iter iters_onstack[2]; struct btree_insert_entry updates_onstack[6]; - struct replicas_delta_list fs_usage_deltas; + struct replicas_delta_list *fs_usage_deltas; }; #define BTREE_FLAG(flag) \ diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index be11efdc..32e30f75 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -47,6 +47,7 @@ enum { __BTREE_INSERT_NOMARK, __BTREE_INSERT_MARK_INMEM, __BTREE_INSERT_NO_CLEAR_REPLICAS, + __BTREE_INSERT_BUCKET_INVALIDATE, __BTREE_INSERT_NOWAIT, __BTREE_INSERT_GC_LOCK_HELD, __BCH_HASH_SET_MUST_CREATE, @@ -93,6 +94,8 @@ enum { #define BTREE_INSERT_NO_CLEAR_REPLICAS (1 << __BTREE_INSERT_NO_CLEAR_REPLICAS) +#define BTREE_INSERT_BUCKET_INVALIDATE (1 << __BTREE_INSERT_BUCKET_INVALIDATE) + /* Don't block on allocation failure (for new btree nodes: */ #define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) #define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) @@ -105,6 +108,8 @@ int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct disk_reservation *, u64 *, int flags); +int bch2_btree_delete_at_range(struct btree_trans *, struct btree_iter *, + struct bpos, u64 *); int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct bpos, u64 *); @@ -125,7 +130,7 @@ struct btree_insert_entry *bch2_trans_update(struct btree_trans *, struct btree_trans trans; \ int _ret; \ \ - bch2_trans_init(&trans, (_c)); \ + bch2_trans_init(&trans, (_c), 0, 0); \ \ do { \ bch2_trans_begin(&trans); \ diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index fb6bf79a..c6920b63 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -193,7 +193,9 @@ found: : gc_pos_btree_root(as->btree_id)) >= 0 && gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key), - false, 0, NULL, 0, BCH_BUCKET_MARK_GC); + 0, NULL, 0, + BCH_BUCKET_MARK_OVERWRITE| + BCH_BUCKET_MARK_GC); } static void __btree_node_free(struct bch_fs *c, struct btree *b) @@ -263,13 +265,13 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c, { BUG_ON(!pending->index_update_done); - bch2_mark_key(c, bkey_i_to_s_c(&pending->key), - false, 0, - NULL, 0, 0); + bch2_mark_key(c, bkey_i_to_s_c(&pending->key), 0, NULL, 0, + BCH_BUCKET_MARK_OVERWRITE); if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE))) - bch2_mark_key(c, bkey_i_to_s_c(&pending->key), - false, 0, NULL, 0, BCH_BUCKET_MARK_GC); + bch2_mark_key(c, bkey_i_to_s_c(&pending->key), 0, NULL, 0, + BCH_BUCKET_MARK_OVERWRITE| + BCH_BUCKET_MARK_GC); } static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, @@ -1074,10 +1076,12 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), - true, 0, fs_usage, 0, 0); + 0, fs_usage, 0, + BCH_BUCKET_MARK_INSERT); if (gc_visited(c, gc_pos_btree_root(b->btree_id))) bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), - true, 0, NULL, 0, + 0, NULL, 0, + BCH_BUCKET_MARK_INSERT| BCH_BUCKET_MARK_GC); if (old && !btree_node_fake(old)) @@ -1170,11 +1174,14 @@ static void 
bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(insert), - true, 0, fs_usage, 0, 0); + 0, fs_usage, 0, + BCH_BUCKET_MARK_INSERT); if (gc_visited(c, gc_pos_btree_node(b))) bch2_mark_key_locked(c, bkey_i_to_s_c(insert), - true, 0, NULL, 0, BCH_BUCKET_MARK_GC); + 0, NULL, 0, + BCH_BUCKET_MARK_INSERT| + BCH_BUCKET_MARK_GC); while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && bkey_iter_pos_cmp(b, &insert->k.p, k) > 0) @@ -1550,6 +1557,7 @@ split: int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, unsigned flags) { + struct btree_trans *trans = iter->trans; struct btree *b = iter->l[0].b; struct btree_update *as; struct closure cl; @@ -1560,7 +1568,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, * We already have a disk reservation and open buckets pinned; this * allocation must not block: */ - trans_for_each_iter(iter->trans, linked) + trans_for_each_iter(trans, linked) if (linked->btree_id == BTREE_ID_EXTENTS) flags |= BTREE_INSERT_USE_RESERVE; @@ -1572,10 +1580,10 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, if (flags & BTREE_INSERT_NOUNLOCK) return -EINTR; - bch2_btree_trans_unlock(iter->trans); + bch2_trans_unlock(trans); down_read(&c->gc_lock); - if (!bch2_btree_trans_relock(iter->trans)) + if (!bch2_trans_relock(trans)) ret = -EINTR; } @@ -1583,8 +1591,8 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, * XXX: figure out how far we might need to split, * instead of locking/reserving all the way to the root: */ - if (!bch2_btree_iter_upgrade(iter, U8_MAX, - !(flags & BTREE_INSERT_NOUNLOCK))) { + if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { + trace_trans_restart_iter_upgrade(trans->ip); ret = -EINTR; goto out; } @@ -1596,7 +1604,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, ret = PTR_ERR(as); if (ret == -EAGAIN) { BUG_ON(flags & BTREE_INSERT_NOUNLOCK); - bch2_btree_iter_unlock(iter); + bch2_trans_unlock(trans); ret = -EINTR; } goto out; @@ -1623,6 +1631,7 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c, unsigned flags, enum btree_node_sibling sib) { + struct btree_trans *trans = iter->trans; struct btree_update *as; struct bkey_format_state new_s; struct bkey_format new_f; @@ -1646,8 +1655,7 @@ retry: goto out; /* XXX: can't be holding read locks */ - m = bch2_btree_node_get_sibling(c, iter, b, - !(flags & BTREE_INSERT_NOUNLOCK), sib); + m = bch2_btree_node_get_sibling(c, iter, b, sib); if (IS_ERR(m)) { ret = PTR_ERR(m); goto err; @@ -1694,8 +1702,7 @@ retry: !down_read_trylock(&c->gc_lock)) goto err_cycle_gc_lock; - if (!bch2_btree_iter_upgrade(iter, U8_MAX, - !(flags & BTREE_INSERT_NOUNLOCK))) { + if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { ret = -EINTR; goto err_unlock; } @@ -1757,7 +1764,7 @@ retry: if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) up_read(&c->gc_lock); out: - bch2_btree_trans_verify_locks(iter->trans); + bch2_btree_trans_verify_locks(trans); /* * Don't downgrade locks here: we're called after successful insert, @@ -1777,7 +1784,7 @@ err_cycle_gc_lock: if (flags & BTREE_INSERT_NOUNLOCK) goto out; - bch2_btree_iter_unlock(iter); + bch2_trans_unlock(trans); down_read(&c->gc_lock); up_read(&c->gc_lock); @@ -1793,7 +1800,7 @@ err: if ((ret == -EAGAIN || ret == -EINTR) && !(flags & BTREE_INSERT_NOUNLOCK)) { - bch2_btree_iter_unlock(iter); + bch2_trans_unlock(trans); closure_sync(&cl); ret = bch2_btree_iter_traverse(iter); if (ret) @@ -1860,6 
+1867,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, __le64 seq, unsigned flags) { + struct btree_trans *trans = iter->trans; struct closure cl; struct btree *b; int ret; @@ -1868,11 +1876,11 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, closure_init_stack(&cl); - bch2_btree_iter_upgrade(iter, U8_MAX, true); + bch2_btree_iter_upgrade(iter, U8_MAX); if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) { if (!down_read_trylock(&c->gc_lock)) { - bch2_btree_iter_unlock(iter); + bch2_trans_unlock(trans); down_read(&c->gc_lock); } } @@ -1891,7 +1899,7 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, ret != -EINTR) break; - bch2_btree_iter_unlock(iter); + bch2_trans_unlock(trans); closure_sync(&cl); } @@ -1994,10 +2002,12 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), - true, 0, fs_usage, 0, 0); + 0, fs_usage, 0, + BCH_BUCKET_MARK_INSERT); if (gc_visited(c, gc_pos_btree_root(b->btree_id))) bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), - true, 0, NULL, 0, + 0, NULL, 0, + BCH_BUCKET_MARK_INSERT|| BCH_BUCKET_MARK_GC); bch2_btree_node_free_index(as, NULL, @@ -2040,14 +2050,14 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, closure_init_stack(&cl); - if (!bch2_btree_iter_upgrade(iter, U8_MAX, true)) + if (!bch2_btree_iter_upgrade(iter, U8_MAX)) return -EINTR; if (!down_read_trylock(&c->gc_lock)) { - bch2_btree_trans_unlock(iter->trans); + bch2_trans_unlock(iter->trans); down_read(&c->gc_lock); - if (!bch2_btree_trans_relock(iter->trans)) { + if (!bch2_trans_relock(iter->trans)) { ret = -EINTR; goto err; } @@ -2058,12 +2068,12 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, /* bch2_btree_reserve_get will unlock */ ret = bch2_btree_cache_cannibalize_lock(c, &cl); if (ret) { - bch2_btree_trans_unlock(iter->trans); + bch2_trans_unlock(iter->trans); up_read(&c->gc_lock); closure_sync(&cl); down_read(&c->gc_lock); - if (!bch2_btree_trans_relock(iter->trans)) { + if (!bch2_trans_relock(iter->trans)) { ret = -EINTR; goto err; } @@ -2087,12 +2097,12 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, if (ret != -EINTR) goto err; - bch2_btree_trans_unlock(iter->trans); + bch2_trans_unlock(iter->trans); up_read(&c->gc_lock); closure_sync(&cl); down_read(&c->gc_lock); - if (!bch2_btree_trans_relock(iter->trans)) + if (!bch2_trans_relock(iter->trans)) goto err; } diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index dde1fc1f..250aae47 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -430,16 +430,15 @@ static int bch2_trans_journal_preres_get(struct btree_trans *trans) if (ret != -EAGAIN) return ret; - bch2_btree_trans_unlock(trans); + bch2_trans_unlock(trans); ret = bch2_journal_preres_get(&c->journal, &trans->journal_preres, u64s, 0); if (ret) return ret; - if (!bch2_btree_trans_relock(trans)) { - trans_restart(" (iter relock after journal preres get blocked)"); - trace_trans_restart_journal_preres_get(c, trans->ip); + if (!bch2_trans_relock(trans)) { + trace_trans_restart_journal_preres_get(trans->ip); return -EINTR; } @@ -450,21 +449,13 @@ static int bch2_trans_journal_res_get(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - unsigned u64s = 0; 
int ret; - if (unlikely(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) - return 0; - if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) flags |= JOURNAL_RES_GET_RESERVED; - trans_for_each_update(trans, i) - u64s += jset_u64s(i->k->k.u64s); - ret = bch2_journal_res_get(&c->journal, &trans->journal_res, - u64s, flags); + trans->journal_u64s, flags); return ret == -EAGAIN ? BTREE_INSERT_NEED_JOURNAL_RES : ret; } @@ -550,33 +541,29 @@ static inline int do_btree_insert_at(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_fs_usage *fs_usage = NULL; struct btree_insert_entry *i; - struct btree_iter *linked; + unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE + ? BCH_BUCKET_MARK_BUCKET_INVALIDATE + : 0; int ret; - if (likely(!(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS))) { - memset(&trans->fs_usage_deltas.fs_usage, 0, - sizeof(trans->fs_usage_deltas.fs_usage)); - trans->fs_usage_deltas.top = trans->fs_usage_deltas.d; - } - trans_for_each_update_iter(trans, i) BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); trans_for_each_update_iter(trans, i) if (update_has_triggers(trans, i) && update_triggers_transactional(trans, i)) { - ret = bch2_trans_mark_update(trans, i, - &trans->fs_usage_deltas); + ret = bch2_trans_mark_update(trans, i); + if (ret == -EINTR) + trace_trans_restart_mark(trans->ip); if (ret) - return ret; + goto out_clear_replicas; } btree_trans_lock_write(c, trans); if (race_fault()) { ret = -EINTR; - trans_restart(" (race)"); - trace_trans_restart_fault_inject(c, trans->ip); + trace_trans_restart_fault_inject(trans->ip); goto out; } @@ -610,9 +597,16 @@ static inline int do_btree_insert_at(struct btree_trans *trans, * Don't get journal reservation until after we know insert will * succeed: */ - ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_NONBLOCK); - if (ret) - goto out; + if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + trans->journal_u64s = 0; + + trans_for_each_update(trans, i) + trans->journal_u64s += jset_u64s(i->k->k.u64s); + + ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_NONBLOCK); + if (ret) + goto out; + } if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { if (journal_seq_verify(c)) @@ -623,33 +617,24 @@ static inline int do_btree_insert_at(struct btree_trans *trans, i->k->k.version = MAX_VERSION; } - if (trans->flags & BTREE_INSERT_NOUNLOCK) { - /* - * linked iterators that weren't being updated may or may not - * have been traversed/locked, depending on what the caller was - * doing: - */ - trans_for_each_iter(trans, linked) - if (linked->uptodate < BTREE_ITER_NEED_RELOCK) - linked->flags |= BTREE_ITER_NOUNLOCK; - } - trans_for_each_update_iter(trans, i) if (update_has_triggers(trans, i) && !update_triggers_transactional(trans, i)) - bch2_mark_update(trans, i, fs_usage, 0); + bch2_mark_update(trans, i, fs_usage, mark_flags); - if (fs_usage) { + if (fs_usage && trans->fs_usage_deltas) bch2_replicas_delta_list_apply(c, fs_usage, - &trans->fs_usage_deltas); + trans->fs_usage_deltas); + + if (fs_usage) bch2_trans_fs_usage_apply(trans, fs_usage); - } if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && unlikely(c->gc_pos.phase)) trans_for_each_update_iter(trans, i) if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) bch2_mark_update(trans, i, NULL, + mark_flags| BCH_BUCKET_MARK_GC); trans_for_each_update(trans, i) @@ -667,6 +652,12 @@ out: } bch2_journal_res_put(&c->journal, &trans->journal_res); +out_clear_replicas: + if (trans->fs_usage_deltas) { + memset(&trans->fs_usage_deltas->fs_usage, 0, + 
sizeof(trans->fs_usage_deltas->fs_usage)); + trans->fs_usage_deltas->used = 0; + } return ret; } @@ -725,9 +716,10 @@ int bch2_trans_commit_error(struct btree_trans *trans, * don't care if we got ENOSPC because we told split it * couldn't block: */ - if (!ret || (flags & BTREE_INSERT_NOUNLOCK)) { - trans_restart(" (split)"); - trace_trans_restart_btree_node_split(c, trans->ip); + if (!ret || + ret == -EINTR || + (flags & BTREE_INSERT_NOUNLOCK)) { + trace_trans_restart_btree_node_split(trans->ip); ret = -EINTR; } break; @@ -743,25 +735,23 @@ int bch2_trans_commit_error(struct btree_trans *trans, return ret; } - if (bch2_btree_trans_relock(trans)) + if (bch2_trans_relock(trans)) return 0; - trans_restart(" (iter relock after marking replicas)"); - trace_trans_restart_mark_replicas(c, trans->ip); + trace_trans_restart_mark_replicas(trans->ip); ret = -EINTR; break; case BTREE_INSERT_NEED_JOURNAL_RES: - bch2_btree_trans_unlock(trans); + bch2_trans_unlock(trans); ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK); if (ret) return ret; - if (bch2_btree_trans_relock(trans)) + if (bch2_trans_relock(trans)) return 0; - trans_restart(" (iter relock after journal res get blocked)"); - trace_trans_restart_journal_res_get(c, trans->ip); + trace_trans_restart_journal_res_get(trans->ip); ret = -EINTR; break; default: @@ -773,8 +763,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, int ret2 = bch2_btree_iter_traverse_all(trans); if (ret2) { - trans_restart(" (traverse)"); - trace_trans_restart_traverse(c, trans->ip); + trace_trans_restart_traverse(trans->ip); return ret2; } @@ -785,8 +774,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (!(flags & BTREE_INSERT_ATOMIC)) return 0; - trans_restart(" (atomic)"); - trace_trans_restart_atomic(c, trans->ip); + trace_trans_restart_atomic(trans->ip); } return ret; @@ -808,16 +796,11 @@ static int __bch2_trans_commit(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_insert_entry *i; - struct btree_iter *linked; int ret; trans_for_each_update_iter(trans, i) { - unsigned old_locks_want = i->iter->locks_want; - unsigned old_uptodate = i->iter->uptodate; - - if (!bch2_btree_iter_upgrade(i->iter, 1, true)) { - trans_restart(" (failed upgrade, locks_want %u uptodate %u)", - old_locks_want, old_uptodate); + if (!bch2_btree_iter_upgrade(i->iter, 1)) { + trace_trans_restart_upgrade(trans->ip); ret = -EINTR; goto err; } @@ -831,18 +814,20 @@ static int __bch2_trans_commit(struct btree_trans *trans, if (unlikely(ret)) goto err; + if (trans->flags & BTREE_INSERT_NOUNLOCK) + trans->nounlock = true; + trans_for_each_update_leaf(trans, i) bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags); + trans->nounlock = false; + trans_for_each_update_iter(trans, i) bch2_btree_iter_downgrade(i->iter); err: /* make sure we didn't drop or screw up locks: */ bch2_btree_trans_verify_locks(trans); - trans_for_each_iter(trans, linked) - linked->flags &= ~BTREE_ITER_NOUNLOCK; - return ret; } @@ -883,7 +868,7 @@ int bch2_trans_commit(struct btree_trans *trans, if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) return -EROFS; - bch2_btree_trans_unlock(trans); + bch2_trans_unlock(trans); ret = bch2_fs_read_write_early(c); if (ret) @@ -891,7 +876,7 @@ int bch2_trans_commit(struct btree_trans *trans, percpu_ref_get(&c->writes); - if (!bch2_btree_trans_relock(trans)) { + if (!bch2_trans_relock(trans)) { ret = -EINTR; goto err; } @@ -965,20 +950,6 @@ struct btree_insert_entry *bch2_trans_update(struct btree_trans *trans, return i; } -int 
bch2_btree_delete_at(struct btree_trans *trans, - struct btree_iter *iter, unsigned flags) -{ - struct bkey_i k; - - bkey_init(&k.k); - k.k.p = iter->pos; - - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &k)); - return bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE|flags); -} - /** * bch2_btree_insert - insert keys into the extent btree * @c: pointer to struct bch_fs @@ -995,7 +966,9 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct btree_iter *iter; int ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); iter = bch2_trans_get_iter(&trans, id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); @@ -1003,35 +976,24 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, k)); ret = bch2_trans_commit(&trans, disk_res, journal_seq, flags); + if (ret == -EINTR) + goto retry; bch2_trans_exit(&trans); return ret; } -/* - * bch_btree_delete_range - delete everything within a given range - * - * Range is a half open interval - [start, end) - */ -int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, - struct bpos start, struct bpos end, - u64 *journal_seq) +int bch2_btree_delete_at_range(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos end, + u64 *journal_seq) { - struct btree_trans trans; - struct btree_iter *iter; struct bkey_s_c k; int ret = 0; - - bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); - - iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT); - +retry: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && bkey_cmp(iter->pos, end) < 0) { - unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); - /* really shouldn't be using a bare, unpadded bkey_i */ struct bkey_i delete; bkey_init(&delete.k); @@ -1049,26 +1011,72 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, delete.k.p = iter->pos; if (iter->flags & BTREE_ITER_IS_EXTENTS) { + unsigned max_sectors = + KEY_SIZE_MAX & (~0 << trans->c->block_bits); + /* create the biggest key we can */ bch2_key_resize(&delete.k, max_sectors); bch2_cut_back(end, &delete.k); bch2_extent_trim_atomic(&delete, iter); } - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &delete)); - - ret = bch2_trans_commit(&trans, NULL, journal_seq, + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &delete)); + ret = bch2_trans_commit(trans, NULL, journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL); - if (ret == -EINTR) - ret = 0; if (ret) break; - bch2_trans_cond_resched(&trans); + bch2_trans_cond_resched(trans); } - bch2_trans_exit(&trans); + if (ret == -EINTR) { + ret = 0; + goto retry; + } + + return ret; + +} + +int bch2_btree_delete_at(struct btree_trans *trans, + struct btree_iter *iter, unsigned flags) +{ + struct bkey_i k; + + bkey_init(&k.k); + k.k.p = iter->pos; + + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &k)); + return bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE|flags); +} + +/* + * bch_btree_delete_range - delete everything within a given range + * + * Range is a half open interval - [start, end) + */ +int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, + struct bpos start, struct bpos end, + u64 *journal_seq) +{ + struct btree_trans trans; + struct btree_iter *iter; + int ret = 0; + + /* + * XXX: whether we need mem/more iters depends on whether this btree id + * has triggers + */ + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); + + 
iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT); + + ret = bch2_btree_delete_at_range(&trans, iter, end, journal_seq); + ret = bch2_trans_exit(&trans) ?: ret; + BUG_ON(ret == -EINTR); return ret; } diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 9f09e5be..3cfe684a 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -495,9 +495,11 @@ void bch2_dev_usage_from_buckets(struct bch_fs *c) buckets = bucket_array(ca); + preempt_disable(); for_each_bucket(g, buckets) bch2_dev_usage_update(c, ca, c->usage_base, old, g->mark, false); + preempt_enable(); } } @@ -544,6 +546,67 @@ static inline void update_cached_sectors(struct bch_fs *c, update_replicas(c, fs_usage, &r.e, sectors); } +static struct replicas_delta_list * +replicas_deltas_realloc(struct btree_trans *trans, unsigned more) +{ + struct replicas_delta_list *d = trans->fs_usage_deltas; + unsigned new_size = d ? (d->size + more) * 2 : 128; + + if (!d || d->used + more > d->size) { + d = krealloc(d, sizeof(*d) + new_size, GFP_NOIO|__GFP_ZERO); + BUG_ON(!d); + + d->size = new_size; + trans->fs_usage_deltas = d; + } + return d; +} + +static inline void update_replicas_list(struct btree_trans *trans, + struct bch_replicas_entry *r, + s64 sectors) +{ + struct replicas_delta_list *d; + struct replicas_delta *n; + unsigned b = replicas_entry_bytes(r) + 8; + + d = replicas_deltas_realloc(trans, b); + + n = (void *) d->d + d->used; + n->delta = sectors; + memcpy(&n->r, r, replicas_entry_bytes(r)); + d->used += b; +} + +static inline void update_cached_sectors_list(struct btree_trans *trans, + unsigned dev, s64 sectors) +{ + struct bch_replicas_padded r; + + bch2_replicas_entry_cached(&r.e, dev); + + update_replicas_list(trans, &r.e, sectors); +} + +void bch2_replicas_delta_list_apply(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + struct replicas_delta_list *r) +{ + struct replicas_delta *d = r->d; + struct replicas_delta *top = (void *) r->d + r->used; + + acc_u64s((u64 *) fs_usage, + (u64 *) &r->fs_usage, sizeof(*fs_usage) / sizeof(u64)); + + while (d != top) { + BUG_ON((void *) d > (void *) top); + + update_replicas(c, fs_usage, &d->r, d->delta); + + d = (void *) d + replicas_entry_bytes(&d->r) + 8; + } +} + #define do_mark_fn(fn, c, pos, flags, ...) 
\ ({ \ int gc, ret = 0; \ @@ -623,23 +686,20 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, } static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, - bool inserting, struct bch_fs_usage *fs_usage, - unsigned journal_seq, unsigned flags, - bool gc) + u64 journal_seq, unsigned flags) { + bool gc = flags & BCH_BUCKET_MARK_GC; struct bkey_alloc_unpacked u; struct bch_dev *ca; struct bucket *g; struct bucket_mark old, m; - if (!inserting) - return 0; - /* * alloc btree is read in by bch2_alloc_read, not gc: */ - if (flags & BCH_BUCKET_MARK_GC) + if ((flags & BCH_BUCKET_MARK_GC) && + !(flags & BCH_BUCKET_MARK_BUCKET_INVALIDATE)) return 0; ca = bch_dev_bkey_exists(c, k.k->p.inode); @@ -650,18 +710,21 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, g = __bucket(ca, k.k->p.offset, gc); u = bch2_alloc_unpack(k); - old = bucket_data_cmpxchg(c, ca, fs_usage, g, m, ({ + old = bucket_cmpxchg(g, m, ({ m.gen = u.gen; m.data_type = u.data_type; m.dirty_sectors = u.dirty_sectors; m.cached_sectors = u.cached_sectors; - if (!(flags & BCH_BUCKET_MARK_GC)) { + if (journal_seq) { m.journal_seq_valid = 1; m.journal_seq = journal_seq; } })); + if (!(flags & BCH_BUCKET_MARK_ALLOC_READ)) + bch2_dev_usage_update(c, ca, fs_usage, old, m, gc); + g->io_time[READ] = u.read_time; g->io_time[WRITE] = u.write_time; g->oldest_gen = u.oldest_gen; @@ -672,7 +735,8 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, * not: */ - if (old.cached_sectors) { + if ((flags & BCH_BUCKET_MARK_BUCKET_INVALIDATE) && + old.cached_sectors) { update_cached_sectors(c, fs_usage, ca->dev_idx, -old.cached_sectors); trace_invalidate(ca, bucket_to_sector(ca, k.k->p.offset), @@ -759,11 +823,12 @@ static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, static void bucket_set_stripe(struct bch_fs *c, const struct bch_stripe *v, - bool enabled, struct bch_fs_usage *fs_usage, u64 journal_seq, - bool gc) + unsigned flags) { + bool enabled = !(flags & BCH_BUCKET_MARK_OVERWRITE); + bool gc = flags & BCH_BUCKET_MARK_GC; unsigned i; for (i = 0; i < v->nr_blocks; i++) { @@ -789,9 +854,9 @@ static bool bch2_mark_pointer(struct bch_fs *c, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type, struct bch_fs_usage *fs_usage, - unsigned journal_seq, unsigned flags, - bool gc) + u64 journal_seq, unsigned flags) { + bool gc = flags & BCH_BUCKET_MARK_GC; struct bucket_mark old, new; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); @@ -858,9 +923,9 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, struct bch_extent_stripe_ptr p, enum bch_data_type data_type, struct bch_fs_usage *fs_usage, - s64 sectors, unsigned flags, - bool gc) + s64 sectors, unsigned flags) { + bool gc = flags & BCH_BUCKET_MARK_GC; struct stripe *m; unsigned old, new, nr_data; int blocks_nonempty_delta; @@ -913,8 +978,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, s64 sectors, enum bch_data_type data_type, struct bch_fs_usage *fs_usage, - unsigned journal_seq, unsigned flags, - bool gc) + unsigned journal_seq, unsigned flags) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -935,7 +999,7 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, ? 
sectors : ptr_disk_sectors_delta(p, sectors); bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type, - fs_usage, journal_seq, flags, gc); + fs_usage, journal_seq, flags); if (p.ptr.cached) { if (disk_sectors && !stale) @@ -948,7 +1012,7 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, for (i = 0; i < p.ec_nr; i++) { ret = bch2_mark_stripe_ptr(c, p.ec[i], data_type, fs_usage, - disk_sectors, flags, gc); + disk_sectors, flags); if (ret) return ret; } @@ -964,11 +1028,10 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, } static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, - bool inserting, struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags, - bool gc) + u64 journal_seq, unsigned flags) { + bool gc = flags & BCH_BUCKET_MARK_GC; struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); size_t idx = s.k->p.offset; struct stripe *m = genradix_ptr(&c->stripes[gc], idx); @@ -976,19 +1039,14 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, spin_lock(&c->ec_stripes_heap_lock); - if (!m || (!inserting && !m->alive)) { + if (!m || ((flags & BCH_BUCKET_MARK_OVERWRITE) && !m->alive)) { spin_unlock(&c->ec_stripes_heap_lock); bch_err_ratelimited(c, "error marking nonexistent stripe %zu", idx); return -1; } - if (!gc && m->alive) - bch2_stripes_heap_del(c, m, idx); - - memset(m, 0, sizeof(*m)); - - if (inserting) { + if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) { m->sectors = le16_to_cpu(s.v->sectors); m->algorithm = s.v->algorithm; m->nr_blocks = s.v->nr_blocks; @@ -996,11 +1054,11 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, bch2_bkey_to_replicas(&m->r.e, k); - /* - * XXX: account for stripes somehow here - */ + /* + * XXX: account for stripes somehow here + */ #if 0 - update_replicas(c, fs_usage, &m->r.e, stripe_sectors); + update_replicas(c, fs_usage, &m->r.e, stripe_sectors); #endif /* gc recalculates these fields: */ @@ -1013,53 +1071,54 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, } if (!gc) - bch2_stripes_heap_insert(c, m, idx); - else - m->alive = true; + bch2_stripes_heap_update(c, m, idx); + m->alive = true; + } else { + if (!gc) + bch2_stripes_heap_del(c, m, idx); + memset(m, 0, sizeof(*m)); } spin_unlock(&c->ec_stripes_heap_lock); - bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc); + bucket_set_stripe(c, s.v, fs_usage, 0, flags); return 0; } int bch2_mark_key_locked(struct bch_fs *c, - struct bkey_s_c k, - bool inserting, s64 sectors, + struct bkey_s_c k, s64 sectors, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { - bool gc = flags & BCH_BUCKET_MARK_GC; int ret = 0; preempt_disable(); - if (!fs_usage || gc) - fs_usage = fs_usage_ptr(c, journal_seq, gc); + if (!fs_usage || (flags & BCH_BUCKET_MARK_GC)) + fs_usage = fs_usage_ptr(c, journal_seq, + flags & BCH_BUCKET_MARK_GC); switch (k.k->type) { case KEY_TYPE_alloc: - ret = bch2_mark_alloc(c, k, inserting, - fs_usage, journal_seq, flags, gc); + ret = bch2_mark_alloc(c, k, fs_usage, journal_seq, flags); break; case KEY_TYPE_btree_ptr: - ret = bch2_mark_extent(c, k, inserting - ? c->opts.btree_node_size - : -c->opts.btree_node_size, - BCH_DATA_BTREE, - fs_usage, journal_seq, flags, gc); + sectors = !(flags & BCH_BUCKET_MARK_OVERWRITE) + ? 
c->opts.btree_node_size + : -c->opts.btree_node_size; + + ret = bch2_mark_extent(c, k, sectors, BCH_DATA_BTREE, + fs_usage, journal_seq, flags); break; case KEY_TYPE_extent: ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER, - fs_usage, journal_seq, flags, gc); + fs_usage, journal_seq, flags); break; case KEY_TYPE_stripe: - ret = bch2_mark_stripe(c, k, inserting, - fs_usage, journal_seq, flags, gc); + ret = bch2_mark_stripe(c, k, fs_usage, journal_seq, flags); break; case KEY_TYPE_inode: - if (inserting) + if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) fs_usage->nr_inodes++; else fs_usage->nr_inodes--; @@ -1083,14 +1142,14 @@ int bch2_mark_key_locked(struct bch_fs *c, } int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, - bool inserting, s64 sectors, + s64 sectors, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { int ret; percpu_down_read_preempt_disable(&c->mark_lock); - ret = bch2_mark_key_locked(c, k, inserting, sectors, + ret = bch2_mark_key_locked(c, k, sectors, fs_usage, journal_seq, flags); percpu_up_read_preempt_enable(&c->mark_lock); @@ -1130,9 +1189,9 @@ inline int bch2_mark_overwrite(struct btree_trans *trans, sectors = old.k->p.offset - new->k.p.offset; BUG_ON(sectors <= 0); - bch2_mark_key_locked(c, old, true, sectors, + bch2_mark_key_locked(c, old, sectors, fs_usage, trans->journal_res.seq, - flags); + BCH_BUCKET_MARK_INSERT|flags); sectors = bkey_start_offset(&new->k) - old.k->p.offset; @@ -1142,8 +1201,9 @@ inline int bch2_mark_overwrite(struct btree_trans *trans, BUG_ON(sectors >= 0); } - return bch2_mark_key_locked(c, old, false, sectors, fs_usage, - trans->journal_res.seq, flags) ?: 1; + return bch2_mark_key_locked(c, old, sectors, fs_usage, + trans->journal_res.seq, + BCH_BUCKET_MARK_OVERWRITE|flags) ?: 1; } int bch2_mark_update(struct btree_trans *trans, @@ -1162,10 +1222,11 @@ int bch2_mark_update(struct btree_trans *trans, return 0; if (!(trans->flags & BTREE_INSERT_NOMARK_INSERT)) - bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true, + bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), bpos_min(insert->k->k.p, b->key.k.p).offset - bkey_start_offset(&insert->k->k), - fs_usage, trans->journal_res.seq, flags); + fs_usage, trans->journal_res.seq, + BCH_BUCKET_MARK_INSERT|flags); if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES)) return 0; @@ -1246,46 +1307,6 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, /* trans_mark: */ -static inline void update_replicas_list(struct replicas_delta_list *d, - struct bch_replicas_entry *r, - s64 sectors) -{ - d->top->delta = sectors; - memcpy(&d->top->r, r, replicas_entry_bytes(r)); - - d->top = (void *) d->top + replicas_entry_bytes(r) + 8; - - BUG_ON((void *) d->top > (void *) d->d + sizeof(d->pad)); -} - -static inline void update_cached_sectors_list(struct replicas_delta_list *d, - unsigned dev, s64 sectors) -{ - struct bch_replicas_padded r; - - bch2_replicas_entry_cached(&r.e, dev); - - update_replicas_list(d, &r.e, sectors); -} - -void bch2_replicas_delta_list_apply(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - struct replicas_delta_list *r) -{ - struct replicas_delta *d = r->d; - - acc_u64s((u64 *) fs_usage, - (u64 *) &r->fs_usage, sizeof(*fs_usage) / sizeof(u64)); - - while (d != r->top) { - BUG_ON((void *) d > (void *) r->top); - - update_replicas(c, fs_usage, &d->r, d->delta); - - d = (void *) d + replicas_entry_bytes(&d->r) + 8; - } -} - static int trans_get_key(struct btree_trans *trans, enum btree_id btree_id, struct bpos pos, struct btree_insert_entry **insert, @@ 
-1347,8 +1368,7 @@ static int trans_update_key(struct btree_trans *trans, static int bch2_trans_mark_pointer(struct btree_trans *trans, struct extent_ptr_decoded p, - s64 sectors, enum bch_data_type data_type, - struct replicas_delta_list *d) + s64 sectors, enum bch_data_type data_type) { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); @@ -1409,8 +1429,7 @@ out: static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, struct bch_extent_stripe_ptr p, - s64 sectors, enum bch_data_type data_type, - struct replicas_delta_list *d) + s64 sectors, enum bch_data_type data_type) { struct bch_replicas_padded r; struct btree_insert_entry *insert; @@ -1455,7 +1474,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, bch2_bkey_to_replicas(&r.e, s.s_c); - update_replicas_list(d, &r.e, sectors); + update_replicas_list(trans, &r.e, sectors); out: bch2_trans_iter_put(trans, iter); return ret; @@ -1463,8 +1482,7 @@ out: static int bch2_trans_mark_extent(struct btree_trans *trans, struct bkey_s_c k, - s64 sectors, enum bch_data_type data_type, - struct replicas_delta_list *d) + s64 sectors, enum bch_data_type data_type) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -1487,7 +1505,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, : ptr_disk_sectors_delta(p, sectors); ret = bch2_trans_mark_pointer(trans, p, disk_sectors, - data_type, d); + data_type); if (ret < 0) return ret; @@ -1495,7 +1513,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, if (p.ptr.cached) { if (disk_sectors && !stale) - update_cached_sectors_list(d, p.ptr.dev, + update_cached_sectors_list(trans, p.ptr.dev, disk_sectors); } else if (!p.ec_nr) { dirty_sectors += disk_sectors; @@ -1503,7 +1521,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, } else { for (i = 0; i < p.ec_nr; i++) { ret = bch2_trans_mark_stripe_ptr(trans, p.ec[i], - disk_sectors, data_type, d); + disk_sectors, data_type); if (ret) return ret; } @@ -1513,29 +1531,32 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, } if (dirty_sectors) - update_replicas_list(d, &r.e, dirty_sectors); + update_replicas_list(trans, &r.e, dirty_sectors); return 0; } -int bch2_trans_mark_key(struct btree_trans *trans, - struct bkey_s_c k, - bool inserting, s64 sectors, - struct replicas_delta_list *d) +int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, + s64 sectors, unsigned flags) { + struct replicas_delta_list *d; struct bch_fs *c = trans->c; switch (k.k->type) { case KEY_TYPE_btree_ptr: - return bch2_trans_mark_extent(trans, k, inserting - ? c->opts.btree_node_size - : -c->opts.btree_node_size, - BCH_DATA_BTREE, d); + sectors = !(flags & BCH_BUCKET_MARK_OVERWRITE) + ? 
c->opts.btree_node_size + : -c->opts.btree_node_size; + + return bch2_trans_mark_extent(trans, k, sectors, + BCH_DATA_BTREE); case KEY_TYPE_extent: - return bch2_trans_mark_extent(trans, k, - sectors, BCH_DATA_USER, d); + return bch2_trans_mark_extent(trans, k, sectors, + BCH_DATA_USER); case KEY_TYPE_inode: - if (inserting) + d = replicas_deltas_realloc(trans, 0); + + if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) d->fs_usage.nr_inodes++; else d->fs_usage.nr_inodes--; @@ -1543,6 +1564,8 @@ int bch2_trans_mark_key(struct btree_trans *trans, case KEY_TYPE_reservation: { unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + d = replicas_deltas_realloc(trans, 0); + sectors *= replicas; replicas = clamp_t(unsigned, replicas, 1, ARRAY_SIZE(d->fs_usage.persistent_reserved)); @@ -1557,8 +1580,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, } int bch2_trans_mark_update(struct btree_trans *trans, - struct btree_insert_entry *insert, - struct replicas_delta_list *d) + struct btree_insert_entry *insert) { struct btree_iter *iter = insert->iter; struct btree *b = iter->l[0].b; @@ -1570,9 +1592,10 @@ int bch2_trans_mark_update(struct btree_trans *trans, return 0; ret = bch2_trans_mark_key(trans, - bkey_i_to_s_c(insert->k), true, + bkey_i_to_s_c(insert->k), bpos_min(insert->k->k.p, b->key.k.p).offset - - bkey_start_offset(&insert->k->k), d); + bkey_start_offset(&insert->k->k), + BCH_BUCKET_MARK_INSERT); if (ret) return ret; @@ -1606,8 +1629,8 @@ int bch2_trans_mark_update(struct btree_trans *trans, sectors = k.k->p.offset - insert->k->k.p.offset; BUG_ON(sectors <= 0); - ret = bch2_trans_mark_key(trans, k, true, - sectors, d); + ret = bch2_trans_mark_key(trans, k, sectors, + BCH_BUCKET_MARK_INSERT); if (ret) return ret; @@ -1619,7 +1642,8 @@ int bch2_trans_mark_update(struct btree_trans *trans, BUG_ON(sectors >= 0); } - ret = bch2_trans_mark_key(trans, k, false, sectors, d); + ret = bch2_trans_mark_key(trans, k, sectors, + BCH_BUCKET_MARK_OVERWRITE); if (ret) return ret; diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index a32c25d8..65a934f8 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -248,15 +248,17 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); -#define BCH_BUCKET_MARK_GC (1 << 0) -#define BCH_BUCKET_MARK_NOATOMIC (1 << 1) +#define BCH_BUCKET_MARK_INSERT (1 << 0) +#define BCH_BUCKET_MARK_OVERWRITE (1 << 1) +#define BCH_BUCKET_MARK_BUCKET_INVALIDATE (1 << 2) +#define BCH_BUCKET_MARK_GC (1 << 3) +#define BCH_BUCKET_MARK_ALLOC_READ (1 << 4) +#define BCH_BUCKET_MARK_NOATOMIC (1 << 5) -int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, - bool, s64, struct bch_fs_usage *, - u64, unsigned); -int bch2_mark_key(struct bch_fs *, struct bkey_s_c, - bool, s64, struct bch_fs_usage *, - u64, unsigned); +int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, s64, + struct bch_fs_usage *, u64, unsigned); +int bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, + struct bch_fs_usage *, u64, unsigned); int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, struct disk_reservation *, unsigned); @@ -269,11 +271,9 @@ int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *, void bch2_replicas_delta_list_apply(struct bch_fs *, struct bch_fs_usage *, struct replicas_delta_list *); -int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, - bool, s64, struct replicas_delta_list *); +int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, s64, 
unsigned); int bch2_trans_mark_update(struct btree_trans *, - struct btree_insert_entry *, - struct replicas_delta_list *); + struct btree_insert_entry *); void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); /* disk reservations: */ diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index a333b9ec..309a5fb6 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -100,11 +100,10 @@ struct replicas_delta { } __packed; struct replicas_delta_list { + unsigned size; + unsigned used; struct bch_fs_usage fs_usage; - - struct replicas_delta *top; struct replicas_delta d[0]; - u8 pad[256]; }; /* diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c index b6cf6801..a4c1b8ad 100644 --- a/libbcachefs/checksum.c +++ b/libbcachefs/checksum.c @@ -280,22 +280,8 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type, do_encrypt_sg(c->chacha20, nonce, sgl, bytes); } -static inline bool bch2_checksum_mergeable(unsigned type) -{ - - switch (type) { - case BCH_CSUM_NONE: - case BCH_CSUM_CRC32C: - case BCH_CSUM_CRC64: - return true; - default: - return false; - } -} - -static struct bch_csum bch2_checksum_merge(unsigned type, - struct bch_csum a, - struct bch_csum b, size_t b_len) +struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, + struct bch_csum b, size_t b_len) { BUG_ON(!bch2_checksum_mergeable(type)); diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h index 580eff66..2c0fbbb8 100644 --- a/libbcachefs/checksum.h +++ b/libbcachefs/checksum.h @@ -8,6 +8,22 @@ #include #include +static inline bool bch2_checksum_mergeable(unsigned type) +{ + + switch (type) { + case BCH_CSUM_NONE: + case BCH_CSUM_CRC32C: + case BCH_CSUM_CRC64: + return true; + default: + return false; + } +} + +struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum, + struct bch_csum, size_t); + static inline u64 bch2_crc64_update(u64 crc, const void *p, size_t len) { return crc64_be(crc, p, len); diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index a22ac8d6..47b8dd74 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -220,7 +220,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, if (!i->size) return i->ret; - bch2_trans_init(&trans, i->c); + bch2_trans_init(&trans, i->c, 0, 0); iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); k = bch2_btree_iter_peek(iter); @@ -274,7 +274,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, if (!i->size || !bkey_cmp(POS_MAX, i->from)) return i->ret; - bch2_trans_init(&trans, i->c); + bch2_trans_init(&trans, i->c, 0, 0); for_each_btree_node(&trans, iter, i->id, i->from, 0, b) { bch2_btree_node_to_text(&PBUF(i->buf), i->c, b); @@ -327,7 +327,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, if (!i->size) return i->ret; - bch2_trans_init(&trans, i->c); + bch2_trans_init(&trans, i->c, 0, 0); iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index b379780e..11e62887 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -312,7 +312,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, struct bkey_s_c k; u64 inum = 0; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_hash_lookup(&trans, bch2_dirent_hash_desc, hash_info, dir_inum, name, 0); @@ -369,7 +369,7 @@ int bch2_readdir(struct bch_fs *c, struct file *file, if (!dir_emit_dots(file, ctx)) return 0; - 
bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS(inode->v.i_ino, ctx->pos), 0, k, ret) { diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index a31d6cb2..43cceb02 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -113,7 +113,7 @@ const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) bkey_val_u64s(k.k) < stripe_val_u64s(s)) return "incorrect value size"; - return NULL; + return bch2_bkey_ptrs_invalid(c, k); } void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, @@ -134,6 +134,8 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev, (u64) s->ptrs[i].offset, stripe_blockcount_get(s, i)); + + bch2_bkey_ptrs_to_text(out, c, k); } static int ptr_matches_stripe(struct bch_fs *c, @@ -177,6 +179,25 @@ static int extent_matches_stripe(struct bch_fs *c, return -1; } +static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) +{ + struct bkey_s_c_extent e; + const union bch_extent_entry *entry; + + if (!bkey_extent_is_data(k.k)) + return false; + + e = bkey_s_c_to_extent(k); + + extent_for_each_entry(e, entry) + if (extent_entry_type(entry) == + BCH_EXTENT_ENTRY_stripe_ptr && + entry->stripe_ptr.idx == idx) + return true; + + return false; +} + static void ec_stripe_key_init(struct bch_fs *c, struct bkey_i_stripe *s, struct open_buckets *blocks, @@ -419,7 +440,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) if (!buf) return -ENOMEM; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, stripe_idx), @@ -541,7 +562,7 @@ static int ec_stripe_mem_alloc(struct bch_fs *c, if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT)) return ret; - bch2_btree_trans_unlock(iter->trans); + bch2_trans_unlock(iter->trans); ret = -EINTR; if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL)) @@ -589,17 +610,21 @@ void bch2_stripes_heap_update(struct bch_fs *c, ec_stripes_heap *h = &c->ec_stripes_heap; size_t i; - heap_verify_backpointer(c, idx); + if (m->alive) { + heap_verify_backpointer(c, idx); - h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; + h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; - i = m->heap_idx; - heap_sift_up(h, i, ec_stripes_heap_cmp, - ec_stripes_heap_set_backpointer); - heap_sift_down(h, i, ec_stripes_heap_cmp, - ec_stripes_heap_set_backpointer); + i = m->heap_idx; + heap_sift_up(h, i, ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + heap_sift_down(h, i, ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); - heap_verify_backpointer(c, idx); + heap_verify_backpointer(c, idx); + } else { + bch2_stripes_heap_insert(c, m, idx); + } if (stripe_idx_to_delete(c) >= 0) schedule_work(&c->ec_stripe_delete_work); @@ -676,7 +701,7 @@ static int ec_stripe_bkey_insert(struct bch_fs *c, struct bkey_s_c k; int ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); @@ -743,8 +768,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, BKEY_PADDED(k) tmp; int ret = 0, dev, idx; - bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, bkey_start_pos(pos), @@ -753,12 +777,19 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { + if (extent_has_stripe_ptr(k, 
s->key.k.p.offset)) { + bch2_btree_iter_next(iter); + continue; + } + idx = extent_matches_stripe(c, &s->key.v, k); if (idx < 0) { bch2_btree_iter_next(iter); continue; } + bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); + dev = s->key.v.ptrs[idx].dev; bkey_reassemble(&tmp.k, k); @@ -1207,7 +1238,7 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL); BUG_ON(!new_key); - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); @@ -1243,10 +1274,12 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) if (ret) return ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, 0, k, ret) - bch2_mark_key(c, k, true, 0, NULL, 0, 0); + bch2_mark_key(c, k, 0, NULL, 0, + BCH_BUCKET_MARK_ALLOC_READ| + BCH_BUCKET_MARK_NOATOMIC); ret = bch2_trans_exit(&trans) ?: ret; if (ret) { @@ -1257,7 +1290,9 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) for_each_journal_key(*journal_keys, i) if (i->btree_id == BTREE_ID_EC) bch2_mark_key(c, bkey_i_to_s_c(i->k), - true, 0, NULL, 0, 0); + 0, NULL, 0, + BCH_BUCKET_MARK_ALLOC_READ| + BCH_BUCKET_MARK_NOATOMIC); return 0; } @@ -1270,7 +1305,7 @@ int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) size_t i, idx = 0; int ret = 0; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0); diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index f8f29251..dffcc144 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -500,43 +500,8 @@ void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) } } -static const char *extent_ptr_invalid(const struct bch_fs *c, - struct bkey_s_c k, - const struct bch_extent_ptr *ptr, - unsigned size_ondisk, - bool metadata) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr2; - struct bch_dev *ca; - - if (ptr->dev >= c->sb.nr_devices || - !c->devs[ptr->dev]) - return "pointer to invalid device"; - - ca = bch_dev_bkey_exists(c, ptr->dev); - if (!ca) - return "pointer to invalid device"; - - bkey_for_each_ptr(ptrs, ptr2) - if (ptr != ptr2 && ptr->dev == ptr2->dev) - return "multiple pointers to same device"; - - if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) - return "offset past end of device"; - - if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) - return "offset before first bucket"; - - if (bucket_remainder(ca, ptr->offset) + - size_ondisk > ca->mi.bucket_size) - return "spans multiple buckets"; - - return NULL; -} - -static void bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) +void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -590,37 +555,109 @@ static void bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, } } -/* Btree ptrs */ +static const char *extent_ptr_invalid(const struct bch_fs *c, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + unsigned size_ondisk, + bool metadata) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr2; + struct bch_dev *ca; -const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) + if (!bch2_dev_exists2(c, ptr->dev)) + 
return "pointer to invalid device"; + + ca = bch_dev_bkey_exists(c, ptr->dev); + if (!ca) + return "pointer to invalid device"; + + bkey_for_each_ptr(ptrs, ptr2) + if (ptr != ptr2 && ptr->dev == ptr2->dev) + return "multiple pointers to same device"; + + if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) + return "offset past end of device"; + + if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) + return "offset before first bucket"; + + if (bucket_remainder(ca, ptr->offset) + + size_ondisk > ca->mi.bucket_size) + return "spans multiple buckets"; + + return NULL; +} + +const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; - const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + unsigned size_ondisk = k.k->size; const char *reason; + unsigned nonce = UINT_MAX; - if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) - return "value too big"; + if (k.k->type == KEY_TYPE_btree_ptr) + size_ondisk = c->opts.btree_node_size; bkey_extent_entry_for_each(ptrs, entry) { if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) return "invalid extent entry type"; - if (!extent_entry_is_ptr(entry)) + if (k.k->type == KEY_TYPE_btree_ptr && + !extent_entry_is_ptr(entry)) return "has non ptr field"; - } - bkey_for_each_ptr(ptrs, ptr) { - reason = extent_ptr_invalid(c, k, ptr, - c->opts.btree_node_size, - true); - if (reason) - return reason; + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + reason = extent_ptr_invalid(c, k, &entry->ptr, + size_ondisk, false); + if (reason) + return reason; + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); + + if (crc.offset + crc.live_size > + crc.uncompressed_size) + return "checksum offset + key size > uncompressed size"; + + size_ondisk = crc.compressed_size; + + if (!bch2_checksum_type_valid(c, crc.csum_type)) + return "invalid checksum type"; + + if (crc.compression_type >= BCH_COMPRESSION_NR) + return "invalid compression type"; + + if (bch2_csum_type_is_encryption(crc.csum_type)) { + if (nonce == UINT_MAX) + nonce = crc.offset + crc.nonce; + else if (nonce != crc.offset + crc.nonce) + return "incorrect nonce"; + } + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; + } } return NULL; } +/* Btree ptrs */ + +const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) + return "value too big"; + + return bch2_bkey_ptrs_invalid(c, k); +} + void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) { @@ -665,13 +702,7 @@ err: void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - const char *invalid; - - bkey_ptrs_to_text(out, c, k); - - invalid = bch2_btree_ptr_invalid(c, k); - if (invalid) - pr_buf(out, " invalid: %s", invalid); + bch2_bkey_ptrs_to_text(out, c, k); } /* Extents */ @@ -1260,60 +1291,10 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct bch_extent_crc_unpacked crc; - const struct bch_extent_ptr *ptr; - unsigned size_ondisk = e.k->size; - const char *reason; - unsigned nonce = UINT_MAX; - - if (bkey_val_u64s(e.k) > BKEY_EXTENT_VAL_U64s_MAX) + if 
(bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX) return "value too big"; - extent_for_each_entry(e, entry) { - if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) - return "invalid extent entry type"; - - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - ptr = entry_to_ptr(entry); - - reason = extent_ptr_invalid(c, e.s_c, &entry->ptr, - size_ondisk, false); - if (reason) - return reason; - break; - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry)); - - if (crc.offset + e.k->size > - crc.uncompressed_size) - return "checksum offset + key size > uncompressed size"; - - size_ondisk = crc.compressed_size; - - if (!bch2_checksum_type_valid(c, crc.csum_type)) - return "invalid checksum type"; - - if (crc.compression_type >= BCH_COMPRESSION_NR) - return "invalid compression type"; - - if (bch2_csum_type_is_encryption(crc.csum_type)) { - if (nonce == UINT_MAX) - nonce = crc.offset + crc.nonce; - else if (nonce != crc.offset + crc.nonce) - return "incorrect nonce"; - } - break; - case BCH_EXTENT_ENTRY_stripe_ptr: - break; - } - } - - return NULL; + return bch2_bkey_ptrs_invalid(c, k); } void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, @@ -1374,62 +1355,66 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - const char *invalid; + bch2_bkey_ptrs_to_text(out, c, k); +} - bkey_ptrs_to_text(out, c, k); +static unsigned bch2_crc_field_size_max[] = { + [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, + [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, + [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, +}; - invalid = bch2_extent_invalid(c, k); - if (invalid) - pr_buf(out, " invalid: %s", invalid); +static void bch2_extent_crc_pack(union bch_extent_crc *dst, + struct bch_extent_crc_unpacked src) +{ +#define set_common_fields(_dst, _src) \ + _dst.csum_type = _src.csum_type, \ + _dst.compression_type = _src.compression_type, \ + _dst._compressed_size = _src.compressed_size - 1, \ + _dst._uncompressed_size = _src.uncompressed_size - 1, \ + _dst.offset = _src.offset + + switch (extent_entry_type(to_entry(dst))) { + case BCH_EXTENT_ENTRY_crc32: + set_common_fields(dst->crc32, src); + dst->crc32.csum = *((__le32 *) &src.csum.lo); + break; + case BCH_EXTENT_ENTRY_crc64: + set_common_fields(dst->crc64, src); + dst->crc64.nonce = src.nonce; + dst->crc64.csum_lo = src.csum.lo; + dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); + break; + case BCH_EXTENT_ENTRY_crc128: + set_common_fields(dst->crc128, src); + dst->crc128.nonce = src.nonce; + dst->crc128.csum = src.csum; + break; + default: + BUG(); + } +#undef set_common_fields } static void bch2_extent_crc_init(union bch_extent_crc *crc, struct bch_extent_crc_unpacked new) { -#define common_fields(_crc) \ - .csum_type = _crc.csum_type, \ - .compression_type = _crc.compression_type, \ - ._compressed_size = _crc.compressed_size - 1, \ - ._uncompressed_size = _crc.uncompressed_size - 1, \ - .offset = _crc.offset - if (bch_crc_bytes[new.csum_type] <= 4 && - new.uncompressed_size <= CRC32_SIZE_MAX && - new.nonce <= CRC32_NONCE_MAX) { - crc->crc32 = (struct bch_extent_crc32) { - .type = 1 << BCH_EXTENT_ENTRY_crc32, - common_fields(new), - .csum = *((__le32 *) &new.csum.lo), - }; - return; - } + new.uncompressed_size - 1 <= CRC32_SIZE_MAX && + new.nonce <= CRC32_NONCE_MAX) + crc->type = 1 << BCH_EXTENT_ENTRY_crc32; + else if (bch_crc_bytes[new.csum_type] <= 10 
&& + new.uncompressed_size - 1 <= CRC64_SIZE_MAX && + new.nonce <= CRC64_NONCE_MAX) + crc->type = 1 << BCH_EXTENT_ENTRY_crc64; + else if (bch_crc_bytes[new.csum_type] <= 16 && + new.uncompressed_size - 1 <= CRC128_SIZE_MAX && + new.nonce <= CRC128_NONCE_MAX) + crc->type = 1 << BCH_EXTENT_ENTRY_crc128; + else + BUG(); - if (bch_crc_bytes[new.csum_type] <= 10 && - new.uncompressed_size <= CRC64_SIZE_MAX && - new.nonce <= CRC64_NONCE_MAX) { - crc->crc64 = (struct bch_extent_crc64) { - .type = 1 << BCH_EXTENT_ENTRY_crc64, - common_fields(new), - .nonce = new.nonce, - .csum_lo = new.csum.lo, - .csum_hi = *((__le16 *) &new.csum.hi), - }; - return; - } - - if (bch_crc_bytes[new.csum_type] <= 16 && - new.uncompressed_size <= CRC128_SIZE_MAX && - new.nonce <= CRC128_NONCE_MAX) { - crc->crc128 = (struct bch_extent_crc128) { - .type = 1 << BCH_EXTENT_ENTRY_crc128, - common_fields(new), - .nonce = new.nonce, - .csum = new.csum, - }; - return; - } -#undef common_fields - BUG(); + bch2_extent_crc_pack(crc, new); } void bch2_extent_crc_append(struct bkey_i_extent *e, @@ -1454,10 +1439,15 @@ static inline void __extent_entry_insert(struct bkey_i_extent *e, void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e, struct extent_ptr_decoded *p) { - struct bch_extent_crc_unpacked crc; + struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(&e->k, NULL); union bch_extent_entry *pos; unsigned i; + if (!bch2_crc_unpacked_cmp(crc, p->crc)) { + pos = e->v.start; + goto found; + } + extent_for_each_crc(extent_i_to_s(e), crc, pos) if (!bch2_crc_unpacked_cmp(crc, p->crc)) { pos = extent_entry_next(pos); @@ -1535,46 +1525,101 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, { struct bkey_s_extent el = bkey_i_to_s_extent(l); struct bkey_s_extent er = bkey_i_to_s_extent(r); - union bch_extent_entry *en_l, *en_r; + union bch_extent_entry *en_l = el.v->start; + union bch_extent_entry *en_r = er.v->start; + struct bch_extent_crc_unpacked crc_l, crc_r; if (bkey_val_u64s(&l->k) != bkey_val_u64s(&r->k)) return BCH_MERGE_NOMERGE; + crc_l = bch2_extent_crc_unpack(el.k, NULL); + extent_for_each_entry(el, en_l) { - struct bch_extent_ptr *lp, *rp; - struct bch_dev *ca; + en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data); + + if (extent_entry_type(en_l) != extent_entry_type(en_r)) + return BCH_MERGE_NOMERGE; + + switch (extent_entry_type(en_l)) { + case BCH_EXTENT_ENTRY_ptr: { + const struct bch_extent_ptr *lp = &en_l->ptr; + const struct bch_extent_ptr *rp = &en_r->ptr; + struct bch_dev *ca; + + if (lp->offset + crc_l.compressed_size != rp->offset || + lp->dev != rp->dev || + lp->gen != rp->gen) + return BCH_MERGE_NOMERGE; + + /* We don't allow extents to straddle buckets: */ + ca = bch_dev_bkey_exists(c, lp->dev); + + if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) + return BCH_MERGE_NOMERGE; + + break; + } + case BCH_EXTENT_ENTRY_stripe_ptr: + if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || + en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) + return BCH_MERGE_NOMERGE; + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc_l = bch2_extent_crc_unpack(el.k, entry_to_crc(en_l)); + crc_r = bch2_extent_crc_unpack(er.k, entry_to_crc(en_r)); + + if (crc_l.csum_type != crc_r.csum_type || + crc_l.compression_type != crc_r.compression_type || + crc_l.nonce != crc_r.nonce) + return BCH_MERGE_NOMERGE; + + if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || + crc_r.offset) + return BCH_MERGE_NOMERGE; + + if 
(!bch2_checksum_mergeable(crc_l.csum_type)) + return BCH_MERGE_NOMERGE; + + if (crc_l.compression_type) + return BCH_MERGE_NOMERGE; + + if (crc_l.csum_type && + crc_l.uncompressed_size + + crc_r.uncompressed_size > c->sb.encoded_extent_max) + return BCH_MERGE_NOMERGE; + + if (crc_l.uncompressed_size + crc_r.uncompressed_size - 1 > + bch2_crc_field_size_max[extent_entry_type(en_l)]) + return BCH_MERGE_NOMERGE; + + break; + default: + return BCH_MERGE_NOMERGE; + } + } + + extent_for_each_entry(el, en_l) { + struct bch_extent_crc_unpacked crc_l, crc_r; en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data); - if ((extent_entry_type(en_l) != - extent_entry_type(en_r)) || - !extent_entry_is_ptr(en_l)) - return BCH_MERGE_NOMERGE; + if (!extent_entry_is_crc(en_l)) + continue; - lp = &en_l->ptr; - rp = &en_r->ptr; + crc_l = bch2_extent_crc_unpack(el.k, entry_to_crc(en_l)); + crc_r = bch2_extent_crc_unpack(er.k, entry_to_crc(en_r)); - if (lp->offset + el.k->size != rp->offset || - lp->dev != rp->dev || - lp->gen != rp->gen) - return BCH_MERGE_NOMERGE; + crc_l.csum = bch2_checksum_merge(crc_l.csum_type, + crc_l.csum, + crc_r.csum, + crc_r.uncompressed_size << 9); - /* We don't allow extents to straddle buckets: */ - ca = bch_dev_bkey_exists(c, lp->dev); + crc_l.uncompressed_size += crc_r.uncompressed_size; + crc_l.compressed_size += crc_r.compressed_size; - if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) - return BCH_MERGE_NOMERGE; - } - - l->k.needs_whiteout |= r->k.needs_whiteout; - - /* Keys with no pointers aren't restricted to one bucket and could - * overflow KEY_SIZE - */ - if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) { - bch2_key_resize(&l->k, KEY_SIZE_MAX); - bch2_cut_front(l->k.p, r); - return BCH_MERGE_PARTIAL; + bch2_extent_crc_pack(entry_to_crc(en_l), crc_l); } bch2_key_resize(&l->k, l->k.size + r->k.size); @@ -1670,7 +1715,7 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, end.offset += size; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, BTREE_ITER_SLOTS, k, err) { @@ -1745,11 +1790,6 @@ enum merge_result bch2_reservation_merge(struct bch_fs *c, li->v.nr_replicas != ri->v.nr_replicas) return BCH_MERGE_NOMERGE; - l->k.needs_whiteout |= r->k.needs_whiteout; - - /* Keys with no pointers aren't restricted to one bucket and could - * overflow KEY_SIZE - */ if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) { bch2_key_resize(&l->k, KEY_SIZE_MAX); bch2_cut_front(l->k.p, r); diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 77d69841..9bf156d0 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -358,6 +358,10 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, struct bch_io_failures *, struct extent_ptr_decoded *); +void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); + /* bch_btree_ptr: */ const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 7133482e..81a86664 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -322,10 +322,10 @@ static int bch2_extent_update(struct btree_trans *trans, if (i_sectors_delta || new_i_size > inode->ei_inode.bi_size) { if (c->opts.new_inode_updates) { - bch2_btree_trans_unlock(trans); + bch2_trans_unlock(trans); mutex_lock(&inode->ei_update_lock); - if (!bch2_btree_trans_relock(trans)) { + if 
(!bch2_trans_relock(trans)) { mutex_unlock(&inode->ei_update_lock); return -EINTR; } @@ -435,8 +435,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop) BUG_ON(k->k.p.inode != inode->v.i_ino); - bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, @@ -998,7 +997,7 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, } bkey_reassemble(&tmp.k, k); - bch2_btree_trans_unlock(trans); + bch2_trans_unlock(trans); k = bkey_i_to_s_c(&tmp.k); if (readpages_iter) { @@ -1054,7 +1053,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping, ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages); BUG_ON(ret); - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_SLOTS); @@ -1103,7 +1102,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC); bio_add_page_contig(&rbio->bio, page); - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_SLOTS); @@ -2101,8 +2100,7 @@ static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, struct bkey_s_c k; int ret = 0; - bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start, BTREE_ITER_INTENT); @@ -2148,7 +2146,7 @@ static inline int range_has_data(struct bch_fs *c, struct bkey_s_c k; int ret = 0; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, start, 0, k, ret) { if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) @@ -2404,8 +2402,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode, if ((offset | len) & (block_bytes(c) - 1)) return -EINVAL; - bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); /* * We need i_mutex to keep the page cache consistent with the extents @@ -2520,8 +2517,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, unsigned replicas = io_opts(c, inode).data_replicas; int ret; - bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); inode_lock(&inode->v); inode_dio_wait(&inode->v); @@ -2732,7 +2728,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) if (offset >= isize) return -ENXIO; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode->v.i_ino, offset >> 9), 0, k, ret) { @@ -2805,7 +2801,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) if (offset >= isize) return -ENXIO; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode->v.i_ino, offset >> 9), diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index dc6c7dfb..a324278b 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -164,7 +164,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, struct bch_inode_unpacked inode_u; int ret; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); @@ -355,7 +355,7 @@ __bch2_create(struct bch_inode_info *dir, struct dentry *dentry, if (!tmpfile) mutex_lock(&dir->ei_update_lock); - 
bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 8, 1024);
 retry:
 	bch2_trans_begin(&trans);
 
@@ -507,7 +507,7 @@ static int __bch2_link(struct bch_fs *c,
 	int ret;
 
 	mutex_lock(&inode->ei_update_lock);
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 4, 1024);
 retry:
 	bch2_trans_begin(&trans);
 
@@ -594,7 +594,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
 	int ret;
 
 	bch2_lock_inodes(dir, inode);
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 4, 1024);
 retry:
 	bch2_trans_begin(&trans);
 
@@ -801,13 +801,13 @@ static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry,
 			return ret;
 	}
 
+	bch2_trans_init(&trans, c, 8, 2048);
+
 	bch2_lock_inodes(i.src_dir, i.dst_dir,
 			 i.src_inode, i.dst_inode);
 
-	bch2_trans_init(&trans, c);
-
 	if (S_ISDIR(i.src_inode->v.i_mode) &&
 	    inode_attrs_changing(i.dst_dir, i.src_inode)) {
 		ret = -EXDEV;
@@ -968,7 +968,7 @@ static int bch2_setattr_nonsize(struct bch_inode_info *inode, struct iattr *iatt
 	if (ret)
 		goto err;
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 retry:
 	bch2_trans_begin(&trans);
 	kfree(acl);
@@ -1123,7 +1123,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 	if (start + len < start)
 		return -EINVAL;
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
 			   POS(ei->v.i_ino, start >> 9), 0, k, ret)
@@ -1511,7 +1511,7 @@ static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * cons
 	 */
 	c1 = bch2_path_to_fs(devs[0]);
-	if (!c1)
+	if (IS_ERR(c1))
 		return c;
 
 	for (i = 1; i < nr_devs; i++) {
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 998c10ab..433552df 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -57,7 +57,7 @@ static int remove_dirent(struct btree_trans *trans,
 	name.name = buf;
 
 	/* Unlock so we don't deadlock, after copying name: */
-	bch2_btree_trans_unlock(trans);
+	bch2_trans_unlock(trans);
 
 	ret = bch2_inode_find_by_inum(c, dir_inum, &dir_inode);
 	if (ret) {
@@ -450,8 +450,7 @@ static int check_extents(struct bch_fs *c)
 	u64 i_sectors;
 	int ret = 0;
 
-	bch2_trans_init(&trans, c);
-	bch2_trans_preload_iters(&trans);
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
 	bch_verbose(c, "checking extents");
 
@@ -546,8 +545,7 @@ static int check_dirents(struct bch_fs *c)
 
 	bch_verbose(c, "checking dirents");
 
-	bch2_trans_init(&trans, c);
-	bch2_trans_preload_iters(&trans);
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
 	hash_check_init(&h);
 
@@ -703,8 +701,7 @@ static int check_xattrs(struct bch_fs *c)
 
 	hash_check_init(&h);
 
-	bch2_trans_init(&trans, c);
-	bch2_trans_preload_iters(&trans);
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
 	iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS,
 				   POS(BCACHEFS_ROOT_INO, 0), 0);
@@ -917,8 +914,7 @@ static int check_directory_structure(struct bch_fs *c,
 	u64 d_inum;
 	int ret = 0;
 
-	bch2_trans_init(&trans, c);
-	bch2_trans_preload_iters(&trans);
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
 	bch_verbose(c, "checking directory structure");
 
@@ -1014,7 +1010,7 @@ retry:
 		if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.inode), c,
 				"unreachable directory found (inum %llu)",
 				k.k->p.inode)) {
-			bch2_btree_trans_unlock(&trans);
+			bch2_trans_unlock(&trans);
 
 			ret = reattach_inode(c, lostfound_inode, k.k->p.inode);
 			if (ret) {
@@ -1084,8 +1080,7 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links,
 	u64 d_inum;
 	int ret;
 
-	bch2_trans_init(&trans, c);
-	bch2_trans_preload_iters(&trans);
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
 	inc_link(c, links, range_start, range_end,
 		 BCACHEFS_ROOT_INO, false);
 
@@ -1228,7 +1223,7 @@ static int check_inode(struct btree_trans *trans,
 	ret = bch2_inode_unpack(inode, &u);
 
-	bch2_btree_trans_unlock(trans);
+	bch2_trans_unlock(trans);
 
 	if (bch2_fs_inconsistent_on(ret, c,
 			"error unpacking inode %llu in fsck",
@@ -1333,8 +1328,7 @@ static int bch2_gc_walk_inodes(struct bch_fs *c,
 	int ret = 0, ret2 = 0;
 	u64 nlinks_pos;
 
-	bch2_trans_init(&trans, c);
-	bch2_trans_preload_iters(&trans);
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
 	iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES,
 				   POS(range_start, 0), 0);
@@ -1458,8 +1452,7 @@ int bch2_fsck_walk_inodes_only(struct bch_fs *c)
 	struct bkey_s_c_inode inode;
 	int ret;
 
-	bch2_trans_init(&trans, c);
-	bch2_trans_preload_iters(&trans);
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
 	for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k, ret) {
 		if (k.k->type != KEY_TYPE_inode)
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index d2748e70..59ae6d07 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -390,7 +390,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
 	if (ret)
 		return ret;
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(inode_nr, 0),
 				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 5df690f9..dc922a91 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -285,7 +285,7 @@ int bch2_write_index_default(struct bch_write_op *op)
 	BUG_ON(bch2_keylist_empty(keys));
 	bch2_verify_keylist_sorted(keys);
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256);
 
 	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
 				   bkey_start_pos(&bch2_keylist_front(keys)->k),
@@ -432,21 +432,32 @@ static void init_append_extent(struct bch_write_op *op,
 				struct bversion version,
 				struct bch_extent_crc_unpacked crc)
 {
+	struct bch_fs *c = op->c;
 	struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top);
-	struct bch_extent_ptr *ptr;
+	struct extent_ptr_decoded p = { .crc = crc };
+	struct open_bucket *ob;
+	unsigned i;
 
 	op->pos.offset += crc.uncompressed_size;
-	e->k.p = op->pos;
-	e->k.size = crc.uncompressed_size;
-	e->k.version = version;
+	e->k.p		= op->pos;
+	e->k.size	= crc.uncompressed_size;
+	e->k.version	= version;
 
-	bch2_extent_crc_append(e, crc);
-	bch2_alloc_sectors_append_ptrs(op->c, wp, &e->k_i,
-				       crc.compressed_size);
+	BUG_ON(crc.compressed_size > wp->sectors_free);
+	wp->sectors_free -= crc.compressed_size;
 
-	if (op->flags & BCH_WRITE_CACHED)
-		extent_for_each_ptr(extent_i_to_s(e), ptr)
-			ptr->cached = true;
+	open_bucket_for_each(c, &wp->ptrs, ob, i) {
+		struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+
+		p.ptr = ob->ptr;
+		p.ptr.cached = !ca->mi.durability ||
+			(op->flags & BCH_WRITE_CACHED) != 0;
+		p.ptr.offset += ca->mi.bucket_size - ob->sectors_free;
+		bch2_extent_ptr_decoded_append(e, &p);
+
+		BUG_ON(crc.compressed_size > ob->sectors_free);
+		ob->sectors_free -= crc.compressed_size;
+	}
 
 	bch2_keylist_push(&op->insert_keys);
 }
@@ -1253,7 +1264,7 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio
 
 	flags &= ~BCH_READ_LAST_FRAGMENT;
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, rbio->pos,
 				   BTREE_ITER_SLOTS);
@@ -1301,7 +1312,7 @@ static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
 	struct bkey_s_c k;
 	int ret;
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	flags &= ~BCH_READ_LAST_FRAGMENT;
 	flags |= BCH_READ_MUST_CLONE;
@@ -1314,7 +1325,7 @@ retry:
 		bkey_reassemble(&tmp.k, k);
 		k = bkey_i_to_s_c(&tmp.k);
-		bch2_btree_trans_unlock(&trans);
+		bch2_trans_unlock(&trans);
 
 		bytes = min_t(unsigned, bvec_iter.bi_size,
			      (k.k->p.offset - bvec_iter.bi_sector) << 9);
@@ -1404,13 +1415,13 @@ static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
 	struct bkey_i_extent *e;
 	BKEY_PADDED(k) new;
 	struct bch_extent_crc_unpacked new_crc;
-	unsigned offset;
+	u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset;
 	int ret;
 
 	if (rbio->pick.crc.compression_type)
 		return;
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 retry:
 	bch2_trans_begin(&trans);
@@ -1427,24 +1438,19 @@ retry:
 	e = bkey_i_to_extent(&new.k);
 
 	if (!bch2_extent_matches_ptr(c, extent_i_to_s_c(e),
-				     rbio->pick.ptr,
-				     rbio->pos.offset -
-				     rbio->pick.crc.offset) ||
+				     rbio->pick.ptr, data_offset) ||
 	    bversion_cmp(e->k.version, rbio->version))
 		goto out;
 
 	/* Extent was merged? */
-	if (bkey_start_offset(&e->k) < rbio->pos.offset ||
-	    e->k.p.offset > rbio->pos.offset + rbio->pick.crc.uncompressed_size)
+	if (bkey_start_offset(&e->k) < data_offset ||
+	    e->k.p.offset > data_offset + rbio->pick.crc.uncompressed_size)
 		goto out;
 
-	/* The extent might have been partially overwritten since we read it: */
-	offset = rbio->pick.crc.offset + (bkey_start_offset(&e->k) - rbio->pos.offset);
-
 	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
-			rbio->pick.crc, NULL, &new_crc,
-			offset, e->k.size,
-			rbio->pick.crc.csum_type)) {
+			rbio->pick.crc, NULL, &new_crc,
+			bkey_start_offset(&e->k) - data_offset, e->k.size,
+			rbio->pick.crc.csum_type)) {
 		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
 		goto out;
 	}
@@ -1848,7 +1854,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
 			BCH_READ_USER_MAPPED;
 	int ret;
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	BUG_ON(rbio->_state);
 	BUG_ON(flags & BCH_READ_NODECODE);
@@ -1869,7 +1875,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
 		 */
 		bkey_reassemble(&tmp.k, k);
 		k = bkey_i_to_s_c(&tmp.k);
-		bch2_btree_trans_unlock(&trans);
+		bch2_trans_unlock(&trans);
 
 		bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size,
			      (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9);
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 3ec80437..0a174dff 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -963,6 +963,8 @@ void bch2_fs_journal_stop(struct journal *j)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 
+	bch2_journal_flush_all_pins(j);
+
 	wait_event(j->wait, journal_entry_close(j));
 
 	/* do we need to write another journal entry? */
diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c
index 93ee5e88..231f5da2 100644
--- a/libbcachefs/journal_seq_blacklist.c
+++ b/libbcachefs/journal_seq_blacklist.c
@@ -257,7 +257,7 @@ void bch2_blacklist_entries_gc(struct work_struct *work)
 	unsigned i, nr, new_nr;
 	int ret;
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	for (i = 0; i < BTREE_ID_NR; i++) {
 		struct btree_iter *iter;
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index 190b545b..74e17fa9 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -41,8 +41,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 	BKEY_PADDED(key) tmp;
 	int ret = 0;
 
-	bch2_trans_init(&trans, c);
-	bch2_trans_preload_iters(&trans);
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
 	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN,
 				   BTREE_ITER_PREFETCH);
@@ -112,7 +111,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 	if (flags & BCH_FORCE_IF_METADATA_LOST)
 		return -EINVAL;
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 	closure_init_stack(&cl);
 
 	for (id = 0; id < BTREE_ID_NR; id++) {
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index d39f5633..97890918 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -61,8 +61,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
 	struct keylist *keys = &op->insert_keys;
 	int ret = 0;
 
-	bch2_trans_init(&trans, c);
-	bch2_trans_preload_iters(&trans);
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
 	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
 				   bkey_start_pos(&bch2_keylist_front(keys)->k),
@@ -500,7 +499,7 @@ int bch2_move_data(struct bch_fs *c,
 	INIT_LIST_HEAD(&ctxt.reads);
 	init_waitqueue_head(&ctxt.wait);
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	stats->data_type = BCH_DATA_USER;
 	stats->btree_id = BTREE_ID_EXTENTS;
@@ -634,7 +633,7 @@ static int bch2_move_btree(struct bch_fs *c,
 	enum data_cmd cmd;
 	int ret = 0;
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	stats->data_type = BCH_DATA_BTREE;
 
diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c
index 2b0edb68..8a42660c 100644
--- a/libbcachefs/quota.c
+++ b/libbcachefs/quota.c
@@ -360,7 +360,7 @@ static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type)
 	struct bkey_s_c k;
 	int ret = 0;
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	for_each_btree_key(&trans, iter, BTREE_ID_QUOTAS, POS(type, 0),
 			   BTREE_ITER_PREFETCH, k, ret) {
@@ -432,7 +432,7 @@ int bch2_fs_quota_read(struct bch_fs *c)
 			return ret;
 	}
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN,
 			   BTREE_ITER_PREFETCH, k, ret) {
@@ -725,7 +725,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid,
 	bkey_quota_init(&new_quota.k_i);
 	new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	iter = bch2_trans_get_iter(&trans, BTREE_ID_QUOTAS, new_quota.k.p,
 				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 70fd9a27..535e2b6a 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -213,8 +213,7 @@ static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k)
 	bool split_compressed = false;
 	int ret;
 
-	bch2_trans_init(&trans, c);
-	bch2_trans_preload_iters(&trans);
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 retry:
 	bch2_trans_begin(&trans);
 
@@ -258,13 +257,9 @@ retry:
 	} while (bkey_cmp(iter->pos, k->k.p) < 0);
 
 	if (split_compressed) {
-		memset(&trans.fs_usage_deltas.fs_usage, 0,
-		       sizeof(trans.fs_usage_deltas.fs_usage));
-		trans.fs_usage_deltas.top = trans.fs_usage_deltas.d;
-
-		ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k), false,
+		ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k),
 					  -((s64) k->k.size),
-					  &trans.fs_usage_deltas) ?:
+					  BCH_BUCKET_MARK_OVERWRITE) ?:
 		      bch2_trans_commit(&trans, &disk_res, NULL,
					BTREE_INSERT_ATOMIC|
					BTREE_INSERT_NOFAIL|
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index c2744c7d..67570676 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -262,7 +262,7 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
 	if (!test_bit(BCH_FS_STARTED, &c->flags))
 		return -EPERM;
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k, ret)
 		if (k.k->type == KEY_TYPE_extent) {
diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c
index 265db89a..96bca800 100644
--- a/libbcachefs/tests.c
+++ b/libbcachefs/tests.c
@@ -34,7 +34,7 @@ static void test_delete(struct bch_fs *c, u64 nr)
 
 	bkey_cookie_init(&k.k_i);
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, k.k.p,
 				   BTREE_ITER_INTENT);
@@ -66,7 +66,7 @@ static void test_delete_written(struct bch_fs *c, u64 nr)
 
 	bkey_cookie_init(&k.k_i);
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, k.k.p,
 				   BTREE_ITER_INTENT);
@@ -94,7 +94,7 @@ static void test_iterate(struct bch_fs *c, u64 nr)
 	u64 i;
 	int ret;
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	delete_test_keys(c);
 
@@ -139,7 +139,7 @@ static void test_iterate_extents(struct bch_fs *c, u64 nr)
 	u64 i;
 	int ret;
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	delete_test_keys(c);
 
@@ -189,7 +189,7 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr)
 	u64 i;
 	int ret;
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	delete_test_keys(c);
 
@@ -243,7 +243,7 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr)
 	u64 i;
 	int ret;
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	delete_test_keys(c);
 
@@ -304,7 +304,7 @@ static void test_peek_end(struct bch_fs *c, u64 nr)
 	struct btree_iter *iter;
 	struct bkey_s_c k;
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, POS_MIN, 0);
 
@@ -323,7 +323,7 @@ static void test_peek_end_extents(struct bch_fs *c, u64 nr)
 	struct btree_iter *iter;
 	struct bkey_s_c k;
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, 0);
 
@@ -429,7 +429,7 @@ static void rand_lookup(struct bch_fs *c, u64 nr)
 	struct bkey_s_c k;
 	u64 i;
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	for (i = 0; i < nr; i++) {
 		iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS,
@@ -450,7 +450,7 @@ static void rand_mixed(struct bch_fs *c, u64 nr)
 	int ret;
 	u64 i;
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	for (i = 0; i < nr; i++) {
 		iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS,
@@ -502,10 +502,10 @@ static void seq_insert(struct bch_fs *c, u64 nr)
 	bkey_cookie_init(&insert.k_i);
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN,
-			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) {
+			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
 		insert.k.p = iter->pos;
 
 		bch2_trans_update(&trans,
				  BTREE_INSERT_ENTRY(iter, &insert.k_i));
@@ -523,10 +523,11 @@ static void seq_lookup(struct bch_fs *c, u64 nr)
 	struct btree_trans trans;
 	struct btree_iter *iter;
 	struct bkey_s_c k;
+	int ret;
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
-	for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k)
+	for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret)
 		;
 	bch2_trans_exit(&trans);
 }
@@ -538,10 +539,10 @@ static void seq_overwrite(struct bch_fs *c, u64 nr)
 	struct bkey_s_c k;
 	int ret;
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN,
-			   BTREE_ITER_INTENT, k) {
+			   BTREE_ITER_INTENT, k, ret) {
 		struct bkey_i_cookie u;
 
 		bkey_reassemble(&u.k_i, k);
diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c
index fd58829a..41a9753e 100644
--- a/libbcachefs/xattr.c
+++ b/libbcachefs/xattr.c
@@ -125,7 +125,7 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
 	struct bkey_s_c_xattr xattr;
 	int ret;
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
 				&inode->ei_str_hash, inode->v.i_ino,
@@ -276,7 +276,7 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 	u64 inum = dentry->d_inode->i_ino;
 	int ret;
 
-	bch2_trans_init(&trans, c);
+	bch2_trans_init(&trans, c, 0, 0);
 
 	for_each_btree_key(&trans, iter, BTREE_ID_XATTRS,
 			   POS(inum, 0), 0, k, ret) {