diff --git a/.bcachefs_revision b/.bcachefs_revision index 77bb3639..6ceb95f8 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -fee79cd6543ed687efe86458e3c4479eff818488 +5a3a4087af27aa10da5f23cb174a439946153584 diff --git a/include/linux/bio.h b/include/linux/bio.h index e93341e6..cdbbcb39 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -113,13 +113,17 @@ static inline void *bio_data(struct bio *bio) #define __bio_kunmap_atomic(addr) kunmap_atomic(addr) -struct bvec_iter_all { - unsigned done; -}; +static inline struct bio_vec *bio_next_segment(const struct bio *bio, + struct bvec_iter_all *iter) +{ + if (iter->idx >= bio->bi_vcnt) + return NULL; -#define bio_for_each_segment_all(bvl, bio, i, iter) \ - for (i = 0, bvl = (bio)->bi_io_vec, iter = (struct bvec_iter_all) { 0 }; \ - i < (bio)->bi_vcnt; i++, bvl++) + return &bio->bi_io_vec[iter->idx]; +} + +#define bio_for_each_segment_all(bvl, bio, iter) \ + for ((iter).idx = 0; (bvl = bio_next_segment((bio), &(iter))); (iter).idx++) static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, unsigned bytes) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 89b65b82..5bc68b42 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -43,6 +43,10 @@ struct bvec_iter { current bvec */ }; +struct bvec_iter_all { + int idx; +}; + /* * various member access, note that bio_data should of course not be used * on highmem page vectors diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 7a457729..9814179a 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -1164,7 +1164,7 @@ static int bch2_allocator_thread(void *arg) */ if (!nr || (nr < ALLOC_SCAN_BATCH(ca) && - !fifo_full(&ca->free[RESERVE_MOVINGGC]))) { + !fifo_empty(&ca->free[RESERVE_NONE]))) { ret = wait_buckets_available(c, ca); if (ret) { up_read(&c->gc_lock); diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index e64f8449..697d5768 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -693,8 +693,7 @@ retry_blocking: } void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, - struct open_buckets *obs, - enum bch_data_type data_type) + struct open_buckets *obs) { struct open_buckets ptrs = { .nr = 0 }; struct open_bucket *ob, *ob2; @@ -725,7 +724,7 @@ void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, struct write_point *wp) { mutex_lock(&wp->lock); - bch2_open_buckets_stop_dev(c, ca, &wp->ptrs, wp->type); + bch2_open_buckets_stop_dev(c, ca, &wp->ptrs); mutex_unlock(&wp->lock); } diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h index 6d8ffb0c..687f973e 100644 --- a/libbcachefs/alloc_foreground.h +++ b/libbcachefs/alloc_foreground.h @@ -106,7 +106,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, - struct open_buckets *, enum bch_data_type); + struct open_buckets *); void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *, struct write_point *); diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 667170b5..4577d77a 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -657,7 +657,7 @@ struct bch_reservation { /* Maximum possible size of an entire extent value: */ #define BKEY_EXTENT_VAL_U64s_MAX \ - (BKEY_EXTENT_PTR_U64s_MAX * 
(BCH_REPLICAS_MAX + 1)) + (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) #define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX) diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index 6fa6ac1f..f01405dd 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -145,7 +145,7 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) } if (ops->key_debugcheck) - ops->key_debugcheck(c, b, k); + ops->key_debugcheck(c, k); } void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h index e6e97cda..8568b65c 100644 --- a/libbcachefs/bkey_methods.h +++ b/libbcachefs/bkey_methods.h @@ -26,8 +26,7 @@ struct bkey_ops { /* Returns reason for being invalid if invalid, else NULL: */ const char * (*key_invalid)(const struct bch_fs *, struct bkey_s_c); - void (*key_debugcheck)(struct bch_fs *, struct btree *, - struct bkey_s_c); + void (*key_debugcheck)(struct bch_fs *, struct bkey_s_c); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); void (*swab)(const struct bkey_format *, struct bkey_packed *); diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 046524c8..41694951 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -674,10 +674,7 @@ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, EBUG_ON(!btree_node_locked(iter, level + 1)); EBUG_ON(level >= BTREE_MAX_DEPTH); retry: - rcu_read_lock(); b = btree_cache_find(bc, k); - rcu_read_unlock(); - if (unlikely(!b)) { /* * We must have the parent locked to call bch2_btree_node_fill(), @@ -878,10 +875,7 @@ void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, BUG_ON(!btree_node_locked(iter, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); - rcu_read_lock(); b = btree_cache_find(bc, k); - rcu_read_unlock(); - if (b) return; diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 5c77a955..f4adb07a 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -762,6 +762,8 @@ out: percpu_down_write(&c->mark_lock); bch2_gc_free(c); percpu_up_write(&c->mark_lock); + /* flush fsck errors, reset counters */ + bch2_flush_fsck_errs(c); goto again; } diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index a28d2dd7..40cd87d7 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -526,6 +526,10 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, unsigned offset = __btree_node_key_to_offset(b, where); int shift = new_u64s - clobber_u64s; unsigned old_end = t->end_offset - shift; + unsigned orig_iter_pos = node_iter->data[0].k; + bool iter_current_key_modified = + orig_iter_pos >= offset && + orig_iter_pos <= offset + clobber_u64s; btree_node_iter_for_each(node_iter, set) if (set->end == old_end) @@ -534,18 +538,12 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, /* didn't find the bset in the iterator - might have to readd it: */ if (new_u64s && btree_iter_pos_cmp(iter, b, where) > 0) { - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); - bch2_btree_node_iter_push(node_iter, b, where, end); - - if (!b->level && - node_iter == &iter->l[0].iter) - bkey_disassemble(b, - bch2_btree_node_iter_peek_all(node_iter, b), - &iter->k); + goto fixup_done; + } else { + /* Iterator is after key that changed */ + return; } - - goto iter_current_key_not_modified; found: set->end = t->end_offset; @@ -561,40 +559,25 @@ found: if (set->k == 
set->end) bch2_btree_node_iter_set_drop(node_iter, set); } else { + /* Iterator is after key that changed */ set->k = (int) set->k + shift; - goto iter_current_key_not_modified; + return; } - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); - bch2_btree_node_iter_sort(node_iter, b); - if (!b->level && node_iter == &iter->l[0].iter) { - /* - * not legal to call bkey_debugcheck() here, because we're - * called midway through the update path after update has been - * marked but before deletes have actually happened: - */ -#if 0 - __btree_iter_peek_all(iter, &iter->l[0], &iter->k); -#endif - struct btree_iter_level *l = &iter->l[0]; - struct bkey_packed *k = - bch2_btree_node_iter_peek_all(&l->iter, l->b); +fixup_done: + if (node_iter->data[0].k != orig_iter_pos) + iter_current_key_modified = true; - if (unlikely(!k)) - iter->k.type = KEY_TYPE_deleted; - else - bkey_disassemble(l->b, k, &iter->k); - } -iter_current_key_not_modified: /* * When a new key is added, and the node iterator now points to that * key, the iterator might have skipped past deleted keys that should * come after the key the iterator now points to. We have to rewind to - * before those deleted keys - otherwise bch2_btree_node_iter_prev_all() - * breaks: + * before those deleted keys - otherwise + * bch2_btree_node_iter_prev_all() breaks: */ if (!bch2_btree_node_iter_end(node_iter) && + iter_current_key_modified && (b->level || (iter->flags & BTREE_ITER_IS_EXTENTS))) { struct bset_tree *t; @@ -622,7 +605,21 @@ iter_current_key_not_modified: } } - bch2_btree_node_iter_verify(node_iter, b); + if (!b->level && + node_iter == &iter->l[0].iter && + iter_current_key_modified) { + struct bkey_packed *k = + bch2_btree_node_iter_peek_all(node_iter, b); + + if (likely(k)) { + bkey_disassemble(b, k, &iter->k); + } else { + /* XXX: for extents, calculate size of hole? 
*/ + iter->k.type = KEY_TYPE_deleted; + } + + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + } } void bch2_btree_node_iter_fix(struct btree_iter *iter, @@ -635,14 +632,18 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, struct bset_tree *t = bch2_bkey_to_bset(b, where); struct btree_iter *linked; - if (node_iter != &iter->l[b->level].iter) + if (node_iter != &iter->l[b->level].iter) { __bch2_btree_node_iter_fix(iter, b, node_iter, t, - where, clobber_u64s, new_u64s); + where, clobber_u64s, new_u64s); + bch2_btree_node_iter_verify(node_iter, b); + } - trans_for_each_iter_with_node(iter->trans, b, linked) + trans_for_each_iter_with_node(iter->trans, b, linked) { __bch2_btree_node_iter_fix(linked, b, - &linked->l[b->level].iter, t, - where, clobber_u64s, new_u64s); + &linked->l[b->level].iter, t, + where, clobber_u64s, new_u64s); + __bch2_btree_iter_verify(linked, b); + } } static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, @@ -685,6 +686,13 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bch2_btree_node_iter_peek(&l->iter, l->b)); } +static inline struct bkey_s_c __btree_iter_prev(struct btree_iter *iter, + struct btree_iter_level *l) +{ + return __btree_iter_unpack(iter, l, &iter->k, + bch2_btree_node_iter_prev(&l->iter, l->b)); +} + static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, struct btree_iter_level *l, int max_advance) @@ -743,18 +751,29 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) btree_node_unlock(iter, b->level + 1); } +static inline bool btree_iter_pos_before_node(struct btree_iter *iter, + struct btree *b) +{ + return bkey_cmp(iter->pos, b->data->min_key) < 0; +} + static inline bool btree_iter_pos_after_node(struct btree_iter *iter, struct btree *b) { - return __btree_iter_pos_cmp(iter, NULL, - bkey_to_packed(&b->key), true) < 0; + int cmp = bkey_cmp(b->key.k.p, iter->pos); + + if (!cmp && + (iter->flags & BTREE_ITER_IS_EXTENTS) && + bkey_cmp(b->key.k.p, POS_MAX)) + cmp = -1; + return cmp < 0; } static inline bool btree_iter_pos_in_node(struct btree_iter *iter, struct btree *b) { return iter->btree_id == b->btree_id && - bkey_cmp(iter->pos, b->data->min_key) >= 0 && + !btree_iter_pos_before_node(iter, b) && !btree_iter_pos_after_node(iter, b); } @@ -956,10 +975,10 @@ static void btree_iter_up(struct btree_iter *iter) btree_node_unlock(iter, iter->level++); } -int __must_check __bch2_btree_iter_traverse(struct btree_iter *); +static int btree_iter_traverse_one(struct btree_iter *); static int __btree_iter_traverse_all(struct btree_trans *trans, - struct btree_iter *orig_iter, int ret) + struct btree_iter *orig_iter, int ret) { struct bch_fs *c = trans->c; struct btree_iter *iter; @@ -1003,7 +1022,7 @@ retry_all: iter = &trans->iters[sorted[i]]; do { - ret = __bch2_btree_iter_traverse(iter); + ret = btree_iter_traverse_one(iter); } while (ret == -EINTR); if (ret) @@ -1021,16 +1040,27 @@ int bch2_btree_iter_traverse_all(struct btree_trans *trans) return __btree_iter_traverse_all(trans, NULL, 0); } -static unsigned btree_iter_up_until_locked(struct btree_iter *iter, - bool check_pos) +static inline bool btree_iter_good_node(struct btree_iter *iter, + unsigned l, int check_pos) +{ + if (!is_btree_node(iter, l) || + !bch2_btree_node_relock(iter, l)) + return false; + + if (check_pos <= 0 && btree_iter_pos_before_node(iter, iter->l[l].b)) + return false; + if (check_pos >= 0 && btree_iter_pos_after_node(iter, iter->l[l].b)) + return false; + return true; +} + 
+static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, + int check_pos) { unsigned l = iter->level; while (btree_iter_node(iter, l) && - (!is_btree_node(iter, l) || - !bch2_btree_node_relock(iter, l) || - (check_pos && - !btree_iter_pos_in_node(iter, iter->l[l].b)))) { + !btree_iter_good_node(iter, l, check_pos)) { btree_node_unlock(iter, l); iter->l[l].b = BTREE_ITER_NO_NODE_UP; l++; @@ -1048,7 +1078,7 @@ static unsigned btree_iter_up_until_locked(struct btree_iter *iter, * On error, caller (peek_node()/peek_key()) must return NULL; the error is * stashed in the iterator and returned from bch2_trans_exit(). */ -int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) +static int btree_iter_traverse_one(struct btree_iter *iter) { unsigned depth_want = iter->level; @@ -1062,7 +1092,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos * here unnecessary */ - iter->level = btree_iter_up_until_locked(iter, true); + iter->level = btree_iter_up_until_good_node(iter, 0); /* * If we've got a btree node locked (i.e. we aren't about to relock the @@ -1070,8 +1100,11 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) * * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary */ - if (btree_iter_node(iter, iter->level)) + if (btree_iter_node(iter, iter->level)) { + BUG_ON(!btree_iter_pos_in_node(iter, iter->l[iter->level].b)); + btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1); + } /* * Note: iter->nodes[iter->level] may be temporarily NULL here - that @@ -1100,12 +1133,12 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) return 0; } -int __must_check bch2_btree_iter_traverse(struct btree_iter *iter) +int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) { int ret; ret = bch2_trans_cond_resched(iter->trans) ?: - __bch2_btree_iter_traverse(iter); + btree_iter_traverse_one(iter); if (unlikely(ret)) ret = __btree_iter_traverse_all(iter->trans, iter, ret); @@ -1234,19 +1267,11 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); } -void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +static unsigned btree_iter_pos_changed(struct btree_iter *iter, int cmp) { - int cmp = bkey_cmp(new_pos, iter->pos); - unsigned level; + unsigned l = btree_iter_up_until_good_node(iter, cmp); - if (!cmp) - return; - - iter->pos = new_pos; - - level = btree_iter_up_until_locked(iter, true); - - if (btree_iter_node(iter, level)) { + if (btree_iter_node(iter, l)) { /* * We might have to skip over many keys, or just a few: try * advancing the node iterator, and if we have to skip over too @@ -1254,37 +1279,98 @@ void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) * is expensive). 
*/ if (cmp < 0 || - !btree_iter_advance_to_pos(iter, &iter->l[level], 8)) - __btree_iter_init(iter, level); + !btree_iter_advance_to_pos(iter, &iter->l[l], 8)) + __btree_iter_init(iter, l); /* Don't leave it locked if we're not supposed to: */ - if (btree_lock_want(iter, level) == BTREE_NODE_UNLOCKED) - btree_node_unlock(iter, level); + if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED) + btree_node_unlock(iter, l); } - if (level != iter->level) + return l; +} + +void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +{ + int cmp = bkey_cmp(new_pos, iter->pos); + unsigned l; + + if (!cmp) + return; + + iter->pos = new_pos; + + l = btree_iter_pos_changed(iter, cmp); + + if (l != iter->level) btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); else btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } +static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + + iter->pos = l->b->key.k.p; + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + + if (!bkey_cmp(iter->pos, POS_MAX)) { + bkey_init(&iter->k); + iter->k.p = POS_MAX; + return false; + } + + iter->pos = btree_type_successor(iter->btree_id, iter->pos); + btree_iter_pos_changed(iter, 1); + return true; +} + +static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + + iter->pos = l->b->data->min_key; + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + + if (!bkey_cmp(iter->pos, POS_MIN)) { + bkey_init(&iter->k); + iter->k.p = POS_MIN; + return false; + } + + iter->pos = btree_type_predecessor(iter->btree_id, iter->pos); + btree_iter_pos_changed(iter, -1); + return true; +} + static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; struct bkey_s_c ret = { .k = &iter->k }; if (!bkey_deleted(&iter->k)) { - EBUG_ON(bch2_btree_node_iter_end(&l->iter)); - ret.v = bkeyp_val(&l->b->format, - __bch2_btree_node_iter_peek_all(&l->iter, l->b)); + struct bkey_packed *_k = + __bch2_btree_node_iter_peek_all(&l->iter, l->b); + + ret.v = bkeyp_val(&l->b->format, _k); + + if (debug_check_iterators(iter->trans->c)) { + struct bkey k = bkey_unpack_key(l->b, _k); + BUG_ON(memcmp(&k, &iter->k, sizeof(k))); + } + + if (debug_check_bkeys(iter->trans->c)) + bch2_bkey_debugcheck(iter->trans->c, l->b, ret); } - if (debug_check_bkeys(iter->trans->c) && - !bkey_deleted(ret.k)) - bch2_bkey_debugcheck(iter->trans->c, l->b, ret); return ret; } +/** + * bch2_btree_iter_peek: returns first key greater than or equal to iterator's + * current position + */ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; @@ -1297,24 +1383,16 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) return btree_iter_peek_uptodate(iter); while (1) { - if (iter->uptodate >= BTREE_ITER_NEED_RELOCK) { - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); - } + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); k = __btree_iter_peek(iter, l); if (likely(k.k)) break; - /* got to the end of the leaf, iterator needs to be traversed: */ - iter->pos = l->b->key.k.p; - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; - - if (!bkey_cmp(iter->pos, POS_MAX)) + if (!btree_iter_set_pos_to_next_leaf(iter)) return bkey_s_c_null; - - iter->pos = btree_type_successor(iter->btree_id, iter->pos); } /* @@ -1329,22 +1407,10 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter 
*iter) return k; } -static noinline -struct bkey_s_c bch2_btree_iter_peek_next_leaf(struct btree_iter *iter) -{ - struct btree_iter_level *l = &iter->l[0]; - - iter->pos = l->b->key.k.p; - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; - - if (!bkey_cmp(iter->pos, POS_MAX)) - return bkey_s_c_null; - - iter->pos = btree_type_successor(iter->btree_id, iter->pos); - - return bch2_btree_iter_peek(iter); -} - +/** + * bch2_btree_iter_next: returns first key greater than iterator's current + * position + */ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; @@ -1353,15 +1419,19 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); - iter->pos = btree_type_successor(iter->btree_id, iter->k.p); - if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { + if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) + return bkey_s_c_null; + /* * XXX: when we just need to relock we should be able to avoid * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK * for that to work */ - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + + bch2_btree_iter_set_pos(iter, + btree_type_successor(iter->btree_id, iter->k.p)); return bch2_btree_iter_peek(iter); } @@ -1369,9 +1439,12 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) do { bch2_btree_node_iter_advance(&l->iter, l->b); p = bch2_btree_node_iter_peek_all(&l->iter, l->b); - if (unlikely(!p)) - return bch2_btree_iter_peek_next_leaf(iter); - } while (bkey_whiteout(p)); + } while (likely(p) && bkey_whiteout(p)); + + if (unlikely(!p)) + return btree_iter_set_pos_to_next_leaf(iter) + ? bch2_btree_iter_peek(iter) + : bkey_s_c_null; k = __btree_iter_unpack(iter, l, &iter->k, p); @@ -1380,51 +1453,79 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) return k; } -struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) +/** + * bch2_btree_iter_peek_prev: returns first key less than or equal to + * iterator's current position + */ +struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; - struct bkey_packed *p; struct bkey_s_c k; int ret; bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); - if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { - k = bch2_btree_iter_peek(iter); - if (IS_ERR(k.k)) - return k; - } + if (iter->uptodate == BTREE_ITER_UPTODATE) + return btree_iter_peek_uptodate(iter); while (1) { - p = bch2_btree_node_iter_prev(&l->iter, l->b); - if (likely(p)) - break; - - iter->pos = l->b->data->min_key; - if (!bkey_cmp(iter->pos, POS_MIN)) - return bkey_s_c_null; - - bch2_btree_iter_set_pos(iter, - btree_type_predecessor(iter->btree_id, iter->pos)); - ret = bch2_btree_iter_traverse(iter); if (unlikely(ret)) return bkey_s_c_err(ret); - p = bch2_btree_node_iter_peek(&l->iter, l->b); - if (p) + k = __btree_iter_peek(iter, l); + if (!k.k || + bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + k = __btree_iter_prev(iter, l); + + if (likely(k.k)) break; + + if (!btree_iter_set_pos_to_prev_leaf(iter)) + return bkey_s_c_null; } - k = __btree_iter_unpack(iter, l, &iter->k, p); - EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0); - iter->pos = bkey_start_pos(k.k); iter->uptodate = BTREE_ITER_UPTODATE; return k; } +/** + * bch2_btree_iter_prev: returns first key less than iterator's current + * position + */ +struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + struct 
bkey_s_c k; + + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + + if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { + /* + * XXX: when we just need to relock we should be able to avoid + * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK + * for that to work + */ + iter->pos = btree_type_predecessor(iter->btree_id, + iter->pos); + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + + return bch2_btree_iter_peek_prev(iter); + } + + k = __btree_iter_prev(iter, l); + if (unlikely(!k.k)) + return btree_iter_set_pos_to_prev_leaf(iter) + ? bch2_btree_iter_peek(iter) + : bkey_s_c_null; + + EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0); + iter->pos = bkey_start_pos(k.k); + return k; +} + static inline struct bkey_s_c __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) { @@ -1565,11 +1666,9 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (iter->uptodate == BTREE_ITER_UPTODATE) return btree_iter_peek_uptodate(iter); - if (iter->uptodate >= BTREE_ITER_NEED_RELOCK) { - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); - } + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); return __bch2_btree_iter_peek_slot(iter); } @@ -1671,7 +1770,10 @@ int bch2_trans_iter_free_on_commit(struct btree_trans *trans, static int bch2_trans_realloc_iters(struct btree_trans *trans, unsigned new_size) { - void *new_iters, *new_updates; + void *new_iters, *new_updates, *new_sorted; + size_t iters_bytes; + size_t updates_bytes; + size_t sorted_bytes; new_size = roundup_pow_of_two(new_size); @@ -1684,9 +1786,13 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans, bch2_trans_unlock(trans); - new_iters = kmalloc(sizeof(struct btree_iter) * new_size + - sizeof(struct btree_insert_entry) * (new_size + 4), - GFP_NOFS); + iters_bytes = sizeof(struct btree_iter) * new_size; + updates_bytes = sizeof(struct btree_insert_entry) * (new_size + 4); + sorted_bytes = sizeof(u8) * (new_size + 4); + + new_iters = kmalloc(iters_bytes + + updates_bytes + + sorted_bytes, GFP_NOFS); if (new_iters) goto success; @@ -1695,7 +1801,8 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans, trans->used_mempool = true; success: - new_updates = new_iters + sizeof(struct btree_iter) * new_size; + new_updates = new_iters + iters_bytes; + new_sorted = new_updates + updates_bytes; memcpy(new_iters, trans->iters, sizeof(struct btree_iter) * trans->nr_iters); @@ -1710,9 +1817,10 @@ success: if (trans->iters != trans->iters_onstack) kfree(trans->iters); - trans->iters = new_iters; - trans->updates = new_updates; - trans->size = new_size; + trans->iters = new_iters; + trans->updates = new_updates; + trans->updates_sorted = new_sorted; + trans->size = new_size; if (trans->iters_live) { trace_trans_restart_iters_realloced(trans->ip, trans->size); @@ -1957,6 +2065,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, trans->size = ARRAY_SIZE(trans->iters_onstack); trans->iters = trans->iters_onstack; trans->updates = trans->updates_onstack; + trans->updates_sorted = trans->updates_sorted_onstack; trans->fs_usage_deltas = NULL; if (expected_nr_iters > trans->size) @@ -1981,3 +2090,18 @@ int bch2_trans_exit(struct btree_trans *trans) return trans->error ? 
-EIO : 0; } + +void bch2_fs_btree_iter_exit(struct bch_fs *c) +{ + mempool_exit(&c->btree_iters_pool); +} + +int bch2_fs_btree_iter_init(struct bch_fs *c) +{ + unsigned nr = BTREE_ITER_MAX; + + return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, + sizeof(struct btree_iter) * nr + + sizeof(struct btree_insert_entry) * (nr + 4) + + sizeof(u8) * (nr + 4)); +} diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 249df21b..e4967215 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -134,7 +134,16 @@ void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *); void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *); -int __must_check bch2_btree_iter_traverse(struct btree_iter *); +int __must_check __bch2_btree_iter_traverse(struct btree_iter *); + +static inline int __must_check +bch2_btree_iter_traverse(struct btree_iter *iter) +{ + return iter->uptodate >= BTREE_ITER_NEED_RELOCK + ? __bch2_btree_iter_traverse(iter) + : 0; +} + int bch2_btree_iter_traverse_all(struct btree_trans *); struct btree *bch2_btree_iter_peek_node(struct btree_iter *); @@ -142,6 +151,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *, unsigned); struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); + +struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); @@ -303,4 +314,7 @@ void *bch2_trans_kmalloc(struct btree_trans *, size_t); void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); int bch2_trans_exit(struct btree_trans *); +void bch2_fs_btree_iter_exit(struct bch_fs *); +int bch2_fs_btree_iter_init(struct bch_fs *); + #endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index ea07ba19..592c3b4e 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -212,7 +212,7 @@ static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter EBUG_ON(iter->l[b->level].b != b); EBUG_ON(iter->l[b->level].lock_seq != b->lock.state.seq); - if (!six_trylock_write(&b->lock)) + if (unlikely(!six_trylock_write(&b->lock))) __bch2_btree_node_lock_write(b, iter); } diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index f4e1bfe1..b0da0963 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -261,8 +261,6 @@ struct btree_insert_entry { }; bool deferred; - bool triggered; - bool marked; }; #define BTREE_ITER_MAX 64 @@ -291,6 +289,7 @@ struct btree_trans { struct btree_iter *iters; struct btree_insert_entry *updates; + u8 *updates_sorted; /* update path: */ struct journal_res journal_res; @@ -302,6 +301,7 @@ struct btree_trans { struct btree_iter iters_onstack[2]; struct btree_insert_entry updates_onstack[6]; + u8 updates_sorted_onstack[6]; struct replicas_delta_list *fs_usage_deltas; }; diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index 616c103c..36e34b3d 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -43,7 +43,6 @@ enum { __BTREE_INSERT_USE_ALLOC_RESERVE, __BTREE_INSERT_JOURNAL_REPLAY, __BTREE_INSERT_JOURNAL_RESERVED, - __BTREE_INSERT_NOMARK_INSERT, __BTREE_INSERT_NOMARK_OVERWRITES, __BTREE_INSERT_NOMARK, __BTREE_INSERT_MARK_INMEM, @@ -81,9 +80,6 @@ enum { #define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) -/* Don't mark new key, just 
overwrites: */ -#define BTREE_INSERT_NOMARK_INSERT (1 << __BTREE_INSERT_NOMARK_INSERT) - /* Don't mark overwrites, just new key: */ #define BTREE_INSERT_NOMARK_OVERWRITES (1 << __BTREE_INSERT_NOMARK_OVERWRITES) @@ -123,8 +119,13 @@ int bch2_trans_commit(struct btree_trans *, struct disk_reservation *, u64 *, unsigned); -struct btree_insert_entry *bch2_trans_update(struct btree_trans *, - struct btree_insert_entry); +static inline void bch2_trans_update(struct btree_trans *trans, + struct btree_insert_entry entry) +{ + EBUG_ON(trans->nr_updates >= trans->nr_iters + 4); + + trans->updates[trans->nr_updates++] = entry; +} #define bch2_trans_do(_c, _journal_seq, _flags, _do) \ ({ \ @@ -144,18 +145,6 @@ struct btree_insert_entry *bch2_trans_update(struct btree_trans *, _ret; \ }) -/* - * We sort transaction entries so that if multiple iterators point to the same - * leaf node they'll be adjacent: - */ -static inline bool same_leaf_as_prev(struct btree_trans *trans, - struct btree_insert_entry *i) -{ - return i != trans->updates && - !i->deferred && - i[0].iter->l[0].b == i[-1].iter->l[0].b; -} - #define __trans_next_update(_trans, _i, _filter) \ ({ \ while ((_i) < (_trans)->updates + (_trans->nr_updates) && !(_filter))\ @@ -175,8 +164,4 @@ static inline bool same_leaf_as_prev(struct btree_trans *trans, #define trans_for_each_update_iter(trans, i) \ __trans_for_each_update(trans, i, !(i)->deferred) -#define trans_for_each_update_leaf(trans, i) \ - __trans_for_each_update(trans, i, !(i)->deferred && \ - !same_leaf_as_prev(trans, i)) - #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index c0a84153..7d983b21 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -19,12 +19,32 @@ #include #include +static inline bool same_leaf_as_prev(struct btree_trans *trans, + unsigned sorted_idx) +{ + struct btree_insert_entry *i = trans->updates + + trans->updates_sorted[sorted_idx]; + struct btree_insert_entry *prev = sorted_idx + ? 
trans->updates + trans->updates_sorted[sorted_idx - 1] + : NULL; + + return !i->deferred && + prev && + i->iter->l[0].b == prev->iter->l[0].b; +} + +#define trans_for_each_update_sorted(_trans, _i, _iter) \ + for (_iter = 0; \ + _iter < _trans->nr_updates && \ + (_i = _trans->updates + _trans->updates_sorted[_iter], 1); \ + _iter++) + inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, struct btree_iter *iter) { bch2_btree_node_lock_write(b, iter); - if (btree_node_just_written(b) && + if (unlikely(btree_node_just_written(b)) && bch2_btree_post_write_cleanup(c, b)) bch2_btree_iter_reinit_node(iter, b); @@ -36,20 +56,21 @@ inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, bch2_btree_init_next(c, b, iter); } -static void btree_trans_lock_write(struct bch_fs *c, struct btree_trans *trans) +static void btree_trans_lock_write(struct btree_trans *trans, bool lock) { + struct bch_fs *c = trans->c; struct btree_insert_entry *i; + unsigned iter; - trans_for_each_update_leaf(trans, i) - bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter); -} + trans_for_each_update_sorted(trans, i, iter) { + if (same_leaf_as_prev(trans, iter)) + continue; -static void btree_trans_unlock_write(struct btree_trans *trans) -{ - struct btree_insert_entry *i; - - trans_for_each_update_leaf(trans, i) - bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter); + if (lock) + bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter); + else + bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter); + } } static inline int btree_trans_cmp(struct btree_insert_entry l, @@ -59,6 +80,30 @@ static inline int btree_trans_cmp(struct btree_insert_entry l, btree_iter_cmp(l.iter, r.iter); } +static inline void btree_trans_sort_updates(struct btree_trans *trans) +{ + struct btree_insert_entry *l, *r; + unsigned nr = 0, pos; + + trans_for_each_update(trans, l) { + for (pos = 0; pos < nr; pos++) { + r = trans->updates + trans->updates_sorted[pos]; + + if (btree_trans_cmp(*l, *r) <= 0) + break; + } + + memmove(&trans->updates_sorted[pos + 1], + &trans->updates_sorted[pos], + (nr - pos) * sizeof(trans->updates_sorted[0])); + + trans->updates_sorted[pos] = l - trans->updates; + nr++; + } + + BUG_ON(nr != trans->nr_updates); +} + /* Inserting into a given leaf node (last stage of insert): */ /* Handle overwrites and do insert, for non extents: */ @@ -106,7 +151,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, bch2_bset_delete(b, k, clobber_u64s); bch2_btree_node_iter_fix(iter, b, node_iter, k, clobber_u64s, 0); - bch2_btree_iter_verify(iter, b); return true; } @@ -116,7 +160,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, k->type = KEY_TYPE_deleted; bch2_btree_node_iter_fix(iter, b, node_iter, k, k->u64s, k->u64s); - bch2_btree_iter_verify(iter, b); if (bkey_whiteout(&insert->k)) { reserve_whiteout(b, k); @@ -138,10 +181,8 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, clobber_u64s = 0; overwrite: bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); - if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k)) - bch2_btree_node_iter_fix(iter, b, node_iter, k, - clobber_u64s, k->u64s); - bch2_btree_iter_verify(iter, b); + bch2_btree_node_iter_fix(iter, b, node_iter, k, + clobber_u64s, k->u64s); return true; } @@ -488,12 +529,12 @@ static int btree_trans_check_can_insert(struct btree_trans *trans, struct btree_insert_entry **stopped_at) { struct btree_insert_entry *i; - unsigned u64s = 0; + unsigned iter, u64s = 0; int ret; - 
trans_for_each_update_iter(trans, i) { + trans_for_each_update_sorted(trans, i, iter) { /* Multiple inserts might go to same leaf: */ - if (!same_leaf_as_prev(trans, i)) + if (!same_leaf_as_prev(trans, iter)) u64s = 0; u64s += i->k->k.u64s; @@ -542,7 +583,6 @@ static inline int do_btree_insert_at(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_fs_usage *fs_usage = NULL; struct btree_insert_entry *i; - bool saw_non_marked; unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE ? BCH_BUCKET_MARK_BUCKET_INVALIDATE : 0; @@ -551,35 +591,32 @@ static inline int do_btree_insert_at(struct btree_trans *trans, trans_for_each_update_iter(trans, i) BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); + /* + * note: running triggers will append more updates to the list of + * updates as we're walking it: + */ trans_for_each_update_iter(trans, i) - i->marked = false; - - do { - saw_non_marked = false; - - trans_for_each_update_iter(trans, i) { - if (i->marked) - continue; - - saw_non_marked = true; - i->marked = true; - - if (update_has_triggers(trans, i) && - update_triggers_transactional(trans, i)) { - ret = bch2_trans_mark_update(trans, i->iter, i->k); - if (ret == -EINTR) - trace_trans_restart_mark(trans->ip); - if (ret) - goto out_clear_replicas; - } + if (update_has_triggers(trans, i) && + update_triggers_transactional(trans, i)) { + ret = bch2_trans_mark_update(trans, i->iter, i->k); + if (ret == -EINTR) + trace_trans_restart_mark(trans->ip); + if (ret) + goto out_clear_replicas; } - } while (saw_non_marked); - trans_for_each_update(trans, i) - btree_insert_entry_checks(trans, i); + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) + trans_for_each_update(trans, i) + btree_insert_entry_checks(trans, i); bch2_btree_trans_verify_locks(trans); - btree_trans_lock_write(c, trans); + /* + * No more updates can be added - sort updates so we can take write + * locks in the correct order: + */ + btree_trans_sort_updates(trans); + + btree_trans_lock_write(trans, true); if (race_fault()) { ret = -EINTR; @@ -597,8 +634,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, goto out; trans_for_each_update_iter(trans, i) { - if (i->deferred || - !btree_node_type_needs_gc(i->iter->btree_id)) + if (!btree_node_type_needs_gc(i->iter->btree_id)) continue; if (!fs_usage) { @@ -664,7 +700,7 @@ out: (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) && trans->journal_res.ref); - btree_trans_unlock_write(trans); + btree_trans_lock_write(trans, false); if (fs_usage) { bch2_fs_usage_scratch_put(c, fs_usage); @@ -689,19 +725,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, { struct bch_fs *c = trans->c; unsigned flags = trans->flags; - struct btree_insert_entry *src, *dst; - - src = dst = trans->updates; - - while (src < trans->updates + trans->nr_updates) { - if (!src->triggered) { - *dst = *src; - dst++; - } - src++; - } - - trans->nr_updates = dst - trans->updates; /* * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree @@ -816,6 +839,7 @@ static int __bch2_trans_commit(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_insert_entry *i; + unsigned iter; int ret; trans_for_each_update_iter(trans, i) { @@ -837,8 +861,10 @@ static int __bch2_trans_commit(struct btree_trans *trans, if (trans->flags & BTREE_INSERT_NOUNLOCK) trans->nounlock = true; - trans_for_each_update_leaf(trans, i) - bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags); + trans_for_each_update_sorted(trans, i, iter) + if (!same_leaf_as_prev(trans, iter)) + 
bch2_foreground_maybe_merge(c, i->iter, + 0, trans->flags); trans->nounlock = false; @@ -858,7 +884,8 @@ int bch2_trans_commit(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_insert_entry *i = NULL; - unsigned orig_mem_top = trans->mem_top; + unsigned orig_nr_updates = trans->nr_updates; + unsigned orig_mem_top = trans->mem_top; int ret = 0; if (!trans->nr_updates) @@ -931,39 +958,20 @@ out_noupdates: err: ret = bch2_trans_commit_error(trans, i, ret); + /* free updates and memory used by triggers, they'll be reexecuted: */ + trans->nr_updates = orig_nr_updates; + trans->mem_top = orig_mem_top; + /* can't loop if it was passed in and we changed it: */ if (unlikely(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS) && !ret) ret = -EINTR; - if (!ret) { - /* free memory used by triggers, they'll be reexecuted: */ - trans->mem_top = orig_mem_top; + if (!ret) goto retry; - } goto out; } -struct btree_insert_entry *bch2_trans_update(struct btree_trans *trans, - struct btree_insert_entry entry) -{ - struct btree_insert_entry *i; - - BUG_ON(trans->nr_updates >= trans->nr_iters + 4); - - for (i = trans->updates; - i < trans->updates + trans->nr_updates; - i++) - if (btree_trans_cmp(entry, *i) < 0) - break; - - memmove(&i[1], &i[0], - (void *) &trans->updates[trans->nr_updates] - (void *) i); - trans->nr_updates++; - *i = entry; - return i; -} - /** * bch2_btree_insert - insert keys into the extent btree * @c: pointer to struct bch_fs diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 1516df22..6a4773a9 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -1265,11 +1265,10 @@ int bch2_mark_update(struct btree_trans *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return 0; - if (!(trans->flags & BTREE_INSERT_NOMARK_INSERT)) - bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), - 0, insert->k->k.size, - fs_usage, trans->journal_res.seq, - BCH_BUCKET_MARK_INSERT|flags); + bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), + 0, insert->k->k.size, + fs_usage, trans->journal_res.seq, + BCH_BUCKET_MARK_INSERT|flags); if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES)) return 0; @@ -1359,11 +1358,8 @@ static int trans_get_key(struct btree_trans *trans, struct btree_insert_entry *i; int ret; - for (i = trans->updates; - i < trans->updates + trans->nr_updates; - i++) - if (!i->deferred && - i->iter->btree_id == btree_id && + trans_for_each_update_iter(trans, i) + if (i->iter->btree_id == btree_id && (btree_node_type_is_extents(btree_id) ? 
bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 && bkey_cmp(pos, i->k->k.p) < 0 @@ -1391,8 +1387,8 @@ static void *trans_update_key(struct btree_trans *trans, struct btree_iter *iter, unsigned u64s) { + struct btree_insert_entry *i; struct bkey_i *new_k; - unsigned i; new_k = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); if (IS_ERR(new_k)) @@ -1401,19 +1397,13 @@ static void *trans_update_key(struct btree_trans *trans, bkey_init(&new_k->k); new_k->k.p = iter->pos; - for (i = 0; i < trans->nr_updates; i++) - if (!trans->updates[i].deferred && - trans->updates[i].iter == iter) { - trans->updates[i].k = new_k; + trans_for_each_update_iter(trans, i) + if (i->iter == iter) { + i->k = new_k; return new_k; } - bch2_trans_update(trans, ((struct btree_insert_entry) { - .iter = iter, - .k = new_k, - .triggered = true, - })); - + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, new_k)); return new_k; } @@ -1496,6 +1486,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, bch2_fs_inconsistent_on(overflow, c, "bucket sector count overflow: %u + %lli > U16_MAX", old, sectors); + BUG_ON(overflow); a = trans_update_key(trans, iter, BKEY_ALLOC_U64s_MAX); ret = PTR_ERR_OR_ZERO(a); diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c index e55aa98c..a5c947e8 100644 --- a/libbcachefs/checksum.c +++ b/libbcachefs/checksum.c @@ -127,7 +127,6 @@ static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc, do_encrypt(c->chacha20, nonce, key, sizeof(key)); desc->tfm = c->poly1305; - desc->flags = 0; crypto_shash_init(desc); crypto_shash_update(desc, key, sizeof(key)); } diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 0742d2c1..be2eca0f 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -1173,12 +1173,8 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) struct ec_stripe_new *s = NULL; mutex_lock(&h->lock); - bch2_open_buckets_stop_dev(c, ca, - &h->blocks, - BCH_DATA_USER); - bch2_open_buckets_stop_dev(c, ca, - &h->parity, - BCH_DATA_USER); + bch2_open_buckets_stop_dev(c, ca, &h->blocks); + bch2_open_buckets_stop_dev(c, ca, &h->parity); if (!h->s) goto unlock; diff --git a/libbcachefs/error.c b/libbcachefs/error.c index 1aaff44e..304ff925 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -4,6 +4,8 @@ #include "io.h" #include "super.h" +#define FSCK_ERR_RATELIMIT_NR 10 + bool bch2_inconsistent_error(struct bch_fs *c) { set_bit(BCH_FS_ERROR, &c->flags); @@ -97,8 +99,8 @@ enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, found: list_move(&s->list, &c->fsck_errors); s->nr++; - suppressing = s->nr == 10; - print = s->nr <= 10; + suppressing = s->nr == FSCK_ERR_RATELIMIT_NR; + print = s->nr <= FSCK_ERR_RATELIMIT_NR; buf = s->buf; print: va_start(args, fmt); @@ -152,10 +154,9 @@ void bch2_flush_fsck_errs(struct bch_fs *c) struct fsck_err_state *s, *n; mutex_lock(&c->fsck_error_lock); - set_bit(BCH_FS_FSCK_DONE, &c->flags); list_for_each_entry_safe(s, n, &c->fsck_errors, list) { - if (s->nr > 10) + if (s->nr > FSCK_ERR_RATELIMIT_NR) bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf); list_del(&s->list); diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index ecebd791..e10ea43b 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -672,8 +672,7 @@ const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) return bch2_bkey_ptrs_invalid(c, k); } -void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, - struct bkey_s_c k) +void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) { 
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; @@ -877,13 +876,6 @@ static void verify_extent_nonoverlapping(struct bch_fs *c, #endif } -static void verify_modified_extent(struct btree_iter *iter, - struct bkey_packed *k) -{ - bch2_btree_iter_verify(iter, iter->l[0].b); - bch2_verify_insert_pos(iter->l[0].b, k, k, k->u64s); -} - static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, struct bkey_i *insert) { @@ -896,6 +888,9 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); verify_extent_nonoverlapping(c, l->b, &l->iter, insert); + if (debug_check_bkeys(c)) + bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert)); + node_iter = l->iter; k = bch2_btree_node_iter_prev_filter(&node_iter, l->b, KEY_TYPE_discard); if (k && !bkey_written(l->b, k) && @@ -922,7 +917,6 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, bch2_bset_insert(l->b, &l->iter, k, insert, 0); bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); - bch2_btree_iter_verify(iter, l->b); } static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) @@ -942,12 +936,13 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) return ret; } -static int __bch2_extent_atomic_end(struct btree_trans *trans, - struct bkey_s_c k, - unsigned offset, - struct bpos *end, - unsigned *nr_iters, - unsigned max_iters) +static int count_iters_for_insert(struct btree_trans *trans, + struct bkey_s_c k, + unsigned offset, + struct bpos *end, + unsigned *nr_iters, + unsigned max_iters, + bool overwrite) { int ret = 0; @@ -977,6 +972,20 @@ static int __bch2_extent_atomic_end(struct btree_trans *trans, break; *nr_iters += 1; + + if (overwrite && + k.k->type == KEY_TYPE_reflink_v) { + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + if (le64_to_cpu(r.v->refcount) == 1) + *nr_iters += bch2_bkey_nr_alloc_ptrs(k); + } + + /* + * if we're going to be deleting an entry from + * the reflink btree, need more iters... 
+ */ + if (*nr_iters >= max_iters) { struct bpos pos = bkey_start_pos(k.k); pos.offset += r_k.k->p.offset - idx; @@ -994,11 +1003,11 @@ static int __bch2_extent_atomic_end(struct btree_trans *trans, return ret; } -int bch2_extent_atomic_end(struct btree_trans *trans, - struct btree_iter *iter, +int bch2_extent_atomic_end(struct btree_iter *iter, struct bkey_i *insert, struct bpos *end) { + struct btree_trans *trans = iter->trans; struct btree *b = iter->l[0].b; struct btree_node_iter node_iter = iter->l[0].iter; struct bkey_packed *_k; @@ -1011,8 +1020,8 @@ int bch2_extent_atomic_end(struct btree_trans *trans, *end = bpos_min(insert->k.p, b->key.k.p); - ret = __bch2_extent_atomic_end(trans, bkey_i_to_s_c(insert), - 0, end, &nr_iters, 10); + ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), + 0, end, &nr_iters, 10, false); if (ret) return ret; @@ -1031,8 +1040,8 @@ int bch2_extent_atomic_end(struct btree_trans *trans, offset = bkey_start_offset(&insert->k) - bkey_start_offset(k.k); - ret = __bch2_extent_atomic_end(trans, k, offset, - end, &nr_iters, 20); + ret = count_iters_for_insert(trans, k, offset, + end, &nr_iters, 20, true); if (ret) return ret; @@ -1050,7 +1059,7 @@ int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) struct bpos end; int ret; - ret = bch2_extent_atomic_end(iter->trans, iter, k, &end); + ret = bch2_extent_atomic_end(iter, k, &end); if (ret) return ret; @@ -1063,7 +1072,7 @@ int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) struct bpos end; int ret; - ret = bch2_extent_atomic_end(iter->trans, iter, k, &end); + ret = bch2_extent_atomic_end(iter, k, &end); if (ret) return ret; @@ -1137,15 +1146,16 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, case BCH_EXTENT_OVERLAP_FRONT: /* insert overlaps with start of k: */ __bch2_cut_front(insert->k.p, k); - BUG_ON(bkey_deleted(k.k)); + EBUG_ON(bkey_deleted(k.k)); extent_save(l->b, _k, k.k); - verify_modified_extent(iter, _k); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, _k->u64s, _k->u64s); break; case BCH_EXTENT_OVERLAP_BACK: /* insert overlaps with end of k: */ bch2_cut_back(bkey_start_pos(&insert->k), k.k); - BUG_ON(bkey_deleted(k.k)); + EBUG_ON(bkey_deleted(k.k)); extent_save(l->b, _k, k.k); /* @@ -1156,7 +1166,6 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, bch2_bset_fix_invalidated_key(l->b, _k); bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, _k->u64s, _k->u64s); - verify_modified_extent(iter, _k); break; case BCH_EXTENT_OVERLAP_ALL: { @@ -1173,12 +1182,10 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, bch2_bset_delete(l->b, _k, _k->u64s); bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, u64s, 0); - bch2_btree_iter_verify(iter, l->b); } else { extent_save(l->b, _k, k.k); bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, _k->u64s, _k->u64s); - verify_modified_extent(iter, _k); } break; @@ -1208,7 +1215,8 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, __bch2_cut_front(insert->k.p, k); BUG_ON(bkey_deleted(k.k)); extent_save(l->b, _k, k.k); - verify_modified_extent(iter, _k); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, _k->u64s, _k->u64s); extent_bset_insert(c, iter, &split.k); break; @@ -1265,6 +1273,8 @@ static void __bch2_insert_fixup_extent(struct bch_fs *c, btree_account_key_drop(l->b, _k); _k->type = KEY_TYPE_discard; reserve_whiteout(l->b, _k); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, _k->u64s, _k->u64s); } break; } @@ -1359,10 +1369,6 @@ void bch2_insert_fixup_extent(struct 
btree_trans *trans, if (s.deleting) tmp.k.k.type = KEY_TYPE_discard; - if (debug_check_bkeys(c)) - bch2_bkey_debugcheck(c, iter->l[0].b, - bkey_i_to_s_c(&tmp.k)); - EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); extent_bset_insert(c, iter, &tmp.k); @@ -1387,8 +1393,7 @@ const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) return bch2_bkey_ptrs_invalid(c, k); } -void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, - struct bkey_s_c k) +void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const union bch_extent_entry *entry; @@ -1762,6 +1767,12 @@ static bool bch2_extent_merge_inline(struct bch_fs *c, if (ret == BCH_MERGE_NOMERGE) return false; + if (debug_check_bkeys(c)) + bch2_bkey_debugcheck(c, b, bkey_i_to_s_c(&li.k)); + if (debug_check_bkeys(c) && + ret == BCH_MERGE_PARTIAL) + bch2_bkey_debugcheck(c, b, bkey_i_to_s_c(&ri.k)); + /* * check if we overlap with deleted extents - would break the sort * order: @@ -1798,7 +1809,6 @@ static bool bch2_extent_merge_inline(struct bch_fs *c, bch2_bset_fix_invalidated_key(b, m); bch2_btree_node_iter_fix(iter, b, node_iter, m, m->u64s, m->u64s); - verify_modified_extent(iter, m); return ret == BCH_MERGE_MERGE; } diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 189ae4c7..613d76af 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -389,8 +389,7 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); /* bch_btree_ptr: */ const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *, - struct bkey_s_c); +void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); @@ -405,7 +404,7 @@ void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); /* bch_extent: */ const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); +void bch2_extent_debugcheck(struct bch_fs *, struct bkey_s_c); void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); enum merge_result bch2_extent_merge(struct bch_fs *, @@ -433,8 +432,8 @@ enum merge_result bch2_reservation_merge(struct bch_fs *, .key_merge = bch2_reservation_merge, \ } -int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *, - struct bkey_i *, struct bpos *); +int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *, + struct bpos *); int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); @@ -455,12 +454,11 @@ unsigned bch2_extent_is_compressed(struct bkey_s_c); bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, struct bch_extent_ptr, u64); -static inline bool bkey_extent_is_data(const struct bkey *k) +static inline bool bkey_extent_is_direct_data(const struct bkey *k) { switch (k->type) { case KEY_TYPE_btree_ptr: case KEY_TYPE_extent: - case KEY_TYPE_reflink_p: case KEY_TYPE_reflink_v: return true; default: @@ -468,6 +466,12 @@ static inline bool bkey_extent_is_data(const struct bkey *k) } } +static inline bool bkey_extent_is_data(const struct bkey *k) +{ + return bkey_extent_is_direct_data(k) || + k->type == KEY_TYPE_reflink_p; +} + /* * Should 
extent be counted under inode->i_sectors? */ diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index d635ebb5..aff70324 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -749,6 +749,9 @@ static void bch2_set_page_dirty(struct bch_fs *c, struct bch_page_state *s = bch2_page_state(page); unsigned i, dirty_sectors = 0; + WARN_ON(page_offset(page) + offset + len > + round_up(i_size_read(&inode->v), block_bytes(c))); + for (i = round_down(offset, block_bytes(c)) >> 9; i < round_up(offset + len, block_bytes(c)) >> 9; i++) { @@ -780,6 +783,8 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) struct address_space *mapping = inode->v.i_mapping; struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_page_reservation res; + unsigned len; + loff_t isize; int ret = VM_FAULT_LOCKED; bch2_page_reservation_init(c, inode, &res); @@ -797,21 +802,27 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) pagecache_add_get(&mapping->add_lock); lock_page(page); - if (page->mapping != mapping || - page_offset(page) > i_size_read(&inode->v)) { + isize = i_size_read(&inode->v); + + if (page->mapping != mapping || page_offset(page) >= isize) { unlock_page(page); ret = VM_FAULT_NOPAGE; goto out; } - if (bch2_page_reservation_get(c, inode, page, &res, - 0, PAGE_SIZE, true)) { + /* page is wholly or partially inside EOF */ + if (((page->index + 1) << PAGE_SHIFT) <= isize) + len = PAGE_SIZE; + else + len = offset_in_page(isize); + + if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) { unlock_page(page); ret = VM_FAULT_SIGBUS; goto out; } - bch2_set_page_dirty(c, inode, page, &res, 0, PAGE_SIZE); + bch2_set_page_dirty(c, inode, page, &res, 0, len); wait_for_stable_page(page); out: if (current->pagecache_lock != &mapping->add_lock) @@ -884,9 +895,8 @@ static void bch2_readpages_end_io(struct bio *bio) { struct bvec_iter_all iter; struct bio_vec *bv; - int i; - bio_for_each_segment_all(bv, bio, i, iter) { + bio_for_each_segment_all(bv, bio, iter) { struct page *page = bv->bv_page; if (!bio->bi_status) { @@ -1287,10 +1297,10 @@ static void bch2_writepage_io_done(struct closure *cl) struct bio *bio = &io->op.op.wbio.bio; struct bvec_iter_all iter; struct bio_vec *bvec; - unsigned i, j; + unsigned i; if (io->op.op.error) { - bio_for_each_segment_all(bvec, bio, i, iter) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s; SetPageError(bvec->bv_page); @@ -1298,8 +1308,8 @@ static void bch2_writepage_io_done(struct closure *cl) lock_page(bvec->bv_page); s = bch2_page_state(bvec->bv_page); - for (j = 0; j < PAGE_SECTORS; j++) - s->s[j].nr_replicas = 0; + for (i = 0; i < PAGE_SECTORS; i++) + s->s[i].nr_replicas = 0; unlock_page(bvec->bv_page); } } @@ -1325,7 +1335,7 @@ static void bch2_writepage_io_done(struct closure *cl) i_sectors_acct(c, io->op.inode, NULL, io->op.sectors_added - (s64) io->new_sectors); - bio_for_each_segment_all(bvec, bio, i, iter) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s = __bch2_page_state(bvec->bv_page); if (atomic_dec_and_test(&s->write_count)) @@ -1490,6 +1500,10 @@ do_io: BUG_ON(!bio_add_page(&w->io->op.op.wbio.bio, page, sectors << 9, offset << 9)); + /* Check for writing past i_size: */ + WARN_ON((bio_end_sector(&w->io->op.op.wbio.bio) << 9) > + round_up(i_size, block_bytes(c))); + w->io->op.op.res.sectors += reserved_sectors; w->io->op.new_i_size = i_size; @@ -1994,16 +2008,17 @@ static void bch2_dio_write_loop_async(struct closure *); static long bch2_dio_write_loop(struct dio_write *dio) { bool kthread = 
@@ -1994,16 +2008,17 @@ static void bch2_dio_write_loop_async(struct closure *);
 
 static long bch2_dio_write_loop(struct dio_write *dio)
 {
 	bool kthread = (current->flags & PF_KTHREAD) != 0;
+	struct bch_fs *c = dio->iop.op.c;
 	struct kiocb *req = dio->req;
 	struct address_space *mapping = req->ki_filp->f_mapping;
 	struct bch_inode_info *inode = dio->iop.inode;
 	struct bio *bio = &dio->iop.op.wbio.bio;
 	struct bvec_iter_all iter;
 	struct bio_vec *bv;
+	unsigned unaligned;
 	loff_t offset;
 	bool sync;
 	long ret;
-	int i;
 
 	if (dio->loop)
 		goto loop;
@@ -2036,6 +2051,21 @@ static long bch2_dio_write_loop(struct dio_write *dio)
 		if (unlikely(ret < 0))
 			goto err;
 
+		unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
+		bio->bi_iter.bi_size -= unaligned;
+		iov_iter_revert(&dio->iter, unaligned);
+
+		if (!bio->bi_iter.bi_size) {
+			/*
+			 * bio_iov_iter_get_pages was only able to get <
+			 * blocksize worth of pages:
+			 */
+			bio_for_each_segment_all(bv, bio, iter)
+				put_page(bv->bv_page);
+			ret = -EFAULT;
+			goto err;
+		}
+
 		/* gup might have faulted pages back in: */
 		ret = write_invalidate_inode_pages_range(mapping,
 					offset,
@@ -2076,7 +2106,7 @@ err_wait_io:
 	closure_sync(&dio->cl);
 loop:
-	bio_for_each_segment_all(bv, bio, i, iter)
+	bio_for_each_segment_all(bv, bio, iter)
 		put_page(bv->bv_page);
 	if (!dio->iter.count || dio->iop.op.error)
 		break;
@@ -2086,8 +2116,8 @@ loop:
 	ret = dio->iop.op.error ?: ((long) dio->iop.op.written << 9);
 err:
 	__pagecache_block_put(&mapping->add_lock);
-	bch2_disk_reservation_put(dio->iop.op.c, &dio->iop.op.res);
-	bch2_quota_reservation_put(dio->iop.op.c, inode, &dio->quota_res);
+	bch2_disk_reservation_put(c, &dio->iop.op.res);
+	bch2_quota_reservation_put(c, inode, &dio->quota_res);
 
 	if (dio->free_iov)
 		kfree(dio->iter.iov);
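In the O_DIRECT write loop above, bio_iov_iter_get_pages() can gather a byte count that is not a multiple of the filesystem block size; the new code masks the bio size back down to an aligned value, rewinds the iov_iter by the same amount, and gives up with -EFAULT if nothing aligned is left. A small userspace sketch of the arithmetic (block size and I/O size chosen arbitrarily):

#include <stdio.h>

int main(void)
{
	unsigned long block_bytes = 4096;	/* filesystem block size (example) */
	unsigned long bio_size    = 9000;	/* what the page-gathering step returned */

	/* bytes hanging past the last complete block: */
	unsigned long unaligned = bio_size & (block_bytes - 1);

	bio_size -= unaligned;	/* shrink the I/O; the iterator is rewound by 'unaligned' */

	if (!bio_size)
		printf("less than one block available: would fail with -EFAULT\n");
	else
		printf("submit %lu bytes, reverted %lu\n", bio_size, unaligned);
	return 0;
}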
@@ -2530,6 +2560,16 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
 	if (unlikely(ret))
 		goto err;
 
+	/*
+	 * When extending, we're going to write the new i_size to disk
+	 * immediately so we need to flush anything above the current on disk
+	 * i_size first:
+	 *
+	 * Also, when extending we need to flush the page that i_size currently
+	 * straddles - if it's mapped to userspace, we need to ensure that
+	 * userspace has to redirty it and call .mkwrite -> set_page_dirty
+	 * again to allocate the part of the page that was extended.
+	 */
 	if (iattr->ia_size > inode->ei_inode.bi_size)
 		ret = filemap_write_and_wait_range(mapping,
 				inode->ei_inode.bi_size,
@@ -2608,16 +2648,16 @@ err:
 	return ret;
 }
 
-static long bch2_fcollapse(struct bch_inode_info *inode,
-			   loff_t offset, loff_t len)
+static long bch2_fcollapse_finsert(struct bch_inode_info *inode,
+				   loff_t offset, loff_t len,
+				   bool insert)
 {
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
 	struct address_space *mapping = inode->v.i_mapping;
 	struct btree_trans trans;
-	struct btree_iter *src, *dst;
-	BKEY_PADDED(k) copy;
-	struct bkey_s_c k;
-	loff_t new_size;
+	struct btree_iter *src, *dst, *del = NULL;
+	loff_t shift, new_size;
+	u64 src_start;
 	int ret;
 
@@ -2635,92 +2675,188 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
 	inode_dio_wait(&inode->v);
 	pagecache_block_get(&mapping->add_lock);
 
-	ret = -EINVAL;
-	if (offset + len >= inode->v.i_size)
-		goto err;
+	if (insert) {
+		ret = -EFBIG;
+		if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len)
+			goto err;
 
-	if (inode->v.i_size < len)
-		goto err;
+		ret = -EINVAL;
+		if (offset >= inode->v.i_size)
+			goto err;
 
-	new_size = inode->v.i_size - len;
+		src_start = U64_MAX;
+		shift = len;
+	} else {
+		ret = -EINVAL;
+		if (offset + len >= inode->v.i_size)
+			goto err;
+
+		src_start = offset + len;
+		shift = -len;
+	}
+
+	new_size = inode->v.i_size + shift;
 
 	ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
 	if (ret)
 		goto err;
 
-	dst = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
-			POS(inode->v.i_ino, offset >> 9),
-			BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-	BUG_ON(IS_ERR_OR_NULL(dst));
+	if (insert) {
+		i_size_write(&inode->v, new_size);
+		mutex_lock(&inode->ei_update_lock);
+		ret = bch2_write_inode_size(c, inode, new_size,
+					    ATTR_MTIME|ATTR_CTIME);
+		mutex_unlock(&inode->ei_update_lock);
+	} else {
+		ret = __bch2_fpunch(c, inode, offset >> 9,
+				    (offset + len) >> 9);
+		if (ret)
+			goto err;
+	}
 
 	src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
-			POS_MIN, BTREE_ITER_SLOTS);
+			POS(inode->v.i_ino, src_start >> 9),
+			BTREE_ITER_INTENT);
 	BUG_ON(IS_ERR_OR_NULL(src));
 
-	while (bkey_cmp(dst->pos,
-			POS(inode->v.i_ino,
-			    round_up(new_size, block_bytes(c)) >> 9)) < 0) {
-		struct disk_reservation disk_res;
+	dst = bch2_trans_copy_iter(&trans, src);
+	BUG_ON(IS_ERR_OR_NULL(dst));
+
+	while (1) {
+		struct disk_reservation disk_res =
+			bch2_disk_reservation_init(c, 0);
+		BKEY_PADDED(k) copy;
+		struct bkey_i delete;
+		struct bkey_s_c k;
+		struct bpos next_pos;
+		struct bpos move_pos = POS(inode->v.i_ino, offset >> 9);
+		struct bpos atomic_end;
+		unsigned commit_flags = BTREE_INSERT_NOFAIL|
+			BTREE_INSERT_ATOMIC|
+			BTREE_INSERT_USE_RESERVE;
+
+		k = insert
+			? bch2_btree_iter_peek_prev(src)
+			: bch2_btree_iter_peek(src);
+		if ((ret = bkey_err(k)))
+			goto bkey_err;
+
+		if (!k.k || k.k->p.inode != inode->v.i_ino)
+			break;
+
+		BUG_ON(bkey_cmp(src->pos, bkey_start_pos(k.k)));
+
+		if (insert &&
+		    bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0)
+			break;
+reassemble:
+		bkey_reassemble(&copy.k, k);
+
+		if (insert &&
+		    bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) {
+			bch2_cut_front(move_pos, &copy.k);
+			bch2_btree_iter_set_pos(src, bkey_start_pos(&copy.k.k));
+		}
+
+		copy.k.k.p.offset += shift >> 9;
+		bch2_btree_iter_set_pos(dst, bkey_start_pos(&copy.k.k));
 
 		ret = bch2_btree_iter_traverse(dst);
 		if (ret)
 			goto bkey_err;
 
-		bch2_btree_iter_set_pos(src,
-			POS(dst->pos.inode, dst->pos.offset + (len >> 9)));
-
-		k = bch2_btree_iter_peek_slot(src);
-		if ((ret = bkey_err(k)))
-			goto bkey_err;
-
-		bkey_reassemble(&copy.k, k);
-
-		bch2_cut_front(src->pos, &copy.k);
-		copy.k.k.p.offset -= len >> 9;
-
-		ret = bch2_extent_trim_atomic(&copy.k, dst);
+		ret = bch2_extent_atomic_end(dst, &copy.k, &atomic_end);
 		if (ret)
 			goto bkey_err;
 
-		BUG_ON(bkey_cmp(dst->pos, bkey_start_pos(&copy.k.k)));
+		if (bkey_cmp(atomic_end, copy.k.k.p)) {
+			if (insert) {
+				move_pos = atomic_end;
+				move_pos.offset -= shift >> 9;
+				goto reassemble;
+			} else {
+				bch2_cut_back(atomic_end, &copy.k.k);
+			}
+		}
 
-		ret = bch2_disk_reservation_get(c, &disk_res, copy.k.k.size,
-				bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&copy.k)),
-				BCH_DISK_RESERVATION_NOFAIL);
-		BUG_ON(ret);
+		bkey_init(&delete.k);
+		delete.k.p = src->pos;
+		bch2_key_resize(&delete.k, copy.k.k.size);
 
-		bch2_trans_begin_updates(&trans);
+		next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
 
-		ret = bch2_extent_update(&trans, inode,
-				&disk_res, NULL,
-				dst, &copy.k,
-				0, true, true, NULL);
+		/*
+		 * If the new and old keys overlap (because we're moving an
+		 * extent that's bigger than the amount we're collapsing by),
+		 * we need to trim the delete key here so they don't overlap
+		 * because overlaps on insertions aren't handled before
+		 * triggers are run, so the overwrite will get double counted
+		 * by the triggers machinery:
+		 */
+		if (insert &&
+		    bkey_cmp(bkey_start_pos(&copy.k.k), delete.k.p) < 0) {
+			bch2_cut_back(bkey_start_pos(&copy.k.k), &delete.k);
+		} else if (!insert &&
+			   bkey_cmp(copy.k.k.p,
+				    bkey_start_pos(&delete.k)) > 0) {
+			bch2_cut_front(copy.k.k.p, &delete);
+
+			del = bch2_trans_copy_iter(&trans, src);
+			BUG_ON(IS_ERR_OR_NULL(del));
+
+			bch2_btree_iter_set_pos(del,
+					bkey_start_pos(&delete.k));
+		}
+
+		bch2_trans_update(&trans, BTREE_INSERT_ENTRY(dst, &copy.k));
+		bch2_trans_update(&trans,
+				  BTREE_INSERT_ENTRY(del ?: src, &delete));
+
+		if (copy.k.k.size == k.k->size) {
+			/*
+			 * If we're moving the entire extent, we can skip
+			 * running triggers:
+			 */
+			commit_flags |= BTREE_INSERT_NOMARK;
+		} else {
+			/* We might end up splitting compressed extents: */
+			unsigned nr_ptrs =
+				bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&copy.k));
+
+			ret = bch2_disk_reservation_get(c, &disk_res,
+					copy.k.k.size, nr_ptrs,
+					BCH_DISK_RESERVATION_NOFAIL);
+			BUG_ON(ret);
+		}
+
+		ret = bch2_trans_commit(&trans, &disk_res,
+					&inode->ei_journal_seq,
+					commit_flags);
 		bch2_disk_reservation_put(c, &disk_res);
 bkey_err:
+		if (del)
+			bch2_trans_iter_free(&trans, del);
+		del = NULL;
+
+		if (!ret)
+			bch2_btree_iter_set_pos(src, next_pos);
+
 		if (ret == -EINTR)
 			ret = 0;
 		if (ret)
 			goto err;
 
-		/*
-		 * XXX: if we error here we've left data with multiple
-		 * pointers... which isn't a _super_ serious problem...
-		 */
 		bch2_trans_cond_resched(&trans);
 	}
 	bch2_trans_unlock(&trans);
 
-	ret = __bch2_fpunch(c, inode,
-			round_up(new_size, block_bytes(c)) >> 9,
-			U64_MAX);
-	if (ret)
-		goto err;
-
-	i_size_write(&inode->v, new_size);
-	mutex_lock(&inode->ei_update_lock);
-	ret = bch2_write_inode_size(c, inode, new_size,
-				    ATTR_MTIME|ATTR_CTIME);
-	mutex_unlock(&inode->ei_update_lock);
+	if (!insert) {
+		i_size_write(&inode->v, new_size);
+		mutex_lock(&inode->ei_update_lock);
+		ret = bch2_write_inode_size(c, inode, new_size,
+					    ATTR_MTIME|ATTR_CTIME);
+		mutex_unlock(&inode->ei_update_lock);
+	}
 err:
 	bch2_trans_exit(&trans);
 	pagecache_block_put(&mapping->add_lock);
@@ -2889,8 +3025,11 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
 	if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
 		return bch2_fpunch(inode, offset, len);
 
+	if (mode == FALLOC_FL_INSERT_RANGE)
+		return bch2_fcollapse_finsert(inode, offset, len, true);
+
 	if (mode == FALLOC_FL_COLLAPSE_RANGE)
-		return bch2_fcollapse(inode, offset, len);
+		return bch2_fcollapse_finsert(inode, offset, len, false);
 
 	return -EOPNOTSUPP;
 }
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index e3738757..50a7d8c1 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -509,7 +509,7 @@ retry:
 	if (fsck_err_on(w.have_inode &&
 			!(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
 			k.k->type != KEY_TYPE_reservation &&
-			k.k->p.offset > round_up(w.inode.bi_size, PAGE_SIZE) >> 9, c,
+			k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c,
 			"extent type %u offset %llu past end of inode %llu, i_size %llu",
 			k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) {
 		bch2_trans_unlock(&trans);
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index e2ec5bea..ab8c2560 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -124,9 +124,8 @@ void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
 {
 	struct bvec_iter_all iter;
 	struct bio_vec *bv;
-	unsigned i;
 
-	bio_for_each_segment_all(bv, bio, i, iter)
+	bio_for_each_segment_all(bv, bio, iter)
 		if (bv->bv_page != ZERO_PAGE(0))
 			mempool_free(bv->bv_page, &c->bio_bounce_pages);
 	bio->bi_vcnt = 0;
@@ -1210,10 +1209,15 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
 	return rbio;
 }
 
+/*
+ * Only called on a top level bch_read_bio to complete an entire read request,
+ * not a split:
+ */
 static void bch2_rbio_done(struct bch_read_bio *rbio)
 {
-	bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
-			       rbio->start_time);
+	if (rbio->start_time)
+		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
+				       rbio->start_time);
 	bio_endio(&rbio->bio);
 }
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 9595ba79..26a2c4fb 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -304,11 +304,10 @@ static void move_free(struct closure *cl)
 	struct moving_context *ctxt = io->write.ctxt;
 	struct bvec_iter_all iter;
 	struct bio_vec *bv;
-	int i;
 
 	bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
 
-	bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i, iter)
+	bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter)
 		if (bv->bv_page)
 			__free_page(bv->bv_page);
@@ -438,7 +437,8 @@ static int bch2_move_extent(struct bch_fs *c,
 			 GFP_KERNEL))
 		goto err_free;
 
-	io->rbio.opts = io_opts;
+	io->rbio.c = c;
+	io->rbio.opts = io_opts;
 	bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
 	io->rbio.bio.bi_vcnt = pages;
 	bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
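With bch2_fallocate_dispatch() above now accepting FALLOC_FL_INSERT_RANGE as well as FALLOC_FL_COLLAPSE_RANGE, both range operations are reachable through the standard fallocate(2) interface, and both are served by the single bch2_fcollapse_finsert() helper (insert walks extents backwards and shifts them up by len; collapse walks forwards and shifts them down). A minimal userspace example; offsets and lengths must be block aligned, and the file name is made up:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* drop 64KiB at offset 1MiB, shifting everything after it down: */
	if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 1 << 20, 64 << 10))
		perror("collapse");

	/* open up a 64KiB gap at offset 1MiB, shifting everything after it up: */
	if (fallocate(fd, FALLOC_FL_INSERT_RANGE, 1 << 20, 64 << 10))
		perror("insert");

	close(fd);
	return 0;
}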
@@ -548,7 +548,7 @@ peek:
 		if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
 			break;
 
-		if (!bkey_extent_is_data(k.k))
+		if (!bkey_extent_is_direct_data(k.k))
 			goto next_nondata;
 
 		if (cur_inum != k.k->p.inode) {
diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c
index 4797d620..84b3fb6e 100644
--- a/libbcachefs/rebalance.c
+++ b/libbcachefs/rebalance.c
@@ -42,9 +42,6 @@ void bch2_rebalance_add_key(struct bch_fs *c,
 	const union bch_extent_entry *entry;
 	struct extent_ptr_decoded p;
 
-	if (!bkey_extent_is_data(k.k))
-		return;
-
 	if (!io_opts->background_target &&
 	    !io_opts->background_compression)
 		return;
@@ -72,30 +69,26 @@ static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
 				    struct bch_io_opts *io_opts,
 				    struct data_opts *data_opts)
 {
-	switch (k.k->type) {
-	case KEY_TYPE_extent: {
-		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-		const union bch_extent_entry *entry;
-		struct extent_ptr_decoded p;
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	unsigned nr_replicas = 0;
 
-		/* Make sure we have room to add a new pointer: */
-		if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
-		    BKEY_EXTENT_VAL_U64s_MAX)
-			return DATA_SKIP;
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		nr_replicas += !p.ptr.cached;
 
-		extent_for_each_ptr_decode(e, p, entry)
-			if (rebalance_ptr_pred(c, p, io_opts))
-				goto found;
+		if (rebalance_ptr_pred(c, p, io_opts))
+			goto found;
+	}
 
-		return DATA_SKIP;
+	if (nr_replicas < io_opts->data_replicas)
+		goto found;
+
+	return DATA_SKIP;
 found:
-		data_opts->target = io_opts->background_target;
-		data_opts->btree_insert_flags = 0;
-		return DATA_ADD_REPLICAS;
-	}
-	default:
-		return DATA_SKIP;
-	}
+	data_opts->target = io_opts->background_target;
+	data_opts->btree_insert_flags = 0;
+	return DATA_ADD_REPLICAS;
 }
 
 struct rebalance_work {
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index c9558ccb..98d9a143 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -281,8 +281,7 @@ retry:
 		if (ret)
 			goto err;
 
-		ret = bch2_extent_atomic_end(&trans, split_iter,
-					     k, &atomic_end);
+		ret = bch2_extent_atomic_end(split_iter, k, &atomic_end);
 		if (ret)
 			goto err;
 
@@ -936,7 +935,9 @@ out:
 	ret = 0;
 err:
 fsck_err:
+	set_bit(BCH_FS_FSCK_DONE, &c->flags);
 	bch2_flush_fsck_errs(c);
+	journal_keys_free(&journal_keys);
 	journal_entries_free(&journal_entries);
 	kfree(clean);
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
index d0602725..bb9da2bb 100644
--- a/libbcachefs/replicas.c
+++ b/libbcachefs/replicas.c
@@ -16,11 +16,16 @@ static inline int u8_cmp(u8 l, u8 r)
 	return cmp_int(l, r);
 }
 
-static void verify_replicas_entry_sorted(struct bch_replicas_entry *e)
+static void verify_replicas_entry(struct bch_replicas_entry *e)
 {
-#ifdef CONFIG_BCACHES_DEBUG
+#ifdef CONFIG_BCACHEFS_DEBUG
 	unsigned i;
 
+	BUG_ON(e->data_type >= BCH_DATA_NR);
+	BUG_ON(!e->nr_devs);
+	BUG_ON(e->nr_required > 1 &&
+	       e->nr_required >= e->nr_devs);
+
 	for (i = 0; i + 1 < e->nr_devs; i++)
 		BUG_ON(e->devs[i] >= e->devs[i + 1]);
 #endif
@@ -158,7 +163,7 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old,
 	};
 
 	BUG_ON(!new_entry->data_type);
-	verify_replicas_entry_sorted(new_entry);
+	verify_replicas_entry(new_entry);
 
 	new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO);
 	if (!new.entries)
@@ -185,7 +190,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
 	if (unlikely(entry_size > r->entry_size))
 		return -1;
 
-	verify_replicas_entry_sorted(search);
+	verify_replicas_entry(search);
 
 #define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size)
 	idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
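The rewritten rebalance_pred() above no longer special-cases KEY_TYPE_extent: it walks every pointer on the key, counts the non-cached ones, and queues the key for background work either when a pointer matches the background target/compression options or when the key has fewer replicas than io_opts->data_replicas asks for. A self-contained sketch of that decision, with a toy pointer array standing in for bkey_for_each_ptr_decode():

#include <stdbool.h>
#include <stdio.h>

struct toy_ptr {
	bool cached;		/* cached copies don't count as replicas */
	bool wants_rebalance;	/* stand-in for rebalance_ptr_pred() */
};

static bool needs_work(const struct toy_ptr *p, unsigned nr,
		       unsigned wanted_replicas)
{
	unsigned i, nr_replicas = 0;

	for (i = 0; i < nr; i++) {
		nr_replicas += !p[i].cached;
		if (p[i].wants_rebalance)
			return true;	/* some pointer wants moving/compressing */
	}

	/* under-replicated keys also get queued: */
	return nr_replicas < wanted_replicas;
}

int main(void)
{
	struct toy_ptr ptrs[] = {
		{ .cached = false, .wants_rebalance = false },
		{ .cached = true,  .wants_rebalance = false },
	};

	printf("%d\n", needs_work(ptrs, 2, 2));	/* 1: only one real replica */
	return 0;
}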
@@ -216,7 +221,7 @@ static bool bch2_replicas_marked_locked(struct bch_fs *c,
 	if (!search->nr_devs)
 		return true;
 
-	verify_replicas_entry_sorted(search);
+	verify_replicas_entry(search);
 
 	return __replicas_has_entry(&c->replicas, search) &&
 		(!check_gc_replicas ||
@@ -360,6 +365,8 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
 	struct bch_replicas_cpu new_r, new_gc;
 	int ret = -ENOMEM;
 
+	verify_replicas_entry(new_entry);
+
 	memset(&new_r, 0, sizeof(new_r));
 	memset(&new_gc, 0, sizeof(new_gc));
 
@@ -875,9 +882,8 @@ static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_fi
 			goto err;
 
 		err = "invalid replicas entry: bad nr_required";
-		if (!e->nr_required ||
-		    (e->nr_required > 1 &&
-		     e->nr_required >= e->nr_devs))
+		if (e->nr_required > 1 &&
+		    e->nr_required >= e->nr_devs)
 			goto err;
 
 		err = "invalid replicas entry: invalid device";
diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h
index 091bf7a8..ef30c73a 100644
--- a/libbcachefs/str_hash.h
+++ b/libbcachefs/str_hash.h
@@ -42,7 +42,6 @@ bch2_hash_info_init(struct bch_fs *c,
 		u8 digest[SHA256_DIGEST_SIZE];
 
 		desc->tfm = c->sha256;
-		desc->flags = 0;
 
 		crypto_shash_digest(desc, (void *) &bi->bi_hash_seed,
 				    sizeof(bi->bi_hash_seed), digest);
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index bd4b3188..4145832f 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -494,6 +494,7 @@ static void bch2_fs_free(struct bch_fs *c)
 	bch2_fs_ec_exit(c);
 	bch2_fs_encryption_exit(c);
 	bch2_fs_io_exit(c);
+	bch2_fs_btree_iter_exit(c);
 	bch2_fs_btree_cache_exit(c);
 	bch2_fs_journal_exit(&c->journal);
 	bch2_io_clock_exit(&c->io_clock[WRITE]);
@@ -505,7 +506,6 @@ static void bch2_fs_free(struct bch_fs *c)
 	free_percpu(c->usage[0]);
 	kfree(c->usage_base);
 	free_percpu(c->pcpu);
-	mempool_exit(&c->btree_iters_pool);
 	mempool_exit(&c->btree_bounce_pool);
 	bioset_exit(&c->btree_bio);
 	mempool_exit(&c->btree_interior_update_pool);
@@ -758,15 +758,12 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	    !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
 	    mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
 					btree_bytes(c)) ||
-	    mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
-			sizeof(struct btree_iter) * BTREE_ITER_MAX +
-			sizeof(struct btree_insert_entry) *
-			(BTREE_ITER_MAX + 4)) ||
 	    bch2_io_clock_init(&c->io_clock[READ]) ||
 	    bch2_io_clock_init(&c->io_clock[WRITE]) ||
 	    bch2_fs_journal_init(&c->journal) ||
 	    bch2_fs_replicas_init(c) ||
 	    bch2_fs_btree_cache_init(c) ||
+	    bch2_fs_btree_iter_init(c) ||
 	    bch2_fs_io_init(c) ||
 	    bch2_fs_encryption_init(c) ||
 	    bch2_fs_compress_init(c) ||
diff --git a/linux/bio.c b/linux/bio.c
index d9b860a0..797204f8 100644
--- a/linux/bio.c
+++ b/linux/bio.c
@@ -167,9 +167,8 @@ void bio_free_pages(struct bio *bio)
 {
 	struct bvec_iter_all iter;
 	struct bio_vec *bvec;
-	int i;
 
-	bio_for_each_segment_all(bvec, bio, i, iter)
+	bio_for_each_segment_all(bvec, bio, iter)
 		__free_page(bvec->bv_page);
 }
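verify_replicas_entry() above now asserts more than sort order: a known data_type, at least one device, a sane nr_required, and strictly ascending device indices; the superblock validation drops its separate !nr_required check to match. A small standalone validator expressing the same constraints, with a simplified stand-in for struct bch_replicas_entry (illustrative only, not the real layout):

#include <stdbool.h>
#include <stdio.h>

#define TOY_DATA_NR	6	/* stand-in for BCH_DATA_NR */

struct toy_replicas_entry {
	unsigned	data_type;
	unsigned	nr_devs;
	unsigned	nr_required;
	unsigned char	devs[8];
};

static bool entry_valid(const struct toy_replicas_entry *e)
{
	unsigned i;

	if (e->data_type >= TOY_DATA_NR)
		return false;
	if (!e->nr_devs)
		return false;
	if (e->nr_required > 1 && e->nr_required >= e->nr_devs)
		return false;

	/* device list must be strictly ascending (sorted, no duplicates): */
	for (i = 0; i + 1 < e->nr_devs; i++)
		if (e->devs[i] >= e->devs[i + 1])
			return false;

	return true;
}

int main(void)
{
	struct toy_replicas_entry e = {
		.data_type = 1, .nr_devs = 2, .nr_required = 1,
		.devs = { 0, 3 },
	};

	printf("%d\n", entry_valid(&e));	/* 1 */
	return 0;
}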