From 8c20176f2ce40fc8b0151e5a7d17561dd0eda0b5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Nov 2019 20:36:59 -0500 Subject: [PATCH] Update bcachefs sources to d372ddcbfa bcachefs: Reorganize extents.c --- .bcachefs_revision | 2 +- libbcachefs/bcachefs.h | 2 + libbcachefs/bcachefs_format.h | 11 +- libbcachefs/bkey.h | 11 + libbcachefs/bkey_methods.c | 22 +- libbcachefs/bkey_on_stack.h | 35 + libbcachefs/bkey_sort.c | 25 +- libbcachefs/bset.c | 40 +- libbcachefs/bset.h | 17 +- libbcachefs/btree_gc.c | 2 +- libbcachefs/btree_io.c | 53 +- libbcachefs/btree_update_interior.c | 34 +- libbcachefs/btree_update_leaf.c | 4 +- libbcachefs/ec.c | 27 +- libbcachefs/extent_update.c | 532 +++++++ libbcachefs/extent_update.h | 18 + libbcachefs/extents.c | 2003 ++++++++++----------------- libbcachefs/extents.h | 370 +++-- libbcachefs/fs-io.c | 122 +- libbcachefs/fs.c | 29 +- libbcachefs/io.c | 168 ++- libbcachefs/io.h | 5 +- libbcachefs/journal.c | 2 +- libbcachefs/journal_io.c | 2 +- libbcachefs/migrate.c | 16 +- libbcachefs/move.c | 27 +- libbcachefs/recovery.c | 16 +- libbcachefs/reflink.c | 19 +- libbcachefs/super.c | 2 + libbcachefs/util.c | 2 +- libbcachefs/util.h | 2 +- 31 files changed, 1934 insertions(+), 1686 deletions(-) create mode 100644 libbcachefs/bkey_on_stack.h create mode 100644 libbcachefs/extent_update.c create mode 100644 libbcachefs/extent_update.h diff --git a/.bcachefs_revision b/.bcachefs_revision index e0172a41..9543a550 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -b1a4dc53be10a4c3132fccaaf604d73861a52d2d +d372ddcbfabef5fcfd29bad150865cccc3faf172 diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 323b663d..a6b9b0e6 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -725,6 +725,8 @@ struct bch_fs { atomic64_t key_version; + mempool_t large_bkey_pool; + /* REBALANCE */ struct bch_fs_rebalance rebalance; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index d619e5ca..3d85012a 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -338,7 +338,8 @@ static inline void bkey_init(struct bkey *k) x(quota, 13) \ x(stripe, 14) \ x(reflink_p, 15) \ - x(reflink_v, 16) + x(reflink_v, 16) \ + x(inline_data, 17) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -911,6 +912,13 @@ struct bch_reflink_v { __u64 _data[0]; }; +/* Inline data */ + +struct bch_inline_data { + struct bch_val v; + u8 data[0]; +}; + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -1315,6 +1323,7 @@ enum bch_sb_features { BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5, BCH_FEATURE_REFLINK = 6, BCH_FEATURE_NEW_SIPHASH = 7, + BCH_FEATURE_INLINE_DATA = 8, BCH_FEATURE_NR, }; diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index b26f4934..f2d5f300 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -33,6 +33,16 @@ struct bkey_s { #define bkey_next(_k) vstruct_next(_k) +static inline struct bkey_packed *bkey_next_skip_noops(struct bkey_packed *k, + struct bkey_packed *end) +{ + k = bkey_next(k); + + while (k != end && !k->u64s) + k = (void *) ((u64 *) k + 1); + return k; +} + #define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) static inline size_t bkey_val_bytes(const struct bkey *k) @@ -554,6 +564,7 @@ BKEY_VAL_ACCESSORS(quota); BKEY_VAL_ACCESSORS(stripe); BKEY_VAL_ACCESSORS(reflink_p); BKEY_VAL_ACCESSORS(reflink_v); +BKEY_VAL_ACCESSORS(inline_data); /* byte order helpers */ diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index 
f01405dd..5312184c 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -63,6 +63,23 @@ static const char *key_type_cookie_invalid(const struct bch_fs *c, .key_invalid = empty_val_key_invalid, \ } +static const char *key_type_inline_data_invalid(const struct bch_fs *c, + struct bkey_s_c k) +{ + return NULL; +} + +static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + pr_buf(out, "(%zu bytes)", bkey_val_bytes(k.k)); +} + +static const struct bkey_ops bch2_bkey_ops_inline_data = { + .key_invalid = key_type_inline_data_invalid, + .val_to_text = key_type_inline_data_to_text, +}; + static const struct bkey_ops bch2_bkey_ops[] = { #define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, BCH_BKEY_TYPES() @@ -83,9 +100,8 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, if (k.k->u64s < BKEY_U64s) return "u64s too small"; - if ((btree_node_type_is_extents(type) || - type == BKEY_TYPE_BTREE) && - bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX) + if (type == BKEY_TYPE_BTREE && + bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) return "value too big"; if (btree_node_type_is_extents(type)) { diff --git a/libbcachefs/bkey_on_stack.h b/libbcachefs/bkey_on_stack.h new file mode 100644 index 00000000..d4739038 --- /dev/null +++ b/libbcachefs/bkey_on_stack.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_ON_STACK_H +#define _BCACHEFS_BKEY_ON_STACK_H + +#include "bcachefs.h" + +struct bkey_on_stack { + struct bkey_i *k; + u64 onstack[12]; +}; + +static inline void bkey_on_stack_realloc(struct bkey_on_stack *s, + struct bch_fs *c, unsigned u64s) +{ + if (s->k == (void *) s->onstack && + u64s > ARRAY_SIZE(s->onstack)) { + s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); + memcpy(s->k, s->onstack, sizeof(s->onstack)); + } +} + +static inline void bkey_on_stack_init(struct bkey_on_stack *s) +{ + s->k = (void *) s->onstack; +} + +static inline void bkey_on_stack_exit(struct bkey_on_stack *s, + struct bch_fs *c) +{ + if (s->k != (void *) s->onstack) + mempool_free(s->k, &c->large_bkey_pool); + s->k = NULL; +} + +#endif /* _BCACHEFS_BKEY_ON_STACK_H */ diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c index 2cac269b..daef8e5c 100644 --- a/libbcachefs/bkey_sort.c +++ b/libbcachefs/bkey_sort.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_on_stack.h" #include "bkey_sort.h" #include "bset.h" #include "extents.h" @@ -74,6 +75,10 @@ static void sort_key_next(struct btree_node_iter_large *iter, { i->k += __btree_node_offset_to_key(b, i->k)->u64s; + while (i->k != i->end && + !__btree_node_offset_to_key(b, i->k)->u64s) + i->k++; + if (i->k == i->end) *i = iter->data[--iter->used]; } @@ -118,7 +123,7 @@ static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) { - iter->data->k = bkey_next(iter->data->k); + iter->data->k = bkey_next_skip_noops(iter->data->k, iter->data->end); BUG_ON(iter->data->k > iter->data->end); @@ -292,8 +297,10 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bkey l_unpacked, r_unpacked; struct bkey_s l, r; struct btree_nr_keys nr; + struct bkey_on_stack split; memset(&nr, 0, sizeof(nr)); + bkey_on_stack_init(&split); heap_resort(iter, extent_sort_cmp, NULL); @@ -343,29 +350,29 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, if (bkey_cmp(l.k->p, r.k->p) >= 
0) { sort_key_next(iter, b, _r); } else { - __bch2_cut_front(l.k->p, r); + bch2_cut_front_s(l.k->p, r); extent_save(b, rk, r.k); } extent_sort_sift(iter, b, _r - iter->data); } else if (bkey_cmp(l.k->p, r.k->p) > 0) { - BKEY_PADDED(k) tmp; + bkey_on_stack_realloc(&split, c, l.k->u64s); /* * r wins, but it overlaps in the middle of l - split l: */ - bkey_reassemble(&tmp.k, l.s_c); - bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k); + bkey_reassemble(split.k, l.s_c); + bch2_cut_back(bkey_start_pos(r.k), split.k); - __bch2_cut_front(r.k->p, l); + bch2_cut_front_s(r.k->p, l); extent_save(b, lk, l.k); extent_sort_sift(iter, b, 0); extent_sort_append(c, f, &nr, dst->start, - &prev, bkey_i_to_s(&tmp.k)); + &prev, bkey_i_to_s(split.k)); } else { - bch2_cut_back(bkey_start_pos(r.k), l.k); + bch2_cut_back_s(bkey_start_pos(r.k), l); extent_save(b, lk, l.k); } } @@ -373,6 +380,8 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, extent_sort_advance_prev(f, &nr, dst->start, &prev); dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); + + bkey_on_stack_exit(&split, c); return nr; } diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index b7618e2b..a0f0b0ea 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -76,7 +76,7 @@ void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set) for (_k = i->start, k = bkey_unpack_key(b, _k); _k < vstruct_last(i); _k = _n, k = n) { - _n = bkey_next(_k); + _n = bkey_next_skip_noops(_k, vstruct_last(i)); bch2_bkey_to_text(&PBUF(buf), &k); printk(KERN_ERR "block %u key %5u: %s\n", set, @@ -144,9 +144,7 @@ void __bch2_verify_btree_nr_keys(struct btree *b) struct btree_nr_keys nr = { 0 }; for_each_bset(b, t) - for (k = btree_bkey_first(b, t); - k != btree_bkey_last(b, t); - k = bkey_next(k)) + bset_tree_for_each_key(b, t, k) if (!bkey_whiteout(k)) btree_keys_account_key_add(&nr, t - b->set, k); @@ -612,7 +610,7 @@ start: rw_aux_tree(b, t)[j - 1].offset); } - k = bkey_next(k); + k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); BUG_ON(k >= btree_bkey_last(b, t)); } } @@ -803,9 +801,7 @@ static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) rw_aux_tree(b, t)[0].offset = __btree_node_key_to_offset(b, btree_bkey_first(b, t)); - for (k = btree_bkey_first(b, t); - k != btree_bkey_last(b, t); - k = bkey_next(k)) { + bset_tree_for_each_key(b, t, k) { if (t->size == bset_rw_tree_capacity(b, t)) break; @@ -838,7 +834,7 @@ retry: /* First we figure out where the first key in each cacheline is */ eytzinger1_for_each(j, t->size) { while (bkey_to_cacheline(b, t, k) < cacheline) - prev = k, k = bkey_next(k); + prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); if (k >= btree_bkey_last(b, t)) { /* XXX: this path sucks */ @@ -854,10 +850,10 @@ retry: EBUG_ON(tree_to_bkey(b, t, j) != k); } - while (bkey_next(k) != btree_bkey_last(b, t)) - k = bkey_next(k); + while (k != btree_bkey_last(b, t)) + prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); - t->max_key = bkey_unpack_pos(b, k); + t->max_key = bkey_unpack_pos(b, prev); /* Then we build the tree */ eytzinger1_for_each(j, t->size) @@ -983,7 +979,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; while ((p = __bkey_prev(b, t, k)) && !ret) { - for (i = p; i != k; i = bkey_next(i)) + for (i = p; i != k; i = bkey_next_skip_noops(i, k)) if (i->type >= min_key_type) ret = i; @@ -993,9 +989,11 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, if (btree_keys_expensive_checks(b)) { BUG_ON(ret >= 
orig_k); - for (i = ret ? bkey_next(ret) : btree_bkey_first(b, t); + for (i = ret + ? bkey_next_skip_noops(ret, orig_k) + : btree_bkey_first(b, t); i != orig_k; - i = bkey_next(i)) + i = bkey_next_skip_noops(i, orig_k)) BUG_ON(i->type >= min_key_type); } @@ -1030,7 +1028,7 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b, /* signal to make_bfloat() that they're uninitialized: */ min_key.u64s = max_key.u64s = 0; - if (bkey_next(k) == btree_bkey_last(b, t)) { + if (bkey_next_skip_noops(k, btree_bkey_last(b, t)) == btree_bkey_last(b, t)) { t->max_key = bkey_unpack_pos(b, k); for (j = 1; j < t->size; j = j * 2 + 1) @@ -1154,7 +1152,7 @@ static void bch2_bset_fix_lookup_table(struct btree *b, struct bkey_packed *k = start; while (1) { - k = bkey_next(k); + k = bkey_next_skip_noops(k, end); if (k == end) break; @@ -1403,12 +1401,12 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, while (m != btree_bkey_last(b, t) && bkey_iter_cmp_p_or_unp(b, search, lossy_packed_search, m) > 0) - m = bkey_next(m); + m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); if (!packed_search) while (m != btree_bkey_last(b, t) && bkey_iter_pos_cmp(b, search, m) > 0) - m = bkey_next(m); + m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); if (btree_keys_expensive_checks(b)) { struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); @@ -1642,6 +1640,10 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, EBUG_ON(iter->data->k > iter->data->end); + while (!__btree_node_iter_set_end(iter, 0) && + !__bch2_btree_node_iter_peek_all(iter, b)->u64s) + iter->data->k++; + if (unlikely(__btree_node_iter_set_end(iter, 0))) { bch2_btree_node_iter_set_drop(iter, iter->data); return; diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h index ccc0866d..2653a74b 100644 --- a/libbcachefs/bset.h +++ b/libbcachefs/bset.h @@ -284,9 +284,14 @@ static inline struct bkey_s __bkey_disassemble(struct btree *b, return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; } -#define for_each_bset(_b, _t) \ +#define for_each_bset(_b, _t) \ for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) +#define bset_tree_for_each_key(_b, _t, _k) \ + for (_k = btree_bkey_first(_b, _t); \ + _k != btree_bkey_last(_b, _t); \ + _k = bkey_next_skip_noops(_k, btree_bkey_last(_b, _t))) + static inline bool bset_has_ro_aux_tree(struct bset_tree *t) { return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; @@ -564,6 +569,16 @@ static inline void btree_keys_account_key(struct btree_nr_keys *n, n->unpacked_keys += sign; } +static inline void btree_keys_account_val_delta(struct btree *b, + struct bkey_packed *k, + int delta) +{ + struct bset_tree *t = bch2_bkey_to_bset(b, k); + + b->nr.live_u64s += delta; + b->nr.bset_u64s[t - b->set] += delta; +} + #define btree_keys_account_key_add(_nr, _bset_idx, _k) \ btree_keys_account_key(_nr, _bset_idx, _k, 1) #define btree_keys_account_key_drop(_nr, _bset_idx, _k) \ diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index c4c2e1a3..8bbf60b0 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -922,7 +922,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, k < vstruct_last(s2) && vstruct_blocks_plus(n1->data, c->block_bits, u64s + k->u64s) <= blocks; - k = bkey_next(k)) { + k = bkey_next_skip_noops(k, vstruct_last(s2))) { last = k; u64s += k->u64s; } diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 591980d2..c345262d 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -26,34 +26,33 @@ 
static void verify_no_dups(struct btree *b, struct bkey_packed *end) { #ifdef CONFIG_BCACHEFS_DEBUG - struct bkey_packed *k; + struct bkey_packed *k, *p; - for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) { - struct bkey l = bkey_unpack_key(b, k); - struct bkey r = bkey_unpack_key(b, bkey_next(k)); + if (start == end) + return; + + for (p = start, k = bkey_next_skip_noops(start, end); + k != end; + p = k, k = bkey_next_skip_noops(k, end)) { + struct bkey l = bkey_unpack_key(b, p); + struct bkey r = bkey_unpack_key(b, k); BUG_ON(btree_node_is_extents(b) ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0 : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); - //BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0); + //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0); } #endif } -static void clear_needs_whiteout(struct bset *i) +static void set_needs_whiteout(struct bset *i, int v) { struct bkey_packed *k; - for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) - k->needs_whiteout = false; -} - -static void set_needs_whiteout(struct bset *i) -{ - struct bkey_packed *k; - - for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) - k->needs_whiteout = true; + for (k = i->start; + k != vstruct_last(i); + k = bkey_next_skip_noops(k, vstruct_last(i))) + k->needs_whiteout = v; } static void btree_bounce_free(struct bch_fs *c, unsigned order, @@ -168,7 +167,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, out = i->start; for (k = start; k != end; k = n) { - n = bkey_next(k); + n = bkey_next_skip_noops(k, end); if (bkey_deleted(k) && btree_node_is_extents(b)) continue; @@ -261,7 +260,7 @@ static bool bch2_drop_whiteouts(struct btree *b) out = i->start; for (k = start; k != end; k = n) { - n = bkey_next(k); + n = bkey_next_skip_noops(k, end); if (!bkey_whiteout(k)) { bkey_copy(out, k); @@ -680,14 +679,6 @@ static int validate_bset(struct bch_fs *c, struct btree *b, struct bkey tmp; const char *invalid; - if (btree_err_on(!k->u64s, - BTREE_ERR_FIXABLE, c, b, i, - "KEY_U64s 0: %zu bytes of metadata lost", - vstruct_end(i) - (void *) k)) { - i->u64s = cpu_to_le16((u64 *) k - i->_data); - break; - } - if (btree_err_on(bkey_next(k) > vstruct_last(i), BTREE_ERR_FIXABLE, c, b, i, "key extends past end of bset")) { @@ -756,7 +747,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, prev_pos = u.k->p; prev = k; - k = bkey_next(k); + k = bkey_next_skip_noops(k, vstruct_last(i)); } SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); @@ -915,12 +906,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry continue; } - k = bkey_next(k); + k = bkey_next_skip_noops(k, vstruct_last(i)); } bch2_bset_build_aux_tree(b, b->set, false); - set_needs_whiteout(btree_bset_first(b)); + set_needs_whiteout(btree_bset_first(b), true); btree_node_reset_sib_u64s(b); out: @@ -1425,7 +1416,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, : bch2_sort_keys(i->start, &sort_iter, false); le16_add_cpu(&i->u64s, u64s); - clear_needs_whiteout(i); + set_needs_whiteout(i, false); /* do we have data to write? 
*/ if (b->written && !i->u64s) @@ -1579,7 +1570,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) } for_each_bset(b, t) - set_needs_whiteout(bset(b, t)); + set_needs_whiteout(bset(b, t), true); bch2_btree_verify(c, b); diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 9d5687ec..f8a30cb3 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -79,9 +79,7 @@ void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) bch2_bkey_format_add_pos(s, b->data->min_key); for_each_bset(b, t) - for (k = btree_bkey_first(b, t); - k != btree_bkey_last(b, t); - k = bkey_next(k)) + bset_tree_for_each_key(b, t, k) if (!bkey_whiteout(k)) { uk = bkey_unpack_key(b, k); bch2_bkey_format_add_key(s, &uk); @@ -1240,7 +1238,9 @@ static struct btree *__btree_split_node(struct btree_update *as, */ k = set1->start; while (1) { - if (bkey_next(k) == vstruct_last(set1)) + struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1)); + + if (n == vstruct_last(set1)) break; if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5) break; @@ -1251,7 +1251,7 @@ static struct btree *__btree_split_node(struct btree_update *as, nr_unpacked++; prev = k; - k = bkey_next(k); + k = n; } BUG_ON(!prev); @@ -1315,7 +1315,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, { struct btree_node_iter node_iter; struct bkey_i *k = bch2_keylist_front(keys); - struct bkey_packed *p; + struct bkey_packed *src, *dst, *n; struct bset *i; BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); @@ -1340,16 +1340,18 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, * for the pivot: */ i = btree_bset_first(b); - p = i->start; - while (p != vstruct_last(i)) - if (bkey_deleted(p)) { - le16_add_cpu(&i->u64s, -p->u64s); - set_btree_bset_end(b, b->set); - memmove_u64s_down(p, bkey_next(p), - (u64 *) vstruct_last(i) - - (u64 *) p); - } else - p = bkey_next(p); + src = dst = i->start; + while (src != vstruct_last(i)) { + n = bkey_next_skip_noops(src, vstruct_last(i)); + if (!bkey_deleted(src)) { + memmove_u64s_down(dst, src, src->u64s); + dst = bkey_next(dst); + } + src = n; + } + + i->u64s = cpu_to_le16((u64 *) dst - i->_data); + set_btree_bset_end(b, b->set); BUG_ON(b->nsets != 1 || b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s)); diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 051368cd..54893b7b 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -10,7 +10,7 @@ #include "buckets.h" #include "debug.h" #include "error.h" -#include "extents.h" +#include "extent_update.h" #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" @@ -886,7 +886,7 @@ retry: /* create the biggest key we can */ bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end, &delete.k); + bch2_cut_back(end, &delete); ret = bch2_extent_trim_atomic(&delete, iter); if (ret) diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index ad92d3b4..e0ca0c5d 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -4,6 +4,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "bkey_on_stack.h" #include "bset.h" #include "btree_gc.h" #include "btree_update.h" @@ -776,10 +777,10 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, struct btree_iter *iter; struct bkey_s_c k; struct bkey_s_extent e; - struct bch_extent_ptr *ptr; - BKEY_PADDED(k) tmp; + struct bkey_on_stack sk; int ret = 0, dev, idx; + 
bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, @@ -789,6 +790,8 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { + struct bch_extent_ptr *ptr, *ec_ptr = NULL; + if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { bch2_btree_iter_next(iter); continue; @@ -804,19 +807,20 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, dev = s->key.v.ptrs[idx].dev; - bkey_reassemble(&tmp.k, k); - e = bkey_i_to_s_extent(&tmp.k); + bkey_on_stack_realloc(&sk, c, k.k->u64s); + bkey_reassemble(sk.k, k); + e = bkey_i_to_s_extent(sk.k); - extent_for_each_ptr(e, ptr) - if (ptr->dev != dev) + extent_for_each_ptr(e, ptr) { + if (ptr->dev == dev) + ec_ptr = ptr; + else ptr->cached = true; + } - ptr = (void *) bch2_extent_has_device(e.c, dev); - BUG_ON(!ptr); + extent_stripe_ptr_add(e, s, ec_ptr, idx); - extent_stripe_ptr_add(e, s, ptr, idx); - - bch2_trans_update(&trans, iter, &tmp.k); + bch2_trans_update(&trans, iter, sk.k); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| @@ -829,6 +833,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, } bch2_trans_exit(&trans); + bkey_on_stack_exit(&sk, c); return ret; } diff --git a/libbcachefs/extent_update.c b/libbcachefs/extent_update.c new file mode 100644 index 00000000..91ceb5d5 --- /dev/null +++ b/libbcachefs/extent_update.c @@ -0,0 +1,532 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bkey_on_stack.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "buckets.h" +#include "debug.h" +#include "extents.h" +#include "extent_update.h" + +/* + * This counts the number of iterators to the alloc & ec btrees we'll need + * inserting/removing this extent: + */ +static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + unsigned ret = 0; + + bkey_extent_entry_for_each(ptrs, entry) { + switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + case BCH_EXTENT_ENTRY_stripe_ptr: + ret++; + } + } + + return ret; +} + +static int count_iters_for_insert(struct btree_trans *trans, + struct bkey_s_c k, + unsigned offset, + struct bpos *end, + unsigned *nr_iters, + unsigned max_iters, + bool overwrite) +{ + int ret = 0; + + switch (k.k->type) { + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: + *nr_iters += bch2_bkey_nr_alloc_ptrs(k); + + if (*nr_iters >= max_iters) { + *end = bpos_min(*end, k.k->p); + ret = 1; + } + + break; + case KEY_TYPE_reflink_p: { + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + u64 idx = le64_to_cpu(p.v->idx); + unsigned sectors = bpos_min(*end, p.k->p).offset - + bkey_start_offset(p.k); + struct btree_iter *iter; + struct bkey_s_c r_k; + + for_each_btree_key(trans, iter, + BTREE_ID_REFLINK, POS(0, idx + offset), + BTREE_ITER_SLOTS, r_k, ret) { + if (bkey_cmp(bkey_start_pos(r_k.k), + POS(0, idx + sectors)) >= 0) + break; + + *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); + + if (*nr_iters >= max_iters) { + struct bpos pos = bkey_start_pos(k.k); + pos.offset += r_k.k->p.offset - idx; + + *end = bpos_min(*end, pos); + ret = 1; + break; + } + } + + bch2_trans_iter_put(trans, iter); + break; + } + } + + return ret; +} + +#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) + +int bch2_extent_atomic_end(struct btree_iter *iter, + struct bkey_i *insert, + struct bpos *end) +{ + struct btree_trans 
*trans = iter->trans; + struct btree *b; + struct btree_node_iter node_iter; + struct bkey_packed *_k; + unsigned nr_iters = 0; + int ret; + + ret = bch2_btree_iter_traverse(iter); + if (ret) + return ret; + + b = iter->l[0].b; + node_iter = iter->l[0].iter; + + BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0); + + *end = bpos_min(insert->k.p, b->key.k.p); + + ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, + &nr_iters, EXTENT_ITERS_MAX / 2, false); + if (ret < 0) + return ret; + + while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, + KEY_TYPE_discard))) { + struct bkey unpacked; + struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); + unsigned offset = 0; + + if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) + break; + + if (bkey_cmp(bkey_start_pos(&insert->k), + bkey_start_pos(k.k)) > 0) + offset = bkey_start_offset(&insert->k) - + bkey_start_offset(k.k); + + ret = count_iters_for_insert(trans, k, offset, end, + &nr_iters, EXTENT_ITERS_MAX, true); + if (ret) + break; + + bch2_btree_node_iter_advance(&node_iter, b); + } + + return ret < 0 ? ret : 0; +} + +int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) +{ + struct bpos end; + int ret; + + ret = bch2_extent_atomic_end(iter, k, &end); + if (ret) + return ret; + + bch2_cut_back(end, k); + return 0; +} + +int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) +{ + struct bpos end; + int ret; + + ret = bch2_extent_atomic_end(iter, k, &end); + if (ret) + return ret; + + return !bkey_cmp(end, k->k.p); +} + +enum btree_insert_ret +bch2_extent_can_insert(struct btree_trans *trans, + struct btree_insert_entry *insert, + unsigned *u64s) +{ + struct btree_iter_level *l = &insert->iter->l[0]; + struct btree_node_iter node_iter = l->iter; + enum bch_extent_overlap overlap; + struct bkey_packed *_k; + struct bkey unpacked; + struct bkey_s_c k; + int sectors; + + /* + * We avoid creating whiteouts whenever possible when deleting, but + * those optimizations mean we may potentially insert two whiteouts + * instead of one (when we overlap with the front of one extent and the + * back of another): + */ + if (bkey_whiteout(&insert->k->k)) + *u64s += BKEY_U64s; + + _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, + KEY_TYPE_discard); + if (!_k) + return BTREE_INSERT_OK; + + k = bkey_disassemble(l->b, _k, &unpacked); + + overlap = bch2_extent_overlap(&insert->k->k, k.k); + + /* account for having to split existing extent: */ + if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) + *u64s += _k->u64s; + + if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && + (sectors = bch2_bkey_sectors_compressed(k))) { + int flags = trans->flags & BTREE_INSERT_NOFAIL + ? 
BCH_DISK_RESERVATION_NOFAIL : 0; + + switch (bch2_disk_reservation_add(trans->c, + trans->disk_res, + sectors, flags)) { + case 0: + break; + case -ENOSPC: + return BTREE_INSERT_ENOSPC; + default: + BUG(); + } + } + + return BTREE_INSERT_OK; +} + +static void verify_extent_nonoverlapping(struct bch_fs *c, + struct btree *b, + struct btree_node_iter *_iter, + struct bkey_i *insert) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct btree_node_iter iter; + struct bkey_packed *k; + struct bkey uk; + + if (!expensive_debug_checks(c)) + return; + + iter = *_iter; + k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); + BUG_ON(k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); + + iter = *_iter; + k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); +#if 0 + BUG_ON(k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); +#else + if (k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { + char buf1[100]; + char buf2[100]; + + bch2_bkey_to_text(&PBUF(buf1), &insert->k); + bch2_bkey_to_text(&PBUF(buf2), &uk); + + bch2_dump_btree_node(b); + panic("insert > next :\n" + "insert %s\n" + "next %s\n", + buf1, buf2); + } +#endif + +#endif +} + +static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, + struct bkey_i *insert) +{ + struct btree_iter_level *l = &iter->l[0]; + struct bkey_packed *k = + bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); + + BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); + + EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); + verify_extent_nonoverlapping(c, l->b, &l->iter, insert); + + if (debug_check_bkeys(c)) + bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert)); + + bch2_bset_insert(l->b, &l->iter, k, insert, 0); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); +} + +static void +extent_squash(struct bch_fs *c, struct btree_iter *iter, + struct bkey_i *insert, + struct bkey_packed *_k, struct bkey_s k, + enum bch_extent_overlap overlap) +{ + struct btree_iter_level *l = &iter->l[0]; + int u64s_delta; + + switch (overlap) { + case BCH_EXTENT_OVERLAP_FRONT: + /* insert overlaps with start of k: */ + u64s_delta = bch2_cut_front_s(insert->k.p, k); + btree_keys_account_val_delta(l->b, _k, u64s_delta); + + EBUG_ON(bkey_deleted(k.k)); + extent_save(l->b, _k, k.k); + bch2_btree_iter_fix_key_modified(iter, l->b, _k); + break; + + case BCH_EXTENT_OVERLAP_BACK: + /* insert overlaps with end of k: */ + u64s_delta = bch2_cut_back_s(bkey_start_pos(&insert->k), k); + btree_keys_account_val_delta(l->b, _k, u64s_delta); + + EBUG_ON(bkey_deleted(k.k)); + extent_save(l->b, _k, k.k); + + /* + * As the auxiliary tree is indexed by the end of the + * key and we've just changed the end, update the + * auxiliary tree. 
+ */ + bch2_bset_fix_invalidated_key(l->b, _k); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, _k->u64s, _k->u64s); + break; + + case BCH_EXTENT_OVERLAP_ALL: { + /* The insert key completely covers k, invalidate k */ + if (!bkey_whiteout(k.k)) + btree_account_key_drop(l->b, _k); + + k.k->size = 0; + k.k->type = KEY_TYPE_deleted; + + if (_k >= btree_bset_last(l->b)->start) { + unsigned u64s = _k->u64s; + + bch2_bset_delete(l->b, _k, _k->u64s); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, u64s, 0); + } else { + extent_save(l->b, _k, k.k); + bch2_btree_iter_fix_key_modified(iter, l->b, _k); + } + + break; + } + case BCH_EXTENT_OVERLAP_MIDDLE: { + struct bkey_on_stack split; + + bkey_on_stack_init(&split); + bkey_on_stack_realloc(&split, c, k.k->u64s); + + /* + * The insert key falls 'in the middle' of k + * The insert key splits k in 3: + * - start only in k, preserve + * - middle common section, invalidate in k + * - end only in k, preserve + * + * We update the old key to preserve the start, + * insert will be the new common section, + * we manually insert the end that we are preserving. + * + * modify k _before_ doing the insert (which will move + * what k points to) + */ + bkey_reassemble(split.k, k.s_c); + split.k->k.needs_whiteout |= bkey_written(l->b, _k); + + bch2_cut_back(bkey_start_pos(&insert->k), split.k); + BUG_ON(bkey_deleted(&split.k->k)); + + u64s_delta = bch2_cut_front_s(insert->k.p, k); + btree_keys_account_val_delta(l->b, _k, u64s_delta); + + BUG_ON(bkey_deleted(k.k)); + extent_save(l->b, _k, k.k); + bch2_btree_iter_fix_key_modified(iter, l->b, _k); + + extent_bset_insert(c, iter, split.k); + bkey_on_stack_exit(&split, c); + break; + } + } +} + +/** + * bch_extent_insert_fixup - insert a new extent and deal with overlaps + * + * this may result in not actually doing the insert, or inserting some subset + * of the insert key. For cmpxchg operations this is where that logic lives. + * + * All subsets of @insert that need to be inserted are inserted using + * bch2_btree_insert_and_journal(). If @b or @res fills up, this function + * returns false, setting @iter->pos for the prefix of @insert that actually got + * inserted. + * + * BSET INVARIANTS: this function is responsible for maintaining all the + * invariants for bsets of extents in memory. things get really hairy with 0 + * size extents + * + * within one bset: + * + * bkey_start_pos(bkey_next(k)) >= k + * or bkey_start_offset(bkey_next(k)) >= k->offset + * + * i.e. strict ordering, no overlapping extents. + * + * multiple bsets (i.e. full btree node): + * + * ∀ k, j + * k.size != 0 ∧ j.size != 0 → + * ¬ (k > bkey_start_pos(j) ∧ k < j) + * + * i.e. no two overlapping keys _of nonzero size_ + * + * We can't realistically maintain this invariant for zero size keys because of + * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j + * there may be another 0 size key between them in another bset, and it will + * thus overlap with the merged key. + * + * In addition, the end of iter->pos indicates how much has been processed. + * If the end of iter->pos is not the same as the end of insert, then + * key insertion needs to continue/be retried. 
+ */ +void bch2_insert_fixup_extent(struct btree_trans *trans, + struct btree_insert_entry *insert_entry) +{ + struct bch_fs *c = trans->c; + struct btree_iter *iter = insert_entry->iter; + struct bkey_i *insert = insert_entry->k; + struct btree_iter_level *l = &iter->l[0]; + struct btree_node_iter node_iter = l->iter; + bool deleting = bkey_whiteout(&insert->k); + bool update_journal = !deleting; + bool update_btree = !deleting; + struct bkey_i whiteout = *insert; + struct bkey_packed *_k; + struct bkey unpacked; + + EBUG_ON(iter->level); + EBUG_ON(!insert->k.size); + EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); + + while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, + KEY_TYPE_discard))) { + struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); + struct bpos cur_end = bpos_min(insert->k.p, k.k->p); + enum bch_extent_overlap overlap = + bch2_extent_overlap(&insert->k, k.k); + + if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) + break; + + if (!bkey_whiteout(k.k)) + update_journal = true; + + if (!update_journal) { + bch2_cut_front(cur_end, insert); + bch2_cut_front(cur_end, &whiteout); + bch2_btree_iter_set_pos_same_leaf(iter, cur_end); + goto next; + } + + /* + * When deleting, if possible just do it by switching the type + * of the key we're deleting, instead of creating and inserting + * a new whiteout: + */ + if (deleting && + !update_btree && + !bkey_cmp(insert->k.p, k.k->p) && + !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { + if (!bkey_whiteout(k.k)) { + btree_account_key_drop(l->b, _k); + _k->type = KEY_TYPE_discard; + reserve_whiteout(l->b, _k); + bch2_btree_iter_fix_key_modified(iter, + l->b, _k); + } + break; + } + + if (k.k->needs_whiteout || bkey_written(l->b, _k)) { + insert->k.needs_whiteout = true; + update_btree = true; + } + + if (update_btree && + overlap == BCH_EXTENT_OVERLAP_ALL && + bkey_whiteout(k.k) && + k.k->needs_whiteout) { + unreserve_whiteout(l->b, _k); + _k->needs_whiteout = false; + } + + extent_squash(c, iter, insert, _k, k, overlap); + + if (!update_btree) + bch2_cut_front(cur_end, insert); +next: + node_iter = l->iter; + + if (overlap == BCH_EXTENT_OVERLAP_FRONT || + overlap == BCH_EXTENT_OVERLAP_MIDDLE) + break; + } + + l->iter = node_iter; + bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p); + + if (update_btree) { + if (deleting) + insert->k.type = KEY_TYPE_discard; + + EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); + + extent_bset_insert(c, iter, insert); + } + + if (update_journal) { + struct bkey_i *k = !deleting ? 
insert : &whiteout; + + if (deleting) + k->k.type = KEY_TYPE_discard; + + EBUG_ON(bkey_deleted(&k->k) || !k->k.size); + + bch2_btree_journal_key(trans, iter, k); + } + + bch2_cut_front(insert->k.p, insert); +} diff --git a/libbcachefs/extent_update.h b/libbcachefs/extent_update.h new file mode 100644 index 00000000..89d18e4b --- /dev/null +++ b/libbcachefs/extent_update.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EXTENT_UPDATE_H +#define _BCACHEFS_EXTENT_UPDATE_H + +#include "bcachefs.h" + +int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *, + struct bpos *); +int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); +int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); + +enum btree_insert_ret +bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *, + unsigned *); +void bch2_insert_fixup_extent(struct btree_trans *, + struct btree_insert_entry *); + +#endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index b9c69792..6bcc1786 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -9,12 +9,10 @@ #include "bcachefs.h" #include "bkey_methods.h" #include "btree_gc.h" -#include "btree_update.h" -#include "btree_update_interior.h" +#include "btree_iter.h" #include "buckets.h" #include "checksum.h" #include "debug.h" -#include "dirent.h" #include "disk_groups.h" #include "error.h" #include "extents.h" @@ -24,85 +22,18 @@ #include "super.h" #include "super-io.h" #include "util.h" -#include "xattr.h" #include -unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) -{ - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - unsigned nr_ptrs = 0; +static unsigned bch2_crc_field_size_max[] = { + [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, + [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, + [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, +}; - bkey_for_each_ptr(p, ptr) - nr_ptrs++; - - return nr_ptrs; -} - -unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c k) -{ - unsigned nr_ptrs = 0; - - switch (k.k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: { - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - - bkey_for_each_ptr(p, ptr) - nr_ptrs += !ptr->cached; - BUG_ON(!nr_ptrs); - break; - } - case KEY_TYPE_reservation: - nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas; - break; - } - - return nr_ptrs; -} - -static unsigned bch2_extent_ptr_durability(struct bch_fs *c, - struct extent_ptr_decoded p) -{ - unsigned durability = 0; - struct bch_dev *ca; - - if (p.ptr.cached) - return 0; - - ca = bch_dev_bkey_exists(c, p.ptr.dev); - - if (ca->mi.state != BCH_MEMBER_STATE_FAILED) - durability = max_t(unsigned, durability, ca->mi.durability); - - if (p.has_ec) { - struct stripe *s = - genradix_ptr(&c->stripes[0], p.ec.idx); - - if (WARN_ON(!s)) - goto out; - - durability = max_t(unsigned, durability, s->nr_redundant); - } -out: - return durability; -} - -unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned durability = 0; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - durability += bch2_extent_ptr_durability(c, p); - - return durability; -} +static void bch2_extent_crc_pack(union bch_extent_crc *, + struct bch_extent_crc_unpacked, + enum bch_extent_entry_type); static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, 
unsigned dev) @@ -222,172 +153,299 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, return ret; } -void bch2_bkey_append_ptr(struct bkey_i *k, - struct bch_extent_ptr ptr) +/* KEY_TYPE_btree_ptr: */ + +const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) { - EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); + if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) + return "value too big"; - switch (k->k.type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_extent: - EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); + return bch2_bkey_ptrs_invalid(c, k); +} - ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; +void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + const char *err; + char buf[160]; + struct bucket_mark mark; + struct bch_dev *ca; - memcpy((void *) &k->v + bkey_val_bytes(&k->k), - &ptr, - sizeof(ptr)); - k->u64s++; - break; - default: - BUG(); + bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !bch2_bkey_replicas_marked(c, k, false), c, + "btree key bad (replicas not marked in superblock):\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + + if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) + return; + + bkey_for_each_ptr(ptrs, ptr) { + ca = bch_dev_bkey_exists(c, ptr->dev); + + mark = ptr_bucket_mark(ca, ptr); + + err = "stale"; + if (gen_after(mark.gen, ptr->gen)) + goto err; + + err = "inconsistent"; + if (mark.data_type != BCH_DATA_BTREE || + mark.dirty_sectors < c->opts.btree_node_size) + goto err; + } + + return; +err: + bch2_bkey_val_to_text(&PBUF(buf), c, k); + bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", + err, buf, PTR_BUCKET_NR(ca, ptr), + mark.gen, (unsigned) mark.v.counter); +} + +void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + bch2_bkey_ptrs_to_text(out, c, k); +} + +/* KEY_TYPE_extent: */ + +const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + return bch2_bkey_ptrs_invalid(c, k); +} + +void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + char buf[160]; + + /* + * XXX: we should be doing most/all of these checks at startup time, + * where we check bch2_bkey_invalid() in btree_node_read_done() + * + * But note that we can't check for stale pointers or incorrect gc marks + * until after journal replay is done (it might be an extent that's + * going to get overwritten during replay) + */ + + if (percpu_down_read_trylock(&c->mark_lock)) { + bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, + "extent key bad (replicas not marked in superblock):\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); + percpu_up_read(&c->mark_lock); + } + /* + * If journal replay hasn't finished, we might be seeing keys + * that will be overwritten by the time journal replay is done: + */ + if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + return; + + extent_for_each_ptr_decode(e, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); + unsigned stale = gen_after(mark.gen, p.ptr.gen); + unsigned disk_sectors = ptr_disk_sectors(p); + unsigned mark_sectors = p.ptr.cached + ? 
mark.cached_sectors + : mark.dirty_sectors; + + bch2_fs_bug_on(stale && !p.ptr.cached, c, + "stale dirty pointer (ptr gen %u bucket %u", + p.ptr.gen, mark.gen); + + bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale); + + bch2_fs_bug_on(!stale && + (mark.data_type != BCH_DATA_USER || + mark_sectors < disk_sectors), c, + "extent pointer not marked: %s:\n" + "type %u sectors %u < %u", + (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), + mark.data_type, + mark_sectors, disk_sectors); } } -void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) +void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - struct bch_extent_ptr *ptr; - - bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); + bch2_bkey_ptrs_to_text(out, c, k); } -const struct bch_extent_ptr * -bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) +enum merge_result bch2_extent_merge(struct bch_fs *c, + struct bkey_s _l, struct bkey_s _r) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; + struct bkey_s_extent l = bkey_s_to_extent(_l); + struct bkey_s_extent r = bkey_s_to_extent(_r); + union bch_extent_entry *en_l = l.v->start; + union bch_extent_entry *en_r = r.v->start; + struct bch_extent_crc_unpacked crc_l, crc_r; - bkey_for_each_ptr(ptrs, ptr) - if (ptr->dev == dev) - return ptr; + if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) + return BCH_MERGE_NOMERGE; - return NULL; -} + crc_l = bch2_extent_crc_unpack(l.k, NULL); -bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; + extent_for_each_entry(l, en_l) { + en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); - bkey_for_each_ptr(ptrs, ptr) - if (bch2_dev_in_target(c, ptr->dev, target) && - (!ptr->cached || - !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) - return true; + if (extent_entry_type(en_l) != extent_entry_type(en_r)) + return BCH_MERGE_NOMERGE; - return false; -} + switch (extent_entry_type(en_l)) { + case BCH_EXTENT_ENTRY_ptr: { + const struct bch_extent_ptr *lp = &en_l->ptr; + const struct bch_extent_ptr *rp = &en_r->ptr; + struct bch_dev *ca; -/* extent specific utility code */ + if (lp->offset + crc_l.compressed_size != rp->offset || + lp->dev != rp->dev || + lp->gen != rp->gen) + return BCH_MERGE_NOMERGE; -const struct bch_extent_ptr * -bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev) -{ - const struct bch_extent_ptr *ptr; + /* We don't allow extents to straddle buckets: */ + ca = bch_dev_bkey_exists(c, lp->dev); - extent_for_each_ptr(e, ptr) - if (ptr->dev == dev) - return ptr; + if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) + return BCH_MERGE_NOMERGE; - return NULL; -} - -const struct bch_extent_ptr * -bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group) -{ - const struct bch_extent_ptr *ptr; - - extent_for_each_ptr(e, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - - if (ca->mi.group && - ca->mi.group - 1 == group) - return ptr; - } - - return NULL; -} - -unsigned bch2_extent_is_compressed(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned ret = 0; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && - p.crc.compression_type != BCH_COMPRESSION_NONE) - ret += p.crc.compressed_size; - - return ret; -} - -bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, - struct bch_extent_ptr 
m, u64 offset) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (p.ptr.dev == m.dev && - p.ptr.gen == m.gen && - (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == - (s64) m.offset - offset) - return true; - - return false; -} - -static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, - union bch_extent_entry *entry) -{ - union bch_extent_entry *i = ptrs.start; - - if (i == entry) - return NULL; - - while (extent_entry_next(i) != entry) - i = extent_entry_next(i); - return i; -} - -union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, - struct bch_extent_ptr *ptr) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *dst, *src, *prev; - bool drop_crc = true; - - EBUG_ON(ptr < &ptrs.start->ptr || - ptr >= &ptrs.end->ptr); - EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); - - src = extent_entry_next(to_entry(ptr)); - if (src != ptrs.end && - !extent_entry_is_crc(src)) - drop_crc = false; - - dst = to_entry(ptr); - while ((prev = extent_entry_prev(ptrs, dst))) { - if (extent_entry_is_ptr(prev)) - break; - - if (extent_entry_is_crc(prev)) { - if (drop_crc) - dst = prev; break; } + case BCH_EXTENT_ENTRY_stripe_ptr: + if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || + en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) + return BCH_MERGE_NOMERGE; + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - dst = prev; + if (crc_l.csum_type != crc_r.csum_type || + crc_l.compression_type != crc_r.compression_type || + crc_l.nonce != crc_r.nonce) + return BCH_MERGE_NOMERGE; + + if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || + crc_r.offset) + return BCH_MERGE_NOMERGE; + + if (!bch2_checksum_mergeable(crc_l.csum_type)) + return BCH_MERGE_NOMERGE; + + if (crc_l.compression_type) + return BCH_MERGE_NOMERGE; + + if (crc_l.csum_type && + crc_l.uncompressed_size + + crc_r.uncompressed_size > c->sb.encoded_extent_max) + return BCH_MERGE_NOMERGE; + + if (crc_l.uncompressed_size + crc_r.uncompressed_size - 1 > + bch2_crc_field_size_max[extent_entry_type(en_l)]) + return BCH_MERGE_NOMERGE; + + break; + default: + return BCH_MERGE_NOMERGE; + } } - memmove_u64s_down(dst, src, - (u64 *) ptrs.end - (u64 *) src); - k.k->u64s -= (u64 *) src - (u64 *) dst; + extent_for_each_entry(l, en_l) { + struct bch_extent_crc_unpacked crc_l, crc_r; - return dst; + en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); + + if (!extent_entry_is_crc(en_l)) + continue; + + crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); + + crc_l.csum = bch2_checksum_merge(crc_l.csum_type, + crc_l.csum, + crc_r.csum, + crc_r.uncompressed_size << 9); + + crc_l.uncompressed_size += crc_r.uncompressed_size; + crc_l.compressed_size += crc_r.compressed_size; + + bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, + extent_entry_type(en_l)); + } + + bch2_key_resize(l.k, l.k->size + r.k->size); + + return BCH_MERGE_MERGE; +} + +/* KEY_TYPE_reservation: */ + +const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + + if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) + return "incorrect value size"; + + if (!r.v->nr_replicas || r.v->nr_replicas > 
BCH_REPLICAS_MAX) + return "invalid nr_replicas"; + + return NULL; +} + +void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + + pr_buf(out, "generation %u replicas %u", + le32_to_cpu(r.v->generation), + r.v->nr_replicas); +} + +enum merge_result bch2_reservation_merge(struct bch_fs *c, + struct bkey_s _l, struct bkey_s _r) +{ + struct bkey_s_reservation l = bkey_s_to_reservation(_l); + struct bkey_s_reservation r = bkey_s_to_reservation(_r); + + if (l.v->generation != r.v->generation || + l.v->nr_replicas != r.v->nr_replicas) + return BCH_MERGE_NOMERGE; + + if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { + bch2_key_resize(l.k, KEY_SIZE_MAX); + bch2_cut_front_s(l.k->p, r.s); + return BCH_MERGE_PARTIAL; + } + + bch2_key_resize(l.k, l.k->size + r.k->size); + + return BCH_MERGE_MERGE; +} + +/* Extent checksum entries: */ + +/* returns true if not equal */ +static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, + struct bch_extent_crc_unpacked r) +{ + return (l.csum_type != r.csum_type || + l.compression_type != r.compression_type || + l.compressed_size != r.compressed_size || + l.uncompressed_size != r.uncompressed_size || + l.offset != r.offset || + l.live_size != r.live_size || + l.nonce != r.nonce || + bch2_crc_cmp(l.csum, r.csum)); } static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, @@ -466,52 +524,404 @@ restart_narrow_pointers: return ret; } -/* returns true if not equal */ -static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, - struct bch_extent_crc_unpacked r) +static void bch2_extent_crc_pack(union bch_extent_crc *dst, + struct bch_extent_crc_unpacked src, + enum bch_extent_entry_type type) { - return (l.csum_type != r.csum_type || - l.compression_type != r.compression_type || - l.compressed_size != r.compressed_size || - l.uncompressed_size != r.uncompressed_size || - l.offset != r.offset || - l.live_size != r.live_size || - l.nonce != r.nonce || - bch2_crc_cmp(l.csum, r.csum)); +#define set_common_fields(_dst, _src) \ + _dst.type = 1 << type; \ + _dst.csum_type = _src.csum_type, \ + _dst.compression_type = _src.compression_type, \ + _dst._compressed_size = _src.compressed_size - 1, \ + _dst._uncompressed_size = _src.uncompressed_size - 1, \ + _dst.offset = _src.offset + + switch (type) { + case BCH_EXTENT_ENTRY_crc32: + set_common_fields(dst->crc32, src); + dst->crc32.csum = *((__le32 *) &src.csum.lo); + break; + case BCH_EXTENT_ENTRY_crc64: + set_common_fields(dst->crc64, src); + dst->crc64.nonce = src.nonce; + dst->crc64.csum_lo = src.csum.lo; + dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); + break; + case BCH_EXTENT_ENTRY_crc128: + set_common_fields(dst->crc128, src); + dst->crc128.nonce = src.nonce; + dst->crc128.csum = src.csum; + break; + default: + BUG(); + } +#undef set_common_fields } -void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) +void bch2_extent_crc_append(struct bkey_i *k, + struct bch_extent_crc_unpacked new) { - union bch_extent_entry *entry; - u64 *d = (u64 *) bkeyp_val(f, k); - unsigned i; + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + union bch_extent_crc *crc = (void *) ptrs.end; + enum bch_extent_entry_type type; - for (i = 0; i < bkeyp_val_u64s(f, k); i++) - d[i] = swab64(d[i]); + if (bch_crc_bytes[new.csum_type] <= 4 && + new.uncompressed_size - 1 <= CRC32_SIZE_MAX && + new.nonce <= CRC32_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc32; + else if 
(bch_crc_bytes[new.csum_type] <= 10 && + new.uncompressed_size - 1 <= CRC64_SIZE_MAX && + new.nonce <= CRC64_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc64; + else if (bch_crc_bytes[new.csum_type] <= 16 && + new.uncompressed_size - 1 <= CRC128_SIZE_MAX && + new.nonce <= CRC128_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc128; + else + BUG(); - for (entry = (union bch_extent_entry *) d; - entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); - entry = extent_entry_next(entry)) { - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: + bch2_extent_crc_pack(crc, new, type); + + k->k.u64s += extent_entry_u64s(ptrs.end); + + EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); +} + +/* Generic code for keys with pointers: */ + +unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) +{ + return bch2_bkey_devs(k).nr; +} + +unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) +{ + return k.k->type == KEY_TYPE_reservation + ? bkey_s_c_to_reservation(k).v->nr_replicas + : bch2_bkey_dirty_devs(k).nr; +} + +unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) +{ + unsigned ret = 0; + + if (k.k->type == KEY_TYPE_reservation) { + ret = bkey_s_c_to_reservation(k).v->nr_replicas; + } else { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + ret += !p.ptr.cached && + p.crc.compression_type == BCH_COMPRESSION_NONE; + } + + return ret; +} + +unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned ret = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && + p.crc.compression_type != BCH_COMPRESSION_NONE) + ret += p.crc.compressed_size; + + return ret; +} + +bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, + unsigned nr_replicas) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bpos end = pos; + struct bkey_s_c k; + bool ret = true; + int err; + + end.offset += size; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, + BTREE_ITER_SLOTS, k, err) { + if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; - case BCH_EXTENT_ENTRY_crc32: - entry->crc32.csum = swab32(entry->crc32.csum); - break; - case BCH_EXTENT_ENTRY_crc64: - entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); - entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); - break; - case BCH_EXTENT_ENTRY_crc128: - entry->crc128.csum.hi = (__force __le64) - swab64((__force u64) entry->crc128.csum.hi); - entry->crc128.csum.lo = (__force __le64) - swab64((__force u64) entry->crc128.csum.lo); - break; - case BCH_EXTENT_ENTRY_stripe_ptr: + + if (nr_replicas > bch2_bkey_nr_ptrs_fully_allocated(k)) { + ret = false; break; } } + bch2_trans_exit(&trans); + + return ret; +} + +static unsigned bch2_extent_ptr_durability(struct bch_fs *c, + struct extent_ptr_decoded p) +{ + unsigned durability = 0; + struct bch_dev *ca; + + if (p.ptr.cached) + return 0; + + ca = bch_dev_bkey_exists(c, p.ptr.dev); + + if (ca->mi.state != BCH_MEMBER_STATE_FAILED) + durability = max_t(unsigned, durability, ca->mi.durability); + + if (p.has_ec) { + struct stripe *s = + genradix_ptr(&c->stripes[0], p.ec.idx); + + if (WARN_ON(!s)) + goto out; + + durability = max_t(unsigned, durability, s->nr_redundant); + } +out: + return durability; +} + +unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) +{ 
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned durability = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + durability += bch2_extent_ptr_durability(c, p); + + return durability; +} + +void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, + unsigned target, + unsigned nr_desired_replicas) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry; + struct extent_ptr_decoded p; + int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; + + if (target && extra > 0) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + int n = bch2_extent_ptr_durability(c, p); + + if (n && n <= extra && + !bch2_dev_in_target(c, p.ptr.dev, target)) { + entry->ptr.cached = true; + extra -= n; + } + } + + if (extra > 0) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + int n = bch2_extent_ptr_durability(c, p); + + if (n && n <= extra) { + entry->ptr.cached = true; + extra -= n; + } + } +} + +void bch2_bkey_append_ptr(struct bkey_i *k, + struct bch_extent_ptr ptr) +{ + EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); + + switch (k->k.type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_extent: + EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); + + ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + + memcpy((void *) &k->v + bkey_val_bytes(&k->k), + &ptr, + sizeof(ptr)); + k->u64s++; + break; + default: + BUG(); + } +} + +static inline void __extent_entry_insert(struct bkey_i *k, + union bch_extent_entry *dst, + union bch_extent_entry *new) +{ + union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); + + memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), + dst, (u64 *) end - (u64 *) dst); + k->k.u64s += extent_entry_u64s(new); + memcpy(dst, new, extent_entry_bytes(new)); +} + +void bch2_extent_ptr_decoded_append(struct bkey_i *k, + struct extent_ptr_decoded *p) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + struct bch_extent_crc_unpacked crc = + bch2_extent_crc_unpack(&k->k, NULL); + union bch_extent_entry *pos; + + if (!bch2_crc_unpacked_cmp(crc, p->crc)) { + pos = ptrs.start; + goto found; + } + + bkey_for_each_crc(&k->k, ptrs, crc, pos) + if (!bch2_crc_unpacked_cmp(crc, p->crc)) { + pos = extent_entry_next(pos); + goto found; + } + + bch2_extent_crc_append(k, p->crc); + pos = bkey_val_end(bkey_i_to_s(k)); +found: + p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + __extent_entry_insert(k, pos, to_entry(&p->ptr)); + + if (p->has_ec) { + p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; + __extent_entry_insert(k, pos, to_entry(&p->ec)); + } +} + +static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, + union bch_extent_entry *entry) +{ + union bch_extent_entry *i = ptrs.start; + + if (i == entry) + return NULL; + + while (extent_entry_next(i) != entry) + i = extent_entry_next(i); + return i; +} + +union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, + struct bch_extent_ptr *ptr) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *dst, *src, *prev; + bool drop_crc = true; + + EBUG_ON(ptr < &ptrs.start->ptr || + ptr >= &ptrs.end->ptr); + EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); + + src = extent_entry_next(to_entry(ptr)); + if (src != ptrs.end && + !extent_entry_is_crc(src)) + drop_crc = false; + + dst = to_entry(ptr); + while ((prev = extent_entry_prev(ptrs, dst))) { + if (extent_entry_is_ptr(prev)) + break; + + if (extent_entry_is_crc(prev)) { + if (drop_crc) + dst = prev; + break; + } + + dst = prev; 
+ } + + memmove_u64s_down(dst, src, + (u64 *) ptrs.end - (u64 *) src); + k.k->u64s -= (u64 *) src - (u64 *) dst; + + return dst; +} + +void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) +{ + struct bch_extent_ptr *ptr; + + bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); +} + +const struct bch_extent_ptr * +bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (ptr->dev == dev) + return ptr; + + return NULL; +} + +bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (bch2_dev_in_target(c, ptr->dev, target) && + (!ptr->cached || + !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) + return true; + + return false; +} + +bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, + struct bch_extent_ptr m, u64 offset) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (p.ptr.dev == m.dev && + p.ptr.gen == m.gen && + (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == + (s64) m.offset - offset) + return true; + + return false; +} + +/* + * bch_extent_normalize - clean up an extent, dropping stale pointers etc. + * + * Returns true if @k should be dropped entirely + * + * For existing keys, only called when btree nodes are being rewritten, not when + * they're merely being compacted/resorted in memory. + */ +bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) +{ + struct bch_extent_ptr *ptr; + + bch2_bkey_drop_ptrs(k, ptr, + ptr->cached && + ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); + + /* will only happen if all pointers were cached: */ + if (!bch2_bkey_nr_ptrs(k.s_c)) + k.k->type = KEY_TYPE_discard; + + return bkey_whiteout(k.k); } void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, @@ -662,70 +1072,50 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) return NULL; } -/* Btree ptrs */ - -const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) +void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) { - if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) - return "value too big"; + union bch_extent_entry *entry; + u64 *d = (u64 *) bkeyp_val(f, k); + unsigned i; - return bch2_bkey_ptrs_invalid(c, k); -} + for (i = 0; i < bkeyp_val_u64s(f, k); i++) + d[i] = swab64(d[i]); -void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - const char *err; - char buf[160]; - struct bucket_mark mark; - struct bch_dev *ca; - - bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked(c, k, false), c, - "btree key bad (replicas not marked in superblock):\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - - if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) - return; - - bkey_for_each_ptr(ptrs, ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); - - mark = ptr_bucket_mark(ca, ptr); - - err = "stale"; - if (gen_after(mark.gen, ptr->gen)) - goto err; - - err = "inconsistent"; - if (mark.data_type != BCH_DATA_BTREE || - mark.dirty_sectors < c->opts.btree_node_size) - goto err; + for (entry = (union bch_extent_entry *) d; + entry < (union bch_extent_entry *) (d 
+ bkeyp_val_u64s(f, k)); + entry = extent_entry_next(entry)) { + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + break; + case BCH_EXTENT_ENTRY_crc32: + entry->crc32.csum = swab32(entry->crc32.csum); + break; + case BCH_EXTENT_ENTRY_crc64: + entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); + entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); + break; + case BCH_EXTENT_ENTRY_crc128: + entry->crc128.csum.hi = (__force __le64) + swab64((__force u64) entry->crc128.csum.hi); + entry->crc128.csum.lo = (__force __le64) + swab64((__force u64) entry->crc128.csum.lo); + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; + } } - - return; -err: - bch2_bkey_val_to_text(&PBUF(buf), c, k); - bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", - err, buf, PTR_BUCKET_NR(ca, ptr), - mark.gen, (unsigned) mark.v.counter); } -void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - bch2_bkey_ptrs_to_text(out, c, k); -} +/* Generic extent code: */ -/* Extents */ - -void __bch2_cut_front(struct bpos where, struct bkey_s k) +int bch2_cut_front_s(struct bpos where, struct bkey_s k) { + unsigned new_val_u64s = bkey_val_u64s(k.k); + int val_u64s_delta; u64 sub; if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) - return; + return 0; EBUG_ON(bkey_cmp(where, k.k->p) > 0); @@ -733,15 +1123,12 @@ void __bch2_cut_front(struct bpos where, struct bkey_s k) k.k->size -= sub; - if (!k.k->size) + if (!k.k->size) { k.k->type = KEY_TYPE_deleted; + new_val_u64s = 0; + } switch (k.k->type) { - case KEY_TYPE_deleted: - case KEY_TYPE_discard: - case KEY_TYPE_error: - case KEY_TYPE_cookie: - break; case KEY_TYPE_extent: case KEY_TYPE_reflink_v: { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); @@ -779,975 +1166,59 @@ void __bch2_cut_front(struct bpos where, struct bkey_s k) le64_add_cpu(&p.v->idx, sub); break; } - case KEY_TYPE_reservation: + case KEY_TYPE_inline_data: { + struct bkey_s_inline_data d = bkey_s_to_inline_data(k); + + sub = min_t(u64, sub << 9, bkey_val_bytes(d.k)); + + memmove(d.v->data, + d.v->data + sub, + bkey_val_bytes(d.k) - sub); + + new_val_u64s -= sub >> 3; break; - default: - BUG(); } + } + + val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; + BUG_ON(val_u64s_delta < 0); + + set_bkey_val_u64s(k.k, new_val_u64s); + memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); + return -val_u64s_delta; } -bool bch2_cut_back(struct bpos where, struct bkey *k) +int bch2_cut_back_s(struct bpos where, struct bkey_s k) { + unsigned new_val_u64s = bkey_val_u64s(k.k); + int val_u64s_delta; u64 len = 0; - if (bkey_cmp(where, k->p) >= 0) - return false; + if (bkey_cmp(where, k.k->p) >= 0) + return 0; - EBUG_ON(bkey_cmp(where, bkey_start_pos(k)) < 0); + EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0); - len = where.offset - bkey_start_offset(k); + len = where.offset - bkey_start_offset(k.k); - k->p = where; - k->size = len; + k.k->p = where; + k.k->size = len; - if (!len) - k->type = KEY_TYPE_deleted; - - return true; -} - -static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - unsigned ret = 0; - - bkey_extent_entry_for_each(ptrs, entry) { - switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - case BCH_EXTENT_ENTRY_stripe_ptr: - ret++; - } - } - - return ret; -} - -static int count_iters_for_insert(struct btree_trans *trans, - struct bkey_s_c k, - unsigned offset, - struct bpos *end, - unsigned *nr_iters, - unsigned max_iters, - bool 
overwrite) -{ - int ret = 0; - - switch (k.k->type) { - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - *nr_iters += bch2_bkey_nr_alloc_ptrs(k); - - if (*nr_iters >= max_iters) { - *end = bpos_min(*end, k.k->p); - ret = 1; - } - - break; - case KEY_TYPE_reflink_p: { - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - u64 idx = le64_to_cpu(p.v->idx); - unsigned sectors = bpos_min(*end, p.k->p).offset - - bkey_start_offset(p.k); - struct btree_iter *iter; - struct bkey_s_c r_k; - - for_each_btree_key(trans, iter, - BTREE_ID_REFLINK, POS(0, idx + offset), - BTREE_ITER_SLOTS, r_k, ret) { - if (bkey_cmp(bkey_start_pos(r_k.k), - POS(0, idx + sectors)) >= 0) - break; - - *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); - - if (*nr_iters >= max_iters) { - struct bpos pos = bkey_start_pos(k.k); - pos.offset += r_k.k->p.offset - idx; - - *end = bpos_min(*end, pos); - ret = 1; - break; - } - } - - bch2_trans_iter_put(trans, iter); - break; - } - } - - return ret; -} - -#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) - -int bch2_extent_atomic_end(struct btree_iter *iter, - struct bkey_i *insert, - struct bpos *end) -{ - struct btree_trans *trans = iter->trans; - struct btree *b; - struct btree_node_iter node_iter; - struct bkey_packed *_k; - unsigned nr_iters = 0; - int ret; - - ret = bch2_btree_iter_traverse(iter); - if (ret) - return ret; - - b = iter->l[0].b; - node_iter = iter->l[0].iter; - - BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0); - - *end = bpos_min(insert->k.p, b->key.k.p); - - ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, - &nr_iters, EXTENT_ITERS_MAX / 2, false); - if (ret < 0) - return ret; - - while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, - KEY_TYPE_discard))) { - struct bkey unpacked; - struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); - unsigned offset = 0; - - if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) - break; - - if (bkey_cmp(bkey_start_pos(&insert->k), - bkey_start_pos(k.k)) > 0) - offset = bkey_start_offset(&insert->k) - - bkey_start_offset(k.k); - - ret = count_iters_for_insert(trans, k, offset, end, - &nr_iters, EXTENT_ITERS_MAX, true); - if (ret) - break; - - bch2_btree_node_iter_advance(&node_iter, b); - } - - return ret < 0 ? 
ret : 0; -} - -int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) -{ - struct bpos end; - int ret; - - ret = bch2_extent_atomic_end(iter, k, &end); - if (ret) - return ret; - - bch2_cut_back(end, &k->k); - return 0; -} - -int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) -{ - struct bpos end; - int ret; - - ret = bch2_extent_atomic_end(iter, k, &end); - if (ret) - return ret; - - return !bkey_cmp(end, k->k.p); -} - -enum btree_insert_ret -bch2_extent_can_insert(struct btree_trans *trans, - struct btree_insert_entry *insert, - unsigned *u64s) -{ - struct btree_iter_level *l = &insert->iter->l[0]; - struct btree_node_iter node_iter = l->iter; - enum bch_extent_overlap overlap; - struct bkey_packed *_k; - struct bkey unpacked; - struct bkey_s_c k; - int sectors; - - /* - * We avoid creating whiteouts whenever possible when deleting, but - * those optimizations mean we may potentially insert two whiteouts - * instead of one (when we overlap with the front of one extent and the - * back of another): - */ - if (bkey_whiteout(&insert->k->k)) - *u64s += BKEY_U64s; - - _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, - KEY_TYPE_discard); - if (!_k) - return BTREE_INSERT_OK; - - k = bkey_disassemble(l->b, _k, &unpacked); - - overlap = bch2_extent_overlap(&insert->k->k, k.k); - - /* account for having to split existing extent: */ - if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) - *u64s += _k->u64s; - - if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && - (sectors = bch2_extent_is_compressed(k))) { - int flags = trans->flags & BTREE_INSERT_NOFAIL - ? BCH_DISK_RESERVATION_NOFAIL : 0; - - switch (bch2_disk_reservation_add(trans->c, - trans->disk_res, - sectors, flags)) { - case 0: - break; - case -ENOSPC: - return BTREE_INSERT_ENOSPC; - default: - BUG(); - } - } - - return BTREE_INSERT_OK; -} - -static void verify_extent_nonoverlapping(struct bch_fs *c, - struct btree *b, - struct btree_node_iter *_iter, - struct bkey_i *insert) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct btree_node_iter iter; - struct bkey_packed *k; - struct bkey uk; - - if (!expensive_debug_checks(c)) - return; - - iter = *_iter; - k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); - BUG_ON(k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); - - iter = *_iter; - k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); -#if 0 - BUG_ON(k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); -#else - if (k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { - char buf1[100]; - char buf2[100]; - - bch2_bkey_to_text(&PBUF(buf1), &insert->k); - bch2_bkey_to_text(&PBUF(buf2), &uk); - - bch2_dump_btree_node(b); - panic("insert > next :\n" - "insert %s\n" - "next %s\n", - buf1, buf2); - } -#endif - -#endif -} - -static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, - struct bkey_i *insert) -{ - struct btree_iter_level *l = &iter->l[0]; - struct bkey_packed *k = - bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); - - BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); - - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - verify_extent_nonoverlapping(c, l->b, &l->iter, insert); - - if (debug_check_bkeys(c)) - bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert)); - - bch2_bset_insert(l->b, &l->iter, k, insert, 0); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); -} - -static void -extent_squash(struct bch_fs *c, 
struct btree_iter *iter, - struct bkey_i *insert, - struct bkey_packed *_k, struct bkey_s k, - enum bch_extent_overlap overlap) -{ - struct btree_iter_level *l = &iter->l[0]; - - switch (overlap) { - case BCH_EXTENT_OVERLAP_FRONT: - /* insert overlaps with start of k: */ - __bch2_cut_front(insert->k.p, k); - EBUG_ON(bkey_deleted(k.k)); - extent_save(l->b, _k, k.k); - bch2_btree_iter_fix_key_modified(iter, l->b, _k); - break; - - case BCH_EXTENT_OVERLAP_BACK: - /* insert overlaps with end of k: */ - bch2_cut_back(bkey_start_pos(&insert->k), k.k); - EBUG_ON(bkey_deleted(k.k)); - extent_save(l->b, _k, k.k); - - /* - * As the auxiliary tree is indexed by the end of the - * key and we've just changed the end, update the - * auxiliary tree. - */ - bch2_bset_fix_invalidated_key(l->b, _k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); - break; - - case BCH_EXTENT_OVERLAP_ALL: { - /* The insert key completely covers k, invalidate k */ - if (!bkey_whiteout(k.k)) - btree_account_key_drop(l->b, _k); - - k.k->size = 0; + if (!len) { k.k->type = KEY_TYPE_deleted; - - if (_k >= btree_bset_last(l->b)->start) { - unsigned u64s = _k->u64s; - - bch2_bset_delete(l->b, _k, _k->u64s); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, u64s, 0); - } else { - extent_save(l->b, _k, k.k); - bch2_btree_iter_fix_key_modified(iter, l->b, _k); - } - - break; + new_val_u64s = 0; } - case BCH_EXTENT_OVERLAP_MIDDLE: { - BKEY_PADDED(k) split; - /* - * The insert key falls 'in the middle' of k - * The insert key splits k in 3: - * - start only in k, preserve - * - middle common section, invalidate in k - * - end only in k, preserve - * - * We update the old key to preserve the start, - * insert will be the new common section, - * we manually insert the end that we are preserving. - * - * modify k _before_ doing the insert (which will move - * what k points to) - */ - bkey_reassemble(&split.k, k.s_c); - split.k.k.needs_whiteout |= bkey_written(l->b, _k); - - bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k); - BUG_ON(bkey_deleted(&split.k.k)); - - __bch2_cut_front(insert->k.p, k); - BUG_ON(bkey_deleted(k.k)); - extent_save(l->b, _k, k.k); - bch2_btree_iter_fix_key_modified(iter, l->b, _k); - - extent_bset_insert(c, iter, &split.k); - break; - } - } -} - -/** - * bch_extent_insert_fixup - insert a new extent and deal with overlaps - * - * this may result in not actually doing the insert, or inserting some subset - * of the insert key. For cmpxchg operations this is where that logic lives. - * - * All subsets of @insert that need to be inserted are inserted using - * bch2_btree_insert_and_journal(). If @b or @res fills up, this function - * returns false, setting @iter->pos for the prefix of @insert that actually got - * inserted. - * - * BSET INVARIANTS: this function is responsible for maintaining all the - * invariants for bsets of extents in memory. things get really hairy with 0 - * size extents - * - * within one bset: - * - * bkey_start_pos(bkey_next(k)) >= k - * or bkey_start_offset(bkey_next(k)) >= k->offset - * - * i.e. strict ordering, no overlapping extents. - * - * multiple bsets (i.e. full btree node): - * - * ∀ k, j - * k.size != 0 ∧ j.size != 0 → - * ¬ (k > bkey_start_pos(j) ∧ k < j) - * - * i.e. 
no two overlapping keys _of nonzero size_ - * - * We can't realistically maintain this invariant for zero size keys because of - * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j - * there may be another 0 size key between them in another bset, and it will - * thus overlap with the merged key. - * - * In addition, the end of iter->pos indicates how much has been processed. - * If the end of iter->pos is not the same as the end of insert, then - * key insertion needs to continue/be retried. - */ -void bch2_insert_fixup_extent(struct btree_trans *trans, - struct btree_insert_entry *insert_entry) -{ - struct bch_fs *c = trans->c; - struct btree_iter *iter = insert_entry->iter; - struct bkey_i *insert = insert_entry->k; - struct btree_iter_level *l = &iter->l[0]; - struct btree_node_iter node_iter = l->iter; - bool deleting = bkey_whiteout(&insert->k); - bool update_journal = !deleting; - bool update_btree = !deleting; - struct bkey_i whiteout = *insert; - struct bkey_packed *_k; - struct bkey unpacked; - - EBUG_ON(iter->level); - EBUG_ON(!insert->k.size); - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); - - while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, - KEY_TYPE_discard))) { - struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); - struct bpos cur_end = bpos_min(insert->k.p, k.k->p); - enum bch_extent_overlap overlap = - bch2_extent_overlap(&insert->k, k.k); - - if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) - break; - - if (!bkey_whiteout(k.k)) - update_journal = true; - - if (!update_journal) { - bch2_cut_front(cur_end, insert); - bch2_cut_front(cur_end, &whiteout); - bch2_btree_iter_set_pos_same_leaf(iter, cur_end); - goto next; - } - - /* - * When deleting, if possible just do it by switching the type - * of the key we're deleting, instead of creating and inserting - * a new whiteout: - */ - if (deleting && - !update_btree && - !bkey_cmp(insert->k.p, k.k->p) && - !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { - if (!bkey_whiteout(k.k)) { - btree_account_key_drop(l->b, _k); - _k->type = KEY_TYPE_discard; - reserve_whiteout(l->b, _k); - bch2_btree_iter_fix_key_modified(iter, - l->b, _k); - } - break; - } - - if (k.k->needs_whiteout || bkey_written(l->b, _k)) { - insert->k.needs_whiteout = true; - update_btree = true; - } - - if (update_btree && - overlap == BCH_EXTENT_OVERLAP_ALL && - bkey_whiteout(k.k) && - k.k->needs_whiteout) { - unreserve_whiteout(l->b, _k); - _k->needs_whiteout = false; - } - - extent_squash(c, iter, insert, _k, k, overlap); - - if (!update_btree) - bch2_cut_front(cur_end, insert); -next: - node_iter = l->iter; - - if (overlap == BCH_EXTENT_OVERLAP_FRONT || - overlap == BCH_EXTENT_OVERLAP_MIDDLE) - break; - } - - l->iter = node_iter; - bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p); - - if (update_btree) { - if (deleting) - insert->k.type = KEY_TYPE_discard; - - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - - extent_bset_insert(c, iter, insert); - } - - if (update_journal) { - struct bkey_i *k = !deleting ? 
insert : &whiteout; - - if (deleting) - k->k.type = KEY_TYPE_discard; - - EBUG_ON(bkey_deleted(&k->k) || !k->k.size); - - bch2_btree_journal_key(trans, iter, k); - } - - bch2_cut_front(insert->k.p, insert); -} - -const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) -{ - return bch2_bkey_ptrs_invalid(c, k); -} - -void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - char buf[160]; - - /* - * XXX: we should be doing most/all of these checks at startup time, - * where we check bch2_bkey_invalid() in btree_node_read_done() - * - * But note that we can't check for stale pointers or incorrect gc marks - * until after journal replay is done (it might be an extent that's - * going to get overwritten during replay) - */ - - if (percpu_down_read_trylock(&c->mark_lock)) { - bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, - "extent key bad (replicas not marked in superblock):\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); - percpu_up_read(&c->mark_lock); - } - /* - * If journal replay hasn't finished, we might be seeing keys - * that will be overwritten by the time journal replay is done: - */ - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) - return; - - extent_for_each_ptr_decode(e, p, entry) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); - unsigned stale = gen_after(mark.gen, p.ptr.gen); - unsigned disk_sectors = ptr_disk_sectors(p); - unsigned mark_sectors = p.ptr.cached - ? mark.cached_sectors - : mark.dirty_sectors; - - bch2_fs_bug_on(stale && !p.ptr.cached, c, - "stale dirty pointer (ptr gen %u bucket %u", - p.ptr.gen, mark.gen); - - bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale); - - bch2_fs_bug_on(!stale && - (mark.data_type != BCH_DATA_USER || - mark_sectors < disk_sectors), c, - "extent pointer not marked: %s:\n" - "type %u sectors %u < %u", - (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), - mark.data_type, - mark_sectors, disk_sectors); - } -} - -void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - bch2_bkey_ptrs_to_text(out, c, k); -} - -static unsigned bch2_crc_field_size_max[] = { - [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, - [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, - [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, -}; - -static void bch2_extent_crc_pack(union bch_extent_crc *dst, - struct bch_extent_crc_unpacked src, - enum bch_extent_entry_type type) -{ -#define set_common_fields(_dst, _src) \ - _dst.type = 1 << type; \ - _dst.csum_type = _src.csum_type, \ - _dst.compression_type = _src.compression_type, \ - _dst._compressed_size = _src.compressed_size - 1, \ - _dst._uncompressed_size = _src.uncompressed_size - 1, \ - _dst.offset = _src.offset - - switch (type) { - case BCH_EXTENT_ENTRY_crc32: - set_common_fields(dst->crc32, src); - dst->crc32.csum = *((__le32 *) &src.csum.lo); - break; - case BCH_EXTENT_ENTRY_crc64: - set_common_fields(dst->crc64, src); - dst->crc64.nonce = src.nonce; - dst->crc64.csum_lo = src.csum.lo; - dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); - break; - case BCH_EXTENT_ENTRY_crc128: - set_common_fields(dst->crc128, src); - dst->crc128.nonce = src.nonce; - dst->crc128.csum = src.csum; - break; - default: - BUG(); - } -#undef set_common_fields -} - -void bch2_extent_crc_append(struct 
bkey_i *k, - struct bch_extent_crc_unpacked new) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); - union bch_extent_crc *crc = (void *) ptrs.end; - enum bch_extent_entry_type type; - - if (bch_crc_bytes[new.csum_type] <= 4 && - new.uncompressed_size - 1 <= CRC32_SIZE_MAX && - new.nonce <= CRC32_NONCE_MAX) - type = BCH_EXTENT_ENTRY_crc32; - else if (bch_crc_bytes[new.csum_type] <= 10 && - new.uncompressed_size - 1 <= CRC64_SIZE_MAX && - new.nonce <= CRC64_NONCE_MAX) - type = BCH_EXTENT_ENTRY_crc64; - else if (bch_crc_bytes[new.csum_type] <= 16 && - new.uncompressed_size - 1 <= CRC128_SIZE_MAX && - new.nonce <= CRC128_NONCE_MAX) - type = BCH_EXTENT_ENTRY_crc128; - else - BUG(); - - bch2_extent_crc_pack(crc, new, type); - - k->k.u64s += extent_entry_u64s(ptrs.end); - - EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); -} - -static inline void __extent_entry_insert(struct bkey_i *k, - union bch_extent_entry *dst, - union bch_extent_entry *new) -{ - union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); - - memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), - dst, (u64 *) end - (u64 *) dst); - k->k.u64s += extent_entry_u64s(new); - memcpy(dst, new, extent_entry_bytes(new)); -} - -void bch2_extent_ptr_decoded_append(struct bkey_i *k, - struct extent_ptr_decoded *p) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); - struct bch_extent_crc_unpacked crc = - bch2_extent_crc_unpack(&k->k, NULL); - union bch_extent_entry *pos; - - if (!bch2_crc_unpacked_cmp(crc, p->crc)) { - pos = ptrs.start; - goto found; - } - - bkey_for_each_crc(&k->k, ptrs, crc, pos) - if (!bch2_crc_unpacked_cmp(crc, p->crc)) { - pos = extent_entry_next(pos); - goto found; - } - - bch2_extent_crc_append(k, p->crc); - pos = bkey_val_end(bkey_i_to_s(k)); -found: - p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - __extent_entry_insert(k, pos, to_entry(&p->ptr)); - - if (p->has_ec) { - p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; - __extent_entry_insert(k, pos, to_entry(&p->ec)); - } -} - -/* - * bch_extent_normalize - clean up an extent, dropping stale pointers etc. - * - * Returns true if @k should be dropped entirely - * - * For existing keys, only called when btree nodes are being rewritten, not when - * they're merely being compacted/resorted in memory. 
- */ -bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) -{ - struct bch_extent_ptr *ptr; - - bch2_bkey_drop_ptrs(k, ptr, - ptr->cached && - ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); - - /* will only happen if all pointers were cached: */ - if (!bkey_val_u64s(k.k)) - k.k->type = KEY_TYPE_discard; - - return bkey_whiteout(k.k); -} - -void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, - unsigned target, - unsigned nr_desired_replicas) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *entry; - struct extent_ptr_decoded p; - int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; - - if (target && extra > 0) - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - int n = bch2_extent_ptr_durability(c, p); - - if (n && n <= extra && - !bch2_dev_in_target(c, p.ptr.dev, target)) { - entry->ptr.cached = true; - extra -= n; - } - } - - if (extra > 0) - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - int n = bch2_extent_ptr_durability(c, p); - - if (n && n <= extra) { - entry->ptr.cached = true; - extra -= n; - } - } -} - -enum merge_result bch2_extent_merge(struct bch_fs *c, - struct bkey_s _l, struct bkey_s _r) -{ - struct bkey_s_extent l = bkey_s_to_extent(_l); - struct bkey_s_extent r = bkey_s_to_extent(_r); - union bch_extent_entry *en_l = l.v->start; - union bch_extent_entry *en_r = r.v->start; - struct bch_extent_crc_unpacked crc_l, crc_r; - - if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) - return BCH_MERGE_NOMERGE; - - crc_l = bch2_extent_crc_unpack(l.k, NULL); - - extent_for_each_entry(l, en_l) { - en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); - - if (extent_entry_type(en_l) != extent_entry_type(en_r)) - return BCH_MERGE_NOMERGE; - - switch (extent_entry_type(en_l)) { - case BCH_EXTENT_ENTRY_ptr: { - const struct bch_extent_ptr *lp = &en_l->ptr; - const struct bch_extent_ptr *rp = &en_r->ptr; - struct bch_dev *ca; - - if (lp->offset + crc_l.compressed_size != rp->offset || - lp->dev != rp->dev || - lp->gen != rp->gen) - return BCH_MERGE_NOMERGE; - - /* We don't allow extents to straddle buckets: */ - ca = bch_dev_bkey_exists(c, lp->dev); - - if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) - return BCH_MERGE_NOMERGE; - - break; - } - case BCH_EXTENT_ENTRY_stripe_ptr: - if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || - en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) - return BCH_MERGE_NOMERGE; - break; - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); - crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - - if (crc_l.csum_type != crc_r.csum_type || - crc_l.compression_type != crc_r.compression_type || - crc_l.nonce != crc_r.nonce) - return BCH_MERGE_NOMERGE; - - if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || - crc_r.offset) - return BCH_MERGE_NOMERGE; - - if (!bch2_checksum_mergeable(crc_l.csum_type)) - return BCH_MERGE_NOMERGE; - - if (crc_l.compression_type) - return BCH_MERGE_NOMERGE; - - if (crc_l.csum_type && - crc_l.uncompressed_size + - crc_r.uncompressed_size > c->sb.encoded_extent_max) - return BCH_MERGE_NOMERGE; - - if (crc_l.uncompressed_size + crc_r.uncompressed_size - 1 > - bch2_crc_field_size_max[extent_entry_type(en_l)]) - return BCH_MERGE_NOMERGE; - - break; - default: - return BCH_MERGE_NOMERGE; - } - } - - extent_for_each_entry(l, en_l) { - struct bch_extent_crc_unpacked crc_l, crc_r; - - en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); - - if 
(!extent_entry_is_crc(en_l)) - continue; - - crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); - crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - - crc_l.csum = bch2_checksum_merge(crc_l.csum_type, - crc_l.csum, - crc_r.csum, - crc_r.uncompressed_size << 9); - - crc_l.uncompressed_size += crc_r.uncompressed_size; - crc_l.compressed_size += crc_r.compressed_size; - - bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, - extent_entry_type(en_l)); - } - - bch2_key_resize(l.k, l.k->size + r.k->size); - - return BCH_MERGE_MERGE; -} - -bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, - unsigned nr_replicas) -{ - struct btree_trans trans; - struct btree_iter *iter; - struct bpos end = pos; - struct bkey_s_c k; - bool ret = true; - int err; - - end.offset += size; - - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, - BTREE_ITER_SLOTS, k, err) { - if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) - break; - - if (nr_replicas > bch2_bkey_nr_ptrs_allocated(k)) { - ret = false; - break; - } - } - bch2_trans_exit(&trans); - - return ret; -} - -unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) -{ - unsigned ret = 0; switch (k.k->type) { - case KEY_TYPE_extent: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - extent_for_each_ptr_decode(e, p, entry) - ret += !p.ptr.cached && - p.crc.compression_type == BCH_COMPRESSION_NONE; - break; - } - case KEY_TYPE_reservation: - ret = bkey_s_c_to_reservation(k).v->nr_replicas; + case KEY_TYPE_inline_data: + new_val_u64s = min(new_val_u64s, k.k->size << 6); break; } - return ret; -} - -/* KEY_TYPE_reservation: */ - -const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - - if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) - return "incorrect value size"; - - if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) - return "invalid nr_replicas"; - - return NULL; -} - -void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - - pr_buf(out, "generation %u replicas %u", - le32_to_cpu(r.v->generation), - r.v->nr_replicas); -} - -enum merge_result bch2_reservation_merge(struct bch_fs *c, - struct bkey_s _l, struct bkey_s _r) -{ - struct bkey_s_reservation l = bkey_s_to_reservation(_l); - struct bkey_s_reservation r = bkey_s_to_reservation(_r); - - if (l.v->generation != r.v->generation || - l.v->nr_replicas != r.v->nr_replicas) - return BCH_MERGE_NOMERGE; - - if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { - bch2_key_resize(l.k, KEY_SIZE_MAX); - __bch2_cut_front(l.k->p, r.s); - return BCH_MERGE_PARTIAL; - } - - bch2_key_resize(l.k, l.k->size + r.k->size); - - return BCH_MERGE_MERGE; + val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; + BUG_ON(val_u64s_delta < 0); + + set_bkey_val_u64s(k.k, new_val_u64s); + memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); + return -val_u64s_delta; } diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index cc7ee906..1140d01a 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -40,6 +40,9 @@ struct btree_insert_entry; (union bch_extent_entry *) (_entry)); \ }) +#define extent_entry_next(_entry) \ + ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) + static inline unsigned __extent_entry_type(const union bch_extent_entry *e) { 
@@ -185,10 +188,52 @@ struct bkey_ptrs { union bch_extent_entry *end; }; -/* iterate over bkey ptrs */ +static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_btree_ptr: { + struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); + return (struct bkey_ptrs_c) { + to_entry(&e.v->start[0]), + to_entry(extent_entry_last(e)) + }; + } + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + return (struct bkey_ptrs_c) { + e.v->start, + extent_entry_last(e) + }; + } + case KEY_TYPE_stripe: { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + return (struct bkey_ptrs_c) { + to_entry(&s.v->ptrs[0]), + to_entry(&s.v->ptrs[s.v->nr_blocks]), + }; + } + case KEY_TYPE_reflink_v: { + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); -#define extent_entry_next(_entry) \ - ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) + return (struct bkey_ptrs_c) { + r.v->start, + bkey_val_end(r), + }; + } + default: + return (struct bkey_ptrs_c) { NULL, NULL }; + } +} + +static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) +{ + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); + + return (struct bkey_ptrs) { + (void *) p.start, + (void *) p.end + }; +} #define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ for ((_entry) = (_start); \ @@ -281,53 +326,121 @@ out: \ #define bkey_for_each_crc(_k, _p, _crc, _iter) \ __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter) +/* Iterate over pointers in KEY_TYPE_extent: */ + +#define extent_for_each_entry_from(_e, _entry, _start) \ + __bkey_extent_entry_for_each_from(_start, \ + extent_entry_last(_e),_entry) + +#define extent_for_each_entry(_e, _entry) \ + extent_for_each_entry_from(_e, _entry, (_e).v->start) + +#define extent_ptr_next(_e, _ptr) \ + __bkey_ptr_next(_ptr, extent_entry_last(_e)) + +#define extent_for_each_ptr(_e, _ptr) \ + __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) + +#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ + __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ + extent_entry_last(_e), _ptr, _entry) + /* utility code common to all keys with pointers: */ -static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_btree_ptr: { - struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); - return (struct bkey_ptrs_c) { - to_entry(&e.v->start[0]), - to_entry(extent_entry_last(e)) - }; - } - case KEY_TYPE_extent: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - return (struct bkey_ptrs_c) { - e.v->start, - extent_entry_last(e) - }; - } - case KEY_TYPE_stripe: { - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - return (struct bkey_ptrs_c) { - to_entry(&s.v->ptrs[0]), - to_entry(&s.v->ptrs[s.v->nr_blocks]), - }; - } - case KEY_TYPE_reflink_v: { - struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); +void bch2_mark_io_failure(struct bch_io_failures *, + struct extent_ptr_decoded *); +int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, + struct bch_io_failures *, + struct extent_ptr_decoded *); - return (struct bkey_ptrs_c) { - r.v->start, - bkey_val_end(r), - }; - } +/* KEY_TYPE_btree_ptr: */ + +const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c); +void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + +#define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ + .key_invalid = bch2_btree_ptr_invalid, \ + 
.key_debugcheck = bch2_btree_ptr_debugcheck, \ + .val_to_text = bch2_btree_ptr_to_text, \ + .swab = bch2_ptr_swab, \ +} + +/* KEY_TYPE_extent: */ + +const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_extent_debugcheck(struct bch_fs *, struct bkey_s_c); +void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +enum merge_result bch2_extent_merge(struct bch_fs *, + struct bkey_s, struct bkey_s); + +#define bch2_bkey_ops_extent (struct bkey_ops) { \ + .key_invalid = bch2_extent_invalid, \ + .key_debugcheck = bch2_extent_debugcheck, \ + .val_to_text = bch2_extent_to_text, \ + .swab = bch2_ptr_swab, \ + .key_normalize = bch2_extent_normalize, \ + .key_merge = bch2_extent_merge, \ +} + +/* KEY_TYPE_reservation: */ + +const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +enum merge_result bch2_reservation_merge(struct bch_fs *, + struct bkey_s, struct bkey_s); + +#define bch2_bkey_ops_reservation (struct bkey_ops) { \ + .key_invalid = bch2_reservation_invalid, \ + .val_to_text = bch2_reservation_to_text, \ + .key_merge = bch2_reservation_merge, \ +} + +/* Extent checksum entries: */ + +bool bch2_can_narrow_extent_crcs(struct bkey_s_c, + struct bch_extent_crc_unpacked); +bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); +void bch2_extent_crc_append(struct bkey_i *, + struct bch_extent_crc_unpacked); + +/* Generic code for keys with pointers: */ + +static inline bool bkey_extent_is_direct_data(const struct bkey *k) +{ + switch (k->type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: + return true; default: - return (struct bkey_ptrs_c) { NULL, NULL }; + return false; } } -static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) +static inline bool bkey_extent_is_data(const struct bkey *k) { - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); + return bkey_extent_is_direct_data(k) || + k->type == KEY_TYPE_inline_data || + k->type == KEY_TYPE_reflink_p; +} - return (struct bkey_ptrs) { - (void *) p.start, - (void *) p.end - }; +/* + * Should extent be counted under inode->i_sectors? 
+ */ +static inline bool bkey_extent_is_allocation(const struct bkey *k) +{ + switch (k->type) { + case KEY_TYPE_extent: + case KEY_TYPE_reservation: + case KEY_TYPE_reflink_p: + case KEY_TYPE_reflink_v: + case KEY_TYPE_inline_data: + return true; + default: + return false; + } } static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) @@ -369,154 +482,18 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) } unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); -unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c); +unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); +unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); +unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); +bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); -void bch2_mark_io_failure(struct bch_io_failures *, - struct extent_ptr_decoded *); -int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, - struct bch_io_failures *, - struct extent_ptr_decoded *); - -void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); -void bch2_bkey_drop_device(struct bkey_s, unsigned); -const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); -bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); - -void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); -const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); - -/* bch_btree_ptr: */ - -const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c); -void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); -void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); - -#define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ - .key_invalid = bch2_btree_ptr_invalid, \ - .key_debugcheck = bch2_btree_ptr_debugcheck, \ - .val_to_text = bch2_btree_ptr_to_text, \ - .swab = bch2_ptr_swab, \ -} - -/* bch_extent: */ - -const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_extent_debugcheck(struct bch_fs *, struct bkey_s_c); -void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); -enum merge_result bch2_extent_merge(struct bch_fs *, - struct bkey_s, struct bkey_s); - -#define bch2_bkey_ops_extent (struct bkey_ops) { \ - .key_invalid = bch2_extent_invalid, \ - .key_debugcheck = bch2_extent_debugcheck, \ - .val_to_text = bch2_extent_to_text, \ - .swab = bch2_ptr_swab, \ - .key_normalize = bch2_extent_normalize, \ - .key_merge = bch2_extent_merge, \ -} - -/* bch_reservation: */ - -const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -enum merge_result bch2_reservation_merge(struct bch_fs *, - struct bkey_s, struct bkey_s); - -#define bch2_bkey_ops_reservation (struct bkey_ops) { \ - .key_invalid = bch2_reservation_invalid, \ - .val_to_text = bch2_reservation_to_text, \ - .key_merge = bch2_reservation_merge, \ -} - -int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *, - struct bpos *); -int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); -int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); - -enum btree_insert_ret -bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *, - unsigned 
*); -void bch2_insert_fixup_extent(struct btree_trans *, - struct btree_insert_entry *); - void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, unsigned, unsigned); -const struct bch_extent_ptr * -bch2_extent_has_device(struct bkey_s_c_extent, unsigned); - -unsigned bch2_extent_is_compressed(struct bkey_s_c); - -bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, - struct bch_extent_ptr, u64); - -static inline bool bkey_extent_is_direct_data(const struct bkey *k) -{ - switch (k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - return true; - default: - return false; - } -} - -static inline bool bkey_extent_is_data(const struct bkey *k) -{ - return bkey_extent_is_direct_data(k) || - k->type == KEY_TYPE_reflink_p; -} - -/* - * Should extent be counted under inode->i_sectors? - */ -static inline bool bkey_extent_is_allocation(const struct bkey *k) -{ - switch (k->type) { - case KEY_TYPE_extent: - case KEY_TYPE_reservation: - case KEY_TYPE_reflink_p: - case KEY_TYPE_reflink_v: - return true; - default: - return false; - } -} - -/* Extent entry iteration: */ - -#define extent_for_each_entry_from(_e, _entry, _start) \ - __bkey_extent_entry_for_each_from(_start, \ - extent_entry_last(_e),_entry) - -#define extent_for_each_entry(_e, _entry) \ - extent_for_each_entry_from(_e, _entry, (_e).v->start) - -#define extent_ptr_next(_e, _ptr) \ - __bkey_ptr_next(_ptr, extent_entry_last(_e)) - -#define extent_for_each_ptr(_e, _ptr) \ - __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) - -#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ - __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ - extent_entry_last(_e), _ptr, _entry) - -void bch2_extent_crc_append(struct bkey_i *, - struct bch_extent_crc_unpacked); +void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); void bch2_extent_ptr_decoded_append(struct bkey_i *, struct extent_ptr_decoded *); - -bool bch2_can_narrow_extent_crcs(struct bkey_s_c, - struct bch_extent_crc_unpacked); -bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); - union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); @@ -537,14 +514,34 @@ do { \ } \ } while (0) -void __bch2_cut_front(struct bpos, struct bkey_s); +void bch2_bkey_drop_device(struct bkey_s, unsigned); +const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); +bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); + +bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, + struct bch_extent_ptr, u64); + +bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); +void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); + +void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); + +/* Generic extent code: */ + +int bch2_cut_front_s(struct bpos, struct bkey_s); +int bch2_cut_back_s(struct bpos, struct bkey_s); static inline void bch2_cut_front(struct bpos where, struct bkey_i *k) { - __bch2_cut_front(where, bkey_i_to_s(k)); + bch2_cut_front_s(where, bkey_i_to_s(k)); } -bool bch2_cut_back(struct bpos, struct bkey *); +static inline void bch2_cut_back(struct bpos where, struct bkey_i *k) +{ + bch2_cut_back_s(where, bkey_i_to_s(k)); +} /** * bch_key_resize - adjust size of @k @@ -576,7 +573,4 @@ static inline void extent_save(struct btree *b, struct bkey_packed *dst, BUG_ON(!bch2_bkey_pack_key(dst, src, f)); } -bool 
bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned); -unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); - #endif /* _BCACHEFS_EXTENTS_H */ diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index fd6eb00e..bce25dde 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -3,11 +3,13 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "bkey_on_stack.h" #include "btree_update.h" #include "buckets.h" #include "clock.h" #include "error.h" #include "extents.h" +#include "extent_update.h" #include "fs.h" #include "fs-io.h" #include "fsck.h" @@ -730,7 +732,7 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) struct bvec_iter iter; struct bio_vec bv; unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v - ? 0 : bch2_bkey_nr_ptrs_allocated(k); + ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); unsigned state = k.k->type == KEY_TYPE_reservation ? SECTOR_RESERVED : SECTOR_ALLOCATED; @@ -748,6 +750,18 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) } } +static bool extent_partial_reads_expensive(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *i; + + bkey_for_each_crc(k.k, ptrs, crc, i) + if (crc.csum_type || crc.compression_type) + return true; + return false; +} + static void readpage_bio_extend(struct readpages_iter *iter, struct bio *bio, unsigned sectors_this_extent, @@ -801,15 +815,17 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, struct readpages_iter *readpages_iter) { struct bch_fs *c = trans->c; + struct bkey_on_stack sk; int flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE; int ret = 0; rbio->c = c; rbio->start_time = local_clock(); + + bkey_on_stack_init(&sk); retry: while (1) { - BKEY_PADDED(k) tmp; struct bkey_s_c k; unsigned bytes, sectors, offset_into_extent; @@ -821,15 +837,16 @@ retry: if (ret) break; - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); + bkey_on_stack_realloc(&sk, c, k.k->u64s); + bkey_reassemble(sk.k, k); + k = bkey_i_to_s_c(sk.k); offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; ret = bch2_read_indirect_extent(trans, - &offset_into_extent, &tmp.k); + &offset_into_extent, sk.k); if (ret) break; @@ -837,22 +854,9 @@ retry: bch2_trans_unlock(trans); - if (readpages_iter) { - bool want_full_extent = false; - - if (bkey_extent_is_data(k.k)) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *i; - struct extent_ptr_decoded p; - - bkey_for_each_ptr_decode(k.k, ptrs, p, i) - want_full_extent |= ((p.crc.csum_type != 0) | - (p.crc.compression_type != 0)); - } - - readpage_bio_extend(readpages_iter, &rbio->bio, - sectors, want_full_extent); - } + if (readpages_iter) + readpage_bio_extend(readpages_iter, &rbio->bio, sectors, + extent_partial_reads_expensive(k)); bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; swap(rbio->bio.bi_iter.bi_size, bytes); @@ -866,7 +870,7 @@ retry: bch2_read_extent(c, rbio, k, offset_into_extent, flags); if (flags & BCH_READ_LAST_FRAGMENT) - return; + break; swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); @@ -875,8 +879,12 @@ retry: if (ret == -EINTR) goto retry; - bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); - bio_endio(&rbio->bio); + if (ret) { + bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); + bio_endio(&rbio->bio); + } + + bkey_on_stack_exit(&sk, c); } int bch2_readpages(struct file *file, 
struct address_space *mapping, @@ -1046,6 +1054,18 @@ static void bch2_writepage_io_done(struct closure *cl) } } + if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { + bio_for_each_segment_all(bvec, bio, iter) { + struct bch_page_state *s; + + s = __bch2_page_state(bvec->bv_page); + spin_lock(&s->lock); + for (i = 0; i < PAGE_SECTORS; i++) + s->s[i].nr_replicas = 0; + spin_unlock(&s->lock); + } + } + /* * racing with fallocate can cause us to add fewer sectors than * expected - but we shouldn't add more sectors than expected: @@ -1089,6 +1109,7 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w) * possible, else allocating a new one: */ static void bch2_writepage_io_alloc(struct bch_fs *c, + struct writeback_control *wbc, struct bch_writepage_state *w, struct bch_inode_info *inode, u64 sector, @@ -1113,6 +1134,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op->write_point = writepoint_hashed(inode->ei_last_dirtied); op->pos = POS(inode->v.i_ino, sector); op->wbio.bio.bi_iter.bi_sector = sector; + op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); } static int __bch2_writepage(struct page *page, @@ -1223,7 +1245,7 @@ do_io: bch2_writepage_do_io(w); if (!w->io) - bch2_writepage_io_alloc(c, w, inode, sector, + bch2_writepage_io_alloc(c, wbc, w, inode, sector, nr_replicas_this_write); atomic_inc(&s->write_count); @@ -1240,9 +1262,6 @@ do_io: w->io->op.i_sectors_delta -= dirty_sectors; w->io->op.new_i_size = i_size; - if (wbc->sync_mode == WB_SYNC_ALL) - w->io->op.wbio.bio.bi_opf |= REQ_SYNC; - offset += sectors; } @@ -2382,6 +2401,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; + struct bkey_on_stack copy; struct btree_trans trans; struct btree_iter *src, *dst, *del = NULL; loff_t shift, new_size; @@ -2391,6 +2411,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, if ((offset | len) & (block_bytes(c) - 1)) return -EINVAL; + bkey_on_stack_init(©); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); /* @@ -2459,7 +2480,6 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, while (1) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); - BKEY_PADDED(k) copy; struct bkey_i delete; struct bkey_s_c k; struct bpos next_pos; @@ -2484,34 +2504,35 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) break; reassemble: - bkey_reassemble(©.k, k); + bkey_on_stack_realloc(©, c, k.k->u64s); + bkey_reassemble(copy.k, k); if (insert && bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) { - bch2_cut_front(move_pos, ©.k); - bch2_btree_iter_set_pos(src, bkey_start_pos(©.k.k)); + bch2_cut_front(move_pos, copy.k); + bch2_btree_iter_set_pos(src, bkey_start_pos(©.k->k)); } - copy.k.k.p.offset += shift >> 9; - bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k.k)); + copy.k->k.p.offset += shift >> 9; + bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k->k)); - ret = bch2_extent_atomic_end(dst, ©.k, &atomic_end); + ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end); if (ret) goto bkey_err; - if (bkey_cmp(atomic_end, copy.k.k.p)) { + if (bkey_cmp(atomic_end, copy.k->k.p)) { if (insert) { move_pos = atomic_end; move_pos.offset -= shift >> 9; goto reassemble; } else { - bch2_cut_back(atomic_end, ©.k.k); + bch2_cut_back(atomic_end, copy.k); } } bkey_init(&delete.k); delete.k.p = src->pos; - bch2_key_resize(&delete.k, copy.k.k.size); + bch2_key_resize(&delete.k, 
copy.k->k.size); next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p; @@ -2524,12 +2545,12 @@ reassemble: * by the triggers machinery: */ if (insert && - bkey_cmp(bkey_start_pos(©.k.k), delete.k.p) < 0) { - bch2_cut_back(bkey_start_pos(©.k.k), &delete.k); + bkey_cmp(bkey_start_pos(©.k->k), delete.k.p) < 0) { + bch2_cut_back(bkey_start_pos(©.k->k), &delete); } else if (!insert && - bkey_cmp(copy.k.k.p, + bkey_cmp(copy.k->k.p, bkey_start_pos(&delete.k)) > 0) { - bch2_cut_front(copy.k.k.p, &delete); + bch2_cut_front(copy.k->k.p, &delete); del = bch2_trans_copy_iter(&trans, src); BUG_ON(IS_ERR_OR_NULL(del)); @@ -2538,10 +2559,10 @@ reassemble: bkey_start_pos(&delete.k)); } - bch2_trans_update(&trans, dst, ©.k); + bch2_trans_update(&trans, dst, copy.k); bch2_trans_update(&trans, del ?: src, &delete); - if (copy.k.k.size == k.k->size) { + if (copy.k->k.size == k.k->size) { /* * If we're moving the entire extent, we can skip * running triggers: @@ -2550,10 +2571,10 @@ reassemble: } else { /* We might end up splitting compressed extents: */ unsigned nr_ptrs = - bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(©.k)); + bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k)); ret = bch2_disk_reservation_get(c, &disk_res, - copy.k.k.size, nr_ptrs, + copy.k->k.size, nr_ptrs, BCH_DISK_RESERVATION_NOFAIL); BUG_ON(ret); } @@ -2588,6 +2609,7 @@ bkey_err: } err: bch2_trans_exit(&trans); + bkey_on_stack_exit(©, c); bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); return ret; @@ -2671,11 +2693,11 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, reservation.k.p = k.k->p; reservation.k.size = k.k->size; - bch2_cut_front(iter->pos, &reservation.k_i); - bch2_cut_back(end_pos, &reservation.k); + bch2_cut_front(iter->pos, &reservation.k_i); + bch2_cut_back(end_pos, &reservation.k_i); sectors = reservation.k.size; - reservation.v.nr_replicas = bch2_bkey_nr_dirty_ptrs(k); + reservation.v.nr_replicas = bch2_bkey_nr_ptrs_allocated(k); if (!bkey_extent_is_allocation(k.k)) { ret = bch2_quota_reservation_add(c, inode, @@ -2686,7 +2708,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, } if (reservation.v.nr_replicas < replicas || - bch2_extent_is_compressed(k)) { + bch2_bkey_sectors_compressed(k)) { ret = bch2_disk_reservation_get(c, &disk_res, sectors, replicas, 0); if (unlikely(ret)) diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index cd3540d0..1a0e3942 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "acl.h" +#include "bkey_on_stack.h" #include "btree_update.h" #include "buckets.h" #include "chardev.h" @@ -850,7 +851,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - BKEY_PADDED(k) cur, prev; + struct bkey_on_stack cur, prev; struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); unsigned offset_into_extent, sectors; bool have_extent = false; @@ -859,6 +860,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (start + len < start) return -EINVAL; + bkey_on_stack_init(&cur); + bkey_on_stack_init(&prev); bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, @@ -873,15 +876,17 @@ retry: continue; } - bkey_reassemble(&cur.k, k); - k = bkey_i_to_s_c(&cur.k); + bkey_on_stack_realloc(&cur, c, k.k->u64s); + bkey_on_stack_realloc(&prev, c, k.k->u64s); + bkey_reassemble(cur.k, k); + k = bkey_i_to_s_c(cur.k); offset_into_extent = 
iter->pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; ret = bch2_read_indirect_extent(&trans, - &offset_into_extent, &cur.k); + &offset_into_extent, cur.k); if (ret) break; @@ -891,19 +896,19 @@ retry: bch2_cut_front(POS(k.k->p.inode, bkey_start_offset(k.k) + offset_into_extent), - &cur.k); - bch2_key_resize(&cur.k.k, sectors); - cur.k.k.p = iter->pos; - cur.k.k.p.offset += cur.k.k.size; + cur.k); + bch2_key_resize(&cur.k->k, sectors); + cur.k->k.p = iter->pos; + cur.k->k.p.offset += cur.k->k.size; if (have_extent) { ret = bch2_fill_extent(c, info, - bkey_i_to_s_c(&prev.k), 0); + bkey_i_to_s_c(prev.k), 0); if (ret) break; } - bkey_copy(&prev.k, &cur.k); + bkey_copy(prev.k, cur.k); have_extent = true; if (k.k->type == KEY_TYPE_reflink_v) @@ -916,10 +921,12 @@ retry: goto retry; if (!ret && have_extent) - ret = bch2_fill_extent(c, info, bkey_i_to_s_c(&prev.k), + ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), FIEMAP_EXTENT_LAST); ret = bch2_trans_exit(&trans) ?: ret; + bkey_on_stack_exit(&cur, c); + bkey_on_stack_exit(&prev, c); return ret < 0 ? ret : 0; } diff --git a/libbcachefs/io.c b/libbcachefs/io.c index e3ef662e..ca891b52 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -8,6 +8,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "bkey_on_stack.h" #include "bset.h" #include "btree_update.h" #include "buckets.h" @@ -18,7 +19,7 @@ #include "disk_groups.h" #include "ec.h" #include "error.h" -#include "extents.h" +#include "extent_update.h" #include "inode.h" #include "io.h" #include "journal.h" @@ -191,8 +192,8 @@ static int sum_sector_overwrites(struct btree_trans *trans, for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { if (!may_allocate && - bch2_bkey_nr_ptrs_allocated(old) < - bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new))) { + bch2_bkey_nr_ptrs_fully_allocated(old) < + bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) { ret = -ENOSPC; break; } @@ -334,7 +335,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, /* create the biggest key we can */ bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end, &delete.k); + bch2_cut_back(end, &delete); bch2_trans_begin_updates(trans); @@ -384,12 +385,14 @@ int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, int bch2_write_index_default(struct bch_write_op *op) { struct bch_fs *c = op->c; + struct bkey_on_stack sk; struct keylist *keys = &op->insert_keys; struct bkey_i *k = bch2_keylist_front(keys); struct btree_trans trans; struct btree_iter *iter; int ret; + bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, @@ -397,13 +400,15 @@ int bch2_write_index_default(struct bch_write_op *op) BTREE_ITER_SLOTS|BTREE_ITER_INTENT); do { - BKEY_PADDED(k) tmp; + k = bch2_keylist_front(keys); - bkey_copy(&tmp.k, bch2_keylist_front(keys)); + bkey_on_stack_realloc(&sk, c, k->k.u64s); + bkey_copy(sk.k, k); + bch2_cut_front(iter->pos, sk.k); bch2_trans_begin_updates(&trans); - ret = bch2_extent_update(&trans, iter, &tmp.k, + ret = bch2_extent_update(&trans, iter, sk.k, &op->res, op_journal_seq(op), op->new_i_size, &op->i_sectors_delta); if (ret == -EINTR) @@ -411,13 +416,12 @@ int bch2_write_index_default(struct bch_write_op *op) if (ret) break; - if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0) - bch2_cut_front(iter->pos, bch2_keylist_front(keys)); - else + if (bkey_cmp(iter->pos, k->k.p) >= 0) bch2_keylist_pop_front(keys); } while (!bch2_keylist_empty(keys)); 
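A note on the conversion running through these hunks: fixed-size BKEY_PADDED() stack buffers are replaced by struct bkey_on_stack, which starts out pointing at a small on-stack buffer and falls back to the fs-wide large_bkey_pool when a key is too big for it. A minimal sketch of the idiom, using only helpers that appear in the hunks above; the function name and surrounding details are illustrative, not part of the patch:

/*
 * Illustrative sketch only: take a stable copy of the key at @iter so it can
 * still be used after the btree node is unlocked.
 */
static int copy_current_key(struct btree_iter *iter, struct bch_fs *c)
{
	struct bkey_on_stack sk;
	struct bkey_s_c k;
	int ret;

	bkey_on_stack_init(&sk);		/* sk.k points at the on-stack buffer */

	k = bch2_btree_iter_peek(iter);
	ret = bkey_err(k);
	if (ret)
		goto out;

	bkey_on_stack_realloc(&sk, c, k.k->u64s);	/* switch to the mempool if too large */
	bkey_reassemble(sk.k, k);			/* sk.k is now a private copy */
	k = bkey_i_to_s_c(sk.k);

	/* ... safe to unlock the transaction and keep using k / sk.k here ... */
out:
	bkey_on_stack_exit(&sk, c);		/* frees the mempool allocation, if any */
	return ret;
}

The copy matters because a bkey_s_c returned by an iterator points into the btree node itself, so it cannot be trusted across an unlock; the on-stack/mempool buffer removes the old fixed-size limit on how big such a copied key may be.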
bch2_trans_exit(&trans); + bkey_on_stack_exit(&sk, c); return ret; } @@ -519,16 +523,19 @@ static void __bch2_write_index(struct bch_write_op *op) for (src = keys->keys; src != keys->top; src = n) { n = bkey_next(src); - bkey_copy(dst, src); - bch2_bkey_drop_ptrs(bkey_i_to_s(dst), ptr, - test_bit(ptr->dev, op->failed.d)); + if (bkey_extent_is_direct_data(&src->k)) { + bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, + test_bit(ptr->dev, op->failed.d)); - if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(dst))) { - ret = -EIO; - goto err; + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) { + ret = -EIO; + goto err; + } } + if (dst != src) + memmove_u64s_down(dst, src, src->u64s); dst = bkey_next(dst); } @@ -1086,7 +1093,7 @@ again: bio->bi_end_io = bch2_write_endio; bio->bi_private = &op->cl; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_opf |= REQ_OP_WRITE; if (!skip_put) closure_get(bio->bi_private); @@ -1123,6 +1130,47 @@ flush_io: goto again; } +static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) +{ + struct closure *cl = &op->cl; + struct bio *bio = &op->wbio.bio; + struct bvec_iter iter; + struct bkey_i_inline_data *id; + unsigned sectors; + int ret; + + ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_U64s + DIV_ROUND_UP(data_len, 8)); + if (ret) { + op->error = ret; + goto err; + } + + sectors = bio_sectors(bio); + op->pos.offset += sectors; + + id = bkey_inline_data_init(op->insert_keys.top); + id->k.p = op->pos; + id->k.version = op->version; + id->k.size = sectors; + + iter = bio->bi_iter; + iter.bi_size = data_len; + memcpy_from_bio(id->v.data, bio, iter); + + while (data_len & 7) + id->v.data[data_len++] = '\0'; + set_bkey_val_bytes(&id->k, data_len); + bch2_keylist_push(&op->insert_keys); + + op->flags |= BCH_WRITE_WROTE_DATA_INLINE; + continue_at_nobarrier(cl, bch2_write_index, NULL); + return; +err: + bch2_write_done(&op->cl); +} + /** * bch_write - handle a write to a cache device or flash only volume * @@ -1144,22 +1192,22 @@ void bch2_write(struct closure *cl) struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bio *bio = &op->wbio.bio; struct bch_fs *c = op->c; + unsigned data_len; BUG_ON(!op->nr_replicas); BUG_ON(!op->write_point.v); BUG_ON(!bkey_cmp(op->pos, POS_MAX)); + op->start_time = local_clock(); + bch2_keylist_init(&op->insert_keys, op->inline_keys); + wbio_init(bio)->put_bio = false; + if (bio_sectors(bio) & (c->opts.block_size - 1)) { __bcache_io_error(c, "misaligned write"); op->error = -EIO; goto err; } - op->start_time = local_clock(); - - bch2_keylist_init(&op->insert_keys, op->inline_keys); - wbio_init(bio)->put_bio = false; - if (c->opts.nochanges || !percpu_ref_tryget(&c->writes)) { __bcache_io_error(c, "read only"); @@ -1169,12 +1217,25 @@ void bch2_write(struct closure *cl) bch2_increment_clock(c, bio_sectors(bio), WRITE); + data_len = min_t(u64, bio->bi_iter.bi_size, + op->new_i_size - (op->pos.offset << 9)); + + if (data_len <= min(block_bytes(c) / 2, 1024U)) { + bch2_write_data_inline(op, data_len); + return; + } + continue_at_nobarrier(cl, __bch2_write, NULL); return; err: if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) bch2_disk_reservation_put(c, &op->res); - closure_return(cl); + if (op->end_io) + op->end_io(op); + if (cl->parent) + closure_return(cl); + else + closure_debug_destroy(cl); } /* Cache promotion on read */ @@ -1456,13 +1517,14 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio { struct btree_trans trans; struct 
btree_iter *iter; - BKEY_PADDED(k) tmp; + struct bkey_on_stack sk; struct bkey_s_c k; int ret; flags &= ~BCH_READ_LAST_FRAGMENT; flags |= BCH_READ_MUST_CLONE; + bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, @@ -1474,11 +1536,12 @@ retry: if (bkey_err(k)) goto err; - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); + bkey_on_stack_realloc(&sk, c, k.k->u64s); + bkey_reassemble(sk.k, k); + k = bkey_i_to_s_c(sk.k); bch2_trans_unlock(&trans); - if (!bch2_bkey_matches_ptr(c, bkey_i_to_s_c(&tmp.k), + if (!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, rbio->pos.offset - rbio->pick.crc.offset)) { @@ -1495,6 +1558,7 @@ retry: out: bch2_rbio_done(rbio); bch2_trans_exit(&trans); + bkey_on_stack_exit(&sk, c); return; err: rbio->bio.bi_status = BLK_STS_IOERR; @@ -1507,12 +1571,14 @@ static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, { struct btree_trans trans; struct btree_iter *iter; + struct bkey_on_stack sk; struct bkey_s_c k; int ret; flags &= ~BCH_READ_LAST_FRAGMENT; flags |= BCH_READ_MUST_CLONE; + bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); @@ -1520,18 +1586,18 @@ retry: for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode, bvec_iter.bi_sector), BTREE_ITER_SLOTS, k, ret) { - BKEY_PADDED(k) tmp; unsigned bytes, sectors, offset_into_extent; - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); + bkey_on_stack_realloc(&sk, c, k.k->u64s); + bkey_reassemble(sk.k, k); + k = bkey_i_to_s_c(sk.k); offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; ret = bch2_read_indirect_extent(&trans, - &offset_into_extent, &tmp.k); + &offset_into_extent, sk.k); if (ret) break; @@ -1570,6 +1636,7 @@ err: rbio->bio.bi_status = BLK_STS_IOERR; out: bch2_trans_exit(&trans); + bkey_on_stack_exit(&sk, c); bch2_rbio_done(rbio); } @@ -1626,7 +1693,7 @@ static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - BKEY_PADDED(k) new; + struct bkey_on_stack new; struct bch_extent_crc_unpacked new_crc; u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; int ret; @@ -1634,6 +1701,7 @@ static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) if (rbio->pick.crc.compression_type) return; + bkey_on_stack_init(&new); bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); @@ -1644,8 +1712,9 @@ retry: if (IS_ERR_OR_NULL(k.k)) goto out; - bkey_reassemble(&new.k, k); - k = bkey_i_to_s_c(&new.k); + bkey_on_stack_realloc(&new, c, k.k->u64s); + bkey_reassemble(new.k, k); + k = bkey_i_to_s_c(new.k); if (bversion_cmp(k.k->version, rbio->version) || !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) @@ -1664,10 +1733,10 @@ retry: goto out; } - if (!bch2_bkey_narrow_crcs(&new.k, new_crc)) + if (!bch2_bkey_narrow_crcs(new.k, new_crc)) goto out; - bch2_trans_update(&trans, iter, &new.k); + bch2_trans_update(&trans, iter, new.k); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| @@ -1676,6 +1745,7 @@ retry: goto retry; out: bch2_trans_exit(&trans); + bkey_on_stack_exit(&new, c); } /* Inner part that may run in process context */ @@ -1872,6 +1942,19 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, struct bpos pos = bkey_start_pos(k.k); int pick_ret; + if (k.k->type == KEY_TYPE_inline_data) { + struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); + unsigned bytes = min_t(unsigned, 
iter.bi_size, + bkey_val_bytes(d.k)); + + swap(iter.bi_size, bytes); + memcpy_to_bio(&orig->bio, iter, d.v->data); + swap(iter.bi_size, bytes); + bio_advance_iter(&orig->bio, &iter, bytes); + zero_fill_bio_iter(&orig->bio, iter); + goto out_read_done; + } + pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); /* hole or reservation - just zero fill: */ @@ -2100,6 +2183,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) { struct btree_trans trans; struct btree_iter *iter; + struct bkey_on_stack sk; struct bkey_s_c k; unsigned flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE| @@ -2113,6 +2197,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) rbio->c = c; rbio->start_time = local_clock(); + bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); @@ -2121,7 +2206,6 @@ retry: POS(inode, rbio->bio.bi_iter.bi_sector), BTREE_ITER_SLOTS); while (1) { - BKEY_PADDED(k) tmp; unsigned bytes, sectors, offset_into_extent; bch2_btree_iter_set_pos(iter, @@ -2132,15 +2216,16 @@ retry: if (ret) goto err; - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); - offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; + bkey_on_stack_realloc(&sk, c, k.k->u64s); + bkey_reassemble(sk.k, k); + k = bkey_i_to_s_c(sk.k); + ret = bch2_read_indirect_extent(&trans, - &offset_into_extent, &tmp.k); + &offset_into_extent, sk.k); if (ret) goto err; @@ -2172,6 +2257,7 @@ retry: } out: bch2_trans_exit(&trans); + bkey_on_stack_exit(&sk, c); return; err: if (ret == -EINTR) diff --git a/libbcachefs/io.h b/libbcachefs/io.h index 91aaa58f..45c95094 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -30,10 +30,11 @@ enum bch_write_flags { BCH_WRITE_PAGES_OWNED = (1 << 5), BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), BCH_WRITE_NOPUT_RESERVATION = (1 << 7), + BCH_WRITE_WROTE_DATA_INLINE = (1 << 8), /* Internal: */ - BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 8), - BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 9), + BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9), + BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 10), }; static inline u64 *op_journal_seq(struct bch_write_op *op) diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 5c3e146e..9f03a479 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -945,7 +945,7 @@ static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) w = j->buf + !state.idx; ret = state.prev_buf_unwritten && - bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), dev_idx); + bch2_bkey_has_device(bkey_i_to_s_c(&w->key), dev_idx); spin_unlock(&j->lock); return ret; diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 387377da..7112a25d 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -1100,7 +1100,7 @@ void bch2_journal_write(struct closure *cl) for_each_rw_member(ca, c, i) if (journal_flushes_device(ca) && - !bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), i)) { + !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) { percpu_ref_get(&ca->io_ref); bio = ca->journal.bio; diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index de8522f7..4dacbd63 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -4,6 +4,7 @@ */ #include "bcachefs.h" +#include "bkey_on_stack.h" #include "btree_update.h" #include "btree_update_interior.h" #include "buckets.h" @@ -40,9 +41,10 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags struct btree_trans trans; struct btree_iter *iter; 
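The io.c hunks above also add a read-side fast path for the new KEY_TYPE_inline_data extents, whose payload lives in the bkey value itself rather than behind pointers. A sketch of that idiom, reusing the same bvec_iter size-swap trick as the __bch2_read_extent() hunk; the standalone function is illustrative, not part of the patch:

/*
 * Illustrative sketch only: serve a read of an inline_data extent straight
 * from the key's value, zeroing whatever part of the request it doesn't cover.
 */
static void read_from_inline_data(struct bio *bio, struct bvec_iter iter,
				  struct bkey_s_c_inline_data d)
{
	unsigned bytes = min_t(unsigned, iter.bi_size, bkey_val_bytes(d.k));

	/* shrink the iter so memcpy_to_bio() copies only the inline payload: */
	swap(iter.bi_size, bytes);
	memcpy_to_bio(bio, iter, d.v->data);
	swap(iter.bi_size, bytes);

	/* the rest of the request reads back as zeroes: */
	bio_advance_iter(bio, &iter, bytes);
	zero_fill_bio_iter(bio, iter);
}

On the write side, bch2_write() only takes this path for small writes (data_len no larger than min(block_bytes(c) / 2, 1024) per the hunk above), so an inline_data value always fits comfortably in a single bkey.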
struct bkey_s_c k; - BKEY_PADDED(key) tmp; + struct bkey_on_stack sk; int ret = 0; + bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, @@ -58,9 +60,10 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags continue; } - bkey_reassemble(&tmp.key, k); + bkey_on_stack_realloc(&sk, c, k.k->u64s); + bkey_reassemble(sk.k, k); - ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.key), + ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k), dev_idx, flags, false); if (ret) break; @@ -70,11 +73,11 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags * will do the appropriate thing with it (turning it into a * KEY_TYPE_error key, or just a discard if it was a cached extent) */ - bch2_extent_normalize(c, bkey_i_to_s(&tmp.key)); + bch2_extent_normalize(c, bkey_i_to_s(sk.k)); - bch2_btree_iter_set_pos(iter, bkey_start_pos(&tmp.key.k)); + bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); - bch2_trans_update(&trans, iter, &tmp.key); + bch2_trans_update(&trans, iter, sk.k); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| @@ -92,6 +95,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags } ret = bch2_trans_exit(&trans) ?: ret; + bkey_on_stack_exit(&sk, c); BUG_ON(ret == -EINTR); diff --git a/libbcachefs/move.c b/libbcachefs/move.c index ab20e981..acdc1730 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "bkey_on_stack.h" #include "btree_gc.h" #include "btree_update.h" #include "btree_update_interior.h" @@ -96,10 +97,11 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bkey_copy(&_new.k, bch2_keylist_front(keys)); new = bkey_i_to_extent(&_new.k); + bch2_cut_front(iter->pos, &new->k_i); - bch2_cut_front(iter->pos, insert); - bch2_cut_back(new->k.p, &insert->k); - bch2_cut_back(insert->k.p, &new->k); + bch2_cut_front(iter->pos, insert); + bch2_cut_back(new->k.p, insert); + bch2_cut_back(insert->k.p, &new->k_i); if (m->data_cmd == DATA_REWRITE) bch2_bkey_drop_device(bkey_i_to_s(insert), @@ -133,11 +135,11 @@ static int bch2_migrate_index_update(struct bch_write_op *op) * If we're not fully overwriting @k, and it's compressed, we * need a reservation for all the pointers in @insert */ - nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(insert)) - + nr = bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(insert)) - m->nr_ptrs_reserved; if (insert->k.size < k.k->size && - bch2_extent_is_compressed(k) && + bch2_bkey_sectors_compressed(k) && nr > 0) { ret = bch2_disk_reservation_add(c, &op->res, keylist_sectors(keys) * nr, 0); @@ -168,8 +170,6 @@ next: if (bch2_keylist_empty(keys)) goto out; } - - bch2_cut_front(iter->pos, bch2_keylist_front(keys)); continue; nomatch: if (m->ctxt) @@ -251,7 +251,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, */ #if 0 int nr = (int) io_opts.data_replicas - - bch2_bkey_nr_dirty_ptrs(k); + bch2_bkey_nr_ptrs_allocated(k); #endif int nr = (int) io_opts.data_replicas; @@ -490,7 +490,7 @@ static int __bch2_move_data(struct bch_fs *c, { bool kthread = (current->flags & PF_KTHREAD) != 0; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); - BKEY_PADDED(k) tmp; + struct bkey_on_stack sk; struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; @@ -499,6 +499,7 @@ static int __bch2_move_data(struct bch_fs *c, u64 delay, cur_inum = U64_MAX; int ret = 0, ret2; + bkey_on_stack_init(&sk); 
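One detail worth noting in the move.c and migrate.c hunks above: bch2_cut_back() is now called with the whole struct bkey_i (as bch2_cut_front() already was) instead of a bare struct bkey, presumably so that trimming an extent can adjust the value, for example the new inline_data payloads, and not just the key's size. A minimal sketch of the new calling convention; the function name is illustrative, not part of the patch:

/* Illustrative sketch only: clamp a copied extent to [start, end). */
static void clamp_extent(struct bkey_i *k, struct bpos start, struct bpos end)
{
	/* both helpers now take the bkey_i itself, not &k->k: */
	bch2_cut_front(start, k);	/* drop the part before @start */
	bch2_cut_back(end, k);		/* drop the part at or after @end */
}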
bch2_trans_init(&trans, c, 0, 0); stats->data_type = BCH_DATA_USER; @@ -578,8 +579,9 @@ peek: } /* unlock before doing IO: */ - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); + bkey_on_stack_realloc(&sk, c, k.k->u64s); + bkey_reassemble(sk.k, k); + k = bkey_i_to_s_c(sk.k); bch2_trans_unlock(&trans); ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k, @@ -598,7 +600,7 @@ peek: if (rate) bch2_ratelimit_increment(rate, k.k->size); next: - atomic64_add(k.k->size * bch2_bkey_nr_dirty_ptrs(k), + atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k), &stats->sectors_seen); next_nondata: bch2_btree_iter_next(iter); @@ -606,6 +608,7 @@ next_nondata: } out: ret = bch2_trans_exit(&trans) ?: ret; + bkey_on_stack_exit(&sk, c); return ret; } diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index d1184bf6..d4002b7f 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -177,7 +177,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) if ((cmp_int(i[0].journal_seq, i[1].journal_seq) ?: cmp_int(i[0].journal_offset, i[1].journal_offset)) < 0) { if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) { - bch2_cut_back(bkey_start_pos(&i[1].k->k), &i[0].k->k); + bch2_cut_back(bkey_start_pos(&i[1].k->k), i[0].k); } else { struct bkey_i *split = kmalloc(bkey_bytes(i[0].k), GFP_KERNEL); @@ -186,7 +186,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) goto err; bkey_copy(split, i[0].k); - bch2_cut_back(bkey_start_pos(&i[1].k->k), &split->k); + bch2_cut_back(bkey_start_pos(&i[1].k->k), split); keys_deduped.d[keys_deduped.nr++] = (struct journal_key) { .btree_id = i[0].btree_id, .allocated = true, @@ -254,7 +254,7 @@ static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id, * Some extents aren't equivalent - w.r.t. 
what the triggers do * - if they're split: */ - bool remark_if_split = bch2_extent_is_compressed(bkey_i_to_s_c(k)) || + bool remark_if_split = bch2_bkey_sectors_compressed(bkey_i_to_s_c(k)) || k->k.type == KEY_TYPE_reflink_p; bool remark = false; int ret; @@ -289,7 +289,7 @@ retry: bkey_cmp(atomic_end, k->k.p) < 0) { ret = bch2_disk_reservation_add(c, &disk_res, k->k.size * - bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(k)), + bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(k)), BCH_DISK_RESERVATION_NOFAIL); BUG_ON(ret); @@ -298,7 +298,7 @@ retry: bkey_copy(split, k); bch2_cut_front(split_iter->pos, split); - bch2_cut_back(atomic_end, &split->k); + bch2_cut_back(atomic_end, split); bch2_trans_update(&trans, split_iter, split); bch2_btree_iter_set_pos(iter, split->k.p); @@ -913,6 +913,12 @@ int bch2_fs_recovery(struct bch_fs *c) write_sb = true; } + if (!(c->sb.features & (1ULL << BCH_FEATURE_INLINE_DATA))) { + c->disk_sb.sb->features[0] |= + cpu_to_le64(1ULL << BCH_FEATURE_INLINE_DATA); + write_sb = true; + } + if (!test_bit(BCH_FS_ERROR, &c->flags)) { c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; write_sb = true; diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c index 6e71c5e8..4de65bf7 100644 --- a/libbcachefs/reflink.c +++ b/libbcachefs/reflink.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_on_stack.h" #include "btree_update.h" #include "extents.h" #include "inode.h" @@ -39,7 +40,7 @@ enum merge_result bch2_reflink_p_merge(struct bch_fs *c, if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { bch2_key_resize(l.k, KEY_SIZE_MAX); - __bch2_cut_front(l.k->p, _r); + bch2_cut_front_s(l.k->p, _r); return BCH_MERGE_PARTIAL; } @@ -160,7 +161,8 @@ s64 bch2_remap_range(struct bch_fs *c, struct btree_trans trans; struct btree_iter *dst_iter, *src_iter; struct bkey_s_c src_k; - BKEY_PADDED(k) new_dst, new_src; + BKEY_PADDED(k) new_dst; + struct bkey_on_stack new_src; struct bpos dst_end = dst_start, src_end = src_start; struct bpos dst_want, src_want; u64 src_done, dst_done; @@ -183,6 +185,7 @@ s64 bch2_remap_range(struct bch_fs *c, dst_end.offset += remap_sectors; src_end.offset += remap_sectors; + bkey_on_stack_init(&new_src); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start, @@ -222,14 +225,15 @@ s64 bch2_remap_range(struct bch_fs *c, break; if (src_k.k->type == KEY_TYPE_extent) { - bkey_reassemble(&new_src.k, src_k); - src_k = bkey_i_to_s_c(&new_src.k); + bkey_on_stack_realloc(&new_src, c, src_k.k->u64s); + bkey_reassemble(new_src.k, src_k); + src_k = bkey_i_to_s_c(new_src.k); - bch2_cut_front(src_iter->pos, &new_src.k); - bch2_cut_back(src_end, &new_src.k.k); + bch2_cut_front(src_iter->pos, new_src.k); + bch2_cut_back(src_end, new_src.k); ret = bch2_make_extent_indirect(&trans, src_iter, - bkey_i_to_extent(&new_src.k)); + bkey_i_to_extent(new_src.k)); if (ret) goto btree_err; @@ -299,6 +303,7 @@ err: } while (ret2 == -EINTR); ret = bch2_trans_exit(&trans) ?: ret; + bkey_on_stack_exit(&new_src, c); percpu_ref_put(&c->writes); diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 4145832f..ac2f31e3 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -506,6 +506,7 @@ static void bch2_fs_free(struct bch_fs *c) free_percpu(c->usage[0]); kfree(c->usage_base); free_percpu(c->pcpu); + mempool_exit(&c->large_bkey_pool); mempool_exit(&c->btree_bounce_pool); bioset_exit(&c->btree_bio); mempool_exit(&c->btree_interior_update_pool); @@ -758,6 +759,7 @@ static struct 
bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || + mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || bch2_io_clock_init(&c->io_clock[READ]) || bch2_io_clock_init(&c->io_clock[WRITE]) || bch2_fs_journal_init(&c->journal) || diff --git a/libbcachefs/util.c b/libbcachefs/util.c index 2cc433ec..e69d03d1 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -550,7 +550,7 @@ size_t bch2_rand_range(size_t max) return rand; } -void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, void *src) +void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) { struct bio_vec bv; struct bvec_iter iter; diff --git a/libbcachefs/util.h b/libbcachefs/util.h index 8e704b4a..0128daba 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -547,7 +547,7 @@ do { \ size_t bch2_rand_range(size_t); -void memcpy_to_bio(struct bio *, struct bvec_iter, void *); +void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); void memcpy_from_bio(void *, struct bio *, struct bvec_iter); static inline void memcpy_u64s_small(void *dst, const void *src,