diff --git a/.bcachefs_revision b/.bcachefs_revision
index fba12e90..573681c5 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-d763e8ab17ff1f5bdd9c5474ac15eb8791d31582
+3cd63315a62c8e2da15aa9fd59fa74ef40c9dc14
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index f6141fde..3b5e70a7 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -1294,7 +1294,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE,	struct bch_sb, flags[3],  0, 16);
 	x(reflink,			6)	\
 	x(new_siphash,			7)	\
 	x(inline_data,			8)	\
-	x(new_extent_overwrite,		9)
+	x(new_extent_overwrite,		9)	\
+	x(incompressible,		10)
 
 enum bch_sb_feature {
 #define x(f, n) BCH_FEATURE_##f,
@@ -1374,11 +1375,12 @@ enum bch_csum_opts {
 };
 
 #define BCH_COMPRESSION_TYPES()		\
-	x(none,		0)		\
-	x(lz4_old,	1)		\
-	x(gzip,		2)		\
-	x(lz4,		3)		\
-	x(zstd,		4)
+	x(none,			0)	\
+	x(lz4_old,		1)	\
+	x(gzip,			2)	\
+	x(lz4,			3)	\
+	x(zstd,			4)	\
+	x(incompressible,	5)
 
 enum bch_compression_type {
 #define x(t, n) BCH_COMPRESSION_TYPE_##t,
diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c
index 18f84201..7cbb5704 100644
--- a/libbcachefs/bkey_sort.c
+++ b/libbcachefs/bkey_sort.c
@@ -210,28 +210,38 @@ bch2_sort_repack_merge(struct bch_fs *c,
 		       bool filter_whiteouts)
 {
 	struct bkey_packed *prev = NULL, *k_packed;
-	struct bkey_s k;
+	struct bkey_on_stack k;
 	struct btree_nr_keys nr;
-	struct bkey unpacked;
 
 	memset(&nr, 0, sizeof(nr));
+	bkey_on_stack_init(&k);
 
 	while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) {
 		if (filter_whiteouts && bkey_whiteout(k_packed))
 			continue;
 
-		k = __bkey_disassemble(src, k_packed, &unpacked);
+		/*
+		 * NOTE:
+		 * bch2_bkey_normalize may modify the key we pass it (dropping
+		 * stale pointers) and we don't have a write lock on the src
+		 * node; we have to make a copy of the entire key before calling
+		 * normalize
+		 */
+		bkey_on_stack_realloc(&k, c, k_packed->u64s + BKEY_U64s);
+		bch2_bkey_unpack(src, k.k, k_packed);
 
 		if (filter_whiteouts &&
-		    bch2_bkey_normalize(c, k))
+		    bch2_bkey_normalize(c, bkey_i_to_s(k.k)))
 			continue;
 
-		extent_sort_append(c, out_f, &nr, vstruct_last(dst), &prev, k);
+		extent_sort_append(c, out_f, &nr, vstruct_last(dst),
+				   &prev, bkey_i_to_s(k.k));
 	}
 
 	extent_sort_advance_prev(out_f, &nr, vstruct_last(dst), &prev);
 
 	dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
+	bkey_on_stack_exit(&k, c);
 	return nr;
 }
 
@@ -254,23 +264,18 @@ unsigned bch2_sort_keys(struct bkey_packed *dst,
 	sort_iter_sort(iter, sort_keys_cmp);
 
 	while ((in = sort_iter_next(iter, sort_keys_cmp))) {
+		bool needs_whiteout = false;
+
 		if (bkey_whiteout(in) &&
 		    (filter_whiteouts || !in->needs_whiteout))
 			continue;
 
-		if (bkey_whiteout(in) &&
-		    (next = sort_iter_peek(iter)) &&
-		    !bkey_cmp_packed(iter->b, in, next)) {
+		while ((next = sort_iter_peek(iter)) &&
+		       !bkey_cmp_packed(iter->b, in, next)) {
 			BUG_ON(in->needs_whiteout &&
 			       next->needs_whiteout);
-			/*
-			 * XXX racy, called with read lock from write path
-			 *
-			 * leads to spurious BUG_ON() in bkey_unpack_key() in
-			 * debug mode
-			 */
-			next->needs_whiteout |= in->needs_whiteout;
-			continue;
+			needs_whiteout |= in->needs_whiteout;
+			in = sort_iter_next(iter, sort_keys_cmp);
 		}
 
 		if (bkey_whiteout(in)) {
@@ -279,6 +284,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst,
 		} else {
 			bkey_copy(out, in);
 		}
+		out->needs_whiteout |= needs_whiteout;
 
 		out = bkey_next(out);
 	}
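The bcachefs_format.h hunks above grow two x-macro lists: BCH_SB_FEATURES() gains the incompressible feature bit and BCH_COMPRESSION_TYPES() the matching on-disk compression type. For readers new to the idiom, here is a minimal, self-contained sketch (shortened names, not the real header) of how a single list expands into both an enum and a name table:

#include <stdio.h>

/*
 * One list, expanded twice: once for enum constants, once for strings.
 * Mirrors the BCH_COMPRESSION_TYPES()/enum bch_compression_type pair.
 */
#define COMPRESSION_TYPES()		\
	x(none,			0)	\
	x(lz4_old,		1)	\
	x(gzip,			2)	\
	x(lz4,			3)	\
	x(zstd,			4)	\
	x(incompressible,	5)

enum compression_type {
#define x(t, n) COMPRESSION_TYPE_##t = n,
	COMPRESSION_TYPES()
#undef x
};

static const char * const compression_type_names[] = {
#define x(t, n) [n] = #t,
	COMPRESSION_TYPES()
#undef x
};

int main(void)
{
	printf("%d = %s\n", COMPRESSION_TYPE_incompressible,
	       compression_type_names[COMPRESSION_TYPE_incompressible]);
	return 0;
}

Adding one x(...) entry updates every expansion site at once, which is why the new type only needs the one-line hunks above.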
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index d6792d27..cf8fa59f 100644
--- a/libbcachefs/bset.c
+++ b/libbcachefs/bset.c
@@ -1397,21 +1397,21 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b,
 {
 	if (lossy_packed_search)
 		while (m != btree_bkey_last(b, t) &&
-		       bkey_iter_cmp_p_or_unp(b, search, lossy_packed_search,
-					      m) > 0)
+		       bkey_iter_cmp_p_or_unp(b, m,
+					lossy_packed_search, search) < 0)
 			m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
 
 	if (!packed_search)
 		while (m != btree_bkey_last(b, t) &&
-		       bkey_iter_pos_cmp(b, search, m) > 0)
+		       bkey_iter_pos_cmp(b, m, search) < 0)
 			m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
 
 	if (btree_keys_expensive_checks(b)) {
 		struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
 
 		BUG_ON(prev &&
-		       bkey_iter_cmp_p_or_unp(b, search, packed_search,
-					      prev) <= 0);
+		       bkey_iter_cmp_p_or_unp(b, prev,
+					packed_search, search) >= 0);
 	}
 
 	return m;
diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h
index 2653a74b..7338ccbc 100644
--- a/libbcachefs/bset.h
+++ b/libbcachefs/bset.h
@@ -199,12 +199,6 @@ __bkey_unpack_key_format_checked(const struct btree *b,
 	if (btree_keys_expensive_checks(b)) {
 		struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
 
-		/*
-		 * hack around a harmless race when compacting whiteouts
-		 * for a write:
-		 */
-		dst2.needs_whiteout = dst->needs_whiteout;
-
 		BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
 	}
 }
@@ -360,7 +354,7 @@ void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned);
 static inline int bkey_cmp_p_or_unp(const struct btree *b,
 				    const struct bkey_packed *l,
 				    const struct bkey_packed *r_packed,
-				    struct bpos *r)
+				    const struct bpos *r)
 {
 	EBUG_ON(r_packed && !bkey_packed(r_packed));
 
@@ -449,7 +443,7 @@ static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter)
  * XXX: only need to compare pointers for keys that are both within a
  * btree_node_iterator - we need to break ties for prev() to work correctly
  */
-static inline int bkey_iter_cmp(struct btree *b,
+static inline int bkey_iter_cmp(const struct btree *b,
 				const struct bkey_packed *l,
 				const struct bkey_packed *r)
 {
@@ -458,7 +452,7 @@ static inline int bkey_iter_cmp(struct btree *b,
 	       ?: cmp_int(l, r);
 }
 
-static inline int btree_node_iter_cmp(struct btree *b,
+static inline int btree_node_iter_cmp(const struct btree *b,
 				      struct btree_node_iter_set l,
 				      struct btree_node_iter_set r)
 {
@@ -467,22 +461,22 @@ static inline int btree_node_iter_cmp(struct btree *b,
 				__btree_node_offset_to_key(b, r.k));
 }
 
-/* These assume l (the search key) is not a deleted key: */
-static inline int bkey_iter_pos_cmp(struct btree *b,
-			struct bpos *l,
-			const struct bkey_packed *r)
+/* These assume r (the search key) is not a deleted key: */
+static inline int bkey_iter_pos_cmp(const struct btree *b,
+			const struct bkey_packed *l,
+			const struct bpos *r)
 {
-	return -bkey_cmp_left_packed(b, r, l)
-		?: (int) bkey_deleted(r);
+	return bkey_cmp_left_packed(b, l, r)
+		?: -((int) bkey_deleted(l));
 }
 
-static inline int bkey_iter_cmp_p_or_unp(struct btree *b,
-			struct bpos *l,
-			const struct bkey_packed *l_packed,
-			const struct bkey_packed *r)
+static inline int bkey_iter_cmp_p_or_unp(const struct btree *b,
+			const struct bkey_packed *l,
+			const struct bkey_packed *r_packed,
+			const struct bpos *r)
 {
-	return -bkey_cmp_p_or_unp(b, r, l_packed, l)
-		?: (int) bkey_deleted(r);
+	return bkey_cmp_p_or_unp(b, l, r_packed, r)
+		?: -((int) bkey_deleted(l));
 }
 
 static inline struct bkey_packed *
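The bset.h changes flip the comparator convention: the packed key is now the left argument and the search key the right one, with constness added throughout. The interesting part is the two-level comparison: position first, then a tiebreak so a deleted key at the same position sorts before the (never-deleted) search key. A toy version of that shape, using the same GCC a ?: b extension the header relies on (simplified stand-in types, compile with gcc):

#include <stdio.h>

struct demo_key {
	unsigned long long	pos;
	int			deleted;
};

/* three-way compare, like cmp_int() in the bcachefs sources */
static int cmp_ull(unsigned long long l, unsigned long long r)
{
	return (l > r) - (l < r);
}

/* position first; at equal position a deleted key sorts first */
static int demo_iter_pos_cmp(const struct demo_key *l, unsigned long long r)
{
	return cmp_ull(l->pos, r) ?: -l->deleted;
}

int main(void)
{
	struct demo_key live = { .pos = 7, .deleted = 0 };
	struct demo_key dead = { .pos = 7, .deleted = 1 };

	printf("live %d, dead %d\n",
	       demo_iter_pos_cmp(&live, 7),	/* 0: ties with search  */
	       demo_iter_pos_cmp(&dead, 7));	/* -1: sorts before it  */
	return 0;
}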
diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h
index c5873c58..83358d6a 100644
--- a/libbcachefs/btree_cache.h
+++ b/libbcachefs/btree_cache.h
@@ -75,7 +75,7 @@ static inline unsigned btree_blocks(struct bch_fs *c)
 	return c->opts.btree_node_size >> c->block_bits;
 }
 
-#define BTREE_SPLIT_THRESHOLD(c)		(btree_blocks(c) * 3 / 4)
+#define BTREE_SPLIT_THRESHOLD(c)		(btree_max_u64s(c) * 3 / 4)
 
 #define BTREE_FOREGROUND_MERGE_THRESHOLD(c)	(btree_max_u64s(c) * 1 / 3)
 #define BTREE_FOREGROUND_MERGE_HYSTERESIS(c)			\
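In the btree_iter.c diff below, the per-comparison extent special case (__btree_iter_pos_cmp) disappears in favour of computing one search key up front: for extent iterators, btree_iter_search_key() searches for the successor of iter->pos, so plain comparisons land on the first key strictly after the position. A simplified model of the successor step (loosely after bkey_successor(); the real struct bpos and its overflow handling live in the bcachefs headers):

#include <stdio.h>

struct bpos_sketch {
	unsigned long long inode;
	unsigned long long offset;
};

/* next position in (inode, offset) order, carrying into inode on wrap */
static struct bpos_sketch bpos_successor(struct bpos_sketch p)
{
	if (++p.offset == 0)
		++p.inode;
	return p;
}

int main(void)
{
	struct bpos_sketch p = { .inode = 1, .offset = ~0ULL };
	struct bpos_sketch s = bpos_successor(p);

	printf("%llu:%llu -> %llu:%llu\n", p.inode, p.offset, s.inode, s.offset);
	return 0;
}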
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 988c550c..ea0555b8 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -11,10 +11,6 @@
 #include <linux/prefetch.h>
 #include <trace/events/bcachefs.h>
 
-static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *,
-						    struct btree_iter_level *,
-						    struct bkey *);
-
 #define BTREE_ITER_NO_NODE_GET_LOCKS	((struct btree *) 1)
 #define BTREE_ITER_NO_NODE_DROP		((struct btree *) 2)
 #define BTREE_ITER_NO_NODE_LOCK_ROOT	((struct btree *) 3)
@@ -29,37 +25,14 @@ static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
 		(unsigned long) iter->l[l].b >= 128;
 }
 
-/* Returns < 0 if @k is before iter pos, > 0 if @k is after */
-static inline int __btree_iter_pos_cmp(struct btree_iter *iter,
-				       const struct btree *b,
-				       const struct bkey_packed *k,
-				       bool interior_node)
+static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
 {
-	int cmp = bkey_cmp_left_packed(b, k, &iter->pos);
+	struct bpos pos = iter->pos;
 
-	if (cmp)
-		return cmp;
-	if (bkey_deleted(k))
-		return -1;
-
-	/*
-	 * Normally, for extents we want the first key strictly greater than
-	 * the iterator position - with the exception that for interior nodes,
-	 * we don't want to advance past the last key if the iterator position
-	 * is POS_MAX:
-	 */
-	if (iter->flags & BTREE_ITER_IS_EXTENTS &&
-	    (!interior_node ||
-	     bkey_cmp_left_packed_byval(b, k, POS_MAX)))
-		return -1;
-	return 1;
-}
-
-static inline int btree_iter_pos_cmp(struct btree_iter *iter,
-				     const struct btree *b,
-				     const struct bkey_packed *k)
-{
-	return __btree_iter_pos_cmp(iter, b, k, b->level != 0);
+	if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+	    bkey_cmp(pos, POS_MAX))
+		pos = bkey_successor(pos);
+	return pos;
 }
 
 /* Btree node locking: */
@@ -415,6 +388,7 @@ void bch2_trans_unlock(struct btree_trans *trans)
 static void __bch2_btree_iter_verify(struct btree_iter *iter,
 				     struct btree *b)
 {
+	struct bpos pos = btree_iter_search_key(iter);
 	struct btree_iter_level *l = &iter->l[b->level];
 	struct btree_node_iter tmp = l->iter;
 	struct bkey_packed *k;
@@ -437,17 +411,17 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter,
 	k = b->level || iter->flags & BTREE_ITER_IS_EXTENTS
 		? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard)
 		: bch2_btree_node_iter_prev_all(&tmp, b);
-	if (k && btree_iter_pos_cmp(iter, b, k) > 0) {
+	if (k && bkey_iter_pos_cmp(b, k, &pos) >= 0) {
 		char buf[100];
 		struct bkey uk = bkey_unpack_key(b, k);
 
 		bch2_bkey_to_text(&PBUF(buf), &uk);
-		panic("prev key should be before iter pos:\n%s\n%llu:%llu\n",
+		panic("iterator should be before prev key:\n%s\n%llu:%llu\n",
 		      buf, iter->pos.inode, iter->pos.offset);
 	}
 
 	k = bch2_btree_node_iter_peek_all(&l->iter, b);
-	if (k && btree_iter_pos_cmp(iter, b, k) < 0) {
+	if (k && bkey_iter_pos_cmp(b, k, &pos) < 0) {
 		char buf[100];
 		struct bkey uk = bkey_unpack_key(b, k);
 
@@ -495,15 +469,19 @@ static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
 }
 
 static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
-					       struct btree *b,
-					       struct bkey_packed *where)
+					struct btree *b,
+					struct bkey_packed *where)
 {
-	struct btree_node_iter *node_iter = &iter->l[0].iter;
+	struct btree_iter_level *l = &iter->l[b->level];
+	struct bpos pos = btree_iter_search_key(iter);
 
-	if (where == bch2_btree_node_iter_peek_all(node_iter, b)) {
-		bkey_disassemble(b, where, &iter->k);
-		btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
-	}
+	if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
+		return;
+
+	if (bkey_iter_pos_cmp(l->b, where, &pos) < 0)
+		bch2_btree_node_iter_advance(&l->iter, l->b);
+
+	btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
 }
 
 void bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
@@ -535,6 +513,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
 	bool iter_current_key_modified =
 		orig_iter_pos >= offset &&
 		orig_iter_pos <= offset + clobber_u64s;
+	struct bpos iter_pos = btree_iter_search_key(iter);
 
 	btree_node_iter_for_each(node_iter, set)
 		if (set->end == old_end)
@@ -542,7 +521,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
 
 	/* didn't find the bset in the iterator - might have to readd it: */
 	if (new_u64s &&
-	    btree_iter_pos_cmp(iter, b, where) > 0) {
+	    bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) {
 		bch2_btree_node_iter_push(node_iter, b, where, end);
 		goto fixup_done;
 	} else {
@@ -557,7 +536,7 @@ found:
 		return;
 
 	if (new_u64s &&
-	    btree_iter_pos_cmp(iter, b, where) > 0) {
+	    bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) {
 		set->k = offset;
 	} else if (set->k < offset + clobber_u64s) {
 		set->k = offset + new_u64s;
@@ -702,11 +681,12 @@ static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
 					     struct btree_iter_level *l,
 					     int max_advance)
 {
+	struct bpos pos = btree_iter_search_key(iter);
 	struct bkey_packed *k;
 	int nr_advanced = 0;
 
 	while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
-	       btree_iter_pos_cmp(iter, l->b, k) < 0) {
+	       bkey_iter_pos_cmp(l->b, k, &pos) < 0) {
 		if (max_advance > 0 && nr_advanced >= max_advance)
 			return false;
 
@@ -765,13 +745,7 @@ static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
 static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
 					     struct btree *b)
 {
-	int cmp = bkey_cmp(b->key.k.p, iter->pos);
-
-	if (!cmp &&
-	    (iter->flags & BTREE_ITER_IS_EXTENTS) &&
-	    bkey_cmp(b->key.k.p, POS_MAX))
-		cmp = -1;
-	return cmp < 0;
+	return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0;
 }
 
 static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
@@ -785,16 +759,10 @@ static inline void __btree_iter_init(struct btree_iter *iter,
 				     unsigned level)
 {
+	struct bpos pos = btree_iter_search_key(iter);
 	struct btree_iter_level *l = &iter->l[level];
 
-	bch2_btree_node_iter_init(&l->iter, l->b, &iter->pos);
-
-	if (iter->flags & BTREE_ITER_IS_EXTENTS)
-		btree_iter_advance_to_pos(iter, l, -1);
-
-	/* Skip to first non whiteout: */
-	if (level)
-		bch2_btree_node_iter_peek(&l->iter, l->b);
+	bch2_btree_node_iter_init(&l->iter, l->b, &pos);
 
 	btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
 }
@@ -1371,12 +1339,6 @@ static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter)
 		if (debug_check_iterators(iter->trans->c)) {
 			struct bkey k = bkey_unpack_key(l->b, _k);
 
-			/*
-			 * this flag is internal to the btree code,
-			 * we don't care if it doesn't match - if it's now set
-			 * it just means the key has been written out to disk:
-			 */
-			k.needs_whiteout = iter->k.needs_whiteout;
 			BUG_ON(memcmp(&k, &iter->k, sizeof(k)));
 		}
 
@@ -1564,9 +1526,7 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
 	int ret;
 
 recheck:
-	while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
-	       bkey_cmp(k.k->p, iter->pos) <= 0)
-		bch2_btree_node_iter_advance(&l->iter, l->b);
+	btree_iter_advance_to_pos(iter, l, -1);
 
 	/*
 	 * iterator is now at the correct position for inserting at iter->pos,
@@ -1575,9 +1535,27 @@ recheck:
 	 */
 	node_iter = l->iter;
 
-	if (k.k && bkey_whiteout(k.k))
-		k = __btree_iter_unpack(iter, l, &iter->k,
-			bch2_btree_node_iter_peek(&node_iter, l->b));
+	k = __btree_iter_unpack(iter, l, &iter->k,
+			bch2_btree_node_iter_peek(&node_iter, l->b));
+
+	if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
+		/*
+		 * If there wasn't actually a hole, want the iterator to be
+		 * pointed at the key we found:
+		 *
+		 * XXX: actually, we shouldn't be changing the iterator here:
+		 * the iterator needs to be correct for inserting at iter->pos,
+		 * and there may be whiteouts between iter->pos and what this
+		 * iterator points at:
+		 */
+		l->iter = node_iter;
+
+		EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0);
+		iter->uptodate = BTREE_ITER_UPTODATE;
+
+		__bch2_btree_iter_verify(iter, l->b);
+		return k;
+	}
 
 	/*
 	 * If we got to the end of the node, check if we need to traverse to the
@@ -1592,24 +1570,6 @@ recheck:
 		goto recheck;
 	}
 
-	if (k.k &&
-	    !bkey_whiteout(k.k) &&
-	    bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
-		/*
-		 * if we skipped forward to find the first non whiteout and
-		 * there _wasn't_ actually a hole, we want the iterator to be
-		 * pointed at the key we found:
-		 */
-		l->iter = node_iter;
-
-		EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0);
-		EBUG_ON(bkey_deleted(k.k));
-		iter->uptodate = BTREE_ITER_UPTODATE;
-
-		__bch2_btree_iter_verify(iter, l->b);
-		return k;
-	}
-
 	/* hole */
 
 	/* holes can't span inode numbers: */
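btree_iter_advance_to_pos() above keeps its bounded-advance heuristic: step the node iterator forward while the current key sorts before the search key, but give up after max_advance steps so the caller can fall back to a full search. A self-contained model of the pattern over a plain sorted array (hypothetical helper, not the bcachefs API):

#include <stdio.h>

/* returns 1 if *idx now points at the first key >= search, 0 if we
 * bailed out after max_advance steps and the caller should re-search */
static int advance_to_pos(const int *keys, int nr, int *idx,
			  int search, int max_advance)
{
	int nr_advanced = 0;

	while (*idx < nr && keys[*idx] < search) {
		if (max_advance > 0 && nr_advanced >= max_advance)
			return 0;
		(*idx)++;
		nr_advanced++;
	}
	return 1;
}

int main(void)
{
	const int keys[] = { 1, 3, 5, 7, 9, 11 };
	int idx = 0;

	if (advance_to_pos(keys, 6, &idx, 8, 8))
		printf("advanced to index %d (key %d)\n", idx, keys[idx]);
	return 0;
}

Passing max_advance = -1, as the extents code above does, makes the advance unconditional.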
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 86e52468..b7af88e0 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -184,9 +184,25 @@ enum btree_iter_type {
 
 #define BTREE_ITER_TYPE			((1 << 2) - 1)
 
+/*
+ * Iterate over all possible positions, synthesizing deleted keys for holes:
+ */
 #define BTREE_ITER_SLOTS		(1 << 2)
+/*
+ * Indicates that intent locks should be taken on leaf nodes, because we expect
+ * to be doing updates:
+ */
 #define BTREE_ITER_INTENT		(1 << 3)
+/*
+ * Causes the btree iterator code to prefetch additional btree nodes from disk:
+ */
 #define BTREE_ITER_PREFETCH		(1 << 4)
+/*
+ * Indicates that this iterator should not be reused until transaction commit,
+ * either because a pending update references it or because the update depends
+ * on that particular key being locked (e.g. by the str_hash code, for hash
+ * table consistency)
+ */
 #define BTREE_ITER_KEEP_UNTIL_COMMIT	(1 << 5)
 /*
  * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
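The new comments document what each iterator flag means. For context, the flags word packs a two-bit iterator type below the named flag bits; a standalone sketch of that layout (stand-in names for the BTREE_ITER_* constants):

#include <stdio.h>

#define ITER_TYPE		((1 << 2) - 1)	/* bits 0-1 */
#define ITER_SLOTS		(1 << 2)
#define ITER_INTENT		(1 << 3)
#define ITER_PREFETCH		(1 << 4)
#define ITER_KEEP_UNTIL_COMMIT	(1 << 5)

int main(void)
{
	unsigned flags = 1 | ITER_SLOTS | ITER_INTENT;	/* type 1, two flags */

	printf("type %u, slots %d, intent %d, prefetch %d\n",
	       flags & ITER_TYPE,
	       !!(flags & ITER_SLOTS),
	       !!(flags & ITER_INTENT),
	       !!(flags & ITER_PREFETCH));
	return 0;
}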
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index d84bb680..748e6356 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -1191,7 +1191,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
 			     BTREE_TRIGGER_GC);
 
 	while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
-	       bkey_iter_pos_cmp(b, &insert->k.p, k) > 0)
+	       bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
 		bch2_btree_node_iter_advance(node_iter, b);
 
 	/*
@@ -1385,7 +1385,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
 	if (keys)
 		btree_split_insert_keys(as, n1, iter, keys);
 
-	if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) {
+	if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) {
 		trace_btree_split(c, b);
 
 		n2 = __btree_split_node(as, n1, iter);
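Note the units fix here: BTREE_SPLIT_THRESHOLD() used to be three quarters of the node size in blocks, but the btree_split() hunk compares it against bset_u64s(), a count of u64s, so the macro now derives from btree_max_u64s() and both sides of the comparison use the same unit. A worked example under assumed geometry (256KiB nodes, 4KiB blocks; real values come from the filesystem options):

#include <stdio.h>

int main(void)
{
	unsigned node_bytes	= 256 << 10;	/* assumed node size  */
	unsigned block_bytes	= 4 << 10;	/* assumed block size */

	unsigned btree_blocks	= node_bytes / block_bytes;
	unsigned max_u64s	= node_bytes / 8;	/* headers ignored */

	printf("old threshold: %u (blocks)\n", btree_blocks * 3 / 4);
	printf("new threshold: %u (u64s)\n", max_u64s * 3 / 4);
	return 0;
}

Comparing a block count against a u64s threshold (or vice versa) would make nodes split far too early or essentially never.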
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index dfbe5dcd..afd2086e 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -64,12 +64,45 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
 		bkey_cmp(insert->k.p, b->data->max_key) > 0);
 
 	k = bch2_btree_node_iter_peek_all(node_iter, b);
-	if (k && !bkey_cmp_packed(b, k, &insert->k)) {
-		BUG_ON(bkey_whiteout(k));
+	if (k && bkey_cmp_packed(b, k, &insert->k))
+		k = NULL;
 
+	/* @k is the key being overwritten/deleted, if any: */
+
+	EBUG_ON(k && bkey_whiteout(k));
+
+	if (bkey_whiteout(&insert->k)) {
+		/* Deleting: */
+
+		/* Not found? Nothing to do: */
+		if (!k)
+			return false;
+
+		btree_account_key_drop(b, k);
+		k->type = KEY_TYPE_deleted;
+
+		if (k->needs_whiteout) {
+			push_whiteout(iter->trans->c, b, k);
+			k->needs_whiteout = false;
+		}
+
+		if (k >= btree_bset_last(b)->start) {
+			clobber_u64s = k->u64s;
+
+			bch2_bset_delete(b, k, clobber_u64s);
+			bch2_btree_node_iter_fix(iter, b, node_iter, k,
+						 clobber_u64s, 0);
+		} else {
+			bch2_btree_iter_fix_key_modified(iter, b, k);
+		}
+
+		return true;
+	}
+
+	if (k) {
+		/* Overwriting: */
 		if (!bkey_written(b, k) &&
-		    bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k) &&
-		    !bkey_whiteout(&insert->k)) {
+		    bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) {
 			k->type = insert->k.type;
 			memcpy_u64s(bkeyp_val(f, k), &insert->v,
 				    bkey_val_u64s(&insert->k));
@@ -77,27 +110,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
 		}
 
 		btree_account_key_drop(b, k);
-
-		if (bkey_whiteout(&insert->k)) {
-			unsigned clobber_u64s = k->u64s, new_u64s = k->u64s;
-
-			k->type = KEY_TYPE_deleted;
-
-			if (k->needs_whiteout) {
-				push_whiteout(iter->trans->c, b, k);
-				k->needs_whiteout = false;
-			}
-
-			if (k >= btree_bset_last(b)->start) {
-				bch2_bset_delete(b, k, clobber_u64s);
-				new_u64s = 0;
-			}
-
-			bch2_btree_node_iter_fix(iter, b, node_iter, k,
-						 clobber_u64s, new_u64s);
-			return true;
-
-		}
+		k->type = KEY_TYPE_deleted;
 
 		insert->k.needs_whiteout = k->needs_whiteout;
 		k->needs_whiteout = false;
@@ -105,23 +118,9 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
 		if (k >= btree_bset_last(b)->start) {
 			clobber_u64s = k->u64s;
 			goto overwrite;
+		} else {
+			bch2_btree_iter_fix_key_modified(iter, b, k);
 		}
-
-		k->type = KEY_TYPE_deleted;
-		/*
-		 * XXX: we should be able to do this without two calls to
-		 * bch2_btree_node_iter_fix:
-		 */
-		bch2_btree_node_iter_fix(iter, b, node_iter, k,
-					 k->u64s, k->u64s);
-	} else {
-		/*
-		 * Deleting, but the key to delete wasn't found - nothing to do:
-		 */
-		if (bkey_whiteout(&insert->k))
-			return false;
-
-		insert->k.needs_whiteout = false;
 	}
 
 	k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
@@ -707,10 +706,18 @@ int __bch2_trans_commit(struct btree_trans *trans)
 		trans_trigger_run = false;
 
 		trans_for_each_update(trans, i) {
-			/* we know trans->nounlock won't be set here: */
-			if (unlikely(!(i->iter->locks_want < 1
-				       ? __bch2_btree_iter_upgrade(i->iter, 1)
-				       : i->iter->uptodate <= BTREE_ITER_NEED_PEEK))) {
+			if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK)) {
+				trace_trans_restart_traverse(trans->ip);
+				ret = -EINTR;
+				goto out;
+			}
+
+			/*
+			 * We're not using bch2_btree_iter_upgrade here because
+			 * we know trans->nounlock can't be set:
+			 */
+			if (unlikely(i->iter->locks_want < 1 &&
+				     !__bch2_btree_iter_upgrade(i->iter, 1))) {
 				trace_trans_restart_upgrade(trans->ip);
 				ret = -EINTR;
 				goto out;
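The reworked loop in __bch2_trans_commit() splits the old combined test in two, but keeps the usual contract: if an iterator needs re-traversal or a lock upgrade fails, the commit returns -EINTR and the caller restarts the whole transaction. A minimal model of that retry contract (do_commit() is a hypothetical stand-in for the commit path):

#include <errno.h>
#include <stdio.h>

static int attempts;

/* pretend the first two commits race with lock upgrades */
static int do_commit(void)
{
	return ++attempts < 3 ? -EINTR : 0;
}

int main(void)
{
	int ret;

	do {
		ret = do_commit();
	} while (ret == -EINTR);

	printf("committed after %d attempts (ret %d)\n", attempts, ret);
	return 0;
}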
diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c
index a5c947e8..6f1afa4a 100644
--- a/libbcachefs/checksum.c
+++ b/libbcachefs/checksum.c
@@ -325,7 +325,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
 
 	BUG_ON(len_a + len_b > bio_sectors(bio));
 	BUG_ON(crc_old.uncompressed_size != bio_sectors(bio));
-	BUG_ON(crc_old.compression_type);
+	BUG_ON(crc_is_compressed(crc_old));
 	BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) !=
 	       bch2_csum_type_is_encryption(new_csum_type));
 
@@ -354,6 +354,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
 		if (i->crc)
 			*i->crc = (struct bch_extent_crc_unpacked) {
 				.csum_type		= i->csum_type,
+				.compression_type	= crc_old.compression_type,
 				.compressed_size	= i->len,
 				.uncompressed_size	= i->len,
 				.offset			= 0,
diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h
index ca9e4590..24dee803 100644
--- a/libbcachefs/checksum.h
+++ b/libbcachefs/checksum.h
@@ -155,13 +155,16 @@ static inline struct nonce null_nonce(void)
 static inline struct nonce extent_nonce(struct bversion version,
 					struct bch_extent_crc_unpacked crc)
 {
-	unsigned size = crc.compression_type ? crc.uncompressed_size : 0;
+	unsigned compression_type = crc_is_compressed(crc)
+		? crc.compression_type
+		: 0;
+	unsigned size = compression_type ? crc.uncompressed_size : 0;
 	struct nonce nonce = (struct nonce) {{
 		[0] = cpu_to_le32(size << 22),
 		[1] = cpu_to_le32(version.lo),
 		[2] = cpu_to_le32(version.lo >> 32),
 		[3] = cpu_to_le32(version.hi|
-			(crc.compression_type << 24))^BCH_NONCE_EXTENT,
+			(compression_type << 24))^BCH_NONCE_EXTENT,
 	}};
 
 	return nonce_add(nonce, crc.nonce << 9);
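extent_nonce() now maps the incompressible type to 0 before packing, so an encrypted extent doesn't get a different nonce merely for being marked incompressible. A standalone sketch of the packing itself (endian conversion, the BCH_NONCE_EXTENT xor, and the crc.nonce offset are omitted; values illustrative):

#include <stdint.h>
#include <stdio.h>

struct nonce_sketch { uint32_t d[4]; };

static struct nonce_sketch extent_nonce_sketch(uint64_t version_lo,
					       uint32_t version_hi,
					       uint32_t size,
					       uint32_t compression_type)
{
	return (struct nonce_sketch) {{
		size << 22,			/* uncompressed size, top bits */
		(uint32_t) version_lo,
		(uint32_t) (version_lo >> 32),
		version_hi | (compression_type << 24),
	}};
}

int main(void)
{
	struct nonce_sketch n =
		extent_nonce_sketch(0x1122334455667788ULL, 7, 128, 3);

	printf("%08x %08x %08x %08x\n", n.d[0], n.d[1], n.d[2], n.d[3]);
	return 0;
}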
diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c
index bb557eda..0959bb86 100644
--- a/libbcachefs/compress.c
+++ b/libbcachefs/compress.c
@@ -434,7 +434,7 @@ out:
 	bio_unmap_or_unbounce(c, dst_data);
 	return compression_type;
 err:
-	compression_type = 0;
+	compression_type = BCH_COMPRESSION_TYPE_incompressible;
 	goto out;
 }
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index c4b0b9e1..a19b91f9 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -337,7 +337,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *c,
 		if (!bch2_checksum_mergeable(crc_l.csum_type))
 			return BCH_MERGE_NOMERGE;
 
-		if (crc_l.compression_type)
+		if (crc_is_compressed(crc_l))
 			return BCH_MERGE_NOMERGE;
 
 		if (crc_l.csum_type &&
@@ -448,7 +448,7 @@ static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
 static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
 				  struct bch_extent_crc_unpacked n)
 {
-	return !u.compression_type &&
+	return !crc_is_compressed(u) &&
 		u.csum_type &&
 		u.uncompressed_size > u.live_size &&
 		bch2_csum_type_is_encryption(u.csum_type) ==
@@ -492,7 +492,7 @@ bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
 	/* Find a checksum entry that covers only live data: */
 	if (!n.csum_type) {
 		bkey_for_each_crc(&k->k, ptrs, u, i)
-			if (!u.compression_type &&
+			if (!crc_is_compressed(u) &&
 			    u.csum_type &&
 			    u.live_size == u.uncompressed_size) {
 				n = u;
@@ -501,7 +501,7 @@ bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
 			return false;
 	}
 found:
-	BUG_ON(n.compression_type);
+	BUG_ON(crc_is_compressed(n));
 	BUG_ON(n.offset);
 	BUG_ON(n.live_size != k->k.size);
 
@@ -610,8 +610,7 @@ unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k)
 		struct extent_ptr_decoded p;
 
 		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-			ret += !p.ptr.cached &&
-				p.crc.compression_type == BCH_COMPRESSION_TYPE_none;
+			ret += !p.ptr.cached && !crc_is_compressed(p.crc);
 	}
 
 	return ret;
@@ -625,13 +624,24 @@ unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k)
 	unsigned ret = 0;
 
 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-		if (!p.ptr.cached &&
-		    p.crc.compression_type != BCH_COMPRESSION_TYPE_none)
+		if (!p.ptr.cached && crc_is_compressed(p.crc))
 			ret += p.crc.compressed_size;
 
 	return ret;
 }
 
+bool bch2_bkey_is_incompressible(struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct bch_extent_crc_unpacked crc;
+
+	bkey_for_each_crc(k.k, ptrs, crc, entry)
+		if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
+			return true;
+	return false;
+}
+
 bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
 				unsigned nr_replicas)
 {
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 7c5a41e6..0d855417 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -175,6 +175,12 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
 #undef common_fields
 }
 
+static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc)
+{
+	return (crc.compression_type != BCH_COMPRESSION_TYPE_none &&
+		crc.compression_type != BCH_COMPRESSION_TYPE_incompressible);
+}
+
 /* bkey_ptrs: generically over any key type that has ptrs */
 
 struct bkey_ptrs_c {
@@ -483,6 +489,7 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
 unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
 unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
 unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
+bool bch2_bkey_is_incompressible(struct bkey_s_c);
 unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
 
 bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned);
 unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
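crc_is_compressed() is the crux of the series: BCH_COMPRESSION_TYPE_incompressible marks data that went through the compressor and did not shrink, so on disk it is plain data, and every read, merge and CRC-narrowing path must treat it as uncompressed even though its compression_type is non-zero. The rule, restated as a standalone program:

#include <stdio.h>

enum type {	/* mirrors BCH_COMPRESSION_TYPES() above */
	T_none = 0, T_lz4_old, T_gzip, T_lz4, T_zstd, T_incompressible,
};

static int is_compressed(enum type t)
{
	return t != T_none && t != T_incompressible;
}

int main(void)
{
	static const char * const names[] = {
		"none", "lz4_old", "gzip", "lz4", "zstd", "incompressible",
	};
	enum type t;

	for (t = T_none; t <= T_incompressible; t++)
		printf("%-15s -> %s\n", names[t],
		       is_compressed(t) ? "compressed" : "stored as-is");
	return 0;
}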
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 4c7dd099..ca83d79e 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -546,9 +546,14 @@ static void __bch2_write_index(struct bch_write_op *op)
 	 * particularly want to plumb io_opts all the way through the btree
 	 * update stack right now
 	 */
-	for_each_keylist_key(keys, k)
+	for_each_keylist_key(keys, k) {
 		bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
 
+		if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k)))
+			bch2_check_set_feature(op->c, BCH_FEATURE_incompressible);
+
+	}
+
 	if (!bch2_keylist_empty(keys)) {
 		u64 sectors_start = keylist_sectors(keys);
 		int ret = op->index_update_fn(op);
@@ -784,8 +789,9 @@ static enum prep_encoded_ret {
 	/* Can we just write the entire extent as is? */
 	if (op->crc.uncompressed_size == op->crc.live_size &&
 	    op->crc.compressed_size <= wp->sectors_free &&
-	    op->crc.compression_type == op->compression_type) {
-		if (!op->crc.compression_type &&
+	    (op->crc.compression_type == op->compression_type ||
+	     op->incompressible)) {
+		if (!crc_is_compressed(op->crc) &&
 		    op->csum_type != op->crc.csum_type &&
 		    bch2_write_rechecksum(c, op, op->csum_type))
 			return PREP_ENCODED_CHECKSUM_ERR;
@@ -797,7 +803,7 @@ static enum prep_encoded_ret {
 	 * If the data is compressed and we couldn't write the entire extent as
 	 * is, we have to decompress it:
 	 */
-	if (op->crc.compression_type) {
+	if (crc_is_compressed(op->crc)) {
 		struct bch_csum csum;
 
 		if (bch2_write_decrypt(op))
@@ -864,6 +870,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
 		ret = -EIO;
 		goto err;
 	case PREP_ENCODED_CHECKSUM_ERR:
+		BUG();
 		goto csum_err;
 	case PREP_ENCODED_DO_WRITE:
 		/* XXX look for bug here */
@@ -908,11 +915,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
 		       bch2_csum_type_is_encryption(op->crc.csum_type));
 		BUG_ON(op->compression_type && !bounce);
 
-		crc.compression_type = op->compression_type
-			? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
-					    op->compression_type)
+		crc.compression_type = op->incompressible
+			? BCH_COMPRESSION_TYPE_incompressible
+			: op->compression_type
+			? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
+					    op->compression_type)
 			: 0;
-		if (!crc.compression_type) {
+		if (!crc_is_compressed(crc)) {
 			dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
 			dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
 
@@ -941,7 +950,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
 		}
 
 		if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
-		    !crc.compression_type &&
+		    !crc_is_compressed(crc) &&
 		    bch2_csum_type_is_encryption(op->crc.csum_type) ==
 		    bch2_csum_type_is_encryption(op->csum_type)) {
 			/*
@@ -1338,6 +1347,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
 
 static struct promote_op *__promote_alloc(struct bch_fs *c,
 					  enum btree_id btree_id,
+					  struct bkey_s_c k,
 					  struct bpos pos,
 					  struct extent_ptr_decoded *pick,
 					  struct bch_io_opts opts,
@@ -1394,8 +1404,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
 			(struct data_opts) {
 				.target = opts.promote_target
 			},
-			btree_id,
-			bkey_s_c_null);
+			btree_id, k);
 	BUG_ON(ret);
 
 	return op;
@@ -1437,7 +1446,7 @@ static struct promote_op *promote_alloc(struct bch_fs *c,
 			k.k->type == KEY_TYPE_reflink_v
 			? BTREE_ID_REFLINK
 			: BTREE_ID_EXTENTS,
-			pos, pick, opts, sectors, rbio);
+			k, pos, pick, opts, sectors, rbio);
 	if (!promote)
 		return NULL;
 
@@ -1701,7 +1710,7 @@ static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
 	u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset;
 	int ret;
 
-	if (rbio->pick.crc.compression_type)
+	if (crc_is_compressed(rbio->pick.crc))
 		return;
 
 	bkey_on_stack_init(&new);
@@ -1786,7 +1795,7 @@ static void __bch2_read_endio(struct work_struct *work)
 	crc.offset     += rbio->offset_into_extent;
 	crc.live_size	= bvec_iter_sectors(rbio->bvec_iter);
 
-	if (crc.compression_type != BCH_COMPRESSION_TYPE_none) {
+	if (crc_is_compressed(crc)) {
 		bch2_encrypt_bio(c, crc.csum_type, nonce, src);
 		if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
 			goto decompression_err;
@@ -1883,7 +1892,7 @@ static void bch2_read_endio(struct bio *bio)
 	}
 
 	if (rbio->narrow_crcs ||
-	    rbio->pick.crc.compression_type ||
+	    crc_is_compressed(rbio->pick.crc) ||
 	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
 		context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
 	else if (rbio->pick.crc.csum_type)
@@ -1994,7 +2003,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
 
 	EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
 
-	if (pick.crc.compression_type != BCH_COMPRESSION_TYPE_none ||
+	if (crc_is_compressed(pick.crc) ||
 	    (pick.crc.csum_type != BCH_CSUM_NONE &&
 	     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
 	      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
@@ -2009,7 +2018,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
 				&rbio, &bounce, &read_full);
 
 	if (!read_full) {
-		EBUG_ON(pick.crc.compression_type);
+		EBUG_ON(crc_is_compressed(pick.crc));
 		EBUG_ON(pick.crc.csum_type &&
 			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
 			 bvec_iter_sectors(iter) != pick.crc.live_size ||
diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h
index c37b7d74..7f7b69b3 100644
--- a/libbcachefs/io_types.h
+++ b/libbcachefs/io_types.h
@@ -105,6 +105,7 @@ struct bch_write_op {
 	unsigned		nr_replicas:4;
 	unsigned		nr_replicas_required:4;
 	unsigned		alloc_reserve:4;
+	unsigned		incompressible:1;
 
 	struct bch_devs_list	devs_have;
 	u16			target;
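bch2_write_extent() now picks the compression type with a three-way chain: extents already known to be incompressible are tagged without re-running the compressor; otherwise the configured compressor is tried, and with the compress.c hunk above a failed attempt reports incompressible instead of 0. A toy model of that decision (compress_sketch() is a hypothetical stand-in for bch2_bio_compress()):

#include <stdio.h>

enum { T_NONE = 0, T_ZSTD = 4, T_INCOMPRESSIBLE = 5 };

/* pretend the data didn't shrink: the compressor reports incompressible */
static unsigned compress_sketch(unsigned requested)
{
	(void) requested;
	return T_INCOMPRESSIBLE;
}

static unsigned pick_compression_type(int incompressible, unsigned opt)
{
	return incompressible	? T_INCOMPRESSIBLE
		: opt		? compress_sketch(opt)
		: T_NONE;
}

int main(void)
{
	printf("%u %u %u\n",
	       pick_compression_type(1, T_ZSTD),	/* 5: skip compressor    */
	       pick_compression_type(0, T_ZSTD),	/* 5: compressor gave up */
	       pick_compression_type(0, T_NONE));	/* 0: compression off    */
	return 0;
}

Remembering the failed attempt in the extent is what lets later rewrites (move.c below) skip pointless recompression.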
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 257e00ae..ecc74ebe 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -215,6 +215,9 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
 			    enum btree_id btree_id,
 			    struct bkey_s_c k)
 {
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
 	int ret;
 
 	m->btree_id	= btree_id;
@@ -223,9 +226,14 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
 	m->nr_ptrs_reserved = 0;
 
 	bch2_write_op_init(&m->op, c, io_opts);
-	m->op.compression_type =
-		bch2_compression_opt_to_type[io_opts.background_compression ?:
-					     io_opts.compression];
+
+	if (!bch2_bkey_is_incompressible(k))
+		m->op.compression_type =
+			bch2_compression_opt_to_type[io_opts.background_compression ?:
+						     io_opts.compression];
+	else
+		m->op.incompressible = true;
+
 	m->op.target	= data_opts.target,
 	m->op.write_point = wp;
 
@@ -265,14 +273,11 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
 		break;
 	}
 	case DATA_REWRITE: {
-		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-		const union bch_extent_entry *entry;
-		struct extent_ptr_decoded p;
 		unsigned compressed_sectors = 0;
 
 		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
 			if (!p.ptr.cached &&
-			    p.crc.compression_type != BCH_COMPRESSION_TYPE_none &&
+			    crc_is_compressed(p.crc) &&
 			    bch2_dev_in_target(c, p.ptr.dev, data_opts.target))
 				compressed_sectors += p.crc.compressed_size;
 
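The rebalance.c rewrite below folds the old per-pointer predicate into __bch2_rebalance_pred(), whose return value doubles as an accounting hint: -1 for "don't move", a device index when the work can be charged to one device, INT_MAX when it can't. A sketch of consuming that convention (hypothetical flat counters in place of the per-device atomics):

#include <limits.h>
#include <stdio.h>

#define NR_DEVS 4

static long long dev_work[NR_DEVS];
static long long unknown_dev_work;

static void add_rebalance_work(int dev, long long sectors)
{
	if (dev < 0)
		return;				/* nothing to move */

	if (dev < INT_MAX && dev < NR_DEVS)
		dev_work[dev] += sectors;
	else
		unknown_dev_work += sectors;
}

int main(void)
{
	add_rebalance_work(2, 8);		/* known device     */
	add_rebalance_work(INT_MAX, 16);	/* unknown device   */
	add_rebalance_work(-1, 32);		/* skipped entirely */

	printf("dev 2: %lld, unknown: %lld\n", dev_work[2], unknown_dev_work);
	return 0;
}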
diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c
index 84b3fb6e..ab193432 100644
--- a/libbcachefs/rebalance.c
+++ b/libbcachefs/rebalance.c
@@ -17,50 +17,52 @@
 #include <linux/kthread.h>
 #include <linux/sched/cputime.h>
 
-static inline bool rebalance_ptr_pred(struct bch_fs *c,
-				      struct extent_ptr_decoded p,
-				      struct bch_io_opts *io_opts)
+/*
+ * Check if an extent should be moved:
+ * returns -1 if it should not be moved, or
+ * device of pointer that should be moved, if known, or INT_MAX if unknown
+ */
+static int __bch2_rebalance_pred(struct bch_fs *c,
+				 struct bkey_s_c k,
+				 struct bch_io_opts *io_opts)
 {
-	if (io_opts->background_target &&
-	    !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target) &&
-	    !p.ptr.cached)
-		return true;
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
 
 	if (io_opts->background_compression &&
-	    p.crc.compression_type !=
-	    bch2_compression_opt_to_type[io_opts->background_compression])
-		return true;
+	    !bch2_bkey_is_incompressible(k))
+		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+			if (!p.ptr.cached &&
+			    p.crc.compression_type !=
+			    bch2_compression_opt_to_type[io_opts->background_compression])
+				return p.ptr.dev;
 
-	return false;
+	if (io_opts->background_target)
+		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+			if (!p.ptr.cached &&
+			    !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target))
+				return p.ptr.dev;
+
+	return -1;
 }
 
 void bch2_rebalance_add_key(struct bch_fs *c,
 			    struct bkey_s_c k,
 			    struct bch_io_opts *io_opts)
 {
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded p;
+	atomic64_t *counter;
+	int dev;
 
-	if (!io_opts->background_target &&
-	    !io_opts->background_compression)
+	dev = __bch2_rebalance_pred(c, k, io_opts);
+	if (dev < 0)
 		return;
 
-	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-		if (rebalance_ptr_pred(c, p, io_opts)) {
-			struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+	counter = dev < INT_MAX
+		? &bch_dev_bkey_exists(c, dev)->rebalance_work
+		: &c->rebalance.work_unknown_dev;
 
-			if (atomic64_add_return(p.crc.compressed_size,
-						&ca->rebalance_work) ==
-			    p.crc.compressed_size)
-				rebalance_wakeup(c);
-		}
-}
-
-void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
-{
-	if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
-	    sectors)
+	if (atomic64_add_return(k.k->size, counter) == k.k->size)
 		rebalance_wakeup(c);
 }
 
@@ -69,26 +71,20 @@ static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
 				    struct bch_io_opts *io_opts,
 				    struct data_opts *data_opts)
 {
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded p;
-	unsigned nr_replicas = 0;
-
-	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-		nr_replicas += !p.ptr.cached;
-
-		if (rebalance_ptr_pred(c, p, io_opts))
-			goto found;
+	if (__bch2_rebalance_pred(c, k, io_opts) >= 0) {
+		data_opts->target		= io_opts->background_target;
+		data_opts->btree_insert_flags	= 0;
+		return DATA_ADD_REPLICAS;
+	} else {
+		return DATA_SKIP;
 	}
+}
 
-	if (nr_replicas < io_opts->data_replicas)
-		goto found;
-
-	return DATA_SKIP;
-found:
-	data_opts->target		= io_opts->background_target;
-	data_opts->btree_insert_flags	= 0;
-	return DATA_ADD_REPLICAS;
+void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
+{
+	if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
+	    sectors)
+		rebalance_wakeup(c);
 }
 
 struct rebalance_work {
@@ -183,6 +179,8 @@ static int bch2_rebalance_thread(void *arg)
 	prev_cputime	= curr_cputime();
 
 	while (!kthread_wait_freezable(r->enabled)) {
+		cond_resched();
+
 		start			= jiffies;
 		cputime			= curr_cputime();
 
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 602def1e..d78ffcc0 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -276,7 +276,7 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
 		struct extent_ptr_decoded p;
 
 		extent_for_each_ptr_decode(e, p, entry) {
-			if (p.crc.compression_type == BCH_COMPRESSION_TYPE_none) {
+			if (!crc_is_compressed(p.crc)) {
 				nr_uncompressed_extents++;
 				uncompressed_sectors += e.k->size;
 			} else {