From 92084feca4fd9d534b7d1d9e1425faeeaf91c3fa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Nov 2024 02:23:24 -0500 Subject: [PATCH 100/233] bcachefs: fix O(n^2) issue with whiteouts in journal keys Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: 8bit The journal_keys array can't be substantially modified after we go RW, because lookups need to be able to check it locklessly - thus we're limited on what we can do when a key in the journal has been overwritten. This is a problem when there's many overwrites to skip over for peek() operations. To fix this, add tracking of ranges of overwrites: we create a range entry when there's more than one contiguous whiteout. Signed-off-by: Kent Overstreet Signed-off-by: Alexander Miroshnichenko --- fs/bcachefs/bcachefs.h | 23 +--- fs/bcachefs/btree_journal_iter.c | 156 ++++++++++++++++++++++--- fs/bcachefs/btree_journal_iter.h | 2 + fs/bcachefs/btree_journal_iter_types.h | 36 ++++++ fs/bcachefs/super.c | 3 +- 5 files changed, 179 insertions(+), 41 deletions(-) create mode 100644 fs/bcachefs/btree_journal_iter_types.h diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 7a947d43d504..11f9ed42a9da 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -205,6 +205,7 @@ #include #include "bcachefs_format.h" +#include "btree_journal_iter_types.h" #include "disk_accounting_types.h" #include "errcode.h" #include "fifo.h" @@ -658,28 +659,6 @@ struct journal_seq_blacklist_table { } entries[]; }; -struct journal_keys { - /* must match layout in darray_types.h */ - size_t nr, size; - struct journal_key { - u64 journal_seq; - u32 journal_offset; - enum btree_id btree_id:8; - unsigned level:8; - bool allocated; - bool overwritten; - struct bkey_i *k; - } *data; - /* - * Gap buffer: instead of all the empty space in the array being at the - * end of the buffer - from @nr to @size - the empty space is at @gap. - * This means that sequential insertions are O(n) instead of O(n^2). - */ - size_t gap; - atomic_t ref; - bool initial_ref_held; -}; - struct btree_trans_buf { struct btree_trans *trans; }; diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index cc7f5fad90c6..de3db161d6ab 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -16,6 +16,17 @@ * operations for the regular btree iter code to use: */ +static inline size_t pos_to_idx(struct journal_keys *keys, size_t pos) +{ + size_t gap_size = keys->size - keys->nr; + + BUG_ON(pos >= keys->gap && pos < keys->gap + gap_size); + + if (pos >= keys->gap) + pos -= gap_size; + return pos; +} + static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) { size_t gap_size = keys->size - keys->nr; @@ -84,27 +95,37 @@ struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_ } } + struct bkey_i *ret = NULL; + rcu_read_lock(); /* for overwritten_ranges */ + while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) - return NULL; + break; if (k->overwritten) { - (*idx)++; + if (k->overwritten_range) + *idx = rcu_dereference(k->overwritten_range)->end; + else + *idx += 1; continue; } - if (__journal_key_cmp(btree_id, level, pos, k) <= 0) - return k->k; + if (__journal_key_cmp(btree_id, level, pos, k) <= 0) { + ret = k->k; + break; + } (*idx)++; iters++; if (iters == 10) { *idx = 0; + rcu_read_unlock(); goto search; } } - return NULL; + rcu_read_unlock(); + return ret; } struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id, @@ -130,17 +151,25 @@ struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id b } } + struct bkey_i *ret = NULL; + rcu_read_lock(); /* for overwritten_ranges */ + while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { if (__journal_key_cmp(btree_id, level, end_pos, k) > 0) - return NULL; + break; if (k->overwritten) { - --(*idx); + if (k->overwritten_range) + *idx = rcu_dereference(k->overwritten_range)->start - 1; + else + *idx -= 1; continue; } - if (__journal_key_cmp(btree_id, level, pos, k) >= 0) - return k->k; + if (__journal_key_cmp(btree_id, level, pos, k) >= 0) { + ret = k->k; + break; + } --(*idx); iters++; @@ -150,7 +179,8 @@ struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id b } } - return NULL; + rcu_read_unlock(); + return ret; } struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, @@ -163,6 +193,7 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree static void journal_iter_verify(struct journal_iter *iter) { +#ifdef CONFIG_BCACHEFS_DEBUG struct journal_keys *keys = iter->keys; size_t gap_size = keys->size - keys->nr; @@ -175,6 +206,7 @@ static void journal_iter_verify(struct journal_iter *iter) int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k); BUG_ON(cmp > 0); } +#endif } static void journal_iters_fix(struct bch_fs *c) @@ -335,6 +367,68 @@ bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree, bkey_deleted(&keys->data[idx].k->k)); } +static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos) +{ + struct journal_key *k = keys->data + pos; + size_t idx = pos_to_idx(keys, pos); + + k->overwritten = true; + + struct journal_key *prev = idx > 0 ? keys->data + idx_to_pos(keys, idx - 1) : NULL; + struct journal_key *next = idx + 1 < keys->nr ? keys->data + idx_to_pos(keys, idx + 1) : NULL; + + bool prev_overwritten = prev && prev->overwritten; + bool next_overwritten = next && next->overwritten; + + struct journal_key_range_overwritten *prev_range = + prev_overwritten ? prev->overwritten_range : NULL; + struct journal_key_range_overwritten *next_range = + next_overwritten ? next->overwritten_range : NULL; + + BUG_ON(prev_range && prev_range->end != idx); + BUG_ON(next_range && next_range->start != idx + 1); + + if (prev_range && next_range) { + prev_range->end = next_range->end; + + keys->data[pos].overwritten_range = prev_range; + for (size_t i = next_range->start; i < next_range->end; i++) { + struct journal_key *ip = keys->data + idx_to_pos(keys, i); + BUG_ON(ip->overwritten_range != next_range); + ip->overwritten_range = prev_range; + } + + kfree_rcu_mightsleep(next_range); + } else if (prev_range) { + prev_range->end++; + k->overwritten_range = prev_range; + if (next_overwritten) { + prev_range->end++; + next->overwritten_range = prev_range; + } + } else if (next_range) { + next_range->start--; + k->overwritten_range = next_range; + if (prev_overwritten) { + next_range->start--; + prev->overwritten_range = next_range; + } + } else if (prev_overwritten || next_overwritten) { + struct journal_key_range_overwritten *r = kmalloc(sizeof(*r), GFP_KERNEL); + if (!r) + return; + + r->start = idx - (size_t) prev_overwritten; + r->end = idx + 1 + (size_t) next_overwritten; + + rcu_assign_pointer(k->overwritten_range, r); + if (prev_overwritten) + prev->overwritten_range = r; + if (next_overwritten) + next->overwritten_range = r; + } +} + void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, unsigned level, struct bpos pos) { @@ -344,8 +438,12 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, if (idx < keys->size && keys->data[idx].btree_id == btree && keys->data[idx].level == level && - bpos_eq(keys->data[idx].k->k.p, pos)) - keys->data[idx].overwritten = true; + bpos_eq(keys->data[idx].k->k.p, pos) && + !keys->data[idx].overwritten) { + mutex_lock(&keys->overwrite_lock); + __bch2_journal_key_overwritten(keys, idx); + mutex_unlock(&keys->overwrite_lock); + } } static void bch2_journal_iter_advance(struct journal_iter *iter) @@ -359,8 +457,11 @@ static void bch2_journal_iter_advance(struct journal_iter *iter) static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) { + struct bkey_s_c ret = bkey_s_c_null; + journal_iter_verify(iter); + rcu_read_lock(); while (iter->idx < iter->keys->size) { struct journal_key *k = iter->keys->data + iter->idx; @@ -369,13 +470,19 @@ static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) break; BUG_ON(cmp); - if (!k->overwritten) - return bkey_i_to_s_c(k->k); + if (!k->overwritten) { + ret = bkey_i_to_s_c(k->k); + break; + } - bch2_journal_iter_advance(iter); + if (k->overwritten_range) + iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end); + else + bch2_journal_iter_advance(iter); } + rcu_read_unlock(); - return bkey_s_c_null; + return ret; } static void bch2_journal_iter_exit(struct journal_iter *iter) @@ -556,9 +663,15 @@ void bch2_journal_keys_put(struct bch_fs *c) move_gap(keys, keys->nr); - darray_for_each(*keys, i) + darray_for_each(*keys, i) { + if (i->overwritten_range && + (i == &darray_last(*keys) || + i->overwritten_range != i[1].overwritten_range)) + kfree(i->overwritten_range); + if (i->allocated) kfree(i->k); + } kvfree(keys->data); keys->data = NULL; @@ -682,3 +795,12 @@ void bch2_journal_keys_dump(struct bch_fs *c) } printbuf_exit(&buf); } + +void bch2_fs_journal_keys_init(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + + atomic_set(&keys->ref, 1); + keys->initial_ref_held = true; + mutex_init(&keys->overwrite_lock); +} diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 9e8f8ab1c6ff..2a3082919b8d 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -97,4 +97,6 @@ void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id, void bch2_journal_keys_dump(struct bch_fs *); +void bch2_fs_journal_keys_init(struct bch_fs *); + #endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h new file mode 100644 index 000000000000..8b773823704f --- /dev/null +++ b/fs/bcachefs/btree_journal_iter_types.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H +#define _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H + +struct journal_key_range_overwritten { + size_t start, end; +}; + +struct journal_key { + u64 journal_seq; + u32 journal_offset; + enum btree_id btree_id:8; + unsigned level:8; + bool allocated; + bool overwritten; + struct journal_key_range_overwritten __rcu * + overwritten_range; + struct bkey_i *k; +}; + +struct journal_keys { + /* must match layout in darray_types.h */ + size_t nr, size; + struct journal_key *data; + /* + * Gap buffer: instead of all the empty space in the array being at the + * end of the buffer - from @nr to @size - the empty space is at @gap. + * This means that sequential insertions are O(n) instead of O(n^2). + */ + size_t gap; + atomic_t ref; + bool initial_ref_held; + struct mutex overwrite_lock; +}; + +#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 37eee352fa21..08170a3d524f 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -773,8 +773,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) init_rwsem(&c->gc_lock); mutex_init(&c->gc_gens_lock); - atomic_set(&c->journal_keys.ref, 1); - c->journal_keys.initial_ref_held = true; for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); @@ -784,6 +782,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); bch2_fs_btree_iter_init_early(c); bch2_fs_btree_interior_update_init_early(c); + bch2_fs_journal_keys_init(c); bch2_fs_allocator_background_init(c); bch2_fs_allocator_foreground_init(c); bch2_fs_rebalance_init(c); -- 2.45.2