Mirror of https://github.com/koverstreet/bcachefs-tools.git
Synced 2025-02-02 00:00:03 +03:00

Update bcachefs sources to fd637ebda0 bcachefs: Journal updates to interior nodes

This commit is contained in:
parent 096f2ec00e
commit 8bcd38555c
@@ -1 +1 @@
-3592e42edfaed6a66470fb6a456a5895243ef2f4
+fd637ebda030609b15a473f01f1ef54bbe818f27

@@ -1312,7 +1312,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
 x(new_extent_overwrite, 9) \
 x(incompressible, 10) \
 x(btree_ptr_v2, 11) \
-x(extents_above_btree_updates, 12)
+x(extents_above_btree_updates, 12) \
+x(btree_updates_journalled, 13)

 #define BCH_SB_FEATURES_ALL \
 ((1ULL << BCH_FEATURE_new_siphash)| \
@@ -400,9 +400,8 @@ static inline int bch2_compile_bkey_format(const struct bkey_format *format,
 static inline void bkey_reassemble(struct bkey_i *dst,
 struct bkey_s_c src)
 {
-BUG_ON(bkey_packed(src.k));
 dst->k = *src.k;
-memcpy_u64s(&dst->v, src.v, bkey_val_u64s(src.k));
+memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k));
 }

 #define bkey_s_null ((struct bkey_s) { .k = NULL })
@@ -134,7 +134,7 @@ const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,

 const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
 {
-if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
+if (bkey_cmp(k.k->p, b->data->min_key) < 0)
 return "key before start of btree node";

 if (bkey_cmp(k.k->p, b->data->max_key) > 0)
@ -130,44 +130,21 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
|
||||
return nr;
|
||||
}
|
||||
|
||||
static void extent_sort_advance_prev(struct bkey_format *f,
|
||||
struct btree_nr_keys *nr,
|
||||
struct bkey_packed *start,
|
||||
struct bkey_packed **prev)
|
||||
{
|
||||
if (*prev) {
|
||||
bch2_bkey_pack(*prev, (void *) *prev, f);
|
||||
|
||||
btree_keys_account_key_add(nr, 0, *prev);
|
||||
*prev = bkey_next(*prev);
|
||||
} else {
|
||||
*prev = start;
|
||||
}
|
||||
}
|
||||
|
||||
static void extent_sort_append(struct bch_fs *c,
|
||||
struct bkey_format *f,
|
||||
struct btree_nr_keys *nr,
|
||||
struct bkey_packed *start,
|
||||
struct bkey_packed **prev,
|
||||
struct bkey_packed **out,
|
||||
struct bkey_s k)
|
||||
{
|
||||
if (bkey_whiteout(k.k))
|
||||
return;
|
||||
if (!bkey_whiteout(k.k)) {
|
||||
if (!bch2_bkey_pack_key(*out, k.k, f))
|
||||
memcpy_u64s_small(*out, k.k, BKEY_U64s);
|
||||
|
||||
/*
|
||||
* prev is always unpacked, for key merging - until right before we
|
||||
* advance it:
|
||||
*/
|
||||
memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k));
|
||||
|
||||
if (*prev &&
|
||||
bch2_bkey_merge(c, bkey_i_to_s((void *) *prev), k) ==
|
||||
BCH_MERGE_MERGE)
|
||||
return;
|
||||
|
||||
extent_sort_advance_prev(f, nr, start, prev);
|
||||
|
||||
bkey_reassemble((void *) *prev, k.s_c);
|
||||
btree_keys_account_key_add(nr, 0, *out);
|
||||
*out = bkey_next(*out);
|
||||
}
|
||||
}
|
||||
|
||||
/* Sort + repack in a new format: */
|
||||
@ -201,7 +178,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src,
|
||||
return nr;
|
||||
}
|
||||
|
||||
/* Sort, repack, and merge: */
|
||||
/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */
|
||||
struct btree_nr_keys
|
||||
bch2_sort_repack_merge(struct bch_fs *c,
|
||||
struct bset *dst, struct btree *src,
|
||||
@ -209,7 +186,7 @@ bch2_sort_repack_merge(struct bch_fs *c,
|
||||
struct bkey_format *out_f,
|
||||
bool filter_whiteouts)
|
||||
{
|
||||
struct bkey_packed *prev = NULL, *k_packed;
|
||||
struct bkey_packed *out = vstruct_last(dst), *k_packed;
|
||||
struct bkey_on_stack k;
|
||||
struct btree_nr_keys nr;
|
||||
|
||||
@ -234,13 +211,10 @@ bch2_sort_repack_merge(struct bch_fs *c,
|
||||
bch2_bkey_normalize(c, bkey_i_to_s(k.k)))
|
||||
continue;
|
||||
|
||||
extent_sort_append(c, out_f, &nr, vstruct_last(dst),
|
||||
&prev, bkey_i_to_s(k.k));
|
||||
extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k));
|
||||
}
|
||||
|
||||
extent_sort_advance_prev(out_f, &nr, vstruct_last(dst), &prev);
|
||||
|
||||
dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
|
||||
dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
|
||||
bkey_on_stack_exit(&k, c);
|
||||
return nr;
|
||||
}
|
||||
@ -337,7 +311,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
|
||||
struct btree *b = iter->b;
|
||||
struct bkey_format *f = &b->format;
|
||||
struct sort_iter_set *_l = iter->data, *_r = iter->data + 1;
|
||||
struct bkey_packed *prev = NULL;
|
||||
struct bkey_packed *out = dst->start;
|
||||
struct bkey l_unpacked, r_unpacked;
|
||||
struct bkey_s l, r;
|
||||
struct btree_nr_keys nr;
|
||||
@ -360,7 +334,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
|
||||
l = __bkey_disassemble(b, _l->k, &l_unpacked);
|
||||
|
||||
if (iter->used == 1) {
|
||||
extent_sort_append(c, f, &nr, dst->start, &prev, l);
|
||||
extent_sort_append(c, f, &nr, &out, l);
|
||||
extent_iter_advance(iter, 0);
|
||||
continue;
|
||||
}
|
||||
@ -369,7 +343,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
|
||||
|
||||
/* If current key and next key don't overlap, just append */
|
||||
if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
|
||||
extent_sort_append(c, f, &nr, dst->start, &prev, l);
|
||||
extent_sort_append(c, f, &nr, &out, l);
|
||||
extent_iter_advance(iter, 0);
|
||||
continue;
|
||||
}
|
||||
@ -414,17 +388,15 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
|
||||
__sort_iter_sift(iter, 0,
|
||||
extent_sort_fix_overlapping_cmp);
|
||||
|
||||
extent_sort_append(c, f, &nr, dst->start,
|
||||
&prev, bkey_i_to_s(split.k));
|
||||
extent_sort_append(c, f, &nr, &out,
|
||||
bkey_i_to_s(split.k));
|
||||
} else {
|
||||
bch2_cut_back_s(bkey_start_pos(r.k), l);
|
||||
extent_save(b, _l->k, l.k);
|
||||
}
|
||||
}
|
||||
|
||||
extent_sort_advance_prev(f, &nr, dst->start, &prev);
|
||||
|
||||
dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
|
||||
dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
|
||||
|
||||
bkey_on_stack_exit(&split, c);
|
||||
return nr;
|
||||
|
@@ -588,6 +588,7 @@ err:
 static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
 struct btree_iter *iter,
 const struct bkey_i *k,
+enum btree_id btree_id,
 unsigned level,
 enum six_lock_type lock_type,
 bool sync)
@@ -600,7 +601,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
 * Parent node must be locked, else we could read in a btree node that's
 * been freed:
 */
-if (!bch2_btree_node_relock(iter, level + 1))
+if (iter && !bch2_btree_node_relock(iter, level + 1))
 return ERR_PTR(-EINTR);

 b = bch2_btree_node_mem_alloc(c);
@@ -608,7 +609,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
 return b;

 bkey_copy(&b->key, k);
-if (bch2_btree_node_hash_insert(bc, b, level, iter->btree_id)) {
+if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
 /* raced with another fill: */

 /* mark as unhashed... */
@@ -628,7 +629,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
 *
 * XXX: ideally should be dropping all btree node locks here
 */
-if (btree_node_read_locked(iter, level + 1))
+if (iter && btree_node_read_locked(iter, level + 1))
 btree_node_unlock(iter, level + 1);

 bch2_btree_node_read(c, b, sync);
@@ -676,7 +677,8 @@ retry:
 * else we could read in a btree node from disk that's been
 * freed:
 */
-b = bch2_btree_node_fill(c, iter, k, level, lock_type, true);
+b = bch2_btree_node_fill(c, iter, k, iter->btree_id,
+level, lock_type, true);

 /* We raced and found the btree node in the cache */
 if (!b)
@ -762,6 +764,74 @@ lock_node:
|
||||
return b;
|
||||
}
|
||||
|
||||
struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
|
||||
const struct bkey_i *k,
|
||||
enum btree_id btree_id,
|
||||
unsigned level)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
struct btree *b;
|
||||
struct bset_tree *t;
|
||||
|
||||
EBUG_ON(level >= BTREE_MAX_DEPTH);
|
||||
|
||||
b = btree_node_mem_ptr(k);
|
||||
if (b)
|
||||
goto lock_node;
|
||||
retry:
|
||||
b = btree_cache_find(bc, k);
|
||||
if (unlikely(!b)) {
|
||||
b = bch2_btree_node_fill(c, NULL, k, btree_id,
|
||||
level, SIX_LOCK_read, true);
|
||||
|
||||
/* We raced and found the btree node in the cache */
|
||||
if (!b)
|
||||
goto retry;
|
||||
|
||||
if (IS_ERR(b))
|
||||
return b;
|
||||
} else {
|
||||
lock_node:
|
||||
six_lock_read(&b->lock);
|
||||
|
||||
if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
|
||||
b->btree_id != btree_id ||
|
||||
b->level != level)) {
|
||||
six_unlock_read(&b->lock);
|
||||
goto retry;
|
||||
}
|
||||
}
|
||||
|
||||
/* XXX: waiting on IO with btree locks held: */
|
||||
wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
|
||||
TASK_UNINTERRUPTIBLE);
|
||||
|
||||
prefetch(b->aux_data);
|
||||
|
||||
for_each_bset(b, t) {
|
||||
void *p = (u64 *) b->aux_data + t->aux_data_offset;
|
||||
|
||||
prefetch(p + L1_CACHE_BYTES * 0);
|
||||
prefetch(p + L1_CACHE_BYTES * 1);
|
||||
prefetch(p + L1_CACHE_BYTES * 2);
|
||||
}
|
||||
|
||||
/* avoid atomic set bit if it's not needed: */
|
||||
if (!btree_node_accessed(b))
|
||||
set_btree_node_accessed(b);
|
||||
|
||||
if (unlikely(btree_node_read_error(b))) {
|
||||
six_unlock_read(&b->lock);
|
||||
return ERR_PTR(-EIO);
|
||||
}
|
||||
|
||||
EBUG_ON(b->btree_id != btree_id ||
|
||||
BTREE_NODE_LEVEL(b->data) != level ||
|
||||
bkey_cmp(b->data->max_key, k->k.p));
|
||||
|
||||
return b;
|
||||
}
|
||||
|
||||
struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
|
||||
struct btree_iter *iter,
|
||||
struct btree *b,
|
||||
@@ -876,7 +946,8 @@ void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
 if (b)
 return;

-bch2_btree_node_fill(c, iter, k, level, SIX_LOCK_read, false);
+bch2_btree_node_fill(c, iter, k, iter->btree_id,
+level, SIX_LOCK_read, false);
 }

 void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
@@ -25,6 +25,9 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
 const struct bkey_i *, unsigned,
 enum six_lock_type);

+struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
+enum btree_id, unsigned);
+
 struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
 struct btree *, enum btree_node_sibling);

@ -184,16 +184,8 @@ fsck_err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool pos_in_journal_keys(struct journal_keys *journal_keys,
|
||||
enum btree_id id, struct bpos pos)
|
||||
{
|
||||
struct journal_key *k = journal_key_search(journal_keys, id, pos);
|
||||
|
||||
return k && k->btree_id == id && !bkey_cmp(k->k->k.p, pos);
|
||||
}
|
||||
|
||||
static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
|
||||
struct journal_keys *journal_keys, bool initial)
|
||||
bool initial)
|
||||
{
|
||||
struct btree_node_iter iter;
|
||||
struct bkey unpacked;
|
||||
@ -207,10 +199,6 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
|
||||
|
||||
for_each_btree_node_key_unpack(b, k, &iter,
|
||||
&unpacked) {
|
||||
if (!b->level && journal_keys &&
|
||||
pos_in_journal_keys(journal_keys, b->btree_id, k.k->p))
|
||||
continue;
|
||||
|
||||
bch2_bkey_debugcheck(c, b, k);
|
||||
|
||||
ret = bch2_gc_mark_key(c, k, max_stale, initial);
|
||||
@ -222,7 +210,6 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
|
||||
}
|
||||
|
||||
static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
|
||||
struct journal_keys *journal_keys,
|
||||
bool initial, bool metadata_only)
|
||||
{
|
||||
struct btree_trans trans;
|
||||
@ -250,8 +237,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
|
||||
|
||||
gc_pos_set(c, gc_pos_btree_node(b));
|
||||
|
||||
ret = btree_gc_mark_node(c, b, &max_stale,
|
||||
journal_keys, initial);
|
||||
ret = btree_gc_mark_node(c, b, &max_stale, initial);
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
@ -287,6 +273,78 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
|
||||
struct journal_keys *journal_keys,
|
||||
unsigned target_depth)
|
||||
{
|
||||
struct btree_and_journal_iter iter;
|
||||
struct bkey_s_c k;
|
||||
u8 max_stale = 0;
|
||||
int ret = 0;
|
||||
|
||||
bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b);
|
||||
|
||||
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
|
||||
bch2_bkey_debugcheck(c, b, k);
|
||||
|
||||
ret = bch2_gc_mark_key(c, k, &max_stale, true);
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
if (b->level > target_depth) {
|
||||
struct btree *child;
|
||||
BKEY_PADDED(k) tmp;
|
||||
|
||||
bkey_reassemble(&tmp.k, k);
|
||||
|
||||
child = bch2_btree_node_get_noiter(c, &tmp.k,
|
||||
b->btree_id, b->level - 1);
|
||||
ret = PTR_ERR_OR_ZERO(child);
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
bch2_gc_btree_init_recurse(c, child,
|
||||
journal_keys, target_depth);
|
||||
six_unlock_read(&child->lock);
|
||||
}
|
||||
|
||||
bch2_btree_and_journal_iter_advance(&iter);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int bch2_gc_btree_init(struct bch_fs *c,
|
||||
struct journal_keys *journal_keys,
|
||||
enum btree_id btree_id,
|
||||
bool metadata_only)
|
||||
{
|
||||
struct btree *b;
|
||||
unsigned target_depth = metadata_only ? 1
|
||||
: expensive_debug_checks(c) ? 0
|
||||
: !btree_node_type_needs_gc(btree_id) ? 1
|
||||
: 0;
|
||||
u8 max_stale = 0;
|
||||
int ret = 0;
|
||||
|
||||
b = c->btree_roots[btree_id].b;
|
||||
|
||||
if (btree_node_fake(b))
|
||||
return 0;
|
||||
|
||||
six_lock_read(&b->lock);
|
||||
if (b->level >= target_depth)
|
||||
ret = bch2_gc_btree_init_recurse(c, b,
|
||||
journal_keys, target_depth);
|
||||
|
||||
if (!ret)
|
||||
ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
|
||||
&max_stale, true);
|
||||
six_unlock_read(&b->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
|
||||
{
|
||||
return (int) btree_id_to_gc_phase(l) -
|
||||
@ -305,27 +363,12 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
|
||||
|
||||
for (i = 0; i < BTREE_ID_NR; i++) {
|
||||
enum btree_id id = ids[i];
|
||||
enum btree_node_type type = __btree_node_type(0, id);
|
||||
|
||||
int ret = bch2_gc_btree(c, id, journal_keys,
|
||||
initial, metadata_only);
|
||||
int ret = initial
|
||||
? bch2_gc_btree_init(c, journal_keys,
|
||||
id, metadata_only)
|
||||
: bch2_gc_btree(c, id, initial, metadata_only);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (journal_keys && !metadata_only &&
|
||||
btree_node_type_needs_gc(type)) {
|
||||
struct journal_key *j;
|
||||
u8 max_stale;
|
||||
int ret;
|
||||
|
||||
for_each_journal_key(*journal_keys, j)
|
||||
if (j->btree_id == id) {
|
||||
ret = bch2_gc_mark_key(c, bkey_i_to_s_c(j->k),
|
||||
&max_stale, initial);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -1261,7 +1261,6 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
|
||||
closure_put(&((struct btree_update *) new)->cl);
|
||||
|
||||
bch2_journal_pin_drop(&c->journal, &w->journal);
|
||||
closure_wake_up(&w->wait);
|
||||
}
|
||||
|
||||
static void btree_node_write_done(struct bch_fs *c, struct btree *b)
|
||||
@ -1618,9 +1617,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
|
||||
wbio->wbio.bio.bi_end_io = btree_node_write_endio;
|
||||
wbio->wbio.bio.bi_private = b;
|
||||
|
||||
if (b->level || !b->written)
|
||||
wbio->wbio.bio.bi_opf |= REQ_FUA;
|
||||
|
||||
bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
|
||||
|
||||
/*
|
||||
@ -1794,12 +1790,11 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
|
||||
rcu_read_lock();
|
||||
for_each_cached_btree(b, c, tbl, i, pos) {
|
||||
unsigned long flags = READ_ONCE(b->flags);
|
||||
unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0;
|
||||
|
||||
if (!(flags & (1 << BTREE_NODE_dirty)))
|
||||
continue;
|
||||
|
||||
pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu c %u p %u\n",
|
||||
pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu\n",
|
||||
b,
|
||||
(flags & (1 << BTREE_NODE_dirty)) != 0,
|
||||
(flags & (1 << BTREE_NODE_need_write)) != 0,
|
||||
@ -1807,9 +1802,7 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
|
||||
b->written,
|
||||
!list_empty_careful(&b->write_blocked),
|
||||
b->will_make_reachable != 0,
|
||||
b->will_make_reachable & 1,
|
||||
b->writes[ idx].wait.list.first != NULL,
|
||||
b->writes[!idx].wait.list.first != NULL);
|
||||
b->will_make_reachable & 1);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
|
@ -102,19 +102,20 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
|
||||
void bch2_btree_node_write(struct bch_fs *, struct btree *,
|
||||
enum six_lock_type);
|
||||
|
||||
static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b)
|
||||
static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
|
||||
enum six_lock_type lock_held)
|
||||
{
|
||||
while (b->written &&
|
||||
btree_node_need_write(b) &&
|
||||
btree_node_may_write(b)) {
|
||||
if (!btree_node_write_in_flight(b)) {
|
||||
bch2_btree_node_write(c, b, SIX_LOCK_read);
|
||||
bch2_btree_node_write(c, b, lock_held);
|
||||
break;
|
||||
}
|
||||
|
||||
six_unlock_read(&b->lock);
|
||||
btree_node_wait_on_io(b);
|
||||
btree_node_lock_type(c, b, SIX_LOCK_read);
|
||||
btree_node_lock_type(c, b, lock_held);
|
||||
}
|
||||
}
|
||||
|
||||
@ -131,7 +132,7 @@ do { \
|
||||
new |= (1 << BTREE_NODE_need_write); \
|
||||
} while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \
|
||||
\
|
||||
btree_node_write_if_need(_c, _b); \
|
||||
btree_node_write_if_need(_c, _b, SIX_LOCK_read); \
|
||||
} while (0)
|
||||
|
||||
void bch2_btree_flush_all_reads(struct bch_fs *);
|
||||
|
@ -1068,7 +1068,14 @@ retry_all:
|
||||
goto retry_all;
|
||||
}
|
||||
|
||||
ret = hweight64(trans->iters_live) > 1 ? -EINTR : 0;
|
||||
if (hweight64(trans->iters_live) > 1)
|
||||
ret = -EINTR;
|
||||
else
|
||||
trans_for_each_iter(trans, iter)
|
||||
if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) {
|
||||
ret = -EINTR;
|
||||
break;
|
||||
}
|
||||
out:
|
||||
bch2_btree_cache_cannibalize_unlock(c);
|
||||
return ret;
|
||||
|
@ -53,7 +53,6 @@ struct bset_tree {
|
||||
|
||||
struct btree_write {
|
||||
struct journal_entry_pin journal;
|
||||
struct closure_waitlist wait;
|
||||
};
|
||||
|
||||
struct btree_alloc {
|
||||
@ -261,6 +260,11 @@ static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter)
|
||||
return iter->flags & BTREE_ITER_TYPE;
|
||||
}
|
||||
|
||||
static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
|
||||
{
|
||||
return iter->l + iter->level;
|
||||
}
|
||||
|
||||
struct btree_insert_entry {
|
||||
unsigned trigger_flags;
|
||||
unsigned trans_triggers_run:1;
|
||||
@ -539,8 +543,6 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type)
|
||||
struct btree_root {
|
||||
struct btree *b;
|
||||
|
||||
struct btree_update *as;
|
||||
|
||||
/* On disk root - see async splits: */
|
||||
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
|
||||
u8 level;
|
||||
|
@ -12,6 +12,7 @@ void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *,
|
||||
struct btree_iter *);
|
||||
bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
|
||||
struct btree_node_iter *, struct bkey_i *);
|
||||
void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
|
||||
|
||||
enum btree_insert_flags {
|
||||
__BTREE_INSERT_NOUNLOCK,
|
||||
|
@ -24,7 +24,6 @@
|
||||
static void btree_node_will_make_reachable(struct btree_update *,
|
||||
struct btree *);
|
||||
static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
|
||||
static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int);
|
||||
|
||||
/* Debug code: */
|
||||
|
||||
@ -260,16 +259,17 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
|
||||
}
|
||||
|
||||
static void bch2_btree_node_free_ondisk(struct bch_fs *c,
|
||||
struct pending_btree_node_free *pending)
|
||||
struct pending_btree_node_free *pending,
|
||||
u64 journal_seq)
|
||||
{
|
||||
BUG_ON(!pending->index_update_done);
|
||||
|
||||
bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
|
||||
0, 0, NULL, 0, BTREE_TRIGGER_OVERWRITE);
|
||||
0, 0, NULL, journal_seq, BTREE_TRIGGER_OVERWRITE);
|
||||
|
||||
if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE)))
|
||||
bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
|
||||
0, 0, NULL, 0,
|
||||
0, 0, NULL, journal_seq,
|
||||
BTREE_TRIGGER_OVERWRITE|
|
||||
BTREE_TRIGGER_GC);
|
||||
}
|
||||
@ -585,10 +585,13 @@ static void bch2_btree_update_free(struct btree_update *as)
|
||||
{
|
||||
struct bch_fs *c = as->c;
|
||||
|
||||
bch2_journal_preres_put(&c->journal, &as->journal_preres);
|
||||
|
||||
bch2_journal_pin_drop(&c->journal, &as->journal);
|
||||
bch2_journal_pin_flush(&c->journal, &as->journal);
|
||||
|
||||
BUG_ON(as->nr_new_nodes);
|
||||
BUG_ON(as->nr_pending);
|
||||
BUG_ON((as->nr_new_nodes || as->nr_pending) &&
|
||||
!bch2_journal_error(&c->journal));;
|
||||
|
||||
if (as->reserve)
|
||||
bch2_btree_reserve_put(c, as->reserve);
|
||||
@ -603,13 +606,10 @@ static void bch2_btree_update_free(struct btree_update *as)
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
}
|
||||
|
||||
static void btree_update_nodes_reachable(struct closure *cl)
|
||||
static void btree_update_nodes_reachable(struct btree_update *as, u64 seq)
|
||||
{
|
||||
struct btree_update *as = container_of(cl, struct btree_update, cl);
|
||||
struct bch_fs *c = as->c;
|
||||
|
||||
bch2_journal_pin_drop(&c->journal, &as->journal);
|
||||
|
||||
mutex_lock(&c->btree_interior_update_lock);
|
||||
|
||||
while (as->nr_new_nodes) {
|
||||
@ -630,39 +630,22 @@ static void btree_update_nodes_reachable(struct closure *cl)
|
||||
}
|
||||
|
||||
while (as->nr_pending)
|
||||
bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]);
|
||||
bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending],
|
||||
seq);
|
||||
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
|
||||
closure_wake_up(&as->wait);
|
||||
|
||||
bch2_btree_update_free(as);
|
||||
}
|
||||
|
||||
static void btree_update_wait_on_journal(struct closure *cl)
|
||||
{
|
||||
struct btree_update *as = container_of(cl, struct btree_update, cl);
|
||||
struct bch_fs *c = as->c;
|
||||
int ret;
|
||||
|
||||
ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
|
||||
if (ret == -EAGAIN) {
|
||||
continue_at(cl, btree_update_wait_on_journal, system_wq);
|
||||
return;
|
||||
}
|
||||
if (ret < 0)
|
||||
goto err;
|
||||
|
||||
bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
|
||||
err:
|
||||
continue_at(cl, btree_update_nodes_reachable, system_wq);
|
||||
}
|
||||
|
||||
static void btree_update_nodes_written(struct closure *cl)
|
||||
{
|
||||
struct btree_update *as = container_of(cl, struct btree_update, cl);
|
||||
struct journal_res res = { 0 };
|
||||
struct bch_fs *c = as->c;
|
||||
struct btree *b;
|
||||
struct bset *i;
|
||||
struct bkey_i *k;
|
||||
unsigned journal_u64s = 0;
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* We did an update to a parent node where the pointers we added pointed
|
||||
@ -671,7 +654,7 @@ static void btree_update_nodes_written(struct closure *cl)
|
||||
*/
|
||||
mutex_lock(&c->btree_interior_update_lock);
|
||||
as->nodes_written = true;
|
||||
retry:
|
||||
again:
|
||||
as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
|
||||
struct btree_update, unwritten_list);
|
||||
if (!as || !as->nodes_written) {
|
||||
@ -679,31 +662,53 @@ retry:
|
||||
return;
|
||||
}
|
||||
|
||||
b = as->b;
|
||||
if (b && !six_trylock_intent(&b->lock)) {
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
btree_node_lock_type(c, b, SIX_LOCK_intent);
|
||||
six_unlock_intent(&b->lock);
|
||||
goto out;
|
||||
}
|
||||
|
||||
journal_u64s = 0;
|
||||
|
||||
if (as->mode != BTREE_INTERIOR_UPDATING_ROOT)
|
||||
for_each_keylist_key(&as->parent_keys, k)
|
||||
journal_u64s += jset_u64s(k->k.u64s);
|
||||
|
||||
ret = bch2_journal_res_get(&c->journal, &res, journal_u64s,
|
||||
JOURNAL_RES_GET_RESERVED);
|
||||
if (ret) {
|
||||
BUG_ON(!bch2_journal_error(&c->journal));
|
||||
/* can't unblock btree writes */
|
||||
goto free_update;
|
||||
}
|
||||
|
||||
if (as->mode != BTREE_INTERIOR_UPDATING_ROOT)
|
||||
for_each_keylist_key(&as->parent_keys, k)
|
||||
bch2_journal_add_entry(&c->journal, &res,
|
||||
BCH_JSET_ENTRY_btree_keys,
|
||||
as->btree_id,
|
||||
as->level,
|
||||
k, k->k.u64s);
|
||||
|
||||
switch (as->mode) {
|
||||
case BTREE_INTERIOR_NO_UPDATE:
|
||||
BUG();
|
||||
case BTREE_INTERIOR_UPDATING_NODE:
|
||||
/* The usual case: */
|
||||
b = READ_ONCE(as->b);
|
||||
|
||||
if (!six_trylock_read(&b->lock)) {
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
btree_node_lock_type(c, b, SIX_LOCK_read);
|
||||
six_unlock_read(&b->lock);
|
||||
mutex_lock(&c->btree_interior_update_lock);
|
||||
goto retry;
|
||||
}
|
||||
|
||||
BUG_ON(!btree_node_dirty(b));
|
||||
closure_wait(&btree_current_write(b)->wait, &as->cl);
|
||||
/* @b is the node we did the final insert into: */
|
||||
BUG_ON(!res.ref);
|
||||
|
||||
six_lock_write(&b->lock);
|
||||
list_del(&as->write_blocked_list);
|
||||
|
||||
/*
|
||||
* for flush_held_btree_writes() waiting on updates to flush or
|
||||
* nodes to be writeable:
|
||||
*/
|
||||
closure_wake_up(&c->btree_interior_update_wait);
|
||||
i = btree_bset_last(b);
|
||||
i->journal_seq = cpu_to_le64(
|
||||
max(res.seq,
|
||||
le64_to_cpu(i->journal_seq)));
|
||||
|
||||
bch2_btree_add_journal_pin(c, b, res.seq);
|
||||
six_unlock_write(&b->lock);
|
||||
|
||||
list_del(&as->unwritten_list);
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
@ -712,82 +717,51 @@ retry:
|
||||
* b->write_blocked prevented it from being written, so
|
||||
* write it now if it needs to be written:
|
||||
*/
|
||||
bch2_btree_node_write_cond(c, b, true);
|
||||
six_unlock_read(&b->lock);
|
||||
continue_at(&as->cl, btree_update_nodes_reachable, system_wq);
|
||||
btree_node_write_if_need(c, b, SIX_LOCK_intent);
|
||||
six_unlock_intent(&b->lock);
|
||||
break;
|
||||
|
||||
case BTREE_INTERIOR_UPDATING_AS:
|
||||
/*
|
||||
* The btree node we originally updated has been freed and is
|
||||
* being rewritten - so we need to write anything here, we just
|
||||
* need to signal to that btree_update that it's ok to make the
|
||||
* new replacement node visible:
|
||||
*/
|
||||
closure_put(&as->parent_as->cl);
|
||||
|
||||
/*
|
||||
* and then we have to wait on that btree_update to finish:
|
||||
*/
|
||||
closure_wait(&as->parent_as->wait, &as->cl);
|
||||
BUG_ON(b);
|
||||
|
||||
list_del(&as->unwritten_list);
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
|
||||
continue_at(&as->cl, btree_update_nodes_reachable, system_wq);
|
||||
break;
|
||||
|
||||
case BTREE_INTERIOR_UPDATING_ROOT:
|
||||
/* b is the new btree root: */
|
||||
b = READ_ONCE(as->b);
|
||||
case BTREE_INTERIOR_UPDATING_ROOT: {
|
||||
struct btree_root *r = &c->btree_roots[as->btree_id];
|
||||
|
||||
if (!six_trylock_read(&b->lock)) {
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
btree_node_lock_type(c, b, SIX_LOCK_read);
|
||||
six_unlock_read(&b->lock);
|
||||
mutex_lock(&c->btree_interior_update_lock);
|
||||
goto retry;
|
||||
}
|
||||
BUG_ON(b);
|
||||
|
||||
BUG_ON(c->btree_roots[b->btree_id].as != as);
|
||||
c->btree_roots[b->btree_id].as = NULL;
|
||||
|
||||
bch2_btree_set_root_ondisk(c, b, WRITE);
|
||||
|
||||
/*
|
||||
* We don't have to wait anything anything here (before
|
||||
* btree_update_nodes_reachable frees the old nodes
|
||||
* ondisk) - we've ensured that the very next journal write will
|
||||
* have the pointer to the new root, and before the allocator
|
||||
* can reuse the old nodes it'll have to do a journal commit:
|
||||
*/
|
||||
six_unlock_read(&b->lock);
|
||||
mutex_lock(&c->btree_root_lock);
|
||||
bkey_copy(&r->key, as->parent_keys.keys);
|
||||
r->level = as->level;
|
||||
r->alive = true;
|
||||
c->btree_roots_dirty = true;
|
||||
mutex_unlock(&c->btree_root_lock);
|
||||
|
||||
list_del(&as->unwritten_list);
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
|
||||
/*
|
||||
* Bit of funny circularity going on here we have to break:
|
||||
*
|
||||
* We have to drop our journal pin before writing the journal
|
||||
* entry that points to the new btree root: else, we could
|
||||
* deadlock if the journal currently happens to be full.
|
||||
*
|
||||
* This mean we're dropping the journal pin _before_ the new
|
||||
* nodes are technically reachable - but this is safe, because
|
||||
* after the bch2_btree_set_root_ondisk() call above they will
|
||||
* be reachable as of the very next journal write:
|
||||
*/
|
||||
bch2_journal_pin_drop(&c->journal, &as->journal);
|
||||
|
||||
as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal);
|
||||
|
||||
btree_update_wait_on_journal(&as->cl);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
bch2_journal_pin_drop(&c->journal, &as->journal);
|
||||
|
||||
bch2_journal_res_put(&c->journal, &res);
|
||||
bch2_journal_preres_put(&c->journal, &as->journal_preres);
|
||||
|
||||
btree_update_nodes_reachable(as, res.seq);
|
||||
free_update:
|
||||
bch2_btree_update_free(as);
|
||||
/*
|
||||
* for flush_held_btree_writes() waiting on updates to flush or
|
||||
* nodes to be writeable:
|
||||
*/
|
||||
closure_wake_up(&c->btree_interior_update_wait);
|
||||
out:
|
||||
mutex_lock(&c->btree_interior_update_lock);
|
||||
goto retry;
|
||||
goto again;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -804,48 +778,12 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
|
||||
BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
|
||||
BUG_ON(!btree_node_dirty(b));
|
||||
|
||||
as->mode = BTREE_INTERIOR_UPDATING_NODE;
|
||||
as->b = b;
|
||||
as->mode = BTREE_INTERIOR_UPDATING_NODE;
|
||||
as->b = b;
|
||||
as->level = b->level;
|
||||
list_add(&as->write_blocked_list, &b->write_blocked);
|
||||
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
|
||||
/*
|
||||
* In general, when you're staging things in a journal that will later
|
||||
* be written elsewhere, and you also want to guarantee ordering: that
|
||||
* is, if you have updates a, b, c, after a crash you should never see c
|
||||
* and not a or b - there's a problem:
|
||||
*
|
||||
* If the final destination of the update(s) (i.e. btree node) can be
|
||||
* written/flushed _before_ the relevant journal entry - oops, that
|
||||
* breaks ordering, since the various leaf nodes can be written in any
|
||||
* order.
|
||||
*
|
||||
* Normally we use bset->journal_seq to deal with this - if during
|
||||
* recovery we find a btree node write that's newer than the newest
|
||||
* journal entry, we just ignore it - we don't need it, anything we're
|
||||
* supposed to have (that we reported as completed via fsync()) will
|
||||
* still be in the journal, and as far as the state of the journal is
|
||||
* concerned that btree node write never happened.
|
||||
*
|
||||
* That breaks when we're rewriting/splitting/merging nodes, since we're
|
||||
* mixing btree node writes that haven't happened yet with previously
|
||||
* written data that has been reported as completed to the journal.
|
||||
*
|
||||
* Thus, before making the new nodes reachable, we have to wait the
|
||||
* newest journal sequence number we have data for to be written (if it
|
||||
* hasn't been yet).
|
||||
*/
|
||||
bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
|
||||
}
|
||||
|
||||
static void interior_update_flush(struct journal *j,
|
||||
struct journal_entry_pin *pin, u64 seq)
|
||||
{
|
||||
struct btree_update *as =
|
||||
container_of(pin, struct btree_update, journal);
|
||||
|
||||
bch2_journal_flush_seq_async(j, as->journal_seq, NULL);
|
||||
}
|
||||
|
||||
static void btree_update_reparent(struct btree_update *as,
|
||||
@ -853,10 +791,10 @@ static void btree_update_reparent(struct btree_update *as,
|
||||
{
|
||||
struct bch_fs *c = as->c;
|
||||
|
||||
lockdep_assert_held(&c->btree_interior_update_lock);
|
||||
|
||||
child->b = NULL;
|
||||
child->mode = BTREE_INTERIOR_UPDATING_AS;
|
||||
child->parent_as = as;
|
||||
closure_get(&as->cl);
|
||||
|
||||
/*
|
||||
* When we write a new btree root, we have to drop our journal pin
|
||||
@ -867,46 +805,24 @@ static void btree_update_reparent(struct btree_update *as,
|
||||
* just transfer the journal pin to the new interior update so
|
||||
* btree_update_nodes_written() can drop it.
|
||||
*/
|
||||
bch2_journal_pin_copy(&c->journal, &as->journal,
|
||||
&child->journal, interior_update_flush);
|
||||
bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
|
||||
bch2_journal_pin_drop(&c->journal, &child->journal);
|
||||
|
||||
as->journal_seq = max(as->journal_seq, child->journal_seq);
|
||||
}
|
||||
|
||||
static void btree_update_updated_root(struct btree_update *as)
|
||||
static void btree_update_updated_root(struct btree_update *as, struct btree *b)
|
||||
{
|
||||
struct bch_fs *c = as->c;
|
||||
struct btree_root *r = &c->btree_roots[as->btree_id];
|
||||
|
||||
BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
|
||||
BUG_ON(!bch2_keylist_empty(&as->parent_keys));
|
||||
|
||||
mutex_lock(&c->btree_interior_update_lock);
|
||||
list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
|
||||
|
||||
BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
|
||||
|
||||
/*
|
||||
* Old root might not be persistent yet - if so, redirect its
|
||||
* btree_update operation to point to us:
|
||||
*/
|
||||
if (r->as)
|
||||
btree_update_reparent(as, r->as);
|
||||
|
||||
as->mode = BTREE_INTERIOR_UPDATING_ROOT;
|
||||
as->b = r->b;
|
||||
r->as = as;
|
||||
|
||||
as->mode = BTREE_INTERIOR_UPDATING_ROOT;
|
||||
as->level = b->level;
|
||||
bch2_keylist_add(&as->parent_keys, &b->key);
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
|
||||
/*
|
||||
* When we're rewriting nodes and updating interior nodes, there's an
|
||||
* issue with updates that haven't been written in the journal getting
|
||||
* mixed together with older data - see btree_update_updated_node()
|
||||
* for the explanation.
|
||||
*
|
||||
* However, this doesn't affect us when we're writing a new btree root -
|
||||
* because to make that new root reachable we have to write out a new
|
||||
* journal entry, which must necessarily be newer than as->journal_seq.
|
||||
*/
|
||||
}
|
||||
|
||||
static void btree_node_will_make_reachable(struct btree_update *as,
|
||||
@ -983,10 +899,8 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
|
||||
struct btree *b)
|
||||
{
|
||||
struct bch_fs *c = as->c;
|
||||
struct closure *cl, *cl_n;
|
||||
struct btree_update *p, *n;
|
||||
struct btree_write *w;
|
||||
struct bset_tree *t;
|
||||
|
||||
set_btree_node_dying(b);
|
||||
|
||||
@ -995,18 +909,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
|
||||
|
||||
btree_interior_update_add_node_reference(as, b);
|
||||
|
||||
/*
|
||||
* Does this node have data that hasn't been written in the journal?
|
||||
*
|
||||
* If so, we have to wait for the corresponding journal entry to be
|
||||
* written before making the new nodes reachable - we can't just carry
|
||||
* over the bset->journal_seq tracking, since we'll be mixing those keys
|
||||
* in with keys that aren't in the journal anymore:
|
||||
*/
|
||||
for_each_bset(b, t)
|
||||
as->journal_seq = max(as->journal_seq,
|
||||
le64_to_cpu(bset(b, t)->journal_seq));
|
||||
|
||||
mutex_lock(&c->btree_interior_update_lock);
|
||||
|
||||
/*
|
||||
@ -1030,16 +932,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
|
||||
|
||||
clear_btree_node_dirty(b);
|
||||
clear_btree_node_need_write(b);
|
||||
w = btree_current_write(b);
|
||||
|
||||
/*
|
||||
* Does this node have any btree_update operations waiting on this node
|
||||
* to be written?
|
||||
*
|
||||
* If so, wake them up when this btree_update operation is reachable:
|
||||
*/
|
||||
llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list)
|
||||
llist_add(&cl->list, &as->wait.list);
|
||||
|
||||
/*
|
||||
* Does this node have unwritten data that has a pin on the journal?
|
||||
@ -1049,13 +941,12 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
|
||||
* oldest pin of any of the nodes we're freeing. We'll release the pin
|
||||
* when the new nodes are persistent and reachable on disk:
|
||||
*/
|
||||
bch2_journal_pin_copy(&c->journal, &as->journal,
|
||||
&w->journal, interior_update_flush);
|
||||
w = btree_current_write(b);
|
||||
bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
|
||||
bch2_journal_pin_drop(&c->journal, &w->journal);
|
||||
|
||||
w = btree_prev_write(b);
|
||||
bch2_journal_pin_copy(&c->journal, &as->journal,
|
||||
&w->journal, interior_update_flush);
|
||||
bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
|
||||
bch2_journal_pin_drop(&c->journal, &w->journal);
|
||||
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
@ -1078,6 +969,7 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
|
||||
{
|
||||
struct btree_reserve *reserve;
|
||||
struct btree_update *as;
|
||||
int ret;
|
||||
|
||||
reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl);
|
||||
if (IS_ERR(reserve))
|
||||
@ -1094,6 +986,15 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
|
||||
|
||||
bch2_keylist_init(&as->parent_keys, as->inline_keys);
|
||||
|
||||
ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
|
||||
jset_u64s(BKEY_BTREE_PTR_U64s_MAX) * 3, 0);
|
||||
if (ret) {
|
||||
bch2_btree_reserve_put(c, reserve);
|
||||
closure_debug_destroy(&as->cl);
|
||||
mempool_free(as, &c->btree_interior_update_pool);
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
mutex_lock(&c->btree_interior_update_lock);
|
||||
list_add_tail(&as->list, &c->btree_interior_update_list);
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
@ -1153,22 +1054,6 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
}
|
||||
|
||||
static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
|
||||
{
|
||||
struct btree_root *r = &c->btree_roots[b->btree_id];
|
||||
|
||||
mutex_lock(&c->btree_root_lock);
|
||||
|
||||
BUG_ON(b != r->b);
|
||||
bkey_copy(&r->key, &b->key);
|
||||
r->level = b->level;
|
||||
r->alive = true;
|
||||
if (rw == WRITE)
|
||||
c->btree_roots_dirty = true;
|
||||
|
||||
mutex_unlock(&c->btree_root_lock);
|
||||
}
|
||||
|
||||
/**
|
||||
* bch_btree_set_root - update the root in memory and on disk
|
||||
*
|
||||
@ -1201,7 +1086,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
|
||||
|
||||
bch2_btree_set_root_inmem(as, b);
|
||||
|
||||
btree_update_updated_root(as);
|
||||
btree_update_updated_root(as, b);
|
||||
|
||||
/*
|
||||
* Unlock old root after new root is visible:
|
||||
@ -1471,7 +1356,8 @@ static void btree_split(struct btree_update *as, struct btree *b,
|
||||
bch2_btree_build_aux_trees(n1);
|
||||
six_unlock_write(&n1->lock);
|
||||
|
||||
bch2_keylist_add(&as->parent_keys, &n1->key);
|
||||
if (parent)
|
||||
bch2_keylist_add(&as->parent_keys, &n1->key);
|
||||
}
|
||||
|
||||
bch2_btree_node_write(c, n1, SIX_LOCK_intent);
|
||||
@ -1545,12 +1431,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
|
||||
(bkey_cmp_packed(b, k, &insert->k) >= 0))
|
||||
;
|
||||
|
||||
while (!bch2_keylist_empty(keys)) {
|
||||
insert = bch2_keylist_front(keys);
|
||||
|
||||
for_each_keylist_key(keys, insert)
|
||||
bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter);
|
||||
bch2_keylist_pop_front(keys);
|
||||
}
|
||||
|
||||
btree_update_updated_node(as, b);
|
||||
|
||||
@ -1630,7 +1512,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
|
||||
unsigned flags)
|
||||
{
|
||||
struct btree_trans *trans = iter->trans;
|
||||
struct btree *b = iter->l[0].b;
|
||||
struct btree *b = iter_l(iter)->b;
|
||||
struct btree_update *as;
|
||||
struct closure cl;
|
||||
int ret = 0;
|
||||
@ -2107,7 +1989,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
|
||||
bkey_copy(&b->key, new_key);
|
||||
}
|
||||
|
||||
btree_update_updated_root(as);
|
||||
btree_update_updated_root(as, b);
|
||||
bch2_btree_node_unlock_write(b, iter);
|
||||
}
|
||||
|
||||
|
@ -69,8 +69,10 @@ struct btree_update {
|
||||
unsigned nodes_written:1;
|
||||
|
||||
enum btree_id btree_id;
|
||||
u8 level;
|
||||
|
||||
struct btree_reserve *reserve;
|
||||
struct journal_preres journal_preres;
|
||||
|
||||
/*
|
||||
* BTREE_INTERIOR_UPDATING_NODE:
|
||||
@ -83,18 +85,6 @@ struct btree_update {
|
||||
struct btree *b;
|
||||
struct list_head write_blocked_list;
|
||||
|
||||
/*
|
||||
* BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now
|
||||
* we're now blocking another btree_update
|
||||
* @parent_as - btree_update that's waiting on our nodes to finish
|
||||
* writing, before it can make new nodes visible on disk
|
||||
* @wait - list of child btree_updates that are waiting on this
|
||||
* btree_update to make all the new nodes visible before they can free
|
||||
* their old btree nodes
|
||||
*/
|
||||
struct btree_update *parent_as;
|
||||
struct closure_waitlist wait;
|
||||
|
||||
/*
|
||||
* We may be freeing nodes that were dirty, and thus had journal entries
|
||||
* pinned: we need to transfer the oldest of those pins to the
|
||||
@ -103,8 +93,6 @@ struct btree_update {
|
||||
*/
|
||||
struct journal_entry_pin journal;
|
||||
|
||||
u64 journal_seq;
|
||||
|
||||
/*
|
||||
* Nodes being freed:
|
||||
* Protected by c->btree_node_pending_free_lock
|
||||
|
@ -24,7 +24,7 @@ static inline bool same_leaf_as_prev(struct btree_trans *trans,
|
||||
struct btree_insert_entry *i)
|
||||
{
|
||||
return i != trans->updates2 &&
|
||||
i[0].iter->l[0].b == i[-1].iter->l[0].b;
|
||||
iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b;
|
||||
}
|
||||
|
||||
inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
|
||||
@ -145,6 +145,17 @@ static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin,
|
||||
return __btree_node_flush(j, pin, 1, seq);
|
||||
}
|
||||
|
||||
inline void bch2_btree_add_journal_pin(struct bch_fs *c,
|
||||
struct btree *b, u64 seq)
|
||||
{
|
||||
struct btree_write *w = btree_current_write(b);
|
||||
|
||||
bch2_journal_pin_add(&c->journal, seq, &w->journal,
|
||||
btree_node_write_idx(b) == 0
|
||||
? btree_node_flush0
|
||||
: btree_node_flush1);
|
||||
}
|
||||
|
||||
static inline void __btree_journal_key(struct btree_trans *trans,
|
||||
enum btree_id btree_id,
|
||||
struct bkey_i *insert)
|
||||
@ -172,13 +183,8 @@ static void bch2_btree_journal_key(struct btree_trans *trans,
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct journal *j = &c->journal;
|
||||
struct btree *b = iter->l[0].b;
|
||||
struct btree_write *w = btree_current_write(b);
|
||||
u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
|
||||
? trans->journal_res.seq
|
||||
: j->replay_journal_seq;
|
||||
struct btree *b = iter_l(iter)->b;
|
||||
|
||||
EBUG_ON(iter->level || b->level);
|
||||
EBUG_ON(trans->journal_res.ref !=
|
||||
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
|
||||
|
||||
@ -188,10 +194,10 @@ static void bch2_btree_journal_key(struct btree_trans *trans,
|
||||
cpu_to_le64(trans->journal_res.seq);
|
||||
}
|
||||
|
||||
bch2_journal_pin_add(j, seq, &w->journal,
|
||||
btree_node_write_idx(b) == 0
|
||||
? btree_node_flush0
|
||||
: btree_node_flush1);
|
||||
bch2_btree_add_journal_pin(c, b,
|
||||
likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
|
||||
? trans->journal_res.seq
|
||||
: j->replay_journal_seq);
|
||||
|
||||
if (unlikely(!btree_node_dirty(b)))
|
||||
set_btree_node_dirty(b);
|
||||
@ -205,17 +211,15 @@ static void btree_insert_key_leaf(struct btree_trans *trans,
|
||||
struct bkey_i *insert)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct btree *b = iter->l[0].b;
|
||||
struct btree *b = iter_l(iter)->b;
|
||||
struct bset_tree *t = bset_tree_last(b);
|
||||
int old_u64s = bset_u64s(t);
|
||||
int old_live_u64s = b->nr.live_u64s;
|
||||
int live_u64s_added, u64s_added;
|
||||
|
||||
EBUG_ON(iter->level);
|
||||
|
||||
insert->k.needs_whiteout = false;
|
||||
|
||||
if (likely(bch2_btree_bset_insert_key(iter, b, &iter->l[0].iter, insert)))
|
||||
if (likely(bch2_btree_bset_insert_key(iter, b, &iter_l(iter)->iter, insert)))
|
||||
bch2_btree_journal_key(trans, iter, insert);
|
||||
|
||||
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
|
||||
@ -241,7 +245,6 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
|
||||
BUG_ON(iter->level);
|
||||
BUG_ON(bkey_cmp(insert->k.p, iter->pos));
|
||||
BUG_ON(debug_check_bkeys(c) &&
|
||||
bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id));
|
||||
@ -290,7 +293,7 @@ btree_key_can_insert(struct btree_trans *trans,
|
||||
unsigned *u64s)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct btree *b = iter->l[0].b;
|
||||
struct btree *b = iter_l(iter)->b;
|
||||
static enum btree_insert_ret ret;
|
||||
|
||||
if (unlikely(btree_node_fake(b)))
|
||||
@ -345,7 +348,7 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
|
||||
struct btree_insert_entry *i;
|
||||
|
||||
trans_for_each_update(trans, i)
|
||||
if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
|
||||
if (gc_visited(c, gc_pos_btree_node(iter_l(i->iter)->b)))
|
||||
bch2_mark_update(trans, i->iter, i->k, NULL,
|
||||
i->trigger_flags|BTREE_TRIGGER_GC);
|
||||
}
|
||||
@ -461,7 +464,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
|
||||
int ret;
|
||||
|
||||
trans_for_each_update2(trans, i)
|
||||
BUG_ON(!btree_node_intent_locked(i->iter, 0));
|
||||
BUG_ON(!btree_node_intent_locked(i->iter, i->iter->level));
|
||||
|
||||
ret = bch2_journal_preres_get(&trans->c->journal,
|
||||
&trans->journal_preres, trans->journal_preres_u64s,
|
||||
@ -495,13 +498,13 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
|
||||
trans_for_each_update2(trans, i)
|
||||
if (!same_leaf_as_prev(trans, i))
|
||||
bch2_btree_node_lock_for_insert(trans->c,
|
||||
i->iter->l[0].b, i->iter);
|
||||
iter_l(i->iter)->b, i->iter);
|
||||
|
||||
ret = bch2_trans_commit_write_locked(trans, stopped_at);
|
||||
|
||||
trans_for_each_update2(trans, i)
|
||||
if (!same_leaf_as_prev(trans, i))
|
||||
bch2_btree_node_unlock_write_inlined(i->iter->l[0].b,
|
||||
bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b,
|
||||
i->iter);
|
||||
|
||||
/*
|
||||
|
@ -44,6 +44,10 @@ static int count_iters_for_insert(struct btree_trans *trans,
|
||||
* extent we're inserting and overwriting:
|
||||
*/
|
||||
*nr_iters += 1;
|
||||
if (*nr_iters >= max_iters) {
|
||||
*end = bpos_min(*end, k.k->p);
|
||||
ret = 1;
|
||||
}
|
||||
|
||||
switch (k.k->type) {
|
||||
case KEY_TYPE_extent:
|
||||
|
@@ -478,7 +478,8 @@ static int check_extents(struct bch_fs *c)
 bch_verbose(c, "checking extents");

 iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
-POS(BCACHEFS_ROOT_INO, 0), 0);
+POS(BCACHEFS_ROOT_INO, 0),
+BTREE_ITER_INTENT);
 retry:
 for_each_btree_key_continue(iter, 0, k, ret) {
 if (bkey_cmp(prev.p, bkey_start_pos(k.k)) > 0) {
@ -27,30 +27,78 @@
|
||||
|
||||
/* iterate over keys read from the journal: */
|
||||
|
||||
struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
|
||||
static struct journal_key *journal_key_search(struct journal_keys *journal_keys,
|
||||
enum btree_id id, unsigned level,
|
||||
struct bpos pos)
|
||||
{
|
||||
while (iter->k) {
|
||||
if (iter->k->btree_id == iter->btree_id)
|
||||
return bkey_i_to_s_c(iter->k->k);
|
||||
size_t l = 0, r = journal_keys->nr, m;
|
||||
|
||||
iter->k++;
|
||||
if (iter->k == iter->keys->d + iter->keys->nr)
|
||||
iter->k = NULL;
|
||||
while (l < r) {
|
||||
m = l + ((r - l) >> 1);
|
||||
if ((cmp_int(id, journal_keys->d[m].btree_id) ?:
|
||||
cmp_int(level, journal_keys->d[m].level) ?:
|
||||
bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0)
|
||||
l = m + 1;
|
||||
else
|
||||
r = m;
|
||||
}
|
||||
|
||||
return bkey_s_c_null;
|
||||
BUG_ON(l < journal_keys->nr &&
|
||||
(cmp_int(id, journal_keys->d[l].btree_id) ?:
|
||||
cmp_int(level, journal_keys->d[l].level) ?:
|
||||
bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0);
|
||||
|
||||
BUG_ON(l &&
|
||||
(cmp_int(id, journal_keys->d[l - 1].btree_id) ?:
|
||||
cmp_int(level, journal_keys->d[l - 1].level) ?:
|
||||
bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0);
|
||||
|
||||
return l < journal_keys->nr ? journal_keys->d + l : NULL;
|
||||
}
|
||||
|
||||
struct bkey_s_c bch2_journal_iter_next(struct journal_iter *iter)
|
||||
static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
|
||||
{
|
||||
if (!iter->k)
|
||||
return bkey_s_c_null;
|
||||
if (iter->k &&
|
||||
iter->k < iter->keys->d + iter->keys->nr &&
|
||||
iter->k->btree_id == iter->btree_id &&
|
||||
iter->k->level == iter->level)
|
||||
return iter->k->k;
|
||||
|
||||
iter->k++;
|
||||
if (iter->k == iter->keys->d + iter->keys->nr)
|
||||
iter->k = NULL;
|
||||
iter->k = NULL;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return bch2_journal_iter_peek(iter);
|
||||
static void bch2_journal_iter_advance(struct journal_iter *iter)
|
||||
{
|
||||
if (iter->k)
|
||||
iter->k++;
|
||||
}
|
||||
|
||||
static void bch2_journal_iter_init(struct journal_iter *iter,
|
||||
struct journal_keys *journal_keys,
|
||||
enum btree_id id, unsigned level,
|
||||
struct bpos pos)
|
||||
{
|
||||
iter->btree_id = id;
|
||||
iter->level = level;
|
||||
iter->keys = journal_keys;
|
||||
iter->k = journal_key_search(journal_keys, id, level, pos);
|
||||
}
|
||||
|
||||
static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
|
||||
{
|
||||
return iter->btree
|
||||
? bch2_btree_iter_peek(iter->btree)
|
||||
: bch2_btree_node_iter_peek_unpack(&iter->node_iter,
|
||||
iter->b, &iter->unpacked);
|
||||
}
|
||||
|
||||
static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
|
||||
{
|
||||
if (iter->btree)
|
||||
bch2_btree_iter_next(iter->btree);
|
||||
else
|
||||
bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
|
||||
}
|
||||
|
||||
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
|
||||
@ -59,10 +107,10 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
|
||||
case none:
|
||||
break;
|
||||
case btree:
|
||||
bch2_btree_iter_next(iter->btree);
|
||||
bch2_journal_iter_advance_btree(iter);
|
||||
break;
|
||||
case journal:
|
||||
bch2_journal_iter_next(&iter->journal);
|
||||
bch2_journal_iter_advance(&iter->journal);
|
||||
break;
|
||||
}
|
||||
|
||||
@ -74,14 +122,16 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
|
||||
struct bkey_s_c ret;
|
||||
|
||||
while (1) {
|
||||
struct bkey_s_c btree_k = bch2_btree_iter_peek(iter->btree);
|
||||
struct bkey_s_c journal_k = bch2_journal_iter_peek(&iter->journal);
|
||||
struct bkey_s_c btree_k =
|
||||
bch2_journal_iter_peek_btree(iter);
|
||||
struct bkey_s_c journal_k =
|
||||
bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal));
|
||||
|
||||
if (btree_k.k && journal_k.k) {
|
||||
int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p);
|
||||
|
||||
if (!cmp)
|
||||
bch2_btree_iter_next(iter->btree);
|
||||
bch2_journal_iter_advance_btree(iter);
|
||||
|
||||
iter->last = cmp < 0 ? btree : journal;
|
||||
} else if (btree_k.k) {
|
||||
@ -94,6 +144,14 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
|
||||
}
|
||||
|
||||
ret = iter->last == journal ? journal_k : btree_k;
|
||||
|
||||
if (iter->b &&
|
||||
bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) {
|
||||
iter->journal.k = NULL;
|
||||
iter->last = none;
|
||||
return bkey_s_c_null;
|
||||
}
|
||||
|
||||
if (!bkey_deleted(ret.k))
|
||||
break;
|
||||
|
||||
@ -110,41 +168,32 @@ struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *
|
||||
return bch2_btree_and_journal_iter_peek(iter);
|
||||
}
|
||||
|
||||
struct journal_key *journal_key_search(struct journal_keys *journal_keys,
|
||||
enum btree_id id, struct bpos pos)
|
||||
{
|
||||
size_t l = 0, r = journal_keys->nr, m;
|
||||
|
||||
while (l < r) {
|
||||
m = l + ((r - l) >> 1);
|
||||
if ((cmp_int(id, journal_keys->d[m].btree_id) ?:
|
||||
bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0)
|
||||
l = m + 1;
|
||||
else
|
||||
r = m;
|
||||
}
|
||||
|
||||
BUG_ON(l < journal_keys->nr &&
|
||||
(cmp_int(id, journal_keys->d[l].btree_id) ?:
|
||||
bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0);
|
||||
|
||||
BUG_ON(l &&
|
||||
(cmp_int(id, journal_keys->d[l - 1].btree_id) ?:
|
||||
bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0);
|
||||
|
||||
return l < journal_keys->nr ? journal_keys->d + l : NULL;
|
||||
}
|
||||
|
||||
void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter,
|
||||
struct btree_trans *trans,
|
||||
struct journal_keys *journal_keys,
|
||||
enum btree_id id, struct bpos pos)
|
||||
{
|
||||
iter->journal.keys = journal_keys;
|
||||
iter->journal.k = journal_key_search(journal_keys, id, pos);
|
||||
iter->journal.btree_id = id;
|
||||
memset(iter, 0, sizeof(*iter));
|
||||
|
||||
iter->btree = bch2_trans_get_iter(trans, id, pos, 0);
|
||||
bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos);
|
||||
}
|
||||
|
||||
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
|
||||
struct journal_keys *journal_keys,
|
||||
struct btree *b)
|
||||
{
|
||||
struct bpos start = b->data->min_key;
|
||||
|
||||
if (btree_node_type_is_extents(b->btree_id))
|
||||
start = bkey_successor(start);
|
||||
|
||||
memset(iter, 0, sizeof(*iter));
|
||||
|
||||
iter->b = b;
|
||||
bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b);
|
||||
bch2_journal_iter_init(&iter->journal, journal_keys,
|
||||
b->btree_id, b->level, start);
|
||||
}
|
||||
|
||||
/* sort and dedup all keys in the journal: */
|
||||
@@ -169,7 +218,8 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
 const struct journal_key *l = _l;
 const struct journal_key *r = _r;

-return cmp_int(l->btree_id, r->btree_id) ?:
+return cmp_int(l->btree_id, r->btree_id) ?:
+cmp_int(l->level, r->level) ?:
 bkey_cmp(l->k->k.p, r->k->k.p) ?:
 cmp_int(l->journal_seq, r->journal_seq) ?:
 cmp_int(l->journal_offset, r->journal_offset);
@@ -180,9 +230,10 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
 const struct journal_key *l = _l;
 const struct journal_key *r = _r;

-return cmp_int(l->journal_seq, r->journal_seq) ?:
-cmp_int(l->btree_id, r->btree_id) ?:
-bkey_cmp(l->k->k.p, r->k->k.p);
+return cmp_int(r->level, l->level) ?:
+cmp_int(l->journal_seq, r->journal_seq) ?:
+cmp_int(l->btree_id, r->btree_id) ?:
+bkey_cmp(l->k->k.p, r->k->k.p);
 }

 static void journal_keys_free(struct journal_keys *keys)
@@ -218,6 +269,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
 for_each_jset_key(k, _n, entry, &p->j)
 keys.d[keys.nr++] = (struct journal_key) {
 .btree_id = entry->btree_id,
+.level = entry->level,
 .k = k,
 .journal_seq = le64_to_cpu(p->j.seq) -
 keys.journal_seq_base,
@@ -229,7 +281,8 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
 src = dst = keys.d;
 while (src < keys.d + keys.nr) {
 while (src + 1 < keys.d + keys.nr &&
-src[0].btree_id == src[1].btree_id &&
+src[0].btree_id == src[1].btree_id &&
+src[0].level == src[1].level &&
 !bkey_cmp(src[0].k->k.p, src[1].k->k.p))
 src++;

@ -351,12 +404,15 @@ err:
|
||||
}
|
||||
|
||||
static int __bch2_journal_replay_key(struct btree_trans *trans,
|
||||
enum btree_id id, struct bkey_i *k)
|
||||
enum btree_id id, unsigned level,
|
||||
struct bkey_i *k)
|
||||
{
|
||||
struct btree_iter *iter;
|
||||
int ret;
|
||||
|
||||
iter = bch2_trans_get_iter(trans, id, k->k.p, BTREE_ITER_INTENT);
|
||||
iter = bch2_trans_get_node_iter(trans, id, k->k.p,
|
||||
BTREE_MAX_DEPTH, level,
|
||||
BTREE_ITER_INTENT);
|
||||
if (IS_ERR(iter))
|
||||
return PTR_ERR(iter);
|
||||
|
||||
@ -375,13 +431,13 @@ static int __bch2_journal_replay_key(struct btree_trans *trans,
|
||||
}
|
||||
|
||||
static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id,
|
||||
struct bkey_i *k)
|
||||
unsigned level, struct bkey_i *k)
|
||||
{
|
||||
return bch2_trans_do(c, NULL, NULL,
|
||||
BTREE_INSERT_NOFAIL|
|
||||
BTREE_INSERT_LAZY_RW|
|
||||
BTREE_INSERT_JOURNAL_REPLAY,
|
||||
__bch2_journal_replay_key(&trans, id, k));
|
||||
__bch2_journal_replay_key(&trans, id, level, k));
|
||||
}
|
||||
|
||||
static int bch2_journal_replay(struct bch_fs *c,
|
||||
@ -393,15 +449,20 @@ static int bch2_journal_replay(struct bch_fs *c,
|
||||
|
||||
sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL);
|
||||
|
||||
for_each_journal_key(keys, i) {
|
||||
replay_now_at(j, keys.journal_seq_base + i->journal_seq);
|
||||
replay_now_at(j, keys.journal_seq_base);
|
||||
|
||||
for_each_journal_key(keys, i) {
|
||||
if (!i->level)
|
||||
replay_now_at(j, keys.journal_seq_base + i->journal_seq);
|
||||
|
||||
if (i->level)
|
||||
ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
|
||||
if (i->btree_id == BTREE_ID_ALLOC)
|
||||
ret = bch2_alloc_replay_key(c, i->k);
|
||||
else if (i->k->k.size)
|
||||
ret = bch2_extent_replay_key(c, i->btree_id, i->k);
|
||||
else
|
||||
ret = bch2_journal_replay_key(c, i->btree_id, i->k);
|
||||
ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
|
||||
|
||||
if (ret) {
|
||||
bch_err(c, "journal replay: error %d while replaying key",
|
||||
@ -864,7 +925,7 @@ int bch2_fs_recovery(struct bch_fs *c)
|
||||
*/
|
||||
bch_info(c, "starting metadata mark and sweep");
|
||||
err = "error in mark and sweep";
|
||||
ret = bch2_gc(c, NULL, true, true);
|
||||
ret = bch2_gc(c, &journal_keys, true, true);
|
||||
if (ret)
|
||||
goto err;
|
||||
bch_verbose(c, "mark and sweep done");
|
||||
|
@@ -5,6 +5,7 @@
 struct journal_keys {
 struct journal_key {
 enum btree_id btree_id:8;
+unsigned level:8;
 struct bkey_i *k;
 u32 journal_seq;
 u32 journal_offset;
@ -17,15 +18,23 @@ struct journal_keys {
|
||||
for (i = (keys).d; i < (keys).d + (keys).nr; (i)++)
|
||||
|
||||
struct journal_iter {
|
||||
enum btree_id btree_id;
|
||||
unsigned level;
|
||||
struct journal_keys *keys;
|
||||
struct journal_key *k;
|
||||
enum btree_id btree_id;
|
||||
};
|
||||
|
||||
struct btree_and_journal_iter {
|
||||
enum btree_id btree_id;
|
||||
/*
|
||||
* Iterate over keys in the btree, with keys from the journal overlaid on top:
|
||||
*/
|
||||
|
||||
struct btree_and_journal_iter {
|
||||
struct btree_iter *btree;
|
||||
|
||||
struct btree *b;
|
||||
struct btree_node_iter node_iter;
|
||||
struct bkey unpacked;
|
||||
|
||||
struct journal_iter journal;
|
||||
|
||||
enum last_key_returned {
|
||||
@ -38,12 +47,14 @@ struct btree_and_journal_iter {
|
||||
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
|
||||
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
|
||||
struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *);
|
||||
struct journal_key *journal_key_search(struct journal_keys *,
|
||||
enum btree_id, struct bpos);
|
||||
|
||||
void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *,
|
||||
struct btree_trans *,
|
||||
struct journal_keys *,
|
||||
enum btree_id, struct bpos);
|
||||
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
|
||||
struct journal_keys *,
|
||||
struct btree *);
|
||||
|
||||
int bch2_fs_recovery(struct bch_fs *);
|
||||
int bch2_fs_initialize(struct bch_fs *);
|
||||
|
@@ -958,6 +958,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
 c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA);
 c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
 c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;
+c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled;
 ret = bch2_write_super(c);
 mutex_unlock(&c->sb_lock);

@@ -1089,6 +1090,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
 c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
 c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA;
 c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates);
+c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled);

 u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;