Update bcachefs sources to fd637ebda0 bcachefs: Journal updates to interior nodes

Kent Overstreet 2020-03-25 15:56:38 -04:00
parent 096f2ec00e
commit 8bcd38555c
21 changed files with 491 additions and 446 deletions

View File

@ -1 +1 @@
3592e42edfaed6a66470fb6a456a5895243ef2f4
fd637ebda030609b15a473f01f1ef54bbe818f27

View File

@ -1312,7 +1312,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
x(new_extent_overwrite, 9) \
x(incompressible, 10) \
x(btree_ptr_v2, 11) \
x(extents_above_btree_updates, 12)
x(extents_above_btree_updates, 12) \
x(btree_updates_journalled, 13)
#define BCH_SB_FEATURES_ALL \
((1ULL << BCH_FEATURE_new_siphash)| \
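The feature list above is an x-macro: each x(name, nr) entry is expanded once into an enum of feature bits and again wherever a name table or feature mask is needed. A minimal, self-contained sketch of that pattern (macro and identifier names below are illustrative, not the real bcachefs ones):

#include <stdio.h>

/* Illustrative x-macro list, demo names only */
#define DEMO_FEATURES()				\
	x(new_siphash,			0)	\
	x(btree_ptr_v2,			11)	\
	x(btree_updates_journalled,	13)

enum demo_feature {
#define x(name, nr)	DEMO_FEATURE_##name = nr,
	DEMO_FEATURES()
#undef x
};

static const char * const demo_feature_names[] = {
#define x(name, nr)	[nr] = #name,
	DEMO_FEATURES()
#undef x
};

int main(void)
{
	printf("bit %d: %s\n", DEMO_FEATURE_btree_updates_journalled,
	       demo_feature_names[DEMO_FEATURE_btree_updates_journalled]);
	return 0;
}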

View File

@ -400,9 +400,8 @@ static inline int bch2_compile_bkey_format(const struct bkey_format *format,
static inline void bkey_reassemble(struct bkey_i *dst,
struct bkey_s_c src)
{
BUG_ON(bkey_packed(src.k));
dst->k = *src.k;
memcpy_u64s(&dst->v, src.v, bkey_val_u64s(src.k));
memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k));
}
#define bkey_s_null ((struct bkey_s) { .k = NULL })
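bkey_reassemble copies the key header and then the value in 64-bit words; memcpy_u64s_small is presumably the variant meant for short, known-small copies. A sketch of what such a helper typically looks like, assuming it is just a word-at-a-time loop (name and exact semantics not taken from the source):

#include <stdint.h>
#include <stddef.h>

/* Assumed shape of a small-copy helper: copy @u64s 64-bit words with a plain
 * loop, which beats a general memcpy() for the tiny sizes bkey values have. */
static inline void demo_memcpy_u64s_small(void *dst, const void *src,
					  size_t u64s)
{
	uint64_t *d = dst;
	const uint64_t *s = src;

	while (u64s--)
		*d++ = *s++;
}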

View File

@ -134,7 +134,7 @@ const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
{
if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
if (bkey_cmp(k.k->p, b->data->min_key) < 0)
return "key before start of btree node";
if (bkey_cmp(k.k->p, b->data->max_key) > 0)

View File

@ -130,44 +130,21 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
return nr;
}
static void extent_sort_advance_prev(struct bkey_format *f,
struct btree_nr_keys *nr,
struct bkey_packed *start,
struct bkey_packed **prev)
{
if (*prev) {
bch2_bkey_pack(*prev, (void *) *prev, f);
btree_keys_account_key_add(nr, 0, *prev);
*prev = bkey_next(*prev);
} else {
*prev = start;
}
}
static void extent_sort_append(struct bch_fs *c,
struct bkey_format *f,
struct btree_nr_keys *nr,
struct bkey_packed *start,
struct bkey_packed **prev,
struct bkey_packed **out,
struct bkey_s k)
{
if (bkey_whiteout(k.k))
return;
if (!bkey_whiteout(k.k)) {
if (!bch2_bkey_pack_key(*out, k.k, f))
memcpy_u64s_small(*out, k.k, BKEY_U64s);
/*
* prev is always unpacked, for key merging - until right before we
* advance it:
*/
memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k));
if (*prev &&
bch2_bkey_merge(c, bkey_i_to_s((void *) *prev), k) ==
BCH_MERGE_MERGE)
return;
extent_sort_advance_prev(f, nr, start, prev);
bkey_reassemble((void *) *prev, k.s_c);
btree_keys_account_key_add(nr, 0, *out);
*out = bkey_next(*out);
}
}
/* Sort + repack in a new format: */
@ -201,7 +178,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src,
return nr;
}
/* Sort, repack, and merge: */
/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */
struct btree_nr_keys
bch2_sort_repack_merge(struct bch_fs *c,
struct bset *dst, struct btree *src,
@ -209,7 +186,7 @@ bch2_sort_repack_merge(struct bch_fs *c,
struct bkey_format *out_f,
bool filter_whiteouts)
{
struct bkey_packed *prev = NULL, *k_packed;
struct bkey_packed *out = vstruct_last(dst), *k_packed;
struct bkey_on_stack k;
struct btree_nr_keys nr;
@ -234,13 +211,10 @@ bch2_sort_repack_merge(struct bch_fs *c,
bch2_bkey_normalize(c, bkey_i_to_s(k.k)))
continue;
extent_sort_append(c, out_f, &nr, vstruct_last(dst),
&prev, bkey_i_to_s(k.k));
extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k));
}
extent_sort_advance_prev(out_f, &nr, vstruct_last(dst), &prev);
dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
bkey_on_stack_exit(&k, c);
return nr;
}
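The rewritten path above no longer buffers a prev key for merging: whiteouts are dropped and every remaining key is appended straight to the output cursor, which is then used to compute dst->u64s. A toy, self-contained model of that append step (simplified key type, no packing):

#include <stdbool.h>
#include <stddef.h>

/* Toy key: just a position plus a whiteout (deletion marker) flag. */
struct demo_key {
	long	pos;
	bool	whiteout;
};

/* Append every non-whiteout key to @out, advancing the cursor, and return
 * how many keys were kept - loosely mirroring extent_sort_append() plus the
 * final dst->u64s computation. */
static size_t demo_sort_append(const struct demo_key *in, size_t nr,
			       struct demo_key *out)
{
	size_t kept = 0;
	size_t i;

	for (i = 0; i < nr; i++) {
		if (in[i].whiteout)	/* whiteouts are dropped, not copied */
			continue;
		out[kept++] = in[i];	/* append and advance the cursor */
	}
	return kept;
}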
@ -337,7 +311,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
struct btree *b = iter->b;
struct bkey_format *f = &b->format;
struct sort_iter_set *_l = iter->data, *_r = iter->data + 1;
struct bkey_packed *prev = NULL;
struct bkey_packed *out = dst->start;
struct bkey l_unpacked, r_unpacked;
struct bkey_s l, r;
struct btree_nr_keys nr;
@ -360,7 +334,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
l = __bkey_disassemble(b, _l->k, &l_unpacked);
if (iter->used == 1) {
extent_sort_append(c, f, &nr, dst->start, &prev, l);
extent_sort_append(c, f, &nr, &out, l);
extent_iter_advance(iter, 0);
continue;
}
@ -369,7 +343,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
/* If current key and next key don't overlap, just append */
if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
extent_sort_append(c, f, &nr, dst->start, &prev, l);
extent_sort_append(c, f, &nr, &out, l);
extent_iter_advance(iter, 0);
continue;
}
@ -414,17 +388,15 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
__sort_iter_sift(iter, 0,
extent_sort_fix_overlapping_cmp);
extent_sort_append(c, f, &nr, dst->start,
&prev, bkey_i_to_s(split.k));
extent_sort_append(c, f, &nr, &out,
bkey_i_to_s(split.k));
} else {
bch2_cut_back_s(bkey_start_pos(r.k), l);
extent_save(b, _l->k, l.k);
}
}
extent_sort_advance_prev(f, &nr, dst->start, &prev);
dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
bkey_on_stack_exit(&split, c);
return nr;

View File

@ -588,6 +588,7 @@ err:
static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
struct btree_iter *iter,
const struct bkey_i *k,
enum btree_id btree_id,
unsigned level,
enum six_lock_type lock_type,
bool sync)
@ -600,7 +601,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
* Parent node must be locked, else we could read in a btree node that's
* been freed:
*/
if (!bch2_btree_node_relock(iter, level + 1))
if (iter && !bch2_btree_node_relock(iter, level + 1))
return ERR_PTR(-EINTR);
b = bch2_btree_node_mem_alloc(c);
@ -608,7 +609,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
return b;
bkey_copy(&b->key, k);
if (bch2_btree_node_hash_insert(bc, b, level, iter->btree_id)) {
if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
/* raced with another fill: */
/* mark as unhashed... */
@ -628,7 +629,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
*
* XXX: ideally should be dropping all btree node locks here
*/
if (btree_node_read_locked(iter, level + 1))
if (iter && btree_node_read_locked(iter, level + 1))
btree_node_unlock(iter, level + 1);
bch2_btree_node_read(c, b, sync);
@ -676,7 +677,8 @@ retry:
* else we could read in a btree node from disk that's been
* freed:
*/
b = bch2_btree_node_fill(c, iter, k, level, lock_type, true);
b = bch2_btree_node_fill(c, iter, k, iter->btree_id,
level, lock_type, true);
/* We raced and found the btree node in the cache */
if (!b)
@ -762,6 +764,74 @@ lock_node:
return b;
}
struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
const struct bkey_i *k,
enum btree_id btree_id,
unsigned level)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
struct bset_tree *t;
EBUG_ON(level >= BTREE_MAX_DEPTH);
b = btree_node_mem_ptr(k);
if (b)
goto lock_node;
retry:
b = btree_cache_find(bc, k);
if (unlikely(!b)) {
b = bch2_btree_node_fill(c, NULL, k, btree_id,
level, SIX_LOCK_read, true);
/* We raced and found the btree node in the cache */
if (!b)
goto retry;
if (IS_ERR(b))
return b;
} else {
lock_node:
six_lock_read(&b->lock);
if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
b->btree_id != btree_id ||
b->level != level)) {
six_unlock_read(&b->lock);
goto retry;
}
}
/* XXX: waiting on IO with btree locks held: */
wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
TASK_UNINTERRUPTIBLE);
prefetch(b->aux_data);
for_each_bset(b, t) {
void *p = (u64 *) b->aux_data + t->aux_data_offset;
prefetch(p + L1_CACHE_BYTES * 0);
prefetch(p + L1_CACHE_BYTES * 1);
prefetch(p + L1_CACHE_BYTES * 2);
}
/* avoid atomic set bit if it's not needed: */
if (!btree_node_accessed(b))
set_btree_node_accessed(b);
if (unlikely(btree_node_read_error(b))) {
six_unlock_read(&b->lock);
return ERR_PTR(-EIO);
}
EBUG_ON(b->btree_id != btree_id ||
BTREE_NODE_LEVEL(b->data) != level ||
bkey_cmp(b->data->max_key, k->k.p));
return b;
}
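bch2_btree_node_get_noiter above uses the standard unlocked-lookup pattern: find the node, take the lock, then re-check the identity fields (hash, btree id, level) under the lock and retry if the node was freed and reused in the meantime. A simplified standalone sketch of that loop, with a pthread rwlock standing in for the six lock and all names hypothetical:

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

struct demo_node {
	pthread_rwlock_t	lock;
	uint64_t		hash_val;	/* identity, may change while unlocked */
	unsigned		level;
	bool			read_error;
};

/* Toy single-slot cache - just enough to show the lookup/lock/revalidate loop. */
static struct demo_node demo_cache = {
	.lock		= PTHREAD_RWLOCK_INITIALIZER,
	.hash_val	= 42,
	.level		= 1,
};

static struct demo_node *demo_cache_find(uint64_t hash)
{
	return demo_cache.hash_val == hash ? &demo_cache : NULL;
}

static struct demo_node *demo_node_get(uint64_t hash, unsigned level)
{
	struct demo_node *n;
retry:
	n = demo_cache_find(hash);
	if (!n)
		return NULL;

	pthread_rwlock_rdlock(&n->lock);

	/* The node may have been freed and reused for something else between
	 * the unlocked lookup and taking the lock: re-check, retry on mismatch. */
	if (n->hash_val != hash || n->level != level) {
		pthread_rwlock_unlock(&n->lock);
		goto retry;
	}

	if (n->read_error) {
		pthread_rwlock_unlock(&n->lock);
		return NULL;
	}

	return n;	/* returned with the read lock held */
}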
struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
struct btree_iter *iter,
struct btree *b,
@ -876,7 +946,8 @@ void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
if (b)
return;
bch2_btree_node_fill(c, iter, k, level, SIX_LOCK_read, false);
bch2_btree_node_fill(c, iter, k, iter->btree_id,
level, SIX_LOCK_read, false);
}
void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,

View File

@ -25,6 +25,9 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
const struct bkey_i *, unsigned,
enum six_lock_type);
struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
enum btree_id, unsigned);
struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
struct btree *, enum btree_node_sibling);

View File

@ -184,16 +184,8 @@ fsck_err:
return ret;
}
static bool pos_in_journal_keys(struct journal_keys *journal_keys,
enum btree_id id, struct bpos pos)
{
struct journal_key *k = journal_key_search(journal_keys, id, pos);
return k && k->btree_id == id && !bkey_cmp(k->k->k.p, pos);
}
static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
struct journal_keys *journal_keys, bool initial)
bool initial)
{
struct btree_node_iter iter;
struct bkey unpacked;
@ -207,10 +199,6 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
for_each_btree_node_key_unpack(b, k, &iter,
&unpacked) {
if (!b->level && journal_keys &&
pos_in_journal_keys(journal_keys, b->btree_id, k.k->p))
continue;
bch2_bkey_debugcheck(c, b, k);
ret = bch2_gc_mark_key(c, k, max_stale, initial);
@ -222,7 +210,6 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
}
static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
struct journal_keys *journal_keys,
bool initial, bool metadata_only)
{
struct btree_trans trans;
@ -250,8 +237,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
gc_pos_set(c, gc_pos_btree_node(b));
ret = btree_gc_mark_node(c, b, &max_stale,
journal_keys, initial);
ret = btree_gc_mark_node(c, b, &max_stale, initial);
if (ret)
break;
@ -287,6 +273,78 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
return ret;
}
static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
struct journal_keys *journal_keys,
unsigned target_depth)
{
struct btree_and_journal_iter iter;
struct bkey_s_c k;
u8 max_stale = 0;
int ret = 0;
bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b);
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
bch2_bkey_debugcheck(c, b, k);
ret = bch2_gc_mark_key(c, k, &max_stale, true);
if (ret)
break;
if (b->level > target_depth) {
struct btree *child;
BKEY_PADDED(k) tmp;
bkey_reassemble(&tmp.k, k);
child = bch2_btree_node_get_noiter(c, &tmp.k,
b->btree_id, b->level - 1);
ret = PTR_ERR_OR_ZERO(child);
if (ret)
break;
bch2_gc_btree_init_recurse(c, child,
journal_keys, target_depth);
six_unlock_read(&child->lock);
}
bch2_btree_and_journal_iter_advance(&iter);
}
return ret;
}
static int bch2_gc_btree_init(struct bch_fs *c,
struct journal_keys *journal_keys,
enum btree_id btree_id,
bool metadata_only)
{
struct btree *b;
unsigned target_depth = metadata_only ? 1
: expensive_debug_checks(c) ? 0
: !btree_node_type_needs_gc(btree_id) ? 1
: 0;
u8 max_stale = 0;
int ret = 0;
b = c->btree_roots[btree_id].b;
if (btree_node_fake(b))
return 0;
six_lock_read(&b->lock);
if (b->level >= target_depth)
ret = bch2_gc_btree_init_recurse(c, b,
journal_keys, target_depth);
if (!ret)
ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
&max_stale, true);
six_unlock_read(&b->lock);
return ret;
}
static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
{
return (int) btree_id_to_gc_phase(l) -
@ -305,27 +363,12 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
for (i = 0; i < BTREE_ID_NR; i++) {
enum btree_id id = ids[i];
enum btree_node_type type = __btree_node_type(0, id);
int ret = bch2_gc_btree(c, id, journal_keys,
initial, metadata_only);
int ret = initial
? bch2_gc_btree_init(c, journal_keys,
id, metadata_only)
: bch2_gc_btree(c, id, initial, metadata_only);
if (ret)
return ret;
if (journal_keys && !metadata_only &&
btree_node_type_needs_gc(type)) {
struct journal_key *j;
u8 max_stale;
int ret;
for_each_journal_key(*journal_keys, j)
if (j->btree_id == id) {
ret = bch2_gc_mark_key(c, bkey_i_to_s_c(j->k),
&max_stale, initial);
if (ret)
return ret;
}
}
}
return 0;

View File

@ -1261,7 +1261,6 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
closure_put(&((struct btree_update *) new)->cl);
bch2_journal_pin_drop(&c->journal, &w->journal);
closure_wake_up(&w->wait);
}
static void btree_node_write_done(struct bch_fs *c, struct btree *b)
@ -1618,9 +1617,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
wbio->wbio.bio.bi_end_io = btree_node_write_endio;
wbio->wbio.bio.bi_private = b;
if (b->level || !b->written)
wbio->wbio.bio.bi_opf |= REQ_FUA;
bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
/*
@ -1794,12 +1790,11 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
rcu_read_lock();
for_each_cached_btree(b, c, tbl, i, pos) {
unsigned long flags = READ_ONCE(b->flags);
unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0;
if (!(flags & (1 << BTREE_NODE_dirty)))
continue;
pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu c %u p %u\n",
pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu\n",
b,
(flags & (1 << BTREE_NODE_dirty)) != 0,
(flags & (1 << BTREE_NODE_need_write)) != 0,
@ -1807,9 +1802,7 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
b->written,
!list_empty_careful(&b->write_blocked),
b->will_make_reachable != 0,
b->will_make_reachable & 1,
b->writes[ idx].wait.list.first != NULL,
b->writes[!idx].wait.list.first != NULL);
b->will_make_reachable & 1);
}
rcu_read_unlock();

View File

@ -102,19 +102,20 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
void bch2_btree_node_write(struct bch_fs *, struct btree *,
enum six_lock_type);
static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b)
static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
enum six_lock_type lock_held)
{
while (b->written &&
btree_node_need_write(b) &&
btree_node_may_write(b)) {
if (!btree_node_write_in_flight(b)) {
bch2_btree_node_write(c, b, SIX_LOCK_read);
bch2_btree_node_write(c, b, lock_held);
break;
}
six_unlock_read(&b->lock);
btree_node_wait_on_io(b);
btree_node_lock_type(c, b, SIX_LOCK_read);
btree_node_lock_type(c, b, lock_held);
}
}
@ -131,7 +132,7 @@ do { \
new |= (1 << BTREE_NODE_need_write); \
} while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \
\
btree_node_write_if_need(_c, _b); \
btree_node_write_if_need(_c, _b, SIX_LOCK_read); \
} while (0)
void bch2_btree_flush_all_reads(struct bch_fs *);

View File

@ -1068,7 +1068,14 @@ retry_all:
goto retry_all;
}
ret = hweight64(trans->iters_live) > 1 ? -EINTR : 0;
if (hweight64(trans->iters_live) > 1)
ret = -EINTR;
else
trans_for_each_iter(trans, iter)
if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) {
ret = -EINTR;
break;
}
out:
bch2_btree_cache_cannibalize_unlock(c);
return ret;

View File

@ -53,7 +53,6 @@ struct bset_tree {
struct btree_write {
struct journal_entry_pin journal;
struct closure_waitlist wait;
};
struct btree_alloc {
@ -261,6 +260,11 @@ static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter)
return iter->flags & BTREE_ITER_TYPE;
}
static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
{
return iter->l + iter->level;
}
struct btree_insert_entry {
unsigned trigger_flags;
unsigned trans_triggers_run:1;
@ -539,8 +543,6 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type)
struct btree_root {
struct btree *b;
struct btree_update *as;
/* On disk root - see async splits: */
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
u8 level;

View File

@ -12,6 +12,7 @@ void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *,
struct btree_iter *);
bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
struct btree_node_iter *, struct bkey_i *);
void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
enum btree_insert_flags {
__BTREE_INSERT_NOUNLOCK,

View File

@ -24,7 +24,6 @@
static void btree_node_will_make_reachable(struct btree_update *,
struct btree *);
static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int);
/* Debug code: */
@ -260,16 +259,17 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
}
static void bch2_btree_node_free_ondisk(struct bch_fs *c,
struct pending_btree_node_free *pending)
struct pending_btree_node_free *pending,
u64 journal_seq)
{
BUG_ON(!pending->index_update_done);
bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
0, 0, NULL, 0, BTREE_TRIGGER_OVERWRITE);
0, 0, NULL, journal_seq, BTREE_TRIGGER_OVERWRITE);
if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE)))
bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
0, 0, NULL, 0,
0, 0, NULL, journal_seq,
BTREE_TRIGGER_OVERWRITE|
BTREE_TRIGGER_GC);
}
@ -585,10 +585,13 @@ static void bch2_btree_update_free(struct btree_update *as)
{
struct bch_fs *c = as->c;
bch2_journal_preres_put(&c->journal, &as->journal_preres);
bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_pin_flush(&c->journal, &as->journal);
BUG_ON(as->nr_new_nodes);
BUG_ON(as->nr_pending);
BUG_ON((as->nr_new_nodes || as->nr_pending) &&
!bch2_journal_error(&c->journal));
if (as->reserve)
bch2_btree_reserve_put(c, as->reserve);
@ -603,13 +606,10 @@ static void bch2_btree_update_free(struct btree_update *as)
mutex_unlock(&c->btree_interior_update_lock);
}
static void btree_update_nodes_reachable(struct closure *cl)
static void btree_update_nodes_reachable(struct btree_update *as, u64 seq)
{
struct btree_update *as = container_of(cl, struct btree_update, cl);
struct bch_fs *c = as->c;
bch2_journal_pin_drop(&c->journal, &as->journal);
mutex_lock(&c->btree_interior_update_lock);
while (as->nr_new_nodes) {
@ -630,39 +630,22 @@ static void btree_update_nodes_reachable(struct closure *cl)
}
while (as->nr_pending)
bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]);
bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending],
seq);
mutex_unlock(&c->btree_interior_update_lock);
closure_wake_up(&as->wait);
bch2_btree_update_free(as);
}
static void btree_update_wait_on_journal(struct closure *cl)
{
struct btree_update *as = container_of(cl, struct btree_update, cl);
struct bch_fs *c = as->c;
int ret;
ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
if (ret == -EAGAIN) {
continue_at(cl, btree_update_wait_on_journal, system_wq);
return;
}
if (ret < 0)
goto err;
bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
err:
continue_at(cl, btree_update_nodes_reachable, system_wq);
}
static void btree_update_nodes_written(struct closure *cl)
{
struct btree_update *as = container_of(cl, struct btree_update, cl);
struct journal_res res = { 0 };
struct bch_fs *c = as->c;
struct btree *b;
struct bset *i;
struct bkey_i *k;
unsigned journal_u64s = 0;
int ret;
/*
* We did an update to a parent node where the pointers we added pointed
@ -671,7 +654,7 @@ static void btree_update_nodes_written(struct closure *cl)
*/
mutex_lock(&c->btree_interior_update_lock);
as->nodes_written = true;
retry:
again:
as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
struct btree_update, unwritten_list);
if (!as || !as->nodes_written) {
@ -679,31 +662,53 @@ retry:
return;
}
b = as->b;
if (b && !six_trylock_intent(&b->lock)) {
mutex_unlock(&c->btree_interior_update_lock);
btree_node_lock_type(c, b, SIX_LOCK_intent);
six_unlock_intent(&b->lock);
goto out;
}
journal_u64s = 0;
if (as->mode != BTREE_INTERIOR_UPDATING_ROOT)
for_each_keylist_key(&as->parent_keys, k)
journal_u64s += jset_u64s(k->k.u64s);
ret = bch2_journal_res_get(&c->journal, &res, journal_u64s,
JOURNAL_RES_GET_RESERVED);
if (ret) {
BUG_ON(!bch2_journal_error(&c->journal));
/* can't unblock btree writes */
goto free_update;
}
if (as->mode != BTREE_INTERIOR_UPDATING_ROOT)
for_each_keylist_key(&as->parent_keys, k)
bch2_journal_add_entry(&c->journal, &res,
BCH_JSET_ENTRY_btree_keys,
as->btree_id,
as->level,
k, k->k.u64s);
switch (as->mode) {
case BTREE_INTERIOR_NO_UPDATE:
BUG();
case BTREE_INTERIOR_UPDATING_NODE:
/* The usual case: */
b = READ_ONCE(as->b);
if (!six_trylock_read(&b->lock)) {
mutex_unlock(&c->btree_interior_update_lock);
btree_node_lock_type(c, b, SIX_LOCK_read);
six_unlock_read(&b->lock);
mutex_lock(&c->btree_interior_update_lock);
goto retry;
}
BUG_ON(!btree_node_dirty(b));
closure_wait(&btree_current_write(b)->wait, &as->cl);
/* @b is the node we did the final insert into: */
BUG_ON(!res.ref);
six_lock_write(&b->lock);
list_del(&as->write_blocked_list);
/*
* for flush_held_btree_writes() waiting on updates to flush or
* nodes to be writeable:
*/
closure_wake_up(&c->btree_interior_update_wait);
i = btree_bset_last(b);
i->journal_seq = cpu_to_le64(
max(res.seq,
le64_to_cpu(i->journal_seq)));
bch2_btree_add_journal_pin(c, b, res.seq);
six_unlock_write(&b->lock);
list_del(&as->unwritten_list);
mutex_unlock(&c->btree_interior_update_lock);
@ -712,82 +717,51 @@ retry:
* b->write_blocked prevented it from being written, so
* write it now if it needs to be written:
*/
bch2_btree_node_write_cond(c, b, true);
six_unlock_read(&b->lock);
continue_at(&as->cl, btree_update_nodes_reachable, system_wq);
btree_node_write_if_need(c, b, SIX_LOCK_intent);
six_unlock_intent(&b->lock);
break;
case BTREE_INTERIOR_UPDATING_AS:
/*
* The btree node we originally updated has been freed and is
* being rewritten - so we don't need to write anything here, we just
* need to signal to that btree_update that it's ok to make the
* new replacement node visible:
*/
closure_put(&as->parent_as->cl);
/*
* and then we have to wait on that btree_update to finish:
*/
closure_wait(&as->parent_as->wait, &as->cl);
BUG_ON(b);
list_del(&as->unwritten_list);
mutex_unlock(&c->btree_interior_update_lock);
continue_at(&as->cl, btree_update_nodes_reachable, system_wq);
break;
case BTREE_INTERIOR_UPDATING_ROOT:
/* b is the new btree root: */
b = READ_ONCE(as->b);
case BTREE_INTERIOR_UPDATING_ROOT: {
struct btree_root *r = &c->btree_roots[as->btree_id];
if (!six_trylock_read(&b->lock)) {
mutex_unlock(&c->btree_interior_update_lock);
btree_node_lock_type(c, b, SIX_LOCK_read);
six_unlock_read(&b->lock);
mutex_lock(&c->btree_interior_update_lock);
goto retry;
}
BUG_ON(b);
BUG_ON(c->btree_roots[b->btree_id].as != as);
c->btree_roots[b->btree_id].as = NULL;
bch2_btree_set_root_ondisk(c, b, WRITE);
/*
* We don't have to wait on anything here (before
* btree_update_nodes_reachable frees the old nodes
* ondisk) - we've ensured that the very next journal write will
* have the pointer to the new root, and before the allocator
* can reuse the old nodes it'll have to do a journal commit:
*/
six_unlock_read(&b->lock);
mutex_lock(&c->btree_root_lock);
bkey_copy(&r->key, as->parent_keys.keys);
r->level = as->level;
r->alive = true;
c->btree_roots_dirty = true;
mutex_unlock(&c->btree_root_lock);
list_del(&as->unwritten_list);
mutex_unlock(&c->btree_interior_update_lock);
/*
* Bit of funny circularity going on here we have to break:
*
* We have to drop our journal pin before writing the journal
* entry that points to the new btree root: else, we could
* deadlock if the journal currently happens to be full.
*
* This means we're dropping the journal pin _before_ the new
* nodes are technically reachable - but this is safe, because
* after the bch2_btree_set_root_ondisk() call above they will
* be reachable as of the very next journal write:
*/
bch2_journal_pin_drop(&c->journal, &as->journal);
as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal);
btree_update_wait_on_journal(&as->cl);
break;
}
}
bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_res_put(&c->journal, &res);
bch2_journal_preres_put(&c->journal, &as->journal_preres);
btree_update_nodes_reachable(as, res.seq);
free_update:
bch2_btree_update_free(as);
/*
* for flush_held_btree_writes() waiting on updates to flush or
* nodes to be writeable:
*/
closure_wake_up(&c->btree_interior_update_wait);
out:
mutex_lock(&c->btree_interior_update_lock);
goto retry;
goto again;
}
/*
@ -804,48 +778,12 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
BUG_ON(!btree_node_dirty(b));
as->mode = BTREE_INTERIOR_UPDATING_NODE;
as->b = b;
as->mode = BTREE_INTERIOR_UPDATING_NODE;
as->b = b;
as->level = b->level;
list_add(&as->write_blocked_list, &b->write_blocked);
mutex_unlock(&c->btree_interior_update_lock);
/*
* In general, when you're staging things in a journal that will later
* be written elsewhere, and you also want to guarantee ordering: that
* is, if you have updates a, b, c, after a crash you should never see c
* and not a or b - there's a problem:
*
* If the final destination of the update(s) (i.e. btree node) can be
* written/flushed _before_ the relevant journal entry - oops, that
* breaks ordering, since the various leaf nodes can be written in any
* order.
*
* Normally we use bset->journal_seq to deal with this - if during
* recovery we find a btree node write that's newer than the newest
* journal entry, we just ignore it - we don't need it, anything we're
* supposed to have (that we reported as completed via fsync()) will
* still be in the journal, and as far as the state of the journal is
* concerned that btree node write never happened.
*
* That breaks when we're rewriting/splitting/merging nodes, since we're
* mixing btree node writes that haven't happened yet with previously
* written data that has been reported as completed to the journal.
*
* Thus, before making the new nodes reachable, we have to wait the
* newest journal sequence number we have data for to be written (if it
* hasn't been yet).
*/
bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
}
static void interior_update_flush(struct journal *j,
struct journal_entry_pin *pin, u64 seq)
{
struct btree_update *as =
container_of(pin, struct btree_update, journal);
bch2_journal_flush_seq_async(j, as->journal_seq, NULL);
}
static void btree_update_reparent(struct btree_update *as,
@ -853,10 +791,10 @@ static void btree_update_reparent(struct btree_update *as,
{
struct bch_fs *c = as->c;
lockdep_assert_held(&c->btree_interior_update_lock);
child->b = NULL;
child->mode = BTREE_INTERIOR_UPDATING_AS;
child->parent_as = as;
closure_get(&as->cl);
/*
* When we write a new btree root, we have to drop our journal pin
@ -867,46 +805,24 @@ static void btree_update_reparent(struct btree_update *as,
* just transfer the journal pin to the new interior update so
* btree_update_nodes_written() can drop it.
*/
bch2_journal_pin_copy(&c->journal, &as->journal,
&child->journal, interior_update_flush);
bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
bch2_journal_pin_drop(&c->journal, &child->journal);
as->journal_seq = max(as->journal_seq, child->journal_seq);
}
static void btree_update_updated_root(struct btree_update *as)
static void btree_update_updated_root(struct btree_update *as, struct btree *b)
{
struct bch_fs *c = as->c;
struct btree_root *r = &c->btree_roots[as->btree_id];
BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
BUG_ON(!bch2_keylist_empty(&as->parent_keys));
mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
/*
* Old root might not be persistent yet - if so, redirect its
* btree_update operation to point to us:
*/
if (r->as)
btree_update_reparent(as, r->as);
as->mode = BTREE_INTERIOR_UPDATING_ROOT;
as->b = r->b;
r->as = as;
as->mode = BTREE_INTERIOR_UPDATING_ROOT;
as->level = b->level;
bch2_keylist_add(&as->parent_keys, &b->key);
mutex_unlock(&c->btree_interior_update_lock);
/*
* When we're rewriting nodes and updating interior nodes, there's an
* issue with updates that haven't been written in the journal getting
* mixed together with older data - see btree_update_updated_node()
* for the explanation.
*
* However, this doesn't affect us when we're writing a new btree root -
* because to make that new root reachable we have to write out a new
* journal entry, which must necessarily be newer than as->journal_seq.
*/
}
static void btree_node_will_make_reachable(struct btree_update *as,
@ -983,10 +899,8 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
struct btree *b)
{
struct bch_fs *c = as->c;
struct closure *cl, *cl_n;
struct btree_update *p, *n;
struct btree_write *w;
struct bset_tree *t;
set_btree_node_dying(b);
@ -995,18 +909,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
btree_interior_update_add_node_reference(as, b);
/*
* Does this node have data that hasn't been written in the journal?
*
* If so, we have to wait for the corresponding journal entry to be
* written before making the new nodes reachable - we can't just carry
* over the bset->journal_seq tracking, since we'll be mixing those keys
* in with keys that aren't in the journal anymore:
*/
for_each_bset(b, t)
as->journal_seq = max(as->journal_seq,
le64_to_cpu(bset(b, t)->journal_seq));
mutex_lock(&c->btree_interior_update_lock);
/*
@ -1030,16 +932,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
clear_btree_node_dirty(b);
clear_btree_node_need_write(b);
w = btree_current_write(b);
/*
* Does this node have any btree_update operations waiting on this node
* to be written?
*
* If so, wake them up when this btree_update operation is reachable:
*/
llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list)
llist_add(&cl->list, &as->wait.list);
/*
* Does this node have unwritten data that has a pin on the journal?
@ -1049,13 +941,12 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
* oldest pin of any of the nodes we're freeing. We'll release the pin
* when the new nodes are persistent and reachable on disk:
*/
bch2_journal_pin_copy(&c->journal, &as->journal,
&w->journal, interior_update_flush);
w = btree_current_write(b);
bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
bch2_journal_pin_drop(&c->journal, &w->journal);
w = btree_prev_write(b);
bch2_journal_pin_copy(&c->journal, &as->journal,
&w->journal, interior_update_flush);
bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
bch2_journal_pin_drop(&c->journal, &w->journal);
mutex_unlock(&c->btree_interior_update_lock);
@ -1078,6 +969,7 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
{
struct btree_reserve *reserve;
struct btree_update *as;
int ret;
reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl);
if (IS_ERR(reserve))
@ -1094,6 +986,15 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
bch2_keylist_init(&as->parent_keys, as->inline_keys);
ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
jset_u64s(BKEY_BTREE_PTR_U64s_MAX) * 3, 0);
if (ret) {
bch2_btree_reserve_put(c, reserve);
closure_debug_destroy(&as->cl);
mempool_free(as, &c->btree_interior_update_pool);
return ERR_PTR(ret);
}
mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->list, &c->btree_interior_update_list);
mutex_unlock(&c->btree_interior_update_lock);
@ -1153,22 +1054,6 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
mutex_unlock(&c->btree_interior_update_lock);
}
static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
{
struct btree_root *r = &c->btree_roots[b->btree_id];
mutex_lock(&c->btree_root_lock);
BUG_ON(b != r->b);
bkey_copy(&r->key, &b->key);
r->level = b->level;
r->alive = true;
if (rw == WRITE)
c->btree_roots_dirty = true;
mutex_unlock(&c->btree_root_lock);
}
/**
* bch_btree_set_root - update the root in memory and on disk
*
@ -1201,7 +1086,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
bch2_btree_set_root_inmem(as, b);
btree_update_updated_root(as);
btree_update_updated_root(as, b);
/*
* Unlock old root after new root is visible:
@ -1471,7 +1356,8 @@ static void btree_split(struct btree_update *as, struct btree *b,
bch2_btree_build_aux_trees(n1);
six_unlock_write(&n1->lock);
bch2_keylist_add(&as->parent_keys, &n1->key);
if (parent)
bch2_keylist_add(&as->parent_keys, &n1->key);
}
bch2_btree_node_write(c, n1, SIX_LOCK_intent);
@ -1545,12 +1431,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
(bkey_cmp_packed(b, k, &insert->k) >= 0))
;
while (!bch2_keylist_empty(keys)) {
insert = bch2_keylist_front(keys);
for_each_keylist_key(keys, insert)
bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter);
bch2_keylist_pop_front(keys);
}
btree_update_updated_node(as, b);
@ -1630,7 +1512,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
unsigned flags)
{
struct btree_trans *trans = iter->trans;
struct btree *b = iter->l[0].b;
struct btree *b = iter_l(iter)->b;
struct btree_update *as;
struct closure cl;
int ret = 0;
@ -2107,7 +1989,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
bkey_copy(&b->key, new_key);
}
btree_update_updated_root(as);
btree_update_updated_root(as, b);
bch2_btree_node_unlock_write(b, iter);
}

View File

@ -69,8 +69,10 @@ struct btree_update {
unsigned nodes_written:1;
enum btree_id btree_id;
u8 level;
struct btree_reserve *reserve;
struct journal_preres journal_preres;
/*
* BTREE_INTERIOR_UPDATING_NODE:
@ -83,18 +85,6 @@ struct btree_update {
struct btree *b;
struct list_head write_blocked_list;
/*
* BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now
* we're blocking another btree_update
* @parent_as - btree_update that's waiting on our nodes to finish
* writing, before it can make new nodes visible on disk
* @wait - list of child btree_updates that are waiting on this
* btree_update to make all the new nodes visible before they can free
* their old btree nodes
*/
struct btree_update *parent_as;
struct closure_waitlist wait;
/*
* We may be freeing nodes that were dirty, and thus had journal entries
* pinned: we need to transfer the oldest of those pins to the
@ -103,8 +93,6 @@ struct btree_update {
*/
struct journal_entry_pin journal;
u64 journal_seq;
/*
* Nodes being freed:
* Protected by c->btree_node_pending_free_lock

View File

@ -24,7 +24,7 @@ static inline bool same_leaf_as_prev(struct btree_trans *trans,
struct btree_insert_entry *i)
{
return i != trans->updates2 &&
i[0].iter->l[0].b == i[-1].iter->l[0].b;
iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b;
}
inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
@ -145,6 +145,17 @@ static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin,
return __btree_node_flush(j, pin, 1, seq);
}
inline void bch2_btree_add_journal_pin(struct bch_fs *c,
struct btree *b, u64 seq)
{
struct btree_write *w = btree_current_write(b);
bch2_journal_pin_add(&c->journal, seq, &w->journal,
btree_node_write_idx(b) == 0
? btree_node_flush0
: btree_node_flush1);
}
static inline void __btree_journal_key(struct btree_trans *trans,
enum btree_id btree_id,
struct bkey_i *insert)
@ -172,13 +183,8 @@ static void bch2_btree_journal_key(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct btree *b = iter->l[0].b;
struct btree_write *w = btree_current_write(b);
u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
? trans->journal_res.seq
: j->replay_journal_seq;
struct btree *b = iter_l(iter)->b;
EBUG_ON(iter->level || b->level);
EBUG_ON(trans->journal_res.ref !=
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
@ -188,10 +194,10 @@ static void bch2_btree_journal_key(struct btree_trans *trans,
cpu_to_le64(trans->journal_res.seq);
}
bch2_journal_pin_add(j, seq, &w->journal,
btree_node_write_idx(b) == 0
? btree_node_flush0
: btree_node_flush1);
bch2_btree_add_journal_pin(c, b,
likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
? trans->journal_res.seq
: j->replay_journal_seq);
if (unlikely(!btree_node_dirty(b)))
set_btree_node_dirty(b);
@ -205,17 +211,15 @@ static void btree_insert_key_leaf(struct btree_trans *trans,
struct bkey_i *insert)
{
struct bch_fs *c = trans->c;
struct btree *b = iter->l[0].b;
struct btree *b = iter_l(iter)->b;
struct bset_tree *t = bset_tree_last(b);
int old_u64s = bset_u64s(t);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
EBUG_ON(iter->level);
insert->k.needs_whiteout = false;
if (likely(bch2_btree_bset_insert_key(iter, b, &iter->l[0].iter, insert)))
if (likely(bch2_btree_bset_insert_key(iter, b, &iter_l(iter)->iter, insert)))
bch2_btree_journal_key(trans, iter, insert);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
@ -241,7 +245,6 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
BUG_ON(iter->level);
BUG_ON(bkey_cmp(insert->k.p, iter->pos));
BUG_ON(debug_check_bkeys(c) &&
bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id));
@ -290,7 +293,7 @@ btree_key_can_insert(struct btree_trans *trans,
unsigned *u64s)
{
struct bch_fs *c = trans->c;
struct btree *b = iter->l[0].b;
struct btree *b = iter_l(iter)->b;
static enum btree_insert_ret ret;
if (unlikely(btree_node_fake(b)))
@ -345,7 +348,7 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
struct btree_insert_entry *i;
trans_for_each_update(trans, i)
if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
if (gc_visited(c, gc_pos_btree_node(iter_l(i->iter)->b)))
bch2_mark_update(trans, i->iter, i->k, NULL,
i->trigger_flags|BTREE_TRIGGER_GC);
}
@ -461,7 +464,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
int ret;
trans_for_each_update2(trans, i)
BUG_ON(!btree_node_intent_locked(i->iter, 0));
BUG_ON(!btree_node_intent_locked(i->iter, i->iter->level));
ret = bch2_journal_preres_get(&trans->c->journal,
&trans->journal_preres, trans->journal_preres_u64s,
@ -495,13 +498,13 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
trans_for_each_update2(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_btree_node_lock_for_insert(trans->c,
i->iter->l[0].b, i->iter);
iter_l(i->iter)->b, i->iter);
ret = bch2_trans_commit_write_locked(trans, stopped_at);
trans_for_each_update2(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_btree_node_unlock_write_inlined(i->iter->l[0].b,
bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b,
i->iter);
/*

View File

@ -44,6 +44,10 @@ static int count_iters_for_insert(struct btree_trans *trans,
* extent we're inserting and overwriting:
*/
*nr_iters += 1;
if (*nr_iters >= max_iters) {
*end = bpos_min(*end, k.k->p);
ret = 1;
}
switch (k.k->type) {
case KEY_TYPE_extent:

View File

@ -478,7 +478,8 @@ static int check_extents(struct bch_fs *c)
bch_verbose(c, "checking extents");
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
POS(BCACHEFS_ROOT_INO, 0), 0);
POS(BCACHEFS_ROOT_INO, 0),
BTREE_ITER_INTENT);
retry:
for_each_btree_key_continue(iter, 0, k, ret) {
if (bkey_cmp(prev.p, bkey_start_pos(k.k)) > 0) {

View File

@ -27,30 +27,78 @@
/* iterate over keys read from the journal: */
struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
static struct journal_key *journal_key_search(struct journal_keys *journal_keys,
enum btree_id id, unsigned level,
struct bpos pos)
{
while (iter->k) {
if (iter->k->btree_id == iter->btree_id)
return bkey_i_to_s_c(iter->k->k);
size_t l = 0, r = journal_keys->nr, m;
iter->k++;
if (iter->k == iter->keys->d + iter->keys->nr)
iter->k = NULL;
while (l < r) {
m = l + ((r - l) >> 1);
if ((cmp_int(id, journal_keys->d[m].btree_id) ?:
cmp_int(level, journal_keys->d[m].level) ?:
bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0)
l = m + 1;
else
r = m;
}
return bkey_s_c_null;
BUG_ON(l < journal_keys->nr &&
(cmp_int(id, journal_keys->d[l].btree_id) ?:
cmp_int(level, journal_keys->d[l].level) ?:
bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0);
BUG_ON(l &&
(cmp_int(id, journal_keys->d[l - 1].btree_id) ?:
cmp_int(level, journal_keys->d[l - 1].level) ?:
bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0);
return l < journal_keys->nr ? journal_keys->d + l : NULL;
}
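journal_key_search above is a lower-bound binary search over keys sorted by (btree_id, level, pos), returning the first element not less than the search key. The same shape over a plain struct, using the GCC ?: chaining style the source uses via cmp_int(); everything below is illustrative:

#include <stddef.h>

/* Illustrative stand-in for the (btree_id, level, pos) sort key. */
struct demo_jkey {
	int	btree_id;
	int	level;
	long	pos;
};

/* Three-way compare, like cmp_int(): -1, 0 or 1. */
#define demo_cmp(l, r)	(((l) > (r)) - ((l) < (r)))

/* Lower bound: index of the first element >= @s, or @nr if none is. */
static size_t demo_jkey_search(const struct demo_jkey *d, size_t nr,
			       struct demo_jkey s)
{
	size_t l = 0, r = nr, m;

	while (l < r) {
		m = l + ((r - l) >> 1);
		if ((demo_cmp(s.btree_id, d[m].btree_id) ?:
		     demo_cmp(s.level,    d[m].level)    ?:
		     demo_cmp(s.pos,      d[m].pos)) > 0)
			l = m + 1;		/* d[m] is strictly smaller */
		else
			r = m;			/* d[m] is >= the search key */
	}
	return l;
}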
struct bkey_s_c bch2_journal_iter_next(struct journal_iter *iter)
static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
{
if (!iter->k)
return bkey_s_c_null;
if (iter->k &&
iter->k < iter->keys->d + iter->keys->nr &&
iter->k->btree_id == iter->btree_id &&
iter->k->level == iter->level)
return iter->k->k;
iter->k++;
if (iter->k == iter->keys->d + iter->keys->nr)
iter->k = NULL;
iter->k = NULL;
return NULL;
}
return bch2_journal_iter_peek(iter);
static void bch2_journal_iter_advance(struct journal_iter *iter)
{
if (iter->k)
iter->k++;
}
static void bch2_journal_iter_init(struct journal_iter *iter,
struct journal_keys *journal_keys,
enum btree_id id, unsigned level,
struct bpos pos)
{
iter->btree_id = id;
iter->level = level;
iter->keys = journal_keys;
iter->k = journal_key_search(journal_keys, id, level, pos);
}
static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
{
return iter->btree
? bch2_btree_iter_peek(iter->btree)
: bch2_btree_node_iter_peek_unpack(&iter->node_iter,
iter->b, &iter->unpacked);
}
static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
{
if (iter->btree)
bch2_btree_iter_next(iter->btree);
else
bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
}
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
@ -59,10 +107,10 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
case none:
break;
case btree:
bch2_btree_iter_next(iter->btree);
bch2_journal_iter_advance_btree(iter);
break;
case journal:
bch2_journal_iter_next(&iter->journal);
bch2_journal_iter_advance(&iter->journal);
break;
}
@ -74,14 +122,16 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
struct bkey_s_c ret;
while (1) {
struct bkey_s_c btree_k = bch2_btree_iter_peek(iter->btree);
struct bkey_s_c journal_k = bch2_journal_iter_peek(&iter->journal);
struct bkey_s_c btree_k =
bch2_journal_iter_peek_btree(iter);
struct bkey_s_c journal_k =
bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal));
if (btree_k.k && journal_k.k) {
int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p);
if (!cmp)
bch2_btree_iter_next(iter->btree);
bch2_journal_iter_advance_btree(iter);
iter->last = cmp < 0 ? btree : journal;
} else if (btree_k.k) {
@ -94,6 +144,14 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
}
ret = iter->last == journal ? journal_k : btree_k;
if (iter->b &&
bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) {
iter->journal.k = NULL;
iter->last = none;
return bkey_s_c_null;
}
if (!bkey_deleted(ret.k))
break;
@ -110,41 +168,32 @@ struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *
return bch2_btree_and_journal_iter_peek(iter);
}
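bch2_btree_and_journal_iter_peek above merges two sorted streams, and a journal key at the same position takes precedence over (and consumes) the btree key. A toy model of that merge over two sorted integer arrays, where the journal array overrides equal keys:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct demo_merge_iter {
	const int *btree;   size_t btree_nr,   btree_pos;
	const int *journal; size_t journal_nr, journal_pos;
};

/* Return the next key in merged order, or -1 when both streams are done.
 * On equal keys the btree side is skipped so the journal entry wins. */
static int demo_merge_next(struct demo_merge_iter *it)
{
	bool have_b = it->btree_pos   < it->btree_nr;
	bool have_j = it->journal_pos < it->journal_nr;

	if (!have_b && !have_j)
		return -1;

	if (have_b && have_j) {
		int b = it->btree[it->btree_pos];
		int j = it->journal[it->journal_pos];

		if (b == j)		/* journal overlays the btree key */
			it->btree_pos++;
		if (b < j)
			return it->btree[it->btree_pos++];
		return it->journal[it->journal_pos++];
	}

	return have_b ? it->btree[it->btree_pos++]
		      : it->journal[it->journal_pos++];
}

int main(void)
{
	const int btree[]   = { 1, 3, 5, 7 };
	const int journal[] = { 3, 4, 7 };
	struct demo_merge_iter it = {
		.btree   = btree,   .btree_nr   = 4,
		.journal = journal, .journal_nr = 3,
	};
	int k;

	while ((k = demo_merge_next(&it)) >= 0)
		printf("%d ", k);	/* prints: 1 3 4 5 7 */
	printf("\n");
	return 0;
}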
struct journal_key *journal_key_search(struct journal_keys *journal_keys,
enum btree_id id, struct bpos pos)
{
size_t l = 0, r = journal_keys->nr, m;
while (l < r) {
m = l + ((r - l) >> 1);
if ((cmp_int(id, journal_keys->d[m].btree_id) ?:
bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0)
l = m + 1;
else
r = m;
}
BUG_ON(l < journal_keys->nr &&
(cmp_int(id, journal_keys->d[l].btree_id) ?:
bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0);
BUG_ON(l &&
(cmp_int(id, journal_keys->d[l - 1].btree_id) ?:
bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0);
return l < journal_keys->nr ? journal_keys->d + l : NULL;
}
void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter,
struct btree_trans *trans,
struct journal_keys *journal_keys,
enum btree_id id, struct bpos pos)
{
iter->journal.keys = journal_keys;
iter->journal.k = journal_key_search(journal_keys, id, pos);
iter->journal.btree_id = id;
memset(iter, 0, sizeof(*iter));
iter->btree = bch2_trans_get_iter(trans, id, pos, 0);
bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos);
}
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
struct journal_keys *journal_keys,
struct btree *b)
{
struct bpos start = b->data->min_key;
if (btree_node_type_is_extents(b->btree_id))
start = bkey_successor(start);
memset(iter, 0, sizeof(*iter));
iter->b = b;
bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b);
bch2_journal_iter_init(&iter->journal, journal_keys,
b->btree_id, b->level, start);
}
/* sort and dedup all keys in the journal: */
@ -169,7 +218,8 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
const struct journal_key *l = _l;
const struct journal_key *r = _r;
return cmp_int(l->btree_id, r->btree_id) ?:
return cmp_int(l->btree_id, r->btree_id) ?:
cmp_int(l->level, r->level) ?:
bkey_cmp(l->k->k.p, r->k->k.p) ?:
cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->journal_offset, r->journal_offset);
@ -180,9 +230,10 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
const struct journal_key *l = _l;
const struct journal_key *r = _r;
return cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->btree_id, r->btree_id) ?:
bkey_cmp(l->k->k.p, r->k->k.p);
return cmp_int(r->level, l->level) ?:
cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->btree_id, r->btree_id) ?:
bkey_cmp(l->k->k.p, r->k->k.p);
}
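The new journal_sort_seq_cmp orders interior-node keys (higher level) before leaf keys, then falls back to journal sequence, which is what lets replay insert interior-node updates first. A small qsort() illustration of that ordering, keeping only the first two fields of the chain (toy types, not the real journal_key):

#include <stdio.h>
#include <stdlib.h>

/* Toy journal key: only the fields the comparator needs. */
struct demo_jkey {
	unsigned level;
	unsigned journal_seq;
};

#define demo_cmp(l, r)	(((l) > (r)) - ((l) < (r)))

static int demo_replay_cmp(const void *_l, const void *_r)
{
	const struct demo_jkey *l = _l, *r = _r;

	/* r before l: higher levels (interior nodes) sort first */
	return demo_cmp(r->level, l->level) ?:
	       demo_cmp(l->journal_seq, r->journal_seq);
}

int main(void)
{
	struct demo_jkey keys[] = {
		{ 0, 5 }, { 1, 3 }, { 0, 1 }, { 2, 4 },
	};
	unsigned i;

	qsort(keys, 4, sizeof(keys[0]), demo_replay_cmp);

	for (i = 0; i < 4; i++)
		printf("level %u seq %u\n", keys[i].level, keys[i].journal_seq);
	/* prints level 2, then level 1, then the level-0 keys in seq order */
	return 0;
}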
static void journal_keys_free(struct journal_keys *keys)
@ -218,6 +269,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
for_each_jset_key(k, _n, entry, &p->j)
keys.d[keys.nr++] = (struct journal_key) {
.btree_id = entry->btree_id,
.level = entry->level,
.k = k,
.journal_seq = le64_to_cpu(p->j.seq) -
keys.journal_seq_base,
@ -229,7 +281,8 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
src = dst = keys.d;
while (src < keys.d + keys.nr) {
while (src + 1 < keys.d + keys.nr &&
src[0].btree_id == src[1].btree_id &&
src[0].btree_id == src[1].btree_id &&
src[0].level == src[1].level &&
!bkey_cmp(src[0].k->k.p, src[1].k->k.p))
src++;
@ -351,12 +404,15 @@ err:
}
static int __bch2_journal_replay_key(struct btree_trans *trans,
enum btree_id id, struct bkey_i *k)
enum btree_id id, unsigned level,
struct bkey_i *k)
{
struct btree_iter *iter;
int ret;
iter = bch2_trans_get_iter(trans, id, k->k.p, BTREE_ITER_INTENT);
iter = bch2_trans_get_node_iter(trans, id, k->k.p,
BTREE_MAX_DEPTH, level,
BTREE_ITER_INTENT);
if (IS_ERR(iter))
return PTR_ERR(iter);
@ -375,13 +431,13 @@ static int __bch2_journal_replay_key(struct btree_trans *trans,
}
static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id,
struct bkey_i *k)
unsigned level, struct bkey_i *k)
{
return bch2_trans_do(c, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_JOURNAL_REPLAY,
__bch2_journal_replay_key(&trans, id, k));
__bch2_journal_replay_key(&trans, id, level, k));
}
static int bch2_journal_replay(struct bch_fs *c,
@ -393,15 +449,20 @@ static int bch2_journal_replay(struct bch_fs *c,
sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL);
for_each_journal_key(keys, i) {
replay_now_at(j, keys.journal_seq_base + i->journal_seq);
replay_now_at(j, keys.journal_seq_base);
for_each_journal_key(keys, i) {
if (!i->level)
replay_now_at(j, keys.journal_seq_base + i->journal_seq);
if (i->level)
ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
if (i->btree_id == BTREE_ID_ALLOC)
ret = bch2_alloc_replay_key(c, i->k);
else if (i->k->k.size)
ret = bch2_extent_replay_key(c, i->btree_id, i->k);
else
ret = bch2_journal_replay_key(c, i->btree_id, i->k);
ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
if (ret) {
bch_err(c, "journal replay: error %d while replaying key",
@ -864,7 +925,7 @@ int bch2_fs_recovery(struct bch_fs *c)
*/
bch_info(c, "starting metadata mark and sweep");
err = "error in mark and sweep";
ret = bch2_gc(c, NULL, true, true);
ret = bch2_gc(c, &journal_keys, true, true);
if (ret)
goto err;
bch_verbose(c, "mark and sweep done");

View File

@ -5,6 +5,7 @@
struct journal_keys {
struct journal_key {
enum btree_id btree_id:8;
unsigned level:8;
struct bkey_i *k;
u32 journal_seq;
u32 journal_offset;
@ -17,15 +18,23 @@ struct journal_keys {
for (i = (keys).d; i < (keys).d + (keys).nr; (i)++)
struct journal_iter {
enum btree_id btree_id;
unsigned level;
struct journal_keys *keys;
struct journal_key *k;
enum btree_id btree_id;
};
struct btree_and_journal_iter {
enum btree_id btree_id;
/*
* Iterate over keys in the btree, with keys from the journal overlaid on top:
*/
struct btree_and_journal_iter {
struct btree_iter *btree;
struct btree *b;
struct btree_node_iter node_iter;
struct bkey unpacked;
struct journal_iter journal;
enum last_key_returned {
@ -38,12 +47,14 @@ struct btree_and_journal_iter {
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *);
struct journal_key *journal_key_search(struct journal_keys *,
enum btree_id, struct bpos);
void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *,
struct btree_trans *,
struct journal_keys *,
enum btree_id, struct bpos);
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
struct journal_keys *,
struct btree *);
int bch2_fs_recovery(struct bch_fs *);
int bch2_fs_initialize(struct bch_fs *);

View File

@ -958,6 +958,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA);
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled;
ret = bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@ -1089,6 +1090,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA;
c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates);
c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled);
u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;