Update bcachefs sources to 8fd009dd76 bcachefs: Rip out code for storing backpointers in alloc keys

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Kent Overstreet 2023-03-31 15:52:24 -04:00
parent d22c79d2ff
commit 7f102ee83d
39 changed files with 1552 additions and 1239 deletions

View File

@ -1 +1 @@
0342eebf85b7be76f01bacec8f958c6e6039535b 8fd009dd764dabd79e2b42e1c85812a08ad1d6c0

View File

@ -113,17 +113,40 @@ static inline void *bio_data(struct bio *bio)
#define __bio_kunmap_atomic(addr) kunmap_atomic(addr) #define __bio_kunmap_atomic(addr) kunmap_atomic(addr)
static inline struct bio_vec *bio_next_segment(const struct bio *bio, static inline struct bio_vec bio_iter_all_peek(const struct bio *bio,
struct bvec_iter_all *iter) struct bvec_iter_all *iter)
{ {
if (iter->idx >= bio->bi_vcnt) if (WARN_ON(iter->idx >= bio->bi_vcnt))
return NULL; return (struct bio_vec) { NULL };
return &bio->bi_io_vec[iter->idx]; return bvec_iter_all_peek(bio->bi_io_vec, iter);
} }
#define bio_for_each_segment_all(bvl, bio, iter) \ static inline void bio_iter_all_advance(const struct bio *bio,
for ((iter).idx = 0; (bvl = bio_next_segment((bio), &(iter))); (iter).idx++) struct bvec_iter_all *iter,
unsigned bytes)
{
bvec_iter_all_advance(bio->bi_io_vec, iter, bytes);
WARN_ON(iter->idx > bio->bi_vcnt ||
(iter->idx == bio->bi_vcnt && iter->done));
}
#define bio_for_each_segment_all_continue(bvl, bio, iter) \
for (; \
iter.idx < bio->bi_vcnt && \
((bvl = bio_iter_all_peek(bio, &iter)), true); \
bio_iter_all_advance((bio), &iter, bvl.bv_len))
/*
* drivers should _never_ use the all version - the bio may have been split
* before it got to the driver and the driver won't own all of it
*/
#define bio_for_each_segment_all(bvl, bio, iter) \
for (bvec_iter_all_init(&iter); \
iter.idx < (bio)->bi_vcnt && \
((bvl = bio_iter_all_peek((bio), &iter)), true); \
bio_iter_all_advance((bio), &iter, bvl.bv_len))
static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter,
unsigned bytes) unsigned bytes)
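
(Not part of the patch: a minimal usage sketch of the reworked iterator, using a hypothetical helper. Note that the loop variable is now a struct bio_vec by value rather than a pointer, so members are accessed with '.' instead of '->'.)

static unsigned example_bio_payload_bytes(struct bio *bio)
{
	struct bvec_iter_all iter;
	struct bio_vec bv;
	unsigned bytes = 0;

	/* each bv is an at-most-page-sized chunk of the bio's payload */
	bio_for_each_segment_all(bv, bio, iter)
		bytes += bv.bv_len;

	return bytes;
}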

View File

@ -43,10 +43,6 @@ struct bvec_iter {
current bvec */ current bvec */
}; };
struct bvec_iter_all {
int idx;
};
/* /*
* various member access, note that bio_data should of course not be used * various member access, note that bio_data should of course not be used
* on highmem page vectors * on highmem page vectors
@ -98,4 +94,52 @@ static inline void bvec_iter_advance(const struct bio_vec *bv,
((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \ ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \
bvec_iter_advance((bio_vec), &(iter), (bvl).bv_len)) bvec_iter_advance((bio_vec), &(iter), (bvl).bv_len))
/*
* bvec_iter_all: for advancing over individual pages in a bio, as it was when
* it was first created:
*/
struct bvec_iter_all {
int idx;
unsigned done;
};
static inline void bvec_iter_all_init(struct bvec_iter_all *iter_all)
{
iter_all->done = 0;
iter_all->idx = 0;
}
static inline struct bio_vec __bvec_iter_all_peek(const struct bio_vec *bvec,
const struct bvec_iter_all *iter)
{
struct bio_vec bv = bvec[iter->idx];
BUG_ON(iter->done >= bv.bv_len);
bv.bv_offset += iter->done;
bv.bv_len -= iter->done;
return bv;
}
static inline struct bio_vec bvec_iter_all_peek(const struct bio_vec *bvec,
const struct bvec_iter_all *iter)
{
struct bio_vec bv = __bvec_iter_all_peek(bvec, iter);
bv.bv_len = min_t(unsigned, PAGE_SIZE - bv.bv_offset, bv.bv_len);
return bv;
}
static inline void bvec_iter_all_advance(const struct bio_vec *bvec,
struct bvec_iter_all *iter,
unsigned bytes)
{
iter->done += bytes;
while (iter->done && iter->done >= bvec[iter->idx].bv_len) {
iter->done -= bvec[iter->idx].bv_len;
iter->idx++;
}
}
#endif /* __LINUX_BVEC_ITER_H */ #endif /* __LINUX_BVEC_ITER_H */
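
(Illustration only, not part of the patch: the peek/advance primitives can also be used directly when a caller wants to consume a bvec array in at-most-page-sized steps; example_count_pages below is a hypothetical helper.)

static unsigned example_count_pages(const struct bio_vec *bvec,
				    unsigned nr_vecs, unsigned bytes)
{
	struct bvec_iter_all iter;
	unsigned pages = 0;

	bvec_iter_all_init(&iter);

	while (iter.idx < nr_vecs && bytes) {
		/* peek clamps bv_len so bv never crosses a page boundary */
		struct bio_vec bv = bvec_iter_all_peek(bvec, &iter);
		unsigned b = min(bv.bv_len, bytes);

		pages++;
		bytes -= b;
		bvec_iter_all_advance(bvec, &iter, b);
	}

	return pages;
}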

View File

@ -831,10 +831,35 @@ DEFINE_EVENT(transaction_event, trans_restart_injected,
TP_ARGS(trans, caller_ip) TP_ARGS(trans, caller_ip)
); );
DEFINE_EVENT(transaction_event, trans_restart_split_race, TRACE_EVENT(trans_restart_split_race,
TP_PROTO(struct btree_trans *trans, TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip), unsigned long caller_ip,
TP_ARGS(trans, caller_ip) struct btree *b),
TP_ARGS(trans, caller_ip, b),
TP_STRUCT__entry(
__array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
__field(u8, level )
__field(u16, written )
__field(u16, blocks )
__field(u16, u64s_remaining )
),
TP_fast_assign(
strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
__entry->level = b->c.level;
__entry->written = b->written;
__entry->blocks = btree_blocks(trans->c);
__entry->u64s_remaining = bch_btree_keys_u64s_remaining(trans->c, b);
),
TP_printk("%s %pS l=%u written %u/%u u64s remaining %u",
__entry->trans_fn, (void *) __entry->caller_ip,
__entry->level,
__entry->written, __entry->blocks,
__entry->u64s_remaining)
); );
DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim, DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim,

View File

@ -451,6 +451,8 @@ void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
if (src < dst) if (src < dst)
memset(src, 0, dst - src); memset(src, 0, dst - src);
SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0);
} else { } else {
struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
@ -476,20 +478,13 @@ static noinline struct bkey_i_alloc_v4 *
__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
{ {
struct bkey_i_alloc_v4 *ret; struct bkey_i_alloc_v4 *ret;
if (k.k->type == KEY_TYPE_alloc_v4) {
struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
unsigned bytes = sizeof(struct bkey_i_alloc_v4) +
BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) *
sizeof(struct bch_backpointer);
void *src, *dst;
/* ret = bch2_trans_kmalloc(trans, sizeof(struct bkey_i_alloc_v4));
* Reserve space for one more backpointer here: if (IS_ERR(ret))
* Not sketchy at doing it this way, nope... return ret;
*/
ret = bch2_trans_kmalloc(trans, bytes + sizeof(struct bch_backpointer)); if (k.k->type == KEY_TYPE_alloc_v4) {
if (IS_ERR(ret)) void *src, *dst;
return ret;
bkey_reassemble(&ret->k_i, k); bkey_reassemble(&ret->k_i, k);
@ -497,17 +492,12 @@ __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
dst = alloc_v4_backpointers(&ret->v); dst = alloc_v4_backpointers(&ret->v);
memmove(dst, src, BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v) *
sizeof(struct bch_backpointer));
if (src < dst) if (src < dst)
memset(src, 0, dst - src); memset(src, 0, dst - src);
SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0);
set_alloc_v4_u64s(ret); set_alloc_v4_u64s(ret);
} else { } else {
ret = bch2_trans_kmalloc(trans, sizeof(struct bkey_i_alloc_v4) +
sizeof(struct bch_backpointer));
if (IS_ERR(ret))
return ret;
bkey_alloc_v4_init(&ret->k_i); bkey_alloc_v4_init(&ret->k_i);
ret->k.p = k.k->p; ret->k.p = k.k->p;
bch2_alloc_to_v4(k, &ret->v); bch2_alloc_to_v4(k, &ret->v);
@ -517,8 +507,12 @@ __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k) static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k)
{ {
struct bkey_s_c_alloc_v4 a;
if (likely(k.k->type == KEY_TYPE_alloc_v4) && if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
BCH_ALLOC_V4_BACKPOINTERS_START(bkey_s_c_to_alloc_v4(k).v) == BCH_ALLOC_V4_U64s) { ((a = bkey_s_c_to_alloc_v4(k), true) &&
BCH_ALLOC_V4_BACKPOINTERS_START(a.v) == BCH_ALLOC_V4_U64s &&
BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0)) {
/* /*
* Reserve space for one more backpointer here: * Reserve space for one more backpointer here:
* Not sketchy at doing it this way, nope... * Not sketchy at doing it this way, nope...
@ -962,10 +956,17 @@ struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, s
struct bpos next; struct bpos next;
bch2_trans_copy_iter(&iter2, iter); bch2_trans_copy_iter(&iter2, iter);
k = bch2_btree_iter_peek_upto(&iter2,
bkey_min(bkey_min(end, if (!bpos_eq(iter->path->l[0].b->key.k.p, SPOS_MAX))
iter->path->l[0].b->key.k.p), end = bkey_min(end, bpos_nosnap_successor(iter->path->l[0].b->key.k.p));
POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1)));
end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));
/*
* btree node min/max is a closed interval, upto takes a half
* open interval:
*/
k = bch2_btree_iter_peek_upto(&iter2, end);
next = iter2.pos; next = iter2.pos;
bch2_trans_iter_exit(iter->trans, &iter2); bch2_trans_iter_exit(iter->trans, &iter2);
@ -1760,7 +1761,7 @@ static void bch2_do_discards_work(struct work_struct *work)
void bch2_do_discards(struct bch_fs *c) void bch2_do_discards(struct bch_fs *c)
{ {
if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) && if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) &&
!queue_work(system_long_wq, &c->discard_work)) !queue_work(c->write_ref_wq, &c->discard_work))
bch2_write_ref_put(c, BCH_WRITE_REF_discard); bch2_write_ref_put(c, BCH_WRITE_REF_discard);
} }
@ -1886,11 +1887,12 @@ err:
void bch2_do_invalidates(struct bch_fs *c) void bch2_do_invalidates(struct bch_fs *c)
{ {
if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) && if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) &&
!queue_work(system_long_wq, &c->invalidate_work)) !queue_work(c->write_ref_wq, &c->invalidate_work))
bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
} }
static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
unsigned long *last_updated)
{ {
struct btree_trans trans; struct btree_trans trans;
struct btree_iter iter; struct btree_iter iter;
@ -1910,6 +1912,12 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca)
* freespace/need_discard/need_gc_gens btrees as needed: * freespace/need_discard/need_gc_gens btrees as needed:
*/ */
while (1) { while (1) {
if (*last_updated + HZ * 10 < jiffies) {
bch_info(ca, "%s: currently at %llu/%llu",
__func__, iter.pos.offset, ca->mi.nbuckets);
*last_updated = jiffies;
}
bch2_trans_begin(&trans); bch2_trans_begin(&trans);
if (bkey_ge(iter.pos, end)) { if (bkey_ge(iter.pos, end)) {
@ -1989,6 +1997,7 @@ int bch2_fs_freespace_init(struct bch_fs *c)
unsigned i; unsigned i;
int ret = 0; int ret = 0;
bool doing_init = false; bool doing_init = false;
unsigned long last_updated = jiffies;
/* /*
* We can crash during the device add path, so we need to check this on * We can crash during the device add path, so we need to check this on
@ -2004,7 +2013,7 @@ int bch2_fs_freespace_init(struct bch_fs *c)
doing_init = true; doing_init = true;
} }
ret = bch2_dev_freespace_init(c, ca); ret = bch2_dev_freespace_init(c, ca, &last_updated);
if (ret) { if (ret) {
percpu_ref_put(&ca->ref); percpu_ref_put(&ca->ref);
return ret; return ret;

View File

@ -345,17 +345,17 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) {
struct bch_backpointer bp; struct bch_backpointer bp;
u64 bp_offset = 0; struct bpos bp_pos = POS_MIN;
ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1, ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1,
&bp_offset, &bp, &bp_pos, &bp,
BTREE_ITER_NOPRESERVE); BTREE_ITER_NOPRESERVE);
if (ret) { if (ret) {
ob = ERR_PTR(ret); ob = ERR_PTR(ret);
goto err; goto err;
} }
if (bp_offset != U64_MAX) { if (!bkey_eq(bp_pos, POS_MAX)) {
/* /*
* Bucket may have data in it - we don't call * Bucket may have data in it - we don't call
* bc2h_trans_inconnsistent() because fsck hasn't * bc2h_trans_inconnsistent() because fsck hasn't

View File

@ -69,6 +69,10 @@ void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer
void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{ {
prt_str(out, "bucket=");
bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p));
prt_str(out, " ");
bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v);
} }
@ -81,117 +85,6 @@ void bch2_backpointer_swab(struct bkey_s k)
bch2_bpos_swab(&bp.v->pos); bch2_bpos_swab(&bp.v->pos);
} }
#define BACKPOINTER_OFFSET_MAX ((1ULL << 40) - 1)
static inline int backpointer_cmp(struct bch_backpointer l, struct bch_backpointer r)
{
return cmp_int(l.bucket_offset, r.bucket_offset);
}
static int bch2_backpointer_del_by_offset(struct btree_trans *trans,
struct bpos bucket,
u64 bp_offset,
struct bch_backpointer bp)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
int ret;
if (bp_offset < BACKPOINTER_OFFSET_MAX) {
struct bch_backpointer *bps;
struct bkey_i_alloc_v4 *a;
unsigned i, nr;
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
bucket,
BTREE_ITER_INTENT|
BTREE_ITER_SLOTS|
BTREE_ITER_WITH_UPDATES);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
if (k.k->type != KEY_TYPE_alloc_v4) {
ret = -ENOENT;
goto err;
}
a = bch2_alloc_to_v4_mut(trans, k);
ret = PTR_ERR_OR_ZERO(a);
if (ret)
goto err;
bps = alloc_v4_backpointers(&a->v);
nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v);
for (i = 0; i < nr; i++) {
if (bps[i].bucket_offset == bp_offset)
goto found;
if (bps[i].bucket_offset > bp_offset)
break;
}
ret = -ENOENT;
goto err;
found:
if (memcmp(&bps[i], &bp, sizeof(bp))) {
ret = -ENOENT;
goto err;
}
array_remove_item(bps, nr, i);
SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr);
set_alloc_v4_u64s(a);
ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
} else {
bp_offset -= BACKPOINTER_OFFSET_MAX;
bch2_trans_iter_init(trans, &iter, BTREE_ID_backpointers,
bucket_pos_to_bp(c, bucket, bp_offset),
BTREE_ITER_INTENT|
BTREE_ITER_SLOTS|
BTREE_ITER_WITH_UPDATES);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
if (k.k->type != KEY_TYPE_backpointer ||
memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp))) {
ret = -ENOENT;
goto err;
}
ret = bch2_btree_delete_at(trans, &iter, 0);
}
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
bool bch2_bucket_backpointer_del(struct btree_trans *trans,
struct bkey_i_alloc_v4 *a,
struct bch_backpointer bp)
{
struct bch_backpointer *bps = alloc_v4_backpointers(&a->v);
unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v);
for (i = 0; i < nr; i++) {
int cmp = backpointer_cmp(bps[i], bp) ?:
memcmp(&bps[i], &bp, sizeof(bp));
if (!cmp) {
array_remove_item(bps, nr, i);
SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr);
set_alloc_v4_u64s(a);
return true;
}
if (cmp >= 0)
break;
}
return false;
}
static noinline int backpointer_mod_err(struct btree_trans *trans, static noinline int backpointer_mod_err(struct btree_trans *trans,
struct bch_backpointer bp, struct bch_backpointer bp,
struct bkey_s_c bp_k, struct bkey_s_c bp_k,
@ -245,7 +138,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
} }
int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
struct bkey_i_alloc_v4 *a, struct bpos bucket,
struct bch_backpointer bp, struct bch_backpointer bp,
struct bkey_s_c orig_k, struct bkey_s_c orig_k,
bool insert) bool insert)
@ -262,7 +155,7 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
return ret; return ret;
bkey_backpointer_init(&bp_k->k_i); bkey_backpointer_init(&bp_k->k_i);
bp_k->k.p = bucket_pos_to_bp(c, a->k.p, bp.bucket_offset); bp_k->k.p = bucket_pos_to_bp(c, bucket, bp.bucket_offset);
bp_k->v = bp; bp_k->v = bp;
if (!insert) { if (!insert) {
@ -271,7 +164,7 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
} }
bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
bucket_pos_to_bp(c, a->k.p, bp.bucket_offset), bp_k->k.p,
BTREE_ITER_INTENT| BTREE_ITER_INTENT|
BTREE_ITER_SLOTS| BTREE_ITER_SLOTS|
BTREE_ITER_WITH_UPDATES); BTREE_ITER_WITH_UPDATES);
@ -298,94 +191,62 @@ err:
/* /*
* Find the next backpointer >= *bp_offset: * Find the next backpointer >= *bp_offset:
*/ */
int __bch2_get_next_backpointer(struct btree_trans *trans, int bch2_get_next_backpointer(struct btree_trans *trans,
struct bpos bucket, int gen, struct bpos bucket, int gen,
u64 *bp_offset, struct bpos *bp_pos,
struct bpos *bp_pos_ret, struct bch_backpointer *bp,
struct bch_backpointer *dst, unsigned iter_flags)
unsigned iter_flags)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct bpos bp_pos, bp_end_pos; struct bpos bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0);
struct btree_iter alloc_iter, bp_iter = { NULL }; struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL };
struct bkey_s_c k; struct bkey_s_c k;
struct bkey_s_c_alloc_v4 a; int ret = 0;
size_t i;
int ret;
if (*bp_offset == U64_MAX) if (bpos_ge(*bp_pos, bp_end_pos))
return 0;
bp_pos = bucket_pos_to_bp(c, bucket,
max(*bp_offset, BACKPOINTER_OFFSET_MAX) - BACKPOINTER_OFFSET_MAX);
bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0);
bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
bucket, BTREE_ITER_CACHED);
k = bch2_btree_iter_peek_slot(&alloc_iter);
ret = bkey_err(k);
if (ret)
goto out;
if (k.k->type != KEY_TYPE_alloc_v4)
goto done; goto done;
a = bkey_s_c_to_alloc_v4(k); if (gen >= 0) {
if (gen >= 0 && a.v->gen != gen) bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
goto done; bucket, BTREE_ITER_CACHED|iter_flags);
k = bch2_btree_iter_peek_slot(&alloc_iter);
ret = bkey_err(k);
if (ret)
goto out;
for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) { if (k.k->type != KEY_TYPE_alloc_v4 ||
if (alloc_v4_backpointers_c(a.v)[i].bucket_offset < *bp_offset) bkey_s_c_to_alloc_v4(k).v->gen != gen)
continue; goto done;
*dst = alloc_v4_backpointers_c(a.v)[i];
*bp_offset = dst->bucket_offset;
goto out;
} }
*bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(c, bucket, 0));
for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers, for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers,
bp_pos, 0, k, ret) { *bp_pos, iter_flags, k, ret) {
if (bpos_ge(k.k->p, bp_end_pos)) if (bpos_ge(k.k->p, bp_end_pos))
break; break;
if (k.k->type != KEY_TYPE_backpointer) *bp_pos = k.k->p;
continue; *bp = *bkey_s_c_to_backpointer(k).v;
*dst = *bkey_s_c_to_backpointer(k).v;
*bp_offset = dst->bucket_offset + BACKPOINTER_OFFSET_MAX;
*bp_pos_ret = k.k->p;
goto out; goto out;
} }
done: done:
*bp_offset = U64_MAX; *bp_pos = SPOS_MAX;
out: out:
bch2_trans_iter_exit(trans, &bp_iter); bch2_trans_iter_exit(trans, &bp_iter);
bch2_trans_iter_exit(trans, &alloc_iter); bch2_trans_iter_exit(trans, &alloc_iter);
return ret; return ret;
} }
int bch2_get_next_backpointer(struct btree_trans *trans,
struct bpos bucket, int gen,
u64 *bp_offset,
struct bch_backpointer *dst,
unsigned iter_flags)
{
struct bpos bp_pos;
return __bch2_get_next_backpointer(trans, bucket, gen,
bp_offset, &bp_pos,
dst, iter_flags);
}
static void backpointer_not_found(struct btree_trans *trans, static void backpointer_not_found(struct btree_trans *trans,
struct bpos bucket, struct bpos bp_pos,
u64 bp_offset,
struct bch_backpointer bp, struct bch_backpointer bp,
struct bkey_s_c k, struct bkey_s_c k,
const char *thing_it_points_to) const char *thing_it_points_to)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF; struct printbuf buf = PRINTBUF;
struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
if (likely(!bch2_backpointers_no_use_write_buffer)) if (likely(!bch2_backpointers_no_use_write_buffer))
return; return;
@ -396,14 +257,9 @@ static void backpointer_not_found(struct btree_trans *trans,
bch2_bpos_to_text(&buf, bucket); bch2_bpos_to_text(&buf, bucket);
prt_printf(&buf, "\n "); prt_printf(&buf, "\n ");
if (bp_offset >= BACKPOINTER_OFFSET_MAX) { prt_printf(&buf, "backpointer pos: ");
struct bpos bp_pos = bch2_bpos_to_text(&buf, bp_pos);
bucket_pos_to_bp(c, bucket, prt_printf(&buf, "\n ");
bp_offset - BACKPOINTER_OFFSET_MAX);
prt_printf(&buf, "backpointer pos: ");
bch2_bpos_to_text(&buf, bp_pos);
prt_printf(&buf, "\n ");
}
bch2_backpointer_to_text(&buf, &bp); bch2_backpointer_to_text(&buf, &bp);
prt_printf(&buf, "\n "); prt_printf(&buf, "\n ");
@ -418,11 +274,12 @@ static void backpointer_not_found(struct btree_trans *trans,
struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
struct btree_iter *iter, struct btree_iter *iter,
struct bpos bucket, struct bpos bp_pos,
u64 bp_offset, struct bch_backpointer bp,
struct bch_backpointer bp) unsigned iter_flags)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
struct bkey_s_c k; struct bkey_s_c k;
bch2_trans_node_iter_init(trans, iter, bch2_trans_node_iter_init(trans, iter,
@ -430,7 +287,7 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
bp.pos, bp.pos,
0, 0,
min(bp.level, c->btree_roots[bp.btree_id].level), min(bp.level, c->btree_roots[bp.btree_id].level),
0); iter_flags);
k = bch2_btree_iter_peek_slot(iter); k = bch2_btree_iter_peek_slot(iter);
if (bkey_err(k)) { if (bkey_err(k)) {
bch2_trans_iter_exit(trans, iter); bch2_trans_iter_exit(trans, iter);
@ -455,7 +312,7 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
* been written out yet - backpointer_get_node() checks for * been written out yet - backpointer_get_node() checks for
* this: * this:
*/ */
b = bch2_backpointer_get_node(trans, iter, bucket, bp_offset, bp); b = bch2_backpointer_get_node(trans, iter, bp_pos, bp);
if (!IS_ERR_OR_NULL(b)) if (!IS_ERR_OR_NULL(b))
return bkey_i_to_s_c(&b->key); return bkey_i_to_s_c(&b->key);
@ -466,7 +323,7 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
return bkey_s_c_null; return bkey_s_c_null;
} }
backpointer_not_found(trans, bucket, bp_offset, bp, k, "extent"); backpointer_not_found(trans, bp_pos, bp, k, "extent");
} }
return bkey_s_c_null; return bkey_s_c_null;
@ -474,11 +331,11 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
struct btree *bch2_backpointer_get_node(struct btree_trans *trans, struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
struct btree_iter *iter, struct btree_iter *iter,
struct bpos bucket, struct bpos bp_pos,
u64 bp_offset,
struct bch_backpointer bp) struct bch_backpointer bp)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
struct btree *b; struct btree *b;
BUG_ON(!bp.level); BUG_ON(!bp.level);
@ -501,7 +358,7 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
if (b && btree_node_will_make_reachable(b)) { if (b && btree_node_will_make_reachable(b)) {
b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
} else { } else {
backpointer_not_found(trans, bucket, bp_offset, bp, backpointer_not_found(trans, bp_pos, bp,
bkey_i_to_s_c(&b->key), "btree node"); bkey_i_to_s_c(&b->key), "btree node");
b = NULL; b = NULL;
} }
@ -570,7 +427,7 @@ struct bpos_level {
}; };
static int check_bp_exists(struct btree_trans *trans, static int check_bp_exists(struct btree_trans *trans,
struct bpos bucket_pos, struct bpos bucket,
struct bch_backpointer bp, struct bch_backpointer bp,
struct bkey_s_c orig_k, struct bkey_s_c orig_k,
struct bpos bucket_start, struct bpos bucket_start,
@ -578,40 +435,20 @@ static int check_bp_exists(struct btree_trans *trans,
struct bpos_level *last_flushed) struct bpos_level *last_flushed)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct btree_iter alloc_iter, bp_iter = { NULL }; struct btree_iter bp_iter = { NULL };
struct printbuf buf = PRINTBUF; struct printbuf buf = PRINTBUF;
struct bkey_s_c alloc_k, bp_k; struct bkey_s_c bp_k;
int ret; int ret;
if (bpos_lt(bucket_pos, bucket_start) || if (bpos_lt(bucket, bucket_start) ||
bpos_gt(bucket_pos, bucket_end)) bpos_gt(bucket, bucket_end))
return 0; return 0;
bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, bucket_pos, 0); if (!bch2_dev_bucket_exists(c, bucket))
alloc_k = bch2_btree_iter_peek_slot(&alloc_iter);
ret = bkey_err(alloc_k);
if (ret)
goto err;
if (alloc_k.k->type == KEY_TYPE_alloc_v4) {
struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(alloc_k);
const struct bch_backpointer *bps = alloc_v4_backpointers_c(a.v);
unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(a.v);
for (i = 0; i < nr; i++) {
int cmp = backpointer_cmp(bps[i], bp) ?:
memcmp(&bps[i], &bp, sizeof(bp));
if (!cmp)
goto out;
if (cmp >= 0)
break;
}
} else {
goto missing; goto missing;
}
bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
bucket_pos_to_bp(c, bucket_pos, bp.bucket_offset), bucket_pos_to_bp(c, bucket, bp.bucket_offset),
0); 0);
bp_k = bch2_btree_iter_peek_slot(&bp_iter); bp_k = bch2_btree_iter_peek_slot(&bp_iter);
ret = bkey_err(bp_k); ret = bkey_err(bp_k);
@ -635,11 +472,9 @@ out:
err: err:
fsck_err: fsck_err:
bch2_trans_iter_exit(trans, &bp_iter); bch2_trans_iter_exit(trans, &bp_iter);
bch2_trans_iter_exit(trans, &alloc_iter);
printbuf_exit(&buf); printbuf_exit(&buf);
return ret; return ret;
missing: missing:
prt_printf(&buf, "missing backpointer for btree=%s l=%u ", prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
bch2_btree_ids[bp.btree_id], bp.level); bch2_btree_ids[bp.btree_id], bp.level);
bch2_bkey_val_to_text(&buf, c, orig_k); bch2_bkey_val_to_text(&buf, c, orig_k);
@ -648,12 +483,8 @@ missing:
if (c->sb.version < bcachefs_metadata_version_backpointers || if (c->sb.version < bcachefs_metadata_version_backpointers ||
c->opts.reconstruct_alloc || c->opts.reconstruct_alloc ||
fsck_err(c, "%s", buf.buf)) { fsck_err(c, "%s", buf.buf))
struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, alloc_k); ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
ret = PTR_ERR_OR_ZERO(a) ?:
bch2_bucket_backpointer_mod(trans, a, bp, orig_k, true);
}
goto out; goto out;
} }
@ -952,53 +783,40 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
} }
static int check_one_backpointer(struct btree_trans *trans, static int check_one_backpointer(struct btree_trans *trans,
struct bpos bucket,
u64 *bp_offset,
struct bbpos start, struct bbpos start,
struct bbpos end, struct bbpos end,
struct bkey_s_c_backpointer bp,
struct bpos *last_flushed_pos) struct bpos *last_flushed_pos)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct btree_iter iter; struct btree_iter iter;
struct bch_backpointer bp; struct bbpos pos = bp_to_bbpos(*bp.v);
struct bbpos pos;
struct bpos bp_pos;
struct bkey_s_c k; struct bkey_s_c k;
struct printbuf buf = PRINTBUF; struct printbuf buf = PRINTBUF;
int ret; int ret;
ret = __bch2_get_next_backpointer(trans, bucket, -1, bp_offset, &bp_pos, &bp, 0);
if (ret || *bp_offset == U64_MAX)
return ret;
pos = bp_to_bbpos(bp);
if (bbpos_cmp(pos, start) < 0 || if (bbpos_cmp(pos, start) < 0 ||
bbpos_cmp(pos, end) > 0) bbpos_cmp(pos, end) > 0)
return 0; return 0;
k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp); k = bch2_backpointer_get_key(trans, &iter, bp.k->p, *bp.v, 0);
ret = bkey_err(k); ret = bkey_err(k);
if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
return 0; return 0;
if (ret) if (ret)
return ret; return ret;
if (!k.k && !bpos_eq(*last_flushed_pos, bp_pos)) { if (!k.k && !bpos_eq(*last_flushed_pos, bp.k->p)) {
*last_flushed_pos = bp_pos; *last_flushed_pos = bp.k->p;
ret = bch2_btree_write_buffer_flush_sync(trans) ?: ret = bch2_btree_write_buffer_flush_sync(trans) ?:
-BCH_ERR_transaction_restart_write_buffer_flush; -BCH_ERR_transaction_restart_write_buffer_flush;
goto out; goto out;
} }
if (fsck_err_on(!k.k, c, if (fsck_err_on(!k.k, c,
"backpointer for %llu:%llu:%llu (btree pos %llu:%llu) points to missing extent\n %s", "backpointer for missing extent\n %s",
bucket.inode, bucket.offset, (u64) bp.bucket_offset, (bch2_backpointer_k_to_text(&buf, c, bp.s_c), buf.buf)))
bp_pos.inode, bp_pos.offset, return bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p);
(bch2_backpointer_to_text(&buf, &bp), buf.buf))) {
ret = bch2_backpointer_del_by_offset(trans, bucket, *bp_offset, bp);
if (ret == -ENOENT)
bch_err(c, "backpointer at %llu not found", *bp_offset);
}
out: out:
fsck_err: fsck_err:
bch2_trans_iter_exit(trans, &iter); bch2_trans_iter_exit(trans, &iter);
@ -1013,25 +831,13 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
struct btree_iter iter; struct btree_iter iter;
struct bkey_s_c k; struct bkey_s_c k;
struct bpos last_flushed_pos = SPOS_MAX; struct bpos last_flushed_pos = SPOS_MAX;
int ret = 0;
for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
BTREE_ITER_PREFETCH, k, ret) { POS_MIN, BTREE_ITER_PREFETCH, k,
u64 bp_offset = 0; NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
check_one_backpointer(trans, start, end,
while (!(ret = commit_do(trans, NULL, NULL, bkey_s_c_to_backpointer(k),
BTREE_INSERT_LAZY_RW| &last_flushed_pos));
BTREE_INSERT_NOFAIL,
check_one_backpointer(trans, iter.pos, &bp_offset,
start, end, &last_flushed_pos))) &&
bp_offset < U64_MAX)
bp_offset++;
if (ret)
break;
}
bch2_trans_iter_exit(trans, &iter);
return ret < 0 ? ret : 0;
} }
int bch2_check_backpointers_to_extents(struct bch_fs *c) int bch2_check_backpointers_to_extents(struct bch_fs *c)

View File

@ -53,16 +53,11 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c,
return ret; return ret;
} }
bool bch2_bucket_backpointer_del(struct btree_trans *, int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bpos,
struct bkey_i_alloc_v4 *,
struct bch_backpointer);
int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *,
struct bkey_i_alloc_v4 *,
struct bch_backpointer, struct bkey_s_c, bool); struct bch_backpointer, struct bkey_s_c, bool);
static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
struct bkey_i_alloc_v4 *a, struct bpos bucket,
struct bch_backpointer bp, struct bch_backpointer bp,
struct bkey_s_c orig_k, struct bkey_s_c orig_k,
bool insert) bool insert)
@ -71,13 +66,8 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
struct bkey_i_backpointer *bp_k; struct bkey_i_backpointer *bp_k;
int ret; int ret;
if (!insert &&
unlikely(BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v)) &&
bch2_bucket_backpointer_del(trans, a, bp))
return 0;
if (unlikely(bch2_backpointers_no_use_write_buffer)) if (unlikely(bch2_backpointers_no_use_write_buffer))
return bch2_bucket_backpointer_mod_nowritebuffer(trans, a, bp, orig_k, insert); return bch2_bucket_backpointer_mod_nowritebuffer(trans, bucket, bp, orig_k, insert);
bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer));
ret = PTR_ERR_OR_ZERO(bp_k); ret = PTR_ERR_OR_ZERO(bp_k);
@ -85,7 +75,7 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
return ret; return ret;
bkey_backpointer_init(&bp_k->k_i); bkey_backpointer_init(&bp_k->k_i);
bp_k->k.p = bucket_pos_to_bp(c, a->k.p, bp.bucket_offset); bp_k->k.p = bucket_pos_to_bp(c, bucket, bp.bucket_offset);
bp_k->v = bp; bp_k->v = bp;
if (!insert) { if (!insert) {
@ -126,11 +116,12 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
} }
int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int, int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int,
u64 *, struct bch_backpointer *, unsigned); struct bpos *, struct bch_backpointer *, unsigned);
struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *,
struct bpos, u64, struct bch_backpointer); struct bpos, struct bch_backpointer,
unsigned);
struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *, struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *,
struct bpos, u64, struct bch_backpointer); struct bpos, struct bch_backpointer);
int bch2_check_btree_backpointers(struct bch_fs *); int bch2_check_btree_backpointers(struct bch_fs *);
int bch2_check_extents_to_backpointers(struct bch_fs *); int bch2_check_extents_to_backpointers(struct bch_fs *);
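
(Sketch only, modeled on the ec.c caller further down; example_count_backpointers is hypothetical and not part of the patch. With backpointers gone from alloc keys, walking a bucket's backpointers is now a plain cursor over positions in the backpointers btree.)

static int example_count_backpointers(struct btree_trans *trans,
				      struct bpos bucket, u64 *nr)
{
	struct bpos bp_pos = POS_MIN;
	struct bch_backpointer bp;
	int ret;

	*nr = 0;

	while (1) {
		/* gen < 0 means "don't check the bucket generation" */
		ret = bch2_get_next_backpointer(trans, bucket, -1, &bp_pos, &bp, 0);
		if (ret)
			return ret;
		if (bpos_eq(bp_pos, SPOS_MAX))
			break;

		(*nr)++;
		bp_pos = bpos_nosnap_successor(bp_pos);
	}

	return 0;
}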

View File

@ -629,18 +629,6 @@ struct btree_path_buf {
#define REPLICAS_DELTA_LIST_MAX (1U << 16) #define REPLICAS_DELTA_LIST_MAX (1U << 16)
struct snapshot_t {
u32 parent;
u32 children[2];
u32 subvol; /* Nonzero only if a subvolume points to this node: */
u32 equiv;
};
typedef struct {
u32 subvol;
u64 inum;
} subvol_inum;
#define BCACHEFS_ROOT_SUBVOL_INUM \ #define BCACHEFS_ROOT_SUBVOL_INUM \
((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
@ -808,6 +796,12 @@ struct bch_fs {
struct workqueue_struct *btree_io_complete_wq; struct workqueue_struct *btree_io_complete_wq;
/* copygc needs its own workqueue for index updates.. */ /* copygc needs its own workqueue for index updates.. */
struct workqueue_struct *copygc_wq; struct workqueue_struct *copygc_wq;
/*
* Use a dedicated wq for write ref holder tasks. Required to avoid
* dependency problems with other wq tasks that can block on ref
* draining, such as read-only transition.
*/
struct workqueue_struct *write_ref_wq;
/* ALLOCATION */ /* ALLOCATION */
struct bch_devs_mask rw_devs[BCH_DATA_NR]; struct bch_devs_mask rw_devs[BCH_DATA_NR];
@ -937,6 +931,7 @@ struct bch_fs {
/* COPYGC */ /* COPYGC */
struct task_struct *copygc_thread; struct task_struct *copygc_thread;
struct write_point copygc_write_point; struct write_point copygc_write_point;
s64 copygc_wait_at;
s64 copygc_wait; s64 copygc_wait;
bool copygc_running; bool copygc_running;
wait_queue_head_t copygc_running_wq; wait_queue_head_t copygc_running_wq;
@ -971,6 +966,10 @@ struct bch_fs {
reflink_gc_table reflink_gc_table; reflink_gc_table reflink_gc_table;
size_t reflink_gc_nr; size_t reflink_gc_nr;
/* fs.c */
struct list_head vfs_inodes_list;
struct mutex vfs_inodes_lock;
/* VFS IO PATH - fs-io.c */ /* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset; struct bio_set writepage_bioset;
struct bio_set dio_write_bioset; struct bio_set dio_write_bioset;

View File

@ -1554,7 +1554,8 @@ struct bch_sb_field_journal_seq_blacklist {
x(unwritten_extents, 24) \ x(unwritten_extents, 24) \
x(bucket_gens, 25) \ x(bucket_gens, 25) \
x(lru_v2, 26) \ x(lru_v2, 26) \
x(fragmentation_lru, 27) x(fragmentation_lru, 27) \
x(no_bps_in_alloc_keys, 28)
enum bcachefs_metadata_version { enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9, bcachefs_metadata_version_min = 9,

View File

@ -572,15 +572,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr);
if (c->opts.reconstruct_alloc || if (!g->gen_valid &&
fsck_err_on(!g->gen_valid, c, (c->opts.reconstruct_alloc ||
"bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" fsck_err(c, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
"while marking %s", "while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_types[ptr_data_type(k->k, &p.ptr)], bch2_data_types[ptr_data_type(k->k, &p.ptr)],
p.ptr.gen, p.ptr.gen,
(printbuf_reset(&buf), (printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
if (!p.ptr.cached) { if (!p.ptr.cached) {
g->gen_valid = true; g->gen_valid = true;
g->gen = p.ptr.gen; g->gen = p.ptr.gen;
@ -589,14 +589,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
} }
} }
if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, c, if (gen_cmp(p.ptr.gen, g->gen) > 0 &&
"bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" (c->opts.reconstruct_alloc ||
"while marking %s", fsck_err(c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), "while marking %s",
bch2_data_types[ptr_data_type(k->k, &p.ptr)], p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
p.ptr.gen, g->gen, bch2_data_types[ptr_data_type(k->k, &p.ptr)],
(printbuf_reset(&buf), p.ptr.gen, g->gen,
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { (printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
if (!p.ptr.cached) { if (!p.ptr.cached) {
g->gen_valid = true; g->gen_valid = true;
g->gen = p.ptr.gen; g->gen = p.ptr.gen;
@ -609,25 +610,26 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
} }
} }
if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c, if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX &&
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" (c->opts.reconstruct_alloc ||
"while marking %s", fsck_err(c, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, "while marking %s",
bch2_data_types[ptr_data_type(k->k, &p.ptr)], p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
p.ptr.gen, bch2_data_types[ptr_data_type(k->k, &p.ptr)],
(printbuf_reset(&buf), p.ptr.gen,
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) (printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
do_update = true; do_update = true;
if (fsck_err_on(!p.ptr.cached && if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 &&
gen_cmp(p.ptr.gen, g->gen) < 0, c, (c->opts.reconstruct_alloc ||
"bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" fsck_err(c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
"while marking %s", "while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_types[ptr_data_type(k->k, &p.ptr)], bch2_data_types[ptr_data_type(k->k, &p.ptr)],
p.ptr.gen, g->gen, p.ptr.gen, g->gen,
(printbuf_reset(&buf), (printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
do_update = true; do_update = true;
if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
@ -757,7 +759,7 @@ found:
if (level) if (level)
bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new); bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new);
if (c->opts.verbose) { if (0) {
printbuf_reset(&buf); printbuf_reset(&buf);
bch2_bkey_val_to_text(&buf, c, *k); bch2_bkey_val_to_text(&buf, c, *k);
bch_info(c, "updated %s", buf.buf); bch_info(c, "updated %s", buf.buf);

View File

@ -2722,12 +2722,12 @@ static inline void btree_path_list_add(struct btree_trans *trans,
void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
{ {
if (iter->path)
bch2_path_put(trans, iter->path,
iter->flags & BTREE_ITER_INTENT);
if (iter->update_path) if (iter->update_path)
bch2_path_put_nokeep(trans, iter->update_path, bch2_path_put_nokeep(trans, iter->update_path,
iter->flags & BTREE_ITER_INTENT); iter->flags & BTREE_ITER_INTENT);
if (iter->path)
bch2_path_put(trans, iter->path,
iter->flags & BTREE_ITER_INTENT);
if (iter->key_cache_path) if (iter->key_cache_path)
bch2_path_put(trans, iter->key_cache_path, bch2_path_put(trans, iter->key_cache_path,
iter->flags & BTREE_ITER_INTENT); iter->flags & BTREE_ITER_INTENT);

View File

@ -60,6 +60,7 @@ enum btree_insert_flags {
int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
unsigned, unsigned); unsigned, unsigned);
int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos);
int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id,
struct bkey_i *, enum btree_update_flags); struct bkey_i *, enum btree_update_flags);
@ -94,8 +95,8 @@ void bch2_trans_commit_hook(struct btree_trans *,
struct btree_trans_commit_hook *); struct btree_trans_commit_hook *);
int __bch2_trans_commit(struct btree_trans *, unsigned); int __bch2_trans_commit(struct btree_trans *, unsigned);
int bch2_trans_log_msg(struct btree_trans *, const char *, ...);
int bch2_fs_log_msg(struct bch_fs *, const char *, ...); int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
int bch2_journal_log_msg(struct bch_fs *, const char *, ...);
/** /**
* bch2_trans_commit - insert keys at given iterator positions * bch2_trans_commit - insert keys at given iterator positions

View File

@ -11,6 +11,7 @@
#include "btree_iter.h" #include "btree_iter.h"
#include "btree_locking.h" #include "btree_locking.h"
#include "buckets.h" #include "buckets.h"
#include "clock.h"
#include "error.h" #include "error.h"
#include "extents.h" #include "extents.h"
#include "journal.h" #include "journal.h"
@ -363,6 +364,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as,
BUG_ON(ret); BUG_ON(ret);
trace_and_count(c, btree_node_alloc, c, b); trace_and_count(c, btree_node_alloc, c, b);
bch2_increment_clock(c, btree_sectors(c), WRITE);
return b; return b;
} }
@ -686,7 +688,8 @@ err:
bch2_trans_unlock(&trans); bch2_trans_unlock(&trans);
btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent); btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent);
mark_btree_node_locked(&trans, path, b->c.level, SIX_LOCK_intent); mark_btree_node_locked(&trans, path, b->c.level, SIX_LOCK_intent);
bch2_btree_path_level_init(&trans, path, b); path->l[b->c.level].lock_seq = b->c.lock.state.seq;
path->l[b->c.level].b = b;
bch2_btree_node_lock_write_nofail(&trans, path, &b->c); bch2_btree_node_lock_write_nofail(&trans, path, &b->c);
@ -1677,7 +1680,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
BUG_ON(!as || as->b); BUG_ON(!as || as->b);
bch2_verify_keylist_sorted(keys); bch2_verify_keylist_sorted(keys);
if (!(local_clock() & 63)) if ((local_clock() & 63) == 63)
return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
ret = bch2_btree_node_lock_write(trans, path, &b->c); ret = bch2_btree_node_lock_write(trans, path, &b->c);
@ -1717,7 +1720,7 @@ split:
* bch2_btree_path_upgrade() and allocating more nodes: * bch2_btree_path_upgrade() and allocating more nodes:
*/ */
if (b->c.level >= as->update_level) { if (b->c.level >= as->update_level) {
trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_); trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
} }

View File

@ -622,14 +622,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
prefetch(&trans->c->journal.flags); prefetch(&trans->c->journal.flags);
h = trans->hooks;
while (h) {
ret = h->fn(trans, h);
if (ret)
return ret;
h = h->next;
}
trans_for_each_update(trans, i) { trans_for_each_update(trans, i) {
/* Multiple inserts might go to same leaf: */ /* Multiple inserts might go to same leaf: */
if (!same_leaf_as_prev(trans, i)) if (!same_leaf_as_prev(trans, i))
@ -696,6 +688,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
goto revert_fs_usage; goto revert_fs_usage;
} }
h = trans->hooks;
while (h) {
ret = h->fn(trans, h);
if (ret)
goto revert_fs_usage;
h = h->next;
}
trans_for_each_update(trans, i) trans_for_each_update(trans, i)
if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
ret = run_one_mem_trigger(trans, i, i->flags); ret = run_one_mem_trigger(trans, i, i->flags);
@ -1426,10 +1426,15 @@ int bch2_trans_update_extent(struct btree_trans *trans,
update->k.p = k.k->p; update->k.p = k.k->p;
update->k.p.snapshot = insert->k.p.snapshot; update->k.p.snapshot = insert->k.p.snapshot;
if (insert->k.p.snapshot != k.k->p.snapshot || if (insert->k.p.snapshot != k.k->p.snapshot) {
(btree_type_has_snapshots(btree_id) &&
need_whiteout_for_snapshot(trans, btree_id, update->k.p)))
update->k.type = KEY_TYPE_whiteout; update->k.type = KEY_TYPE_whiteout;
} else if (btree_type_has_snapshots(btree_id)) {
ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p);
if (ret < 0)
goto err;
if (ret)
update->k.type = KEY_TYPE_whiteout;
}
ret = bch2_btree_insert_nonextent(trans, btree_id, update, ret = bch2_btree_insert_nonextent(trans, btree_id, update,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
@ -1797,6 +1802,20 @@ int bch2_btree_delete_at(struct btree_trans *trans,
return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); return bch2_btree_delete_extent_at(trans, iter, 0, update_flags);
} }
int bch2_btree_delete_at_buffered(struct btree_trans *trans,
enum btree_id btree, struct bpos pos)
{
struct bkey_i *k;
k = bch2_trans_kmalloc(trans, sizeof(*k));
if (IS_ERR(k))
return PTR_ERR(k);
bkey_init(&k->k);
k->k.p = pos;
return bch2_trans_update_buffered(trans, btree, k);
}
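
(Illustration only, not part of the patch: the new helper composes naturally with ordinary iteration, e.g. a hypothetical routine that deletes every key in [start, end) of a btree through the write buffer.)

static int example_delete_range_buffered(struct btree_trans *trans,
					 enum btree_id btree,
					 struct bpos start, struct bpos end)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key_norestart(trans, iter, btree, start, 0, k, ret) {
		if (bpos_ge(k.k->p, end))
			break;

		/* queues a deletion via the write buffer instead of updating the leaf directly */
		ret = bch2_btree_delete_at_buffered(trans, btree, k.k->p);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}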
int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
struct bpos start, struct bpos end, struct bpos start, struct bpos end,
unsigned update_flags, unsigned update_flags,
@ -1919,14 +1938,19 @@ err:
return ret; return ret;
} }
int bch2_trans_log_msg(struct btree_trans *trans, const char *fmt, ...) static int
__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
va_list args)
{ {
va_list args;
int ret; int ret;
va_start(args, fmt); if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) {
ret = __bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args); ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args);
va_end(args); } else {
ret = bch2_trans_do(c, NULL, NULL,
BTREE_INSERT_LAZY_RW|commit_flags,
__bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args));
}
return ret; return ret;
} }
@ -1937,16 +1961,22 @@ int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...)
int ret; int ret;
va_start(args, fmt); va_start(args, fmt);
ret = __bch2_fs_log_msg(c, 0, fmt, args);
if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { va_end(args);
ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args); return ret;
} else { }
ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
__bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args)); /*
} * Use for logging messages during recovery to enable reserved space and avoid
* blocking.
*/
int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...)
{
va_list args;
int ret;
va_start(args, fmt);
ret = __bch2_fs_log_msg(c, JOURNAL_WATERMARK_reserved, fmt, args);
va_end(args); va_end(args);
return ret; return ret;
} }

View File

@ -109,9 +109,9 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
struct journal *j = &c->journal; struct journal *j = &c->journal;
struct btree_write_buffer *wb = &c->btree_write_buffer; struct btree_write_buffer *wb = &c->btree_write_buffer;
struct journal_entry_pin pin; struct journal_entry_pin pin;
struct btree_write_buffered_key *i, *dst, *keys; struct btree_write_buffered_key *i, *keys;
struct btree_iter iter = { NULL }; struct btree_iter iter = { NULL };
size_t nr = 0, skipped = 0, fast = 0; size_t nr = 0, skipped = 0, fast = 0, slowpath = 0;
bool write_locked = false; bool write_locked = false;
union btree_write_buffer_state s; union btree_write_buffer_state s;
int ret = 0; int ret = 0;
@ -135,15 +135,13 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
* *
* However, since we're not flushing in the order they appear in the * However, since we're not flushing in the order they appear in the
* journal we won't be able to drop our journal pin until everything is * journal we won't be able to drop our journal pin until everything is
* flushed - which means this could deadlock the journal, if we weren't * flushed - which means this could deadlock the journal if we weren't
* passing BTREE_INSERT_JORUNAL_RECLAIM. This causes the update to fail * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail
* if it would block taking a journal reservation. * if it would block taking a journal reservation.
* *
* If that happens, we sort them by the order they appeared in the * If that happens, simply skip the key so we can optimistically insert
* journal - after dropping redundant entries - and then restart * as many keys as possible in the fast path.
* flushing, this time dropping journal pins as we go.
*/ */
sort(keys, nr, sizeof(keys[0]), sort(keys, nr, sizeof(keys[0]),
btree_write_buffered_key_cmp, NULL); btree_write_buffered_key_cmp, NULL);
@ -152,6 +150,7 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
i[0].btree == i[1].btree && i[0].btree == i[1].btree &&
bpos_eq(i[0].k.k.p, i[1].k.k.p)) { bpos_eq(i[0].k.k.p, i[1].k.k.p)) {
skipped++; skipped++;
i->journal_seq = 0;
continue; continue;
} }
@ -177,8 +176,14 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
bch2_trans_begin(trans); bch2_trans_begin(trans);
} while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
slowpath++;
continue;
}
if (ret) if (ret)
break; break;
i->journal_seq = 0;
} }
if (write_locked) if (write_locked)
@ -187,7 +192,7 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
trace_write_buffer_flush(trans, nr, skipped, fast, wb->size); trace_write_buffer_flush(trans, nr, skipped, fast, wb->size);
if (ret == -BCH_ERR_journal_reclaim_would_deadlock) if (slowpath)
goto slowpath; goto slowpath;
bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)); bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
@ -198,23 +203,19 @@ out:
slowpath: slowpath:
trace_write_buffer_flush_slowpath(trans, i - keys, nr); trace_write_buffer_flush_slowpath(trans, i - keys, nr);
dst = keys; /*
for (; i < keys + nr; i++) { * Now sort the rest by journal seq and bump the journal pin as we go.
if (i + 1 < keys + nr && * The slowpath zapped the seq of keys that were successfully flushed so
i[0].btree == i[1].btree && * we can skip those here.
bpos_eq(i[0].k.k.p, i[1].k.k.p)) */
continue;
*dst = *i;
dst++;
}
nr = dst - keys;
sort(keys, nr, sizeof(keys[0]), sort(keys, nr, sizeof(keys[0]),
btree_write_buffered_journal_cmp, btree_write_buffered_journal_cmp,
NULL); NULL);
for (i = keys; i < keys + nr; i++) { for (i = keys; i < keys + nr; i++) {
if (!i->journal_seq)
continue;
if (i->journal_seq > pin.seq) { if (i->journal_seq > pin.seq) {
struct journal_entry_pin pin2; struct journal_entry_pin pin2;

View File

@ -1407,17 +1407,17 @@ static inline int bch2_trans_mark_pointer(struct btree_trans *trans,
bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); bool insert = !(flags & BTREE_TRIGGER_OVERWRITE);
struct btree_iter iter; struct btree_iter iter;
struct bkey_i_alloc_v4 *a; struct bkey_i_alloc_v4 *a;
struct bpos bucket_pos; struct bpos bucket;
struct bch_backpointer bp; struct bch_backpointer bp;
s64 sectors; s64 sectors;
int ret; int ret;
bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket_pos, &bp); bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp);
sectors = bp.bucket_len; sectors = bp.bucket_len;
if (!insert) if (!insert)
sectors = -sectors; sectors = -sectors;
a = bch2_trans_start_alloc_update(trans, &iter, bucket_pos); a = bch2_trans_start_alloc_update(trans, &iter, bucket);
if (IS_ERR(a)) if (IS_ERR(a))
return PTR_ERR(a); return PTR_ERR(a);
@ -1428,7 +1428,7 @@ static inline int bch2_trans_mark_pointer(struct btree_trans *trans,
goto err; goto err;
if (!p.ptr.cached) { if (!p.ptr.cached) {
ret = bch2_bucket_backpointer_mod(trans, a, bp, k, insert); ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert);
if (ret) if (ret)
goto err; goto err;
} }

View File

@ -19,11 +19,11 @@ struct { \
typedef DARRAY(void) darray_void; typedef DARRAY(void) darray_void;
static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more) static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, gfp_t gfp)
{ {
if (d->nr + more > d->size) { if (d->nr + more > d->size) {
size_t new_size = roundup_pow_of_two(d->nr + more); size_t new_size = roundup_pow_of_two(d->nr + more);
void *data = krealloc_array(d->data, new_size, t_size, GFP_KERNEL); void *data = krealloc_array(d->data, new_size, t_size, gfp);
if (!data) if (!data)
return -ENOMEM; return -ENOMEM;
@ -35,20 +35,30 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more)
return 0; return 0;
} }
#define darray_make_room_gfp(_d, _more, _gfp) \
__darray_make_room((darray_void *) (_d), sizeof((_d)->data[0]), (_more), _gfp)
#define darray_make_room(_d, _more) \ #define darray_make_room(_d, _more) \
__darray_make_room((darray_void *) (_d), sizeof((_d)->data[0]), (_more)) darray_make_room_gfp(_d, _more, GFP_KERNEL)
#define darray_top(_d) ((_d).data[(_d).nr]) #define darray_top(_d) ((_d).data[(_d).nr])
#define darray_push(_d, _item) \ #define darray_push_gfp(_d, _item, _gfp) \
({ \ ({ \
int _ret = darray_make_room((_d), 1); \ int _ret = darray_make_room_gfp((_d), 1, _gfp); \
\ \
if (!_ret) \ if (!_ret) \
(_d)->data[(_d)->nr++] = (_item); \ (_d)->data[(_d)->nr++] = (_item); \
_ret; \ _ret; \
}) })
#define darray_push(_d, _item) darray_push_gfp(_d, _item, GFP_KERNEL)
#define darray_pop(_d) ((_d)->data[--(_d)->nr])
#define darray_first(_d) ((_d).data[0])
#define darray_last(_d) ((_d).data[(_d).nr - 1])
#define darray_insert_item(_d, _pos, _item) \ #define darray_insert_item(_d, _pos, _item) \
({ \ ({ \
size_t pos = (_pos); \ size_t pos = (_pos); \
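
(Illustration only, not part of the patch: a hypothetical caller exercising the new darray helpers; darray_exit() is the pre-existing free helper from darray.h, not shown in this hunk.)

typedef DARRAY(u32) darray_u32;

static int example_darray_usage(void)
{
	darray_u32 d = { 0 };
	u32 i;
	int ret = 0;

	for (i = 0; i < 10; i++) {
		/* GFP flags can now be passed explicitly, e.g. GFP_NOFS */
		ret = darray_push_gfp(&d, i, GFP_NOFS);
		if (ret)
			goto out;
	}

	BUG_ON(darray_first(d) != 0);
	BUG_ON(darray_last(d)  != 9);
	BUG_ON(darray_pop(&d)  != 9);	/* removes and returns the last element */
out:
	darray_exit(&d);
	return ret;
}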

View File

@ -163,7 +163,11 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
if (((1U << i) & m->data_opts.rewrite_ptrs) && if (((1U << i) & m->data_opts.rewrite_ptrs) &&
(ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
!ptr->cached) { !ptr->cached) {
bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
/*
* See comment below:
bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr); bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr);
*/
rewrites_found |= 1U << i; rewrites_found |= 1U << i;
} }
i++; i++;
@ -205,7 +209,14 @@ restart_drop_extra_replicas:
if (!p.ptr.cached && if (!p.ptr.cached &&
durability - ptr_durability >= m->op.opts.data_replicas) { durability - ptr_durability >= m->op.opts.data_replicas) {
durability -= ptr_durability; durability -= ptr_durability;
bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), &entry->ptr);
/*
* Currently, we're dropping unneeded replicas
* instead of marking them as cached, since
* cached data in stripe buckets prevents them
* from being reused:
bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr); bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr);
*/
goto restart_drop_extra_replicas; goto restart_drop_extra_replicas;
} }
} }

View File

@ -826,7 +826,7 @@ static void ec_stripe_delete_work(struct work_struct *work)
void bch2_do_stripe_deletes(struct bch_fs *c) void bch2_do_stripe_deletes(struct bch_fs *c)
{ {
if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) && if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
!schedule_work(&c->ec_stripe_delete_work)) !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
} }
@ -887,7 +887,7 @@ err:
static int ec_stripe_update_extent(struct btree_trans *trans, static int ec_stripe_update_extent(struct btree_trans *trans,
struct bpos bucket, u8 gen, struct bpos bucket, u8 gen,
struct ec_stripe_buf *s, struct ec_stripe_buf *s,
u64 *bp_offset) struct bpos *bp_pos)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct bch_backpointer bp; struct bch_backpointer bp;
@ -900,10 +900,10 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
int ret, dev, block; int ret, dev, block;
ret = bch2_get_next_backpointer(trans, bucket, gen, ret = bch2_get_next_backpointer(trans, bucket, gen,
bp_offset, &bp, BTREE_ITER_CACHED); bp_pos, &bp, BTREE_ITER_CACHED);
if (ret) if (ret)
return ret; return ret;
if (*bp_offset == U64_MAX) if (bpos_eq(*bp_pos, SPOS_MAX))
return 0; return 0;
if (bp.level) { if (bp.level) {
@ -911,7 +911,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
struct btree_iter node_iter; struct btree_iter node_iter;
struct btree *b; struct btree *b;
b = bch2_backpointer_get_node(trans, &node_iter, bucket, *bp_offset, bp); b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp);
bch2_trans_iter_exit(trans, &node_iter); bch2_trans_iter_exit(trans, &node_iter);
if (!b) if (!b)
@ -925,7 +925,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
return -EIO; return -EIO;
} }
k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp); k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_INTENT);
ret = bkey_err(k); ret = bkey_err(k);
if (ret) if (ret)
return ret; return ret;
@ -984,7 +984,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct bch_extent_ptr bucket = s->key.v.ptrs[block]; struct bch_extent_ptr bucket = s->key.v.ptrs[block];
struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket); struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket);
u64 bp_offset = 0; struct bpos bp_pos = POS_MIN;
int ret = 0; int ret = 0;
while (1) { while (1) {
@ -992,13 +992,13 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL, BTREE_INSERT_NOFAIL,
ec_stripe_update_extent(trans, bucket_pos, bucket.gen, ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
s, &bp_offset)); s, &bp_pos));
if (ret) if (ret)
break; break;
if (bp_offset == U64_MAX) if (bkey_eq(bp_pos, POS_MAX))
break; break;
bp_offset++; bp_pos = bpos_nosnap_successor(bp_pos);
} }
return ret; return ret;
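A condensed sketch of the new backpointer walk, distilled from the hunks above (hypothetical helper; the real callers are ec_stripe_update_bucket() and, later in this commit, __bch2_evacuate_bucket()). Backpointers are now addressed by btree position rather than by a u64 bucket offset, so iteration starts at POS_MIN, terminates at POS_MAX, and advances with bpos_nosnap_successor():

static int walk_bucket_backpointers(struct btree_trans *trans,
				    struct bpos bucket, u8 gen)
{
	struct bch_backpointer bp;
	struct bpos bp_pos = POS_MIN;
	int ret;

	while (1) {
		ret = bch2_get_next_backpointer(trans, bucket, gen,
						&bp_pos, &bp, BTREE_ITER_CACHED);
		if (ret)
			return ret;
		if (bkey_eq(bp_pos, POS_MAX))
			return 0;

		/* process bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0) here */

		bp_pos = bpos_nosnap_successor(bp_pos);
	}
}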

File diff suppressed because it is too large

View File

@ -451,19 +451,20 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
return ret; return ret;
if (path.dentry->d_sb->s_fs_info != c) { if (path.dentry->d_sb->s_fs_info != c) {
path_put(&path); ret = -EXDEV;
return -EXDEV; goto err;
} }
dir = path.dentry->d_parent->d_inode; dir = path.dentry->d_parent->d_inode;
ret = __bch2_unlink(dir, path.dentry, true); ret = __bch2_unlink(dir, path.dentry, true);
if (!ret) { if (ret)
fsnotify_rmdir(dir, path.dentry); goto err;
d_delete(path.dentry);
}
path_put(&path);
fsnotify_rmdir(dir, path.dentry);
d_delete(path.dentry);
err:
path_put(&path);
return ret; return ret;
} }

View File

@ -105,6 +105,11 @@ retry:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry; goto retry;
bch2_fs_fatal_err_on(ret == -ENOENT, c,
"inode %u:%llu not found when updating",
inode_inum(inode).subvol,
inode_inum(inode).inum);
bch2_trans_exit(&trans); bch2_trans_exit(&trans);
return ret < 0 ? ret : 0; return ret < 0 ? ret : 0;
} }
@ -201,6 +206,10 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
return ERR_PTR(ret); return ERR_PTR(ret);
} }
mutex_lock(&c->vfs_inodes_lock);
list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
mutex_unlock(&c->vfs_inodes_lock);
unlock_new_inode(&inode->v); unlock_new_inode(&inode->v);
return &inode->v; return &inode->v;
@ -314,6 +323,9 @@ err_before_quota:
inode = old; inode = old;
} else { } else {
mutex_lock(&c->vfs_inodes_lock);
list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
mutex_unlock(&c->vfs_inodes_lock);
/* /*
* we really don't want insert_inode_locked2() to be setting * we really don't want insert_inode_locked2() to be setting
* I_NEW... * I_NEW...
@ -442,19 +454,27 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
bch2_trans_init(&trans, c, 4, 1024); bch2_trans_init(&trans, c, 4, 1024);
ret = commit_do(&trans, NULL, NULL, ret = commit_do(&trans, NULL, NULL,
BTREE_INSERT_NOFAIL, BTREE_INSERT_NOFAIL,
bch2_unlink_trans(&trans, bch2_unlink_trans(&trans,
inode_inum(dir), &dir_u, inode_inum(dir), &dir_u,
&inode_u, &dentry->d_name, &inode_u, &dentry->d_name,
deleting_snapshot)); deleting_snapshot));
if (unlikely(ret))
goto err;
if (likely(!ret)) { bch2_inode_update_after_write(&trans, dir, &dir_u,
bch2_inode_update_after_write(&trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME);
ATTR_MTIME|ATTR_CTIME); bch2_inode_update_after_write(&trans, inode, &inode_u,
bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_MTIME);
ATTR_MTIME);
if (inode_u.bi_subvol) {
/*
* Subvolume deletion is asynchronous, but we still want to tell
* the VFS that it's been deleted here:
*/
set_nlink(&inode->v, 0);
} }
err:
bch2_trans_exit(&trans); bch2_trans_exit(&trans);
bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
@ -1349,6 +1369,8 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
inode->v.i_op = &bch_special_inode_operations; inode->v.i_op = &bch_special_inode_operations;
break; break;
} }
mapping_set_large_folios(inode->v.i_mapping);
} }
static struct inode *bch2_alloc_inode(struct super_block *sb) static struct inode *bch2_alloc_inode(struct super_block *sb)
@ -1362,6 +1384,7 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
inode_init_once(&inode->v); inode_init_once(&inode->v);
mutex_init(&inode->ei_update_lock); mutex_init(&inode->ei_update_lock);
two_state_lock_init(&inode->ei_pagecache_lock); two_state_lock_init(&inode->ei_pagecache_lock);
INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
mutex_init(&inode->ei_quota_lock); mutex_init(&inode->ei_quota_lock);
return &inode->v; return &inode->v;
@ -1426,53 +1449,78 @@ static void bch2_evict_inode(struct inode *vinode)
KEY_TYPE_QUOTA_WARN); KEY_TYPE_QUOTA_WARN);
bch2_inode_rm(c, inode_inum(inode)); bch2_inode_rm(c, inode_inum(inode));
} }
mutex_lock(&c->vfs_inodes_lock);
list_del_init(&inode->ei_vfs_inode_list);
mutex_unlock(&c->vfs_inodes_lock);
} }
void bch2_evict_subvolume_inodes(struct bch_fs *c, void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
snapshot_id_list *s)
{ {
struct super_block *sb = c->vfs_sb; struct bch_inode_info *inode, **i;
struct inode *inode; DARRAY(struct bch_inode_info *) grabbed;
bool clean_pass = false, this_pass_clean;
spin_lock(&sb->s_inode_list_lock); /*
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { * Initially, we scan for inodes without I_DONTCACHE, then mark them to
if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || * be pruned with d_mark_dontcache().
(inode->i_state & I_FREEING)) *
continue; * Once we've had a clean pass where we didn't find any inodes without
* I_DONTCACHE, we wait for them to be freed:
*/
d_mark_dontcache(inode); darray_init(&grabbed);
d_prune_aliases(inode); darray_make_room(&grabbed, 1024);
}
spin_unlock(&sb->s_inode_list_lock);
again: again:
cond_resched(); cond_resched();
spin_lock(&sb->s_inode_list_lock); this_pass_clean = true;
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || mutex_lock(&c->vfs_inodes_lock);
(inode->i_state & I_FREEING)) list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
if (!snapshot_list_has_id(s, inode->ei_subvol))
continue; continue;
if (!(inode->i_state & I_DONTCACHE)) { if (!(inode->v.i_state & I_DONTCACHE) &&
d_mark_dontcache(inode); !(inode->v.i_state & I_FREEING)) {
d_prune_aliases(inode); this_pass_clean = false;
}
d_mark_dontcache(&inode->v);
d_prune_aliases(&inode->v);
/*
* If i_count was zero, we have to take and release a
* ref in order for I_DONTCACHE to be noticed and the
* inode to be dropped.
*/
if (!atomic_read(&inode->v.i_count) &&
igrab(&inode->v) &&
darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN))
break;
} else if (clean_pass && this_pass_clean) {
wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW);
DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
spin_lock(&inode->i_lock);
if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) &&
!(inode->i_state & I_FREEING)) {
wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW);
DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
spin_unlock(&inode->i_lock); mutex_unlock(&c->vfs_inodes_lock);
spin_unlock(&sb->s_inode_list_lock);
schedule(); schedule();
finish_wait(wq, &wait.wq_entry); finish_wait(wq, &wait.wq_entry);
goto again; goto again;
} }
spin_unlock(&inode->i_lock);
} }
spin_unlock(&sb->s_inode_list_lock); mutex_unlock(&c->vfs_inodes_lock);
darray_for_each(grabbed, i)
iput(&(*i)->v);
grabbed.nr = 0;
if (!clean_pass || !this_pass_clean) {
clean_pass = this_pass_clean;
goto again;
}
darray_exit(&grabbed);
} }
static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)

View File

@ -13,6 +13,7 @@
struct bch_inode_info { struct bch_inode_info {
struct inode v; struct inode v;
struct list_head ei_vfs_inode_list;
unsigned long ei_flags; unsigned long ei_flags;
struct mutex ei_update_lock; struct mutex ei_update_lock;

View File

@ -803,9 +803,6 @@ retry:
bch2_inode_unpack(k, &inode_u); bch2_inode_unpack(k, &inode_u);
/* Subvolume root? */
BUG_ON(inode_u.bi_subvol);
bkey_inode_generation_init(&delete.k_i); bkey_inode_generation_init(&delete.k_i);
delete.k.p = iter.pos; delete.k.p = iter.pos;
delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);

View File

@ -151,11 +151,11 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target)
void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{ {
struct bvec_iter_all iter; struct bvec_iter_all iter;
struct bio_vec *bv; struct bio_vec bv;
bio_for_each_segment_all(bv, bio, iter) bio_for_each_segment_all(bv, bio, iter)
if (bv->bv_page != ZERO_PAGE(0)) if (bv.bv_page != ZERO_PAGE(0))
mempool_free(bv->bv_page, &c->bio_bounce_pages); mempool_free(bv.bv_page, &c->bio_bounce_pages);
bio->bi_vcnt = 0; bio->bi_vcnt = 0;
} }
@ -385,6 +385,7 @@ int bch2_extent_fallocate(struct btree_trans *trans,
struct open_buckets open_buckets; struct open_buckets open_buckets;
struct bkey_s_c k; struct bkey_s_c k;
struct bkey_buf old, new; struct bkey_buf old, new;
unsigned sectors_allocated;
bool have_reservation = false; bool have_reservation = false;
bool unwritten = opts.nocow && bool unwritten = opts.nocow &&
c->sb.version >= bcachefs_metadata_version_unwritten_extents; c->sb.version >= bcachefs_metadata_version_unwritten_extents;
@ -395,6 +396,8 @@ int bch2_extent_fallocate(struct btree_trans *trans,
closure_init_stack(&cl); closure_init_stack(&cl);
open_buckets.nr = 0; open_buckets.nr = 0;
retry: retry:
sectors_allocated = 0;
k = bch2_btree_iter_peek_slot(iter); k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k); ret = bkey_err(k);
if (ret) if (ret)
@ -451,15 +454,16 @@ retry:
opts.data_replicas, opts.data_replicas,
opts.data_replicas, opts.data_replicas,
RESERVE_none, 0, &cl, &wp); RESERVE_none, 0, &cl, &wp);
if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) { if (ret) {
bch2_trans_unlock(trans); bch2_trans_unlock(trans);
closure_sync(&cl); closure_sync(&cl);
goto retry; if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
} goto retry;
if (ret)
return ret; return ret;
}
sectors = min(sectors, wp->sectors_free); sectors = min(sectors, wp->sectors_free);
sectors_allocated = sectors;
bch2_key_resize(&e->k, sectors); bch2_key_resize(&e->k, sectors);
@ -486,6 +490,9 @@ out:
goto retry; goto retry;
} }
if (!ret && sectors_allocated)
bch2_increment_clock(c, sectors_allocated, WRITE);
bch2_open_buckets_put(c, &open_buckets); bch2_open_buckets_put(c, &open_buckets);
bch2_disk_reservation_put(c, &disk_res); bch2_disk_reservation_put(c, &disk_res);
bch2_bkey_buf_exit(&new, c); bch2_bkey_buf_exit(&new, c);
@ -1475,7 +1482,7 @@ static void bch2_nocow_write(struct bch_write_op *op)
struct btree_iter iter; struct btree_iter iter;
struct bkey_s_c k; struct bkey_s_c k;
struct bkey_ptrs_c ptrs; struct bkey_ptrs_c ptrs;
const struct bch_extent_ptr *ptr, *ptr2; const struct bch_extent_ptr *ptr;
struct { struct {
struct bpos b; struct bpos b;
unsigned gen; unsigned gen;
@ -1530,11 +1537,12 @@ retry:
bucket_to_u64(buckets[nr_buckets].b)); bucket_to_u64(buckets[nr_buckets].b));
prefetch(buckets[nr_buckets].l); prefetch(buckets[nr_buckets].l);
nr_buckets++;
if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE))) if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE)))
goto err_get_ioref; goto err_get_ioref;
nr_buckets++;
if (ptr->unwritten) if (ptr->unwritten)
op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
} }
@ -1625,12 +1633,8 @@ err:
} }
return; return;
err_get_ioref: err_get_ioref:
bkey_for_each_ptr(ptrs, ptr2) { for (i = 0; i < nr_buckets; i++)
if (ptr2 == ptr) percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);
break;
percpu_ref_put(&bch_dev_bkey_exists(c, ptr2->dev)->io_ref);
}
/* Fall back to COW path: */ /* Fall back to COW path: */
goto out; goto out;
@ -1639,9 +1643,8 @@ err_bucket_stale:
bch2_bucket_nocow_unlock(&c->nocow_locks, bch2_bucket_nocow_unlock(&c->nocow_locks,
buckets[i].b, buckets[i].b,
BUCKET_NOCOW_LOCK_UPDATE); BUCKET_NOCOW_LOCK_UPDATE);
for (i = 0; i < nr_buckets; i++)
bkey_for_each_ptr(ptrs, ptr2) percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);
percpu_ref_put(&bch_dev_bkey_exists(c, ptr2->dev)->io_ref);
/* We can retry this: */ /* We can retry this: */
ret = BCH_ERR_transaction_restart; ret = BCH_ERR_transaction_restart;
@ -1889,6 +1892,7 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
prt_str(out, "pos: "); prt_str(out, "pos: ");
bch2_bpos_to_text(out, op->pos); bch2_bpos_to_text(out, op->pos);
prt_newline(out); prt_newline(out);
printbuf_indent_add(out, 2);
prt_str(out, "started: "); prt_str(out, "started: ");
bch2_pr_time_units(out, local_clock() - op->start_time); bch2_pr_time_units(out, local_clock() - op->start_time);
@ -1897,6 +1901,11 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
prt_str(out, "flags: "); prt_str(out, "flags: ");
prt_bitflags(out, bch2_write_flags, op->flags); prt_bitflags(out, bch2_write_flags, op->flags);
prt_newline(out); prt_newline(out);
prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl));
prt_newline(out);
printbuf_indent_sub(out, 2);
} }
/* Cache promotion on read */ /* Cache promotion on read */

View File

@ -76,6 +76,67 @@ static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
p->devs.nr = 0; p->devs.nr = 0;
} }
/*
* Detect stuck journal conditions and trigger shutdown. Technically the journal
* can end up stuck for a variety of reasons, such as a blocked I/O, journal
* reservation lockup, etc. Since this is a fatal error with potentially
* unpredictable characteristics, we want to be fairly conservative before we
* decide to shut things down.
*
* Consider the journal stuck when it appears full with no ability to commit
* btree transactions, to discard journal buckets, or to acquire a priority
* (reserved watermark) reservation.
*/
static inline bool
journal_error_check_stuck(struct journal *j, int error, unsigned flags)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
bool stuck = false;
struct printbuf buf = PRINTBUF;
if (!(error == JOURNAL_ERR_journal_full ||
error == JOURNAL_ERR_journal_pin_full) ||
nr_unwritten_journal_entries(j) ||
(flags & JOURNAL_WATERMARK_MASK) != JOURNAL_WATERMARK_reserved)
return stuck;
spin_lock(&j->lock);
if (j->can_discard) {
spin_unlock(&j->lock);
return stuck;
}
stuck = true;
/*
* The journal shutdown path will set ->err_seq, but do it here first to
* serialize against concurrent failures and avoid duplicate error
* reports.
*/
if (j->err_seq) {
spin_unlock(&j->lock);
return stuck;
}
j->err_seq = journal_cur_seq(j);
spin_unlock(&j->lock);
bch_err(c, "Journal stuck! Have a pre-reservation but journal full (error %s)",
bch2_journal_errors[error]);
bch2_journal_debug_to_text(&buf, j);
bch_err(c, "%s", buf.buf);
printbuf_reset(&buf);
bch2_journal_pins_to_text(&buf, j);
bch_err(c, "Journal pins:\n%s", buf.buf);
printbuf_exit(&buf);
bch2_fatal_error(c);
dump_stack();
return stuck;
}
/* journal entry close/open: */ /* journal entry close/open: */
void __bch2_journal_buf_put(struct journal *j) void __bch2_journal_buf_put(struct journal *j)
@ -163,6 +224,7 @@ void bch2_journal_halt(struct journal *j)
__journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL); __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL);
if (!j->err_seq) if (!j->err_seq)
j->err_seq = journal_cur_seq(j); j->err_seq = journal_cur_seq(j);
journal_wake(j);
spin_unlock(&j->lock); spin_unlock(&j->lock);
} }
@ -363,6 +425,12 @@ retry:
spin_lock(&j->lock); spin_lock(&j->lock);
/* check once more in case somebody else shut things down... */
if (bch2_journal_error(j)) {
spin_unlock(&j->lock);
return -BCH_ERR_erofs_journal_err;
}
/* /*
* Recheck after taking the lock, so we don't race with another thread * Recheck after taking the lock, so we don't race with another thread
* that just did journal_entry_open() and call journal_entry_close() * that just did journal_entry_open() and call journal_entry_close()
@ -410,28 +478,8 @@ unlock:
if (!ret) if (!ret)
goto retry; goto retry;
if (journal_error_check_stuck(j, ret, flags))
if ((ret == JOURNAL_ERR_journal_full || ret = -BCH_ERR_journal_res_get_blocked;
ret == JOURNAL_ERR_journal_pin_full) &&
!can_discard &&
!nr_unwritten_journal_entries(j) &&
(flags & JOURNAL_WATERMARK_MASK) == JOURNAL_WATERMARK_reserved) {
struct printbuf buf = PRINTBUF;
bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (ret %s)",
bch2_journal_errors[ret]);
bch2_journal_debug_to_text(&buf, j);
bch_err(c, "%s", buf.buf);
printbuf_reset(&buf);
bch2_journal_pins_to_text(&buf, j);
bch_err(c, "Journal pins:\n%s", buf.buf);
printbuf_exit(&buf);
bch2_fatal_error(c);
dump_stack();
}
/* /*
* Journal is full - can't rely on reclaim from work item due to * Journal is full - can't rely on reclaim from work item due to

View File

@ -210,24 +210,7 @@ void bch2_journal_space_available(struct journal *j)
clean = j->space[journal_space_clean].total; clean = j->space[journal_space_clean].total;
total = j->space[journal_space_total].total; total = j->space[journal_space_total].total;
if (!clean_ondisk && if (!j->space[journal_space_discarded].next_entry)
journal_cur_seq(j) == j->seq_ondisk) {
struct printbuf buf = PRINTBUF;
__bch2_journal_debug_to_text(&buf, j);
bch_err(c, "journal stuck\n%s", buf.buf);
printbuf_exit(&buf);
/*
* Hack: bch2_fatal_error() calls bch2_journal_halt() which
* takes journal lock:
*/
spin_unlock(&j->lock);
bch2_fatal_error(c);
spin_lock(&j->lock);
ret = JOURNAL_ERR_journal_stuck;
} else if (!j->space[journal_space_discarded].next_entry)
ret = JOURNAL_ERR_journal_full; ret = JOURNAL_ERR_journal_full;
if ((j->space[journal_space_clean_ondisk].next_entry < if ((j->space[journal_space_clean_ondisk].next_entry <

View File

@ -148,7 +148,8 @@ static int bch2_check_lru_key(struct btree_trans *trans,
goto out; goto out;
} }
if (fsck_err(c, "incorrect lru entry: lru %s time %llu\n" if (c->opts.reconstruct_alloc ||
fsck_err(c, "incorrect lru entry: lru %s time %llu\n"
" %s\n" " %s\n"
" for %s", " for %s",
bch2_lru_types[type], bch2_lru_types[type],

View File

@ -627,9 +627,12 @@ void bch2_verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket,
struct bkey_s_c k; struct bkey_s_c k;
struct printbuf buf = PRINTBUF; struct printbuf buf = PRINTBUF;
struct bch_backpointer bp; struct bch_backpointer bp;
u64 bp_offset = 0; struct bpos bp_pos = POS_MIN;
unsigned nr_bps = 0;
int ret; int ret;
bch2_trans_begin(trans);
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
bucket, BTREE_ITER_CACHED); bucket, BTREE_ITER_CACHED);
again: again:
@ -650,6 +653,7 @@ again:
} }
} }
set_btree_iter_dontneed(&iter);
bch2_trans_iter_exit(trans, &iter); bch2_trans_iter_exit(trans, &iter);
return; return;
failed_to_evacuate: failed_to_evacuate:
@ -665,17 +669,16 @@ failed_to_evacuate:
bch2_trans_begin(trans); bch2_trans_begin(trans);
ret = bch2_get_next_backpointer(trans, bucket, gen, ret = bch2_get_next_backpointer(trans, bucket, gen,
&bp_offset, &bp, &bp_pos, &bp,
BTREE_ITER_CACHED); BTREE_ITER_CACHED);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue; continue;
if (ret) if (ret)
break; break;
if (bp_offset == U64_MAX) if (bkey_eq(bp_pos, POS_MAX))
break; break;
k = bch2_backpointer_get_key(trans, &iter, k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
bucket, bp_offset, bp);
ret = bkey_err(k); ret = bkey_err(k);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue; continue;
@ -686,6 +689,10 @@ failed_to_evacuate:
prt_newline(&buf); prt_newline(&buf);
bch2_bkey_val_to_text(&buf, c, k); bch2_bkey_val_to_text(&buf, c, k);
bch2_trans_iter_exit(trans, &iter); bch2_trans_iter_exit(trans, &iter);
if (++nr_bps > 10)
break;
bp_pos = bpos_nosnap_successor(bp_pos);
} }
bch2_print_string_as_lines(KERN_ERR, buf.buf); bch2_print_string_as_lines(KERN_ERR, buf.buf);
@ -709,11 +716,17 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
struct data_update_opts data_opts; struct data_update_opts data_opts;
unsigned dirty_sectors, bucket_size; unsigned dirty_sectors, bucket_size;
u64 fragmentation; u64 fragmentation;
u64 bp_offset = 0, cur_inum = U64_MAX; u64 cur_inum = U64_MAX;
struct bpos bp_pos = POS_MIN;
int ret = 0; int ret = 0;
bch2_bkey_buf_init(&sk); bch2_bkey_buf_init(&sk);
/*
* We're not run in a context that handles transaction restarts:
*/
bch2_trans_begin(trans);
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
bucket, BTREE_ITER_CACHED); bucket, BTREE_ITER_CACHED);
ret = lockrestart_do(trans, ret = lockrestart_do(trans,
@ -740,13 +753,13 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
bch2_trans_begin(trans); bch2_trans_begin(trans);
ret = bch2_get_next_backpointer(trans, bucket, gen, ret = bch2_get_next_backpointer(trans, bucket, gen,
&bp_offset, &bp, &bp_pos, &bp,
BTREE_ITER_CACHED); BTREE_ITER_CACHED);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue; continue;
if (ret) if (ret)
goto err; goto err;
if (bp_offset == U64_MAX) if (bkey_eq(bp_pos, POS_MAX))
break; break;
if (!bp.level) { if (!bp.level) {
@ -754,8 +767,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
struct bkey_s_c k; struct bkey_s_c k;
unsigned i = 0; unsigned i = 0;
k = bch2_backpointer_get_key(trans, &iter, k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
bucket, bp_offset, bp);
ret = bkey_err(k); ret = bkey_err(k);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue; continue;
@ -810,8 +822,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
} else { } else {
struct btree *b; struct btree *b;
b = bch2_backpointer_get_node(trans, &iter, b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp);
bucket, bp_offset, bp);
ret = PTR_ERR_OR_ZERO(b); ret = PTR_ERR_OR_ZERO(b);
if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
continue; continue;
@ -839,7 +850,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
} }
} }
next: next:
bp_offset++; bp_pos = bpos_nosnap_successor(bp_pos);
} }
trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret); trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);

View File

@ -16,9 +16,20 @@ struct bch_move_stats {
atomic64_t sectors_raced; atomic64_t sectors_raced;
}; };
struct move_bucket_in_flight { struct move_bucket_key {
struct bpos bucket; struct bpos bucket;
u8 gen; u8 gen;
};
struct move_bucket {
struct move_bucket_key k;
unsigned sectors;
};
struct move_bucket_in_flight {
struct move_bucket_in_flight *next;
struct rhash_head hash;
struct move_bucket bucket;
atomic_t count; atomic_t count;
}; };

View File

@ -34,8 +34,51 @@
#include <linux/sort.h> #include <linux/sort.h>
#include <linux/wait.h> #include <linux/wait.h>
struct buckets_in_flight {
struct rhashtable table;
struct move_bucket_in_flight *first;
struct move_bucket_in_flight *last;
size_t nr;
size_t sectors;
};
static const struct rhashtable_params bch_move_bucket_params = {
.head_offset = offsetof(struct move_bucket_in_flight, hash),
.key_offset = offsetof(struct move_bucket_in_flight, bucket.k),
.key_len = sizeof(struct move_bucket_key),
};
static struct move_bucket_in_flight *
move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b)
{
struct move_bucket_in_flight *new = kzalloc(sizeof(*new), GFP_KERNEL);
int ret;
if (!new)
return ERR_PTR(-ENOMEM);
new->bucket = b;
ret = rhashtable_lookup_insert_fast(&list->table, &new->hash,
bch_move_bucket_params);
if (ret) {
kfree(new);
return ERR_PTR(ret);
}
if (!list->first)
list->first = new;
else
list->last->next = new;
list->last = new;
list->nr++;
list->sectors += b.sectors;
return new;
}
static int bch2_bucket_is_movable(struct btree_trans *trans, static int bch2_bucket_is_movable(struct btree_trans *trans,
struct bpos bucket, u64 time, u8 *gen) struct move_bucket *b, u64 time)
{ {
struct btree_iter iter; struct btree_iter iter;
struct bkey_s_c k; struct bkey_s_c k;
@ -43,10 +86,13 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
const struct bch_alloc_v4 *a; const struct bch_alloc_v4 *a;
int ret; int ret;
if (bch2_bucket_is_open(trans->c, bucket.inode, bucket.offset)) if (bch2_bucket_is_open(trans->c,
b->k.bucket.inode,
b->k.bucket.offset))
return 0; return 0;
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_CACHED); bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
b->k.bucket, BTREE_ITER_CACHED);
k = bch2_btree_iter_peek_slot(&iter); k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k); ret = bkey_err(k);
bch2_trans_iter_exit(trans, &iter); bch2_trans_iter_exit(trans, &iter);
@ -55,12 +101,14 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
return ret; return ret;
a = bch2_alloc_to_v4(k, &_a); a = bch2_alloc_to_v4(k, &_a);
*gen = a->gen; b->k.gen = a->gen;
b->sectors = a->dirty_sectors;
ret = data_type_movable(a->data_type) && ret = data_type_movable(a->data_type) &&
a->fragmentation_lru && a->fragmentation_lru &&
a->fragmentation_lru <= time; a->fragmentation_lru <= time;
if (ret) { if (!ret) {
struct printbuf buf = PRINTBUF; struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, trans->c, k); bch2_bkey_val_to_text(&buf, trans->c, k);
@ -71,41 +119,16 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
return ret; return ret;
} }
typedef FIFO(struct move_bucket_in_flight) move_buckets_in_flight;
struct move_bucket {
struct bpos bucket;
u8 gen;
};
typedef DARRAY(struct move_bucket) move_buckets;
static int move_bucket_cmp(const void *_l, const void *_r)
{
const struct move_bucket *l = _l;
const struct move_bucket *r = _r;
return bkey_cmp(l->bucket, r->bucket);
}
static bool bucket_in_flight(move_buckets *buckets_sorted, struct move_bucket b)
{
return bsearch(&b,
buckets_sorted->data,
buckets_sorted->nr,
sizeof(buckets_sorted->data[0]),
move_bucket_cmp) != NULL;
}
static void move_buckets_wait(struct btree_trans *trans, static void move_buckets_wait(struct btree_trans *trans,
struct moving_context *ctxt, struct moving_context *ctxt,
move_buckets_in_flight *buckets_in_flight, struct buckets_in_flight *list,
size_t nr, bool verify_evacuated) bool flush)
{ {
while (!fifo_empty(buckets_in_flight)) { struct move_bucket_in_flight *i;
struct move_bucket_in_flight *i = &fifo_peek_front(buckets_in_flight); int ret;
if (fifo_used(buckets_in_flight) > nr) while ((i = list->first)) {
if (flush)
move_ctxt_wait_event(ctxt, trans, !atomic_read(&i->count)); move_ctxt_wait_event(ctxt, trans, !atomic_read(&i->count));
if (atomic_read(&i->count)) if (atomic_read(&i->count))
@ -116,66 +139,82 @@ static void move_buckets_wait(struct btree_trans *trans,
* reads, which inits another btree_trans; this one must be * reads, which inits another btree_trans; this one must be
* unlocked: * unlocked:
*/ */
if (verify_evacuated) bch2_verify_bucket_evacuated(trans, i->bucket.k.bucket, i->bucket.k.gen);
bch2_verify_bucket_evacuated(trans, i->bucket, i->gen);
buckets_in_flight->front++; list->first = i->next;
if (!list->first)
list->last = NULL;
list->nr--;
list->sectors -= i->bucket.sectors;
ret = rhashtable_remove_fast(&list->table, &i->hash,
bch_move_bucket_params);
BUG_ON(ret);
kfree(i);
} }
bch2_trans_unlock(trans); bch2_trans_unlock(trans);
} }
static bool bucket_in_flight(struct buckets_in_flight *list,
struct move_bucket_key k)
{
return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params);
}
typedef DARRAY(struct move_bucket) move_buckets;
static int bch2_copygc_get_buckets(struct btree_trans *trans, static int bch2_copygc_get_buckets(struct btree_trans *trans,
struct moving_context *ctxt, struct moving_context *ctxt,
move_buckets_in_flight *buckets_in_flight, struct buckets_in_flight *buckets_in_flight,
move_buckets *buckets) move_buckets *buckets)
{ {
struct bch_fs *c = trans->c;
struct btree_iter iter; struct btree_iter iter;
move_buckets buckets_sorted = { 0 };
struct move_bucket_in_flight *i;
struct bkey_s_c k; struct bkey_s_c k;
size_t fifo_iter, nr_to_get; size_t nr_to_get = max(16UL, buckets_in_flight->nr / 4);
size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
int ret; int ret;
move_buckets_wait(trans, ctxt, buckets_in_flight, buckets_in_flight->size / 2, true); move_buckets_wait(trans, ctxt, buckets_in_flight, false);
nr_to_get = max(16UL, fifo_used(buckets_in_flight) / 4); ret = bch2_btree_write_buffer_flush(trans);
if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()",
fifo_for_each_entry_ptr(i, buckets_in_flight, fifo_iter) { __func__, bch2_err_str(ret)))
ret = darray_push(&buckets_sorted, ((struct move_bucket) {i->bucket, i->gen})); return ret;
if (ret) {
bch_err(trans->c, "error allocating move_buckets_sorted");
goto err;
}
}
sort(buckets_sorted.data,
buckets_sorted.nr,
sizeof(buckets_sorted.data[0]),
move_bucket_cmp,
NULL);
ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru, ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru,
lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0),
lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
0, k, ({ 0, k, ({
struct move_bucket b = { .bucket = u64_to_bucket(k.k->p.offset) }; struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
int ret = 0; int ret = 0;
if (!bucket_in_flight(&buckets_sorted, b) && saw++;
bch2_bucket_is_movable(trans, b.bucket, lru_pos_time(k.k->p), &b.gen))
ret = darray_push(buckets, b) ?: buckets->nr >= nr_to_get;
if (!bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p)))
not_movable++;
else if (bucket_in_flight(buckets_in_flight, b.k))
in_flight++;
else {
ret = darray_push(buckets, b) ?: buckets->nr >= nr_to_get;
if (ret >= 0)
sectors += b.sectors;
}
ret; ret;
})); }));
err:
darray_exit(&buckets_sorted); pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i",
buckets_in_flight->nr, buckets_in_flight->sectors,
saw, in_flight, not_movable, buckets->nr, sectors, nr_to_get, ret);
return ret < 0 ? ret : 0; return ret < 0 ? ret : 0;
} }
static int bch2_copygc(struct btree_trans *trans, static int bch2_copygc(struct btree_trans *trans,
struct moving_context *ctxt, struct moving_context *ctxt,
move_buckets_in_flight *buckets_in_flight) struct buckets_in_flight *buckets_in_flight)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct data_update_opts data_opts = { struct data_update_opts data_opts = {
@ -187,11 +226,6 @@ static int bch2_copygc(struct btree_trans *trans,
u64 moved = atomic64_read(&ctxt->stats->sectors_moved); u64 moved = atomic64_read(&ctxt->stats->sectors_moved);
int ret = 0; int ret = 0;
ret = bch2_btree_write_buffer_flush(trans);
if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()",
__func__, bch2_err_str(ret)))
return ret;
ret = bch2_copygc_get_buckets(trans, ctxt, buckets_in_flight, &buckets); ret = bch2_copygc_get_buckets(trans, ctxt, buckets_in_flight, &buckets);
if (ret) if (ret)
goto err; goto err;
@ -200,12 +234,17 @@ static int bch2_copygc(struct btree_trans *trans,
if (unlikely(freezing(current))) if (unlikely(freezing(current)))
break; break;
f = fifo_push_ref(buckets_in_flight); f = move_bucket_in_flight_add(buckets_in_flight, *i);
f->bucket = i->bucket; ret = PTR_ERR_OR_ZERO(f);
f->gen = i->gen; if (ret == -EEXIST) /* rare race: copygc_get_buckets returned same bucket more than once */
atomic_set(&f->count, 0); continue;
if (ret == -ENOMEM) { /* flush IO, continue later */
ret = 0;
break;
}
ret = __bch2_evacuate_bucket(trans, ctxt, f, f->bucket, f->gen, data_opts); ret = __bch2_evacuate_bucket(trans, ctxt, f, f->bucket.k.bucket,
f->bucket.k.gen, data_opts);
if (ret) if (ret)
goto err; goto err;
} }
@ -269,6 +308,12 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
atomic64_read(&c->io_clock[WRITE].now)) << 9); atomic64_read(&c->io_clock[WRITE].now)) << 9);
prt_newline(out); prt_newline(out);
prt_printf(out, "Currently waiting since: ");
prt_human_readable_u64(out, max(0LL,
atomic64_read(&c->io_clock[WRITE].now) -
c->copygc_wait_at) << 9);
prt_newline(out);
prt_printf(out, "Currently calculated wait: "); prt_printf(out, "Currently calculated wait: ");
prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); prt_human_readable_u64(out, bch2_copygc_wait_amount(c));
prt_newline(out); prt_newline(out);
@ -281,13 +326,17 @@ static int bch2_copygc_thread(void *arg)
struct moving_context ctxt; struct moving_context ctxt;
struct bch_move_stats move_stats; struct bch_move_stats move_stats;
struct io_clock *clock = &c->io_clock[WRITE]; struct io_clock *clock = &c->io_clock[WRITE];
move_buckets_in_flight move_buckets; struct buckets_in_flight move_buckets;
u64 last, wait; u64 last, wait;
int ret = 0; int ret = 0;
if (!init_fifo(&move_buckets, 1 << 14, GFP_KERNEL)) { memset(&move_buckets, 0, sizeof(move_buckets));
bch_err(c, "error allocating copygc buckets in flight");
return -ENOMEM; ret = rhashtable_init(&move_buckets.table, &bch_move_bucket_params);
if (ret) {
bch_err(c, "error allocating copygc buckets in flight: %s",
bch2_err_str(ret));
return ret;
} }
set_freezable(); set_freezable();
@ -303,12 +352,12 @@ static int bch2_copygc_thread(void *arg)
cond_resched(); cond_resched();
if (!c->copy_gc_enabled) { if (!c->copy_gc_enabled) {
move_buckets_wait(&trans, &ctxt, &move_buckets, 0, true); move_buckets_wait(&trans, &ctxt, &move_buckets, true);
kthread_wait_freezable(c->copy_gc_enabled); kthread_wait_freezable(c->copy_gc_enabled);
} }
if (unlikely(freezing(current))) { if (unlikely(freezing(current))) {
move_buckets_wait(&trans, &ctxt, &move_buckets, 0, true); move_buckets_wait(&trans, &ctxt, &move_buckets, true);
__refrigerator(false); __refrigerator(false);
continue; continue;
} }
@ -317,9 +366,10 @@ static int bch2_copygc_thread(void *arg)
wait = bch2_copygc_wait_amount(c); wait = bch2_copygc_wait_amount(c);
if (wait > clock->max_slop) { if (wait > clock->max_slop) {
move_buckets_wait(&trans, &ctxt, &move_buckets, 0, true); c->copygc_wait_at = last;
trace_and_count(c, copygc_wait, c, wait, last + wait);
c->copygc_wait = last + wait; c->copygc_wait = last + wait;
move_buckets_wait(&trans, &ctxt, &move_buckets, true);
trace_and_count(c, copygc_wait, c, wait, last + wait);
bch2_kthread_io_clock_wait(clock, last + wait, bch2_kthread_io_clock_wait(clock, last + wait,
MAX_SCHEDULE_TIMEOUT); MAX_SCHEDULE_TIMEOUT);
continue; continue;
@ -334,9 +384,9 @@ static int bch2_copygc_thread(void *arg)
wake_up(&c->copygc_running_wq); wake_up(&c->copygc_running_wq);
} }
move_buckets_wait(&trans, &ctxt, &move_buckets, true);
bch2_trans_exit(&trans); bch2_trans_exit(&trans);
bch2_moving_ctxt_exit(&ctxt); bch2_moving_ctxt_exit(&ctxt);
free_fifo(&move_buckets);
return 0; return 0;
} }

View File

@ -476,6 +476,26 @@ void bch2_journal_keys_free(struct journal_keys *keys)
keys->nr = keys->gap = keys->size = 0; keys->nr = keys->gap = keys->size = 0;
} }
static void __journal_keys_sort(struct journal_keys *keys)
{
struct journal_key *src, *dst;
sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL);
src = dst = keys->d;
while (src < keys->d + keys->nr) {
while (src + 1 < keys->d + keys->nr &&
src[0].btree_id == src[1].btree_id &&
src[0].level == src[1].level &&
bpos_eq(src[0].k->k.p, src[1].k->k.p))
src++;
*dst++ = *src++;
}
keys->nr = dst - keys->d;
}
static int journal_keys_sort(struct bch_fs *c) static int journal_keys_sort(struct bch_fs *c)
{ {
struct genradix_iter iter; struct genradix_iter iter;
@ -483,8 +503,7 @@ static int journal_keys_sort(struct bch_fs *c)
struct jset_entry *entry; struct jset_entry *entry;
struct bkey_i *k; struct bkey_i *k;
struct journal_keys *keys = &c->journal_keys; struct journal_keys *keys = &c->journal_keys;
struct journal_key *src, *dst; size_t nr_keys = 0, nr_read = 0;
size_t nr_keys = 0;
genradix_for_each(&c->journal_entries, iter, _i) { genradix_for_each(&c->journal_entries, iter, _i) {
i = *_i; i = *_i;
@ -503,9 +522,19 @@ static int journal_keys_sort(struct bch_fs *c)
keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
if (!keys->d) { if (!keys->d) {
bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys)", bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath",
nr_keys); nr_keys);
return -BCH_ERR_ENOMEM_journal_keys_sort;
do {
keys->size >>= 1;
keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
} while (!keys->d && keys->size > nr_keys / 8);
if (!keys->d) {
bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting",
keys->size);
return -BCH_ERR_ENOMEM_journal_keys_sort;
}
} }
genradix_for_each(&c->journal_entries, iter, _i) { genradix_for_each(&c->journal_entries, iter, _i) {
@ -514,7 +543,17 @@ static int journal_keys_sort(struct bch_fs *c)
if (!i || i->ignore) if (!i || i->ignore)
continue; continue;
for_each_jset_key(k, entry, &i->j) for_each_jset_key(k, entry, &i->j) {
if (keys->nr == keys->size) {
__journal_keys_sort(keys);
if (keys->nr > keys->size * 7 / 8) {
bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu",
keys->nr, keys->size, nr_read, nr_keys);
return -BCH_ERR_ENOMEM_journal_keys_sort;
}
}
keys->d[keys->nr++] = (struct journal_key) { keys->d[keys->nr++] = (struct journal_key) {
.btree_id = entry->btree_id, .btree_id = entry->btree_id,
.level = entry->level, .level = entry->level,
@ -522,23 +561,15 @@ static int journal_keys_sort(struct bch_fs *c)
.journal_seq = le64_to_cpu(i->j.seq), .journal_seq = le64_to_cpu(i->j.seq),
.journal_offset = k->_data - i->j._data, .journal_offset = k->_data - i->j._data,
}; };
nr_read++;
}
} }
sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); __journal_keys_sort(keys);
src = dst = keys->d;
while (src < keys->d + keys->nr) {
while (src + 1 < keys->d + keys->nr &&
src[0].btree_id == src[1].btree_id &&
src[0].level == src[1].level &&
bpos_eq(src[0].k->k.p, src[1].k->k.p))
src++;
*dst++ = *src++;
}
keys->nr = dst - keys->d;
keys->gap = keys->nr; keys->gap = keys->nr;
bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr);
return 0; return 0;
} }
@ -614,8 +645,8 @@ static int bch2_journal_replay(struct bch_fs *c, u64 start_seq, u64 end_seq)
journal_sort_seq_cmp, NULL); journal_sort_seq_cmp, NULL);
if (keys->nr) { if (keys->nr) {
ret = bch2_fs_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)", ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
keys->nr, start_seq, end_seq); keys->nr, start_seq, end_seq);
if (ret) if (ret)
goto err; goto err;
} }
@ -649,7 +680,7 @@ static int bch2_journal_replay(struct bch_fs *c, u64 start_seq, u64 end_seq)
ret = bch2_journal_error(j); ret = bch2_journal_error(j);
if (keys->nr && !ret) if (keys->nr && !ret)
bch2_fs_log_msg(c, "journal replay finished"); bch2_journal_log_msg(c, "journal replay finished");
err: err:
kvfree(keys_sorted); kvfree(keys_sorted);
return ret; return ret;
@ -1103,14 +1134,11 @@ int bch2_fs_recovery(struct bch_fs *c)
} }
if (!c->opts.nochanges) { if (!c->opts.nochanges) {
if (c->sb.version < bcachefs_metadata_version_lru_v2) { if (c->sb.version < bcachefs_metadata_version_no_bps_in_alloc_keys) {
bch_info(c, "version prior to backpointers, upgrade and fsck required"); bch_info(c, "version prior to no_bps_in_alloc_keys, upgrade and fsck required");
c->opts.version_upgrade = true; c->opts.version_upgrade = true;
c->opts.fsck = true; c->opts.fsck = true;
c->opts.fix_errors = FSCK_OPT_YES; c->opts.fix_errors = FSCK_OPT_YES;
} else if (c->sb.version < bcachefs_metadata_version_fragmentation_lru) {
bch_info(c, "version prior to backpointers, upgrade required");
c->opts.version_upgrade = true;
} }
} }
@ -1213,8 +1241,8 @@ use_clean:
journal_seq += 8; journal_seq += 8;
if (blacklist_seq != journal_seq) { if (blacklist_seq != journal_seq) {
ret = bch2_fs_log_msg(c, "blacklisting entries %llu-%llu", ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
blacklist_seq, journal_seq) ?: blacklist_seq, journal_seq) ?:
bch2_journal_seq_blacklist_add(c, bch2_journal_seq_blacklist_add(c,
blacklist_seq, journal_seq); blacklist_seq, journal_seq);
if (ret) { if (ret) {
@ -1223,14 +1251,14 @@ use_clean:
} }
} }
ret = bch2_fs_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu", ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu",
journal_seq, last_seq, blacklist_seq - 1) ?: journal_seq, last_seq, blacklist_seq - 1) ?:
bch2_fs_journal_start(&c->journal, journal_seq); bch2_fs_journal_start(&c->journal, journal_seq);
if (ret) if (ret)
goto err; goto err;
if (c->opts.reconstruct_alloc) if (c->opts.reconstruct_alloc)
bch2_fs_log_msg(c, "dropping alloc info"); bch2_journal_log_msg(c, "dropping alloc info");
/* /*
* Skip past versions that might have possibly been used (as nonces), * Skip past versions that might have possibly been used (as nonces),

View File

@ -714,7 +714,7 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work)
void bch2_delete_dead_snapshots_async(struct bch_fs *c) void bch2_delete_dead_snapshots_async(struct bch_fs *c)
{ {
if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
!queue_work(system_long_wq, &c->snapshot_delete_work)) !queue_work(c->write_ref_wq, &c->snapshot_delete_work))
bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
} }
@ -926,7 +926,7 @@ int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache)) if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache))
return -EROFS; return -EROFS;
if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
return 0; return 0;
} }
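These call sites, like bch2_do_stripe_deletes() earlier in this commit, pair a filesystem write ref with the new dedicated write_ref_wq. A hedged sketch of the idiom follows; the helper name is hypothetical and not something the patch adds, and it assumes the existing enum bch_write_ref used by bch2_write_ref_tryget():

static void bch2_queue_write_ref_work(struct bch_fs *c,
				      enum bch_write_ref ref,
				      struct work_struct *work)
{
	/*
	 * Hold a write ref for as long as the work item is queued; if the
	 * work was already pending, queue_work() returns false and the ref
	 * is dropped immediately.
	 */
	if (bch2_write_ref_tryget(c, ref) &&
	    !queue_work(c->write_ref_wq, work))
		bch2_write_ref_put(c, ref);
}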

View File

@ -6,4 +6,16 @@
typedef DARRAY(u32) snapshot_id_list; typedef DARRAY(u32) snapshot_id_list;
struct snapshot_t {
u32 parent;
u32 children[2];
u32 subvol; /* Nonzero only if a subvolume points to this node: */
u32 equiv;
};
typedef struct {
u32 subvol;
u64 inum;
} subvol_inum;
#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ #endif /* _BCACHEFS_SUBVOLUME_TYPES_H */

View File

@ -494,6 +494,8 @@ static void __bch2_fs_free(struct bch_fs *c)
kfree(c->journal_seq_blacklist_table); kfree(c->journal_seq_blacklist_table);
kfree(c->unused_inode_hints); kfree(c->unused_inode_hints);
if (c->write_ref_wq)
destroy_workqueue(c->write_ref_wq);
if (c->io_complete_wq) if (c->io_complete_wq)
destroy_workqueue(c->io_complete_wq); destroy_workqueue(c->io_complete_wq);
if (c->copygc_wq) if (c->copygc_wq)
@ -709,6 +711,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
sema_init(&c->io_in_flight, 128); sema_init(&c->io_in_flight, 128);
INIT_LIST_HEAD(&c->vfs_inodes_list);
mutex_init(&c->vfs_inodes_lock);
c->copy_gc_enabled = 1; c->copy_gc_enabled = 1;
c->rebalance.enabled = 1; c->rebalance.enabled = 1;
c->promote_whole_extents = true; c->promote_whole_extents = true;
@ -784,6 +789,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
!(c->io_complete_wq = alloc_workqueue("bcachefs_io", !(c->io_complete_wq = alloc_workqueue("bcachefs_io",
WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) || WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) ||
!(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
WQ_FREEZABLE, 0)) ||
#ifndef BCH_WRITE_REF_DEBUG #ifndef BCH_WRITE_REF_DEBUG
percpu_ref_init(&c->writes, bch2_writes_disabled, percpu_ref_init(&c->writes, bch2_writes_disabled,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) || PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
@ -1738,6 +1745,10 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
bch2_write_super(c); bch2_write_super(c);
mutex_unlock(&c->sb_lock); mutex_unlock(&c->sb_lock);
ret = bch2_fs_freespace_init(c);
if (ret)
bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret));
up_write(&c->state_lock); up_write(&c->state_lock);
return 0; return 0;
err: err:

View File

@ -2,8 +2,10 @@
#include "bcachefs.h" #include "bcachefs.h"
#include "alloc_types.h" #include "alloc_types.h"
#include "buckets.h" #include "buckets.h"
#include "btree_cache.h"
#include "btree_iter.h" #include "btree_iter.h"
#include "btree_locking.h" #include "btree_locking.h"
#include "btree_update_interior.h"
#include "keylist.h" #include "keylist.h"
#include "opts.h" #include "opts.h"

View File

@ -240,36 +240,6 @@ bool bch2_is_zero(const void *_p, size_t n)
return true; return true;
} }
static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v)
{
unsigned i = 0;
while (i < ARRAY_SIZE(q->entries)) {
struct bch2_quantile_entry *e = q->entries + i;
if (unlikely(!e->step)) {
e->m = v;
e->step = max_t(unsigned, v / 2, 1024);
} else if (e->m > v) {
e->m = e->m >= e->step
? e->m - e->step
: 0;
} else if (e->m < v) {
e->m = e->m + e->step > e->m
? e->m + e->step
: U32_MAX;
}
if ((e->m > v ? e->m - v : v - e->m) < e->step)
e->step = max_t(unsigned, e->step / 2, 1);
if (v >= e->m)
break;
i = eytzinger0_child(i, v > e->m);
}
}
void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits) void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits)
{ {
while (nr_bits) while (nr_bits)
@ -343,6 +313,36 @@ int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task)
/* time stats: */ /* time stats: */
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v)
{
unsigned i = 0;
while (i < ARRAY_SIZE(q->entries)) {
struct bch2_quantile_entry *e = q->entries + i;
if (unlikely(!e->step)) {
e->m = v;
e->step = max_t(unsigned, v / 2, 1024);
} else if (e->m > v) {
e->m = e->m >= e->step
? e->m - e->step
: 0;
} else if (e->m < v) {
e->m = e->m + e->step > e->m
? e->m + e->step
: U32_MAX;
}
if ((e->m > v ? e->m - v : v - e->m) < e->step)
e->step = max_t(unsigned, e->step / 2, 1);
if (v >= e->m)
break;
i = eytzinger0_child(i, v > e->m);
}
}
static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
u64 start, u64 end) u64 start, u64 end)
{ {

View File

@ -168,10 +168,10 @@ struct bio *bio_split(struct bio *bio, int sectors,
void bio_free_pages(struct bio *bio) void bio_free_pages(struct bio *bio)
{ {
struct bvec_iter_all iter; struct bvec_iter_all iter;
struct bio_vec *bvec; struct bio_vec bvec;
bio_for_each_segment_all(bvec, bio, iter) bio_for_each_segment_all(bvec, bio, iter)
__free_page(bvec->bv_page); __free_page(bvec.bv_page);
} }
void bio_advance(struct bio *bio, unsigned bytes) void bio_advance(struct bio *bio, unsigned bytes)
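The conversions above switch bio_for_each_segment_all() callers from a struct bio_vec pointer to a by-value struct bio_vec. A minimal sketch of a caller under the new convention (hypothetical function, not part of the patch):

static unsigned bio_count_bytes(struct bio *bio)
{
	struct bvec_iter_all iter;
	struct bio_vec bv;
	unsigned bytes = 0;

	/* each bv is a copy, so fields are read as bv.bv_len, not bv->bv_len */
	bio_for_each_segment_all(bv, bio, iter)
		bytes += bv.bv_len;

	return bytes;
}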