From 7f102ee83d83fd918783ca542fac1574f9b2c623 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 31 Mar 2023 15:52:24 -0400 Subject: [PATCH] Update bcachefs sources to 8fd009dd76 bcachefs: Rip out code for storing backpointers in alloc keys Signed-off-by: Kent Overstreet --- .bcachefs_revision | 2 +- include/linux/bio.h | 35 +- include/linux/bvec.h | 52 +- include/trace/events/bcachefs.h | 31 +- libbcachefs/alloc_background.c | 67 +- libbcachefs/alloc_foreground.c | 6 +- libbcachefs/backpointers.c | 334 ++------ libbcachefs/backpointers.h | 25 +- libbcachefs/bcachefs.h | 23 +- libbcachefs/bcachefs_format.h | 3 +- libbcachefs/btree_gc.c | 72 +- libbcachefs/btree_iter.c | 6 +- libbcachefs/btree_update.h | 3 +- libbcachefs/btree_update_interior.c | 9 +- libbcachefs/btree_update_leaf.c | 82 +- libbcachefs/btree_write_buffer.c | 43 +- libbcachefs/buckets.c | 8 +- libbcachefs/darray.h | 20 +- libbcachefs/data_update.c | 11 + libbcachefs/ec.c | 20 +- libbcachefs/fs-io.c | 1174 +++++++++++++++------------ libbcachefs/fs-ioctl.c | 15 +- libbcachefs/fs.c | 132 ++- libbcachefs/fs.h | 1 + libbcachefs/inode.c | 3 - libbcachefs/io.c | 45 +- libbcachefs/journal.c | 92 ++- libbcachefs/journal_reclaim.c | 19 +- libbcachefs/lru.c | 3 +- libbcachefs/move.c | 37 +- libbcachefs/move_types.h | 13 +- libbcachefs/movinggc.c | 220 +++-- libbcachefs/recovery.c | 92 ++- libbcachefs/subvolume.c | 4 +- libbcachefs/subvolume_types.h | 12 + libbcachefs/super.c | 11 + libbcachefs/trace.c | 2 + libbcachefs/util.c | 60 +- linux/bio.c | 4 +- 39 files changed, 1552 insertions(+), 1239 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index ff46d7d8..febf3dcf 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -0342eebf85b7be76f01bacec8f958c6e6039535b +8fd009dd764dabd79e2b42e1c85812a08ad1d6c0 diff --git a/include/linux/bio.h b/include/linux/bio.h index 0ad5a87d..206e5baa 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -113,17 +113,40 @@ static inline void *bio_data(struct bio *bio) #define __bio_kunmap_atomic(addr) kunmap_atomic(addr) -static inline struct bio_vec *bio_next_segment(const struct bio *bio, +static inline struct bio_vec bio_iter_all_peek(const struct bio *bio, struct bvec_iter_all *iter) { - if (iter->idx >= bio->bi_vcnt) - return NULL; + if (WARN_ON(iter->idx >= bio->bi_vcnt)) + return (struct bio_vec) { NULL }; - return &bio->bi_io_vec[iter->idx]; + return bvec_iter_all_peek(bio->bi_io_vec, iter); } -#define bio_for_each_segment_all(bvl, bio, iter) \ - for ((iter).idx = 0; (bvl = bio_next_segment((bio), &(iter))); (iter).idx++) +static inline void bio_iter_all_advance(const struct bio *bio, + struct bvec_iter_all *iter, + unsigned bytes) +{ + bvec_iter_all_advance(bio->bi_io_vec, iter, bytes); + + WARN_ON(iter->idx > bio->bi_vcnt || + (iter->idx == bio->bi_vcnt && iter->done)); +} + +#define bio_for_each_segment_all_continue(bvl, bio, iter) \ + for (; \ + iter.idx < bio->bi_vcnt && \ + ((bvl = bio_iter_all_peek(bio, &iter)), true); \ + bio_iter_all_advance((bio), &iter, bvl.bv_len)) + +/* + * drivers should _never_ use the all version - the bio may have been split + * before it got to the driver and the driver won't own all of it + */ +#define bio_for_each_segment_all(bvl, bio, iter) \ + for (bvec_iter_all_init(&iter); \ + iter.idx < (bio)->bi_vcnt && \ + ((bvl = bio_iter_all_peek((bio), &iter)), true); \ + bio_iter_all_advance((bio), &iter, bvl.bv_len)) static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, unsigned bytes) diff --git 
a/include/linux/bvec.h b/include/linux/bvec.h index 5bc68b42..a11373db 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -43,10 +43,6 @@ struct bvec_iter { current bvec */ }; -struct bvec_iter_all { - int idx; -}; - /* * various member access, note that bio_data should of course not be used * on highmem page vectors @@ -98,4 +94,52 @@ static inline void bvec_iter_advance(const struct bio_vec *bv, ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \ bvec_iter_advance((bio_vec), &(iter), (bvl).bv_len)) +/* + * bvec_iter_all: for advancing over individual pages in a bio, as it was when + * it was first created: + */ +struct bvec_iter_all { + int idx; + unsigned done; +}; + +static inline void bvec_iter_all_init(struct bvec_iter_all *iter_all) +{ + iter_all->done = 0; + iter_all->idx = 0; +} + +static inline struct bio_vec __bvec_iter_all_peek(const struct bio_vec *bvec, + const struct bvec_iter_all *iter) +{ + struct bio_vec bv = bvec[iter->idx]; + + BUG_ON(iter->done >= bv.bv_len); + + bv.bv_offset += iter->done; + bv.bv_len -= iter->done; + return bv; +} + +static inline struct bio_vec bvec_iter_all_peek(const struct bio_vec *bvec, + const struct bvec_iter_all *iter) +{ + struct bio_vec bv = __bvec_iter_all_peek(bvec, iter); + + bv.bv_len = min_t(unsigned, PAGE_SIZE - bv.bv_offset, bv.bv_len); + return bv; +} + +static inline void bvec_iter_all_advance(const struct bio_vec *bvec, + struct bvec_iter_all *iter, + unsigned bytes) +{ + iter->done += bytes; + + while (iter->done && iter->done >= bvec[iter->idx].bv_len) { + iter->done -= bvec[iter->idx].bv_len; + iter->idx++; + } +} + #endif /* __LINUX_BVEC_ITER_H */ diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index ae184220..2f6acfc7 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -831,10 +831,35 @@ DEFINE_EVENT(transaction_event, trans_restart_injected, TP_ARGS(trans, caller_ip) ); -DEFINE_EVENT(transaction_event, trans_restart_split_race, +TRACE_EVENT(trans_restart_split_race, TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) + unsigned long caller_ip, + struct btree *b), + TP_ARGS(trans, caller_ip, b), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(u8, level ) + __field(u16, written ) + __field(u16, blocks ) + __field(u16, u64s_remaining ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->level = b->c.level; + __entry->written = b->written; + __entry->blocks = btree_blocks(trans->c); + __entry->u64s_remaining = bch_btree_keys_u64s_remaining(trans->c, b); + ), + + TP_printk("%s %pS l=%u written %u/%u u64s remaining %u", + __entry->trans_fn, (void *) __entry->caller_ip, + __entry->level, + __entry->written, __entry->blocks, + __entry->u64s_remaining) ); DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim, diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 009a85bc..aef796b5 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -451,6 +451,8 @@ void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) if (src < dst) memset(src, 0, dst - src); + + SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0); } else { struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); @@ -476,20 +478,13 @@ static noinline struct bkey_i_alloc_v4 * __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) { struct bkey_i_alloc_v4 
*ret; - if (k.k->type == KEY_TYPE_alloc_v4) { - struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); - unsigned bytes = sizeof(struct bkey_i_alloc_v4) + - BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) * - sizeof(struct bch_backpointer); - void *src, *dst; - /* - * Reserve space for one more backpointer here: - * Not sketchy at doing it this way, nope... - */ - ret = bch2_trans_kmalloc(trans, bytes + sizeof(struct bch_backpointer)); - if (IS_ERR(ret)) - return ret; + ret = bch2_trans_kmalloc(trans, sizeof(struct bkey_i_alloc_v4)); + if (IS_ERR(ret)) + return ret; + + if (k.k->type == KEY_TYPE_alloc_v4) { + void *src, *dst; bkey_reassemble(&ret->k_i, k); @@ -497,17 +492,12 @@ __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); dst = alloc_v4_backpointers(&ret->v); - memmove(dst, src, BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v) * - sizeof(struct bch_backpointer)); if (src < dst) memset(src, 0, dst - src); + + SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0); set_alloc_v4_u64s(ret); } else { - ret = bch2_trans_kmalloc(trans, sizeof(struct bkey_i_alloc_v4) + - sizeof(struct bch_backpointer)); - if (IS_ERR(ret)) - return ret; - bkey_alloc_v4_init(&ret->k_i); ret->k.p = k.k->p; bch2_alloc_to_v4(k, &ret->v); @@ -517,8 +507,12 @@ __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k) { + struct bkey_s_c_alloc_v4 a; + if (likely(k.k->type == KEY_TYPE_alloc_v4) && - BCH_ALLOC_V4_BACKPOINTERS_START(bkey_s_c_to_alloc_v4(k).v) == BCH_ALLOC_V4_U64s) { + ((a = bkey_s_c_to_alloc_v4(k), true) && + BCH_ALLOC_V4_BACKPOINTERS_START(a.v) == BCH_ALLOC_V4_U64s && + BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0)) { /* * Reserve space for one more backpointer here: * Not sketchy at doing it this way, nope... 
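With backpointers no longer stored in alloc keys, consumers walk a bucket's backpointers entirely through the backpointers btree, using a struct bpos cursor in place of the old u64 bucket offset. The sketch below is not part of the patch; it only illustrates that iteration pattern as it appears in the updated ec.c and alloc_foreground.c callers further down. The function name and loop body are hypothetical, error and transaction-restart handling is omitted, and it assumes the in-tree bcachefs headers:

/*
 * Illustrative sketch only: visit every backpointer for one bucket via
 * the backpointers btree.  gen < 0 means "don't check the bucket gen",
 * matching the updated bch2_get_next_backpointer() interface.
 */
static int walk_bucket_backpointers(struct btree_trans *trans,
				    struct bpos bucket, int gen)
{
	struct bpos bp_pos = POS_MIN;	/* cursor into the backpointers btree */
	struct bch_backpointer bp;
	int ret;

	while (1) {
		ret = bch2_get_next_backpointer(trans, bucket, gen,
						&bp_pos, &bp, BTREE_ITER_CACHED);
		if (ret)
			return ret;
		if (bpos_eq(bp_pos, SPOS_MAX))	/* iteration finished */
			return 0;

		/* ... look up the pointed-to key via bch2_backpointer_get_key()
		 *     or bch2_backpointer_get_node() using bp_pos/bp here ... */

		bp_pos = bpos_nosnap_successor(bp_pos);
	}
}

Compared with the removed bch2_backpointer_del_by_offset() path, there is no longer a split between backpointers held in the alloc key and those in the btree, so the cursor is simply a btree position.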
@@ -962,10 +956,17 @@ struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, s struct bpos next; bch2_trans_copy_iter(&iter2, iter); - k = bch2_btree_iter_peek_upto(&iter2, - bkey_min(bkey_min(end, - iter->path->l[0].b->key.k.p), - POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1))); + + if (!bpos_eq(iter->path->l[0].b->key.k.p, SPOS_MAX)) + end = bkey_min(end, bpos_nosnap_successor(iter->path->l[0].b->key.k.p)); + + end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1)); + + /* + * btree node min/max is a closed interval, upto takes a half + * open interval: + */ + k = bch2_btree_iter_peek_upto(&iter2, end); next = iter2.pos; bch2_trans_iter_exit(iter->trans, &iter2); @@ -1760,7 +1761,7 @@ static void bch2_do_discards_work(struct work_struct *work) void bch2_do_discards(struct bch_fs *c) { if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) && - !queue_work(system_long_wq, &c->discard_work)) + !queue_work(c->write_ref_wq, &c->discard_work)) bch2_write_ref_put(c, BCH_WRITE_REF_discard); } @@ -1886,11 +1887,12 @@ err: void bch2_do_invalidates(struct bch_fs *c) { if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) && - !queue_work(system_long_wq, &c->invalidate_work)) + !queue_work(c->write_ref_wq, &c->invalidate_work)) bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } -static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) +static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, + unsigned long *last_updated) { struct btree_trans trans; struct btree_iter iter; @@ -1910,6 +1912,12 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) * freespace/need_discard/need_gc_gens btrees as needed: */ while (1) { + if (*last_updated + HZ * 10 < jiffies) { + bch_info(ca, "%s: currently at %llu/%llu", + __func__, iter.pos.offset, ca->mi.nbuckets); + *last_updated = jiffies; + } + bch2_trans_begin(&trans); if (bkey_ge(iter.pos, end)) { @@ -1989,6 +1997,7 @@ int bch2_fs_freespace_init(struct bch_fs *c) unsigned i; int ret = 0; bool doing_init = false; + unsigned long last_updated = jiffies; /* * We can crash during the device add path, so we need to check this on @@ -2004,7 +2013,7 @@ int bch2_fs_freespace_init(struct bch_fs *c) doing_init = true; } - ret = bch2_dev_freespace_init(c, ca); + ret = bch2_dev_freespace_init(c, ca, &last_updated); if (ret) { percpu_ref_put(&ca->ref); return ret; diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index d52f30ac..350635f3 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -345,17 +345,17 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { struct bch_backpointer bp; - u64 bp_offset = 0; + struct bpos bp_pos = POS_MIN; ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1, - &bp_offset, &bp, + &bp_pos, &bp, BTREE_ITER_NOPRESERVE); if (ret) { ob = ERR_PTR(ret); goto err; } - if (bp_offset != U64_MAX) { + if (!bkey_eq(bp_pos, POS_MAX)) { /* * Bucket may have data in it - we don't call * bc2h_trans_inconnsistent() because fsck hasn't diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c index 740084b3..a3a1ed6e 100644 --- a/libbcachefs/backpointers.c +++ b/libbcachefs/backpointers.c @@ -69,6 +69,10 @@ void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { + prt_str(out, 
"bucket="); + bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p)); + prt_str(out, " "); + bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); } @@ -81,117 +85,6 @@ void bch2_backpointer_swab(struct bkey_s k) bch2_bpos_swab(&bp.v->pos); } -#define BACKPOINTER_OFFSET_MAX ((1ULL << 40) - 1) - -static inline int backpointer_cmp(struct bch_backpointer l, struct bch_backpointer r) -{ - return cmp_int(l.bucket_offset, r.bucket_offset); -} - -static int bch2_backpointer_del_by_offset(struct btree_trans *trans, - struct bpos bucket, - u64 bp_offset, - struct bch_backpointer bp) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - if (bp_offset < BACKPOINTER_OFFSET_MAX) { - struct bch_backpointer *bps; - struct bkey_i_alloc_v4 *a; - unsigned i, nr; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - bucket, - BTREE_ITER_INTENT| - BTREE_ITER_SLOTS| - BTREE_ITER_WITH_UPDATES); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_alloc_v4) { - ret = -ENOENT; - goto err; - } - - a = bch2_alloc_to_v4_mut(trans, k); - ret = PTR_ERR_OR_ZERO(a); - if (ret) - goto err; - bps = alloc_v4_backpointers(&a->v); - nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v); - - for (i = 0; i < nr; i++) { - if (bps[i].bucket_offset == bp_offset) - goto found; - if (bps[i].bucket_offset > bp_offset) - break; - } - - ret = -ENOENT; - goto err; -found: - if (memcmp(&bps[i], &bp, sizeof(bp))) { - ret = -ENOENT; - goto err; - } - array_remove_item(bps, nr, i); - SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr); - set_alloc_v4_u64s(a); - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); - } else { - bp_offset -= BACKPOINTER_OFFSET_MAX; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_backpointers, - bucket_pos_to_bp(c, bucket, bp_offset), - BTREE_ITER_INTENT| - BTREE_ITER_SLOTS| - BTREE_ITER_WITH_UPDATES); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_backpointer || - memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp))) { - ret = -ENOENT; - goto err; - } - - ret = bch2_btree_delete_at(trans, &iter, 0); - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -bool bch2_bucket_backpointer_del(struct btree_trans *trans, - struct bkey_i_alloc_v4 *a, - struct bch_backpointer bp) -{ - struct bch_backpointer *bps = alloc_v4_backpointers(&a->v); - unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v); - - for (i = 0; i < nr; i++) { - int cmp = backpointer_cmp(bps[i], bp) ?: - memcmp(&bps[i], &bp, sizeof(bp)); - if (!cmp) { - array_remove_item(bps, nr, i); - SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr); - set_alloc_v4_u64s(a); - return true; - } - if (cmp >= 0) - break; - } - - return false; -} - static noinline int backpointer_mod_err(struct btree_trans *trans, struct bch_backpointer bp, struct bkey_s_c bp_k, @@ -245,7 +138,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, } int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, - struct bkey_i_alloc_v4 *a, + struct bpos bucket, struct bch_backpointer bp, struct bkey_s_c orig_k, bool insert) @@ -262,7 +155,7 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, return ret; bkey_backpointer_init(&bp_k->k_i); - bp_k->k.p = bucket_pos_to_bp(c, a->k.p, bp.bucket_offset); + bp_k->k.p = bucket_pos_to_bp(c, bucket, bp.bucket_offset); bp_k->v = bp; if (!insert) { @@ -271,7 +164,7 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, } 
bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp(c, a->k.p, bp.bucket_offset), + bp_k->k.p, BTREE_ITER_INTENT| BTREE_ITER_SLOTS| BTREE_ITER_WITH_UPDATES); @@ -298,94 +191,62 @@ err: /* * Find the next backpointer >= *bp_offset: */ -int __bch2_get_next_backpointer(struct btree_trans *trans, - struct bpos bucket, int gen, - u64 *bp_offset, - struct bpos *bp_pos_ret, - struct bch_backpointer *dst, - unsigned iter_flags) +int bch2_get_next_backpointer(struct btree_trans *trans, + struct bpos bucket, int gen, + struct bpos *bp_pos, + struct bch_backpointer *bp, + unsigned iter_flags) { struct bch_fs *c = trans->c; - struct bpos bp_pos, bp_end_pos; - struct btree_iter alloc_iter, bp_iter = { NULL }; + struct bpos bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0); + struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL }; struct bkey_s_c k; - struct bkey_s_c_alloc_v4 a; - size_t i; - int ret; + int ret = 0; - if (*bp_offset == U64_MAX) - return 0; - - bp_pos = bucket_pos_to_bp(c, bucket, - max(*bp_offset, BACKPOINTER_OFFSET_MAX) - BACKPOINTER_OFFSET_MAX); - bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0); - - bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, - bucket, BTREE_ITER_CACHED); - k = bch2_btree_iter_peek_slot(&alloc_iter); - ret = bkey_err(k); - if (ret) - goto out; - - if (k.k->type != KEY_TYPE_alloc_v4) + if (bpos_ge(*bp_pos, bp_end_pos)) goto done; - a = bkey_s_c_to_alloc_v4(k); - if (gen >= 0 && a.v->gen != gen) - goto done; + if (gen >= 0) { + bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, + bucket, BTREE_ITER_CACHED|iter_flags); + k = bch2_btree_iter_peek_slot(&alloc_iter); + ret = bkey_err(k); + if (ret) + goto out; - for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) { - if (alloc_v4_backpointers_c(a.v)[i].bucket_offset < *bp_offset) - continue; - - *dst = alloc_v4_backpointers_c(a.v)[i]; - *bp_offset = dst->bucket_offset; - goto out; + if (k.k->type != KEY_TYPE_alloc_v4 || + bkey_s_c_to_alloc_v4(k).v->gen != gen) + goto done; } + *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(c, bucket, 0)); + for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers, - bp_pos, 0, k, ret) { + *bp_pos, iter_flags, k, ret) { if (bpos_ge(k.k->p, bp_end_pos)) break; - if (k.k->type != KEY_TYPE_backpointer) - continue; - - *dst = *bkey_s_c_to_backpointer(k).v; - *bp_offset = dst->bucket_offset + BACKPOINTER_OFFSET_MAX; - *bp_pos_ret = k.k->p; + *bp_pos = k.k->p; + *bp = *bkey_s_c_to_backpointer(k).v; goto out; } done: - *bp_offset = U64_MAX; + *bp_pos = SPOS_MAX; out: bch2_trans_iter_exit(trans, &bp_iter); bch2_trans_iter_exit(trans, &alloc_iter); return ret; } -int bch2_get_next_backpointer(struct btree_trans *trans, - struct bpos bucket, int gen, - u64 *bp_offset, - struct bch_backpointer *dst, - unsigned iter_flags) -{ - struct bpos bp_pos; - - return __bch2_get_next_backpointer(trans, bucket, gen, - bp_offset, &bp_pos, - dst, iter_flags); -} - static void backpointer_not_found(struct btree_trans *trans, - struct bpos bucket, - u64 bp_offset, + struct bpos bp_pos, struct bch_backpointer bp, struct bkey_s_c k, const char *thing_it_points_to) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; + struct bpos bucket = bp_pos_to_bucket(c, bp_pos); if (likely(!bch2_backpointers_no_use_write_buffer)) return; @@ -396,14 +257,9 @@ static void backpointer_not_found(struct btree_trans *trans, bch2_bpos_to_text(&buf, bucket); prt_printf(&buf, "\n "); - if (bp_offset >= BACKPOINTER_OFFSET_MAX) { - 
struct bpos bp_pos = - bucket_pos_to_bp(c, bucket, - bp_offset - BACKPOINTER_OFFSET_MAX); - prt_printf(&buf, "backpointer pos: "); - bch2_bpos_to_text(&buf, bp_pos); - prt_printf(&buf, "\n "); - } + prt_printf(&buf, "backpointer pos: "); + bch2_bpos_to_text(&buf, bp_pos); + prt_printf(&buf, "\n "); bch2_backpointer_to_text(&buf, &bp); prt_printf(&buf, "\n "); @@ -418,11 +274,12 @@ static void backpointer_not_found(struct btree_trans *trans, struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, struct btree_iter *iter, - struct bpos bucket, - u64 bp_offset, - struct bch_backpointer bp) + struct bpos bp_pos, + struct bch_backpointer bp, + unsigned iter_flags) { struct bch_fs *c = trans->c; + struct bpos bucket = bp_pos_to_bucket(c, bp_pos); struct bkey_s_c k; bch2_trans_node_iter_init(trans, iter, @@ -430,7 +287,7 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, bp.pos, 0, min(bp.level, c->btree_roots[bp.btree_id].level), - 0); + iter_flags); k = bch2_btree_iter_peek_slot(iter); if (bkey_err(k)) { bch2_trans_iter_exit(trans, iter); @@ -455,7 +312,7 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, * been written out yet - backpointer_get_node() checks for * this: */ - b = bch2_backpointer_get_node(trans, iter, bucket, bp_offset, bp); + b = bch2_backpointer_get_node(trans, iter, bp_pos, bp); if (!IS_ERR_OR_NULL(b)) return bkey_i_to_s_c(&b->key); @@ -466,7 +323,7 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, return bkey_s_c_null; } - backpointer_not_found(trans, bucket, bp_offset, bp, k, "extent"); + backpointer_not_found(trans, bp_pos, bp, k, "extent"); } return bkey_s_c_null; @@ -474,11 +331,11 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, struct btree *bch2_backpointer_get_node(struct btree_trans *trans, struct btree_iter *iter, - struct bpos bucket, - u64 bp_offset, + struct bpos bp_pos, struct bch_backpointer bp) { struct bch_fs *c = trans->c; + struct bpos bucket = bp_pos_to_bucket(c, bp_pos); struct btree *b; BUG_ON(!bp.level); @@ -501,7 +358,7 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, if (b && btree_node_will_make_reachable(b)) { b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); } else { - backpointer_not_found(trans, bucket, bp_offset, bp, + backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key), "btree node"); b = NULL; } @@ -570,7 +427,7 @@ struct bpos_level { }; static int check_bp_exists(struct btree_trans *trans, - struct bpos bucket_pos, + struct bpos bucket, struct bch_backpointer bp, struct bkey_s_c orig_k, struct bpos bucket_start, @@ -578,40 +435,20 @@ static int check_bp_exists(struct btree_trans *trans, struct bpos_level *last_flushed) { struct bch_fs *c = trans->c; - struct btree_iter alloc_iter, bp_iter = { NULL }; + struct btree_iter bp_iter = { NULL }; struct printbuf buf = PRINTBUF; - struct bkey_s_c alloc_k, bp_k; + struct bkey_s_c bp_k; int ret; - if (bpos_lt(bucket_pos, bucket_start) || - bpos_gt(bucket_pos, bucket_end)) + if (bpos_lt(bucket, bucket_start) || + bpos_gt(bucket, bucket_end)) return 0; - bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, bucket_pos, 0); - alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); - ret = bkey_err(alloc_k); - if (ret) - goto err; - - if (alloc_k.k->type == KEY_TYPE_alloc_v4) { - struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(alloc_k); - const struct bch_backpointer *bps = alloc_v4_backpointers_c(a.v); - unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); - - for (i = 
0; i < nr; i++) { - int cmp = backpointer_cmp(bps[i], bp) ?: - memcmp(&bps[i], &bp, sizeof(bp)); - if (!cmp) - goto out; - if (cmp >= 0) - break; - } - } else { + if (!bch2_dev_bucket_exists(c, bucket)) goto missing; - } bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp(c, bucket_pos, bp.bucket_offset), + bucket_pos_to_bp(c, bucket, bp.bucket_offset), 0); bp_k = bch2_btree_iter_peek_slot(&bp_iter); ret = bkey_err(bp_k); @@ -635,11 +472,9 @@ out: err: fsck_err: bch2_trans_iter_exit(trans, &bp_iter); - bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; missing: - prt_printf(&buf, "missing backpointer for btree=%s l=%u ", bch2_btree_ids[bp.btree_id], bp.level); bch2_bkey_val_to_text(&buf, c, orig_k); @@ -648,12 +483,8 @@ missing: if (c->sb.version < bcachefs_metadata_version_backpointers || c->opts.reconstruct_alloc || - fsck_err(c, "%s", buf.buf)) { - struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, alloc_k); - - ret = PTR_ERR_OR_ZERO(a) ?: - bch2_bucket_backpointer_mod(trans, a, bp, orig_k, true); - } + fsck_err(c, "%s", buf.buf)) + ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); goto out; } @@ -952,53 +783,40 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) } static int check_one_backpointer(struct btree_trans *trans, - struct bpos bucket, - u64 *bp_offset, struct bbpos start, struct bbpos end, + struct bkey_s_c_backpointer bp, struct bpos *last_flushed_pos) { struct bch_fs *c = trans->c; struct btree_iter iter; - struct bch_backpointer bp; - struct bbpos pos; - struct bpos bp_pos; + struct bbpos pos = bp_to_bbpos(*bp.v); struct bkey_s_c k; struct printbuf buf = PRINTBUF; int ret; - ret = __bch2_get_next_backpointer(trans, bucket, -1, bp_offset, &bp_pos, &bp, 0); - if (ret || *bp_offset == U64_MAX) - return ret; - - pos = bp_to_bbpos(bp); if (bbpos_cmp(pos, start) < 0 || bbpos_cmp(pos, end) > 0) return 0; - k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp); + k = bch2_backpointer_get_key(trans, &iter, bp.k->p, *bp.v, 0); ret = bkey_err(k); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) return 0; if (ret) return ret; - if (!k.k && !bpos_eq(*last_flushed_pos, bp_pos)) { - *last_flushed_pos = bp_pos; + if (!k.k && !bpos_eq(*last_flushed_pos, bp.k->p)) { + *last_flushed_pos = bp.k->p; ret = bch2_btree_write_buffer_flush_sync(trans) ?: -BCH_ERR_transaction_restart_write_buffer_flush; goto out; } if (fsck_err_on(!k.k, c, - "backpointer for %llu:%llu:%llu (btree pos %llu:%llu) points to missing extent\n %s", - bucket.inode, bucket.offset, (u64) bp.bucket_offset, - bp_pos.inode, bp_pos.offset, - (bch2_backpointer_to_text(&buf, &bp), buf.buf))) { - ret = bch2_backpointer_del_by_offset(trans, bucket, *bp_offset, bp); - if (ret == -ENOENT) - bch_err(c, "backpointer at %llu not found", *bp_offset); - } + "backpointer for missing extent\n %s", + (bch2_backpointer_k_to_text(&buf, c, bp.s_c), buf.buf))) + return bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); out: fsck_err: bch2_trans_iter_exit(trans, &iter); @@ -1013,25 +831,13 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, struct btree_iter iter; struct bkey_s_c k; struct bpos last_flushed_pos = SPOS_MAX; - int ret = 0; - for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - u64 bp_offset = 0; - - while (!(ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - check_one_backpointer(trans, iter.pos, &bp_offset, - 
start, end, &last_flushed_pos))) && - bp_offset < U64_MAX) - bp_offset++; - - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - return ret < 0 ? ret : 0; + return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, + POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_one_backpointer(trans, start, end, + bkey_s_c_to_backpointer(k), + &last_flushed_pos)); } int bch2_check_backpointers_to_extents(struct bch_fs *c) diff --git a/libbcachefs/backpointers.h b/libbcachefs/backpointers.h index d0ba5d85..9c03709a 100644 --- a/libbcachefs/backpointers.h +++ b/libbcachefs/backpointers.h @@ -53,16 +53,11 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, return ret; } -bool bch2_bucket_backpointer_del(struct btree_trans *, - struct bkey_i_alloc_v4 *, - struct bch_backpointer); - -int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, - struct bkey_i_alloc_v4 *, +int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bpos, struct bch_backpointer, struct bkey_s_c, bool); static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, - struct bkey_i_alloc_v4 *a, + struct bpos bucket, struct bch_backpointer bp, struct bkey_s_c orig_k, bool insert) @@ -71,13 +66,8 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, struct bkey_i_backpointer *bp_k; int ret; - if (!insert && - unlikely(BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v)) && - bch2_bucket_backpointer_del(trans, a, bp)) - return 0; - if (unlikely(bch2_backpointers_no_use_write_buffer)) - return bch2_bucket_backpointer_mod_nowritebuffer(trans, a, bp, orig_k, insert); + return bch2_bucket_backpointer_mod_nowritebuffer(trans, bucket, bp, orig_k, insert); bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); ret = PTR_ERR_OR_ZERO(bp_k); @@ -85,7 +75,7 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, return ret; bkey_backpointer_init(&bp_k->k_i); - bp_k->k.p = bucket_pos_to_bp(c, a->k.p, bp.bucket_offset); + bp_k->k.p = bucket_pos_to_bp(c, bucket, bp.bucket_offset); bp_k->v = bp; if (!insert) { @@ -126,11 +116,12 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, } int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int, - u64 *, struct bch_backpointer *, unsigned); + struct bpos *, struct bch_backpointer *, unsigned); struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, - struct bpos, u64, struct bch_backpointer); + struct bpos, struct bch_backpointer, + unsigned); struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *, - struct bpos, u64, struct bch_backpointer); + struct bpos, struct bch_backpointer); int bch2_check_btree_backpointers(struct bch_fs *); int bch2_check_extents_to_backpointers(struct bch_fs *); diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 348ee8e8..1e7c810d 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -629,18 +629,6 @@ struct btree_path_buf { #define REPLICAS_DELTA_LIST_MAX (1U << 16) -struct snapshot_t { - u32 parent; - u32 children[2]; - u32 subvol; /* Nonzero only if a subvolume points to this node: */ - u32 equiv; -}; - -typedef struct { - u32 subvol; - u64 inum; -} subvol_inum; - #define BCACHEFS_ROOT_SUBVOL_INUM \ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) @@ -808,6 +796,12 @@ struct bch_fs { struct workqueue_struct *btree_io_complete_wq; /* copygc needs its own workqueue for index updates.. 
*/ struct workqueue_struct *copygc_wq; + /* + * Use a dedicated wq for write ref holder tasks. Required to avoid + * dependency problems with other wq tasks that can block on ref + * draining, such as read-only transition. + */ + struct workqueue_struct *write_ref_wq; /* ALLOCATION */ struct bch_devs_mask rw_devs[BCH_DATA_NR]; @@ -937,6 +931,7 @@ struct bch_fs { /* COPYGC */ struct task_struct *copygc_thread; struct write_point copygc_write_point; + s64 copygc_wait_at; s64 copygc_wait; bool copygc_running; wait_queue_head_t copygc_running_wq; @@ -971,6 +966,10 @@ struct bch_fs { reflink_gc_table reflink_gc_table; size_t reflink_gc_nr; + /* fs.c */ + struct list_head vfs_inodes_list; + struct mutex vfs_inodes_lock; + /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; struct bio_set dio_write_bioset; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 8b29e462..7d1c0b1e 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -1554,7 +1554,8 @@ struct bch_sb_field_journal_seq_blacklist { x(unwritten_extents, 24) \ x(bucket_gens, 25) \ x(lru_v2, 26) \ - x(fragmentation_lru, 27) + x(fragmentation_lru, 27) \ + x(no_bps_in_alloc_keys, 28) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index a728e990..fb4226aa 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -572,15 +572,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); - if (c->opts.reconstruct_alloc || - fsck_err_on(!g->gen_valid, c, - "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { + if (!g->gen_valid && + (c->opts.reconstruct_alloc || + fsck_err(c, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { if (!p.ptr.cached) { g->gen_valid = true; g->gen = p.ptr.gen; @@ -589,14 +589,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id } } - if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, c, - "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { + if (gen_cmp(p.ptr.gen, g->gen) > 0 && + (c->opts.reconstruct_alloc || + fsck_err(c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { if (!p.ptr.cached) { g->gen_valid = true; g->gen = p.ptr.gen; @@ -609,25 +610,26 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id } } - if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c, - "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - 
bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) + if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX && + (c->opts.reconstruct_alloc || + fsck_err(c, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) do_update = true; - if (fsck_err_on(!p.ptr.cached && - gen_cmp(p.ptr.gen, g->gen) < 0, c, - "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) + if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 && + (c->opts.reconstruct_alloc || + fsck_err(c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) do_update = true; if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) @@ -757,7 +759,7 @@ found: if (level) bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new); - if (c->opts.verbose) { + if (0) { printbuf_reset(&buf); bch2_bkey_val_to_text(&buf, c, *k); bch_info(c, "updated %s", buf.buf); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 8f7d3769..f524e4b3 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -2722,12 +2722,12 @@ static inline void btree_path_list_add(struct btree_trans *trans, void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) { - if (iter->path) - bch2_path_put(trans, iter->path, - iter->flags & BTREE_ITER_INTENT); if (iter->update_path) bch2_path_put_nokeep(trans, iter->update_path, iter->flags & BTREE_ITER_INTENT); + if (iter->path) + bch2_path_put(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); if (iter->key_cache_path) bch2_path_put(trans, iter->key_cache_path, iter->flags & BTREE_ITER_INTENT); diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index 46fb4a9e..4adb6f64 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -60,6 +60,7 @@ enum btree_insert_flags { int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, unsigned, unsigned); int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); +int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos); int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, struct bkey_i *, enum btree_update_flags); @@ -94,8 +95,8 @@ void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *, unsigned); -int bch2_trans_log_msg(struct btree_trans *, const char *, ...); int bch2_fs_log_msg(struct bch_fs *, const char *, ...); +int bch2_journal_log_msg(struct bch_fs *, const char *, ...); /** * bch2_trans_commit - insert keys at given iterator positions diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 4d63c4d7..e42e8521 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -11,6 +11,7 @@ #include "btree_iter.h" #include "btree_locking.h" #include "buckets.h" +#include "clock.h" 
#include "error.h" #include "extents.h" #include "journal.h" @@ -363,6 +364,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, BUG_ON(ret); trace_and_count(c, btree_node_alloc, c, b); + bch2_increment_clock(c, btree_sectors(c), WRITE); return b; } @@ -686,7 +688,8 @@ err: bch2_trans_unlock(&trans); btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent); mark_btree_node_locked(&trans, path, b->c.level, SIX_LOCK_intent); - bch2_btree_path_level_init(&trans, path, b); + path->l[b->c.level].lock_seq = b->c.lock.state.seq; + path->l[b->c.level].b = b; bch2_btree_node_lock_write_nofail(&trans, path, &b->c); @@ -1677,7 +1680,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t BUG_ON(!as || as->b); bch2_verify_keylist_sorted(keys); - if (!(local_clock() & 63)) + if ((local_clock() & 63) == 63) return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); ret = bch2_btree_node_lock_write(trans, path, &b->c); @@ -1717,7 +1720,7 @@ split: * bch2_btree_path_upgrade() and allocating more nodes: */ if (b->c.level >= as->update_level) { - trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_); + trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b); return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); } diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index de98d760..c17d048b 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -622,14 +622,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, prefetch(&trans->c->journal.flags); - h = trans->hooks; - while (h) { - ret = h->fn(trans, h); - if (ret) - return ret; - h = h->next; - } - trans_for_each_update(trans, i) { /* Multiple inserts might go to same leaf: */ if (!same_leaf_as_prev(trans, i)) @@ -696,6 +688,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, goto revert_fs_usage; } + h = trans->hooks; + while (h) { + ret = h->fn(trans, h); + if (ret) + goto revert_fs_usage; + h = h->next; + } + trans_for_each_update(trans, i) if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { ret = run_one_mem_trigger(trans, i, i->flags); @@ -1426,10 +1426,15 @@ int bch2_trans_update_extent(struct btree_trans *trans, update->k.p = k.k->p; update->k.p.snapshot = insert->k.p.snapshot; - if (insert->k.p.snapshot != k.k->p.snapshot || - (btree_type_has_snapshots(btree_id) && - need_whiteout_for_snapshot(trans, btree_id, update->k.p))) + if (insert->k.p.snapshot != k.k->p.snapshot) { update->k.type = KEY_TYPE_whiteout; + } else if (btree_type_has_snapshots(btree_id)) { + ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p); + if (ret < 0) + goto err; + if (ret) + update->k.type = KEY_TYPE_whiteout; + } ret = bch2_btree_insert_nonextent(trans, btree_id, update, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); @@ -1797,6 +1802,20 @@ int bch2_btree_delete_at(struct btree_trans *trans, return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); } +int bch2_btree_delete_at_buffered(struct btree_trans *trans, + enum btree_id btree, struct bpos pos) +{ + struct bkey_i *k; + + k = bch2_trans_kmalloc(trans, sizeof(*k)); + if (IS_ERR(k)) + return PTR_ERR(k); + + bkey_init(&k->k); + k->k.p = pos; + return bch2_trans_update_buffered(trans, btree, k); +} + int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, struct bpos start, struct bpos end, unsigned update_flags, @@ -1919,14 +1938,19 @@ err: return ret; } 
-int bch2_trans_log_msg(struct btree_trans *trans, const char *fmt, ...) +static int +__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, + va_list args) { - va_list args; int ret; - va_start(args, fmt); - ret = __bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args); - va_end(args); + if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { + ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args); + } else { + ret = bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_LAZY_RW|commit_flags, + __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args)); + } return ret; } @@ -1937,16 +1961,22 @@ int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) int ret; va_start(args, fmt); - - if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { - ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args); - } else { - ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, - __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args)); - } - + ret = __bch2_fs_log_msg(c, 0, fmt, args); + va_end(args); + return ret; +} + +/* + * Use for logging messages during recovery to enable reserved space and avoid + * blocking. + */ +int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...) +{ + va_list args; + int ret; + + va_start(args, fmt); + ret = __bch2_fs_log_msg(c, JOURNAL_WATERMARK_reserved, fmt, args); va_end(args); - return ret; - } diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c index 80f4b983..9983a478 100644 --- a/libbcachefs/btree_write_buffer.c +++ b/libbcachefs/btree_write_buffer.c @@ -109,9 +109,9 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f struct journal *j = &c->journal; struct btree_write_buffer *wb = &c->btree_write_buffer; struct journal_entry_pin pin; - struct btree_write_buffered_key *i, *dst, *keys; + struct btree_write_buffered_key *i, *keys; struct btree_iter iter = { NULL }; - size_t nr = 0, skipped = 0, fast = 0; + size_t nr = 0, skipped = 0, fast = 0, slowpath = 0; bool write_locked = false; union btree_write_buffer_state s; int ret = 0; @@ -135,15 +135,13 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f * * However, since we're not flushing in the order they appear in the * journal we won't be able to drop our journal pin until everything is - * flushed - which means this could deadlock the journal, if we weren't - * passing BTREE_INSERT_JORUNAL_RECLAIM. This causes the update to fail + * flushed - which means this could deadlock the journal if we weren't + * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail * if it would block taking a journal reservation. * - * If that happens, we sort them by the order they appeared in the - * journal - after dropping redundant entries - and then restart - * flushing, this time dropping journal pins as we go. + * If that happens, simply skip the key so we can optimistically insert + * as many keys as possible in the fast path. 
*/ - sort(keys, nr, sizeof(keys[0]), btree_write_buffered_key_cmp, NULL); @@ -152,6 +150,7 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f i[0].btree == i[1].btree && bpos_eq(i[0].k.k.p, i[1].k.k.p)) { skipped++; + i->journal_seq = 0; continue; } @@ -177,8 +176,14 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f bch2_trans_begin(trans); } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); + if (ret == -BCH_ERR_journal_reclaim_would_deadlock) { + slowpath++; + continue; + } if (ret) break; + + i->journal_seq = 0; } if (write_locked) @@ -187,7 +192,7 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f trace_write_buffer_flush(trans, nr, skipped, fast, wb->size); - if (ret == -BCH_ERR_journal_reclaim_would_deadlock) + if (slowpath) goto slowpath; bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)); @@ -198,23 +203,19 @@ out: slowpath: trace_write_buffer_flush_slowpath(trans, i - keys, nr); - dst = keys; - for (; i < keys + nr; i++) { - if (i + 1 < keys + nr && - i[0].btree == i[1].btree && - bpos_eq(i[0].k.k.p, i[1].k.k.p)) - continue; - - *dst = *i; - dst++; - } - nr = dst - keys; - + /* + * Now sort the rest by journal seq and bump the journal pin as we go. + * The slowpath zapped the seq of keys that were successfully flushed so + * we can skip those here. + */ sort(keys, nr, sizeof(keys[0]), btree_write_buffered_journal_cmp, NULL); for (i = keys; i < keys + nr; i++) { + if (!i->journal_seq) + continue; + if (i->journal_seq > pin.seq) { struct journal_entry_pin pin2; diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 9f2ecff5..0362e10e 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -1407,17 +1407,17 @@ static inline int bch2_trans_mark_pointer(struct btree_trans *trans, bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); struct btree_iter iter; struct bkey_i_alloc_v4 *a; - struct bpos bucket_pos; + struct bpos bucket; struct bch_backpointer bp; s64 sectors; int ret; - bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket_pos, &bp); + bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp); sectors = bp.bucket_len; if (!insert) sectors = -sectors; - a = bch2_trans_start_alloc_update(trans, &iter, bucket_pos); + a = bch2_trans_start_alloc_update(trans, &iter, bucket); if (IS_ERR(a)) return PTR_ERR(a); @@ -1428,7 +1428,7 @@ static inline int bch2_trans_mark_pointer(struct btree_trans *trans, goto err; if (!p.ptr.cached) { - ret = bch2_bucket_backpointer_mod(trans, a, bp, k, insert); + ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert); if (ret) goto err; } diff --git a/libbcachefs/darray.h b/libbcachefs/darray.h index 519ab9b9..d4485fa0 100644 --- a/libbcachefs/darray.h +++ b/libbcachefs/darray.h @@ -19,11 +19,11 @@ struct { \ typedef DARRAY(void) darray_void; -static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more) +static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, gfp_t gfp) { if (d->nr + more > d->size) { size_t new_size = roundup_pow_of_two(d->nr + more); - void *data = krealloc_array(d->data, new_size, t_size, GFP_KERNEL); + void *data = krealloc_array(d->data, new_size, t_size, gfp); if (!data) return -ENOMEM; @@ -35,20 +35,30 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more) return 0; } +#define darray_make_room_gfp(_d, _more, _gfp) \ + __darray_make_room((darray_void *) (_d), 
sizeof((_d)->data[0]), (_more), _gfp) + #define darray_make_room(_d, _more) \ - __darray_make_room((darray_void *) (_d), sizeof((_d)->data[0]), (_more)) + darray_make_room_gfp(_d, _more, GFP_KERNEL) #define darray_top(_d) ((_d).data[(_d).nr]) -#define darray_push(_d, _item) \ +#define darray_push_gfp(_d, _item, _gfp) \ ({ \ - int _ret = darray_make_room((_d), 1); \ + int _ret = darray_make_room_gfp((_d), 1, _gfp); \ \ if (!_ret) \ (_d)->data[(_d)->nr++] = (_item); \ _ret; \ }) +#define darray_push(_d, _item) darray_push_gfp(_d, _item, GFP_KERNEL) + +#define darray_pop(_d) ((_d)->data[--(_d)->nr]) + +#define darray_first(_d) ((_d).data[0]) +#define darray_last(_d) ((_d).data[(_d).nr - 1]) + #define darray_insert_item(_d, _pos, _item) \ ({ \ size_t pos = (_pos); \ diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c index e414d1af..de808fcc 100644 --- a/libbcachefs/data_update.c +++ b/libbcachefs/data_update.c @@ -163,7 +163,11 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, if (((1U << i) & m->data_opts.rewrite_ptrs) && (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && !ptr->cached) { + bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr); + /* + * See comment below: bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr); + */ rewrites_found |= 1U << i; } i++; @@ -205,7 +209,14 @@ restart_drop_extra_replicas: if (!p.ptr.cached && durability - ptr_durability >= m->op.opts.data_replicas) { durability -= ptr_durability; + bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), &entry->ptr); + /* + * Currently, we're dropping unneeded replicas + * instead of marking them as cached, since + * cached data in stripe buckets prevents them + * from being reused: bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr); + */ goto restart_drop_extra_replicas; } } diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 1e621dcc..1855d08e 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -826,7 +826,7 @@ static void ec_stripe_delete_work(struct work_struct *work) void bch2_do_stripe_deletes(struct bch_fs *c) { if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) && - !schedule_work(&c->ec_stripe_delete_work)) + !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work)) bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); } @@ -887,7 +887,7 @@ err: static int ec_stripe_update_extent(struct btree_trans *trans, struct bpos bucket, u8 gen, struct ec_stripe_buf *s, - u64 *bp_offset) + struct bpos *bp_pos) { struct bch_fs *c = trans->c; struct bch_backpointer bp; @@ -900,10 +900,10 @@ static int ec_stripe_update_extent(struct btree_trans *trans, int ret, dev, block; ret = bch2_get_next_backpointer(trans, bucket, gen, - bp_offset, &bp, BTREE_ITER_CACHED); + bp_pos, &bp, BTREE_ITER_CACHED); if (ret) return ret; - if (*bp_offset == U64_MAX) + if (bpos_eq(*bp_pos, SPOS_MAX)) return 0; if (bp.level) { @@ -911,7 +911,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, struct btree_iter node_iter; struct btree *b; - b = bch2_backpointer_get_node(trans, &node_iter, bucket, *bp_offset, bp); + b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp); bch2_trans_iter_exit(trans, &node_iter); if (!b) @@ -925,7 +925,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, return -EIO; } - k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp); + k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_INTENT); ret = bkey_err(k); if (ret) return ret; @@ -984,7 +984,7 @@ static int ec_stripe_update_bucket(struct btree_trans 
*trans, struct ec_stripe_b struct bch_fs *c = trans->c; struct bch_extent_ptr bucket = s->key.v.ptrs[block]; struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket); - u64 bp_offset = 0; + struct bpos bp_pos = POS_MIN; int ret = 0; while (1) { @@ -992,13 +992,13 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL, ec_stripe_update_extent(trans, bucket_pos, bucket.gen, - s, &bp_offset)); + s, &bp_pos)); if (ret) break; - if (bp_offset == U64_MAX) + if (bkey_eq(bp_pos, POS_MAX)) break; - bp_offset++; + bp_pos = bpos_nosnap_successor(bp_pos); } return ret; diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index df2f317f..db138570 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -35,6 +35,61 @@ #include #include +static inline loff_t folio_end_pos(struct folio *folio) +{ + return folio_pos(folio) + folio_size(folio); +} + +static inline size_t folio_sectors(struct folio *folio) +{ + return PAGE_SECTORS << folio_order(folio); +} + +static inline loff_t folio_sector(struct folio *folio) +{ + return folio_pos(folio) >> 9; +} + +static inline loff_t folio_end_sector(struct folio *folio) +{ + return folio_end_pos(folio) >> 9; +} + +typedef DARRAY(struct folio *) folios; + +static int filemap_get_contig_folios_d(struct address_space *mapping, + loff_t start, loff_t end, + int fgp_flags, gfp_t gfp, + folios *folios) +{ + struct folio *f; + loff_t pos = start; + int ret = 0; + + while (pos < end) { + if ((u64) pos >= (u64) start + (1ULL << 20)) + fgp_flags &= ~FGP_CREAT; + + ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL); + if (ret) + break; + + f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); + if (!f) + break; + + BUG_ON(folios->nr && folio_pos(f) != pos); + + pos = folio_end_pos(f); + darray_push(folios, f); + } + + if (!folios->nr && !ret && (fgp_flags & FGP_CREAT)) + ret = -ENOMEM; + + return folios->nr ? 
0 : ret; +} + struct nocow_flush { struct closure *cl; struct bch_dev *ca; @@ -336,7 +391,66 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, /* stored in page->private: */ -struct bch_page_sector { +#define BCH_FOLIO_SECTOR_STATE() \ + x(unallocated) \ + x(reserved) \ + x(dirty) \ + x(dirty_reserved) \ + x(allocated) + +enum bch_folio_sector_state { +#define x(n) SECTOR_##n, + BCH_FOLIO_SECTOR_STATE() +#undef x +}; + +const char * const bch2_folio_sector_states[] = { +#define x(n) #n, + BCH_FOLIO_SECTOR_STATE() +#undef x + NULL +}; + +static inline enum bch_folio_sector_state +folio_sector_dirty(enum bch_folio_sector_state state) +{ + switch (state) { + case SECTOR_unallocated: + return SECTOR_dirty; + case SECTOR_reserved: + return SECTOR_dirty_reserved; + default: + return state; + } +} + +static inline enum bch_folio_sector_state +folio_sector_undirty(enum bch_folio_sector_state state) +{ + switch (state) { + case SECTOR_dirty: + return SECTOR_unallocated; + case SECTOR_dirty_reserved: + return SECTOR_reserved; + default: + return state; + } +} + +static inline enum bch_folio_sector_state +folio_sector_reserve(enum bch_folio_sector_state state) +{ + switch (state) { + case SECTOR_unallocated: + return SECTOR_reserved; + case SECTOR_dirty: + return SECTOR_dirty_reserved; + default: + return state; + } +} + +struct bch_folio_sector { /* Uncompressed, fully allocated replicas (or on disk reservation): */ unsigned nr_replicas:4; @@ -344,109 +458,118 @@ struct bch_page_sector { unsigned replicas_reserved:4; /* i_sectors: */ - enum { - SECTOR_UNALLOCATED, - SECTOR_RESERVED, - SECTOR_DIRTY, - SECTOR_DIRTY_RESERVED, - SECTOR_ALLOCATED, - } state:8; + enum bch_folio_sector_state state:8; }; -struct bch_page_state { +struct bch_folio { spinlock_t lock; atomic_t write_count; + /* + * Is the sector state up to date with the btree? + * (Not the data itself) + */ bool uptodate; - struct bch_page_sector s[PAGE_SECTORS]; + struct bch_folio_sector s[]; }; -static inline struct bch_page_state *__bch2_page_state(struct page *page) +static inline void folio_sector_set(struct folio *folio, + struct bch_folio *s, + unsigned i, unsigned n) { - return page_has_private(page) - ? (struct bch_page_state *) page_private(page) + s->s[i].state = n; +} + +static inline struct bch_folio *__bch2_folio(struct folio *folio) +{ + return folio_has_private(folio) + ? 
(struct bch_folio *) folio_get_private(folio) : NULL; } -static inline struct bch_page_state *bch2_page_state(struct page *page) +static inline struct bch_folio *bch2_folio(struct folio *folio) { - EBUG_ON(!PageLocked(page)); + EBUG_ON(!folio_test_locked(folio)); - return __bch2_page_state(page); + return __bch2_folio(folio); } -/* for newly allocated pages: */ -static void __bch2_page_state_release(struct page *page) +/* for newly allocated folios: */ +static void __bch2_folio_release(struct folio *folio) { - kfree(detach_page_private(page)); + kfree(folio_detach_private(folio)); } -static void bch2_page_state_release(struct page *page) +static void bch2_folio_release(struct folio *folio) { - EBUG_ON(!PageLocked(page)); - __bch2_page_state_release(page); + EBUG_ON(!folio_test_locked(folio)); + __bch2_folio_release(folio); } -/* for newly allocated pages: */ -static struct bch_page_state *__bch2_page_state_create(struct page *page, - gfp_t gfp) +/* for newly allocated folios: */ +static struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) { - struct bch_page_state *s; + struct bch_folio *s; - s = kzalloc(sizeof(*s), GFP_NOFS|gfp); + s = kzalloc(sizeof(*s) + + sizeof(struct bch_folio_sector) * + folio_sectors(folio), GFP_NOFS|gfp); if (!s) return NULL; spin_lock_init(&s->lock); - attach_page_private(page, s); + folio_attach_private(folio, s); return s; } -static struct bch_page_state *bch2_page_state_create(struct page *page, - gfp_t gfp) +static struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp) { - return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp); + return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp); } static unsigned bkey_to_sector_state(struct bkey_s_c k) { if (bkey_extent_is_reservation(k)) - return SECTOR_RESERVED; + return SECTOR_reserved; if (bkey_extent_is_allocation(k.k)) - return SECTOR_ALLOCATED; - return SECTOR_UNALLOCATED; + return SECTOR_allocated; + return SECTOR_unallocated; } -static void __bch2_page_state_set(struct page *page, - unsigned pg_offset, unsigned pg_len, - unsigned nr_ptrs, unsigned state) +static void __bch2_folio_set(struct folio *folio, + unsigned pg_offset, unsigned pg_len, + unsigned nr_ptrs, unsigned state) { - struct bch_page_state *s = bch2_page_state_create(page, __GFP_NOFAIL); - unsigned i; + struct bch_folio *s = bch2_folio_create(folio, __GFP_NOFAIL); + unsigned i, sectors = folio_sectors(folio); - BUG_ON(pg_offset >= PAGE_SECTORS); - BUG_ON(pg_offset + pg_len > PAGE_SECTORS); + BUG_ON(pg_offset >= sectors); + BUG_ON(pg_offset + pg_len > sectors); spin_lock(&s->lock); for (i = pg_offset; i < pg_offset + pg_len; i++) { - s->s[i].nr_replicas = nr_ptrs; - s->s[i].state = state; + s->s[i].nr_replicas = nr_ptrs; + folio_sector_set(folio, s, i, state); } - if (i == PAGE_SECTORS) + if (i == sectors) s->uptodate = true; spin_unlock(&s->lock); } -static int bch2_page_state_set(struct bch_fs *c, subvol_inum inum, - struct page **pages, unsigned nr_pages) +/* + * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the + * extents btree: + */ +static int bch2_folio_set(struct bch_fs *c, subvol_inum inum, + struct folio **folios, unsigned nr_folios) { struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - u64 offset = pages[0]->index << PAGE_SECTORS_SHIFT; - unsigned pg_idx = 0; + u64 offset = folio_sector(folios[0]); + unsigned folio_idx = 0; u32 snapshot; int ret; @@ -464,25 +587,25 @@ retry: unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); unsigned state = 
bkey_to_sector_state(k); - while (pg_idx < nr_pages) { - struct page *page = pages[pg_idx]; - u64 pg_start = page->index << PAGE_SECTORS_SHIFT; - u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; - unsigned pg_offset = max(bkey_start_offset(k.k), pg_start) - pg_start; - unsigned pg_len = min(k.k->p.offset, pg_end) - pg_offset - pg_start; + while (folio_idx < nr_folios) { + struct folio *folio = folios[folio_idx]; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - folio_start; + unsigned folio_len = min(k.k->p.offset, folio_end) - folio_offset - folio_start; - BUG_ON(k.k->p.offset < pg_start); - BUG_ON(bkey_start_offset(k.k) > pg_end); + BUG_ON(k.k->p.offset < folio_start); + BUG_ON(bkey_start_offset(k.k) > folio_end); - if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) - __bch2_page_state_set(page, pg_offset, pg_len, nr_ptrs, state); + if (!bch2_folio_create(folio, __GFP_NOFAIL)->uptodate) + __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); - if (k.k->p.offset < pg_end) + if (k.k->p.offset < folio_end) break; - pg_idx++; + folio_idx++; } - if (pg_idx == nr_pages) + if (folio_idx == nr_folios) break; } @@ -499,14 +622,16 @@ err: static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) { struct bvec_iter iter; - struct bio_vec bv; + struct folio_vec fv; unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); unsigned state = bkey_to_sector_state(k); - bio_for_each_segment(bv, bio, iter) - __bch2_page_state_set(bv.bv_page, bv.bv_offset >> 9, - bv.bv_len >> 9, nr_ptrs, state); + bio_for_each_folio(fv, bio, iter) + __bch2_folio_set(fv.fv_folio, + fv.fv_offset >> 9, + fv.fv_len >> 9, + nr_ptrs, state); } static void mark_pagecache_unallocated(struct bch_inode_info *inode, @@ -526,22 +651,20 @@ static void mark_pagecache_unallocated(struct bch_inode_info *inode, &index, end_index, &fbatch)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; - u64 pg_start = folio->index << PAGE_SECTORS_SHIFT; - u64 pg_end = (folio->index + 1) << PAGE_SECTORS_SHIFT; - unsigned pg_offset = max(start, pg_start) - pg_start; - unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; - struct bch_page_state *s; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(start, folio_start) - folio_start; + unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; + struct bch_folio *s; - BUG_ON(end <= pg_start); - BUG_ON(pg_offset >= PAGE_SECTORS); - BUG_ON(pg_offset + pg_len > PAGE_SECTORS); + BUG_ON(end <= folio_start); folio_lock(folio); - s = bch2_page_state(&folio->page); + s = bch2_folio(folio); if (s) { spin_lock(&s->lock); - for (j = pg_offset; j < pg_offset + pg_len; j++) + for (j = folio_offset; j < folio_offset + folio_len; j++) s->s[j].nr_replicas = 0; spin_unlock(&s->lock); } @@ -572,33 +695,23 @@ static void mark_pagecache_reserved(struct bch_inode_info *inode, &index, end_index, &fbatch)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; - u64 pg_start = folio->index << PAGE_SECTORS_SHIFT; - u64 pg_end = (folio->index + 1) << PAGE_SECTORS_SHIFT; - unsigned pg_offset = max(start, pg_start) - pg_start; - unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; - struct bch_page_state *s; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + 
unsigned folio_offset = max(start, folio_start) - folio_start; + unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; + struct bch_folio *s; - BUG_ON(end <= pg_start); - BUG_ON(pg_offset >= PAGE_SECTORS); - BUG_ON(pg_offset + pg_len > PAGE_SECTORS); + BUG_ON(end <= folio_start); folio_lock(folio); - s = bch2_page_state(&folio->page); + s = bch2_folio(folio); if (s) { spin_lock(&s->lock); - for (j = pg_offset; j < pg_offset + pg_len; j++) - switch (s->s[j].state) { - case SECTOR_UNALLOCATED: - s->s[j].state = SECTOR_RESERVED; - break; - case SECTOR_DIRTY: - s->s[j].state = SECTOR_DIRTY_RESERVED; - i_sectors_delta--; - break; - default: - break; - } + for (j = folio_offset; j < folio_offset + folio_len; j++) { + i_sectors_delta -= s->s[j].state == SECTOR_dirty; + folio_sector_set(folio, s, j, folio_sector_reserve(s->s[j].state)); + } spin_unlock(&s->lock); } @@ -619,28 +732,28 @@ static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info : c->opts.data_replicas; } -static inline unsigned sectors_to_reserve(struct bch_page_sector *s, - unsigned nr_replicas) +static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, + unsigned nr_replicas) { return max(0, (int) nr_replicas - s->nr_replicas - s->replicas_reserved); } -static int bch2_get_page_disk_reservation(struct bch_fs *c, +static int bch2_get_folio_disk_reservation(struct bch_fs *c, struct bch_inode_info *inode, - struct page *page, bool check_enospc) + struct folio *folio, bool check_enospc) { - struct bch_page_state *s = bch2_page_state_create(page, 0); + struct bch_folio *s = bch2_folio_create(folio, 0); unsigned nr_replicas = inode_nr_replicas(c, inode); struct disk_reservation disk_res = { 0 }; - unsigned i, disk_res_sectors = 0; + unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0; int ret; if (!s) return -ENOMEM; - for (i = 0; i < ARRAY_SIZE(s->s); i++) + for (i = 0; i < sectors; i++) disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); if (!disk_res_sectors) @@ -654,41 +767,42 @@ static int bch2_get_page_disk_reservation(struct bch_fs *c, if (unlikely(ret)) return ret; - for (i = 0; i < ARRAY_SIZE(s->s); i++) + for (i = 0; i < sectors; i++) s->s[i].replicas_reserved += sectors_to_reserve(&s->s[i], nr_replicas); return 0; } -struct bch2_page_reservation { +struct bch2_folio_reservation { struct disk_reservation disk; struct quota_res quota; }; -static void bch2_page_reservation_init(struct bch_fs *c, +static void bch2_folio_reservation_init(struct bch_fs *c, struct bch_inode_info *inode, - struct bch2_page_reservation *res) + struct bch2_folio_reservation *res) { memset(res, 0, sizeof(*res)); res->disk.nr_replicas = inode_nr_replicas(c, inode); } -static void bch2_page_reservation_put(struct bch_fs *c, +static void bch2_folio_reservation_put(struct bch_fs *c, struct bch_inode_info *inode, - struct bch2_page_reservation *res) + struct bch2_folio_reservation *res) { bch2_disk_reservation_put(c, &res->disk); bch2_quota_reservation_put(c, inode, &res->quota); } -static int bch2_page_reservation_get(struct bch_fs *c, - struct bch_inode_info *inode, struct page *page, - struct bch2_page_reservation *res, +static int bch2_folio_reservation_get(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, + struct bch2_folio_reservation *res, unsigned offset, unsigned len) { - struct bch_page_state *s = bch2_page_state_create(page, 0); + struct bch_folio *s = bch2_folio_create(folio, 0); unsigned i, disk_sectors = 0, quota_sectors = 0; int ret; @@ -702,7 
+816,7 @@ static int bch2_page_reservation_get(struct bch_fs *c, i++) { disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas); - quota_sectors += s->s[i].state == SECTOR_UNALLOCATED; + quota_sectors += s->s[i].state == SECTOR_unallocated; } if (disk_sectors) { @@ -728,55 +842,49 @@ static int bch2_page_reservation_get(struct bch_fs *c, return 0; } -static void bch2_clear_page_bits(struct page *page) +static void bch2_clear_folio_bits(struct folio *folio) { - struct bch_inode_info *inode = to_bch_ei(page->mapping->host); + struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_page_state *s = bch2_page_state(page); + struct bch_folio *s = bch2_folio(folio); struct disk_reservation disk_res = { 0 }; - int i, dirty_sectors = 0; + int i, sectors = folio_sectors(folio), dirty_sectors = 0; if (!s) return; - EBUG_ON(!PageLocked(page)); - EBUG_ON(PageWriteback(page)); + EBUG_ON(!folio_test_locked(folio)); + EBUG_ON(folio_test_writeback(folio)); - for (i = 0; i < ARRAY_SIZE(s->s); i++) { + for (i = 0; i < sectors; i++) { disk_res.sectors += s->s[i].replicas_reserved; s->s[i].replicas_reserved = 0; - switch (s->s[i].state) { - case SECTOR_DIRTY: - s->s[i].state = SECTOR_UNALLOCATED; - --dirty_sectors; - break; - case SECTOR_DIRTY_RESERVED: - s->s[i].state = SECTOR_RESERVED; - break; - default: - break; - } + dirty_sectors -= s->s[i].state == SECTOR_dirty; + folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state)); } bch2_disk_reservation_put(c, &disk_res); i_sectors_acct(c, inode, NULL, dirty_sectors); - bch2_page_state_release(page); + bch2_folio_release(folio); } -static void bch2_set_page_dirty(struct bch_fs *c, - struct bch_inode_info *inode, struct page *page, - struct bch2_page_reservation *res, +static void bch2_set_folio_dirty(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, + struct bch2_folio_reservation *res, unsigned offset, unsigned len) { - struct bch_page_state *s = bch2_page_state(page); + struct bch_folio *s = bch2_folio(folio); unsigned i, dirty_sectors = 0; - WARN_ON((u64) page_offset(page) + offset + len > + WARN_ON((u64) folio_pos(folio) + offset + len > round_up((u64) i_size_read(&inode->v), block_bytes(c))); + BUG_ON(!s->uptodate); + spin_lock(&s->lock); for (i = round_down(offset, block_bytes(c)) >> 9; @@ -794,25 +902,17 @@ static void bch2_set_page_dirty(struct bch_fs *c, s->s[i].replicas_reserved += sectors; res->disk.sectors -= sectors; - switch (s->s[i].state) { - case SECTOR_UNALLOCATED: - s->s[i].state = SECTOR_DIRTY; - dirty_sectors++; - break; - case SECTOR_RESERVED: - s->s[i].state = SECTOR_DIRTY_RESERVED; - break; - default: - break; - } + dirty_sectors += s->s[i].state == SECTOR_unallocated; + + folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state)); } spin_unlock(&s->lock); i_sectors_acct(c, inode, &res->quota, dirty_sectors); - if (!PageDirty(page)) - filemap_dirty_folio(inode->v.i_mapping, page_folio(page)); + if (!folio_test_dirty(folio)) + filemap_dirty_folio(inode->v.i_mapping, folio); } vm_fault_t bch2_page_fault(struct vm_fault *vmf) @@ -855,17 +955,17 @@ got_lock: vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct file *file = vmf->vma->vm_file; struct bch_inode_info *inode = file_bch_inode(file); struct address_space *mapping = file->f_mapping; struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_page_reservation res; + struct 
bch2_folio_reservation res; unsigned len; loff_t isize; int ret; - bch2_page_reservation_init(c, inode, &res); + bch2_folio_reservation_init(c, inode, &res); sb_start_pagefault(inode->v.i_sb); file_update_time(file); @@ -878,35 +978,35 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) */ bch2_pagecache_add_get(inode); - lock_page(page); + folio_lock(folio); isize = i_size_read(&inode->v); - if (page->mapping != mapping || page_offset(page) >= isize) { - unlock_page(page); + if (folio->mapping != mapping || folio_pos(folio) >= isize) { + folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out; } - len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page)); + len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio)); - if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { - if (bch2_page_state_set(c, inode_inum(inode), &page, 1)) { - unlock_page(page); + if (!bch2_folio_create(folio, __GFP_NOFAIL)->uptodate) { + if (bch2_folio_set(c, inode_inum(inode), &folio, 1)) { + folio_unlock(folio); ret = VM_FAULT_SIGBUS; goto out; } } - if (bch2_page_reservation_get(c, inode, page, &res, 0, len)) { - unlock_page(page); + if (bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) { + folio_unlock(folio); ret = VM_FAULT_SIGBUS; goto out; } - bch2_set_page_dirty(c, inode, page, &res, 0, len); - bch2_page_reservation_put(c, inode, &res); + bch2_set_folio_dirty(c, inode, folio, &res, 0, len); + bch2_folio_reservation_put(c, inode, &res); - wait_for_stable_page(page); + folio_wait_stable(folio); ret = VM_FAULT_LOCKED; out: bch2_pagecache_add_put(inode); @@ -920,7 +1020,7 @@ void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) if (offset || length < folio_size(folio)) return; - bch2_clear_page_bits(&folio->page); + bch2_clear_folio_bits(folio); } bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) @@ -928,7 +1028,7 @@ bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) if (folio_test_dirty(folio) || folio_test_writeback(folio)) return false; - bch2_clear_page_bits(&folio->page); + bch2_clear_folio_bits(folio); return true; } @@ -937,18 +1037,16 @@ bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) static void bch2_readpages_end_io(struct bio *bio) { struct bvec_iter_all iter; - struct bio_vec *bv; - - bio_for_each_segment_all(bv, bio, iter) { - struct page *page = bv->bv_page; + struct folio_vec fv; + bio_for_each_folio_all(fv, bio, iter) { if (!bio->bi_status) { - SetPageUptodate(page); + folio_mark_uptodate(fv.fv_folio); } else { - ClearPageUptodate(page); - SetPageError(page); + folio_clear_uptodate(fv.fv_folio); + folio_set_error(fv.fv_folio); } - unlock_page(page); + folio_unlock(fv.fv_folio); } bio_put(bio); @@ -956,44 +1054,48 @@ static void bch2_readpages_end_io(struct bio *bio) struct readpages_iter { struct address_space *mapping; - struct page **pages; - unsigned nr_pages; unsigned idx; - pgoff_t offset; + folios folios; }; static int readpages_iter_init(struct readpages_iter *iter, struct readahead_control *ractl) { - unsigned i, nr_pages = readahead_count(ractl); + struct folio **fi; + int ret; memset(iter, 0, sizeof(*iter)); - iter->mapping = ractl->mapping; - iter->offset = readahead_index(ractl); - iter->nr_pages = nr_pages; + iter->mapping = ractl->mapping; - iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!iter->pages) - return -ENOMEM; + ret = filemap_get_contig_folios_d(iter->mapping, + ractl->_index << PAGE_SHIFT, + (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT, + 0, 
mapping_gfp_mask(iter->mapping), + &iter->folios); + if (ret) + return ret; - nr_pages = __readahead_batch(ractl, iter->pages, nr_pages); - for (i = 0; i < nr_pages; i++) { - __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL); - put_page(iter->pages[i]); + darray_for_each(iter->folios, fi) { + ractl->_nr_pages -= 1U << folio_order(*fi); + __bch2_folio_create(*fi, __GFP_NOFAIL); + folio_put(*fi); + folio_put(*fi); } return 0; } -static inline struct page *readpage_iter_next(struct readpages_iter *iter) +static inline struct folio *readpage_iter_peek(struct readpages_iter *iter) { - if (iter->idx >= iter->nr_pages) + if (iter->idx >= iter->folios.nr) return NULL; + return iter->folios.data[iter->idx]; +} - EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); - - return iter->pages[iter->idx]; +static inline void readpage_iter_advance(struct readpages_iter *iter) +{ + iter->idx++; } static bool extent_partial_reads_expensive(struct bkey_s_c k) @@ -1015,44 +1117,43 @@ static void readpage_bio_extend(struct readpages_iter *iter, { while (bio_sectors(bio) < sectors_this_extent && bio->bi_vcnt < bio->bi_max_vecs) { - pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; - struct page *page = readpage_iter_next(iter); + struct folio *folio = readpage_iter_peek(iter); int ret; - if (page) { - if (iter->offset + iter->idx != page_offset) - break; - - iter->idx++; + if (folio) { + readpage_iter_advance(iter); } else { + pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; + if (!get_more) break; - page = xa_load(&iter->mapping->i_pages, page_offset); - if (page && !xa_is_value(page)) + folio = xa_load(&iter->mapping->i_pages, folio_offset); + if (folio && !xa_is_value(folio)) break; - page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); - if (!page) + folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0); + if (!folio) break; - if (!__bch2_page_state_create(page, 0)) { - put_page(page); + if (!__bch2_folio_create(folio, 0)) { + folio_put(folio); break; } - ret = add_to_page_cache_lru(page, iter->mapping, - page_offset, GFP_NOFS); + ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_NOFS); if (ret) { - __bch2_page_state_release(page); - put_page(page); + __bch2_folio_release(folio); + folio_put(folio); break; } - put_page(page); + folio_put(folio); } - BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0)); + BUG_ON(folio_sector(folio) != bio_end_sector(bio)); + + BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0)); } } @@ -1170,7 +1271,7 @@ void bch2_readahead(struct readahead_control *ractl) struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts; struct btree_trans trans; - struct page *page; + struct folio *folio; struct readpages_iter readpages_iter; int ret; @@ -1183,10 +1284,9 @@ void bch2_readahead(struct readahead_control *ractl) bch2_pagecache_add_get(inode); - while ((page = readpage_iter_next(&readpages_iter))) { - pgoff_t index = readpages_iter.offset + readpages_iter.idx; + while ((folio = readpage_iter_peek(&readpages_iter))) { unsigned n = min_t(unsigned, - readpages_iter.nr_pages - + readpages_iter.folios.nr - readpages_iter.idx, BIO_MAX_VECS); struct bch_read_bio *rbio = @@ -1194,11 +1294,11 @@ void bch2_readahead(struct readahead_control *ractl) GFP_NOFS, &c->bio_read), opts); - readpages_iter.idx++; + readpage_iter_advance(&readpages_iter); - rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTORS_SHIFT; + rbio->bio.bi_iter.bi_sector = folio_sector(folio); rbio->bio.bi_end_io = 
bch2_readpages_end_io; - BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); bchfs_read(&trans, rbio, inode_inum(inode), &readpages_iter); @@ -1207,33 +1307,32 @@ void bch2_readahead(struct readahead_control *ractl) bch2_pagecache_add_put(inode); bch2_trans_exit(&trans); - kfree(readpages_iter.pages); + darray_exit(&readpages_iter.folios); } -static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, - subvol_inum inum, struct page *page) +static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio, + subvol_inum inum, struct folio *folio) { struct btree_trans trans; - bch2_page_state_create(page, __GFP_NOFAIL); + bch2_folio_create(folio, __GFP_NOFAIL); rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; - rbio->bio.bi_iter.bi_sector = - (sector_t) page->index << PAGE_SECTORS_SHIFT; - BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); + rbio->bio.bi_iter.bi_sector = folio_sector(folio); + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); bch2_trans_init(&trans, c, 0, 0); bchfs_read(&trans, rbio, inum, NULL); bch2_trans_exit(&trans); } -static void bch2_read_single_page_end_io(struct bio *bio) +static void bch2_read_single_folio_end_io(struct bio *bio) { complete(bio->bi_private); } -static int bch2_read_single_page(struct page *page, - struct address_space *mapping) +static int bch2_read_single_folio(struct folio *folio, + struct address_space *mapping) { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -1247,9 +1346,9 @@ static int bch2_read_single_page(struct page *page, rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &c->bio_read), opts); rbio->bio.bi_private = &done; - rbio->bio.bi_end_io = bch2_read_single_page_end_io; + rbio->bio.bi_end_io = bch2_read_single_folio_end_io; - __bchfs_readpage(c, rbio, inode_inum(inode), page); + __bchfs_readfolio(c, rbio, inode_inum(inode), folio); wait_for_completion(&done); ret = blk_status_to_errno(rbio->bio.bi_status); @@ -1258,16 +1357,15 @@ static int bch2_read_single_page(struct page *page, if (ret < 0) return ret; - SetPageUptodate(page); + folio_mark_uptodate(folio); return 0; } int bch2_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; int ret; - ret = bch2_read_single_page(page, page->mapping); + ret = bch2_read_single_folio(folio, folio->mapping); folio_unlock(folio); return bch2_err_class(ret); } @@ -1277,6 +1375,8 @@ int bch2_read_folio(struct file *file, struct folio *folio) struct bch_writepage_state { struct bch_writepage_io *io; struct bch_io_opts opts; + struct bch_folio_sector *tmp; + unsigned tmp_sectors; }; static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, @@ -1295,33 +1395,33 @@ static void bch2_writepage_io_done(struct bch_write_op *op) struct bch_fs *c = io->op.c; struct bio *bio = &io->op.wbio.bio; struct bvec_iter_all iter; - struct bio_vec *bvec; + struct folio_vec fv; unsigned i; if (io->op.error) { set_bit(EI_INODE_ERROR, &io->inode->ei_flags); - bio_for_each_segment_all(bvec, bio, iter) { - struct bch_page_state *s; + bio_for_each_folio_all(fv, bio, iter) { + struct bch_folio *s; - SetPageError(bvec->bv_page); - mapping_set_error(bvec->bv_page->mapping, -EIO); + folio_set_error(fv.fv_folio); + mapping_set_error(fv.fv_folio->mapping, -EIO); - s = __bch2_page_state(bvec->bv_page); + s = __bch2_folio(fv.fv_folio); spin_lock(&s->lock); - for (i = 0; i < PAGE_SECTORS; i++) + 
for (i = 0; i < folio_sectors(fv.fv_folio); i++) s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } } if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { - bio_for_each_segment_all(bvec, bio, iter) { - struct bch_page_state *s; + bio_for_each_folio_all(fv, bio, iter) { + struct bch_folio *s; - s = __bch2_page_state(bvec->bv_page); + s = __bch2_folio(fv.fv_folio); spin_lock(&s->lock); - for (i = 0; i < PAGE_SECTORS; i++) + for (i = 0; i < folio_sectors(fv.fv_folio); i++) s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } @@ -1346,11 +1446,11 @@ static void bch2_writepage_io_done(struct bch_write_op *op) */ i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); - bio_for_each_segment_all(bvec, bio, iter) { - struct bch_page_state *s = __bch2_page_state(bvec->bv_page); + bio_for_each_folio_all(fv, bio, iter) { + struct bch_folio *s = __bch2_folio(fv.fv_folio); if (atomic_dec_and_test(&s->write_count)) - end_page_writeback(bvec->bv_page); + folio_end_writeback(fv.fv_folio); } bio_put(&io->op.wbio.bio); @@ -1398,56 +1498,64 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); } -static int __bch2_writepage(struct page *page, +static int __bch2_writepage(struct page *_page, struct writeback_control *wbc, void *data) { - struct bch_inode_info *inode = to_bch_ei(page->mapping->host); + struct folio *folio = page_folio(_page); + struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_writepage_state *w = data; - struct bch_page_state *s, orig; - unsigned i, offset, nr_replicas_this_write = U32_MAX; + struct bch_folio *s; + unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX; loff_t i_size = i_size_read(&inode->v); - pgoff_t end_index = i_size >> PAGE_SHIFT; int ret; - EBUG_ON(!PageUptodate(page)); + EBUG_ON(!folio_test_uptodate(folio)); - /* Is the page fully inside i_size? */ - if (page->index < end_index) + /* Is the folio fully inside i_size? */ + if (folio_end_pos(folio) <= i_size) goto do_io; - /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_SIZE - 1); - if (page->index > end_index || !offset) { - unlock_page(page); + /* Is the folio fully outside i_size? (truncate in progress) */ + if (folio_pos(folio) >= i_size) { + folio_unlock(folio); return 0; } /* - * The page straddles i_size. It must be zeroed out on each and every + * The folio straddles i_size. It must be zeroed out on each and every * writepage invocation because it may be mmapped. "A file is mapped - * in multiples of the page size. For a file that is not a multiple of - * the page size, the remaining memory is zeroed when mapped, and + * in multiples of the folio size. For a file that is not a multiple of + * the folio size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." 
*/ - zero_user_segment(page, offset, PAGE_SIZE); + folio_zero_segment(folio, + i_size - folio_pos(folio), + folio_size(folio)); do_io: - s = bch2_page_state_create(page, __GFP_NOFAIL); + f_sectors = folio_sectors(folio); + s = bch2_folio_create(folio, __GFP_NOFAIL); + + if (f_sectors > w->tmp_sectors) { + kfree(w->tmp); + w->tmp = kzalloc(sizeof(struct bch_folio_sector) * + f_sectors, __GFP_NOFAIL); + w->tmp_sectors = f_sectors; + } /* * Things get really hairy with errors during writeback: */ - ret = bch2_get_page_disk_reservation(c, inode, page, false); + ret = bch2_get_folio_disk_reservation(c, inode, folio, false); BUG_ON(ret); /* Before unlocking the page, get copy of reservations: */ spin_lock(&s->lock); - orig = *s; - spin_unlock(&s->lock); + memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors); - for (i = 0; i < PAGE_SECTORS; i++) { - if (s->s[i].state < SECTOR_DIRTY) + for (i = 0; i < f_sectors; i++) { + if (s->s[i].state < SECTOR_dirty) continue; nr_replicas_this_write = @@ -1456,50 +1564,51 @@ do_io: s->s[i].replicas_reserved); } - for (i = 0; i < PAGE_SECTORS; i++) { - if (s->s[i].state < SECTOR_DIRTY) + for (i = 0; i < f_sectors; i++) { + if (s->s[i].state < SECTOR_dirty) continue; s->s[i].nr_replicas = w->opts.compression ? 0 : nr_replicas_this_write; s->s[i].replicas_reserved = 0; - s->s[i].state = SECTOR_ALLOCATED; + folio_sector_set(folio, s, i, SECTOR_allocated); } + spin_unlock(&s->lock); BUG_ON(atomic_read(&s->write_count)); atomic_set(&s->write_count, 1); - BUG_ON(PageWriteback(page)); - set_page_writeback(page); + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); - unlock_page(page); + folio_unlock(folio); offset = 0; while (1) { unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; u64 sector; - while (offset < PAGE_SECTORS && - orig.s[offset].state < SECTOR_DIRTY) + while (offset < f_sectors && + w->tmp[offset].state < SECTOR_dirty) offset++; - if (offset == PAGE_SECTORS) + if (offset == f_sectors) break; - while (offset + sectors < PAGE_SECTORS && - orig.s[offset + sectors].state >= SECTOR_DIRTY) { - reserved_sectors += orig.s[offset + sectors].replicas_reserved; - dirty_sectors += orig.s[offset + sectors].state == SECTOR_DIRTY; + while (offset + sectors < f_sectors && + w->tmp[offset + sectors].state >= SECTOR_dirty) { + reserved_sectors += w->tmp[offset + sectors].replicas_reserved; + dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty; sectors++; } BUG_ON(!sectors); - sector = ((u64) page->index << PAGE_SECTORS_SHIFT) + offset; + sector = folio_sector(folio) + offset; if (w->io && (w->io->op.res.nr_replicas != nr_replicas_this_write || - bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || + bio_full(&w->io->op.wbio.bio, sectors << 9) || w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= (BIO_MAX_VECS * PAGE_SIZE) || bio_end_sector(&w->io->op.wbio.bio) != sector)) @@ -1512,7 +1621,7 @@ do_io: atomic_inc(&s->write_count); BUG_ON(inode != w->io->inode); - BUG_ON(!bio_add_page(&w->io->op.wbio.bio, page, + BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, sectors << 9, offset << 9)); /* Check for writing past i_size: */ @@ -1532,7 +1641,7 @@ do_io: } if (atomic_dec_and_test(&s->write_count)) - end_page_writeback(page); + folio_end_writeback(folio); return 0; } @@ -1550,6 +1659,7 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc if (w.io) bch2_writepage_do_io(&w); blk_finish_plug(&plug); + kfree(w.tmp); return bch2_err_class(ret); } @@ -1561,61 +1671,65 @@ int bch2_write_begin(struct 
file *file, struct address_space *mapping, { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_page_reservation *res; - pgoff_t index = pos >> PAGE_SHIFT; - unsigned offset = pos & (PAGE_SIZE - 1); - struct page *page; + struct bch2_folio_reservation *res; + struct folio *folio; + unsigned offset; int ret = -ENOMEM; res = kmalloc(sizeof(*res), GFP_KERNEL); if (!res) return -ENOMEM; - bch2_page_reservation_init(c, inode, res); + bch2_folio_reservation_init(c, inode, res); *fsdata = res; bch2_pagecache_add_get(inode); - page = grab_cache_page_write_begin(mapping, index); - if (!page) + folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, + FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, + mapping_gfp_mask(mapping)); + if (!folio) goto err_unlock; - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) goto out; - /* If we're writing entire page, don't need to read it in first: */ - if (len == PAGE_SIZE) + offset = pos - folio_pos(folio); + len = min_t(size_t, len, folio_end_pos(folio) - pos); + + /* If we're writing entire folio, don't need to read it in first: */ + if (!offset && len == folio_size(folio)) goto out; if (!offset && pos + len >= inode->v.i_size) { - zero_user_segment(page, len, PAGE_SIZE); - flush_dcache_page(page); + folio_zero_segment(folio, len, folio_size(folio)); + flush_dcache_folio(folio); goto out; } - if (index > inode->v.i_size >> PAGE_SHIFT) { - zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE); - flush_dcache_page(page); + if (folio_pos(folio) >= inode->v.i_size) { + folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio)); + flush_dcache_folio(folio); goto out; } readpage: - ret = bch2_read_single_page(page, mapping); + ret = bch2_read_single_folio(folio, mapping); if (ret) goto err; out: - if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { - ret = bch2_page_state_set(c, inode_inum(inode), &page, 1); + if (!bch2_folio_create(folio, __GFP_NOFAIL)->uptodate) { + ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); if (ret) goto err; } - ret = bch2_page_reservation_get(c, inode, page, res, offset, len); + ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len); if (ret) { - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { /* - * If the page hasn't been read in, we won't know if we + * If the folio hasn't been read in, we won't know if we * actually need a reservation - we don't actually need - * to read here, we just need to check if the page is + * to read here, we just need to check if the folio is * fully backed by uncompressed data: */ goto readpage; @@ -1624,11 +1738,11 @@ out: goto err; } - *pagep = page; + *pagep = &folio->page; return 0; err: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); *pagep = NULL; err_unlock: bch2_pagecache_add_put(inode); @@ -1643,19 +1757,21 @@ int bch2_write_end(struct file *file, struct address_space *mapping, { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_page_reservation *res = fsdata; - unsigned offset = pos & (PAGE_SIZE - 1); + struct bch2_folio_reservation *res = fsdata; + struct folio *folio = page_folio(page); + unsigned offset = pos - folio_pos(folio); lockdep_assert_held(&inode->v.i_rwsem); + BUG_ON(offset + copied > folio_size(folio)); - if (unlikely(copied < len && !PageUptodate(page))) { + if (unlikely(copied < len && !folio_test_uptodate(folio))) { /* - * The page needs to be read in, but 
that would destroy + * The folio needs to be read in, but that would destroy * our partial write - simplest thing is to just force * userspace to redo the write: */ - zero_user(page, 0, PAGE_SIZE); - flush_dcache_page(page); + folio_zero_range(folio, 0, folio_size(folio)); + flush_dcache_folio(folio); copied = 0; } @@ -1665,25 +1781,33 @@ int bch2_write_end(struct file *file, struct address_space *mapping, spin_unlock(&inode->v.i_lock); if (copied) { - if (!PageUptodate(page)) - SetPageUptodate(page); + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); - bch2_set_page_dirty(c, inode, page, res, offset, copied); + bch2_set_folio_dirty(c, inode, folio, res, offset, copied); inode->ei_last_dirtied = (unsigned long) current; } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); bch2_pagecache_add_put(inode); - bch2_page_reservation_put(c, inode, res); + bch2_folio_reservation_put(c, inode, res); kfree(res); return copied; } -#define WRITE_BATCH_PAGES 32 +static noinline void folios_trunc(folios *folios, struct folio **fi) +{ + while (folios->data + folios->nr > fi) { + struct folio *f = darray_pop(folios); + + folio_unlock(f); + folio_put(f); + } +} static int __bch2_buffered_write(struct bch_inode_info *inode, struct address_space *mapping, @@ -1691,61 +1815,57 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, loff_t pos, unsigned len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct page *pages[WRITE_BATCH_PAGES]; - struct bch2_page_reservation res; - unsigned long index = pos >> PAGE_SHIFT; - unsigned offset = pos & (PAGE_SIZE - 1); - unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); - unsigned i, reserved = 0, set_dirty = 0; - unsigned copied = 0, nr_pages_copied = 0; + struct bch2_folio_reservation res; + folios folios; + struct folio **fi, *f; + unsigned copied = 0, f_offset; + loff_t end = pos + len, f_pos; + loff_t last_folio_pos = inode->v.i_size; int ret = 0; BUG_ON(!len); - BUG_ON(nr_pages > ARRAY_SIZE(pages)); - bch2_page_reservation_init(c, inode, &res); + bch2_folio_reservation_init(c, inode, &res); + darray_init(&folios); - for (i = 0; i < nr_pages; i++) { - pages[i] = grab_cache_page_write_begin(mapping, index + i); - if (!pages[i]) { - nr_pages = i; - if (!i) { - ret = -ENOMEM; - goto out; - } - len = min_t(unsigned, len, - nr_pages * PAGE_SIZE - offset); - break; - } - } + ret = filemap_get_contig_folios_d(mapping, pos, end, + FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT, + mapping_gfp_mask(mapping), + &folios); + if (ret) + goto out; - if (offset && !PageUptodate(pages[0])) { - ret = bch2_read_single_page(pages[0], mapping); + BUG_ON(!folios.nr); + + f = darray_first(folios); + if (pos != folio_pos(f) && !folio_test_uptodate(f)) { + ret = bch2_read_single_folio(f, mapping); if (ret) goto out; } - if ((pos + len) & (PAGE_SIZE - 1) && - !PageUptodate(pages[nr_pages - 1])) { - if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) { - zero_user(pages[nr_pages - 1], 0, PAGE_SIZE); + f = darray_last(folios); + end = min(end, folio_end_pos(f)); + last_folio_pos = folio_pos(f); + if (end != folio_end_pos(f) && !folio_test_uptodate(f)) { + if (end >= inode->v.i_size) { + folio_zero_range(f, 0, folio_size(f)); } else { - ret = bch2_read_single_page(pages[nr_pages - 1], mapping); + ret = bch2_read_single_folio(f, mapping); if (ret) goto out; } } - while (reserved < len) { - unsigned i = (offset + reserved) >> PAGE_SHIFT; - struct page *page = pages[i]; - unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 
1); - unsigned pg_len = min_t(unsigned, len - reserved, - PAGE_SIZE - pg_offset); + f_pos = pos; + f_offset = pos - folio_pos(darray_first(folios)); + darray_for_each(folios, fi) { + struct folio *f = *fi; + unsigned f_len = min(end, folio_end_pos(f)) - f_pos; - if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { - ret = bch2_page_state_set(c, inode_inum(inode), - pages + i, nr_pages - i); + if (!bch2_folio_create(f, __GFP_NOFAIL)->uptodate) { + ret = bch2_folio_set(c, inode_inum(inode), fi, + folios.data + folios.nr - fi); if (ret) goto out; } @@ -1758,79 +1878,98 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, * we aren't completely out of disk space - we don't do that * yet: */ - ret = bch2_page_reservation_get(c, inode, page, &res, - pg_offset, pg_len); + ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len); if (unlikely(ret)) { - if (!reserved) + folios_trunc(&folios, fi); + if (!folios.nr) goto out; + + end = min(end, folio_end_pos(darray_last(folios))); break; } - reserved += pg_len; + f_pos = folio_end_pos(f); + f_offset = 0; } if (mapping_writably_mapped(mapping)) - for (i = 0; i < nr_pages; i++) - flush_dcache_page(pages[i]); + darray_for_each(folios, fi) + flush_dcache_folio(*fi); - while (copied < reserved) { - struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; - unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1); - unsigned pg_len = min_t(unsigned, reserved - copied, - PAGE_SIZE - pg_offset); - unsigned pg_copied = copy_page_from_iter_atomic(page, - pg_offset, pg_len, iter); + f_pos = pos; + f_offset = pos - folio_pos(darray_first(folios)); + darray_for_each(folios, fi) { + struct folio *f = *fi; + unsigned f_len = min(end, folio_end_pos(f)) - f_pos; + unsigned f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter); - if (!pg_copied) - break; - - if (!PageUptodate(page) && - pg_copied != PAGE_SIZE && - pos + copied + pg_copied < inode->v.i_size) { - zero_user(page, 0, PAGE_SIZE); + if (!f_copied) { + folios_trunc(&folios, fi); break; } - flush_dcache_page(page); - copied += pg_copied; - - if (pg_copied != pg_len) + if (!folio_test_uptodate(f) && + f_copied != folio_size(f) && + pos + copied + f_copied < inode->v.i_size) { + folio_zero_range(f, 0, folio_size(f)); + folios_trunc(&folios, fi); break; + } + + flush_dcache_folio(f); + copied += f_copied; + + if (f_copied != f_len) { + folios_trunc(&folios, fi + 1); + break; + } + + f_pos = folio_end_pos(f); + f_offset = 0; } if (!copied) goto out; + end = pos + copied; + spin_lock(&inode->v.i_lock); - if (pos + copied > inode->v.i_size) - i_size_write(&inode->v, pos + copied); + if (end > inode->v.i_size) + i_size_write(&inode->v, end); spin_unlock(&inode->v.i_lock); - while (set_dirty < copied) { - struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT]; - unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1); - unsigned pg_len = min_t(unsigned, copied - set_dirty, - PAGE_SIZE - pg_offset); + f_pos = pos; + f_offset = pos - folio_pos(darray_first(folios)); + darray_for_each(folios, fi) { + struct folio *f = *fi; + unsigned f_len = min(end, folio_end_pos(f)) - f_pos; - if (!PageUptodate(page)) - SetPageUptodate(page); + if (!folio_test_uptodate(f)) + folio_mark_uptodate(f); - bch2_set_page_dirty(c, inode, page, &res, pg_offset, pg_len); - unlock_page(page); - put_page(page); + bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len); - set_dirty += pg_len; + f_pos = folio_end_pos(f); + f_offset = 0; } - nr_pages_copied = DIV_ROUND_UP(offset + copied, 
PAGE_SIZE); inode->ei_last_dirtied = (unsigned long) current; out: - for (i = nr_pages_copied; i < nr_pages; i++) { - unlock_page(pages[i]); - put_page(pages[i]); + darray_for_each(folios, fi) { + folio_unlock(*fi); + folio_put(*fi); } - bch2_page_reservation_put(c, inode, &res); + /* + * If the last folio added to the mapping starts beyond current EOF, we + * performed a short write but left around at least one post-EOF folio. + * Clean up the mapping before we return. + */ + if (last_folio_pos >= inode->v.i_size) + truncate_pagecache(&inode->v, inode->v.i_size); + + darray_exit(&folios); + bch2_folio_reservation_put(c, inode, &res); return copied ?: ret; } @@ -1848,8 +1987,7 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) do { unsigned offset = pos & (PAGE_SIZE - 1); - unsigned bytes = min_t(unsigned long, iov_iter_count(iter), - PAGE_SIZE * WRITE_BATCH_PAGES - offset); + unsigned bytes = iov_iter_count(iter); again: /* * Bring in the user page that we will copy from _first_. @@ -2251,8 +2389,6 @@ static __always_inline void bch2_dio_write_end(struct dio_write *dio) struct kiocb *req = dio->req; struct bch_inode_info *inode = dio->inode; struct bio *bio = &dio->op.wbio.bio; - struct bvec_iter_all iter; - struct bio_vec *bv; req->ki_pos += (u64) dio->op.written << 9; dio->written += dio->op.written; @@ -2271,9 +2407,13 @@ static __always_inline void bch2_dio_write_end(struct dio_write *dio) mutex_unlock(&inode->ei_quota_lock); } - if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) - bio_for_each_segment_all(bv, bio, iter) - put_page(bv->bv_page); + if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) { + struct bvec_iter_all iter; + struct folio_vec fv; + + bio_for_each_folio_all(fv, bio, iter) + folio_put(fv.fv_folio); + } if (unlikely(dio->op.error)) set_bit(EI_INODE_ERROR, &inode->ei_flags); @@ -2394,10 +2534,10 @@ err: if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { struct bvec_iter_all iter; - struct bio_vec *bv; + struct folio_vec fv; - bio_for_each_segment_all(bv, bio, iter) - put_page(bv->bv_page); + bio_for_each_folio_all(fv, bio, iter) + folio_put(fv.fv_folio); } bch2_quota_reservation_put(c, inode, &dio->quota_res); @@ -2608,7 +2748,7 @@ retry: goto err; for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents, start, end, 0, k, ret) - if (bkey_extent_is_data(k.k)) { + if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) { ret = 1; break; } @@ -2622,33 +2762,25 @@ err: return ret; } -static int __bch2_truncate_page(struct bch_inode_info *inode, - pgoff_t index, loff_t start, loff_t end) +static int __bch2_truncate_folio(struct bch_inode_info *inode, + pgoff_t index, loff_t start, loff_t end) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; - struct bch_page_state *s; + struct bch_folio *s; unsigned start_offset = start & (PAGE_SIZE - 1); unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; unsigned i; - struct page *page; + struct folio *folio; s64 i_sectors_delta = 0; int ret = 0; + loff_t end_pos; - /* Page boundary? Nothing to do */ - if (!((index == start >> PAGE_SHIFT && start_offset) || - (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE))) - return 0; - - /* Above i_size? 
*/ - if (index << PAGE_SHIFT >= inode->v.i_size) - return 0; - - page = find_lock_page(mapping, index); - if (!page) { + folio = filemap_lock_folio(mapping, index); + if (!folio) { /* * XXX: we're doing two index lookups when we end up reading the - * page + * folio */ ret = range_has_data(c, inode->ei_subvol, POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)), @@ -2656,90 +2788,113 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, if (ret <= 0) return ret; - page = find_or_create_page(mapping, index, GFP_KERNEL); - if (unlikely(!page)) { + folio = __filemap_get_folio(mapping, index, + FGP_LOCK|FGP_CREAT, GFP_KERNEL); + if (unlikely(!folio)) { ret = -ENOMEM; goto out; } } - s = bch2_page_state_create(page, 0); + BUG_ON(start >= folio_end_pos(folio)); + BUG_ON(end <= folio_pos(folio)); + + start_offset = max(start, folio_pos(folio)) - folio_pos(folio); + end_offset = min(end, folio_end_pos(folio)) - folio_pos(folio); + + /* Folio boundary? Nothing to do */ + if (start_offset == 0 && + end_offset == folio_size(folio)) { + ret = 0; + goto unlock; + } + + s = bch2_folio_create(folio, 0); if (!s) { ret = -ENOMEM; goto unlock; } - if (!PageUptodate(page)) { - ret = bch2_read_single_page(page, mapping); + if (!folio_test_uptodate(folio)) { + ret = bch2_read_single_folio(folio, mapping); if (ret) goto unlock; } - if (index != start >> PAGE_SHIFT) - start_offset = 0; - if (index != end >> PAGE_SHIFT) - end_offset = PAGE_SIZE; + if (!s->uptodate) { + ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); + if (ret) + goto unlock; + } for (i = round_up(start_offset, block_bytes(c)) >> 9; i < round_down(end_offset, block_bytes(c)) >> 9; i++) { s->s[i].nr_replicas = 0; - if (s->s[i].state == SECTOR_DIRTY) - i_sectors_delta--; - s->s[i].state = SECTOR_UNALLOCATED; + + i_sectors_delta -= s->s[i].state == SECTOR_dirty; + folio_sector_set(folio, s, i, SECTOR_unallocated); } i_sectors_acct(c, inode, NULL, i_sectors_delta); /* - * Caller needs to know whether this page will be written out by + * Caller needs to know whether this folio will be written out by * writeback - doing an i_size update if necessary - or whether it will - * be responsible for the i_size update: + * be responsible for the i_size update. + * + * Note that we shouldn't ever see a folio beyond EOF, but check and + * warn if so. This has been observed by failure to clean up folios + * after a short write and there's still a chance reclaim will fix + * things up. */ - ret = s->s[(min_t(u64, inode->v.i_size - (index << PAGE_SHIFT), - PAGE_SIZE) - 1) >> 9].state >= SECTOR_DIRTY; + WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size); + end_pos = folio_end_pos(folio); + if (inode->v.i_size > folio_pos(folio)) + end_pos = min(inode->v.i_size, end_pos); + ret = s->s[(end_pos - folio_pos(folio) - 1) >> 9].state >= SECTOR_dirty; - zero_user_segment(page, start_offset, end_offset); + folio_zero_segment(folio, start_offset, end_offset); /* * Bit of a hack - we don't want truncate to fail due to -ENOSPC. * - * XXX: because we aren't currently tracking whether the page has actual + * XXX: because we aren't currently tracking whether the folio has actual * data in it (vs. just 0s, or only partially written) this wrong. ick. 
*/ - BUG_ON(bch2_get_page_disk_reservation(c, inode, page, false)); + BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false)); /* * This removes any writeable userspace mappings; we need to force * .page_mkwrite to be called again before any mmapped writes, to * redirty the full page: */ - page_mkclean(page); - filemap_dirty_folio(mapping, page_folio(page)); + folio_mkclean(folio); + filemap_dirty_folio(mapping, folio); unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); out: return ret; } -static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) +static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from) { - return __bch2_truncate_page(inode, from >> PAGE_SHIFT, - from, round_up(from, PAGE_SIZE)); + return __bch2_truncate_folio(inode, from >> PAGE_SHIFT, + from, ANYSINT_MAX(loff_t)); } -static int bch2_truncate_pages(struct bch_inode_info *inode, - loff_t start, loff_t end) +static int bch2_truncate_folios(struct bch_inode_info *inode, + loff_t start, loff_t end) { - int ret = __bch2_truncate_page(inode, start >> PAGE_SHIFT, - start, end); + int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT, + start, end); if (ret >= 0 && start >> PAGE_SHIFT != end >> PAGE_SHIFT) - ret = __bch2_truncate_page(inode, - end >> PAGE_SHIFT, - start, end); + ret = __bch2_truncate_folio(inode, + (end - 1) >> PAGE_SHIFT, + start, end); return ret; } @@ -2834,7 +2989,7 @@ int bch2_truncate(struct user_namespace *mnt_userns, iattr->ia_valid &= ~ATTR_SIZE; - ret = bch2_truncate_page(inode, iattr->ia_size); + ret = bch2_truncate_folio(inode, iattr->ia_size); if (unlikely(ret < 0)) goto err; @@ -2912,7 +3067,7 @@ static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len bool truncated_last_page; int ret = 0; - ret = bch2_truncate_pages(inode, offset, end); + ret = bch2_truncate_folios(inode, offset, end); if (unlikely(ret < 0)) goto err; @@ -3233,7 +3388,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, } if (mode & FALLOC_FL_ZERO_RANGE) { - ret = bch2_truncate_pages(inode, offset, end); + ret = bch2_truncate_folios(inode, offset, end); if (unlikely(ret < 0)) return ret; @@ -3450,12 +3605,12 @@ err: static int folio_data_offset(struct folio *folio, unsigned offset) { - struct bch_page_state *s = bch2_page_state(&folio->page); - unsigned i; + struct bch_folio *s = bch2_folio(folio); + unsigned i, sectors = folio_sectors(folio); if (s) - for (i = offset >> 9; i < PAGE_SECTORS; i++) - if (s->s[i].state >= SECTOR_DIRTY) + for (i = offset >> 9; i < sectors; i++) + if (s->s[i].state >= SECTOR_dirty) return i << 9; return -1; @@ -3482,12 +3637,10 @@ static loff_t bch2_seek_pagecache_data(struct inode *vinode, folio_lock(folio); offset = folio_data_offset(folio, - folio->index == start_index - ? 
start_offset & (PAGE_SIZE - 1) - : 0); + max(folio_pos(folio), start_offset) - + folio_pos(folio)); if (offset >= 0) { - ret = clamp(((loff_t) folio->index << PAGE_SHIFT) + - offset, + ret = clamp(folio_pos(folio) + offset, start_offset, end_offset); folio_unlock(folio); folio_batch_release(&fbatch); @@ -3555,38 +3708,34 @@ err: return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); } -static int __page_hole_offset(struct page *page, unsigned offset) +static bool folio_hole_offset(struct address_space *mapping, loff_t *offset) { - struct bch_page_state *s = bch2_page_state(page); - unsigned i; + struct folio *folio; + struct bch_folio *s; + unsigned i, sectors, f_offset; + bool ret = true; + folio = filemap_lock_folio(mapping, *offset >> PAGE_SHIFT); + if (!folio) + return true; + + s = bch2_folio(folio); if (!s) - return 0; + goto unlock; - for (i = offset >> 9; i < PAGE_SECTORS; i++) - if (s->s[i].state < SECTOR_DIRTY) - return i << 9; + sectors = folio_sectors(folio); + f_offset = *offset - folio_pos(folio); - return -1; -} - -static loff_t page_hole_offset(struct address_space *mapping, loff_t offset) -{ - pgoff_t index = offset >> PAGE_SHIFT; - struct page *page; - int pg_offset; - loff_t ret = -1; - - page = find_lock_page(mapping, index); - if (!page) - return offset; - - pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1)); - if (pg_offset >= 0) - ret = ((loff_t) index << PAGE_SHIFT) + pg_offset; - - unlock_page(page); + for (i = f_offset >> 9; i < sectors; i++) + if (s->s[i].state < SECTOR_dirty) { + *offset = max(*offset, folio_pos(folio) + (i << 9)); + goto unlock; + } + *offset = folio_end_pos(folio); + ret = false; +unlock: + folio_unlock(folio); return ret; } @@ -3595,18 +3744,13 @@ static loff_t bch2_seek_pagecache_hole(struct inode *vinode, loff_t end_offset) { struct address_space *mapping = vinode->i_mapping; - loff_t offset = start_offset, hole; + loff_t offset = start_offset; - while (offset < end_offset) { - hole = page_hole_offset(mapping, offset); - if (hole >= 0 && hole <= end_offset) - return max(start_offset, hole); + while (offset < end_offset && + !folio_hole_offset(mapping, &offset)) + ; - offset += PAGE_SIZE; - offset &= PAGE_MASK; - } - - return end_offset; + return min(offset, end_offset); } static loff_t bch2_seek_hole(struct file *file, u64 offset) diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index 2bb68082..571b4dca 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -451,19 +451,20 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, return ret; if (path.dentry->d_sb->s_fs_info != c) { - path_put(&path); - return -EXDEV; + ret = -EXDEV; + goto err; } dir = path.dentry->d_parent->d_inode; ret = __bch2_unlink(dir, path.dentry, true); - if (!ret) { - fsnotify_rmdir(dir, path.dentry); - d_delete(path.dentry); - } - path_put(&path); + if (ret) + goto err; + fsnotify_rmdir(dir, path.dentry); + d_delete(path.dentry); +err: + path_put(&path); return ret; } diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index c23309f1..fab80d9c 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -105,6 +105,11 @@ retry: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; + bch2_fs_fatal_err_on(ret == -ENOENT, c, + "inode %u:%llu not found when updating", + inode_inum(inode).subvol, + inode_inum(inode).inum); + bch2_trans_exit(&trans); return ret < 0 ? 
ret : 0; } @@ -201,6 +206,10 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) return ERR_PTR(ret); } + mutex_lock(&c->vfs_inodes_lock); + list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); + mutex_unlock(&c->vfs_inodes_lock); + unlock_new_inode(&inode->v); return &inode->v; @@ -314,6 +323,9 @@ err_before_quota: inode = old; } else { + mutex_lock(&c->vfs_inodes_lock); + list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); + mutex_unlock(&c->vfs_inodes_lock); /* * we really don't want insert_inode_locked2() to be setting * I_NEW... @@ -442,19 +454,27 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, bch2_trans_init(&trans, c, 4, 1024); ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL, - bch2_unlink_trans(&trans, - inode_inum(dir), &dir_u, - &inode_u, &dentry->d_name, - deleting_snapshot)); + BTREE_INSERT_NOFAIL, + bch2_unlink_trans(&trans, + inode_inum(dir), &dir_u, + &inode_u, &dentry->d_name, + deleting_snapshot)); + if (unlikely(ret)) + goto err; - if (likely(!ret)) { - bch2_inode_update_after_write(&trans, dir, &dir_u, - ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(&trans, inode, &inode_u, - ATTR_MTIME); + bch2_inode_update_after_write(&trans, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); + bch2_inode_update_after_write(&trans, inode, &inode_u, + ATTR_MTIME); + + if (inode_u.bi_subvol) { + /* + * Subvolume deletion is asynchronous, but we still want to tell + * the VFS that it's been deleted here: + */ + set_nlink(&inode->v, 0); } - +err: bch2_trans_exit(&trans); bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); @@ -1349,6 +1369,8 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, inode->v.i_op = &bch_special_inode_operations; break; } + + mapping_set_large_folios(inode->v.i_mapping); } static struct inode *bch2_alloc_inode(struct super_block *sb) @@ -1362,6 +1384,7 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) inode_init_once(&inode->v); mutex_init(&inode->ei_update_lock); two_state_lock_init(&inode->ei_pagecache_lock); + INIT_LIST_HEAD(&inode->ei_vfs_inode_list); mutex_init(&inode->ei_quota_lock); return &inode->v; @@ -1426,53 +1449,78 @@ static void bch2_evict_inode(struct inode *vinode) KEY_TYPE_QUOTA_WARN); bch2_inode_rm(c, inode_inum(inode)); } + + mutex_lock(&c->vfs_inodes_lock); + list_del_init(&inode->ei_vfs_inode_list); + mutex_unlock(&c->vfs_inodes_lock); } -void bch2_evict_subvolume_inodes(struct bch_fs *c, - snapshot_id_list *s) +void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) { - struct super_block *sb = c->vfs_sb; - struct inode *inode; + struct bch_inode_info *inode, **i; + DARRAY(struct bch_inode_info *) grabbed; + bool clean_pass = false, this_pass_clean; - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || - (inode->i_state & I_FREEING)) - continue; + /* + * Initially, we scan for inodes without I_DONTCACHE, then mark them to + * be pruned with d_mark_dontcache(). 
+ * + * Once we've had a clean pass where we didn't find any inodes without + * I_DONTCACHE, we wait for them to be freed: + */ - d_mark_dontcache(inode); - d_prune_aliases(inode); - } - spin_unlock(&sb->s_inode_list_lock); + darray_init(&grabbed); + darray_make_room(&grabbed, 1024); again: cond_resched(); - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || - (inode->i_state & I_FREEING)) + this_pass_clean = true; + + mutex_lock(&c->vfs_inodes_lock); + list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) { + if (!snapshot_list_has_id(s, inode->ei_subvol)) continue; - if (!(inode->i_state & I_DONTCACHE)) { - d_mark_dontcache(inode); - d_prune_aliases(inode); - } + if (!(inode->v.i_state & I_DONTCACHE) && + !(inode->v.i_state & I_FREEING)) { + this_pass_clean = false; + + d_mark_dontcache(&inode->v); + d_prune_aliases(&inode->v); + + /* + * If i_count was zero, we have to take and release a + * ref in order for I_DONTCACHE to be noticed and the + * inode to be dropped; + */ + + if (!atomic_read(&inode->v.i_count) && + igrab(&inode->v) && + darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) + break; + } else if (clean_pass && this_pass_clean) { + wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW); + DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW); - spin_lock(&inode->i_lock); - if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) && - !(inode->i_state & I_FREEING)) { - wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW); - DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); + mutex_unlock(&c->vfs_inodes_lock); + schedule(); finish_wait(wq, &wait.wq_entry); goto again; } - - spin_unlock(&inode->i_lock); } - spin_unlock(&sb->s_inode_list_lock); + mutex_unlock(&c->vfs_inodes_lock); + + darray_for_each(grabbed, i) + iput(&(*i)->v); + grabbed.nr = 0; + + if (!clean_pass || !this_pass_clean) { + clean_pass = this_pass_clean; + goto again; + } + + darray_exit(&grabbed); } static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index 783e77c3..cf041353 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -13,6 +13,7 @@ struct bch_inode_info { struct inode v; + struct list_head ei_vfs_inode_list; unsigned long ei_flags; struct mutex ei_update_lock; diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 560545a7..7ccbc00b 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -803,9 +803,6 @@ retry: bch2_inode_unpack(k, &inode_u); - /* Subvolume root? 
*/ - BUG_ON(inode_u.bi_subvol); - bkey_inode_generation_init(&delete.k_i); delete.k.p = iter.pos; delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); diff --git a/libbcachefs/io.c b/libbcachefs/io.c index d11feb10..c0371e23 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -151,11 +151,11 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target) void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) { struct bvec_iter_all iter; - struct bio_vec *bv; + struct bio_vec bv; bio_for_each_segment_all(bv, bio, iter) - if (bv->bv_page != ZERO_PAGE(0)) - mempool_free(bv->bv_page, &c->bio_bounce_pages); + if (bv.bv_page != ZERO_PAGE(0)) + mempool_free(bv.bv_page, &c->bio_bounce_pages); bio->bi_vcnt = 0; } @@ -385,6 +385,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, struct open_buckets open_buckets; struct bkey_s_c k; struct bkey_buf old, new; + unsigned sectors_allocated; bool have_reservation = false; bool unwritten = opts.nocow && c->sb.version >= bcachefs_metadata_version_unwritten_extents; @@ -395,6 +396,8 @@ int bch2_extent_fallocate(struct btree_trans *trans, closure_init_stack(&cl); open_buckets.nr = 0; retry: + sectors_allocated = 0; + k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) @@ -451,15 +454,16 @@ retry: opts.data_replicas, opts.data_replicas, RESERVE_none, 0, &cl, &wp); - if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) { + if (ret) { bch2_trans_unlock(trans); closure_sync(&cl); - goto retry; - } - if (ret) + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) + goto retry; return ret; + } sectors = min(sectors, wp->sectors_free); + sectors_allocated = sectors; bch2_key_resize(&e->k, sectors); @@ -486,6 +490,9 @@ out: goto retry; } + if (!ret && sectors_allocated) + bch2_increment_clock(c, sectors_allocated, WRITE); + bch2_open_buckets_put(c, &open_buckets); bch2_disk_reservation_put(c, &disk_res); bch2_bkey_buf_exit(&new, c); @@ -1475,7 +1482,7 @@ static void bch2_nocow_write(struct bch_write_op *op) struct btree_iter iter; struct bkey_s_c k; struct bkey_ptrs_c ptrs; - const struct bch_extent_ptr *ptr, *ptr2; + const struct bch_extent_ptr *ptr; struct { struct bpos b; unsigned gen; @@ -1530,11 +1537,12 @@ retry: bucket_to_u64(buckets[nr_buckets].b)); prefetch(buckets[nr_buckets].l); - nr_buckets++; if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE))) goto err_get_ioref; + nr_buckets++; + if (ptr->unwritten) op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; } @@ -1625,12 +1633,8 @@ err: } return; err_get_ioref: - bkey_for_each_ptr(ptrs, ptr2) { - if (ptr2 == ptr) - break; - - percpu_ref_put(&bch_dev_bkey_exists(c, ptr2->dev)->io_ref); - } + for (i = 0; i < nr_buckets; i++) + percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); /* Fall back to COW path: */ goto out; @@ -1639,9 +1643,8 @@ err_bucket_stale: bch2_bucket_nocow_unlock(&c->nocow_locks, buckets[i].b, BUCKET_NOCOW_LOCK_UPDATE); - - bkey_for_each_ptr(ptrs, ptr2) - percpu_ref_put(&bch_dev_bkey_exists(c, ptr2->dev)->io_ref); + for (i = 0; i < nr_buckets; i++) + percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); /* We can retry this: */ ret = BCH_ERR_transaction_restart; @@ -1889,6 +1892,7 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) prt_str(out, "pos: "); bch2_bpos_to_text(out, op->pos); prt_newline(out); + printbuf_indent_add(out, 2); prt_str(out, "started: "); bch2_pr_time_units(out, local_clock() - op->start_time); @@ -1897,6 +1901,11 @@ void bch2_write_op_to_text(struct 
printbuf *out, struct bch_write_op *op) prt_str(out, "flags: "); prt_bitflags(out, bch2_write_flags, op->flags); prt_newline(out); + + prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl)); + prt_newline(out); + + printbuf_indent_sub(out, 2); } /* Cache promotion on read */ diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index c9c2ee9c..3f0e6d71 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -76,6 +76,67 @@ static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) p->devs.nr = 0; } +/* + * Detect stuck journal conditions and trigger shutdown. Technically the journal + * can end up stuck for a variety of reasons, such as a blocked I/O, journal + * reservation lockup, etc. Since this is a fatal error with potentially + * unpredictable characteristics, we want to be fairly conservative before we + * decide to shut things down. + * + * Consider the journal stuck when it appears full with no ability to commit + * btree transactions, to discard journal buckets, nor acquire priority + * (reserved watermark) reservation. + */ +static inline bool +journal_error_check_stuck(struct journal *j, int error, unsigned flags) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + bool stuck = false; + struct printbuf buf = PRINTBUF; + + if (!(error == JOURNAL_ERR_journal_full || + error == JOURNAL_ERR_journal_pin_full) || + nr_unwritten_journal_entries(j) || + (flags & JOURNAL_WATERMARK_MASK) != JOURNAL_WATERMARK_reserved) + return stuck; + + spin_lock(&j->lock); + + if (j->can_discard) { + spin_unlock(&j->lock); + return stuck; + } + + stuck = true; + + /* + * The journal shutdown path will set ->err_seq, but do it here first to + * serialize against concurrent failures and avoid duplicate error + * reports. + */ + if (j->err_seq) { + spin_unlock(&j->lock); + return stuck; + } + j->err_seq = journal_cur_seq(j); + spin_unlock(&j->lock); + + bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)", + bch2_journal_errors[error]); + bch2_journal_debug_to_text(&buf, j); + bch_err(c, "%s", buf.buf); + + printbuf_reset(&buf); + bch2_journal_pins_to_text(&buf, j); + bch_err(c, "Journal pins:\n%s", buf.buf); + printbuf_exit(&buf); + + bch2_fatal_error(c); + dump_stack(); + + return stuck; +} + /* journal entry close/open: */ void __bch2_journal_buf_put(struct journal *j) @@ -163,6 +224,7 @@ void bch2_journal_halt(struct journal *j) __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL); if (!j->err_seq) j->err_seq = journal_cur_seq(j); + journal_wake(j); spin_unlock(&j->lock); } @@ -363,6 +425,12 @@ retry: spin_lock(&j->lock); + /* check once more in case somebody else shut things down... */ + if (bch2_journal_error(j)) { + spin_unlock(&j->lock); + return -BCH_ERR_erofs_journal_err; + } + /* * Recheck after taking the lock, so we don't race with another thread * that just did journal_entry_open() and call journal_entry_close() @@ -410,28 +478,8 @@ unlock: if (!ret) goto retry; - - if ((ret == JOURNAL_ERR_journal_full || - ret == JOURNAL_ERR_journal_pin_full) && - !can_discard && - !nr_unwritten_journal_entries(j) && - (flags & JOURNAL_WATERMARK_MASK) == JOURNAL_WATERMARK_reserved) { - struct printbuf buf = PRINTBUF; - - bch_err(c, "Journal stuck! 
Hava a pre-reservation but journal full (ret %s)", - bch2_journal_errors[ret]); - - bch2_journal_debug_to_text(&buf, j); - bch_err(c, "%s", buf.buf); - - printbuf_reset(&buf); - bch2_journal_pins_to_text(&buf, j); - bch_err(c, "Journal pins:\n%s", buf.buf); - - printbuf_exit(&buf); - bch2_fatal_error(c); - dump_stack(); - } + if (journal_error_check_stuck(j, ret, flags)) + ret = -BCH_ERR_journal_res_get_blocked; /* * Journal is full - can't rely on reclaim from work item due to diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 8c88884c..37c6846a 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -210,24 +210,7 @@ void bch2_journal_space_available(struct journal *j) clean = j->space[journal_space_clean].total; total = j->space[journal_space_total].total; - if (!clean_ondisk && - journal_cur_seq(j) == j->seq_ondisk) { - struct printbuf buf = PRINTBUF; - - __bch2_journal_debug_to_text(&buf, j); - bch_err(c, "journal stuck\n%s", buf.buf); - printbuf_exit(&buf); - - /* - * Hack: bch2_fatal_error() calls bch2_journal_halt() which - * takes journal lock: - */ - spin_unlock(&j->lock); - bch2_fatal_error(c); - spin_lock(&j->lock); - - ret = JOURNAL_ERR_journal_stuck; - } else if (!j->space[journal_space_discarded].next_entry) + if (!j->space[journal_space_discarded].next_entry) ret = JOURNAL_ERR_journal_full; if ((j->space[journal_space_clean_ondisk].next_entry < diff --git a/libbcachefs/lru.c b/libbcachefs/lru.c index e913b90f..c2dece27 100644 --- a/libbcachefs/lru.c +++ b/libbcachefs/lru.c @@ -148,7 +148,8 @@ static int bch2_check_lru_key(struct btree_trans *trans, goto out; } - if (fsck_err(c, "incorrect lru entry: lru %s time %llu\n" + if (c->opts.reconstruct_alloc || + fsck_err(c, "incorrect lru entry: lru %s time %llu\n" " %s\n" " for %s", bch2_lru_types[type], diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 6f5851ea..d7bcdc88 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -627,9 +627,12 @@ void bch2_verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket, struct bkey_s_c k; struct printbuf buf = PRINTBUF; struct bch_backpointer bp; - u64 bp_offset = 0; + struct bpos bp_pos = POS_MIN; + unsigned nr_bps = 0; int ret; + bch2_trans_begin(trans); + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_CACHED); again: @@ -650,6 +653,7 @@ again: } } + set_btree_iter_dontneed(&iter); bch2_trans_iter_exit(trans, &iter); return; failed_to_evacuate: @@ -665,17 +669,16 @@ failed_to_evacuate: bch2_trans_begin(trans); ret = bch2_get_next_backpointer(trans, bucket, gen, - &bp_offset, &bp, + &bp_pos, &bp, BTREE_ITER_CACHED); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; - if (bp_offset == U64_MAX) + if (bkey_eq(bp_pos, POS_MAX)) break; - k = bch2_backpointer_get_key(trans, &iter, - bucket, bp_offset, bp); + k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -686,6 +689,10 @@ failed_to_evacuate: prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, k); bch2_trans_iter_exit(trans, &iter); + + if (++nr_bps > 10) + break; + bp_pos = bpos_nosnap_successor(bp_pos); } bch2_print_string_as_lines(KERN_ERR, buf.buf); @@ -709,11 +716,17 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, struct data_update_opts data_opts; unsigned dirty_sectors, bucket_size; u64 fragmentation; - u64 bp_offset = 0, cur_inum = U64_MAX; + u64 cur_inum = U64_MAX; + struct bpos bp_pos = POS_MIN; 
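/*
 * With backpointers ripped out of alloc keys, the callers above iterate by
 * btree position instead of by a synthetic u64 offset.  A rough sketch of
 * the new loop shape, using the helpers as they appear in this patch (not
 * standalone-buildable; transaction restarts, the iterator for the
 * pointed-to key, and error reporting are elided):
 */
static int walk_bucket_backpointers_sketch(struct btree_trans *trans,
					   struct bpos bucket, u8 gen)
{
	struct bpos bp_pos = POS_MIN;		/* was: u64 bp_offset = 0 */
	struct bch_backpointer bp;
	int ret;

	while (true) {
		ret = bch2_get_next_backpointer(trans, bucket, gen,
						&bp_pos, &bp, BTREE_ITER_CACHED);
		if (ret)
			return ret;
		if (bkey_eq(bp_pos, POS_MAX))	/* was: bp_offset == U64_MAX */
			return 0;

		/* ... fetch and move the extent or btree node it points to ... */

		bp_pos = bpos_nosnap_successor(bp_pos);	/* was: bp_offset++ */
	}
}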
int ret = 0; bch2_bkey_buf_init(&sk); + /* + * We're not run in a context that handles transaction restarts: + */ + bch2_trans_begin(trans); + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_CACHED); ret = lockrestart_do(trans, @@ -740,13 +753,13 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, bch2_trans_begin(trans); ret = bch2_get_next_backpointer(trans, bucket, gen, - &bp_offset, &bp, + &bp_pos, &bp, BTREE_ITER_CACHED); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) goto err; - if (bp_offset == U64_MAX) + if (bkey_eq(bp_pos, POS_MAX)) break; if (!bp.level) { @@ -754,8 +767,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, struct bkey_s_c k; unsigned i = 0; - k = bch2_backpointer_get_key(trans, &iter, - bucket, bp_offset, bp); + k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -810,8 +822,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, } else { struct btree *b; - b = bch2_backpointer_get_node(trans, &iter, - bucket, bp_offset, bp); + b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp); ret = PTR_ERR_OR_ZERO(b); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) continue; @@ -839,7 +850,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, } } next: - bp_offset++; + bp_pos = bpos_nosnap_successor(bp_pos); } trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret); diff --git a/libbcachefs/move_types.h b/libbcachefs/move_types.h index 285ffdb7..baf1f857 100644 --- a/libbcachefs/move_types.h +++ b/libbcachefs/move_types.h @@ -16,9 +16,20 @@ struct bch_move_stats { atomic64_t sectors_raced; }; -struct move_bucket_in_flight { +struct move_bucket_key { struct bpos bucket; u8 gen; +}; + +struct move_bucket { + struct move_bucket_key k; + unsigned sectors; +}; + +struct move_bucket_in_flight { + struct move_bucket_in_flight *next; + struct rhash_head hash; + struct move_bucket bucket; atomic_t count; }; diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 178f96a6..d13a120d 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -34,8 +34,51 @@ #include #include +struct buckets_in_flight { + struct rhashtable table; + struct move_bucket_in_flight *first; + struct move_bucket_in_flight *last; + size_t nr; + size_t sectors; +}; + +static const struct rhashtable_params bch_move_bucket_params = { + .head_offset = offsetof(struct move_bucket_in_flight, hash), + .key_offset = offsetof(struct move_bucket_in_flight, bucket.k), + .key_len = sizeof(struct move_bucket_key), +}; + +static struct move_bucket_in_flight * +move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b) +{ + struct move_bucket_in_flight *new = kzalloc(sizeof(*new), GFP_KERNEL); + int ret; + + if (!new) + return ERR_PTR(-ENOMEM); + + new->bucket = b; + + ret = rhashtable_lookup_insert_fast(&list->table, &new->hash, + bch_move_bucket_params); + if (ret) { + kfree(new); + return ERR_PTR(ret); + } + + if (!list->first) + list->first = new; + else + list->last->next = new; + + list->last = new; + list->nr++; + list->sectors += b.sectors; + return new; +} + static int bch2_bucket_is_movable(struct btree_trans *trans, - struct bpos bucket, u64 time, u8 *gen) + struct move_bucket *b, u64 time) { struct btree_iter iter; struct bkey_s_c k; @@ -43,10 +86,13 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, const struct bch_alloc_v4 *a; int ret; - if 
(bch2_bucket_is_open(trans->c, bucket.inode, bucket.offset)) + if (bch2_bucket_is_open(trans->c, + b->k.bucket.inode, + b->k.bucket.offset)) return 0; - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_CACHED); + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + b->k.bucket, BTREE_ITER_CACHED); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); bch2_trans_iter_exit(trans, &iter); @@ -55,12 +101,14 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, return ret; a = bch2_alloc_to_v4(k, &_a); - *gen = a->gen; + b->k.gen = a->gen; + b->sectors = a->dirty_sectors; + ret = data_type_movable(a->data_type) && a->fragmentation_lru && a->fragmentation_lru <= time; - if (ret) { + if (!ret) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, trans->c, k); @@ -71,41 +119,16 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, return ret; } -typedef FIFO(struct move_bucket_in_flight) move_buckets_in_flight; - -struct move_bucket { - struct bpos bucket; - u8 gen; -}; - -typedef DARRAY(struct move_bucket) move_buckets; - -static int move_bucket_cmp(const void *_l, const void *_r) -{ - const struct move_bucket *l = _l; - const struct move_bucket *r = _r; - - return bkey_cmp(l->bucket, r->bucket); -} - -static bool bucket_in_flight(move_buckets *buckets_sorted, struct move_bucket b) -{ - return bsearch(&b, - buckets_sorted->data, - buckets_sorted->nr, - sizeof(buckets_sorted->data[0]), - move_bucket_cmp) != NULL; -} - static void move_buckets_wait(struct btree_trans *trans, struct moving_context *ctxt, - move_buckets_in_flight *buckets_in_flight, - size_t nr, bool verify_evacuated) + struct buckets_in_flight *list, + bool flush) { - while (!fifo_empty(buckets_in_flight)) { - struct move_bucket_in_flight *i = &fifo_peek_front(buckets_in_flight); + struct move_bucket_in_flight *i; + int ret; - if (fifo_used(buckets_in_flight) > nr) + while ((i = list->first)) { + if (flush) move_ctxt_wait_event(ctxt, trans, !atomic_read(&i->count)); if (atomic_read(&i->count)) @@ -116,66 +139,82 @@ static void move_buckets_wait(struct btree_trans *trans, * reads, which inits another btree_trans; this one must be * unlocked: */ - if (verify_evacuated) - bch2_verify_bucket_evacuated(trans, i->bucket, i->gen); - buckets_in_flight->front++; + bch2_verify_bucket_evacuated(trans, i->bucket.k.bucket, i->bucket.k.gen); + + list->first = i->next; + if (!list->first) + list->last = NULL; + + list->nr--; + list->sectors -= i->bucket.sectors; + + ret = rhashtable_remove_fast(&list->table, &i->hash, + bch_move_bucket_params); + BUG_ON(ret); + kfree(i); } bch2_trans_unlock(trans); } +static bool bucket_in_flight(struct buckets_in_flight *list, + struct move_bucket_key k) +{ + return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params); +} + +typedef DARRAY(struct move_bucket) move_buckets; + static int bch2_copygc_get_buckets(struct btree_trans *trans, struct moving_context *ctxt, - move_buckets_in_flight *buckets_in_flight, + struct buckets_in_flight *buckets_in_flight, move_buckets *buckets) { + struct bch_fs *c = trans->c; struct btree_iter iter; - move_buckets buckets_sorted = { 0 }; - struct move_bucket_in_flight *i; struct bkey_s_c k; - size_t fifo_iter, nr_to_get; + size_t nr_to_get = max(16UL, buckets_in_flight->nr / 4); + size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0; int ret; - move_buckets_wait(trans, ctxt, buckets_in_flight, buckets_in_flight->size / 2, true); + move_buckets_wait(trans, ctxt, buckets_in_flight, false); - nr_to_get = 
max(16UL, fifo_used(buckets_in_flight) / 4); - - fifo_for_each_entry_ptr(i, buckets_in_flight, fifo_iter) { - ret = darray_push(&buckets_sorted, ((struct move_bucket) {i->bucket, i->gen})); - if (ret) { - bch_err(trans->c, "error allocating move_buckets_sorted"); - goto err; - } - } - - sort(buckets_sorted.data, - buckets_sorted.nr, - sizeof(buckets_sorted.data[0]), - move_bucket_cmp, - NULL); + ret = bch2_btree_write_buffer_flush(trans); + if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()", + __func__, bch2_err_str(ret))) + return ret; ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru, lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), 0, k, ({ - struct move_bucket b = { .bucket = u64_to_bucket(k.k->p.offset) }; + struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) }; int ret = 0; - if (!bucket_in_flight(&buckets_sorted, b) && - bch2_bucket_is_movable(trans, b.bucket, lru_pos_time(k.k->p), &b.gen)) - ret = darray_push(buckets, b) ?: buckets->nr >= nr_to_get; + saw++; + if (!bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p))) + not_movable++; + else if (bucket_in_flight(buckets_in_flight, b.k)) + in_flight++; + else { + ret = darray_push(buckets, b) ?: buckets->nr >= nr_to_get; + if (ret >= 0) + sectors += b.sectors; + } ret; })); -err: - darray_exit(&buckets_sorted); + + pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i", + buckets_in_flight->nr, buckets_in_flight->sectors, + saw, in_flight, not_movable, buckets->nr, sectors, nr_to_get, ret); return ret < 0 ? ret : 0; } static int bch2_copygc(struct btree_trans *trans, struct moving_context *ctxt, - move_buckets_in_flight *buckets_in_flight) + struct buckets_in_flight *buckets_in_flight) { struct bch_fs *c = trans->c; struct data_update_opts data_opts = { @@ -187,11 +226,6 @@ static int bch2_copygc(struct btree_trans *trans, u64 moved = atomic64_read(&ctxt->stats->sectors_moved); int ret = 0; - ret = bch2_btree_write_buffer_flush(trans); - if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()", - __func__, bch2_err_str(ret))) - return ret; - ret = bch2_copygc_get_buckets(trans, ctxt, buckets_in_flight, &buckets); if (ret) goto err; @@ -200,12 +234,17 @@ static int bch2_copygc(struct btree_trans *trans, if (unlikely(freezing(current))) break; - f = fifo_push_ref(buckets_in_flight); - f->bucket = i->bucket; - f->gen = i->gen; - atomic_set(&f->count, 0); + f = move_bucket_in_flight_add(buckets_in_flight, *i); + ret = PTR_ERR_OR_ZERO(f); + if (ret == -EEXIST) /* rare race: copygc_get_buckets returned same bucket more than once */ + continue; + if (ret == -ENOMEM) { /* flush IO, continue later */ + ret = 0; + break; + } - ret = __bch2_evacuate_bucket(trans, ctxt, f, f->bucket, f->gen, data_opts); + ret = __bch2_evacuate_bucket(trans, ctxt, f, f->bucket.k.bucket, + f->bucket.k.gen, data_opts); if (ret) goto err; } @@ -269,6 +308,12 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) atomic64_read(&c->io_clock[WRITE].now)) << 9); prt_newline(out); + prt_printf(out, "Currently waiting since: "); + prt_human_readable_u64(out, max(0LL, + atomic64_read(&c->io_clock[WRITE].now) - + c->copygc_wait_at) << 9); + prt_newline(out); + prt_printf(out, "Currently calculated wait: "); prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); prt_newline(out); @@ -281,13 +326,17 @@ static int bch2_copygc_thread(void *arg) struct moving_context ctxt; 
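/*
 * Copygc now deduplicates in-flight buckets with an rhashtable keyed on
 * {bucket, gen}, rather than sorting a snapshot of a FIFO and bsearching
 * it.  The pattern in miniature, with made-up "demo_" names standing in
 * for move_bucket_key/move_bucket_in_flight (kernel-only sketch, not
 * buildable outside a kernel tree):
 */
#include <linux/rhashtable.h>
#include <linux/slab.h>

struct demo_key {
	u64	bucket;
	u8	gen;
};

struct demo_entry {
	struct rhash_head	hash;
	struct demo_key		key;
};

static const struct rhashtable_params demo_params = {
	.head_offset	= offsetof(struct demo_entry, hash),
	.key_offset	= offsetof(struct demo_entry, key),
	.key_len	= sizeof(struct demo_key),
};

/* returns -EEXIST when the same bucket is already being evacuated */
static int demo_track_bucket(struct rhashtable *table, struct demo_key key)
{
	struct demo_entry *e = kzalloc(sizeof(*e), GFP_KERNEL);
	int ret;

	if (!e)
		return -ENOMEM;
	e->key = key;

	ret = rhashtable_lookup_insert_fast(table, &e->hash, demo_params);
	if (ret)
		kfree(e);
	return ret;
}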
struct bch_move_stats move_stats; struct io_clock *clock = &c->io_clock[WRITE]; - move_buckets_in_flight move_buckets; + struct buckets_in_flight move_buckets; u64 last, wait; int ret = 0; - if (!init_fifo(&move_buckets, 1 << 14, GFP_KERNEL)) { - bch_err(c, "error allocating copygc buckets in flight"); - return -ENOMEM; + memset(&move_buckets, 0, sizeof(move_buckets)); + + ret = rhashtable_init(&move_buckets.table, &bch_move_bucket_params); + if (ret) { + bch_err(c, "error allocating copygc buckets in flight: %s", + bch2_err_str(ret)); + return ret; } set_freezable(); @@ -303,12 +352,12 @@ static int bch2_copygc_thread(void *arg) cond_resched(); if (!c->copy_gc_enabled) { - move_buckets_wait(&trans, &ctxt, &move_buckets, 0, true); + move_buckets_wait(&trans, &ctxt, &move_buckets, true); kthread_wait_freezable(c->copy_gc_enabled); } if (unlikely(freezing(current))) { - move_buckets_wait(&trans, &ctxt, &move_buckets, 0, true); + move_buckets_wait(&trans, &ctxt, &move_buckets, true); __refrigerator(false); continue; } @@ -317,9 +366,10 @@ static int bch2_copygc_thread(void *arg) wait = bch2_copygc_wait_amount(c); if (wait > clock->max_slop) { - move_buckets_wait(&trans, &ctxt, &move_buckets, 0, true); - trace_and_count(c, copygc_wait, c, wait, last + wait); + c->copygc_wait_at = last; c->copygc_wait = last + wait; + move_buckets_wait(&trans, &ctxt, &move_buckets, true); + trace_and_count(c, copygc_wait, c, wait, last + wait); bch2_kthread_io_clock_wait(clock, last + wait, MAX_SCHEDULE_TIMEOUT); continue; @@ -334,9 +384,9 @@ static int bch2_copygc_thread(void *arg) wake_up(&c->copygc_running_wq); } + move_buckets_wait(&trans, &ctxt, &move_buckets, true); bch2_trans_exit(&trans); bch2_moving_ctxt_exit(&ctxt); - free_fifo(&move_buckets); return 0; } diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 73f7663c..8cc8af6d 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -476,6 +476,26 @@ void bch2_journal_keys_free(struct journal_keys *keys) keys->nr = keys->gap = keys->size = 0; } +static void __journal_keys_sort(struct journal_keys *keys) +{ + struct journal_key *src, *dst; + + sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); + + src = dst = keys->d; + while (src < keys->d + keys->nr) { + while (src + 1 < keys->d + keys->nr && + src[0].btree_id == src[1].btree_id && + src[0].level == src[1].level && + bpos_eq(src[0].k->k.p, src[1].k->k.p)) + src++; + + *dst++ = *src++; + } + + keys->nr = dst - keys->d; +} + static int journal_keys_sort(struct bch_fs *c) { struct genradix_iter iter; @@ -483,8 +503,7 @@ static int journal_keys_sort(struct bch_fs *c) struct jset_entry *entry; struct bkey_i *k; struct journal_keys *keys = &c->journal_keys; - struct journal_key *src, *dst; - size_t nr_keys = 0; + size_t nr_keys = 0, nr_read = 0; genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; @@ -503,9 +522,19 @@ static int journal_keys_sort(struct bch_fs *c) keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); if (!keys->d) { - bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys)", + bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath", nr_keys); - return -BCH_ERR_ENOMEM_journal_keys_sort; + + do { + keys->size >>= 1; + keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); + } while (!keys->d && keys->size > nr_keys / 8); + + if (!keys->d) { + bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting", + keys->size); + return 
-BCH_ERR_ENOMEM_journal_keys_sort; + } } genradix_for_each(&c->journal_entries, iter, _i) { @@ -514,7 +543,17 @@ static int journal_keys_sort(struct bch_fs *c) if (!i || i->ignore) continue; - for_each_jset_key(k, entry, &i->j) + for_each_jset_key(k, entry, &i->j) { + if (keys->nr == keys->size) { + __journal_keys_sort(keys); + + if (keys->nr > keys->size * 7 / 8) { + bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu", + keys->nr, keys->size, nr_read, nr_keys); + return -BCH_ERR_ENOMEM_journal_keys_sort; + } + } + keys->d[keys->nr++] = (struct journal_key) { .btree_id = entry->btree_id, .level = entry->level, @@ -522,23 +561,15 @@ static int journal_keys_sort(struct bch_fs *c) .journal_seq = le64_to_cpu(i->j.seq), .journal_offset = k->_data - i->j._data, }; + + nr_read++; + } } - sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); - - src = dst = keys->d; - while (src < keys->d + keys->nr) { - while (src + 1 < keys->d + keys->nr && - src[0].btree_id == src[1].btree_id && - src[0].level == src[1].level && - bpos_eq(src[0].k->k.p, src[1].k->k.p)) - src++; - - *dst++ = *src++; - } - - keys->nr = dst - keys->d; + __journal_keys_sort(keys); keys->gap = keys->nr; + + bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr); return 0; } @@ -614,8 +645,8 @@ static int bch2_journal_replay(struct bch_fs *c, u64 start_seq, u64 end_seq) journal_sort_seq_cmp, NULL); if (keys->nr) { - ret = bch2_fs_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)", - keys->nr, start_seq, end_seq); + ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)", + keys->nr, start_seq, end_seq); if (ret) goto err; } @@ -649,7 +680,7 @@ static int bch2_journal_replay(struct bch_fs *c, u64 start_seq, u64 end_seq) ret = bch2_journal_error(j); if (keys->nr && !ret) - bch2_fs_log_msg(c, "journal replay finished"); + bch2_journal_log_msg(c, "journal replay finished"); err: kvfree(keys_sorted); return ret; @@ -1103,14 +1134,11 @@ int bch2_fs_recovery(struct bch_fs *c) } if (!c->opts.nochanges) { - if (c->sb.version < bcachefs_metadata_version_lru_v2) { - bch_info(c, "version prior to backpointers, upgrade and fsck required"); + if (c->sb.version < bcachefs_metadata_version_no_bps_in_alloc_keys) { + bch_info(c, "version prior to no_bps_in_alloc_keys, upgrade and fsck required"); c->opts.version_upgrade = true; c->opts.fsck = true; c->opts.fix_errors = FSCK_OPT_YES; - } else if (c->sb.version < bcachefs_metadata_version_fragmentation_lru) { - bch_info(c, "version prior to backpointers, upgrade required"); - c->opts.version_upgrade = true; } } @@ -1213,8 +1241,8 @@ use_clean: journal_seq += 8; if (blacklist_seq != journal_seq) { - ret = bch2_fs_log_msg(c, "blacklisting entries %llu-%llu", - blacklist_seq, journal_seq) ?: + ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu", + blacklist_seq, journal_seq) ?: bch2_journal_seq_blacklist_add(c, blacklist_seq, journal_seq); if (ret) { @@ -1223,14 +1251,14 @@ use_clean: } } - ret = bch2_fs_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu", - journal_seq, last_seq, blacklist_seq - 1) ?: + ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu", + journal_seq, last_seq, blacklist_seq - 1) ?: bch2_fs_journal_start(&c->journal, journal_seq); if (ret) goto err; if (c->opts.reconstruct_alloc) - bch2_fs_log_msg(c, "dropping alloc info"); + bch2_journal_log_msg(c, "dropping alloc info"); /* * 
Skip past versions that might have possibly been used (as nonces), diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c index 43d83705..6407d19e 100644 --- a/libbcachefs/subvolume.c +++ b/libbcachefs/subvolume.c @@ -714,7 +714,7 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work) void bch2_delete_dead_snapshots_async(struct bch_fs *c) { if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && - !queue_work(system_long_wq, &c->snapshot_delete_work)) + !queue_work(c->write_ref_wq, &c->snapshot_delete_work)) bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); } @@ -926,7 +926,7 @@ int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache)) return -EROFS; - if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) + if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); return 0; } diff --git a/libbcachefs/subvolume_types.h b/libbcachefs/subvolume_types.h index f7562b5d..aa49c45a 100644 --- a/libbcachefs/subvolume_types.h +++ b/libbcachefs/subvolume_types.h @@ -6,4 +6,16 @@ typedef DARRAY(u32) snapshot_id_list; +struct snapshot_t { + u32 parent; + u32 children[2]; + u32 subvol; /* Nonzero only if a subvolume points to this node: */ + u32 equiv; +}; + +typedef struct { + u32 subvol; + u64 inum; +} subvol_inum; + #endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 8a269b68..613d09f5 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -494,6 +494,8 @@ static void __bch2_fs_free(struct bch_fs *c) kfree(c->journal_seq_blacklist_table); kfree(c->unused_inode_hints); + if (c->write_ref_wq) + destroy_workqueue(c->write_ref_wq); if (c->io_complete_wq) destroy_workqueue(c->io_complete_wq); if (c->copygc_wq) @@ -709,6 +711,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) sema_init(&c->io_in_flight, 128); + INIT_LIST_HEAD(&c->vfs_inodes_list); + mutex_init(&c->vfs_inodes_lock); + c->copy_gc_enabled = 1; c->rebalance.enabled = 1; c->promote_whole_extents = true; @@ -784,6 +789,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || !(c->io_complete_wq = alloc_workqueue("bcachefs_io", WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) || + !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", + WQ_FREEZABLE, 0)) || #ifndef BCH_WRITE_REF_DEBUG percpu_ref_init(&c->writes, bch2_writes_disabled, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || @@ -1738,6 +1745,10 @@ int bch2_dev_online(struct bch_fs *c, const char *path) bch2_write_super(c); mutex_unlock(&c->sb_lock); + ret = bch2_fs_freespace_init(c); + if (ret) + bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret)); + up_write(&c->state_lock); return 0; err: diff --git a/libbcachefs/trace.c b/libbcachefs/trace.c index 70573981..6813147d 100644 --- a/libbcachefs/trace.c +++ b/libbcachefs/trace.c @@ -2,8 +2,10 @@ #include "bcachefs.h" #include "alloc_types.h" #include "buckets.h" +#include "btree_cache.h" #include "btree_iter.h" #include "btree_locking.h" +#include "btree_update_interior.h" #include "keylist.h" #include "opts.h" diff --git a/libbcachefs/util.c b/libbcachefs/util.c index 56c21c61..c50473d4 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -240,36 +240,6 @@ bool bch2_is_zero(const void *_p, 
size_t n) return true; } -static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v) -{ - unsigned i = 0; - - while (i < ARRAY_SIZE(q->entries)) { - struct bch2_quantile_entry *e = q->entries + i; - - if (unlikely(!e->step)) { - e->m = v; - e->step = max_t(unsigned, v / 2, 1024); - } else if (e->m > v) { - e->m = e->m >= e->step - ? e->m - e->step - : 0; - } else if (e->m < v) { - e->m = e->m + e->step > e->m - ? e->m + e->step - : U32_MAX; - } - - if ((e->m > v ? e->m - v : v - e->m) < e->step) - e->step = max_t(unsigned, e->step / 2, 1); - - if (v >= e->m) - break; - - i = eytzinger0_child(i, v > e->m); - } -} - void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits) { while (nr_bits) @@ -343,6 +313,36 @@ int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task) /* time stats: */ #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v) +{ + unsigned i = 0; + + while (i < ARRAY_SIZE(q->entries)) { + struct bch2_quantile_entry *e = q->entries + i; + + if (unlikely(!e->step)) { + e->m = v; + e->step = max_t(unsigned, v / 2, 1024); + } else if (e->m > v) { + e->m = e->m >= e->step + ? e->m - e->step + : 0; + } else if (e->m < v) { + e->m = e->m + e->step > e->m + ? e->m + e->step + : U32_MAX; + } + + if ((e->m > v ? e->m - v : v - e->m) < e->step) + e->step = max_t(unsigned, e->step / 2, 1); + + if (v >= e->m) + break; + + i = eytzinger0_child(i, v > e->m); + } +} + static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, u64 start, u64 end) { diff --git a/linux/bio.c b/linux/bio.c index 93a791c4..29d096d4 100644 --- a/linux/bio.c +++ b/linux/bio.c @@ -168,10 +168,10 @@ struct bio *bio_split(struct bio *bio, int sectors, void bio_free_pages(struct bio *bio) { struct bvec_iter_all iter; - struct bio_vec *bvec; + struct bio_vec bvec; bio_for_each_segment_all(bvec, bio, iter) - __free_page(bvec->bv_page); + __free_page(bvec.bv_page); } void bio_advance(struct bio *bio, unsigned bytes)
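The io.c and bio.c hunks above switch bio_for_each_segment_all() callers from a struct bio_vec pointer to a by-value struct bio_vec. A caller written against the updated convention looks roughly like this (a sketch assuming the linux/bio.h compatibility header from this tree; the helper name is made up):

#include <linux/bio.h>

/* sums segment lengths; illustrative only */
static unsigned bio_segments_bytes(struct bio *bio)
{
	struct bvec_iter_all iter;
	struct bio_vec bv;		/* by value - was 'struct bio_vec *bv' */
	unsigned bytes = 0;

	bio_for_each_segment_all(bv, bio, iter)
		bytes += bv.bv_len;	/* fields accessed with '.', not '->' */

	return bytes;
}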