From 5c811f012bfdab4a0428cd31356c8a441427d3b6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Nov 2025 13:29:10 -0500 Subject: [PATCH] Update bcachefs sources to ba3f652e4cdc bcachefs: Decrypt before checking if we read the right btree node Signed-off-by: Kent Overstreet --- .bcachefs_revision | 2 +- include/linux/bio.h | 42 +------ include/linux/bvec.h | 18 +-- libbcachefs/bcachefs.h | 5 +- libbcachefs/btree/check.c | 7 +- libbcachefs/btree/interior.c | 39 +++++-- libbcachefs/btree/node_scan.c | 33 +++--- libbcachefs/btree/read.c | 54 ++++----- libbcachefs/data/checksum.c | 11 +- libbcachefs/data/compress.c | 52 +++++---- libbcachefs/data/extents.c | 2 + libbcachefs/data/read.c | 29 +++-- libbcachefs/data/read.h | 2 + libbcachefs/data/write.c | 70 ++++++----- libbcachefs/data/write.h | 2 +- libbcachefs/debug/trace.h | 5 + libbcachefs/init/dev.c | 1 + libbcachefs/init/fs.c | 3 + libbcachefs/init/passes.c | 48 +++++--- libbcachefs/init/passes.h | 2 + libbcachefs/init/passes_format.h | 1 + libbcachefs/journal/read.h | 6 +- libbcachefs/journal/reclaim.c | 4 +- libbcachefs/journal/sb.c | 76 ++++++------ libbcachefs/sb/counters_format.h | 1 + libbcachefs/sb/errors.c | 25 ++-- libbcachefs/sb/errors_format.h | 10 +- libbcachefs/sb/members.c | 193 ++++++++++++++++++++++++++++--- libbcachefs/sb/members.h | 37 +++++- libbcachefs/util/darray.h | 3 + libbcachefs/util/util.c | 65 ++++------- libbcachefs/vfs/fs.c | 8 -- linux/bio.c | 51 ++------ linux/blkdev.c | 9 +- 34 files changed, 533 insertions(+), 383 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index b93e74cc..e79bddec 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -c53ba9651da768e74b787eb40bc05fd56e9ca5ef +ba3f652e4cdc86313cb13380efd59f1e6e6f484f diff --git a/include/linux/bio.h b/include/linux/bio.h index 6528fe19..7741b3e9 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -34,15 +34,9 @@ #define bio_iter_iovec(bio, iter) \ bvec_iter_bvec((bio)->bi_io_vec, (iter)) -#define bio_iter_page(bio, iter) \ - bvec_iter_page((bio)->bi_io_vec, (iter)) #define bio_iter_len(bio, iter) \ bvec_iter_len((bio)->bi_io_vec, (iter)) -#define bio_iter_offset(bio, iter) \ - bvec_iter_offset((bio)->bi_io_vec, (iter)) -#define bio_page(bio) bio_iter_page((bio), (bio)->bi_iter) -#define bio_offset(bio) bio_iter_offset((bio), (bio)->bi_iter) #define bio_iovec(bio) bio_iter_iovec((bio), (bio)->bi_iter) #define bio_multiple_segments(bio) \ @@ -99,20 +93,6 @@ static inline unsigned int bio_cur_bytes(struct bio *bio) return bio->bi_iter.bi_size; } -static inline void *bio_data(struct bio *bio) -{ - if (bio_has_data(bio)) - return page_address(bio_page(bio)) + bio_offset(bio); - - return NULL; -} - -#define __bio_kmap_atomic(bio, iter) \ - (kmap_atomic(bio_iter_iovec((bio), (iter)).bv_page) + \ - bio_iter_iovec((bio), (iter)).bv_offset) - -#define __bio_kunmap_atomic(addr) kunmap_atomic(addr) - static inline struct bio_vec *bio_next_segment(const struct bio *bio, struct bvec_iter_all *iter) { @@ -238,7 +218,6 @@ struct bio *bio_alloc_bioset(struct block_device *, unsigned, extern void bio_put(struct bio *); -int bio_add_page(struct bio *, struct page *, unsigned, unsigned); void bio_add_virt_nofail(struct bio *, void *, unsigned); static inline void bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned len) @@ -265,8 +244,6 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter); extern void bio_copy_data(struct bio *dst, struct bio *src); 
-void bio_free_pages(struct bio *bio); - void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter); static inline void zero_fill_bio(struct bio *bio) @@ -284,30 +261,13 @@ do { \ (dst)->bi_bdev = (src)->bi_bdev; \ } while (0) -static inline void *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) -{ - return page_address(bvec->bv_page) + bvec->bv_offset; -} - -static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) -{ - *flags = 0; -} - static inline void *bvec_kmap_local(struct bio_vec *bvec) { - return page_address(bvec->bv_page) + bvec->bv_offset; + return bvec_virt(bvec); } static inline void bvec_kunmap_local(char *buffer) {} -static inline void *__bio_kmap_irq(struct bio *bio, struct bvec_iter iter, - unsigned long *flags) -{ - return bvec_kmap_irq(&bio_iter_iovec(bio, iter), flags); -} -#define __bio_kunmap_irq(buf, flags) bvec_kunmap_irq(buf, flags) - #define bio_kmap_irq(bio, flags) \ __bio_kmap_irq((bio), (bio)->bi_iter, (flags)) #define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 5bc68b42..2bc26d80 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -27,9 +27,8 @@ * was unsigned short, but we might as well be ready for > 64kB I/O pages */ struct bio_vec { - struct page *bv_page; + void *bv_addr; unsigned int bv_len; - unsigned int bv_offset; }; struct bvec_iter { @@ -53,21 +52,22 @@ struct bvec_iter_all { */ #define __bvec_iter_bvec(bvec, iter) (&(bvec)[(iter).bi_idx]) -#define bvec_iter_page(bvec, iter) \ - (__bvec_iter_bvec((bvec), (iter))->bv_page) +static inline void *bvec_virt(struct bio_vec *bv) +{ + return bv->bv_addr; +} + +#define bvec_iter_addr(bvec, iter) \ + (__bvec_iter_bvec((bvec), (iter))->bv_addr + (iter).bi_bvec_done) #define bvec_iter_len(bvec, iter) \ min((iter).bi_size, \ __bvec_iter_bvec((bvec), (iter))->bv_len - (iter).bi_bvec_done) -#define bvec_iter_offset(bvec, iter) \ - (__bvec_iter_bvec((bvec), (iter))->bv_offset + (iter).bi_bvec_done) - #define bvec_iter_bvec(bvec, iter) \ ((struct bio_vec) { \ - .bv_page = bvec_iter_page((bvec), (iter)), \ + .bv_addr = bvec_iter_addr((bvec), (iter)), \ .bv_len = bvec_iter_len((bvec), (iter)), \ - .bv_offset = bvec_iter_offset((bvec), (iter)), \ }) static inline void bvec_iter_advance(const struct bio_vec *bv, diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 72039d3f..7fabd6cf 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -593,6 +593,7 @@ struct bch_dev { * Committed by bch2_write_super() -> bch_fs_mi_update() */ struct bch_member_cpu mi; + u64 btree_allocated_bitmap_gc; atomic64_t errors[BCH_MEMBER_ERROR_NR]; unsigned long write_errors_start; @@ -865,6 +866,8 @@ struct bch_fs { struct closure sb_write; struct mutex sb_lock; + struct delayed_work maybe_schedule_btree_bitmap_gc; + /* snapshot.c: */ struct snapshot_table __rcu *snapshots; struct mutex snapshot_table_lock; @@ -1037,7 +1040,7 @@ struct bch_fs { struct bio_set bio_write; struct bio_set replica_set; struct mutex bio_bounce_pages_lock; - mempool_t bio_bounce_pages; + mempool_t bio_bounce_bufs; struct bucket_nocow_lock_table nocow_locks; struct rhashtable promote_table; diff --git a/libbcachefs/btree/check.c b/libbcachefs/btree/check.c index 8dbf3d29..4d42007b 100644 --- a/libbcachefs/btree/check.c +++ b/libbcachefs/btree/check.c @@ -661,16 +661,13 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, atomic64_set(&c->key_version, k.k->bversion.lo); } - if 
(mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k), + if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked_nogc(c, k), trans, btree_bitmap_not_marked, "btree ptr not marked in member info btree allocated bitmap\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) { - guard(mutex)(&c->sb_lock); + buf.buf))) bch2_dev_btree_bitmap_mark(c, k); - bch2_write_super(c); - } /* * We require a commit before key_trigger() because diff --git a/libbcachefs/btree/interior.c b/libbcachefs/btree/interior.c index 3f98f2cc..7e978db3 100644 --- a/libbcachefs/btree/interior.c +++ b/libbcachefs/btree/interior.c @@ -639,10 +639,12 @@ static void btree_update_new_nodes_mark_sb(struct btree_update *as) struct bch_fs *c = as->c; guard(mutex)(&c->sb_lock); + bool write_sb = false; darray_for_each(as->new_nodes, i) - bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(&i->key)); + bch2_dev_btree_bitmap_mark_locked(c, bkey_i_to_s_c(&i->key), &write_sb); - bch2_write_super(c); + if (write_sb) + bch2_write_super(c); } static void bkey_strip_reconcile(const struct bch_fs *c, struct bkey_s k) @@ -2133,18 +2135,35 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, sib_u64s = btree_node_u64s_with_format(b->nr, &b->format, &new_f) + btree_node_u64s_with_format(m->nr, &m->format, &new_f); - if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) { - sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c); - sib_u64s /= 2; - sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c); + if (trace_btree_node_merge_attempt_enabled()) { + CLASS(printbuf, buf)(); + guard(printbuf_indent)(&buf); + + bch2_btree_pos_to_text(&buf, c, prev); + prt_printf(&buf, "live u64s %u (%zu%% full)\n", + prev->nr.live_u64s, + prev->nr.live_u64s * 100 / btree_max_u64s(c)); + + bch2_btree_pos_to_text(&buf, c, next); + prt_printf(&buf, "live u64s %u (%zu%% full)\n", + next->nr.live_u64s, + next->nr.live_u64s * 100 / btree_max_u64s(c)); + + prt_printf(&buf, "merged would have %zu threshold %u\n", + sib_u64s, c->btree_foreground_merge_threshold); + trace_btree_node_merge_attempt(c, buf.buf); } + count_event(c, btree_node_merge_attempt); - sib_u64s = min(sib_u64s, btree_max_u64s(c)); - sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1); - b->sib_u64s[sib] = sib_u64s; + if (sib_u64s > c->btree_foreground_merge_threshold) { + if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) + sib_u64s -= (sib_u64s - BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) / 2; - if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) + sib_u64s = min(sib_u64s, btree_max_u64s(c)); + sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1); + b->sib_u64s[sib] = sib_u64s; goto out; + } parent = btree_node_parent(trans->paths + path, b); as = bch2_btree_update_start(trans, trans->paths + path, level, false, diff --git a/libbcachefs/btree/node_scan.c b/libbcachefs/btree/node_scan.c index 2c1c74e5..7ac23e56 100644 --- a/libbcachefs/btree/node_scan.c +++ b/libbcachefs/btree/node_scan.c @@ -235,27 +235,30 @@ static int read_btree_nodes_worker(void *p) goto err; } + u64 buckets_to_scan = 0; for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++) + buckets_to_scan += c->sb.version_upgrade_complete < bcachefs_metadata_version_mi_btree_bitmap || + bch2_dev_btree_bitmap_marked_sectors_any(ca, bucket_to_sector(ca, bucket), ca->mi.bucket_size); + + u64 buckets_scanned = 0; + for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++) { + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap && + 
!bch2_dev_btree_bitmap_marked_sectors_any(ca, bucket_to_sector(ca, bucket), ca->mi.bucket_size)) + continue; + for (unsigned bucket_offset = 0; bucket_offset + btree_sectors(c) <= ca->mi.bucket_size; - bucket_offset += btree_sectors(c)) { - if (time_after(jiffies, last_print + HZ * 30)) { - u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset; - u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size; + bucket_offset += btree_sectors(c)) + try_read_btree_node(w->f, ca, b, bio, bucket_to_sector(ca, bucket) + bucket_offset); - bch_info(ca, "%s: %2u%% done", __func__, - (unsigned) div64_u64(cur_sector * 100, end_sector)); - last_print = jiffies; - } + buckets_scanned++; - u64 sector = bucket * ca->mi.bucket_size + bucket_offset; - - if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap && - !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c))) - continue; - - try_read_btree_node(w->f, ca, b, bio, sector); + if (time_after(jiffies, last_print + HZ * 30)) { + bch_info(ca, "%s: %2u%% done", __func__, + (unsigned) div64_u64(buckets_scanned * 100, buckets_to_scan)); + last_print = jiffies; } + } err: if (b) __btree_node_data_free(b); diff --git a/libbcachefs/btree/read.c b/libbcachefs/btree/read.c index 407c1c0f..668f2bae 100644 --- a/libbcachefs/btree/read.c +++ b/libbcachefs/btree/read.c @@ -660,33 +660,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, "bad magic: want %llx, got %llx", bset_magic(c), le64_to_cpu(b->data->magic)); - if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { - struct bch_btree_ptr_v2 *bp = - &bkey_i_to_btree_ptr_v2(&b->key)->v; - - bch2_bpos_to_text(&buf, b->data->min_key); - prt_str(&buf, "-"); - bch2_bpos_to_text(&buf, b->data->max_key); - - btree_err_on(b->data->keys.seq != bp->seq, - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, NULL, - btree_node_bad_seq, - "got wrong btree node: got\n%s", - (printbuf_reset(&buf), - bch2_btree_node_header_to_text(&buf, b->data), - buf.buf)); - } else { - btree_err_on(!b->data->keys.seq, - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, NULL, - btree_node_bad_seq, - "bad btree header: seq 0\n%s", - (printbuf_reset(&buf), - bch2_btree_node_header_to_text(&buf, b->data), - buf.buf)); - } - while (b->written < (ptr_written ?: btree_sectors(c))) { unsigned sectors; bool first = !b->written; @@ -743,6 +716,33 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, goto fsck_err; } + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bch_btree_ptr_v2 *bp = + &bkey_i_to_btree_ptr_v2(&b->key)->v; + + bch2_bpos_to_text(&buf, b->data->min_key); + prt_str(&buf, "-"); + bch2_bpos_to_text(&buf, b->data->max_key); + + btree_err_on(b->data->keys.seq != bp->seq, + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, NULL, NULL, + btree_node_bad_seq, + "got wrong btree node: got\n%s", + (printbuf_reset(&buf), + bch2_btree_node_header_to_text(&buf, b->data), + buf.buf)); + } else { + btree_err_on(!b->data->keys.seq, + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, NULL, NULL, + btree_node_bad_seq, + "bad btree header: seq 0\n%s", + (printbuf_reset(&buf), + bch2_btree_node_header_to_text(&buf, b->data), + buf.buf)); + } + btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), -BCH_ERR_btree_node_read_err_incompatible, diff --git a/libbcachefs/data/checksum.c b/libbcachefs/data/checksum.c index d0944b2e..7a6e9609 100644 --- a/libbcachefs/data/checksum.c +++ b/libbcachefs/data/checksum.c @@ 
-202,15 +202,14 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, #ifdef CONFIG_HIGHMEM __bio_for_each_segment(bv, bio, *iter, *iter) { - void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; + void *p = bvec_kmap_local(&bv); bch2_checksum_update(&state, p, bv.bv_len); kunmap_local(p); } #else __bio_for_each_bvec(bv, bio, *iter, *iter) - bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset, - bv.bv_len); + bch2_checksum_update(&state, bvec_virt(&bv), bv.bv_len); #endif return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; } @@ -225,16 +224,14 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, #ifdef CONFIG_HIGHMEM __bio_for_each_segment(bv, bio, *iter, *iter) { - void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; + void *p = bvec_kmap_local(&bv); poly1305_update(&dctx, p, bv.bv_len); kunmap_local(p); } #else __bio_for_each_bvec(bv, bio, *iter, *iter) - poly1305_update(&dctx, - page_address(bv.bv_page) + bv.bv_offset, - bv.bv_len); + poly1305_update(&dctx, bvec_virt(&bv), bv.bv_len); #endif poly1305_final(&dctx, digest); diff --git a/libbcachefs/data/compress.c b/libbcachefs/data/compress.c index 96071056..9bc94a3b 100644 --- a/libbcachefs/data/compress.c +++ b/libbcachefs/data/compress.c @@ -95,12 +95,12 @@ static bool bio_phys_contig(struct bio *bio, struct bvec_iter start) void *expected_start = NULL; __bio_for_each_bvec(bv, bio, iter, start) { - if (expected_start && - expected_start != page_address(bv.bv_page) + bv.bv_offset) + void *bv_addr = bvec_virt(&bv); + + if (expected_start && expected_start != bv_addr) return false; - expected_start = page_address(bv.bv_page) + - bv.bv_offset + bv.bv_len; + expected_start = bv_addr + bv.bv_len; } return true; @@ -109,27 +109,27 @@ static bool bio_phys_contig(struct bio *bio, struct bvec_iter start) static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, struct bvec_iter start, int rw) { - struct bio_vec bv; - struct bvec_iter iter; - unsigned nr_pages = 0; - struct page *stack_pages[16]; - struct page **pages = NULL; - void *data; - BUG_ON(start.bi_size > c->opts.encoded_extent_max); - if (!PageHighMem(bio_iter_page(bio, start)) && - bio_phys_contig(bio, start)) +#ifndef CONFIG_HIGHMEM + if (bio_phys_contig(bio, start)) return (struct bbuf) { .c = c, - .b = page_address(bio_iter_page(bio, start)) + - bio_iter_offset(bio, start), + .b = bvec_virt(&bio_iter_iovec(bio, start)), .type = BB_none, .rw = rw }; +#endif +#ifdef __KERNEL__ /* check if we can map the pages contiguously: */ + struct bio_vec bv; + struct bvec_iter iter; + unsigned nr_pages = 0; + __bio_for_each_segment(bv, bio, iter, start) { + BUG_ON(bv.bv_offset + bv.bv_len > PAGE_SIZE); + if (iter.bi_size != start.bi_size && bv.bv_offset) return bio_bounce(c, bio, start, rw); @@ -143,7 +143,8 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages); - pages = nr_pages > ARRAY_SIZE(stack_pages) + struct page *stack_pages[16]; + struct page **pages = nr_pages > ARRAY_SIZE(stack_pages) ? 
kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS) : stack_pages; if (!pages) @@ -153,19 +154,20 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, __bio_for_each_segment(bv, bio, iter, start) pages[nr_pages++] = bv.bv_page; - data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + void *data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); if (pages != stack_pages) kfree(pages); - if (!data) - return bio_bounce(c, bio, start, rw); + if (data) + return (struct bbuf) { + c, + data + bio_iter_offset(bio, start), + BB_vmap, + rw + }; +#endif /* __KERNEL__ */ - return (struct bbuf) { - c, - data + bio_iter_offset(bio, start), - BB_vmap, - rw - }; + return bio_bounce(c, bio, start, rw); } static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw) diff --git a/libbcachefs/data/extents.c b/libbcachefs/data/extents.c index 72d980fd..cfd92d87 100644 --- a/libbcachefs/data/extents.c +++ b/libbcachefs/data/extents.c @@ -28,6 +28,8 @@ #include "util/util.h" #ifdef CONFIG_BCACHEFS_DEBUG +#include + static int bch2_force_read_device = -1; module_param_named(force_read_device, bch2_force_read_device, int, 0644); diff --git a/libbcachefs/data/read.c b/libbcachefs/data/read.c index a919ee7d..e1adc8fe 100644 --- a/libbcachefs/data/read.c +++ b/libbcachefs/data/read.c @@ -344,7 +344,7 @@ err_remove_hash: BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, bch_promote_params)); err: - bio_free_pages(&op->write.op.wbio.bio); + bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio); /* We may have added to the rhashtable and thus need rcu freeing: */ kfree_rcu(op, rcu); err_put: @@ -1253,7 +1253,7 @@ retry_pick: &c->bio_read_split), orig); - bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); + bch2_bio_alloc_pages_pool(c, &rbio->bio, 512, sectors << 9); rbio->bounce = true; } else if (flags & BCH_READ_must_clone) { /* @@ -1591,16 +1591,29 @@ void bch2_fs_io_read_exit(struct bch_fs *c) rhashtable_destroy(&c->promote_table); bioset_exit(&c->bio_read_split); bioset_exit(&c->bio_read); - mempool_exit(&c->bio_bounce_pages); + mempool_exit(&c->bio_bounce_bufs); +} + +static void *bio_bounce_buf_alloc_fn(gfp_t gfp, void *pool_data) +{ + return (void *) __get_free_pages(gfp, PAGE_ALLOC_COSTLY_ORDER); +} + +static void bio_bounce_buf_free_fn(void *p, void *pool_data) +{ + free_pages((unsigned long) p, PAGE_ALLOC_COSTLY_ORDER); } int bch2_fs_io_read_init(struct bch_fs *c) { - if (mempool_init_page_pool(&c->bio_bounce_pages, - max_t(unsigned, - c->opts.btree_node_size, - c->opts.encoded_extent_max) / - PAGE_SIZE, 0)) + if (mempool_init(&c->bio_bounce_bufs, + max_t(unsigned, + c->opts.btree_node_size, + c->opts.encoded_extent_max) / + BIO_BOUNCE_BUF_POOL_LEN, + bio_bounce_buf_alloc_fn, + bio_bounce_buf_free_fn, + NULL)) return bch_err_throw(c, ENOMEM_bio_bounce_pages_init); if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), diff --git a/libbcachefs/data/read.h b/libbcachefs/data/read.h index b7543531..f24b6613 100644 --- a/libbcachefs/data/read.h +++ b/libbcachefs/data/read.h @@ -7,6 +7,8 @@ #include "extents_types.h" #include "data/reflink.h" +#define BIO_BOUNCE_BUF_POOL_LEN (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) + #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT void bch2_dev_congested_to_text(struct printbuf *, struct bch_dev *); #endif diff --git a/libbcachefs/data/write.c b/libbcachefs/data/write.c index 5b9957dd..68995bdd 100644 --- a/libbcachefs/data/write.c +++ b/libbcachefs/data/write.c @@ -113,42 +113,41 @@ void bch2_latency_acct(struct bch_dev 
*ca, u64 submit_time, int rw) void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) { - struct bvec_iter_all iter; - struct bio_vec *bv; + for (struct bio_vec *bv = bio->bi_io_vec; + bv < bio->bi_io_vec + bio->bi_vcnt; + bv++) { + void *p = bvec_virt(bv); - bio_for_each_segment_all(bv, bio, iter) - mempool_free(bv->bv_page, &c->bio_bounce_pages); + if (bv->bv_len == BIO_BOUNCE_BUF_POOL_LEN) + mempool_free(p, &c->bio_bounce_bufs); + else + free_pages((unsigned long) p, get_order(bv->bv_len)); + } bio->bi_vcnt = 0; } -static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) +static void __bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, + unsigned bs, size_t size) { - if (likely(!*using_mempool)) { - struct page *page = alloc_page(GFP_NOFS); - if (likely(page)) - return page; + mutex_lock(&c->bio_bounce_pages_lock); - mutex_lock(&c->bio_bounce_pages_lock); - *using_mempool = true; - } - return mempool_alloc(&c->bio_bounce_pages, GFP_NOFS); + while (bio->bi_iter.bi_size < size) + bio_add_virt_nofail(bio, + mempool_alloc(&c->bio_bounce_bufs, GFP_NOFS), + BIO_BOUNCE_BUF_POOL_LEN); + + bio->bi_iter.bi_size = min(bio->bi_iter.bi_size, size); + + mutex_unlock(&c->bio_bounce_pages_lock); } void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, - size_t size) + unsigned bs, size_t size) { - bool using_mempool = false; + bch2_bio_alloc_pages(bio, c->opts.block_size, size, GFP_NOFS); - while (size) { - struct page *page = __bio_alloc_page_pool(c, &using_mempool); - unsigned len = min_t(size_t, PAGE_SIZE, size); - - BUG_ON(!bio_add_page(bio, page, len, 0)); - size -= len; - } - - if (using_mempool) - mutex_unlock(&c->bio_bounce_pages_lock); + if (bio->bi_iter.bi_size < size) + __bch2_bio_alloc_pages_pool(c, bio, bs, size); } /* Extent update path: */ @@ -837,23 +836,22 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, return bio; } - wbio->bounce = true; + wbio->bounce = true; + /* * We can't use mempool for more than c->sb.encoded_extent_max * worth of pages, but we'd like to allocate more if we can: */ - bch2_bio_alloc_pages_pool(c, bio, - min_t(unsigned, output_available, - c->opts.encoded_extent_max)); + bch2_bio_alloc_pages(bio, + c->opts.block_size, + output_available, + GFP_NOFS); - if (bio->bi_iter.bi_size < output_available) - *page_alloc_failed = - bch2_bio_alloc_pages(bio, - c->opts.block_size, - output_available - - bio->bi_iter.bi_size, - GFP_NOFS) != 0; + unsigned required = min(output_available, c->opts.encoded_extent_max); + + if (unlikely(bio->bi_iter.bi_size < required)) + __bch2_bio_alloc_pages_pool(c, bio, c->opts.block_size, required); return bio; } diff --git a/libbcachefs/data/write.h b/libbcachefs/data/write.h index 6498b0c0..99a6bd7e 100644 --- a/libbcachefs/data/write.h +++ b/libbcachefs/data/write.h @@ -9,7 +9,7 @@ container_of((_bio), struct bch_write_bio, bio) void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); -void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); +void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, unsigned, size_t); void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, enum bch_data_type, const struct bkey_i *, bool); diff --git a/libbcachefs/debug/trace.h b/libbcachefs/debug/trace.h index c125bbd6..3a065d83 100644 --- a/libbcachefs/debug/trace.h +++ b/libbcachefs/debug/trace.h @@ -556,6 +556,11 @@ DEFINE_EVENT(fs_str, btree_node_rewrite, TP_ARGS(c, str) ); +DEFINE_EVENT(fs_str, btree_node_merge_attempt, + TP_PROTO(struct bch_fs *c, 
const char *str), + TP_ARGS(c, str) +); + DEFINE_EVENT(fs_str, btree_node_merge, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) diff --git a/libbcachefs/init/dev.c b/libbcachefs/init/dev.c index 4d65d0b0..3271c20e 100644 --- a/libbcachefs/init/dev.c +++ b/libbcachefs/init/dev.c @@ -331,6 +331,7 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]); ca->mi = bch2_mi_to_cpu(member); + ca->btree_allocated_bitmap_gc = le64_to_cpu(member->btree_allocated_bitmap); for (i = 0; i < ARRAY_SIZE(member->errors); i++) atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i])); diff --git a/libbcachefs/init/fs.c b/libbcachefs/init/fs.c index 55ce70ab..8eab7024 100644 --- a/libbcachefs/init/fs.c +++ b/libbcachefs/init/fs.c @@ -264,11 +264,13 @@ static void __bch2_fs_read_only(struct bch_fs *c) unsigned clean_passes = 0; u64 seq = 0; + bch2_maybe_schedule_btree_bitmap_gc_stop(c); bch2_fs_ec_stop(c); bch2_open_buckets_stop(c, NULL, true); bch2_reconcile_stop(c); bch2_copygc_stop(c); bch2_fs_ec_flush(c); + cancel_delayed_work_sync(&c->maybe_schedule_btree_bitmap_gc); bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu", journal_cur_seq(&c->journal)); @@ -524,6 +526,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_do_invalidates(c); bch2_do_stripe_deletes(c); bch2_do_pending_node_rewrites(c); + bch2_maybe_schedule_btree_bitmap_gc(c); return 0; } diff --git a/libbcachefs/init/passes.c b/libbcachefs/init/passes.c index 8c6b184a..dcfaac43 100644 --- a/libbcachefs/init/passes.c +++ b/libbcachefs/init/passes.c @@ -176,31 +176,43 @@ void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *c, } } -static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass) +static bool bch2_recovery_pass_entry_get_locked(struct bch_fs *c, enum bch_recovery_pass pass, + struct recovery_pass_entry *e) { - enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); - bool ret = false; - lockdep_assert_held(&c->sb_lock); struct bch_sb_field_recovery_passes *r = bch2_sb_field_get(c->disk_sb.sb, recovery_passes); - if (stable < recovery_passes_nr_entries(r)) { - struct recovery_pass_entry *i = r->start + stable; + enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); + bool found = stable < recovery_passes_nr_entries(r); + if (found) + *e = r->start[stable]; - /* - * Ratelimit if the last runtime was more than 1% of the time - * since we last ran - */ - ret = (u64) le32_to_cpu(i->last_runtime) * 100 > - ktime_get_real_seconds() - le64_to_cpu(i->last_run); + return found; +} - if (BCH_RECOVERY_PASS_NO_RATELIMIT(i)) - ret = false; - } +static bool bch2_recovery_pass_want_ratelimit_locked(struct bch_fs *c, enum bch_recovery_pass pass, + unsigned runtime_fraction) +{ + struct recovery_pass_entry e; + if (!bch2_recovery_pass_entry_get_locked(c, pass, &e)) + return false; - return ret; + /* + * Ratelimit if the last runtime was more than 1% of the time + * since we last ran + */ + return !BCH_RECOVERY_PASS_NO_RATELIMIT(&e) && + (u64) le32_to_cpu(e.last_runtime) * runtime_fraction > + ktime_get_real_seconds() - le64_to_cpu(e.last_run); +} + +bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass, + unsigned runtime_fraction) +{ + guard(mutex)(&c->sb_lock); + return bch2_recovery_pass_want_ratelimit_locked(c, pass, runtime_fraction); } const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes = { @@ -311,7 +323,7 @@ 
static bool recovery_pass_needs_set(struct bch_fs *c, *flags |= RUN_RECOVERY_PASS_nopersistent; if ((*flags & RUN_RECOVERY_PASS_ratelimit) && - !bch2_recovery_pass_want_ratelimit(c, pass)) + !bch2_recovery_pass_want_ratelimit_locked(c, pass, 100)) *flags &= ~RUN_RECOVERY_PASS_ratelimit; /* @@ -451,7 +463,7 @@ int bch2_require_recovery_pass(struct bch_fs *c, guard(mutex)(&c->sb_lock); - if (bch2_recovery_pass_want_ratelimit(c, pass)) + if (bch2_recovery_pass_want_ratelimit_locked(c, pass, 100)) return 0; enum bch_run_recovery_pass_flags flags = 0; diff --git a/libbcachefs/init/passes.h b/libbcachefs/init/passes.h index c37d7823..370fd1d2 100644 --- a/libbcachefs/init/passes.h +++ b/libbcachefs/init/passes.h @@ -46,6 +46,8 @@ static inline int bch2_recovery_cancelled(struct bch_fs *c) return 0; } +bool bch2_recovery_pass_want_ratelimit(struct bch_fs *, enum bch_recovery_pass, unsigned); + int bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); int __bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *, diff --git a/libbcachefs/init/passes_format.h b/libbcachefs/init/passes_format.h index 6001b931..ba4f7cd9 100644 --- a/libbcachefs/init/passes_format.h +++ b/libbcachefs/init/passes_format.h @@ -66,6 +66,7 @@ x(delete_dead_inodes, 32, PASS_ALWAYS) \ x(fix_reflink_p, 33, 0) \ x(set_fs_needs_reconcile, 34, 0) \ + x(btree_bitmap_gc, 46, PASS_ONLINE) \ x(lookup_root_inode, 42, PASS_ALWAYS|PASS_SILENT) /* We normally enumerate recovery passes in the order we run them: */ diff --git a/libbcachefs/journal/read.h b/libbcachefs/journal/read.h index 556a7ff1..2f97672a 100644 --- a/libbcachefs/journal/read.h +++ b/libbcachefs/journal/read.h @@ -63,10 +63,12 @@ void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *, int bch2_jset_validate(struct bch_fs *, struct bch_dev *, struct jset *, u64, enum bch_validate_flags); -struct u64_range { +typedef struct u64_range { u64 start; u64 end; -}; +} u64_range; + +DEFINE_DARRAY(u64_range); struct u64_range bch2_journal_entry_missing_range(struct bch_fs *, u64, u64); diff --git a/libbcachefs/journal/reclaim.c b/libbcachefs/journal/reclaim.c index 37ad8649..04a138e0 100644 --- a/libbcachefs/journal/reclaim.c +++ b/libbcachefs/journal/reclaim.c @@ -726,7 +726,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) * we're holding the reclaim lock: */ lockdep_assert_held(&j->reclaim_lock); - flags = memalloc_noreclaim_save(); + flags = memalloc_nofs_save(); do { if (kthread && kthread_should_stop()) @@ -780,7 +780,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) wake_up(&j->reclaim_wait); } while ((min_nr || min_key_cache) && nr_flushed && !direct); - memalloc_noreclaim_restore(flags); + memalloc_flags_restore(flags); return ret; } diff --git a/libbcachefs/journal/sb.c b/libbcachefs/journal/sb.c index df66b9d9..12ab92fe 100644 --- a/libbcachefs/journal/sb.c +++ b/libbcachefs/journal/sb.c @@ -2,6 +2,7 @@ #include "bcachefs.h" +#include "journal/read.h" #include "journal/sb.h" #include "util/darray.h" @@ -28,35 +29,33 @@ static int bch2_sb_journal_validate(struct bch_sb *sb, struct bch_sb_field *f, if (!nr) return 0; - u64 *b __free(kvfree) = kvmalloc_array(nr, sizeof(u64), GFP_KERNEL); - if (!b) - return -BCH_ERR_ENOMEM_sb_journal_validate; + CLASS(darray_u64, b)(); for (unsigned i = 0; i < nr; i++) - b[i] = le64_to_cpu(journal->buckets[i]); + try(darray_push(&b, le64_to_cpu(journal->buckets[i]))); - sort(b, nr, sizeof(u64), u64_cmp, NULL); + 
darray_sort(b, u64_cmp); - if (!b[0]) { + if (!darray_first(b)) { prt_printf(err, "journal bucket at sector 0"); return -BCH_ERR_invalid_sb_journal; } - if (b[0] < le16_to_cpu(m.first_bucket)) { + if (darray_first(b) < le16_to_cpu(m.first_bucket)) { prt_printf(err, "journal bucket %llu before first bucket %u", - b[0], le16_to_cpu(m.first_bucket)); + darray_first(b), le16_to_cpu(m.first_bucket)); return -BCH_ERR_invalid_sb_journal; } - if (b[nr - 1] >= le64_to_cpu(m.nbuckets)) { + if (darray_last(b) >= le64_to_cpu(m.nbuckets)) { prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", - b[nr - 1], le64_to_cpu(m.nbuckets)); + darray_last(b), le64_to_cpu(m.nbuckets)); return -BCH_ERR_invalid_sb_journal; } - for (unsigned i = 0; i + 1 < nr; i++) - if (b[i] == b[i + 1]) { - prt_printf(err, "duplicate journal buckets %llu", b[i]); + darray_for_each(b, i) + if (i != &darray_last(b) && i[0] == i[1]) { + prt_printf(err, "duplicate journal buckets %llu", *i); return -BCH_ERR_invalid_sb_journal; } @@ -80,11 +79,6 @@ const struct bch_sb_field_ops bch_sb_field_ops_journal = { .to_text = bch2_sb_journal_to_text, }; -struct u64_range { - u64 start; - u64 end; -}; - static int u64_range_cmp(const void *_l, const void *_r) { const struct u64_range *l = _l; @@ -104,15 +98,16 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f if (!nr) return 0; - struct u64_range *b __free(kvfree) = kvmalloc_array(nr, sizeof(*b), GFP_KERNEL); - if (!b) - return -BCH_ERR_ENOMEM_sb_journal_v2_validate; + CLASS(darray_u64_range, b)(); for (unsigned i = 0; i < nr; i++) { - b[i].start = le64_to_cpu(journal->d[i].start); - b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr); + struct u64_range r = { + .start = le64_to_cpu(journal->d[i].start), + .end = le64_to_cpu(journal->d[i].start) + + le64_to_cpu(journal->d[i].nr), + }; - if (b[i].end <= b[i].start) { + if (r.end <= r.start) { prt_printf(err, "journal buckets entry with bad nr: %llu+%llu", le64_to_cpu(journal->d[i].start), le64_to_cpu(journal->d[i].nr)); @@ -120,34 +115,34 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f } sum += le64_to_cpu(journal->d[i].nr); + try(darray_push(&b, r)); } - sort(b, nr, sizeof(*b), u64_range_cmp, NULL); + darray_sort(b, u64_range_cmp); - if (!b[0].start) { + if (!darray_first(b).start) { prt_printf(err, "journal bucket at sector 0"); return -BCH_ERR_invalid_sb_journal; } - if (b[0].start < le16_to_cpu(m.first_bucket)) { + if (darray_first(b).start < le16_to_cpu(m.first_bucket)) { prt_printf(err, "journal bucket %llu before first bucket %u", - b[0].start, le16_to_cpu(m.first_bucket)); + darray_first(b).start, le16_to_cpu(m.first_bucket)); return -BCH_ERR_invalid_sb_journal; } - if (b[nr - 1].end > le64_to_cpu(m.nbuckets)) { + if (darray_last(b).end > le64_to_cpu(m.nbuckets)) { prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", - b[nr - 1].end - 1, le64_to_cpu(m.nbuckets)); + darray_last(b).end - 1, le64_to_cpu(m.nbuckets)); return -BCH_ERR_invalid_sb_journal; } - for (unsigned i = 0; i + 1 < nr; i++) { - if (b[i].end > b[i + 1].start) { + darray_for_each(b, i) + if (i != &darray_last(b) && i[0].end > i[1].start) { prt_printf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu", - b[i].start, b[i].end, b[i + 1].start, b[i + 1].end); + i[0].start, i[0].end, i[1].start, i[1].end); return -BCH_ERR_invalid_sb_journal; } - } if (sum > UINT_MAX) { prt_printf(err, "too many journal buckets: %llu > %u", sum, UINT_MAX); @@ -179,11 
+174,9 @@ const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = { int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca, u64 *buckets, unsigned nr) { - struct bch_sb_field_journal_v2 *j; - unsigned i, dst = 0, nr_compacted = 1; + unsigned dst = 0, nr_compacted = 1; - if (c) - lockdep_assert_held(&c->sb_lock); + lockdep_assert_held(&c->sb_lock); if (!nr) { bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); @@ -191,11 +184,12 @@ int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca, return 0; } - for (i = 0; i + 1 < nr; i++) + for (unsigned i = 0; i + 1 < nr; i++) if (buckets[i] + 1 != buckets[i + 1]) nr_compacted++; - j = bch2_sb_field_resize(&ca->disk_sb, journal_v2, + struct bch_sb_field_journal_v2 *j = + bch2_sb_field_resize(&ca->disk_sb, journal_v2, (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64)); if (!j) return bch_err_throw(c, ENOSPC_sb_journal); @@ -205,7 +199,7 @@ int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca, j->d[dst].start = cpu_to_le64(buckets[0]); j->d[dst].nr = cpu_to_le64(1); - for (i = 1; i < nr; i++) { + for (unsigned i = 1; i < nr; i++) { if (buckets[i] == buckets[i - 1] + 1) { le64_add_cpu(&j->d[dst].nr, 1); } else { diff --git a/libbcachefs/sb/counters_format.h b/libbcachefs/sb/counters_format.h index a46f89dc..5a3a73f1 100644 --- a/libbcachefs/sb/counters_format.h +++ b/libbcachefs/sb/counters_format.h @@ -55,6 +55,7 @@ enum counters_flags { x(btree_node_read, 14, TYPE_COUNTER) \ x(btree_node_compact, 15, TYPE_COUNTER) \ x(btree_node_merge, 16, TYPE_COUNTER) \ + x(btree_node_merge_attempt, 101, TYPE_COUNTER) \ x(btree_node_split, 17, TYPE_COUNTER) \ x(btree_node_rewrite, 18, TYPE_COUNTER) \ x(btree_node_alloc, 19, TYPE_COUNTER) \ diff --git a/libbcachefs/sb/errors.c b/libbcachefs/sb/errors.c index 48851b87..aec5dcdd 100644 --- a/libbcachefs/sb/errors.c +++ b/libbcachefs/sb/errors.c @@ -5,6 +5,8 @@ #include "sb/errors.h" #include "sb/io.h" +#include "util/darray.h" + const char * const bch2_sb_error_strs[] = { #define x(t, n, ...) 
[n] = #t, BCH_SB_ERRS() @@ -63,25 +65,25 @@ static int error_entry_cmp(const void *_l, const void *_r) return -cmp_int(l->last_error_time, r->last_error_time); } +DEFINE_DARRAY(bch_sb_field_error_entry); + static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) { struct bch_sb_field_errors *e = field_to_type(f, errors); unsigned nr = bch2_sb_field_errors_nr_entries(e); - struct bch_sb_field_error_entry *sorted = kvmalloc_array(nr, sizeof(*sorted), GFP_KERNEL); - - if (sorted) { - memcpy(sorted, e->entries, nr * sizeof(e->entries[0])); - sort(sorted, nr, sizeof(*sorted), error_entry_cmp, NULL); - } else { - sorted = e->entries; - } - if (out->nr_tabstops <= 1) printbuf_tabstop_push(out, 16); - for (struct bch_sb_field_error_entry *i = sorted; i < sorted + nr; i++) { + CLASS(darray_bch_sb_field_error_entry, sorted)(); + + for (struct bch_sb_field_error_entry *i = e->entries; i < e->entries + nr; i++) + darray_push(&sorted, *i); + + darray_sort(sorted, error_entry_cmp); + + darray_for_each(sorted, i) { bch2_sb_error_id_to_text(out, BCH_SB_ERROR_ENTRY_ID(i)); prt_tab(out); prt_u64(out, BCH_SB_ERROR_ENTRY_NR(i)); @@ -89,9 +91,6 @@ static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb, bch2_prt_datetime(out, le64_to_cpu(i->last_error_time)); prt_newline(out); } - - if (sorted != e->entries) - kvfree(sorted); } const struct bch_sb_field_ops bch_sb_field_ops_errors = { diff --git a/libbcachefs/sb/errors_format.h b/libbcachefs/sb/errors_format.h index 9958634e..3892a528 100644 --- a/libbcachefs/sb/errors_format.h +++ b/libbcachefs/sb/errors_format.h @@ -360,12 +360,14 @@ enum bch_sb_error_id { #undef x }; +typedef struct bch_sb_field_error_entry { + __le64 v; + __le64 last_error_time; +} bch_sb_field_error_entry; + struct bch_sb_field_errors { struct bch_sb_field field; - struct bch_sb_field_error_entry { - __le64 v; - __le64 last_error_time; - } entries[]; + bch_sb_field_error_entry entries[]; }; LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16); diff --git a/libbcachefs/sb/members.c b/libbcachefs/sb/members.c index af345115..e95bbc5e 100644 --- a/libbcachefs/sb/members.c +++ b/libbcachefs/sb/members.c @@ -2,16 +2,19 @@ #include "bcachefs.h" +#include "alloc/buckets.h" #include "alloc/disk_groups.h" #include "alloc/replicas.h" #include "btree/cache.h" +#include "btree/iter.h" #include "sb/members.h" #include "sb/io.h" #include "init/error.h" #include "init/passes.h" +#include "init/progress.h" int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev) { @@ -512,35 +515,54 @@ void bch2_dev_errors_reset(struct bch_dev *ca) * have to scan full devices: */ -bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k) +static bool __bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k, bool with_gc) { guard(rcu)(); bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); if (ca && - !bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) + !__bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c), with_gc)) return false; } return true; } -static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev, - u64 start, unsigned sectors) +bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k) { - struct bch_member *m = __bch2_members_v2_get_mut(mi, dev); - u64 bitmap = 
le64_to_cpu(m->btree_allocated_bitmap); + return __bch2_dev_btree_bitmap_marked(c, k, true); +} + +bool bch2_dev_btree_bitmap_marked_nogc(struct bch_fs *c, struct bkey_s_c k) +{ + return __bch2_dev_btree_bitmap_marked(c, k, false); +} + +static void __bch2_dev_btree_bitmap_mark(struct bch_dev *ca, + struct bch_sb_field_members_v2 *mi, + u64 start, unsigned sectors, bool *write_sb) +{ + struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx); u64 end = start + sectors; int resize = ilog2(roundup_pow_of_two(end)) - (m->btree_bitmap_shift + 6); if (resize > 0) { + u64 old_bitmap = le64_to_cpu(m->btree_allocated_bitmap); u64 new_bitmap = 0; + u64 new_gc_bitmap = 0; - for (unsigned i = 0; i < 64; i++) - if (bitmap & BIT_ULL(i)) + for (unsigned i = 0; i < 64; i++) { + if (old_bitmap & BIT_ULL(i)) new_bitmap |= BIT_ULL(i >> resize); - bitmap = new_bitmap; + if (ca->btree_allocated_bitmap_gc & BIT_ULL(i)) + new_gc_bitmap |= BIT_ULL(i >> resize); + } + + m->btree_allocated_bitmap = cpu_to_le64(new_bitmap); m->btree_bitmap_shift += resize; + *write_sb = true; + + ca->btree_allocated_bitmap_gc = new_gc_bitmap; } BUG_ON(m->btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX); @@ -548,25 +570,164 @@ static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, uns for (unsigned bit = start >> m->btree_bitmap_shift; (u64) bit << m->btree_bitmap_shift < end; - bit++) - bitmap |= BIT_ULL(bit); + bit++) { + __le64 b = cpu_to_le64(BIT_ULL(bit)); - m->btree_allocated_bitmap = cpu_to_le64(bitmap); + if (!(m->btree_allocated_bitmap & b)) { + m->btree_allocated_bitmap |= b; + *write_sb = true; + } + + ca->btree_allocated_bitmap_gc |= BIT_ULL(bit); + } } -void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k) +void bch2_dev_btree_bitmap_mark_locked(struct bch_fs *c, struct bkey_s_c k, bool *write_sb) { lockdep_assert_held(&c->sb_lock); struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + + guard(rcu)(); bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { - if (!bch2_member_exists(c->disk_sb.sb, ptr->dev)) + struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); + if (!ca) continue; - __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c)); + __bch2_dev_btree_bitmap_mark(ca, mi, ptr->offset, btree_sectors(c), write_sb); } } +void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k) +{ + guard(mutex)(&c->sb_lock); + bool write_sb = false; + bch2_dev_btree_bitmap_mark_locked(c, k, &write_sb); + if (write_sb) + bch2_write_super(c); +} + +static int btree_bitmap_gc_btree_level(struct btree_trans *trans, + struct progress_indicator *progress, + enum btree_id btree, unsigned level) +{ + struct bch_fs *c = trans->c; + CLASS(btree_node_iter, iter)(trans, btree, POS_MIN, 0, level, BTREE_ITER_prefetch); + + try(for_each_btree_key_continue(trans, iter, 0, k, ({ + if (!bch2_dev_btree_bitmap_marked(c, k)) + bch2_dev_btree_bitmap_mark(c, k); + + bch2_progress_update_iter(trans, progress, &iter, "btree_bitmap_gc"); + }))); + + return 0; +} + +int bch2_btree_bitmap_gc(struct bch_fs *c) +{ + struct progress_indicator progress; + bch2_progress_init_inner(&progress, c, 0, ~0ULL); + + scoped_guard(mutex, &c->sb_lock) { + guard(rcu)(); + for_each_member_device_rcu(c, ca, NULL) + ca->btree_allocated_bitmap_gc = 0; + } + + { + CLASS(btree_trans, trans)(c); + + for (unsigned btree = 0; btree < btree_id_nr_alive(c); btree++) { + for (unsigned level = 1; level < BTREE_MAX_DEPTH; level++) + try(btree_bitmap_gc_btree_level(trans, &progress, btree, 
level)); + + CLASS(btree_node_iter, iter)(trans, btree, POS_MIN, 0, + bch2_btree_id_root(c, btree)->b->c.level, 0); + struct btree *b; + try(lockrestart_do(trans, PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(&iter)))); + + if (!bch2_dev_btree_bitmap_marked(c, bkey_i_to_s_c(&b->key))) + bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(&b->key)); + } + } + + u64 sectors_marked_old = 0, sectors_marked_new = 0; + + scoped_guard(mutex, &c->sb_lock) { + struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + + scoped_guard(rcu) + for_each_member_device_rcu(c, ca, NULL) { + sectors_marked_old += hweight64(ca->mi.btree_allocated_bitmap) << ca->mi.btree_bitmap_shift; + sectors_marked_new += hweight64(ca->btree_allocated_bitmap_gc) << ca->mi.btree_bitmap_shift; + + struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx); + m->btree_allocated_bitmap = cpu_to_le64(ca->btree_allocated_bitmap_gc); + } + bch2_write_super(c); + } + + CLASS(printbuf, buf)(); + prt_str(&buf, "mi_btree_bitmap sectors "); + prt_human_readable_u64(&buf, sectors_marked_old << 9); + prt_str(&buf, " -> "); + prt_human_readable_u64(&buf, sectors_marked_new << 9); + bch_info(c, "%s", buf.buf); + + return 0; +} + +static void bch2_maybe_schedule_btree_bitmap_gc_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, maybe_schedule_btree_bitmap_gc.work); + + if (bch2_recovery_pass_want_ratelimit(c, BCH_RECOVERY_PASS_btree_bitmap_gc, 1000)) + return; + + CLASS(printbuf, buf)(); + bch2_log_msg_start(c, &buf); + + bool want_schedule = false; + for_each_member_device(c, ca) { + struct bch_dev_usage u; + bch2_dev_usage_read_fast(ca, &u); + + u64 btree_sectors = bucket_to_sector(ca, u.buckets[BCH_DATA_btree]); + u64 bitmap_sectors = hweight64(ca->mi.btree_allocated_bitmap) << ca->mi.btree_bitmap_shift; + + if (btree_sectors * 4 < bitmap_sectors) { + prt_printf(&buf, "%s has ", ca->name); + prt_human_readable_u64(&buf, btree_sectors << 9); + prt_printf(&buf, " btree buckets and "); + prt_human_readable_u64(&buf, bitmap_sectors << 9); + prt_printf(&buf, " marked in bitmap\n"); + want_schedule = true; + } + } + + if (want_schedule) { + bch2_run_explicit_recovery_pass(c, &buf, + BCH_RECOVERY_PASS_btree_bitmap_gc, + RUN_RECOVERY_PASS_ratelimit); + bch2_print_str(c, KERN_NOTICE, buf.buf); + } + + queue_delayed_work(system_long_wq, &c->maybe_schedule_btree_bitmap_gc, HZ * 60 * 60 * 24); +} + +void bch2_maybe_schedule_btree_bitmap_gc_stop(struct bch_fs *c) +{ + cancel_delayed_work_sync(&c->maybe_schedule_btree_bitmap_gc); +} + +void bch2_maybe_schedule_btree_bitmap_gc(struct bch_fs *c) +{ + INIT_DELAYED_WORK(&c->maybe_schedule_btree_bitmap_gc, + bch2_maybe_schedule_btree_bitmap_gc_work); + bch2_maybe_schedule_btree_bitmap_gc_work(&c->maybe_schedule_btree_bitmap_gc.work); +} + unsigned bch2_sb_nr_devices(const struct bch_sb *sb) { unsigned nr = 0; diff --git a/libbcachefs/sb/members.h b/libbcachefs/sb/members.h index d685ac48..e0b87038 100644 --- a/libbcachefs/sb/members.h +++ b/libbcachefs/sb/members.h @@ -389,7 +389,8 @@ void bch2_sb_members_to_cpu(struct bch_fs *); void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *); void bch2_dev_errors_reset(struct bch_dev *); -static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start, unsigned sectors) +static inline bool __bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start, + unsigned sectors, bool with_gc) { u64 end = start + sectors; @@ -399,14 +400,46 @@ static inline bool 
bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 for (unsigned bit = start >> ca->mi.btree_bitmap_shift; (u64) bit << ca->mi.btree_bitmap_shift < end; bit++) - if (!(ca->mi.btree_allocated_bitmap & BIT_ULL(bit))) + if (!(BIT_ULL(bit) & + ca->mi.btree_allocated_bitmap & + (with_gc + ? ca->btree_allocated_bitmap_gc + : ~0ULL))) return false; return true; } +static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start, + unsigned sectors) +{ + return __bch2_dev_btree_bitmap_marked_sectors(ca, start, sectors, false); +} + +static inline bool bch2_dev_btree_bitmap_marked_sectors_any(struct bch_dev *ca, u64 start, unsigned sectors) +{ + u64 end = start + sectors; + + if (start >= 64ULL << ca->mi.btree_bitmap_shift) + return false; + + for (unsigned bit = start >> ca->mi.btree_bitmap_shift; + (u64) bit << ca->mi.btree_bitmap_shift < end; + bit++) + if (ca->mi.btree_allocated_bitmap & BIT_ULL(bit)) + return true; + return false; +} + bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c); +bool bch2_dev_btree_bitmap_marked_nogc(struct bch_fs *, struct bkey_s_c); + +void bch2_dev_btree_bitmap_mark_locked(struct bch_fs *, struct bkey_s_c, bool *); void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c); +int bch2_btree_bitmap_gc(struct bch_fs *); +void bch2_maybe_schedule_btree_bitmap_gc_stop(struct bch_fs *); +void bch2_maybe_schedule_btree_bitmap_gc(struct bch_fs *); + int bch2_sb_member_alloc(struct bch_fs *); void bch2_sb_members_clean_deleted(struct bch_fs *); diff --git a/libbcachefs/util/darray.h b/libbcachefs/util/darray.h index e0b84572..94d4ab3f 100644 --- a/libbcachefs/util/darray.h +++ b/libbcachefs/util/darray.h @@ -125,6 +125,9 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t, bool); #define darray_for_each_reverse(_d, _i) \ for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i) +#define darray_sort(_d, _cmp) \ + sort((_d).data, (_d).nr, sizeof((_d).data[0]), _cmp, NULL) + /* Init/exit */ #define darray_init(_d) \ diff --git a/libbcachefs/util/util.c b/libbcachefs/util/util.c index 4314eec9..cd743e74 100644 --- a/libbcachefs/util/util.c +++ b/libbcachefs/util/util.c @@ -606,49 +606,32 @@ void bch2_bio_map(struct bio *bio, void *base, size_t size) int bch2_bio_alloc_pages(struct bio *bio, unsigned bs, size_t size, gfp_t gfp_mask) { + BUG_ON(!is_power_of_2(bs)); BUG_ON(size & (bs - 1)); - unsigned bs_pages = DIV_ROUND_UP(bs, PAGE_SIZE); - /* - * XXX: we could do this by allocating higher order pages, but - * - * - the page allocator gets slower at a certain order (5?) 
- we'd have - * to check for this - * - * - bch2_bio_free_pages_pool() probably does not handle compound pages - * yet - */ - DARRAY_PREALLOCATED(struct page *, 16) pages; - darray_init(&pages); - darray_make_room_gfp(&pages, bs_pages, gfp_mask|__GFP_NOFAIL); + unsigned max_alloc = max(bs, PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER); - int ret = 0; - while (size) { - while (pages.nr < bs_pages) { - struct page *page = alloc_pages(gfp_mask, 0); - if (!page) { - ret = -ENOMEM; - goto out; - } + while (bio->bi_iter.bi_size < size) { + unsigned b = min(size - bio->bi_iter.bi_size, max_alloc); - BUG_ON(darray_push(&pages, page)); - } + BUG_ON(b & (bs - 1)); - while (pages.nr) { - BUG_ON(!size); +#ifdef __KERNEL__ + /* + * we don't know the device dma alignment, so in kernel make + * sure allocations are page aligned + */ + void *p = (void *) __get_free_pages(gfp_mask, get_order(b)); +#else + void *p = kmalloc(b, gfp_mask); +#endif + if (!p) + return -ENOMEM; - unsigned len = min(PAGE_SIZE, size); - size -= len; - - struct page *page = darray_pop(&pages); - BUG_ON(!bio_add_page(bio, page, len, 0)); - } + bio_add_virt_nofail(bio, p, b); } -out: - darray_for_each(pages, i) - __free_page(*i); - darray_exit(&pages); - return ret; + + return 0; } u64 bch2_get_random_u64_below(u64 ceil) @@ -678,9 +661,8 @@ void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) struct bvec_iter iter; __bio_for_each_segment(bv, dst, iter, dst_iter) { - void *dstp = kmap_local_page(bv.bv_page); - - memcpy(dstp + bv.bv_offset, src, bv.bv_len); + void *dstp = bvec_kmap_local(&bv); + memcpy(dstp, src, bv.bv_len); kunmap_local(dstp); src += bv.bv_len; @@ -693,9 +675,8 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) struct bvec_iter iter; __bio_for_each_segment(bv, src, iter, src_iter) { - void *srcp = kmap_local_page(bv.bv_page); - - memcpy(dst, srcp + bv.bv_offset, bv.bv_len); + void *srcp = bvec_kmap_local(&bv); + memcpy(dst, srcp, bv.bv_len); kunmap_local(srcp); dst += bv.bv_len; diff --git a/libbcachefs/vfs/fs.c b/libbcachefs/vfs/fs.c index 4bd1a3eb..dad4cb01 100644 --- a/libbcachefs/vfs/fs.c +++ b/libbcachefs/vfs/fs.c @@ -422,14 +422,6 @@ retry: } } -#define memalloc_flags_do(_flags, _do) \ -({ \ - unsigned _saved_flags = memalloc_flags_save(_flags); \ - typeof(_do) _ret = _do; \ - memalloc_noreclaim_restore(_saved_flags); \ - _ret; \ -}) - static struct inode *bch2_alloc_inode(struct super_block *sb) { BUG(); diff --git a/linux/bio.c b/linux/bio.c index 65d186b3..50dec656 100644 --- a/linux/bio.c +++ b/linux/bio.c @@ -64,27 +64,13 @@ const char *blk_status_to_str(blk_status_t status) void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter) { - struct bio_vec src_bv, dst_bv; - void *src_p, *dst_p; - unsigned bytes; - while (src_iter->bi_size && dst_iter->bi_size) { - src_bv = bio_iter_iovec(src, *src_iter); - dst_bv = bio_iter_iovec(dst, *dst_iter); + struct bio_vec src_bv = bio_iter_iovec(src, *src_iter); + struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter); - bytes = min(src_bv.bv_len, dst_bv.bv_len); + unsigned bytes = min(src_bv.bv_len, dst_bv.bv_len); - src_p = kmap_atomic(src_bv.bv_page); - dst_p = kmap_atomic(dst_bv.bv_page); - - memcpy(dst_p + dst_bv.bv_offset, - src_p + src_bv.bv_offset, - bytes); - - kunmap_atomic(dst_p); - kunmap_atomic(src_p); - - flush_dcache_page(dst_bv.bv_page); + memcpy(dst_bv.bv_addr, src_bv.bv_addr, bytes); bio_advance_iter(src, src_iter, bytes); bio_advance_iter(dst, 
dst_iter, bytes); @@ -109,15 +95,11 @@ void bio_copy_data(struct bio *dst, struct bio *src) void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) { - unsigned long flags; struct bio_vec bv; struct bvec_iter iter; - __bio_for_each_segment(bv, bio, iter, start) { - char *data = bvec_kmap_irq(&bv, &flags); - memset(data, 0, bv.bv_len); - bvec_kunmap_irq(data, &flags); - } + __bio_for_each_segment(bv, bio, iter, start) + memset(bv.bv_addr, 0, bv.bv_len); } static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) @@ -165,15 +147,6 @@ struct bio *bio_split(struct bio *bio, int sectors, return split; } -void bio_free_pages(struct bio *bio) -{ - struct bvec_iter_all iter; - struct bio_vec *bvec; - - bio_for_each_segment_all(bvec, bio, iter) - __free_page(bvec->bv_page); -} - void bio_advance(struct bio *bio, unsigned bytes) { bio_advance_iter(bio, &bio->bi_iter, bytes); @@ -208,26 +181,18 @@ void bio_put(struct bio *bio) } } -int bio_add_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int off) +void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len) { struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); WARN_ON_ONCE(bio->bi_vcnt >= bio->bi_max_vecs); - bv->bv_page = page; - bv->bv_offset = off; + bv->bv_addr = vaddr; bv->bv_len = len; bio->bi_iter.bi_size += len; bio->bi_vcnt++; - return len; -} - -void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len) -{ - bio_add_page(bio, virt_to_page(vaddr), len, offset_in_page(vaddr)); } static inline bool bio_remaining_done(struct bio *bio) diff --git a/linux/blkdev.c b/linux/blkdev.c index 3569594f..90249337 100644 --- a/linux/blkdev.c +++ b/linux/blkdev.c @@ -59,18 +59,15 @@ void generic_make_request(struct bio *bio) i = 0; bio_for_each_segment(bv, bio, iter) { - void *start = page_address(bv.bv_page) + bv.bv_offset; - size_t len = bv.bv_len; - iov[i++] = (struct iovec) { - .iov_base = start, - .iov_len = len, + .iov_base = bv.bv_addr, + .iov_len = bv.bv_len, }; #ifdef CONFIG_VALGRIND /* To be pedantic it should only be on IO completion. */ if (bio_op(bio) == REQ_OP_READ) - VALGRIND_MAKE_MEM_DEFINED(start, len); + VALGRIND_MAKE_MEM_DEFINED(bv.bv_addr, bv.bv_len); #endif }
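The shim-side theme of this patch is dropping the page + offset pair from struct bio_vec in favour of a single kernel virtual address (bv_addr), with bvec_virt() as the accessor, so helpers like zero_fill_bio_iter() and bio_copy_data_iter() can memset/memcpy on the address directly. A minimal standalone sketch of that representation, with simplified names (struct vec, zero_fill) that are illustrative rather than the shim's actual types:

#include <string.h>

/* Simplified stand-in for the post-patch bio_vec: one address, one length,
 * no struct page / bv_offset pair. */
struct vec {
	void		*addr;
	unsigned int	len;
};

/* Zero every segment, the way zero_fill_bio_iter() now just does
 * memset(bv.bv_addr, 0, bv.bv_len) per segment. */
static void zero_fill(struct vec *v, unsigned nr)
{
	for (unsigned i = 0; i < nr; i++)
		memset(v[i].addr, 0, v[i].len);
}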