Update bcachefs sources to ba3f652e4cdc bcachefs: Decrypt before checking if we read the right btree node

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2025-11-20 13:29:10 -05:00
parent 969305f122
commit 5c811f012b
34 changed files with 533 additions and 383 deletions

View File

@ -1 +1 @@
c53ba9651da768e74b787eb40bc05fd56e9ca5ef
ba3f652e4cdc86313cb13380efd59f1e6e6f484f

View File

@ -34,15 +34,9 @@
#define bio_iter_iovec(bio, iter) \
bvec_iter_bvec((bio)->bi_io_vec, (iter))
#define bio_iter_page(bio, iter) \
bvec_iter_page((bio)->bi_io_vec, (iter))
#define bio_iter_len(bio, iter) \
bvec_iter_len((bio)->bi_io_vec, (iter))
#define bio_iter_offset(bio, iter) \
bvec_iter_offset((bio)->bi_io_vec, (iter))
#define bio_page(bio) bio_iter_page((bio), (bio)->bi_iter)
#define bio_offset(bio) bio_iter_offset((bio), (bio)->bi_iter)
#define bio_iovec(bio) bio_iter_iovec((bio), (bio)->bi_iter)
#define bio_multiple_segments(bio) \
@ -99,20 +93,6 @@ static inline unsigned int bio_cur_bytes(struct bio *bio)
return bio->bi_iter.bi_size;
}
static inline void *bio_data(struct bio *bio)
{
if (bio_has_data(bio))
return page_address(bio_page(bio)) + bio_offset(bio);
return NULL;
}
#define __bio_kmap_atomic(bio, iter) \
(kmap_atomic(bio_iter_iovec((bio), (iter)).bv_page) + \
bio_iter_iovec((bio), (iter)).bv_offset)
#define __bio_kunmap_atomic(addr) kunmap_atomic(addr)
static inline struct bio_vec *bio_next_segment(const struct bio *bio,
struct bvec_iter_all *iter)
{
@ -238,7 +218,6 @@ struct bio *bio_alloc_bioset(struct block_device *, unsigned,
extern void bio_put(struct bio *);
int bio_add_page(struct bio *, struct page *, unsigned, unsigned);
void bio_add_virt_nofail(struct bio *, void *, unsigned);
static inline void bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned len)
@ -265,8 +244,6 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
struct bio *src, struct bvec_iter *src_iter);
extern void bio_copy_data(struct bio *dst, struct bio *src);
void bio_free_pages(struct bio *bio);
void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter);
static inline void zero_fill_bio(struct bio *bio)
@ -284,30 +261,13 @@ do { \
(dst)->bi_bdev = (src)->bi_bdev; \
} while (0)
static inline void *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
{
return page_address(bvec->bv_page) + bvec->bv_offset;
}
static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
{
*flags = 0;
}
static inline void *bvec_kmap_local(struct bio_vec *bvec)
{
return page_address(bvec->bv_page) + bvec->bv_offset;
return bvec_virt(bvec);
}
static inline void bvec_kunmap_local(char *buffer) {}
static inline void *__bio_kmap_irq(struct bio *bio, struct bvec_iter iter,
unsigned long *flags)
{
return bvec_kmap_irq(&bio_iter_iovec(bio, iter), flags);
}
#define __bio_kunmap_irq(buf, flags) bvec_kunmap_irq(buf, flags)
#define bio_kmap_irq(bio, flags) \
__bio_kmap_irq((bio), (bio)->bi_iter, (flags))
#define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags)

View File

@ -27,9 +27,8 @@
* was unsigned short, but we might as well be ready for > 64kB I/O pages
*/
struct bio_vec {
struct page *bv_page;
void *bv_addr;
unsigned int bv_len;
unsigned int bv_offset;
};
struct bvec_iter {
@ -53,21 +52,22 @@ struct bvec_iter_all {
*/
#define __bvec_iter_bvec(bvec, iter) (&(bvec)[(iter).bi_idx])
#define bvec_iter_page(bvec, iter) \
(__bvec_iter_bvec((bvec), (iter))->bv_page)
/*
 * bvec_virt - return the address recorded in a bio_vec
 * @bv: bio_vec to read
 *
 * Returns bv->bv_addr unchanged; nothing is added to it and nothing is mapped.
 */
static inline void *bvec_virt(struct bio_vec *bv)
{
return bv->bv_addr;
}
#define bvec_iter_addr(bvec, iter) \
(__bvec_iter_bvec((bvec), (iter))->bv_addr + (iter).bi_bvec_done)
#define bvec_iter_len(bvec, iter) \
min((iter).bi_size, \
__bvec_iter_bvec((bvec), (iter))->bv_len - (iter).bi_bvec_done)
#define bvec_iter_offset(bvec, iter) \
(__bvec_iter_bvec((bvec), (iter))->bv_offset + (iter).bi_bvec_done)
#define bvec_iter_bvec(bvec, iter) \
((struct bio_vec) { \
.bv_page = bvec_iter_page((bvec), (iter)), \
.bv_addr = bvec_iter_addr((bvec), (iter)), \
.bv_len = bvec_iter_len((bvec), (iter)), \
.bv_offset = bvec_iter_offset((bvec), (iter)), \
})
static inline void bvec_iter_advance(const struct bio_vec *bv,

View File

@ -593,6 +593,7 @@ struct bch_dev {
* Committed by bch2_write_super() -> bch_fs_mi_update()
*/
struct bch_member_cpu mi;
u64 btree_allocated_bitmap_gc;
atomic64_t errors[BCH_MEMBER_ERROR_NR];
unsigned long write_errors_start;
@ -865,6 +866,8 @@ struct bch_fs {
struct closure sb_write;
struct mutex sb_lock;
struct delayed_work maybe_schedule_btree_bitmap_gc;
/* snapshot.c: */
struct snapshot_table __rcu *snapshots;
struct mutex snapshot_table_lock;
@ -1037,7 +1040,7 @@ struct bch_fs {
struct bio_set bio_write;
struct bio_set replica_set;
struct mutex bio_bounce_pages_lock;
mempool_t bio_bounce_pages;
mempool_t bio_bounce_bufs;
struct bucket_nocow_lock_table
nocow_locks;
struct rhashtable promote_table;

View File

@ -661,16 +661,13 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
atomic64_set(&c->key_version, k.k->bversion.lo);
}
if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked_nogc(c, k),
trans, btree_bitmap_not_marked,
"btree ptr not marked in member info btree allocated bitmap\n%s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k),
buf.buf))) {
guard(mutex)(&c->sb_lock);
buf.buf)))
bch2_dev_btree_bitmap_mark(c, k);
bch2_write_super(c);
}
/*
* We require a commit before key_trigger() because

View File

@ -639,10 +639,12 @@ static void btree_update_new_nodes_mark_sb(struct btree_update *as)
struct bch_fs *c = as->c;
guard(mutex)(&c->sb_lock);
bool write_sb = false;
darray_for_each(as->new_nodes, i)
bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(&i->key));
bch2_dev_btree_bitmap_mark_locked(c, bkey_i_to_s_c(&i->key), &write_sb);
bch2_write_super(c);
if (write_sb)
bch2_write_super(c);
}
static void bkey_strip_reconcile(const struct bch_fs *c, struct bkey_s k)
@ -2133,18 +2135,35 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
sib_u64s = btree_node_u64s_with_format(b->nr, &b->format, &new_f) +
btree_node_u64s_with_format(m->nr, &m->format, &new_f);
if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
sib_u64s /= 2;
sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
if (trace_btree_node_merge_attempt_enabled()) {
CLASS(printbuf, buf)();
guard(printbuf_indent)(&buf);
bch2_btree_pos_to_text(&buf, c, prev);
prt_printf(&buf, "live u64s %u (%zu%% full)\n",
prev->nr.live_u64s,
prev->nr.live_u64s * 100 / btree_max_u64s(c));
bch2_btree_pos_to_text(&buf, c, next);
prt_printf(&buf, "live u64s %u (%zu%% full)\n",
next->nr.live_u64s,
next->nr.live_u64s * 100 / btree_max_u64s(c));
prt_printf(&buf, "merged would have %zu threshold %u\n",
sib_u64s, c->btree_foreground_merge_threshold);
trace_btree_node_merge_attempt(c, buf.buf);
}
count_event(c, btree_node_merge_attempt);
sib_u64s = min(sib_u64s, btree_max_u64s(c));
sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
b->sib_u64s[sib] = sib_u64s;
if (sib_u64s > c->btree_foreground_merge_threshold) {
if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c))
sib_u64s -= (sib_u64s - BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) / 2;
if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
sib_u64s = min(sib_u64s, btree_max_u64s(c));
sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
b->sib_u64s[sib] = sib_u64s;
goto out;
}
parent = btree_node_parent(trans->paths + path, b);
as = bch2_btree_update_start(trans, trans->paths + path, level, false,

View File

@ -235,27 +235,30 @@ static int read_btree_nodes_worker(void *p)
goto err;
}
u64 buckets_to_scan = 0;
for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
buckets_to_scan += c->sb.version_upgrade_complete < bcachefs_metadata_version_mi_btree_bitmap ||
bch2_dev_btree_bitmap_marked_sectors_any(ca, bucket_to_sector(ca, bucket), ca->mi.bucket_size);
u64 buckets_scanned = 0;
for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++) {
if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
!bch2_dev_btree_bitmap_marked_sectors_any(ca, bucket_to_sector(ca, bucket), ca->mi.bucket_size))
continue;
for (unsigned bucket_offset = 0;
bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
bucket_offset += btree_sectors(c)) {
if (time_after(jiffies, last_print + HZ * 30)) {
u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;
bucket_offset += btree_sectors(c))
try_read_btree_node(w->f, ca, b, bio, bucket_to_sector(ca, bucket) + bucket_offset);
bch_info(ca, "%s: %2u%% done", __func__,
(unsigned) div64_u64(cur_sector * 100, end_sector));
last_print = jiffies;
}
buckets_scanned++;
u64 sector = bucket * ca->mi.bucket_size + bucket_offset;
if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
!bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
continue;
try_read_btree_node(w->f, ca, b, bio, sector);
if (time_after(jiffies, last_print + HZ * 30)) {
bch_info(ca, "%s: %2u%% done", __func__,
(unsigned) div64_u64(buckets_scanned * 100, buckets_to_scan));
last_print = jiffies;
}
}
err:
if (b)
__btree_node_data_free(b);

View File

@ -660,33 +660,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
"bad magic: want %llx, got %llx",
bset_magic(c), le64_to_cpu(b->data->magic));
if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
struct bch_btree_ptr_v2 *bp =
&bkey_i_to_btree_ptr_v2(&b->key)->v;
bch2_bpos_to_text(&buf, b->data->min_key);
prt_str(&buf, "-");
bch2_bpos_to_text(&buf, b->data->max_key);
btree_err_on(b->data->keys.seq != bp->seq,
-BCH_ERR_btree_node_read_err_must_retry,
c, ca, b, NULL, NULL,
btree_node_bad_seq,
"got wrong btree node: got\n%s",
(printbuf_reset(&buf),
bch2_btree_node_header_to_text(&buf, b->data),
buf.buf));
} else {
btree_err_on(!b->data->keys.seq,
-BCH_ERR_btree_node_read_err_must_retry,
c, ca, b, NULL, NULL,
btree_node_bad_seq,
"bad btree header: seq 0\n%s",
(printbuf_reset(&buf),
bch2_btree_node_header_to_text(&buf, b->data),
buf.buf));
}
while (b->written < (ptr_written ?: btree_sectors(c))) {
unsigned sectors;
bool first = !b->written;
@ -743,6 +716,33 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
goto fsck_err;
}
if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
struct bch_btree_ptr_v2 *bp =
&bkey_i_to_btree_ptr_v2(&b->key)->v;
bch2_bpos_to_text(&buf, b->data->min_key);
prt_str(&buf, "-");
bch2_bpos_to_text(&buf, b->data->max_key);
btree_err_on(b->data->keys.seq != bp->seq,
-BCH_ERR_btree_node_read_err_must_retry,
c, ca, b, NULL, NULL,
btree_node_bad_seq,
"got wrong btree node: got\n%s",
(printbuf_reset(&buf),
bch2_btree_node_header_to_text(&buf, b->data),
buf.buf));
} else {
btree_err_on(!b->data->keys.seq,
-BCH_ERR_btree_node_read_err_must_retry,
c, ca, b, NULL, NULL,
btree_node_bad_seq,
"bad btree header: seq 0\n%s",
(printbuf_reset(&buf),
bch2_btree_node_header_to_text(&buf, b->data),
buf.buf));
}
btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
!BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
-BCH_ERR_btree_node_read_err_incompatible,

View File

@ -202,15 +202,14 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
#ifdef CONFIG_HIGHMEM
__bio_for_each_segment(bv, bio, *iter, *iter) {
void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
void *p = bvec_kmap_local(&bv);
bch2_checksum_update(&state, p, bv.bv_len);
kunmap_local(p);
}
#else
__bio_for_each_bvec(bv, bio, *iter, *iter)
bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
bch2_checksum_update(&state, bvec_virt(&bv), bv.bv_len);
#endif
return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
}
@ -225,16 +224,14 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
#ifdef CONFIG_HIGHMEM
__bio_for_each_segment(bv, bio, *iter, *iter) {
void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
void *p = bvec_kmap_local(&bv);
poly1305_update(&dctx, p, bv.bv_len);
kunmap_local(p);
}
#else
__bio_for_each_bvec(bv, bio, *iter, *iter)
poly1305_update(&dctx,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
poly1305_update(&dctx, bvec_virt(&bv), bv.bv_len);
#endif
poly1305_final(&dctx, digest);

View File

@ -95,12 +95,12 @@ static bool bio_phys_contig(struct bio *bio, struct bvec_iter start)
void *expected_start = NULL;
__bio_for_each_bvec(bv, bio, iter, start) {
if (expected_start &&
expected_start != page_address(bv.bv_page) + bv.bv_offset)
void *bv_addr = bvec_virt(&bv);
if (expected_start && expected_start != bv_addr)
return false;
expected_start = page_address(bv.bv_page) +
bv.bv_offset + bv.bv_len;
expected_start = bv_addr + bv.bv_len;
}
return true;
@ -109,27 +109,27 @@ static bool bio_phys_contig(struct bio *bio, struct bvec_iter start)
static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
struct bvec_iter start, int rw)
{
struct bio_vec bv;
struct bvec_iter iter;
unsigned nr_pages = 0;
struct page *stack_pages[16];
struct page **pages = NULL;
void *data;
BUG_ON(start.bi_size > c->opts.encoded_extent_max);
if (!PageHighMem(bio_iter_page(bio, start)) &&
bio_phys_contig(bio, start))
#ifndef CONFIG_HIGHMEM
if (bio_phys_contig(bio, start))
return (struct bbuf) {
.c = c,
.b = page_address(bio_iter_page(bio, start)) +
bio_iter_offset(bio, start),
.b = bvec_virt(&bio_iter_iovec(bio, start)),
.type = BB_none,
.rw = rw
};
#endif
#ifdef __KERNEL__
/* check if we can map the pages contiguously: */
struct bio_vec bv;
struct bvec_iter iter;
unsigned nr_pages = 0;
__bio_for_each_segment(bv, bio, iter, start) {
BUG_ON(bv.bv_offset + bv.bv_len > PAGE_SIZE);
if (iter.bi_size != start.bi_size &&
bv.bv_offset)
return bio_bounce(c, bio, start, rw);
@ -143,7 +143,8 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages);
pages = nr_pages > ARRAY_SIZE(stack_pages)
struct page *stack_pages[16];
struct page **pages = nr_pages > ARRAY_SIZE(stack_pages)
? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS)
: stack_pages;
if (!pages)
@ -153,19 +154,20 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
__bio_for_each_segment(bv, bio, iter, start)
pages[nr_pages++] = bv.bv_page;
data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
void *data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
if (pages != stack_pages)
kfree(pages);
if (!data)
return bio_bounce(c, bio, start, rw);
if (data)
return (struct bbuf) {
c,
data + bio_iter_offset(bio, start),
BB_vmap,
rw
};
#endif /* __KERNEL__ */
return (struct bbuf) {
c,
data + bio_iter_offset(bio, start),
BB_vmap,
rw
};
return bio_bounce(c, bio, start, rw);
}
static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw)

View File

@ -28,6 +28,8 @@
#include "util/util.h"
#ifdef CONFIG_BCACHEFS_DEBUG
#include <linux/module.h>
static int bch2_force_read_device = -1;
module_param_named(force_read_device, bch2_force_read_device, int, 0644);

View File

@ -344,7 +344,7 @@ err_remove_hash:
BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
bch_promote_params));
err:
bio_free_pages(&op->write.op.wbio.bio);
bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
/* We may have added to the rhashtable and thus need rcu freeing: */
kfree_rcu(op, rcu);
err_put:
@ -1253,7 +1253,7 @@ retry_pick:
&c->bio_read_split),
orig);
bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
bch2_bio_alloc_pages_pool(c, &rbio->bio, 512, sectors << 9);
rbio->bounce = true;
} else if (flags & BCH_READ_must_clone) {
/*
@ -1591,16 +1591,29 @@ void bch2_fs_io_read_exit(struct bch_fs *c)
rhashtable_destroy(&c->promote_table);
bioset_exit(&c->bio_read_split);
bioset_exit(&c->bio_read);
mempool_exit(&c->bio_bounce_pages);
mempool_exit(&c->bio_bounce_bufs);
}
/*
 * Allocation callback passed to mempool_init() for c->bio_bounce_bufs.
 *
 * Returns the result of __get_free_pages() at order PAGE_ALLOC_COSTLY_ORDER,
 * cast to a pointer. @pool_data is not used.
 */
static void *bio_bounce_buf_alloc_fn(gfp_t gfp, void *pool_data)
{
return (void *) __get_free_pages(gfp, PAGE_ALLOC_COSTLY_ORDER);
}
/*
 * Free callback passed to mempool_init() for c->bio_bounce_bufs.
 *
 * Releases @p with free_pages() at order PAGE_ALLOC_COSTLY_ORDER, the same
 * order bio_bounce_buf_alloc_fn() allocates at. @pool_data is not used.
 */
static void bio_bounce_buf_free_fn(void *p, void *pool_data)
{
free_pages((unsigned long) p, PAGE_ALLOC_COSTLY_ORDER);
}
int bch2_fs_io_read_init(struct bch_fs *c)
{
if (mempool_init_page_pool(&c->bio_bounce_pages,
max_t(unsigned,
c->opts.btree_node_size,
c->opts.encoded_extent_max) /
PAGE_SIZE, 0))
if (mempool_init(&c->bio_bounce_bufs,
max_t(unsigned,
c->opts.btree_node_size,
c->opts.encoded_extent_max) /
BIO_BOUNCE_BUF_POOL_LEN,
bio_bounce_buf_alloc_fn,
bio_bounce_buf_free_fn,
NULL))
return bch_err_throw(c, ENOMEM_bio_bounce_pages_init);
if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),

View File

@ -7,6 +7,8 @@
#include "extents_types.h"
#include "data/reflink.h"
#define BIO_BOUNCE_BUF_POOL_LEN (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
void bch2_dev_congested_to_text(struct printbuf *, struct bch_dev *);
#endif

View File

@ -113,42 +113,41 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{
struct bvec_iter_all iter;
struct bio_vec *bv;
for (struct bio_vec *bv = bio->bi_io_vec;
bv < bio->bi_io_vec + bio->bi_vcnt;
bv++) {
void *p = bvec_virt(bv);
bio_for_each_segment_all(bv, bio, iter)
mempool_free(bv->bv_page, &c->bio_bounce_pages);
if (bv->bv_len == BIO_BOUNCE_BUF_POOL_LEN)
mempool_free(p, &c->bio_bounce_bufs);
else
free_pages((unsigned long) p, get_order(bv->bv_len));
}
bio->bi_vcnt = 0;
}
static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
static void __bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
unsigned bs, size_t size)
{
if (likely(!*using_mempool)) {
struct page *page = alloc_page(GFP_NOFS);
if (likely(page))
return page;
mutex_lock(&c->bio_bounce_pages_lock);
mutex_lock(&c->bio_bounce_pages_lock);
*using_mempool = true;
}
return mempool_alloc(&c->bio_bounce_pages, GFP_NOFS);
while (bio->bi_iter.bi_size < size)
bio_add_virt_nofail(bio,
mempool_alloc(&c->bio_bounce_bufs, GFP_NOFS),
BIO_BOUNCE_BUF_POOL_LEN);
bio->bi_iter.bi_size = min(bio->bi_iter.bi_size, size);
mutex_unlock(&c->bio_bounce_pages_lock);
}
void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
size_t size)
unsigned bs, size_t size)
{
bool using_mempool = false;
bch2_bio_alloc_pages(bio, c->opts.block_size, size, GFP_NOFS);
while (size) {
struct page *page = __bio_alloc_page_pool(c, &using_mempool);
unsigned len = min_t(size_t, PAGE_SIZE, size);
BUG_ON(!bio_add_page(bio, page, len, 0));
size -= len;
}
if (using_mempool)
mutex_unlock(&c->bio_bounce_pages_lock);
if (bio->bi_iter.bi_size < size)
__bch2_bio_alloc_pages_pool(c, bio, bs, size);
}
/* Extent update path: */
@ -837,23 +836,22 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
return bio;
}
wbio->bounce = true;
wbio->bounce = true;
/*
* We can't use mempool for more than c->sb.encoded_extent_max
* worth of pages, but we'd like to allocate more if we can:
*/
bch2_bio_alloc_pages_pool(c, bio,
min_t(unsigned, output_available,
c->opts.encoded_extent_max));
bch2_bio_alloc_pages(bio,
c->opts.block_size,
output_available,
GFP_NOFS);
if (bio->bi_iter.bi_size < output_available)
*page_alloc_failed =
bch2_bio_alloc_pages(bio,
c->opts.block_size,
output_available -
bio->bi_iter.bi_size,
GFP_NOFS) != 0;
unsigned required = min(output_available, c->opts.encoded_extent_max);
if (unlikely(bio->bi_iter.bi_size < required))
__bch2_bio_alloc_pages_pool(c, bio, c->opts.block_size, required);
return bio;
}

View File

@ -9,7 +9,7 @@
container_of((_bio), struct bch_write_bio, bio)
void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, unsigned, size_t);
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *, bool);

View File

@ -556,6 +556,11 @@ DEFINE_EVENT(fs_str, btree_node_rewrite,
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, btree_node_merge_attempt,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, btree_node_merge,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)

View File

@ -331,6 +331,7 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]);
ca->mi = bch2_mi_to_cpu(member);
ca->btree_allocated_bitmap_gc = le64_to_cpu(member->btree_allocated_bitmap);
for (i = 0; i < ARRAY_SIZE(member->errors); i++)
atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i]));

View File

@ -264,11 +264,13 @@ static void __bch2_fs_read_only(struct bch_fs *c)
unsigned clean_passes = 0;
u64 seq = 0;
bch2_maybe_schedule_btree_bitmap_gc_stop(c);
bch2_fs_ec_stop(c);
bch2_open_buckets_stop(c, NULL, true);
bch2_reconcile_stop(c);
bch2_copygc_stop(c);
bch2_fs_ec_flush(c);
cancel_delayed_work_sync(&c->maybe_schedule_btree_bitmap_gc);
bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu",
journal_cur_seq(&c->journal));
@ -524,6 +526,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch2_do_invalidates(c);
bch2_do_stripe_deletes(c);
bch2_do_pending_node_rewrites(c);
bch2_maybe_schedule_btree_bitmap_gc(c);
return 0;
}

View File

@ -176,31 +176,43 @@ void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *c,
}
}
static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass)
static bool bch2_recovery_pass_entry_get_locked(struct bch_fs *c, enum bch_recovery_pass pass,
struct recovery_pass_entry *e)
{
enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass);
bool ret = false;
lockdep_assert_held(&c->sb_lock);
struct bch_sb_field_recovery_passes *r =
bch2_sb_field_get(c->disk_sb.sb, recovery_passes);
if (stable < recovery_passes_nr_entries(r)) {
struct recovery_pass_entry *i = r->start + stable;
enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass);
bool found = stable < recovery_passes_nr_entries(r);
if (found)
*e = r->start[stable];
/*
* Ratelimit if the last runtime was more than 1% of the time
* since we last ran
*/
ret = (u64) le32_to_cpu(i->last_runtime) * 100 >
ktime_get_real_seconds() - le64_to_cpu(i->last_run);
return found;
}
if (BCH_RECOVERY_PASS_NO_RATELIMIT(i))
ret = false;
}
static bool bch2_recovery_pass_want_ratelimit_locked(struct bch_fs *c, enum bch_recovery_pass pass,
unsigned runtime_fraction)
{
struct recovery_pass_entry e;
if (!bch2_recovery_pass_entry_get_locked(c, pass, &e))
return false;
return ret;
/*
 * Ratelimit if the last runtime was more than 1/runtime_fraction of the
 * time since we last ran (i.e. more than 1% when runtime_fraction is 100)
 */
return !BCH_RECOVERY_PASS_NO_RATELIMIT(&e) &&
(u64) le32_to_cpu(e.last_runtime) * runtime_fraction >
ktime_get_real_seconds() - le64_to_cpu(e.last_run);
}
/*
 * Variant of bch2_recovery_pass_want_ratelimit_locked() that takes c->sb_lock
 * itself, via guard(mutex), before delegating; @pass and @runtime_fraction are
 * forwarded unchanged and the callee's result is returned as-is.
 *
 * NOTE(review): presumably meant for callers that do not already hold
 * c->sb_lock - confirm against call sites.
 */
bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass,
unsigned runtime_fraction)
{
guard(mutex)(&c->sb_lock);
return bch2_recovery_pass_want_ratelimit_locked(c, pass, runtime_fraction);
}
const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes = {
@ -311,7 +323,7 @@ static bool recovery_pass_needs_set(struct bch_fs *c,
*flags |= RUN_RECOVERY_PASS_nopersistent;
if ((*flags & RUN_RECOVERY_PASS_ratelimit) &&
!bch2_recovery_pass_want_ratelimit(c, pass))
!bch2_recovery_pass_want_ratelimit_locked(c, pass, 100))
*flags &= ~RUN_RECOVERY_PASS_ratelimit;
/*
@ -451,7 +463,7 @@ int bch2_require_recovery_pass(struct bch_fs *c,
guard(mutex)(&c->sb_lock);
if (bch2_recovery_pass_want_ratelimit(c, pass))
if (bch2_recovery_pass_want_ratelimit_locked(c, pass, 100))
return 0;
enum bch_run_recovery_pass_flags flags = 0;

View File

@ -46,6 +46,8 @@ static inline int bch2_recovery_cancelled(struct bch_fs *c)
return 0;
}
bool bch2_recovery_pass_want_ratelimit(struct bch_fs *, enum bch_recovery_pass, unsigned);
int bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass);
int __bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *,

View File

@ -66,6 +66,7 @@
x(delete_dead_inodes, 32, PASS_ALWAYS) \
x(fix_reflink_p, 33, 0) \
x(set_fs_needs_reconcile, 34, 0) \
x(btree_bitmap_gc, 46, PASS_ONLINE) \
x(lookup_root_inode, 42, PASS_ALWAYS|PASS_SILENT)
/* We normally enumerate recovery passes in the order we run them: */

View File

@ -63,10 +63,12 @@ void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *,
int bch2_jset_validate(struct bch_fs *, struct bch_dev *, struct jset *,
u64, enum bch_validate_flags);
struct u64_range {
typedef struct u64_range {
u64 start;
u64 end;
};
} u64_range;
DEFINE_DARRAY(u64_range);
struct u64_range bch2_journal_entry_missing_range(struct bch_fs *, u64, u64);

View File

@ -726,7 +726,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
* we're holding the reclaim lock:
*/
lockdep_assert_held(&j->reclaim_lock);
flags = memalloc_noreclaim_save();
flags = memalloc_nofs_save();
do {
if (kthread && kthread_should_stop())
@ -780,7 +780,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
wake_up(&j->reclaim_wait);
} while ((min_nr || min_key_cache) && nr_flushed && !direct);
memalloc_noreclaim_restore(flags);
memalloc_flags_restore(flags);
return ret;
}

View File

@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "journal/read.h"
#include "journal/sb.h"
#include "util/darray.h"
@ -28,35 +29,33 @@ static int bch2_sb_journal_validate(struct bch_sb *sb, struct bch_sb_field *f,
if (!nr)
return 0;
u64 *b __free(kvfree) = kvmalloc_array(nr, sizeof(u64), GFP_KERNEL);
if (!b)
return -BCH_ERR_ENOMEM_sb_journal_validate;
CLASS(darray_u64, b)();
for (unsigned i = 0; i < nr; i++)
b[i] = le64_to_cpu(journal->buckets[i]);
try(darray_push(&b, le64_to_cpu(journal->buckets[i])));
sort(b, nr, sizeof(u64), u64_cmp, NULL);
darray_sort(b, u64_cmp);
if (!b[0]) {
if (!darray_first(b)) {
prt_printf(err, "journal bucket at sector 0");
return -BCH_ERR_invalid_sb_journal;
}
if (b[0] < le16_to_cpu(m.first_bucket)) {
if (darray_first(b) < le16_to_cpu(m.first_bucket)) {
prt_printf(err, "journal bucket %llu before first bucket %u",
b[0], le16_to_cpu(m.first_bucket));
darray_first(b), le16_to_cpu(m.first_bucket));
return -BCH_ERR_invalid_sb_journal;
}
if (b[nr - 1] >= le64_to_cpu(m.nbuckets)) {
if (darray_last(b) >= le64_to_cpu(m.nbuckets)) {
prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
b[nr - 1], le64_to_cpu(m.nbuckets));
darray_last(b), le64_to_cpu(m.nbuckets));
return -BCH_ERR_invalid_sb_journal;
}
for (unsigned i = 0; i + 1 < nr; i++)
if (b[i] == b[i + 1]) {
prt_printf(err, "duplicate journal buckets %llu", b[i]);
darray_for_each(b, i)
if (i != &darray_last(b) && i[0] == i[1]) {
prt_printf(err, "duplicate journal buckets %llu", *i);
return -BCH_ERR_invalid_sb_journal;
}
@ -80,11 +79,6 @@ const struct bch_sb_field_ops bch_sb_field_ops_journal = {
.to_text = bch2_sb_journal_to_text,
};
struct u64_range {
u64 start;
u64 end;
};
static int u64_range_cmp(const void *_l, const void *_r)
{
const struct u64_range *l = _l;
@ -104,15 +98,16 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f
if (!nr)
return 0;
struct u64_range *b __free(kvfree) = kvmalloc_array(nr, sizeof(*b), GFP_KERNEL);
if (!b)
return -BCH_ERR_ENOMEM_sb_journal_v2_validate;
CLASS(darray_u64_range, b)();
for (unsigned i = 0; i < nr; i++) {
b[i].start = le64_to_cpu(journal->d[i].start);
b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr);
struct u64_range r = {
.start = le64_to_cpu(journal->d[i].start),
.end = le64_to_cpu(journal->d[i].start) +
le64_to_cpu(journal->d[i].nr),
};
if (b[i].end <= b[i].start) {
if (r.end <= r.start) {
prt_printf(err, "journal buckets entry with bad nr: %llu+%llu",
le64_to_cpu(journal->d[i].start),
le64_to_cpu(journal->d[i].nr));
@ -120,34 +115,34 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f
}
sum += le64_to_cpu(journal->d[i].nr);
try(darray_push(&b, r));
}
sort(b, nr, sizeof(*b), u64_range_cmp, NULL);
darray_sort(b, u64_range_cmp);
if (!b[0].start) {
if (!darray_first(b).start) {
prt_printf(err, "journal bucket at sector 0");
return -BCH_ERR_invalid_sb_journal;
}
if (b[0].start < le16_to_cpu(m.first_bucket)) {
if (darray_first(b).start < le16_to_cpu(m.first_bucket)) {
prt_printf(err, "journal bucket %llu before first bucket %u",
b[0].start, le16_to_cpu(m.first_bucket));
darray_first(b).start, le16_to_cpu(m.first_bucket));
return -BCH_ERR_invalid_sb_journal;
}
if (b[nr - 1].end > le64_to_cpu(m.nbuckets)) {
if (darray_last(b).end > le64_to_cpu(m.nbuckets)) {
prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
b[nr - 1].end - 1, le64_to_cpu(m.nbuckets));
darray_last(b).end - 1, le64_to_cpu(m.nbuckets));
return -BCH_ERR_invalid_sb_journal;
}
for (unsigned i = 0; i + 1 < nr; i++) {
if (b[i].end > b[i + 1].start) {
darray_for_each(b, i)
if (i != &darray_last(b) && i[0].end > i[1].start) {
prt_printf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu",
b[i].start, b[i].end, b[i + 1].start, b[i + 1].end);
i[0].start, i[0].end, i[1].start, i[1].end);
return -BCH_ERR_invalid_sb_journal;
}
}
if (sum > UINT_MAX) {
prt_printf(err, "too many journal buckets: %llu > %u", sum, UINT_MAX);
@ -179,11 +174,9 @@ const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = {
int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca,
u64 *buckets, unsigned nr)
{
struct bch_sb_field_journal_v2 *j;
unsigned i, dst = 0, nr_compacted = 1;
unsigned dst = 0, nr_compacted = 1;
if (c)
lockdep_assert_held(&c->sb_lock);
lockdep_assert_held(&c->sb_lock);
if (!nr) {
bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
@ -191,11 +184,12 @@ int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca,
return 0;
}
for (i = 0; i + 1 < nr; i++)
for (unsigned i = 0; i + 1 < nr; i++)
if (buckets[i] + 1 != buckets[i + 1])
nr_compacted++;
j = bch2_sb_field_resize(&ca->disk_sb, journal_v2,
struct bch_sb_field_journal_v2 *j =
bch2_sb_field_resize(&ca->disk_sb, journal_v2,
(sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64));
if (!j)
return bch_err_throw(c, ENOSPC_sb_journal);
@ -205,7 +199,7 @@ int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca,
j->d[dst].start = cpu_to_le64(buckets[0]);
j->d[dst].nr = cpu_to_le64(1);
for (i = 1; i < nr; i++) {
for (unsigned i = 1; i < nr; i++) {
if (buckets[i] == buckets[i - 1] + 1) {
le64_add_cpu(&j->d[dst].nr, 1);
} else {

View File

@ -55,6 +55,7 @@ enum counters_flags {
x(btree_node_read, 14, TYPE_COUNTER) \
x(btree_node_compact, 15, TYPE_COUNTER) \
x(btree_node_merge, 16, TYPE_COUNTER) \
x(btree_node_merge_attempt, 101, TYPE_COUNTER) \
x(btree_node_split, 17, TYPE_COUNTER) \
x(btree_node_rewrite, 18, TYPE_COUNTER) \
x(btree_node_alloc, 19, TYPE_COUNTER) \

View File

@ -5,6 +5,8 @@
#include "sb/errors.h"
#include "sb/io.h"
#include "util/darray.h"
const char * const bch2_sb_error_strs[] = {
#define x(t, n, ...) [n] = #t,
BCH_SB_ERRS()
@ -63,25 +65,25 @@ static int error_entry_cmp(const void *_l, const void *_r)
return -cmp_int(l->last_error_time, r->last_error_time);
}
DEFINE_DARRAY(bch_sb_field_error_entry);
static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb,
struct bch_sb_field *f)
{
struct bch_sb_field_errors *e = field_to_type(f, errors);
unsigned nr = bch2_sb_field_errors_nr_entries(e);
struct bch_sb_field_error_entry *sorted = kvmalloc_array(nr, sizeof(*sorted), GFP_KERNEL);
if (sorted) {
memcpy(sorted, e->entries, nr * sizeof(e->entries[0]));
sort(sorted, nr, sizeof(*sorted), error_entry_cmp, NULL);
} else {
sorted = e->entries;
}
if (out->nr_tabstops <= 1)
printbuf_tabstop_push(out, 16);
for (struct bch_sb_field_error_entry *i = sorted; i < sorted + nr; i++) {
CLASS(darray_bch_sb_field_error_entry, sorted)();
for (struct bch_sb_field_error_entry *i = e->entries; i < e->entries + nr; i++)
darray_push(&sorted, *i);
darray_sort(sorted, error_entry_cmp);
darray_for_each(sorted, i) {
bch2_sb_error_id_to_text(out, BCH_SB_ERROR_ENTRY_ID(i));
prt_tab(out);
prt_u64(out, BCH_SB_ERROR_ENTRY_NR(i));
@ -89,9 +91,6 @@ static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb,
bch2_prt_datetime(out, le64_to_cpu(i->last_error_time));
prt_newline(out);
}
if (sorted != e->entries)
kvfree(sorted);
}
const struct bch_sb_field_ops bch_sb_field_ops_errors = {

View File

@ -360,12 +360,14 @@ enum bch_sb_error_id {
#undef x
};
/*
 * One entry of the superblock "errors" field (the element type of
 * bch_sb_field_errors.entries[]).
 *
 * @v:               packed little-endian word; BCH_SB_ERROR_ENTRY_ID is
 *                   declared over it with LE64_BITMASK(..., v, 0, 16).
 * @last_error_time: little-endian timestamp; error_entry_cmp() orders entries
 *                   by it and bch2_sb_errors_to_text() prints it as a datetime.
 *
 * NOTE(review): the typedef name presumably exists so that
 * DEFINE_DARRAY(bch_sb_field_error_entry) can name the element type - confirm.
 */
typedef struct bch_sb_field_error_entry {
__le64 v;
__le64 last_error_time;
} bch_sb_field_error_entry;
struct bch_sb_field_errors {
struct bch_sb_field field;
struct bch_sb_field_error_entry {
__le64 v;
__le64 last_error_time;
} entries[];
bch_sb_field_error_entry entries[];
};
LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16);

View File

@ -2,16 +2,19 @@
#include "bcachefs.h"
#include "alloc/buckets.h"
#include "alloc/disk_groups.h"
#include "alloc/replicas.h"
#include "btree/cache.h"
#include "btree/iter.h"
#include "sb/members.h"
#include "sb/io.h"
#include "init/error.h"
#include "init/passes.h"
#include "init/progress.h"
int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev)
{
@ -512,35 +515,54 @@ void bch2_dev_errors_reset(struct bch_dev *ca)
* have to scan full devices:
*/
bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k)
static bool __bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k, bool with_gc)
{
guard(rcu)();
bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
if (ca &&
!bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c)))
!__bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c), with_gc))
return false;
}
return true;
}
static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev,
u64 start, unsigned sectors)
bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k)
{
struct bch_member *m = __bch2_members_v2_get_mut(mi, dev);
u64 bitmap = le64_to_cpu(m->btree_allocated_bitmap);
return __bch2_dev_btree_bitmap_marked(c, k, true);
}
/*
 * Same check as bch2_dev_btree_bitmap_marked(), but only the superblock
 * bitmap is consulted: the in-memory gc bitmap is left out of the test.
 */
bool bch2_dev_btree_bitmap_marked_nogc(struct bch_fs *c, struct bkey_s_c k)
{
	const bool with_gc = false;

	return __bch2_dev_btree_bitmap_marked(c, k, with_gc);
}
static void __bch2_dev_btree_bitmap_mark(struct bch_dev *ca,
struct bch_sb_field_members_v2 *mi,
u64 start, unsigned sectors, bool *write_sb)
{
struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx);
u64 end = start + sectors;
int resize = ilog2(roundup_pow_of_two(end)) - (m->btree_bitmap_shift + 6);
if (resize > 0) {
u64 old_bitmap = le64_to_cpu(m->btree_allocated_bitmap);
u64 new_bitmap = 0;
u64 new_gc_bitmap = 0;
for (unsigned i = 0; i < 64; i++)
if (bitmap & BIT_ULL(i))
for (unsigned i = 0; i < 64; i++) {
if (old_bitmap & BIT_ULL(i))
new_bitmap |= BIT_ULL(i >> resize);
bitmap = new_bitmap;
if (ca->btree_allocated_bitmap_gc & BIT_ULL(i))
new_gc_bitmap |= BIT_ULL(i >> resize);
}
m->btree_allocated_bitmap = cpu_to_le64(new_bitmap);
m->btree_bitmap_shift += resize;
*write_sb = true;
ca->btree_allocated_bitmap_gc = new_gc_bitmap;
}
BUG_ON(m->btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX);
@ -548,25 +570,164 @@ static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, uns
for (unsigned bit = start >> m->btree_bitmap_shift;
(u64) bit << m->btree_bitmap_shift < end;
bit++)
bitmap |= BIT_ULL(bit);
bit++) {
__le64 b = cpu_to_le64(BIT_ULL(bit));
m->btree_allocated_bitmap = cpu_to_le64(bitmap);
if (!(m->btree_allocated_bitmap & b)) {
m->btree_allocated_bitmap |= b;
*write_sb = true;
}
ca->btree_allocated_bitmap_gc |= BIT_ULL(bit);
}
}
void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k)
void bch2_dev_btree_bitmap_mark_locked(struct bch_fs *c, struct bkey_s_c k, bool *write_sb)
{
lockdep_assert_held(&c->sb_lock);
struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
guard(rcu)();
bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
if (!bch2_member_exists(c->disk_sb.sb, ptr->dev))
struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
if (!ca)
continue;
__bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c));
__bch2_dev_btree_bitmap_mark(ca, mi, ptr->offset, btree_sectors(c), write_sb);
}
}
/*
 * Mark the sectors referenced by every pointer of @k in the per-device btree
 * bitmaps.  Takes c->sb_lock for the duration of the call and writes the
 * superblock only when the locked helper reports that something changed.
 */
void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k)
{
	bool sb_dirty = false;

	guard(mutex)(&c->sb_lock);

	bch2_dev_btree_bitmap_mark_locked(c, k, &sb_dirty);
	if (!sb_dirty)
		return;

	bch2_write_super(c);
}
/*
 * Walk the keys of @btree at depth @level, starting from POS_MIN, and make
 * sure each one is recorded in the per-device btree bitmaps.
 *
 * @trans:    btree transaction the iterator runs in
 * @progress: progress indicator, updated once per key under the label
 *            "btree_bitmap_gc"
 * @btree:    id of the btree to walk
 * @level:    depth passed to the node iterator
 *
 * Returns 0 when the walk completes.
 * NOTE(review): try() presumably returns early with the error produced by the
 * iteration macro - confirm against its definition.
 */
static int btree_bitmap_gc_btree_level(struct btree_trans *trans,
struct progress_indicator *progress,
enum btree_id btree, unsigned level)
{
struct bch_fs *c = trans->c;
CLASS(btree_node_iter, iter)(trans, btree, POS_MIN, 0, level, BTREE_ITER_prefetch);
try(for_each_btree_key_continue(trans, iter, 0, k, ({
/*
 * bch2_dev_btree_bitmap_marked() also tests the gc bitmap, so a key that is
 * set only in the superblock bitmap is still (re)marked here.
 */
if (!bch2_dev_btree_bitmap_marked(c, k))
bch2_dev_btree_bitmap_mark(c, k);
/* last expression: its value is the value of the statement expression */
bch2_progress_update_iter(trans, progress, &iter, "btree_bitmap_gc");
})));
return 0;
}
/*
 * Rebuild the per-device btree allocation bitmaps from the btree itself.
 *
 * Steps:
 *  1. zero every member device's in-memory gc bitmap;
 *  2. walk every live btree - all interior levels, then the root node's own
 *     key - marking each key that is not already marked (the mark helper also
 *     sets the gc bitmap);
 *  3. replace each member's superblock bitmap with its gc bitmap, write the
 *     superblock, and log the before/after marked size.
 *
 * Returns 0 on success.
 * NOTE(review): try() presumably returns the error of a failed walk early -
 * confirm against its definition.
 */
int bch2_btree_bitmap_gc(struct bch_fs *c)
{
	struct progress_indicator progress;
	bch2_progress_init_inner(&progress, c, 0, ~0ULL);

	/* Start from an empty gc bitmap on every member device */
	scoped_guard(mutex, &c->sb_lock) {
		guard(rcu)();
		for_each_member_device_rcu(c, ca, NULL)
			ca->btree_allocated_bitmap_gc = 0;
	}

	{
		CLASS(btree_trans, trans)(c);

		for (unsigned btree = 0; btree < btree_id_nr_alive(c); btree++) {
			for (unsigned level = 1; level < BTREE_MAX_DEPTH; level++)
				try(btree_bitmap_gc_btree_level(trans, &progress, btree, level));

			/* The root node's key is not covered by the level walks above */
			CLASS(btree_node_iter, iter)(trans, btree, POS_MIN, 0,
						     bch2_btree_id_root(c, btree)->b->c.level, 0);
			struct btree *b;
			try(lockrestart_do(trans, PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(&iter))));

			if (!bch2_dev_btree_bitmap_marked(c, bkey_i_to_s_c(&b->key)))
				bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(&b->key));
		}
	}

	u64 sectors_marked_old = 0, sectors_marked_new = 0;

	scoped_guard(mutex, &c->sb_lock) {
		struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);

		scoped_guard(rcu)
			for_each_member_device_rcu(c, ca, NULL) {
				/*
				 * Widen to u64 before shifting: if hweight64()'s
				 * return type is only 32 bits wide, the shifted
				 * sector count can wrap on large devices.
				 */
				sectors_marked_old += (u64) hweight64(ca->mi.btree_allocated_bitmap) << ca->mi.btree_bitmap_shift;
				sectors_marked_new += (u64) hweight64(ca->btree_allocated_bitmap_gc) << ca->mi.btree_bitmap_shift;

				/* The gc result becomes the new superblock bitmap */
				struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx);
				m->btree_allocated_bitmap = cpu_to_le64(ca->btree_allocated_bitmap_gc);
			}
		bch2_write_super(c);
	}

	CLASS(printbuf, buf)();
	prt_str(&buf, "mi_btree_bitmap sectors ");
	prt_human_readable_u64(&buf, sectors_marked_old << 9);
	prt_str(&buf, " -> ");
	prt_human_readable_u64(&buf, sectors_marked_new << 9);
	bch_info(c, "%s", buf.buf);

	return 0;
}
/*
 * Delayed-work handler: decide whether the btree_bitmap_gc recovery pass
 * should be scheduled.
 *
 * For each member device, the sectors held in btree buckets are compared with
 * the sectors covered by the set bits of the superblock btree bitmap.  If the
 * bitmap covers more than four times the btree usage on any device, the
 * recovery pass is requested (rate limited) and the per-device findings are
 * logged.  The handler then re-queues itself to run again in 24 hours.
 *
 * @work: the work member of c->maybe_schedule_btree_bitmap_gc
 */
static void bch2_maybe_schedule_btree_bitmap_gc_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, maybe_schedule_btree_bitmap_gc.work);

	/*
	 * NOTE(review): this early return skips the queue_delayed_work() at the
	 * end of the function, so the periodic check is not re-armed when the
	 * pass is rate limited - confirm that this is intended.
	 */
	if (bch2_recovery_pass_want_ratelimit(c, BCH_RECOVERY_PASS_btree_bitmap_gc, 1000))
		return;

	CLASS(printbuf, buf)();
	bch2_log_msg_start(c, &buf);

	bool want_schedule = false;
	for_each_member_device(c, ca) {
		struct bch_dev_usage u;
		bch2_dev_usage_read_fast(ca, &u);

		u64 btree_sectors = bucket_to_sector(ca, u.buckets[BCH_DATA_btree]);
		/*
		 * Widen to u64 before shifting: if hweight64()'s return type is
		 * only 32 bits wide, the shifted sector count can wrap on large
		 * devices and defeat the comparison below.
		 */
		u64 bitmap_sectors = (u64) hweight64(ca->mi.btree_allocated_bitmap) << ca->mi.btree_bitmap_shift;

		if (btree_sectors * 4 < bitmap_sectors) {
			prt_printf(&buf, "%s has ", ca->name);
			prt_human_readable_u64(&buf, btree_sectors << 9);
			prt_printf(&buf, " btree buckets and ");
			prt_human_readable_u64(&buf, bitmap_sectors << 9);
			prt_printf(&buf, " marked in bitmap\n");
			want_schedule = true;
		}
	}

	if (want_schedule) {
		bch2_run_explicit_recovery_pass(c, &buf,
						BCH_RECOVERY_PASS_btree_bitmap_gc,
						RUN_RECOVERY_PASS_ratelimit);
		bch2_print_str(c, KERN_NOTICE, buf.buf);
	}

	/* Check again in 24 hours */
	queue_delayed_work(system_long_wq, &c->maybe_schedule_btree_bitmap_gc, HZ * 60 * 60 * 24);
}
/*
 * Cancel the periodic btree-bitmap check of @c, synchronously (via
 * cancel_delayed_work_sync()).
 */
void bch2_maybe_schedule_btree_bitmap_gc_stop(struct bch_fs *c)
{
	struct delayed_work *dwork = &c->maybe_schedule_btree_bitmap_gc;

	cancel_delayed_work_sync(dwork);
}
/*
 * Set up the periodic btree-bitmap check of @c and run the first check right
 * away by calling the work handler directly.
 */
void bch2_maybe_schedule_btree_bitmap_gc(struct bch_fs *c)
{
	struct work_struct *first_run = &c->maybe_schedule_btree_bitmap_gc.work;

	INIT_DELAYED_WORK(&c->maybe_schedule_btree_bitmap_gc,
			  bch2_maybe_schedule_btree_bitmap_gc_work);

	/* Synchronous first pass; the handler queues later runs itself */
	bch2_maybe_schedule_btree_bitmap_gc_work(first_run);
}
unsigned bch2_sb_nr_devices(const struct bch_sb *sb)
{
unsigned nr = 0;

View File

@ -389,7 +389,8 @@ void bch2_sb_members_to_cpu(struct bch_fs *);
void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *);
void bch2_dev_errors_reset(struct bch_dev *);
static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start, unsigned sectors)
static inline bool __bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start,
unsigned sectors, bool with_gc)
{
u64 end = start + sectors;
@ -399,14 +400,46 @@ static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64
for (unsigned bit = start >> ca->mi.btree_bitmap_shift;
(u64) bit << ca->mi.btree_bitmap_shift < end;
bit++)
if (!(ca->mi.btree_allocated_bitmap & BIT_ULL(bit)))
if (!(BIT_ULL(bit) &
ca->mi.btree_allocated_bitmap &
(with_gc
? ca->btree_allocated_bitmap_gc
: ~0ULL)))
return false;
return true;
}
/*
 * Test whether the range of @sectors sectors starting at @start on @ca is
 * marked in the superblock btree bitmap; the gc bitmap is not consulted.
 */
static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start,
							unsigned sectors)
{
	const bool with_gc = false;

	return __bch2_dev_btree_bitmap_marked_sectors(ca, start, sectors, with_gc);
}
/*
 * Return true if any bitmap bit overlapping the sector range
 * [@start, @start + @sectors) on @ca is set in the superblock btree bitmap.
 *
 * The bitmap has 64 bits, each covering 1 << btree_bitmap_shift sectors.  A
 * range that starts beyond the bitmap cannot be marked; a range that starts
 * inside it but ends beyond it is clamped to the bitmap's end, so the loop
 * never evaluates BIT_ULL() with a bit number of 64 or more (an out-of-range
 * shift).
 */
static inline bool bch2_dev_btree_bitmap_marked_sectors_any(struct bch_dev *ca, u64 start, unsigned sectors)
{
	u64 bitmap_end = 64ULL << ca->mi.btree_bitmap_shift;
	u64 end = start + sectors;

	if (start >= bitmap_end)
		return false;

	/* Only the part of the range the bitmap can describe is examined */
	if (end > bitmap_end)
		end = bitmap_end;

	for (unsigned bit = start >> ca->mi.btree_bitmap_shift;
	     (u64) bit << ca->mi.btree_bitmap_shift < end;
	     bit++)
		if (ca->mi.btree_allocated_bitmap & BIT_ULL(bit))
			return true;
	return false;
}
/*
 * Btree allocation bitmap interface.
 *
 * bch2_dev_btree_bitmap_marked() tests the pointers of a key against the
 * superblock bitmap and the in-memory gc bitmap; the _nogc variant tests the
 * superblock bitmap only.
 */
bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c);
bool bch2_dev_btree_bitmap_marked_nogc(struct bch_fs *, struct bkey_s_c);
/*
 * _mark_locked() requires c->sb_lock to be held by the caller and sets the
 * bool to true when the superblock needs writing; it does not write it.
 * _mark() takes c->sb_lock itself and writes the superblock if needed.
 */
void bch2_dev_btree_bitmap_mark_locked(struct bch_fs *, struct bkey_s_c, bool *);
void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c);
/* Rebuild the bitmaps by walking all btrees */
int bch2_btree_bitmap_gc(struct bch_fs *);
/* Cancel / set up the periodic check that may schedule bch2_btree_bitmap_gc() */
void bch2_maybe_schedule_btree_bitmap_gc_stop(struct bch_fs *);
void bch2_maybe_schedule_btree_bitmap_gc(struct bch_fs *);
int bch2_sb_member_alloc(struct bch_fs *);
void bch2_sb_members_clean_deleted(struct bch_fs *);

View File

@ -125,6 +125,9 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t, bool);
/*
 * Iterate over darray @_d from the last element to the first; @_i is declared
 * by the macro as a pointer to the current element.  The "&& (_d).nr" term
 * makes the loop body run zero times for an empty darray.  @_d is expanded
 * several times, so it should be free of side effects.
 */
#define darray_for_each_reverse(_d, _i) \
for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i)
/*
 * Sort the (_d).nr elements of darray @_d in place using comparison callback
 * @_cmp; NULL is passed as the final argument of sort().  @_d is expanded
 * several times, so it should be free of side effects.
 */
#define darray_sort(_d, _cmp) \
sort((_d).data, (_d).nr, sizeof((_d).data[0]), _cmp, NULL)
/* Init/exit */
#define darray_init(_d) \

View File

@ -606,49 +606,32 @@ void bch2_bio_map(struct bio *bio, void *base, size_t size)
int bch2_bio_alloc_pages(struct bio *bio, unsigned bs, size_t size, gfp_t gfp_mask)
{
BUG_ON(!is_power_of_2(bs));
BUG_ON(size & (bs - 1));
unsigned bs_pages = DIV_ROUND_UP(bs, PAGE_SIZE);
/*
* XXX: we could do this by allocating higher order pages, but
*
* - the page allocator gets slower at a certain order (5?) - we'd have
* to check for this
*
* - bch2_bio_free_pages_pool() probably does not handle compound pages
* yet
*/
DARRAY_PREALLOCATED(struct page *, 16) pages;
darray_init(&pages);
darray_make_room_gfp(&pages, bs_pages, gfp_mask|__GFP_NOFAIL);
unsigned max_alloc = max(bs, PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER);
int ret = 0;
while (size) {
while (pages.nr < bs_pages) {
struct page *page = alloc_pages(gfp_mask, 0);
if (!page) {
ret = -ENOMEM;
goto out;
}
while (bio->bi_iter.bi_size < size) {
unsigned b = min(size - bio->bi_iter.bi_size, max_alloc);
BUG_ON(darray_push(&pages, page));
}
BUG_ON(b & (bs - 1));
while (pages.nr) {
BUG_ON(!size);
#ifdef __KERNEL__
/*
* we don't know the device dma alignment, so in kernel make
* sure allocations are page aligned
*/
void *p = (void *) __get_free_pages(gfp_mask, get_order(b));
#else
void *p = kmalloc(b, gfp_mask);
#endif
if (!p)
return -ENOMEM;
unsigned len = min(PAGE_SIZE, size);
size -= len;
struct page *page = darray_pop(&pages);
BUG_ON(!bio_add_page(bio, page, len, 0));
}
bio_add_virt_nofail(bio, p, b);
}
out:
darray_for_each(pages, i)
__free_page(*i);
darray_exit(&pages);
return ret;
return 0;
}
u64 bch2_get_random_u64_below(u64 ceil)
@ -678,9 +661,8 @@ void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src)
struct bvec_iter iter;
__bio_for_each_segment(bv, dst, iter, dst_iter) {
void *dstp = kmap_local_page(bv.bv_page);
memcpy(dstp + bv.bv_offset, src, bv.bv_len);
void *dstp = bvec_kmap_local(&bv);
memcpy(dstp, src, bv.bv_len);
kunmap_local(dstp);
src += bv.bv_len;
@ -693,9 +675,8 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
struct bvec_iter iter;
__bio_for_each_segment(bv, src, iter, src_iter) {
void *srcp = kmap_local_page(bv.bv_page);
memcpy(dst, srcp + bv.bv_offset, bv.bv_len);
void *srcp = bvec_kmap_local(&bv);
memcpy(dst, srcp, bv.bv_len);
kunmap_local(srcp);
dst += bv.bv_len;

View File

@ -422,14 +422,6 @@ retry:
}
}
#define memalloc_flags_do(_flags, _do) \
({ \
unsigned _saved_flags = memalloc_flags_save(_flags); \
typeof(_do) _ret = _do; \
memalloc_noreclaim_restore(_saved_flags); \
_ret; \
})
static struct inode *bch2_alloc_inode(struct super_block *sb)
{
BUG();

View File

@ -64,27 +64,13 @@ const char *blk_status_to_str(blk_status_t status)
void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
struct bio *src, struct bvec_iter *src_iter)
{
struct bio_vec src_bv, dst_bv;
void *src_p, *dst_p;
unsigned bytes;
while (src_iter->bi_size && dst_iter->bi_size) {
src_bv = bio_iter_iovec(src, *src_iter);
dst_bv = bio_iter_iovec(dst, *dst_iter);
struct bio_vec src_bv = bio_iter_iovec(src, *src_iter);
struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter);
bytes = min(src_bv.bv_len, dst_bv.bv_len);
unsigned bytes = min(src_bv.bv_len, dst_bv.bv_len);
src_p = kmap_atomic(src_bv.bv_page);
dst_p = kmap_atomic(dst_bv.bv_page);
memcpy(dst_p + dst_bv.bv_offset,
src_p + src_bv.bv_offset,
bytes);
kunmap_atomic(dst_p);
kunmap_atomic(src_p);
flush_dcache_page(dst_bv.bv_page);
memcpy(dst_bv.bv_addr, src_bv.bv_addr, bytes);
bio_advance_iter(src, src_iter, bytes);
bio_advance_iter(dst, dst_iter, bytes);
@ -109,15 +95,11 @@ void bio_copy_data(struct bio *dst, struct bio *src)
void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
{
unsigned long flags;
struct bio_vec bv;
struct bvec_iter iter;
__bio_for_each_segment(bv, bio, iter, start) {
char *data = bvec_kmap_irq(&bv, &flags);
memset(data, 0, bv.bv_len);
bvec_kunmap_irq(data, &flags);
}
__bio_for_each_segment(bv, bio, iter, start)
memset(bv.bv_addr, 0, bv.bv_len);
}
static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
@ -165,15 +147,6 @@ struct bio *bio_split(struct bio *bio, int sectors,
return split;
}
void bio_free_pages(struct bio *bio)
{
struct bvec_iter_all iter;
struct bio_vec *bvec;
bio_for_each_segment_all(bvec, bio, iter)
__free_page(bvec->bv_page);
}
void bio_advance(struct bio *bio, unsigned bytes)
{
bio_advance_iter(bio, &bio->bi_iter, bytes);
@ -208,26 +181,18 @@ void bio_put(struct bio *bio)
}
}
int bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off)
void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len)
{
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
WARN_ON_ONCE(bio->bi_vcnt >= bio->bi_max_vecs);
bv->bv_page = page;
bv->bv_offset = off;
bv->bv_addr = vaddr;
bv->bv_len = len;
bio->bi_iter.bi_size += len;
bio->bi_vcnt++;
return len;
}
void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len)
{
bio_add_page(bio, virt_to_page(vaddr), len, offset_in_page(vaddr));
}
static inline bool bio_remaining_done(struct bio *bio)

View File

@ -59,18 +59,15 @@ void generic_make_request(struct bio *bio)
i = 0;
bio_for_each_segment(bv, bio, iter) {
void *start = page_address(bv.bv_page) + bv.bv_offset;
size_t len = bv.bv_len;
iov[i++] = (struct iovec) {
.iov_base = start,
.iov_len = len,
.iov_base = bv.bv_addr,
.iov_len = bv.bv_len,
};
#ifdef CONFIG_VALGRIND
/* To be pedantic it should only be on IO completion. */
if (bio_op(bio) == REQ_OP_READ)
VALGRIND_MAKE_MEM_DEFINED(start, len);
VALGRIND_MAKE_MEM_DEFINED(bv.bv_addr, bv.bv_len);
#endif
}