diff --git a/.bcachefs_revision b/.bcachefs_revision index 280ed22c..b5a8d6b1 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -c76f7e91e8939751ccc96ca2f8f6bfe6dd368d93 +2f4e24d85692600a698d78938a213f27593bda25 diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 288018fb..359cb23f 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "backpointers.h" #include "btree_cache.h" #include "btree_io.h" #include "btree_key_cache.h" @@ -37,8 +38,6 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { struct bkey_alloc_unpacked { u64 journal_seq; - u64 bucket; - u8 dev; u8 gen; u8 oldest_gen; u8 data_type; @@ -194,11 +193,7 @@ static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) { - struct bkey_alloc_unpacked ret = { - .dev = k.k->p.inode, - .bucket = k.k->p.offset, - .gen = 0, - }; + struct bkey_alloc_unpacked ret = { .gen = 0 }; switch (k.k->type) { case KEY_TYPE_alloc: @@ -215,48 +210,6 @@ static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) return ret; } -void bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) -{ - if (k.k->type == KEY_TYPE_alloc_v4) { - *out = *bkey_s_c_to_alloc_v4(k).v; - } else { - struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); - - *out = (struct bch_alloc_v4) { - .journal_seq = u.journal_seq, - .flags = u.need_discard, - .gen = u.gen, - .oldest_gen = u.oldest_gen, - .data_type = u.data_type, - .stripe_redundancy = u.stripe_redundancy, - .dirty_sectors = u.dirty_sectors, - .cached_sectors = u.cached_sectors, - .io_time[READ] = u.read_time, - .io_time[WRITE] = u.write_time, - .stripe = u.stripe, - }; - } -} - -struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) -{ - struct bkey_i_alloc_v4 *ret; - - if (k.k->type 
== KEY_TYPE_alloc_v4) { - ret = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - if (!IS_ERR(ret)) - bkey_reassemble(&ret->k_i, k); - } else { - ret = bch2_trans_kmalloc(trans, sizeof(*ret)); - if (!IS_ERR(ret)) { - bkey_alloc_v4_init(&ret->k_i); - ret->k.p = k.k->p; - bch2_alloc_to_v4(k, &ret->v); - } - } - return ret; -} - struct bkey_i_alloc_v4 * bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, struct bpos pos) @@ -339,9 +292,15 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, { struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); - if (bkey_val_bytes(k.k) != sizeof(struct bch_alloc_v4)) { - prt_printf(err, "bad val size (%zu != %zu)", - bkey_val_bytes(k.k), sizeof(struct bch_alloc_v4)); + if (alloc_v4_u64s(a.v) != bkey_val_u64s(k.k)) { + prt_printf(err, "bad val size (%lu != %u)", + bkey_val_u64s(k.k), alloc_v4_u64s(a.v)); + return -EINVAL; + } + + if (!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && + BCH_ALLOC_V4_NR_BACKPOINTERS(a.v)) { + prt_printf(err, "invalid backpointers_start"); return -EINVAL; } @@ -401,9 +360,19 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, return 0; } +static inline u64 swab40(u64 x) +{ + return (((x & 0x00000000ffULL) << 32)| + ((x & 0x000000ff00ULL) << 16)| + ((x & 0x0000ff0000ULL) >> 0)| + ((x & 0x00ff000000ULL) >> 16)| + ((x & 0xff00000000ULL) >> 32)); +} + void bch2_alloc_v4_swab(struct bkey_s k) { struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; + struct bch_backpointer *bp, *bps; a->journal_seq = swab64(a->journal_seq); a->flags = swab32(a->flags); @@ -413,25 +382,135 @@ void bch2_alloc_v4_swab(struct bkey_s k) a->io_time[1] = swab64(a->io_time[1]); a->stripe = swab32(a->stripe); a->nr_external_backpointers = swab32(a->nr_external_backpointers); + + bps = alloc_v4_backpointers(a); + for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) { + bp->bucket_offset = swab40(bp->bucket_offset); + bp->bucket_len = swab32(bp->bucket_len); + 
bch2_bpos_swab(&bp->pos); + } } void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - struct bch_alloc_v4 a; + struct bch_alloc_v4 _a; + const struct bch_alloc_v4 *a = &_a; + const struct bch_backpointer *bps; + unsigned i; - bch2_alloc_to_v4(k, &a); + if (k.k->type == KEY_TYPE_alloc_v4) + a = bkey_s_c_to_alloc_v4(k).v; + else + bch2_alloc_to_v4(k, &_a); - prt_printf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu need_discard %llu need_inc_gen %llu", - a.gen, a.oldest_gen, bch2_data_types[a.data_type], - a.journal_seq, - BCH_ALLOC_V4_NEED_DISCARD(&a), - BCH_ALLOC_V4_NEED_INC_GEN(&a)); - prt_printf(out, " dirty_sectors %u", a.dirty_sectors); - prt_printf(out, " cached_sectors %u", a.cached_sectors); - prt_printf(out, " stripe %u", a.stripe); - prt_printf(out, " stripe_redundancy %u", a.stripe_redundancy); - prt_printf(out, " read_time %llu", a.io_time[READ]); - prt_printf(out, " write_time %llu", a.io_time[WRITE]); + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_printf(out, "gen %u oldest_gen %u data_type %s", + a->gen, a->oldest_gen, bch2_data_types[a->data_type]); + prt_newline(out); + prt_printf(out, "journal_seq %llu", a->journal_seq); + prt_newline(out); + prt_printf(out, "need_discard %llu", BCH_ALLOC_V4_NEED_DISCARD(a)); + prt_newline(out); + prt_printf(out, "need_inc_gen %llu", BCH_ALLOC_V4_NEED_INC_GEN(a)); + prt_newline(out); + prt_printf(out, "dirty_sectors %u", a->dirty_sectors); + prt_newline(out); + prt_printf(out, "cached_sectors %u", a->cached_sectors); + prt_newline(out); + prt_printf(out, "stripe %u", a->stripe); + prt_newline(out); + prt_printf(out, "stripe_redundancy %u", a->stripe_redundancy); + prt_newline(out); + prt_printf(out, "io_time[READ] %llu", a->io_time[READ]); + prt_newline(out); + prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]); + prt_newline(out); + prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a)); + printbuf_indent_add(out, 2); + + bps = 
alloc_v4_backpointers_c(a); + for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a); i++) { + prt_newline(out); + bch2_backpointer_to_text(out, &bps[i]); + } + + printbuf_indent_sub(out, 4); +} + +void bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) +{ + if (k.k->type == KEY_TYPE_alloc_v4) { + int d; + + *out = *bkey_s_c_to_alloc_v4(k).v; + + d = (int) BCH_ALLOC_V4_U64s - + (int) (BCH_ALLOC_V4_BACKPOINTERS_START(out) ?: BCH_ALLOC_V4_U64s_V0); + if (unlikely(d > 0)) { + memset((u64 *) out + BCH_ALLOC_V4_BACKPOINTERS_START(out), + 0, + d * sizeof(u64)); + SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); + } + } else { + struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); + + *out = (struct bch_alloc_v4) { + .journal_seq = u.journal_seq, + .flags = u.need_discard, + .gen = u.gen, + .oldest_gen = u.oldest_gen, + .data_type = u.data_type, + .stripe_redundancy = u.stripe_redundancy, + .dirty_sectors = u.dirty_sectors, + .cached_sectors = u.cached_sectors, + .io_time[READ] = u.read_time, + .io_time[WRITE] = u.write_time, + .stripe = u.stripe, + }; + + SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); + } +} + +struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) +{ + unsigned bytes = k.k->type == KEY_TYPE_alloc_v4 + ? bkey_bytes(k.k) + : sizeof(struct bkey_i_alloc_v4); + struct bkey_i_alloc_v4 *ret; + + /* + * Reserve space for one more backpointer here: + * Not sketchy at doing it this way, nope... 
+ */ + ret = bch2_trans_kmalloc(trans, bytes + sizeof(struct bch_backpointer)); + if (IS_ERR(ret)) + return ret; + + if (k.k->type == KEY_TYPE_alloc_v4) { + bkey_reassemble(&ret->k_i, k); + + if (BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v) < BCH_ALLOC_V4_U64s) { + struct bch_backpointer *src, *dst; + + src = (void *) ((u64 *) &ret->v + (BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v) ?: BCH_ALLOC_V4_U64s_V0)); /* START == 0 denotes the V0 layout, as in alloc_v4_u64s() */ + SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); + dst = alloc_v4_backpointers(&ret->v); + + memmove(dst, src, BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v) * + sizeof(struct bch_backpointer)); + memset(src, 0, (char *) dst - (char *) src); /* length in bytes, not in backpointers */ + set_alloc_v4_u64s(ret); + } + } else { + bkey_alloc_v4_init(&ret->k_i); + ret->k.p = k.k->p; + bch2_alloc_to_v4(k, &ret->v); + } + return ret; } int bch2_alloc_read(struct bch_fs *c) @@ -1052,6 +1131,7 @@ static void bch2_do_discards_work(struct work_struct *work) if (ret) break; + this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]); discarded++; } bch2_trans_iter_exit(&trans, &iter); diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index ff366e61..2ac6b504 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -70,6 +70,22 @@ static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_ return pos; } +static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a) +{ + unsigned ret = (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: + BCH_ALLOC_V4_U64s_V0) + + BCH_ALLOC_V4_NR_BACKPOINTERS(a) * + (sizeof(struct bch_backpointer) / sizeof(u64)); + + BUG_ON(ret > U8_MAX - BKEY_U64s); + return ret; +} + +static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a) +{ + set_bkey_val_u64s(&a->k, alloc_v4_u64s(&a->v)); +} + struct bkey_i_alloc_v4 * bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos); @@ -143,6 +159,16 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca, void bch2_do_invalidates(struct bch_fs *); +static inline struct bch_backpointer *alloc_v4_backpointers(struct
bch_alloc_v4 *a) +{ + return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a)); +} + +static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a) +{ + return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a)); +} + int bch2_fs_freespace_init(struct bch_fs *); void bch2_recalc_capacity(struct bch_fs *); diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index d52282fb..7a878a69 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -506,7 +506,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, int ret; again: usage = bch2_dev_usage_read(ca); - avail = __dev_buckets_available(ca, usage,reserve); + avail = dev_buckets_free(ca, usage,reserve); if (usage.d[BCH_DATA_need_discard].buckets > avail) bch2_do_discards(c); diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c new file mode 100644 index 00000000..f3260bbe --- /dev/null +++ b/libbcachefs/backpointers.c @@ -0,0 +1,891 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" +#include "backpointers.h" +#include "btree_cache.h" +#include "btree_update.h" +#include "error.h" + +#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10 + +/* + * Convert from pos in backpointer btree to pos of corresponding bucket in alloc + * btree: + */ +static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c, + struct bpos bp_pos) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode); + u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; + + return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); +} + +/* + * Convert from pos in alloc btree + bucket offset to pos in backpointer btree: + */ +static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, + struct bpos bucket, + u64 bucket_offset) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); + + return POS(bucket.inode, + 
(bucket_to_sector(ca, bucket.offset) << + MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); +} + +void bch2_extent_ptr_to_bp(struct bch_fs *c, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, struct extent_ptr_decoded p, + struct bpos *bucket_pos, struct bch_backpointer *bp) +{ + enum bch_data_type data_type = level ? BCH_DATA_btree : BCH_DATA_user; + s64 sectors = level ? btree_sectors(c) : k.k->size; + u32 bucket_offset; + + *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset); + *bp = (struct bch_backpointer) { + .btree_id = btree_id, + .level = level, + .data_type = data_type, + .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + + p.crc.offset, + .bucket_len = ptr_disk_sectors(sectors, p), + .pos = k.k->p, + }; +} + +static bool extent_matches_bp(struct bch_fs *c, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, + struct bpos bucket, + struct bch_backpointer bp) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bpos bucket2; + struct bch_backpointer bp2; + + if (p.ptr.cached) + continue; + + bch2_extent_ptr_to_bp(c, btree_id, level, k, p, + &bucket2, &bp2); + if (!bpos_cmp(bucket, bucket2) && + !memcmp(&bp, &bp2, sizeof(bp))) + return true; + } + + return false; +} + +int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) +{ + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); + + if (bkey_val_bytes(bp.k) < sizeof(*bp.v)) { + prt_str(err, "incorrect value size"); + return -EINVAL; + } + + if (bpos_cmp(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset))) { + prt_str(err, "backpointer at wrong pos"); + return -EINVAL; + } + + return 0; +} + +void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp) +{ + prt_printf(out, 
"btree=%s l=%u offset=%llu:%u len=%u pos=", + bch2_btree_ids[bp->btree_id], + bp->level, + (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), + (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), + bp->bucket_len); + bch2_bpos_to_text(out, bp->pos); +} + +void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); +} + +void bch2_backpointer_swab(struct bkey_s k) +{ + struct bkey_s_backpointer bp = bkey_s_to_backpointer(k); + + bp.v->bucket_offset = swab32(bp.v->bucket_offset); + bp.v->bucket_len = swab32(bp.v->bucket_len); + bch2_bpos_swab(&bp.v->pos); +} + +#define BACKPOINTER_OFFSET_MAX ((1ULL << 40) - 1) + +static inline int backpointer_cmp(struct bch_backpointer l, struct bch_backpointer r) +{ + return cmp_int(l.bucket_offset, r.bucket_offset); +} + +static int bch2_backpointer_del_by_offset(struct btree_trans *trans, + struct bpos bucket, + u64 bp_offset, + struct bch_backpointer bp) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + if (bp_offset < BACKPOINTER_OFFSET_MAX) { + struct bch_backpointer *bps; + struct bkey_i_alloc_v4 *a; + unsigned i, nr; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + bucket, + BTREE_ITER_INTENT| + BTREE_ITER_SLOTS| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_alloc_v4) { + ret = -ENOENT; + goto err; + } + + a = bch2_alloc_to_v4_mut(trans, k); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto err; + bps = alloc_v4_backpointers(&a->v); + nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v); + + for (i = 0; i < nr; i++) { + if (bps[i].bucket_offset == bp_offset) + goto found; + if (bps[i].bucket_offset > bp_offset) + break; + } + + ret = -ENOENT; + goto err; +found: + if (memcmp(&bps[i], &bp, sizeof(bp))) { + ret = -ENOENT; + goto err; + } + array_remove_item(bps, nr, i); + 
SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr); + set_alloc_v4_u64s(a); + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + } else { + bp_offset -= BACKPOINTER_OFFSET_MAX; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_backpointers, + bucket_pos_to_bp(c, bucket, bp_offset), + BTREE_ITER_INTENT| + BTREE_ITER_SLOTS| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_backpointer || + memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp))) { + ret = -ENOENT; + goto err; + } + + ret = bch2_btree_delete_at(trans, &iter, 0); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_bucket_backpointer_del(struct btree_trans *trans, + struct bkey_i_alloc_v4 *a, + struct bch_backpointer bp, + struct bkey_s_c orig_k) +{ + struct bch_fs *c = trans->c; + struct bch_backpointer *bps = alloc_v4_backpointers(&a->v); + unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v); + struct btree_iter bp_iter; + struct bkey_s_c k; + int ret; + + for (i = 0; i < nr; i++) { + int cmp = backpointer_cmp(bps[i], bp) ?: + memcmp(&bps[i], &bp, sizeof(bp)); + if (!cmp) + goto found; + if (cmp >= 0) + break; + } + + goto btree; +found: + array_remove_item(bps, nr, i); + SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr); + set_alloc_v4_u64s(a); + return 0; +btree: + bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, + bucket_pos_to_bp(c, a->k.p, bp.bucket_offset), + BTREE_ITER_INTENT| + BTREE_ITER_SLOTS| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&bp_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_backpointer || + memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp))) { + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "backpointer not found when deleting"); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + + prt_printf(&buf, "searching for "); + bch2_backpointer_to_text(&buf, &bp); + prt_newline(&buf); + + prt_printf(&buf, "got 
"); + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + + prt_str(&buf, "alloc "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); + prt_newline(&buf); + + prt_printf(&buf, "for "); + bch2_bkey_val_to_text(&buf, c, orig_k); + + if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { + bch_err(c, "%s", buf.buf); + } else { + ret = -EIO; + bch2_trans_inconsistent(trans, "%s", buf.buf); + } + printbuf_exit(&buf); + goto err; + } + + ret = bch2_btree_delete_at(trans, &bp_iter, 0); +err: + bch2_trans_iter_exit(trans, &bp_iter); + return ret; +} + +int bch2_bucket_backpointer_add(struct btree_trans *trans, + struct bkey_i_alloc_v4 *a, + struct bch_backpointer bp, + struct bkey_s_c orig_k) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca; + struct bch_backpointer *bps = alloc_v4_backpointers(&a->v); + unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v); + struct bkey_i_backpointer *bp_k; + struct btree_iter bp_iter; + struct bkey_s_c k; + int ret; + + /* Check for duplicates: */ + for (i = 0; i < nr; i++) { + int cmp = backpointer_cmp(bps[i], bp); + if (cmp >= 0) + break; + } + + if ((i && + (bps[i - 1].bucket_offset + + bps[i - 1].bucket_len > bp.bucket_offset)) || + (i < nr && + (bp.bucket_offset + bp.bucket_len > bps[i].bucket_offset))) { + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "overlapping backpointer found when inserting "); + bch2_backpointer_to_text(&buf, &bp); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + + prt_printf(&buf, "into "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); + prt_newline(&buf); + + prt_printf(&buf, "for "); + bch2_bkey_val_to_text(&buf, c, orig_k); + + if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) + bch_err(c, "%s", buf.buf); + else { + bch2_trans_inconsistent(trans, "%s", buf.buf); + printbuf_exit(&buf); + return -EIO; + } + } + + if (nr < BCH_ALLOC_V4_NR_BACKPOINTERS_MAX) { + array_insert_item(bps, nr, i, bp); + SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr); + 
set_alloc_v4_u64s(a); + return 0; + } + + /* Overflow: use backpointer btree */ + bp_k = bch2_trans_kmalloc(trans, sizeof(*bp_k)); + ret = PTR_ERR_OR_ZERO(bp_k); + if (ret) + return ret; + + ca = bch_dev_bkey_exists(c, a->k.p.inode); + + bkey_backpointer_init(&bp_k->k_i); + bp_k->k.p = bucket_pos_to_bp(c, a->k.p, bp.bucket_offset); + bp_k->v = bp; + + bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_k->k.p, + BTREE_ITER_INTENT| + BTREE_ITER_SLOTS| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&bp_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type) { + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "existing btree backpointer key found when inserting "); + bch2_backpointer_to_text(&buf, &bp); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + + prt_printf(&buf, "found "); + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + + prt_printf(&buf, "for "); + bch2_bkey_val_to_text(&buf, c, orig_k); + + if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) + bch_err(c, "%s", buf.buf); + else { + bch2_trans_inconsistent(trans, "%s", buf.buf); + printbuf_exit(&buf); + ret = -EIO; + goto err; + } + } + + ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0); +err: + bch2_trans_iter_exit(trans, &bp_iter); + return ret; +} + +/* + * Find the next backpointer >= *bp_offset: + */ +int bch2_get_next_backpointer(struct btree_trans *trans, + struct bpos bucket, int gen, + u64 *bp_offset, + struct bch_backpointer *dst) +{ + struct bch_fs *c = trans->c; + struct bpos bp_pos = + bucket_pos_to_bp(c, bucket, + max(*bp_offset, BACKPOINTER_OFFSET_MAX) - BACKPOINTER_OFFSET_MAX); + struct bpos bp_end_pos = + bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0); + struct btree_iter alloc_iter, bp_iter = { NULL }; + struct bkey_s_c k; + struct bkey_s_c_alloc_v4 a; + size_t i; + int ret; + + bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, + bucket, BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(&alloc_iter); + 
ret = bkey_err(k); + if (ret) + goto out; + + if (k.k->type != KEY_TYPE_alloc_v4) + goto done; + + a = bkey_s_c_to_alloc_v4(k); + if (gen >= 0 && a.v->gen != gen) + goto done; + + for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) { + if (alloc_v4_backpointers_c(a.v)[i].bucket_offset < *bp_offset) + continue; + + *dst = alloc_v4_backpointers_c(a.v)[i]; + *bp_offset = dst->bucket_offset; + goto out; + } + + for_each_btree_key(trans, bp_iter, BTREE_ID_backpointers, + bp_pos, 0, k, ret) { + if (bpos_cmp(k.k->p, bp_end_pos) >= 0) + break; + + if (k.k->type != KEY_TYPE_backpointer) + continue; + + *dst = *bkey_s_c_to_backpointer(k).v; + *bp_offset = dst->bucket_offset + BACKPOINTER_OFFSET_MAX; + goto out; + } +done: + *bp_offset = U64_MAX; +out: + bch2_trans_iter_exit(trans, &bp_iter); + bch2_trans_iter_exit(trans, &alloc_iter); + return ret; +} + +static void backpointer_not_found(struct btree_trans *trans, + struct bpos bucket, + u64 bp_offset, + struct bch_backpointer bp, + struct bkey_s_c k, + const char *thing_it_points_to) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", + thing_it_points_to); + prt_printf(&buf, "bucket: "); + bch2_bpos_to_text(&buf, bucket); + prt_printf(&buf, "\n "); + + if (bp_offset >= BACKPOINTER_OFFSET_MAX) { + struct bpos bp_pos = + bucket_pos_to_bp(c, bucket, + bp_offset - BACKPOINTER_OFFSET_MAX); + prt_printf(&buf, "backpointer pos: "); + bch2_bpos_to_text(&buf, bp_pos); + prt_printf(&buf, "\n "); + } + + bch2_backpointer_to_text(&buf, &bp); + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k); + if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) + bch_err(c, "%s", buf.buf); + else + bch2_trans_inconsistent(trans, "%s", buf.buf); + + printbuf_exit(&buf); +} + +struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos bucket, + u64 bp_offset, + struct bch_backpointer bp) +{ + 
struct bch_fs *c = trans->c; + struct bkey_s_c k; + + bch2_trans_node_iter_init(trans, iter, + bp.btree_id, + bp.pos, + 0, + min(bp.level, c->btree_roots[bp.btree_id].level), + 0); + k = bch2_btree_iter_peek_slot(iter); + if (bkey_err(k)) { + bch2_trans_iter_exit(trans, iter); + return k; + } + + if (bp.level == c->btree_roots[bp.btree_id].level + 1) + k = bkey_i_to_s_c(&c->btree_roots[bp.btree_id].key); + + if (extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) + return k; + + backpointer_not_found(trans, bucket, bp_offset, bp, k, "extent"); + + bch2_trans_iter_exit(trans, iter); + return bkey_s_c_null; +} + +struct btree *bch2_backpointer_get_node(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos bucket, + u64 bp_offset, + struct bch_backpointer bp) +{ + struct bch_fs *c = trans->c; + struct btree *b; + struct bkey_s_c k; + + BUG_ON(!bp.level); + + bch2_trans_node_iter_init(trans, iter, + bp.btree_id, + bp.pos, + 0, + bp.level - 1, + 0); + b = bch2_btree_iter_peek_node(iter); + if (IS_ERR(b)) { + bch2_trans_iter_exit(trans, iter); + return b; + } + k = bkey_i_to_s_c(&b->key); /* node key: matched below and reported on mismatch */ + if (extent_matches_bp(c, bp.btree_id, bp.level, + k, + bucket, bp)) + return b; + + if (!btree_node_will_make_reachable(b)) + backpointer_not_found(trans, bucket, bp_offset, + bp, k, "btree node"); + + bch2_trans_iter_exit(trans, iter); + return NULL; +} + +static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter) +{ + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter = { NULL }; + struct bch_dev *ca; + struct bkey_s_c k, alloc_k; + struct printbuf buf = PRINTBUF; + int ret = 0; + + k = bch2_btree_iter_peek(bp_iter); + ret = bkey_err(k); + if (ret) + return ret; + if (!k.k) + return 0; + + if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c, + "backpointer for mising device:\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, bp_iter, 0); + goto out; + } + + ca =
bch_dev_bkey_exists(c, k.k->p.inode); + + bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, + bp_pos_to_bucket(c, k.k->p), 0); + + alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); + ret = bkey_err(alloc_k); + if (ret) + goto out; + + if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c, + "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", + alloc_iter.pos.inode, alloc_iter.pos.offset, + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + ret = bch2_btree_delete_at(trans, bp_iter, 0); + goto out; + } +out: +fsck_err: + bch2_trans_iter_exit(trans, &alloc_iter); + printbuf_exit(&buf); + return ret; +} + +/* verify that every backpointer has a corresponding alloc key */ +int bch2_check_btree_backpointers(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_backpointers, POS_MIN, 0); + + do { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + bch2_check_btree_backpointer(&trans, &iter)); + if (ret) + break; + } while (bch2_btree_iter_advance(&iter)); + + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret; +} + +static int check_bp_exists(struct btree_trans *trans, + struct bpos bucket_pos, + struct bch_backpointer bp, + struct bkey_s_c orig_k) +{ + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter, bp_iter = { NULL }; + struct printbuf buf = PRINTBUF; + struct bkey_s_c alloc_k, bp_k; + int ret; + + bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, bucket_pos, 0); + alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); + ret = bkey_err(alloc_k); + if (ret) + goto err; + + if (alloc_k.k->type == KEY_TYPE_alloc_v4) { + struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(alloc_k); + const struct bch_backpointer *bps = alloc_v4_backpointers_c(a.v); + unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); + + for (i = 0; i < nr; i++) { + int cmp = 
backpointer_cmp(bps[i], bp) ?: + memcmp(&bps[i], &bp, sizeof(bp)); + if (!cmp) + goto out; + if (cmp >= 0) + break; + } + } else { + goto missing; + } + + bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, + bucket_pos_to_bp(c, bucket_pos, bp.bucket_offset), + 0); + bp_k = bch2_btree_iter_peek_slot(&bp_iter); + ret = bkey_err(bp_k); + if (ret) + goto err; + + if (bp_k.k->type != KEY_TYPE_backpointer || + memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) + goto missing; +out: +err: +fsck_err: + bch2_trans_iter_exit(trans, &bp_iter); + bch2_trans_iter_exit(trans, &alloc_iter); + printbuf_exit(&buf); + return ret; +missing: + prt_printf(&buf, "missing backpointer for btree=%s l=%u ", + bch2_btree_ids[bp.btree_id], bp.level); + bch2_bkey_val_to_text(&buf, c, orig_k); + prt_printf(&buf, "\nin alloc key "); + bch2_bkey_val_to_text(&buf, c, alloc_k); + + if (c->sb.version < bcachefs_metadata_version_backpointers || + fsck_err(c, "%s", buf.buf)) { + struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, alloc_k); + + ret = PTR_ERR_OR_ZERO(a) ?: + bch2_bucket_backpointer_add(trans, a, bp, orig_k) ?: + bch2_trans_update(trans, &alloc_iter, &a->k_i, 0); + } + + goto out; +} + +static int check_extent_to_backpointers(struct btree_trans *trans, + struct btree_iter *iter) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bkey_s_c k; + int ret; + + k = bch2_btree_iter_peek_all_levels(iter); + ret = bkey_err(k); + if (ret) + return ret; + if (!k.k) + return 0; + + ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bpos bucket_pos; + struct bch_backpointer bp; + + if (p.ptr.cached) + continue; + + bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level, + k, p, &bucket_pos, &bp); + + ret = check_bp_exists(trans, bucket_pos, bp, k); + if (ret) + return ret; + } + + return 0; +} + +static int check_btree_root_to_backpointers(struct 
btree_trans *trans, + enum btree_id btree_id) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct btree *b; + struct bkey_s_c k; + struct bkey_ptrs_c ptrs; + struct extent_ptr_decoded p; + const union bch_extent_entry *entry; + int ret; + + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, + c->btree_roots[btree_id].level, 0); + b = bch2_btree_iter_peek_node(&iter); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto err; + + BUG_ON(b != btree_node_root(c, b)); + + k = bkey_i_to_s_c(&b->key); + ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bpos bucket_pos; + struct bch_backpointer bp; + + if (p.ptr.cached) + continue; + + bch2_extent_ptr_to_bp(c, iter.btree_id, iter.path->level + 1, + k, p, &bucket_pos, &bp); + + ret = check_bp_exists(trans, bucket_pos, bp, k); + if (ret) + goto err; + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_check_extents_to_backpointers(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + enum btree_id btree_id; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { + bch2_trans_node_iter_init(&trans, &iter, btree_id, POS_MIN, 0, + 0, + BTREE_ITER_ALL_LEVELS| + BTREE_ITER_PREFETCH); + + do { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_extent_to_backpointers(&trans, &iter)); + if (ret) + break; + } while (!bch2_btree_iter_advance(&iter)); + + bch2_trans_iter_exit(&trans, &iter); + + if (ret) + break; + + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_btree_root_to_backpointers(&trans, btree_id)); + if (ret) + break; + } + bch2_trans_exit(&trans); + return ret; +} + +static int check_one_backpointer(struct btree_trans *trans, + struct bpos bucket, + u64 *bp_offset) +{ + struct btree_iter iter; + struct bch_backpointer bp; + struct bkey_s_c k; + struct printbuf buf = 
PRINTBUF; + int ret; + + ret = bch2_get_next_backpointer(trans, bucket, -1, + bp_offset, &bp); + if (ret || *bp_offset == U64_MAX) + return ret; + + k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp); + ret = bkey_err(k); + if (ret) + return ret; + + if (fsck_err_on(!k.k, trans->c, + "%s backpointer points to missing extent\n%s", + *bp_offset < BACKPOINTER_OFFSET_MAX ? "alloc" : "btree", + (bch2_backpointer_to_text(&buf, &bp), buf.buf))) { + ret = bch2_backpointer_del_by_offset(trans, bucket, *bp_offset, bp); + if (ret == -ENOENT) + bch_err(trans->c, "backpointer at %llu not found", *bp_offset); + } + + bch2_trans_iter_exit(trans, &iter); +fsck_err: + printbuf_exit(&buf); + return ret; +} + +int bch2_check_backpointers_to_extents(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + u64 bp_offset = 0; + + while (!(ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_one_backpointer(&trans, iter.pos, &bp_offset))) && + bp_offset < U64_MAX) + bp_offset++; + + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret < 0 ? 
ret : 0; +} diff --git a/libbcachefs/backpointers.h b/libbcachefs/backpointers.h new file mode 100644 index 00000000..fe42af29 --- /dev/null +++ b/libbcachefs/backpointers.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H +#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H + +#include "super.h" + +int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k, + int, struct printbuf *); +void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); +void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_backpointer_swab(struct bkey_s); + +#define bch2_bkey_ops_backpointer (struct bkey_ops) { \ + .key_invalid = bch2_backpointer_invalid, \ + .val_to_text = bch2_backpointer_k_to_text, \ + .swab = bch2_backpointer_swab, \ +} + +void bch2_extent_ptr_to_bp(struct bch_fs *, enum btree_id, unsigned, + struct bkey_s_c, struct extent_ptr_decoded, + struct bpos *, struct bch_backpointer *); + +int bch2_bucket_backpointer_del(struct btree_trans *, struct bkey_i_alloc_v4 *, + struct bch_backpointer, struct bkey_s_c); +int bch2_bucket_backpointer_add(struct btree_trans *, struct bkey_i_alloc_v4 *, + struct bch_backpointer, struct bkey_s_c); +int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int, + u64 *, struct bch_backpointer *); +struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, + struct bpos, u64, struct bch_backpointer); +struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *, + struct bpos, u64, struct bch_backpointer); + +int bch2_check_btree_backpointers(struct bch_fs *); +int bch2_check_extents_to_backpointers(struct bch_fs *); +int bch2_check_backpointers_to_extents(struct bch_fs *); + +#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 2eced206..1f0484aa 100644 --- a/libbcachefs/bcachefs.h +++ 
b/libbcachefs/bcachefs.h @@ -509,6 +509,7 @@ enum { BCH_FS_TOPOLOGY_REPAIR_DONE, BCH_FS_INITIAL_GC_DONE, /* kill when we enumerate fsck passes */ BCH_FS_CHECK_LRUS_DONE, + BCH_FS_CHECK_BACKPOINTERS_DONE, BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, BCH_FS_FSCK_DONE, BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index dbe9a37f..147fde14 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -365,7 +365,8 @@ static inline void bkey_init(struct bkey *k) x(alloc_v3, 24) \ x(set, 25) \ x(lru, 26) \ - x(alloc_v4, 27) + x(alloc_v4, 27) \ + x(backpointer, 28) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -886,6 +887,12 @@ struct bch_alloc { x(stripe, 32) \ x(stripe_redundancy, 8) +enum { +#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, + BCH_ALLOC_FIELDS_V1() +#undef x +}; + struct bch_alloc_v2 { struct bch_val v; __u8 nr_fields; @@ -914,6 +921,9 @@ struct bch_alloc_v3 { __u8 data[]; } __attribute__((packed, aligned(8))); +LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) +LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) + struct bch_alloc_v4 { struct bch_val v; __u64 journal_seq; @@ -927,22 +937,27 @@ struct bch_alloc_v4 { __u64 io_time[2]; __u32 stripe; __u32 nr_external_backpointers; - struct bpos backpointers[0]; } __attribute__((packed, aligned(8))); -LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) -LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) +#define BCH_ALLOC_V4_U64s_V0 6 +#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(u64)) BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) -enum { 
-#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, - BCH_ALLOC_FIELDS_V1() -#undef x -}; +#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX 40 + +struct bch_backpointer { + struct bch_val v; + __u8 btree_id; + __u8 level; + __u8 data_type; + __u64 bucket_offset:40; + __u32 bucket_len; + struct bpos pos; +} __attribute__((packed, aligned(8))); /* Quotas: */ @@ -1326,7 +1341,8 @@ struct bch_sb_field_disk_groups { x(io_read, 0) \ x(io_write, 1) \ x(io_move, 2) \ - x(bucket_invalidate, 3) + x(bucket_invalidate, 3) \ + x(bucket_discard, 4) enum bch_persistent_counters { #define x(t, n, ...) BCH_COUNTER_##t, @@ -1407,7 +1423,8 @@ struct bch_sb_field_journal_seq_blacklist { x(inode_v2, 18) \ x(freespace, 19) \ x(alloc_v4, 20) \ - x(new_data_types, 21) + x(new_data_types, 21) \ + x(backpointers, 22) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index 229d5157..fd352a67 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "backpointers.h" #include "bkey_methods.h" #include "btree_types.h" #include "alloc_background.h" @@ -191,6 +192,9 @@ static unsigned bch2_key_types_allowed[] = { [BKEY_TYPE_need_discard] = (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_set), + [BKEY_TYPE_backpointers] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_backpointer), [BKEY_TYPE_btree] = (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_btree_ptr)| diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index d8f92cc9..5382f2b8 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -376,7 +376,7 @@ struct btree_trans_commit_hook { struct btree_trans_commit_hook *next; }; -#define BTREE_TRANS_MEM_MAX (1U << 14) +#define BTREE_TRANS_MEM_MAX (1U << 16) struct btree_trans { struct bch_fs *c; @@ -638,6 +638,11 @@ static inline bool btree_type_has_snapshots(enum btree_id id) return (1 << id) & 
BTREE_ID_HAS_SNAPSHOTS; } +static inline bool btree_type_has_ptrs(enum btree_id id) +{ + return (1 << id) & BTREE_ID_HAS_PTRS; +} + static inline bool btree_node_type_needs_gc(enum btree_node_type type) { return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index e2944fc4..1ea7e2ba 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -7,6 +7,7 @@ #include "bcachefs.h" #include "alloc_background.h" +#include "backpointers.h" #include "bset.h" #include "btree_gc.h" #include "btree_update.h" @@ -655,16 +656,6 @@ err: return ret; } -static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) -{ - EBUG_ON(sectors < 0); - - return crc_is_compressed(p.crc) - ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size, - p.crc.uncompressed_size) - : sectors; -} - static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, const struct bch_extent_ptr *ptr, @@ -1368,21 +1359,43 @@ need_mark: /* trans_mark: */ static int bch2_trans_mark_pointer(struct btree_trans *trans, - struct bkey_s_c k, struct extent_ptr_decoded p, - s64 sectors, enum bch_data_type data_type) + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, struct extent_ptr_decoded p, + unsigned flags) { + bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); struct btree_iter iter; struct bkey_i_alloc_v4 *a; + struct bpos bucket_pos; + struct bch_backpointer bp; + s64 sectors; int ret; - a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(trans->c, &p.ptr)); + bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket_pos, &bp); + sectors = bp.bucket_len; + if (!insert) + sectors = -sectors; + + a = bch2_trans_start_alloc_update(trans, &iter, bucket_pos); if (IS_ERR(a)) return PTR_ERR(a); - ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type, + ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type, a->v.gen, &a->v.data_type, - &a->v.dirty_sectors, &a->v.cached_sectors) ?: - bch2_trans_update(trans, &iter, 
&a->k_i, 0); + &a->v.dirty_sectors, &a->v.cached_sectors); + if (ret) + goto err; + + if (!p.ptr.cached) { + ret = insert + ? bch2_bucket_backpointer_add(trans, a, bp, k) + : bch2_bucket_backpointer_del(trans, a, bp, k); + if (ret) + goto err; + } + + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); +err: bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1476,8 +1489,7 @@ int bch2_trans_mark_extent(struct btree_trans *trans, if (flags & BTREE_TRIGGER_OVERWRITE) disk_sectors = -disk_sectors; - ret = bch2_trans_mark_pointer(trans, k, p, - disk_sectors, data_type); + ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags); if (ret < 0) return ret; diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 3469327d..670b95b8 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -75,6 +75,15 @@ static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c, return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); } +static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_fs *c, + const struct bch_extent_ptr *ptr, + u32 *bucket_offset) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset)); +} + static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { @@ -90,6 +99,16 @@ static inline enum bch_data_type ptr_data_type(const struct bkey *k, return ptr->cached ? BCH_DATA_cached : BCH_DATA_user; } +static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) +{ + EBUG_ON(sectors < 0); + + return crc_is_compressed(p.crc) + ? 
DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size, + p.crc.uncompressed_size) + : sectors; +} + static inline int gen_cmp(u8 a, u8 b) { return (s8) (a - b); @@ -144,12 +163,25 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum alloc_reser return reserved; } +static inline u64 dev_buckets_free(struct bch_dev *ca, + struct bch_dev_usage usage, + enum alloc_reserve reserve) +{ + return max_t(s64, 0, + usage.d[BCH_DATA_free].buckets - + ca->nr_open_buckets - + bch2_dev_buckets_reserved(ca, reserve)); +} + static inline u64 __dev_buckets_available(struct bch_dev *ca, struct bch_dev_usage usage, enum alloc_reserve reserve) { return max_t(s64, 0, usage.d[BCH_DATA_free].buckets - + usage.d[BCH_DATA_cached].buckets - + usage.d[BCH_DATA_need_gc_gens].buckets - + usage.d[BCH_DATA_need_discard].buckets - ca->nr_open_buckets - bch2_dev_buckets_reserved(ca, reserve)); } diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 0a9dd5af..1dbba7d9 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -95,7 +95,7 @@ struct copygc_heap_entry { u8 replicas; u32 fragmentation; u32 sectors; - u64 offset; + u64 bucket; }; typedef HEAP(struct copygc_heap_entry) copygc_heap; diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index f1abec95..81bfd6ea 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -1015,47 +1015,6 @@ static int check_subvols(struct bch_fs *c) return ret; } -/* - * Checking for overlapping extents needs to be reimplemented - */ -#if 0 -static int fix_overlapping_extent(struct btree_trans *trans, - struct bkey_s_c k, struct bpos cut_at) -{ - struct btree_iter iter; - struct bkey_i *u; - int ret; - - u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - return ret; - - bkey_reassemble(u, k); - bch2_cut_front(cut_at, u); - - - /* - * We don't want to go through the extent_handle_overwrites path: - * - * XXX: this is going to screw up disk accounting, extent triggers 
- * assume things about extent overwrites - we should be running the - * triggers manually here - */ - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p, - BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); - - BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); - bch2_trans_iter_exit(trans, &iter); - return ret; -} -#endif - static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, struct btree_iter *iter, struct bpos pos) @@ -1150,9 +1109,72 @@ fsck_err: return ret ?: ret2; } +struct extent_end { + u32 snapshot; + u64 offset; +}; + +typedef DARRAY(struct extent_end) extent_ends; + +static int extent_ends_at(extent_ends *extent_ends, + struct bkey_s_c k) +{ + struct extent_end *i, n = (struct extent_end) { + .snapshot = k.k->p.snapshot, + .offset = k.k->p.offset, + }; + + darray_for_each(*extent_ends, i) { + if (i->snapshot == k.k->p.snapshot) { + *i = n; + return 0; + } + + if (i->snapshot >= k.k->p.snapshot) + break; + } + + return darray_insert_item(extent_ends, i - extent_ends->data, n); +} + +static int check_extent_start(struct btree_trans *trans, + struct snapshots_seen *s, + extent_ends *extent_ends, + struct bkey_s_c k, + struct btree_iter *iter) +{ + struct bch_fs *c = trans->c; + struct extent_end *i; + struct printbuf buf = PRINTBUF; + int ret = 0; + + darray_for_each(*extent_ends, i) { + if (fsck_err_on(i->offset > bkey_start_offset(k.k) && + key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot), c, + "overlapping extents: extent in snapshot %u ends at %llu overlaps with\n%s", + i->snapshot, + i->offset, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + struct bkey_i *update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + bkey_reassemble(update, k); + ret = 
bch2_trans_update_extent(trans, iter, update, 0); + if (!ret) + goto err; + } + } +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + static int check_extent(struct btree_trans *trans, struct btree_iter *iter, struct inode_walker *inode, - struct snapshots_seen *s) + struct snapshots_seen *s, + extent_ends *extent_ends) { struct bch_fs *c = trans->c; struct bkey_s_c k; @@ -1182,6 +1204,8 @@ peek: goto out; if (inode->cur_inum != k.k->p.inode) { + extent_ends->nr = 0; + ret = check_i_sectors(trans, inode); if (ret) goto err; @@ -1195,20 +1219,7 @@ peek: */ goto peek; } -#if 0 - if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { - char buf1[200]; - char buf2[200]; - bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); - bch2_bkey_val_to_text(&PBUF(buf2), c, k); - - if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { - ret = fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR; - goto out; - } - } -#endif ret = __walk_inode(trans, inode, k.k->p); if (ret < 0) goto err; @@ -1259,13 +1270,17 @@ peek: } } + ret = check_extent_start(trans, s, extent_ends, k, iter); + if (ret) + goto err; + if (bkey_extent_is_allocation(k.k)) for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) i->count += k.k->size; -#if 0 - bch2_bkey_buf_reassemble(&prev, c, k); -#endif + ret = extent_ends_at(extent_ends, k); + if (ret) + goto err; out: err: fsck_err: @@ -1287,13 +1302,9 @@ static int check_extents(struct bch_fs *c) struct snapshots_seen s; struct btree_trans trans; struct btree_iter iter; + extent_ends extent_ends = { 0 }; int ret = 0; -#if 0 - struct bkey_buf prev; - bch2_bkey_buf_init(&prev); - prev.k->k = KEY(0, 0, 0); -#endif snapshots_seen_init(&s); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); @@ -1309,14 +1320,12 @@ static int check_extents(struct bch_fs *c) ret = __bch2_trans_do(&trans, NULL, NULL, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL, - check_extent(&trans, &iter, &w, &s)); + check_extent(&trans, &iter, &w, &s, &extent_ends)); if (ret) 
break; } while (bch2_btree_iter_advance(&iter)); bch2_trans_iter_exit(&trans, &iter); -#if 0 - bch2_bkey_buf_exit(&prev, c); -#endif + darray_exit(&extent_ends); inode_walker_exit(&w); bch2_trans_exit(&trans); snapshots_seen_exit(&s); diff --git a/libbcachefs/move.c b/libbcachefs/move.c index a8f6d5a3..36d20dc8 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "alloc_background.h" #include "alloc_foreground.h" +#include "backpointers.h" #include "bkey_buf.h" #include "btree_gc.h" #include "btree_update.h" @@ -9,6 +11,7 @@ #include "buckets.h" #include "disk_groups.h" #include "ec.h" +#include "error.h" #include "inode.h" #include "io.h" #include "journal_reclaim.h" @@ -632,6 +635,70 @@ err: return ret; } +static int move_ratelimit(struct btree_trans *trans, + struct moving_context *ctxt, + struct bch_ratelimit *rate) +{ + u64 delay; + + do { + delay = rate ? bch2_ratelimit_delay(rate) : 0; + + if (delay) { + bch2_trans_unlock(trans); + set_current_state(TASK_INTERRUPTIBLE); + } + + if ((current->flags & PF_KTHREAD) && kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + return 1; + } + + if (delay) + schedule_timeout(delay); + + if (unlikely(freezing(current))) { + move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads)); + try_to_freeze(); + } + } while (delay); + + move_ctxt_wait_event(ctxt, trans, + atomic_read(&ctxt->write_sectors) < + SECTORS_IN_FLIGHT_PER_DEVICE); + + move_ctxt_wait_event(ctxt, trans, + atomic_read(&ctxt->read_sectors) < + SECTORS_IN_FLIGHT_PER_DEVICE); + + return 0; +} + +static int move_get_io_opts(struct btree_trans *trans, + struct bch_io_opts *io_opts, + struct bkey_s_c k, u64 *cur_inum) +{ + struct bch_inode_unpacked inode; + int ret; + + if (*cur_inum == k.k->p.inode) + return 0; + + *io_opts = bch2_opts_to_inode_opts(trans->c->opts); + + ret = lookup_inode(trans, + SPOS(0, k.k->p.inode, k.k->p.snapshot), + &inode); + if (ret == 
-EINTR) + return ret; + + if (!ret) + bch2_io_opts_apply(io_opts, bch2_inode_opts_get(&inode)); + + *cur_inum = k.k->p.inode; + return 0; +} + static int __bch2_move_data(struct bch_fs *c, struct moving_context *ctxt, struct bch_ratelimit *rate, @@ -642,7 +709,6 @@ static int __bch2_move_data(struct bch_fs *c, struct bch_move_stats *stats, enum btree_id btree_id) { - bool kthread = (current->flags & PF_KTHREAD) != 0; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct bkey_buf sk; struct btree_trans trans; @@ -650,7 +716,7 @@ static int __bch2_move_data(struct bch_fs *c, struct bkey_s_c k; struct data_opts data_opts; enum data_cmd data_cmd; - u64 delay, cur_inum = U64_MAX; + u64 cur_inum = U64_MAX; int ret = 0, ret2; bch2_bkey_buf_init(&sk); @@ -667,37 +733,7 @@ static int __bch2_move_data(struct bch_fs *c, if (rate) bch2_ratelimit_reset(rate); - while (1) { - do { - delay = rate ? bch2_ratelimit_delay(rate) : 0; - - if (delay) { - bch2_trans_unlock(&trans); - set_current_state(TASK_INTERRUPTIBLE); - } - - if (kthread && (ret = kthread_should_stop())) { - __set_current_state(TASK_RUNNING); - goto out; - } - - if (delay) - schedule_timeout(delay); - - if (unlikely(freezing(current))) { - move_ctxt_wait_event(ctxt, &trans, list_empty(&ctxt->reads)); - try_to_freeze(); - } - } while (delay); - - move_ctxt_wait_event(ctxt, &trans, - atomic_read(&ctxt->write_sectors) < - SECTORS_IN_FLIGHT_PER_DEVICE); - - move_ctxt_wait_event(ctxt, &trans, - atomic_read(&ctxt->read_sectors) < - SECTORS_IN_FLIGHT_PER_DEVICE); - + while (!move_ratelimit(&trans, ctxt, rate)) { bch2_trans_begin(&trans); k = bch2_btree_iter_peek(&iter); @@ -718,23 +754,9 @@ static int __bch2_move_data(struct bch_fs *c, if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; - if (btree_id == BTREE_ID_extents && - cur_inum != k.k->p.inode) { - struct bch_inode_unpacked inode; - - io_opts = bch2_opts_to_inode_opts(c->opts); - - ret = lookup_inode(&trans, - SPOS(0, k.k->p.inode, 
k.k->p.snapshot), - &inode); - if (ret == -EINTR) - continue; - - if (!ret) - bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode)); - - cur_inum = k.k->p.inode; - } + ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum); + if (ret) + continue; switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) { case DATA_SKIP: @@ -779,7 +801,6 @@ next: next_nondata: bch2_btree_iter_advance(&iter); } -out: bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); @@ -848,7 +869,6 @@ int bch2_move_data(struct bch_fs *c, break; } - move_ctxt_wait_event(&ctxt, NULL, list_empty(&ctxt.reads)); closure_sync(&ctxt.cl); @@ -862,6 +882,167 @@ int bch2_move_data(struct bch_fs *c, return ret; } +static int verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket, int gen) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + bucket, BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + + if (!ret && k.k->type == KEY_TYPE_alloc_v4) { + struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); + + if (a.v->gen == gen && + a.v->dirty_sectors) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "failed to evacuate bucket "); + bch2_bkey_val_to_text(&buf, c, k); + + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + } + } + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_evacuate_bucket(struct bch_fs *c, + struct bpos bucket, int gen, + struct bch_ratelimit *rate, + struct write_point_specifier wp, + enum data_cmd data_cmd, + struct data_opts *data_opts, + struct bch_move_stats *stats) +{ + struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); + struct moving_context ctxt = { .stats = stats }; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_buf sk; + struct bch_backpointer bp; + u64 bp_offset = 0, cur_inum = U64_MAX; + int ret = 0; + + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, 
c, 0, 0); + progress_list_add(c, stats); + closure_init_stack(&ctxt.cl); + INIT_LIST_HEAD(&ctxt.reads); + init_waitqueue_head(&ctxt.wait); + + stats->data_type = BCH_DATA_user; + + while (!(ret = move_ratelimit(&trans, &ctxt, rate))) { + bch2_trans_begin(&trans); + + ret = bch2_get_next_backpointer(&trans, bucket, gen, + &bp_offset, &bp); + if (ret == -EINTR) + continue; + if (ret) + goto err; + if (bp_offset == U64_MAX) + break; + + if (!bp.level) { + struct bkey_s_c k; + + k = bch2_backpointer_get_key(&trans, &iter, + bucket, bp_offset, bp); + ret = bkey_err(k); + if (ret == -EINTR) + continue; + if (ret) + goto err; + if (!k.k) + continue; + + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + bch2_trans_iter_exit(&trans, &iter); + + ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum); + if (ret) + continue; + + data_opts->target = io_opts.background_target; + data_opts->rewrite_dev = bucket.inode; + + ret = bch2_move_extent(&trans, &ctxt, wp, io_opts, bp.btree_id, k, + data_cmd, *data_opts); + if (ret == -EINTR) + continue; + if (ret == -ENOMEM) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(&ctxt, &trans); + continue; + } + if (ret) + goto err; + + if (rate) + bch2_ratelimit_increment(rate, k.k->size); + atomic64_add(k.k->size, &stats->sectors_seen); + } else { + struct btree *b; + + b = bch2_backpointer_get_node(&trans, &iter, + bucket, bp_offset, bp); + ret = PTR_ERR_OR_ZERO(b); + if (ret == -EINTR) + continue; + if (ret) + goto err; + if (!b) + continue; + + ret = bch2_btree_node_rewrite(&trans, &iter, b, 0); + bch2_trans_iter_exit(&trans, &iter); + + if (ret == -EINTR) + continue; + if (ret) + goto err; + + if (rate) + bch2_ratelimit_increment(rate, c->opts.btree_node_size >> 9); + atomic64_add(c->opts.btree_node_size >> 9, &stats->sectors_seen); + atomic64_add(c->opts.btree_node_size >> 9, &stats->sectors_moved); + } + + bp_offset++; + } + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && gen >= 0) 
{ + bch2_trans_unlock(&trans); + move_ctxt_wait_event(&ctxt, NULL, list_empty(&ctxt.reads)); + closure_sync(&ctxt.cl); + lockrestart_do(&trans, verify_bucket_evacuated(&trans, bucket, gen)); + } +err: + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + + move_ctxt_wait_event(&ctxt, NULL, list_empty(&ctxt.reads)); + closure_sync(&ctxt.cl); + progress_list_del(c, stats); + + EBUG_ON(atomic_read(&ctxt.write_sectors)); + + trace_move_data(c, + atomic64_read(&stats->sectors_moved), + atomic64_read(&stats->keys_moved)); + + return ret; +} + typedef enum data_cmd (*move_btree_pred)(struct bch_fs *, void *, struct btree *, struct bch_io_opts *, struct data_opts *); diff --git a/libbcachefs/move.h b/libbcachefs/move.h index 2a789a11..c69b6b5a 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -62,6 +62,12 @@ int bch2_move_data(struct bch_fs *, move_pred_fn, void *, struct bch_move_stats *); +int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int, + struct bch_ratelimit *, + struct write_point_specifier, + enum data_cmd, + struct data_opts *, + struct bch_move_stats *); int bch2_data_job(struct bch_fs *, struct bch_move_stats *, struct bch_ioctl_data); diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 99980c3d..efb09e1c 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -30,80 +30,6 @@ #include #include -static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) -{ - const struct copygc_heap_entry *l = _l; - const struct copygc_heap_entry *r = _r; - - return cmp_int(l->dev, r->dev) ?: - cmp_int(l->offset, r->offset); -} - -static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_opts *data_opts) -{ - copygc_heap *h = &c->copygc_heap; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p = { 0 }; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bch_dev *ca = 
bch_dev_bkey_exists(c, p.ptr.dev); - struct copygc_heap_entry search = { - .dev = p.ptr.dev, - .offset = p.ptr.offset, - }; - ssize_t i; - - if (p.ptr.cached) - continue; - - i = eytzinger0_find_le(h->data, h->used, - sizeof(h->data[0]), - bucket_offset_cmp, &search); -#if 0 - /* eytzinger search verify code: */ - ssize_t j = -1, k; - - for (k = 0; k < h->used; k++) - if (h->data[k].offset <= ptr->offset && - (j < 0 || h->data[k].offset > h->data[j].offset)) - j = k; - - BUG_ON(i != j); -#endif - if (i >= 0 && - p.ptr.dev == h->data[i].dev && - p.ptr.offset < h->data[i].offset + ca->mi.bucket_size && - p.ptr.gen == h->data[i].gen) { - /* - * We need to use the journal reserve here, because - * - journal reclaim depends on btree key cache - * flushing to make forward progress, - * - which has to make forward progress when the - * journal is pre-reservation full, - * - and depends on allocation - meaning allocator and - * copygc - */ - - data_opts->target = io_opts->background_target; - data_opts->nr_replicas = 1; - data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE| - JOURNAL_WATERMARK_copygc; - data_opts->rewrite_dev = p.ptr.dev; - - if (p.has_ec) - data_opts->nr_replicas += p.ec.redundancy; - - return DATA_REWRITE; - } - } - - return DATA_SKIP; -} - static inline int fragmentation_cmp(copygc_heap *heap, struct copygc_heap_entry l, struct copygc_heap_entry r) @@ -111,7 +37,7 @@ static inline int fragmentation_cmp(copygc_heap *heap, return cmp_int(l.fragmentation, r.fragmentation); } -static int walk_buckets_to_copygc(struct bch_fs *c) +static int find_buckets_to_copygc(struct bch_fs *c) { copygc_heap *h = &c->copygc_heap; struct btree_trans trans; @@ -122,6 +48,14 @@ static int walk_buckets_to_copygc(struct bch_fs *c) bch2_trans_init(&trans, c, 0, 0); + /* + * Find buckets with lowest sector counts, skipping completely + * empty buckets, by building a maxheap sorted by sector count, + * and repeatedly replacing the maximum element until all + * buckets have 
been visited. + */ + h->used = 0; + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode); @@ -129,7 +63,8 @@ static int walk_buckets_to_copygc(struct bch_fs *c) bch2_alloc_to_v4(k, &a); - if (a.data_type != BCH_DATA_user || + if ((a.data_type != BCH_DATA_btree && + a.data_type != BCH_DATA_user) || a.dirty_sectors >= ca->mi.bucket_size || bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset)) continue; @@ -141,7 +76,7 @@ static int walk_buckets_to_copygc(struct bch_fs *c) .fragmentation = div_u64((u64) a.dirty_sectors * (1ULL << 31), ca->mi.bucket_size), .sectors = a.dirty_sectors, - .offset = bucket_to_sector(ca, iter.pos.offset), + .bucket = iter.pos.offset, }; heap_add_or_replace(h, e, -fragmentation_cmp, NULL); @@ -152,77 +87,22 @@ static int walk_buckets_to_copygc(struct bch_fs *c) return ret; } -static int bucket_inorder_cmp(const void *_l, const void *_r) -{ - const struct copygc_heap_entry *l = _l; - const struct copygc_heap_entry *r = _r; - - return cmp_int(l->dev, r->dev) ?: cmp_int(l->offset, r->offset); -} - -static int check_copygc_was_done(struct bch_fs *c, - u64 *sectors_not_moved, - u64 *buckets_not_moved) -{ - copygc_heap *h = &c->copygc_heap; - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_alloc_v4 a; - struct copygc_heap_entry *i; - int ret = 0; - - sort(h->data, h->used, sizeof(h->data[0]), bucket_inorder_cmp, NULL); - - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, 0); - - for (i = h->data; i < h->data + h->used; i++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev); - - bch2_btree_iter_set_pos(&iter, POS(i->dev, sector_to_bucket(ca, i->offset))); - - ret = lockrestart_do(&trans, - bkey_err(k = bch2_btree_iter_peek_slot(&iter))); - if (ret) - break; - - bch2_alloc_to_v4(k, &a); - - if (a.gen == i->gen && a.dirty_sectors) { - *sectors_not_moved += 
a.dirty_sectors; - *buckets_not_moved += 1; - } - } - bch2_trans_iter_exit(&trans, &iter); - - bch2_trans_exit(&trans); - return ret; -} - static int bch2_copygc(struct bch_fs *c) { copygc_heap *h = &c->copygc_heap; - struct copygc_heap_entry e, *i; + struct copygc_heap_entry e; struct bch_move_stats move_stats; - u64 sectors_to_move = 0, sectors_to_write = 0, sectors_not_moved = 0; - u64 sectors_reserved = 0; - u64 buckets_to_move, buckets_not_moved = 0; struct bch_dev *ca; unsigned dev_idx; size_t heap_size = 0; + struct data_opts data_opts = { + .nr_replicas = 1, + .btree_insert_flags = BTREE_INSERT_USE_RESERVE|JOURNAL_WATERMARK_copygc, + }; int ret; bch_move_stats_init(&move_stats, "copygc"); - /* - * Find buckets with lowest sector counts, skipping completely - * empty buckets, by building a maxheap sorted by sector count, - * and repeatedly replacing the maximum element until all - * buckets have been visited. - */ - h->used = 0; - for_each_rw_member(ca, c, dev_idx) heap_size += ca->mi.nbuckets >> 7; @@ -234,21 +114,7 @@ static int bch2_copygc(struct bch_fs *c) } } - for_each_rw_member(ca, c, dev_idx) { - struct bch_dev_usage usage = bch2_dev_usage_read(ca); - - u64 avail = max_t(s64, 0, - usage.d[BCH_DATA_free].buckets + - usage.d[BCH_DATA_need_discard].buckets - - ca->nr_open_buckets - - bch2_dev_buckets_reserved(ca, RESERVE_movinggc)); - - avail = min(avail, ca->mi.nbuckets >> 6); - - sectors_reserved += avail * ca->mi.bucket_size; - } - - ret = walk_buckets_to_copygc(c); + ret = find_buckets_to_copygc(c); if (ret) { bch2_fs_fatal_error(c, "error walking buckets to copygc!"); return ret; @@ -259,68 +125,24 @@ static int bch2_copygc(struct bch_fs *c) return 0; } - /* - * Our btree node allocations also come out of RESERVE_movingc: - */ - sectors_reserved = (sectors_reserved * 3) / 4; - if (!sectors_reserved) { - bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!"); - return -1; - } + heap_resort(h, fragmentation_cmp, NULL); - for (i = h->data; i < 
h->data + h->used; i++) { - sectors_to_move += i->sectors; - sectors_to_write += i->sectors * i->replicas; - } - - while (sectors_to_write > sectors_reserved) { + while (h->used) { BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL)); - sectors_to_write -= e.sectors * e.replicas; + /* not correct w.r.t. device removal */ + + ret = bch2_evacuate_bucket(c, POS(e.dev, e.bucket), e.gen, NULL, + writepoint_ptr(&c->copygc_write_point), + DATA_REWRITE, &data_opts, + &move_stats); + if (ret < 0) + bch_err(c, "error %i from bch2_move_data() in copygc", ret); + if (ret) + return ret; } - buckets_to_move = h->used; - - if (!buckets_to_move) { - bch_err_ratelimited(c, "copygc cannot run - sectors_reserved %llu!", - sectors_reserved); - return 0; - } - - eytzinger0_sort(h->data, h->used, - sizeof(h->data[0]), - bucket_offset_cmp, NULL); - - ret = bch2_move_data(c, - 0, POS_MIN, - BTREE_ID_NR, POS_MAX, - NULL, - writepoint_ptr(&c->copygc_write_point), - copygc_pred, NULL, - &move_stats); - if (ret < 0) - bch_err(c, "error %i from bch2_move_data() in copygc", ret); - if (ret) - return ret; - - ret = check_copygc_was_done(c, §ors_not_moved, &buckets_not_moved); - if (ret) { - bch_err(c, "error %i from check_copygc_was_done()", ret); - return ret; - } - - if (sectors_not_moved) - bch_warn_ratelimited(c, - "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)", - sectors_not_moved, sectors_to_move, - buckets_not_moved, buckets_to_move, - atomic64_read(&move_stats.sectors_moved), - atomic64_read(&move_stats.keys_raced), - atomic64_read(&move_stats.sectors_raced)); - - trace_copygc(c, - atomic64_read(&move_stats.sectors_moved), sectors_not_moved, - buckets_to_move, buckets_not_moved); - return 0; + trace_copygc(c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0); + return ret; } /* diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 480abf13..63e8c1c3 100644 --- a/libbcachefs/recovery.c +++ 
b/libbcachefs/recovery.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "backpointers.h" #include "bkey_buf.h" #include "alloc_background.h" #include "btree_gc.h" @@ -1075,8 +1076,8 @@ int bch2_fs_recovery(struct bch_fs *c) } if (!c->opts.nochanges) { - if (c->sb.version < bcachefs_metadata_version_new_data_types) { - bch_info(c, "version prior to new_data_types, upgrade and fsck required"); + if (c->sb.version < bcachefs_metadata_version_backpointers) { + bch_info(c, "version prior to backpointers, upgrade and fsck required"); c->opts.version_upgrade = true; c->opts.fsck = true; c->opts.fix_errors = FSCK_OPT_YES; @@ -1254,6 +1255,28 @@ use_clean: bch_verbose(c, "done checking lrus"); set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); + bch_info(c, "checking backpointers to alloc keys"); + err = "error checking backpointers to alloc keys"; + ret = bch2_check_btree_backpointers(c); + if (ret) + goto err; + bch_verbose(c, "done checking backpointers to alloc keys"); + + bch_info(c, "checking backpointers to extents"); + err = "error checking backpointers to extents"; + ret = bch2_check_backpointers_to_extents(c); + if (ret) + goto err; + bch_verbose(c, "done checking backpointers to extents"); + + bch_info(c, "checking extents to backpointers"); + err = "error checking extents to backpointers"; + ret = bch2_check_extents_to_backpointers(c); + if (ret) + goto err; + bch_verbose(c, "done checking extents to backpointers"); + set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); + bch_info(c, "checking alloc to lru refs"); err = "error checking alloc to lru refs"; ret = bch2_check_alloc_to_lru_refs(c); @@ -1265,6 +1288,7 @@ use_clean: set_bit(BCH_FS_MAY_GO_RW, &c->flags); set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); + set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); set_bit(BCH_FS_FSCK_DONE, &c->flags); @@ -1417,6 +1441,9 @@ int 
bch2_fs_initialize(struct bch_fs *c) c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); + if (c->sb.version < bcachefs_metadata_version_backpointers) + c->opts.version_upgrade = true; + if (c->opts.version_upgrade) { c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 71fc231d..29089740 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -1433,6 +1433,8 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) BTREE_TRIGGER_NORUN, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, BTREE_TRIGGER_NORUN, NULL); if (ret)