diff --git a/.bcachefs_revision b/.bcachefs_revision index 953107c2..7b4e00b1 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -ea3414eed52e5d90c248453e84b2dcd91c960306 +26409a8f755b8faa620a49796d7935566204daaf diff --git a/cmd_debug.c b/cmd_debug.c index befd41f4..c4dd24ba 100644 --- a/cmd_debug.c +++ b/cmd_debug.c @@ -572,14 +572,10 @@ int cmd_list_journal(int argc, char *argv[]) printf("journal entry %8llu\n" " version %8u\n" " last seq %8llu\n" - " read clock %8u\n" - " write clock %8u\n" , le64_to_cpu(p->j.seq), le32_to_cpu(p->j.version), - le64_to_cpu(p->j.last_seq), - le16_to_cpu(p->j.read_clock), - le16_to_cpu(p->j.write_clock)); + le64_to_cpu(p->j.last_seq)); for_each_jset_key(k, _n, entry, &p->j) { char buf[200]; diff --git a/libbcachefs.c b/libbcachefs.c index e7c1ca23..e359d48b 100644 --- a/libbcachefs.c +++ b/libbcachefs.c @@ -623,8 +623,6 @@ static void bch2_sb_print_clean(struct bch_sb *sb, struct bch_sb_field *f, printf(" flags: %x", le32_to_cpu(clean->flags)); - printf(" read clock: %x", le16_to_cpu(clean->read_clock)); - printf(" write clock: %x", le16_to_cpu(clean->write_clock)); printf(" journal seq: %llx", le64_to_cpu(clean->journal_seq)); } diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 896ec023..a91caf04 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -14,6 +14,7 @@ #include "ec.h" #include "error.h" #include "recovery.h" +#include "varint.h" #include #include @@ -24,15 +25,12 @@ #include #include -static const char * const bch2_alloc_field_names[] = { -#define x(name, bytes) #name, - BCH_ALLOC_FIELDS() +static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { +#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, + BCH_ALLOC_FIELDS_V1() #undef x - NULL }; -static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); - /* Ratelimiting/PD controllers */ static void pd_controllers_update(struct work_struct *work) @@ -67,10 +65,10 @@ static void pd_controllers_update(struct work_struct *work) /* Persistent alloc info: */ -static inline u64 get_alloc_field(const struct bch_alloc *a, - const void **p, unsigned field) +static inline u64 alloc_field_v1_get(const struct bch_alloc *a, + const void **p, unsigned field) { - unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; + unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; u64 v; if (!(a->fields & (1 << field))) @@ -97,10 +95,10 @@ static inline u64 get_alloc_field(const struct bch_alloc *a, return v; } -static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, - unsigned field, u64 v) +static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p, + unsigned field, u64 v) { - unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; + unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; if (!v) return; @@ -127,55 +125,149 @@ static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, *p += bytes; } +static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out, + struct bkey_s_c k) +{ + const struct bch_alloc *in = bkey_s_c_to_alloc(k).v; + const void *d = in->data; + unsigned idx = 0; + + out->gen = in->gen; + +#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++); + BCH_ALLOC_FIELDS_V1() +#undef x +} + +static void bch2_alloc_pack_v1(struct bkey_alloc_buf *dst, + const struct bkey_alloc_unpacked src) +{ + struct bkey_i_alloc *a = bkey_alloc_init(&dst->k); + void *d = a->v.data; + unsigned bytes, idx = 0; + + a->k.p = POS(src.dev, src.bucket); + a->v.fields = 0; + a->v.gen = src.gen; + 
+#define x(_name, _bits) alloc_field_v1_put(a, &d, idx++, src._name); + BCH_ALLOC_FIELDS_V1() +#undef x + bytes = (void *) d - (void *) &a->v; + set_bkey_val_bytes(&a->k, bytes); + memset_u64s_tail(&a->v, 0, bytes); +} + +static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, + struct bkey_s_c k) +{ + struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k); + const u8 *in = a.v->data; + const u8 *end = bkey_val_end(a); + unsigned fieldnr = 0; + int ret; + u64 v; + + out->gen = a.v->gen; + out->oldest_gen = a.v->oldest_gen; + out->data_type = a.v->data_type; + +#define x(_name, _bits) \ + if (fieldnr < a.v->nr_fields) { \ + ret = bch2_varint_decode(in, end, &v); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + } else { \ + v = 0; \ + } \ + out->_name = v; \ + if (v != out->_name) \ + return -1; \ + fieldnr++; + + BCH_ALLOC_FIELDS_V2() +#undef x + return 0; +} + +static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst, + const struct bkey_alloc_unpacked src) +{ + struct bkey_i_alloc_v2 *a = bkey_alloc_v2_init(&dst->k); + unsigned nr_fields = 0, last_nonzero_fieldnr = 0; + u8 *out = a->v.data; + u8 *end = (void *) &dst[1]; + u8 *last_nonzero_field = out; + unsigned bytes; + + a->k.p = POS(src.dev, src.bucket); + a->v.gen = src.gen; + a->v.oldest_gen = src.oldest_gen; + a->v.data_type = src.data_type; + +#define x(_name, _bits) \ + nr_fields++; \ + \ + if (src._name) { \ + out += bch2_varint_encode(out, src._name); \ + \ + last_nonzero_field = out; \ + last_nonzero_fieldnr = nr_fields; \ + } else { \ + *out++ = 0; \ + } + + BCH_ALLOC_FIELDS_V2() +#undef x + BUG_ON(out > end); + + out = last_nonzero_field; + a->v.nr_fields = last_nonzero_fieldnr; + + bytes = (u8 *) out - (u8 *) &a->v; + set_bkey_val_bytes(&a->k, bytes); + memset_u64s_tail(&a->v, 0, bytes); +} + struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) { - struct bkey_alloc_unpacked ret = { .gen = 0 }; + struct bkey_alloc_unpacked ret = { + .dev = k.k->p.inode, + .bucket = k.k->p.offset, + .gen = 0, + }; - if (k.k->type == KEY_TYPE_alloc) { - const struct bch_alloc *a = bkey_s_c_to_alloc(k).v; - const void *d = a->data; - unsigned idx = 0; + if (k.k->type == KEY_TYPE_alloc_v2) + bch2_alloc_unpack_v2(&ret, k); + else if (k.k->type == KEY_TYPE_alloc) + bch2_alloc_unpack_v1(&ret, k); - ret.gen = a->gen; - -#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++); - BCH_ALLOC_FIELDS() -#undef x - } return ret; } -void bch2_alloc_pack(struct bkey_i_alloc *dst, +void bch2_alloc_pack(struct bch_fs *c, + struct bkey_alloc_buf *dst, const struct bkey_alloc_unpacked src) { - unsigned idx = 0; - void *d = dst->v.data; - unsigned bytes; - - dst->v.fields = 0; - dst->v.gen = src.gen; - -#define x(_name, _bits) put_alloc_field(dst, &d, idx++, src._name); - BCH_ALLOC_FIELDS() -#undef x - - bytes = (void *) d - (void *) &dst->v; - set_bkey_val_bytes(&dst->k, bytes); - memset_u64s_tail(&dst->v, 0, bytes); + if (c->sb.features & (1ULL << BCH_FEATURE_alloc_v2)) + bch2_alloc_pack_v2(dst, src); + else + bch2_alloc_pack_v1(dst, src); } static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) { unsigned i, bytes = offsetof(struct bch_alloc, data); - for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++) + for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++) if (a->fields & (1 << i)) - bytes += BCH_ALLOC_FIELD_BYTES[i]; + bytes += BCH_ALLOC_V1_FIELD_BYTES[i]; return DIV_ROUND_UP(bytes, sizeof(u64)); } -const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) +const char 
*bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); @@ -190,20 +282,30 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) return NULL; } -void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) +const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) { - struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); - const void *d = a.v->data; - unsigned i; + struct bkey_alloc_unpacked u; - pr_buf(out, "gen %u", a.v->gen); + if (k.k->p.inode >= c->sb.nr_devices || + !c->devs[k.k->p.inode]) + return "invalid device"; - for (i = 0; i < BCH_ALLOC_FIELD_NR; i++) - if (a.v->fields & (1 << i)) - pr_buf(out, " %s %llu", - bch2_alloc_field_names[i], - get_alloc_field(a.v, &d, i)); + if (bch2_alloc_unpack_v2(&u, k)) + return "unpack error"; + + return NULL; +} + +void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); + + pr_buf(out, "gen %u oldest_gen %u data_type %u", + u.gen, u.oldest_gen, u.data_type); +#define x(_name, ...) pr_buf(out, #_name " %llu ", (u64) u._name); + BCH_ALLOC_FIELDS_V2() +#undef x } static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, @@ -213,7 +315,9 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, struct bucket *g; struct bkey_alloc_unpacked u; - if (level || k.k->type != KEY_TYPE_alloc) + if (level || + (k.k->type != KEY_TYPE_alloc && + k.k->type != KEY_TYPE_alloc_v2)) return 0; ca = bch_dev_bkey_exists(c, k.k->p.inode); @@ -234,9 +338,7 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) { - struct bch_dev *ca; - unsigned i; - int ret = 0; + int ret; down_read(&c->gc_lock); ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC, @@ -248,26 +350,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) return ret; } - percpu_down_write(&c->mark_lock); - bch2_dev_usage_from_buckets(c); - percpu_up_write(&c->mark_lock); - - mutex_lock(&c->bucket_clock[READ].lock); - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - bch2_recalc_oldest_io(c, ca, READ); - up_read(&ca->bucket_lock); - } - mutex_unlock(&c->bucket_clock[READ].lock); - - mutex_lock(&c->bucket_clock[WRITE].lock); - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - bch2_recalc_oldest_io(c, ca, WRITE); - up_read(&ca->bucket_lock); - } - mutex_unlock(&c->bucket_clock[WRITE].lock); - return 0; } @@ -281,8 +363,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, struct bucket *g; struct bucket_mark m; struct bkey_alloc_unpacked old_u, new_u; - __BKEY_PADDED(k, 8) alloc_key; /* hack: */ - struct bkey_i_alloc *a; + struct bkey_alloc_buf a; int ret; retry: bch2_trans_begin(trans); @@ -303,17 +384,14 @@ retry: ca = bch_dev_bkey_exists(c, iter->pos.inode); g = bucket(ca, iter->pos.offset); m = READ_ONCE(g->mark); - new_u = alloc_mem_to_key(g, m); + new_u = alloc_mem_to_key(iter, g, m); percpu_up_read(&c->mark_lock); if (!bkey_alloc_unpacked_cmp(old_u, new_u)) return 0; - a = bkey_alloc_init(&alloc_key.k); - a->k.p = iter->pos; - bch2_alloc_pack(a, new_u); - - bch2_trans_update(trans, iter, &a->k_i, + bch2_alloc_pack(c, &a, new_u); + bch2_trans_update(trans, iter, &a.k, BTREE_TRIGGER_NORUN); ret = bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL|flags); @@ -358,114 +436,6 @@ err: /* Bucket IO clocks: */ 
-static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw) -{ - struct bucket_clock *clock = &c->bucket_clock[rw]; - struct bucket_array *buckets = bucket_array(ca); - struct bucket *g; - u16 max_last_io = 0; - unsigned i; - - lockdep_assert_held(&c->bucket_clock[rw].lock); - - /* Recalculate max_last_io for this device: */ - for_each_bucket(g, buckets) - max_last_io = max(max_last_io, bucket_last_io(c, g, rw)); - - ca->max_last_bucket_io[rw] = max_last_io; - - /* Recalculate global max_last_io: */ - max_last_io = 0; - - for_each_member_device(ca, c, i) - max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]); - - clock->max_last_io = max_last_io; -} - -static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw) -{ - struct bucket_clock *clock = &c->bucket_clock[rw]; - struct bucket_array *buckets; - struct bch_dev *ca; - struct bucket *g; - unsigned i; - - trace_rescale_prios(c); - - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for_each_bucket(g, buckets) - g->io_time[rw] = clock->hand - - bucket_last_io(c, g, rw) / 2; - - bch2_recalc_oldest_io(c, ca, rw); - - up_read(&ca->bucket_lock); - } -} - -static inline u64 bucket_clock_freq(u64 capacity) -{ - return max(capacity >> 10, 2028ULL); -} - -static void bch2_inc_clock_hand(struct io_timer *timer) -{ - struct bucket_clock *clock = container_of(timer, - struct bucket_clock, rescale); - struct bch_fs *c = container_of(clock, - struct bch_fs, bucket_clock[clock->rw]); - struct bch_dev *ca; - u64 capacity; - unsigned i; - - mutex_lock(&clock->lock); - - /* if clock cannot be advanced more, rescale prio */ - if (clock->max_last_io >= U16_MAX - 2) - bch2_rescale_bucket_io_times(c, clock->rw); - - BUG_ON(clock->max_last_io >= U16_MAX - 2); - - for_each_member_device(ca, c, i) - ca->max_last_bucket_io[clock->rw]++; - clock->max_last_io++; - clock->hand++; - - mutex_unlock(&clock->lock); - - capacity = READ_ONCE(c->capacity); - - if (!capacity) - return; - - /* - * we only increment when 0.1% of the filesystem capacity has been read - * or written too, this determines if it's time - * - * XXX: we shouldn't really be going off of the capacity of devices in - * RW mode (that will be 0 when we're RO, yet we can still service - * reads) - */ - timer->expire += bucket_clock_freq(capacity); - - bch2_io_timer_add(&c->io_clock[clock->rw], timer); -} - -static void bch2_bucket_clock_init(struct bch_fs *c, int rw) -{ - struct bucket_clock *clock = &c->bucket_clock[rw]; - - clock->hand = 1; - clock->rw = rw; - clock->rescale.fn = bch2_inc_clock_hand; - clock->rescale.expire = bucket_clock_freq(c->capacity); - mutex_init(&clock->lock); -} - int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, size_t bucket_nr, int rw) { @@ -473,9 +443,9 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, struct bch_dev *ca = bch_dev_bkey_exists(c, dev); struct btree_iter *iter; struct bucket *g; - struct bkey_i_alloc *a; + struct bkey_alloc_buf *a; struct bkey_alloc_unpacked u; - u16 *time; + u64 *time, now; int ret = 0; iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr), @@ -486,28 +456,25 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, if (ret) goto out; - a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); + a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; percpu_down_read(&c->mark_lock); g = bucket(ca, bucket_nr); - u = alloc_mem_to_key(g, 
READ_ONCE(g->mark)); + u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark)); percpu_up_read(&c->mark_lock); - bkey_alloc_init(&a->k_i); - a->k.p = iter->pos; - time = rw == READ ? &u.read_time : &u.write_time; - if (*time == c->bucket_clock[rw].hand) + now = atomic64_read(&c->io_clock[rw].now); + if (*time == now) goto out; - *time = c->bucket_clock[rw].hand; + *time = now; - bch2_alloc_pack(a, u); - - ret = bch2_trans_update(trans, iter, &a->k_i, 0) ?: + bch2_alloc_pack(c, a, u); + ret = bch2_trans_update(trans, iter, &a->k, 0) ?: bch2_trans_commit(trans, NULL, NULL, 0); out: bch2_trans_iter_put(trans, iter); @@ -576,23 +543,22 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) return ret; } -static bool bch2_can_invalidate_bucket(struct bch_dev *ca, - size_t bucket, - struct bucket_mark mark) +static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, + struct bucket_mark m) { u8 gc_gen; - if (!is_available_bucket(mark)) + if (!is_available_bucket(m)) return false; - if (mark.owned_by_allocator) + if (m.owned_by_allocator) return false; if (ca->buckets_nouse && - test_bit(bucket, ca->buckets_nouse)) + test_bit(b, ca->buckets_nouse)) return false; - gc_gen = bucket_gc_gen(ca, bucket); + gc_gen = bucket_gc_gen(bucket(ca, b)); if (gc_gen >= BUCKET_GC_GEN_MAX / 2) ca->inc_gen_needs_gc++; @@ -606,43 +572,33 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, /* * Determines what order we're going to reuse buckets, smallest bucket_key() * first. - * - * - * - We take into account the read prio of the bucket, which gives us an - * indication of how hot the data is -- we scale the prio so that the prio - * farthest from the clock is worth 1/8th of the closest. - * - * - The number of sectors of cached data in the bucket, which gives us an - * indication of the cost in cache misses this eviction will cause. - * - * - If hotness * sectors used compares equal, we pick the bucket with the - * smallest bucket_gc_gen() - since incrementing the same bucket's generation - * number repeatedly forces us to run mark and sweep gc to avoid generation - * number wraparound. 
*/ -static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct bucket_mark m) +static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m, + u64 now, u64 last_seq_ondisk) { - unsigned last_io = bucket_last_io(c, bucket(ca, b), READ); - unsigned max_last_io = ca->max_last_bucket_io[READ]; + unsigned used = bucket_sectors_used(m); - /* - * Time since last read, scaled to [0, 8) where larger value indicates - * more recently read data: - */ - unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io; + if (used) { + /* + * Prefer to keep buckets that have been read more recently, and + * buckets that have more data in them: + */ + u64 last_read = max_t(s64, 0, now - g->io_time[READ]); + u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used)); - /* How much we want to keep the data in this bucket: */ - unsigned long data_wantness = - (hotness + 1) * bucket_sectors_used(m); - - unsigned long needs_journal_commit = - bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); - - return (data_wantness << 9) | - (needs_journal_commit << 8) | - (bucket_gc_gen(ca, b) / 16); + return -last_read_scaled; + } else { + /* + * Prefer to use buckets with smaller gc_gen so that we don't + * have to walk the btree and recalculate oldest_gen - but shift + * off the low bits so that buckets will still have equal sort + * keys when there's only a small difference, so that we can + * keep sequential buckets together: + */ + return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)| + (bucket_gc_gen(g) >> 4); + } } static inline int bucket_alloc_cmp(alloc_heap *h, @@ -665,16 +621,15 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) { struct bucket_array *buckets; struct alloc_heap_entry e = { 0 }; + u64 now, last_seq_ondisk; size_t b, i, nr = 0; - ca->alloc_heap.used = 0; - - mutex_lock(&c->bucket_clock[READ].lock); down_read(&ca->bucket_lock); buckets = bucket_array(ca); - - bch2_recalc_oldest_io(c, ca, READ); + ca->alloc_heap.used = 0; + now = atomic64_read(&c->io_clock[READ].now); + last_seq_ondisk = c->journal.last_seq_ondisk; /* * Find buckets with lowest read priority, by building a maxheap sorted @@ -682,8 +637,9 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) * all buckets have been visited. 
*/ for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { - struct bucket_mark m = READ_ONCE(buckets->b[b].mark); - unsigned long key = bucket_sort_key(c, ca, b, m); + struct bucket *g = &buckets->b[b]; + struct bucket_mark m = READ_ONCE(g->mark); + unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk); if (!bch2_can_invalidate_bucket(ca, b, m)) continue; @@ -718,7 +674,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) } up_read(&ca->bucket_lock); - mutex_unlock(&c->bucket_clock[READ].lock); } static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) @@ -863,14 +818,8 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, struct btree_iter *iter, u64 *journal_seq, unsigned flags) { -#if 0 - __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key; -#else - /* hack: */ - __BKEY_PADDED(k, 8) alloc_key; -#endif struct bch_fs *c = trans->c; - struct bkey_i_alloc *a; + struct bkey_alloc_buf a; struct bkey_alloc_unpacked u; struct bucket *g; struct bucket_mark m; @@ -920,8 +869,6 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, goto out; } - BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); - bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); retry: ret = bch2_btree_iter_traverse(iter); @@ -931,7 +878,7 @@ retry: percpu_down_read(&c->mark_lock); g = bucket(ca, iter->pos.offset); m = READ_ONCE(g->mark); - u = alloc_mem_to_key(g, m); + u = alloc_mem_to_key(iter, g, m); percpu_up_read(&c->mark_lock); @@ -941,14 +888,11 @@ retry: u.data_type = 0; u.dirty_sectors = 0; u.cached_sectors = 0; - u.read_time = c->bucket_clock[READ].hand; - u.write_time = c->bucket_clock[WRITE].hand; + u.read_time = atomic64_read(&c->io_clock[READ].now); + u.write_time = atomic64_read(&c->io_clock[WRITE].now); - a = bkey_alloc_init(&alloc_key.k); - a->k.p = iter->pos; - bch2_alloc_pack(a, u); - - bch2_trans_update(trans, iter, &a->k_i, + bch2_alloc_pack(c, &a, u); + bch2_trans_update(trans, iter, &a.k, BTREE_TRIGGER_BUCKET_INVALIDATE); /* @@ -1455,8 +1399,6 @@ int bch2_dev_allocator_start(struct bch_dev *ca) void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); - bch2_bucket_clock_init(c, READ); - bch2_bucket_clock_init(c, WRITE); c->pd_controllers_update_seconds = 5; INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index f60fcebf..6fededcd 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -7,12 +7,33 @@ #include "debug.h" struct bkey_alloc_unpacked { + u64 bucket; + u8 dev; u8 gen; + u8 oldest_gen; + u8 data_type; #define x(_name, _bits) u##_bits _name; - BCH_ALLOC_FIELDS() + BCH_ALLOC_FIELDS_V2() #undef x }; +struct bkey_alloc_buf { + struct bkey_i k; + + union { + struct { +#define x(_name, _bits) + _bits / 8 + u8 _pad[8 + BCH_ALLOC_FIELDS_V1()]; +#undef x + } _v1; + struct { +#define x(_name, _bits) + 8 + _bits / 8 + u8 _pad[8 + BCH_ALLOC_FIELDS_V2()]; +#undef x + } _v2; + }; +} __attribute__((packed, aligned(8))); + /* How out of date a pointer gen is allowed to be: */ #define BUCKET_GC_GEN_MAX 96U @@ -20,23 +41,28 @@ struct bkey_alloc_unpacked { static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, struct bkey_alloc_unpacked r) { - return l.gen != r.gen -#define x(_name, _bits) || l._name != r._name - BCH_ALLOC_FIELDS() + return l.gen != r.gen || + l.oldest_gen != r.oldest_gen || + l.data_type != r.data_type +#define x(_name, ...) 
|| l._name != r._name + BCH_ALLOC_FIELDS_V2() #undef x ; } struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); -void bch2_alloc_pack(struct bkey_i_alloc *, +void bch2_alloc_pack(struct bch_fs *, struct bkey_alloc_buf *, const struct bkey_alloc_unpacked); int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); static inline struct bkey_alloc_unpacked -alloc_mem_to_key(struct bucket *g, struct bucket_mark m) +alloc_mem_to_key(struct btree_iter *iter, + struct bucket *g, struct bucket_mark m) { return (struct bkey_alloc_unpacked) { + .dev = iter->pos.inode, + .bucket = iter->pos.offset, .gen = m.gen, .oldest_gen = g->oldest_gen, .data_type = m.data_type, @@ -49,11 +75,17 @@ alloc_mem_to_key(struct bucket *g, struct bucket_mark m) #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) -const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); +const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c); +const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc (struct bkey_ops) { \ - .key_invalid = bch2_alloc_invalid, \ + .key_invalid = bch2_alloc_v1_invalid, \ + .val_to_text = bch2_alloc_to_text, \ +} + +#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \ + .key_invalid = bch2_alloc_v2_invalid, \ .val_to_text = bch2_alloc_to_text, \ } diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h index 1abfff52..be164d61 100644 --- a/libbcachefs/alloc_types.h +++ b/libbcachefs/alloc_types.h @@ -10,30 +10,6 @@ struct ec_bucket_buf; -/* There's two of these clocks, one for reads and one for writes: */ -struct bucket_clock { - /* - * "now" in (read/write) IO time - incremented whenever we do X amount - * of reads or writes. - * - * Goes with the bucket read/write prios: when we read or write to a - * bucket we reset the bucket's prio to the current hand; thus hand - - * prio = time since bucket was last read/written. - * - * The units are some amount (bytes/sectors) of data read/written, and - * the units can change on the fly if we need to rescale to fit - * everything in a u16 - your only guarantee is that the units are - * consistent. 
- */ - u16 hand; - u16 max_last_io; - - int rw; - - struct io_timer rescale; - struct mutex lock; -}; - enum alloc_reserve { RESERVE_BTREE_MOVINGGC = -2, RESERVE_BTREE = -1, diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 91b9375f..fa36e764 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -429,7 +429,9 @@ struct bch_dev { unsigned long *buckets_nouse; struct rw_semaphore bucket_lock; - struct bch_dev_usage __percpu *usage[2]; + struct bch_dev_usage *usage_base; + struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR]; + struct bch_dev_usage __percpu *usage_gc; /* Allocator: */ struct task_struct __rcu *alloc_thread; @@ -451,9 +453,6 @@ struct bch_dev { size_t fifo_last_bucket; - /* last calculated minimum prio */ - u16 max_last_bucket_io[2]; - size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; @@ -473,6 +472,7 @@ struct bch_dev { atomic64_t rebalance_work; struct journal_device journal; + u64 prev_journal_sector; struct work_struct io_error_work; @@ -584,6 +584,8 @@ struct bch_fs { struct journal_entry_res replicas_journal_res; + struct journal_entry_res dev_usage_journal_res; + struct bch_disk_groups_cpu __rcu *disk_groups; struct bch_opts opts; @@ -691,14 +693,6 @@ struct bch_fs { struct mutex usage_scratch_lock; struct bch_fs_usage *usage_scratch; - /* - * When we invalidate buckets, we use both the priority and the amount - * of good data to determine which buckets to reuse first - to weight - * those together consistently we keep track of the smallest nonzero - * priority of any bucket. - */ - struct bucket_clock bucket_clock[2]; - struct io_clock io_clock[2]; /* JOURNAL SEQ BLACKLIST */ diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 6dc150cb..30e77190 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -341,7 +341,8 @@ static inline void bkey_init(struct bkey *k) x(reflink_v, 16) \ x(inline_data, 17) \ x(btree_ptr_v2, 18) \ - x(indirect_inline_data, 19) + x(indirect_inline_data, 19) \ + x(alloc_v2, 20) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -551,9 +552,11 @@ struct bch_extent_stripe_ptr { #if defined(__LITTLE_ENDIAN_BITFIELD) __u64 type:5, block:8, - idx:51; + redundancy:4, + idx:47; #elif defined (__BIG_ENDIAN_BITFIELD) - __u64 idx:51, + __u64 idx:47, + redundancy:4, block:8, type:5; #endif @@ -799,35 +802,40 @@ struct bch_alloc { __u8 data[]; } __attribute__((packed, aligned(8))); -#define BCH_ALLOC_FIELDS() \ +#define BCH_ALLOC_FIELDS_V1() \ x(read_time, 16) \ x(write_time, 16) \ x(data_type, 8) \ x(dirty_sectors, 16) \ x(cached_sectors, 16) \ - x(oldest_gen, 8) + x(oldest_gen, 8) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) + +struct bch_alloc_v2 { + struct bch_val v; + __u8 nr_fields; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; +} __attribute__((packed, aligned(8))); + +#define BCH_ALLOC_FIELDS_V2() \ + x(read_time, 64) \ + x(write_time, 64) \ + x(dirty_sectors, 16) \ + x(cached_sectors, 16) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) enum { -#define x(name, bytes) BCH_ALLOC_FIELD_##name, - BCH_ALLOC_FIELDS() +#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, + BCH_ALLOC_FIELDS_V1() #undef x BCH_ALLOC_FIELD_NR }; -static const unsigned BCH_ALLOC_FIELD_BYTES[] = { -#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8, - BCH_ALLOC_FIELDS() -#undef x -}; - -#define x(name, bits) + (bits / 8) -static const unsigned BKEY_ALLOC_VAL_U64s_MAX = - DIV_ROUND_UP(offsetof(struct bch_alloc, data) - BCH_ALLOC_FIELDS(), sizeof(u64)); 
-#undef x - -#define BKEY_ALLOC_U64s_MAX (BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX) - /* Quotas: */ enum quota_types { @@ -1131,8 +1139,8 @@ struct bch_sb_field_clean { struct bch_sb_field field; __le32 flags; - __le16 read_clock; - __le16 write_clock; + __le16 _read_clock; /* no longer used */ + __le16 _write_clock; __le64 journal_seq; union { @@ -1305,6 +1313,7 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); +LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); /* * Features: @@ -1332,7 +1341,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); x(btree_updates_journalled, 13) \ x(reflink_inline_data, 14) \ x(new_varint, 15) \ - x(journal_no_flush, 16) + x(journal_no_flush, 16) \ + x(alloc_v2, 17) #define BCH_SB_FEATURES_ALL \ ((1ULL << BCH_FEATURE_new_siphash)| \ @@ -1340,7 +1350,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); (1ULL << BCH_FEATURE_btree_ptr_v2)| \ (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ (1ULL << BCH_FEATURE_new_varint)| \ - (1ULL << BCH_FEATURE_journal_no_flush)) + (1ULL << BCH_FEATURE_journal_no_flush)| \ + (1ULL << BCH_FEATURE_alloc_v2)) enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, @@ -1493,7 +1504,9 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(blacklist, 3) \ x(blacklist_v2, 4) \ x(usage, 5) \ - x(data_usage, 6) + x(data_usage, 6) \ + x(clock, 7) \ + x(dev_usage, 8) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, @@ -1541,6 +1554,30 @@ struct jset_entry_data_usage { struct bch_replicas_entry r; } __attribute__((packed)); +struct jset_entry_clock { + struct jset_entry entry; + __u8 rw; + __u8 pad[7]; + __le64 time; +} __attribute__((packed)); + +struct jset_entry_dev_usage_type { + __le64 buckets; + __le64 sectors; + __le64 fragmented; +} __attribute__((packed)); + +struct jset_entry_dev_usage { + struct jset_entry entry; + __le32 dev; + __u32 pad; + + __le64 buckets_ec; + __le64 buckets_unavailable; + + struct jset_entry_dev_usage_type d[]; +} __attribute__((packed)); + /* * On disk format for a journal entry: * seq is monotonically increasing; every journal entry has its own unique @@ -1563,8 +1600,8 @@ struct jset { __u8 encrypted_start[0]; - __le16 read_clock; - __le16 write_clock; + __le16 _read_clock; /* no longer used */ + __le16 _write_clock; /* Sequence number of oldest dirty journal entry */ __le64 last_seq; diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index 2c3b73a6..48821f6c 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -530,6 +530,7 @@ BKEY_VAL_ACCESSORS(reflink_v); BKEY_VAL_ACCESSORS(inline_data); BKEY_VAL_ACCESSORS(btree_ptr_v2); BKEY_VAL_ACCESSORS(indirect_inline_data); +BKEY_VAL_ACCESSORS(alloc_v2); /* byte order helpers */ diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index bab5ebd3..c2c8a34f 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -706,8 +706,8 @@ static void bch2_gc_free(struct bch_fs *c) ca->mi.nbuckets * sizeof(struct bucket)); ca->buckets[1] = NULL; - free_percpu(ca->usage[1]); - ca->usage[1] = NULL; + free_percpu(ca->usage_gc); + ca->usage_gc = NULL; } free_percpu(c->usage_gc); @@ -720,7 +720,7 @@ static int bch2_gc_done(struct bch_fs *c, struct bch_dev *ca; bool verify = (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); - unsigned i; + unsigned i, dev; int ret = 0; #define copy_field(_f, _msg, ...) 
\ @@ -786,7 +786,10 @@ static int bch2_gc_done(struct bch_fs *c, } } - for_each_member_device(ca, c, i) { + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); + + for_each_member_device(ca, c, dev) { struct bucket_array *dst = __bucket_array(ca, 0); struct bucket_array *src = __bucket_array(ca, 1); size_t b; @@ -801,13 +804,24 @@ static int bch2_gc_done(struct bch_fs *c, dst->b[b].oldest_gen = src->b[b].oldest_gen; } + + { + struct bch_dev_usage *dst = ca->usage_base; + struct bch_dev_usage *src = (void *) + bch2_acc_percpu_u64s((void *) ca->usage_gc, + dev_usage_u64s()); + + copy_dev_field(buckets_ec, "buckets_ec"); + copy_dev_field(buckets_unavailable, "buckets_unavailable"); + + for (i = 0; i < BCH_DATA_NR; i++) { + copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); + copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); + copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); + } + } }; - for (i = 0; i < ARRAY_SIZE(c->usage); i++) - bch2_fs_usage_acc_to_base(c, i); - - bch2_dev_usage_from_buckets(c); - { unsigned nr = fs_usage_u64s(c); struct bch_fs_usage *dst = c->usage_base; @@ -862,7 +876,7 @@ static int bch2_gc_start(struct bch_fs *c) for_each_member_device(ca, c, i) { BUG_ON(ca->buckets[1]); - BUG_ON(ca->usage[1]); + BUG_ON(ca->usage_gc); ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket), @@ -873,9 +887,9 @@ static int bch2_gc_start(struct bch_fs *c) return -ENOMEM; } - ca->usage[1] = alloc_percpu(struct bch_dev_usage); - if (!ca->usage[1]) { - bch_err(c, "error allocating ca->usage[gc]"); + ca->usage_gc = alloc_percpu(struct bch_dev_usage); + if (!ca->usage_gc) { + bch_err(c, "error allocating ca->usage_gc"); percpu_ref_put(&ca->ref); return -ENOMEM; } @@ -1489,7 +1503,7 @@ static int bch2_gc_thread(void *arg) { struct bch_fs *c = arg; struct io_clock *clock = &c->io_clock[WRITE]; - unsigned long last = atomic_long_read(&clock->now); + unsigned long last = atomic64_read(&clock->now); unsigned last_kick = atomic_read(&c->kick_gc); int ret; @@ -1510,7 +1524,7 @@ static int bch2_gc_thread(void *arg) if (c->btree_gc_periodic) { unsigned long next = last + c->capacity / 16; - if (atomic_long_read(&clock->now) >= next) + if (atomic64_read(&clock->now) >= next) break; bch2_io_clock_schedule_timeout(clock, next); @@ -1522,7 +1536,7 @@ static int bch2_gc_thread(void *arg) } __set_current_state(TASK_RUNNING); - last = atomic_long_read(&clock->now); + last = atomic64_read(&clock->now); last_kick = atomic_read(&c->kick_gc); /* diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 8919ea62..dd1b8f6e 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -222,7 +222,10 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, mutex_unlock(&c->btree_reserve_cache_lock); retry: - wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0, + wp = bch2_alloc_sectors_start(c, + c->opts.metadata_target ?: + c->opts.foreground_target, + 0, writepoint_ptr(&c->btree_write_point), &devs_have, res->nr_replicas, diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index cb0f0e09..ef79f5ca 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -137,6 +137,7 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c) void bch2_fs_usage_initialize(struct bch_fs *c) { struct bch_fs_usage *usage; + struct bch_dev *ca; unsigned i; percpu_down_write(&c->mark_lock); @@ -155,6 +156,14 @@ void 
bch2_fs_usage_initialize(struct bch_fs *c) fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]); } + for_each_member_device(ca, c, i) { + struct bch_dev_usage dev = bch2_dev_usage_read(ca); + + usage->hidden += (dev.d[BCH_DATA_sb].buckets + + dev.d[BCH_DATA_journal].buckets) * + ca->mi.bucket_size; + } + percpu_up_write(&c->mark_lock); } @@ -189,14 +198,27 @@ out_pool: return ret; } +static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca, + unsigned journal_seq, + bool gc) +{ + return this_cpu_ptr(gc + ? ca->usage_gc + : ca->usage[journal_seq & JOURNAL_BUF_MASK]); +} + struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) { + struct bch_fs *c = ca->fs; struct bch_dev_usage ret; + unsigned seq, i, u64s = dev_usage_u64s(); - memset(&ret, 0, sizeof(ret)); - acc_u64s_percpu((u64 *) &ret, - (u64 __percpu *) ca->usage[0], - sizeof(ret) / sizeof(u64)); + do { + seq = read_seqcount_begin(&c->usage_lock); + memcpy(&ret, ca->usage_base, u64s * sizeof(u64)); + for (i = 0; i < ARRAY_SIZE(ca->usage); i++) + acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s); + } while (read_seqcount_retry(&c->usage_lock, seq)); return ret; } @@ -261,7 +283,8 @@ retry: void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) { - unsigned u64s = fs_usage_u64s(c); + struct bch_dev *ca; + unsigned i, u64s = fs_usage_u64s(c); BUG_ON(idx >= ARRAY_SIZE(c->usage)); @@ -272,6 +295,16 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) (u64 __percpu *) c->usage[idx], u64s); percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) { + u64s = dev_usage_u64s(); + + acc_u64s_percpu((u64 *) ca->usage_base, + (u64 __percpu *) ca->usage[idx], u64s); + percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64)); + } + rcu_read_unlock(); + write_seqcount_end(&c->usage_lock); preempt_enable(); } @@ -454,14 +487,14 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage, static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, struct bch_fs_usage *fs_usage, struct bucket_mark old, struct bucket_mark new, - bool gc) + u64 journal_seq, bool gc) { struct bch_dev_usage *u; percpu_rwsem_assert_held(&c->mark_lock); preempt_disable(); - u = this_cpu_ptr(ca->usage[gc]); + u = dev_usage_ptr(ca, journal_seq, gc); if (bucket_type(old)) account_bucket(fs_usage, u, bucket_type(old), @@ -491,31 +524,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, bch2_wake_allocator(ca); } -__flatten -void bch2_dev_usage_from_buckets(struct bch_fs *c) -{ - struct bch_dev *ca; - struct bucket_mark old = { .v.counter = 0 }; - struct bucket_array *buckets; - struct bucket *g; - unsigned i; - int cpu; - - c->usage_base->hidden = 0; - - for_each_member_device(ca, c, i) { - for_each_possible_cpu(cpu) - memset(per_cpu_ptr(ca->usage[0], cpu), 0, - sizeof(*ca->usage[0])); - - buckets = bucket_array(ca); - - for_each_bucket(g, buckets) - bch2_dev_usage_update(c, ca, c->usage_base, - old, g->mark, false); - } -} - static inline int update_replicas(struct bch_fs *c, struct bch_fs_usage *fs_usage, struct bch_replicas_entry *r, @@ -653,7 +661,12 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, new.owned_by_allocator = owned_by_allocator; })); - bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + /* + * XXX: this is wrong, this means we'll be doing updates to the percpu + * buckets_alloc counter that don't have an open journal buffer and + * we'll race with the machinery that 
accumulates that to ca->usage_base + */ + bch2_dev_usage_update(c, ca, fs_usage, old, new, 0, gc); BUG_ON(!gc && !owned_by_allocator && !old.owned_by_allocator); @@ -685,7 +698,8 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bucket_mark old_m, m; /* We don't do anything for deletions - do we?: */ - if (new.k->type != KEY_TYPE_alloc) + if (new.k->type != KEY_TYPE_alloc && + new.k->type != KEY_TYPE_alloc_v2) return 0; /* @@ -708,6 +722,7 @@ static int bch2_mark_alloc(struct bch_fs *c, m.data_type = u.data_type; m.dirty_sectors = u.dirty_sectors; m.cached_sectors = u.cached_sectors; + m.stripe = u.stripe != 0; if (journal_seq) { m.journal_seq_valid = 1; @@ -715,12 +730,14 @@ static int bch2_mark_alloc(struct bch_fs *c, } })); - bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc); + bch2_dev_usage_update(c, ca, fs_usage, old_m, m, journal_seq, gc); g->io_time[READ] = u.read_time; g->io_time[WRITE] = u.write_time; g->oldest_gen = u.oldest_gen; g->gen_valid = 1; + g->stripe = u.stripe; + g->stripe_redundancy = u.stripe_redundancy; /* * need to know if we're getting called from the invalidate path or @@ -778,7 +795,7 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, if (c) bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc), - old, new, gc); + old, new, 0, gc); return 0; } @@ -915,11 +932,10 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, return 0; } -static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, +static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k, unsigned ptr_idx, struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags, - bool enabled) + u64 journal_seq, unsigned flags) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; unsigned nr_data = s->nr_blocks - s->nr_redundant; @@ -932,8 +948,13 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, char buf[200]; int ret; - if (enabled) - g->ec_redundancy = s->nr_redundant; + if (g->stripe && g->stripe != k.k->p.offset) { + bch2_fs_inconsistent(c, + "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", + ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + return -EINVAL; + } old = bucket_cmpxchg(g, new, ({ ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type, @@ -941,23 +962,9 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, if (ret) return ret; - if (new.stripe && enabled) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", - ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - - if (!new.stripe && !enabled) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "bucket %u:%zu gen %u: deleting stripe but not marked\n%s", - ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - - new.stripe = enabled; - - if ((flags & BTREE_TRIGGER_GC) && parity) { - new.data_type = enabled ? BCH_DATA_parity : 0; - new.dirty_sectors = enabled ? 
le16_to_cpu(s->sectors): 0; + if (parity) { + new.data_type = BCH_DATA_parity; + new.dirty_sectors = le16_to_cpu(s->sectors); } if (journal_seq) { @@ -966,10 +973,10 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, } })); - if (!enabled) - g->ec_redundancy = 0; + g->stripe = k.k->p.offset; + g->stripe_redundancy = s->nr_redundant; - bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc); return 0; } @@ -1036,7 +1043,7 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, old.v.counter, new.v.counter)) != old.v.counter); - bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc); BUG_ON(!gc && bucket_became_unavailable(old, new)); @@ -1163,6 +1170,8 @@ static int bch2_mark_stripe(struct bch_fs *c, unsigned i; int ret; + BUG_ON(gc && old_s); + if (!m || (old_s && !m->alive)) { bch_err_ratelimited(c, "error marking nonexistent stripe %zu", idx); @@ -1170,48 +1179,12 @@ static int bch2_mark_stripe(struct bch_fs *c, } if (!new_s) { - /* Deleting: */ - for (i = 0; i < old_s->nr_blocks; i++) { - ret = bucket_set_stripe(c, old, i, fs_usage, - journal_seq, flags, false); - if (ret) - return ret; - } - - if (!gc && m->on_heap) { - spin_lock(&c->ec_stripes_heap_lock); - bch2_stripes_heap_del(c, m, idx); - spin_unlock(&c->ec_stripes_heap_lock); - } - - if (gc) - update_replicas(c, fs_usage, &m->r.e, - -((s64) m->sectors * m->nr_redundant)); + spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_del(c, m, idx); + spin_unlock(&c->ec_stripes_heap_lock); memset(m, 0, sizeof(*m)); } else { - BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks); - BUG_ON(old_s && new_s->nr_redundant != old_s->nr_redundant); - - for (i = 0; i < new_s->nr_blocks; i++) { - if (!old_s || - memcmp(new_s->ptrs + i, - old_s->ptrs + i, - sizeof(struct bch_extent_ptr))) { - - if (old_s) { - bucket_set_stripe(c, old, i, fs_usage, - journal_seq, flags, false); - if (ret) - return ret; - } - ret = bucket_set_stripe(c, new, i, fs_usage, - journal_seq, flags, true); - if (ret) - return ret; - } - } - m->alive = true; m->sectors = le16_to_cpu(new_s->sectors); m->algorithm = new_s->algorithm; @@ -1220,27 +1193,13 @@ static int bch2_mark_stripe(struct bch_fs *c, m->blocks_nonempty = 0; for (i = 0; i < new_s->nr_blocks; i++) { - unsigned s = stripe_blockcount_get(new_s, i); - - /* - * gc recalculates this field from stripe ptr - * references: - */ - if (!gc) - m->block_sectors[i] = s; - m->blocks_nonempty += !!s; + m->block_sectors[i] = + stripe_blockcount_get(new_s, i); + m->blocks_nonempty += !!m->block_sectors[i]; } - if (gc && old_s) - update_replicas(c, fs_usage, &m->r.e, - -((s64) m->sectors * m->nr_redundant)); - bch2_bkey_to_replicas(&m->r.e, new); - if (gc) - update_replicas(c, fs_usage, &m->r.e, - ((s64) m->sectors * m->nr_redundant)); - if (!gc) { spin_lock(&c->ec_stripes_heap_lock); bch2_stripes_heap_update(c, m, idx); @@ -1248,6 +1207,25 @@ static int bch2_mark_stripe(struct bch_fs *c, } } + if (gc) { + /* + * gc recalculates this field from stripe ptr + * references: + */ + memset(m->block_sectors, 0, sizeof(m->block_sectors)); + m->blocks_nonempty = 0; + + for (i = 0; i < new_s->nr_blocks; i++) { + ret = mark_stripe_bucket(c, new, i, fs_usage, + journal_seq, flags); + if (ret) + return ret; + } + + update_replicas(c, fs_usage, &m->r.e, + ((s64) m->sectors * m->nr_redundant)); + } + return 0; } @@ -1271,6 +1249,7 @@ static int bch2_mark_key_locked(struct 
bch_fs *c, switch (k.k->type) { case KEY_TYPE_alloc: + case KEY_TYPE_alloc_v2: ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags); break; case KEY_TYPE_btree_ptr: @@ -1539,9 +1518,10 @@ static int trans_get_key(struct btree_trans *trans, return ret; } -static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter, - const struct bch_extent_ptr *ptr, - struct bkey_alloc_unpacked *u) +static struct bkey_alloc_buf * +bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter, + const struct bch_extent_ptr *ptr, + struct bkey_alloc_unpacked *u) { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); @@ -1549,8 +1529,13 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree struct bucket *g; struct btree_iter *iter; struct bkey_s_c k; + struct bkey_alloc_buf *a; int ret; + a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); + if (IS_ERR(a)) + return a; + iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k); if (iter) { *u = bch2_alloc_unpack(k); @@ -1562,17 +1547,17 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree ret = bch2_btree_iter_traverse(iter); if (ret) { bch2_trans_iter_put(trans, iter); - return ret; + return ERR_PTR(ret); } percpu_down_read(&c->mark_lock); g = bucket(ca, pos.offset); - *u = alloc_mem_to_key(g, READ_ONCE(g->mark)); + *u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark)); percpu_up_read(&c->mark_lock); } *_iter = iter; - return 0; + return a; } static int bch2_trans_mark_pointer(struct btree_trans *trans, @@ -1582,27 +1567,20 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter *iter; struct bkey_alloc_unpacked u; - struct bkey_i_alloc *a; + struct bkey_alloc_buf *a; int ret; - ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); - if (ret) - return ret; + a = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); + if (IS_ERR(a)) + return PTR_ERR(a); ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type, &u.dirty_sectors, &u.cached_sectors); if (ret) goto out; - a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); - ret = PTR_ERR_OR_ZERO(a); - if (ret) - goto out; - - bkey_alloc_init(&a->k_i); - a->k.p = iter->pos; - bch2_alloc_pack(a, u); - bch2_trans_update(trans, iter, &a->k_i, 0); + bch2_alloc_pack(c, a, u); + bch2_trans_update(trans, iter, &a->k, 0); out: bch2_trans_iter_put(trans, iter); return ret; @@ -1713,34 +1691,51 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, } static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans, - const struct bch_extent_ptr *ptr, - s64 sectors, bool parity) + struct bkey_s_c_stripe s, + unsigned idx, bool deleting) { - struct bkey_i_alloc *a; + struct bch_fs *c = trans->c; + const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; + struct bkey_alloc_buf *a; struct btree_iter *iter; struct bkey_alloc_unpacked u; - int ret; + bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant; + int ret = 0; - ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); - if (ret) - return ret; + a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); + if (IS_ERR(a)) + return PTR_ERR(a); if (parity) { + s64 sectors = le16_to_cpu(s.v->sectors); + + if (deleting) + sectors = -sectors; + u.dirty_sectors += sectors; u.data_type = u.dirty_sectors ? 
BCH_DATA_parity : 0; } - a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); - ret = PTR_ERR_OR_ZERO(a); - if (ret) - goto err; + if (!deleting) { + if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c, + "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)", + iter->pos.inode, iter->pos.offset, u.gen, + u.stripe, s.k->p.offset)) { + ret = -EIO; + goto err; + } - bkey_alloc_init(&a->k_i); - a->k.p = iter->pos; - bch2_alloc_pack(a, u); - bch2_trans_update(trans, iter, &a->k_i, 0); + u.stripe = s.k->p.offset; + u.stripe_redundancy = s.v->nr_redundant; + } else { + u.stripe = 0; + u.stripe_redundancy = 0; + } + + bch2_alloc_pack(c, a, u); + bch2_trans_update(trans, iter, &a->k, 0); err: bch2_trans_iter_put(trans, iter); return ret; @@ -1750,51 +1745,50 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { - const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe - ? bkey_s_c_to_stripe(old).v : NULL; - const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe - ? bkey_s_c_to_stripe(new).v : NULL; + struct bkey_s_c_stripe old_s = { NULL }; + struct bkey_s_c_stripe new_s = { NULL }; struct bch_replicas_padded r; unsigned i; int ret = 0; + if (old.k->type == KEY_TYPE_stripe) + old_s = bkey_s_c_to_stripe(old); + if (new.k->type == KEY_TYPE_stripe) + new_s = bkey_s_c_to_stripe(new); + /* * If the pointers aren't changing, we don't need to do anything: */ - if (new_s && old_s && - !memcmp(old_s->ptrs, new_s->ptrs, - new_s->nr_blocks * sizeof(struct bch_extent_ptr))) + if (new_s.k && old_s.k && + new_s.v->nr_blocks == old_s.v->nr_blocks && + new_s.v->nr_redundant == old_s.v->nr_redundant && + !memcmp(old_s.v->ptrs, new_s.v->ptrs, + new_s.v->nr_blocks * sizeof(struct bch_extent_ptr))) return 0; - if (new_s) { - unsigned nr_data = new_s->nr_blocks - new_s->nr_redundant; - s64 sectors = le16_to_cpu(new_s->sectors); + if (new_s.k) { + s64 sectors = le16_to_cpu(new_s.v->sectors); bch2_bkey_to_replicas(&r.e, new); - update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); + update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant); - for (i = 0; i < new_s->nr_blocks; i++) { - bool parity = i >= nr_data; - - ret = bch2_trans_mark_stripe_alloc_ref(trans, - &new_s->ptrs[i], sectors, parity); + for (i = 0; i < new_s.v->nr_blocks; i++) { + ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s, + i, false); if (ret) return ret; } } - if (old_s) { - unsigned nr_data = old_s->nr_blocks - old_s->nr_redundant; - s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); + if (old_s.k) { + s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors)); bch2_bkey_to_replicas(&r.e, old); - update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); + update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant); - for (i = 0; i < old_s->nr_blocks; i++) { - bool parity = i >= nr_data; - - ret = bch2_trans_mark_stripe_alloc_ref(trans, - &old_s->ptrs[i], sectors, parity); + for (i = 0; i < old_s.v->nr_blocks; i++) { + ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s, + i, true); if (ret) return ret; } @@ -2065,21 +2059,16 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter *iter; struct bkey_alloc_unpacked u; - struct bkey_i_alloc *a; + struct bkey_alloc_buf *a; struct bch_extent_ptr ptr = { .dev = ca->dev_idx, .offset = bucket_to_sector(ca, b), }; int ret = 0; - a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); - 
ret = PTR_ERR_OR_ZERO(a); - if (ret) - return ret; - - ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u); - if (ret) - return ret; + a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u); + if (IS_ERR(a)) + return PTR_ERR(a); if (u.data_type && u.data_type != type) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, @@ -2112,10 +2101,8 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, u.data_type = type; u.dirty_sectors = sectors; - bkey_alloc_init(&a->k_i); - a->k.p = iter->pos; - bch2_alloc_pack(a, u); - bch2_trans_update(trans, iter, &a->k_i, 0); + bch2_alloc_pack(c, a, u); + bch2_trans_update(trans, iter, &a->k, 0); out: bch2_trans_iter_put(trans, iter); return ret; @@ -2422,13 +2409,24 @@ void bch2_dev_buckets_free(struct bch_dev *ca) sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket)); - free_percpu(ca->usage[0]); + for (i = 0; i < ARRAY_SIZE(ca->usage); i++) + free_percpu(ca->usage[i]); + kfree(ca->usage_base); } int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) { - if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage))) + unsigned i; + + ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL); + if (!ca->usage_base) return -ENOMEM; + for (i = 0; i < ARRAY_SIZE(ca->usage); i++) { + ca->usage[i] = alloc_percpu(struct bch_dev_usage); + if (!ca->usage[i]) + return -ENOMEM; + } + return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);; } diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 37346240..6d15c455 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -58,20 +58,13 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b) return __bucket(ca, b, false); } -static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw) -{ - return c->bucket_clock[rw].hand - g->io_time[rw]; -} - /* * bucket_gc_gen() returns the difference between the bucket's current gen and * the oldest gen of any pointer into that bucket in the btree. 
*/ -static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b) +static inline u8 bucket_gc_gen(struct bucket *g) { - struct bucket *g = bucket(ca, b); - return g->mark.gen - g->oldest_gen; } @@ -169,8 +162,6 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m, struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); -void bch2_dev_usage_from_buckets(struct bch_fs *); - static inline u64 __dev_buckets_available(struct bch_dev *ca, struct bch_dev_usage stats) { @@ -214,6 +205,11 @@ static inline unsigned fs_usage_u64s(struct bch_fs *c) READ_ONCE(c->replicas.nr); } +static inline unsigned dev_usage_u64s(void) +{ + return sizeof(struct bch_dev_usage) / sizeof(u64); +} + void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *); struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *); diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 5fbe940a..404c89a7 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -37,11 +37,12 @@ struct bucket { const struct bucket_mark mark; }; - u16 io_time[2]; + u64 io_time[2]; u8 oldest_gen; u8 gc_gen; unsigned gen_valid:1; - u8 ec_redundancy; + u8 stripe_redundancy; + u32 stripe; }; struct bucket_array { diff --git a/libbcachefs/clock.c b/libbcachefs/clock.c index 1d1590de..4324cfe7 100644 --- a/libbcachefs/clock.c +++ b/libbcachefs/clock.c @@ -19,7 +19,7 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) spin_lock(&clock->timer_lock); - if (time_after_eq((unsigned long) atomic_long_read(&clock->now), + if (time_after_eq((unsigned long) atomic64_read(&clock->now), timer->expire)) { spin_unlock(&clock->timer_lock); timer->fn(timer); @@ -146,7 +146,7 @@ static struct io_timer *get_expired_timer(struct io_clock *clock, void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) { struct io_timer *timer; - unsigned long now = atomic_long_add_return(sectors, &clock->now); + unsigned long now = atomic64_add_return(sectors, &clock->now); while ((timer = get_expired_timer(clock, now))) timer->fn(timer); @@ -158,7 +158,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) unsigned i; spin_lock(&clock->timer_lock); - now = atomic_long_read(&clock->now); + now = atomic64_read(&clock->now); for (i = 0; i < clock->timers.used; i++) pr_buf(out, "%ps:\t%li\n", @@ -175,7 +175,7 @@ void bch2_io_clock_exit(struct io_clock *clock) int bch2_io_clock_init(struct io_clock *clock) { - atomic_long_set(&clock->now, 0); + atomic64_set(&clock->now, 0); spin_lock_init(&clock->timer_lock); clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); diff --git a/libbcachefs/clock_types.h b/libbcachefs/clock_types.h index 92c740a4..5fae0012 100644 --- a/libbcachefs/clock_types.h +++ b/libbcachefs/clock_types.h @@ -26,7 +26,7 @@ struct io_timer { typedef HEAP(struct io_timer *) io_timer_heap; struct io_clock { - atomic_long_t now; + atomic64_t now; u16 __percpu *pcpu_buf; unsigned max_slop; diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 086897c3..10d55fc8 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -105,6 +105,9 @@ const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + if (!bkey_cmp(k.k->p, POS_MIN)) + return "stripe at pos 0"; + if (k.k->p.inode) return "invalid stripe key"; @@ -279,10 +282,14 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) struct bch_csum got = ec_block_checksum(buf, i, offset); if 
(bch2_crc_cmp(want, got)) { + char buf2[200]; + + bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i)); + bch_err_ratelimited(c, - "stripe checksum error at %u:%u: csum type %u, expected %llx got %llx", - i, j, v->csum_type, - want.lo, got.lo); + "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s", + (void *) _RET_IP_, i, j, v->csum_type, + want.lo, got.lo, buf2); clear_bit(i, buf->valid); break; } @@ -335,6 +342,8 @@ static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) static void ec_block_endio(struct bio *bio) { struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); + struct bch_stripe *v = &ec_bio->buf->key.v; + struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx]; struct bch_dev *ca = ec_bio->ca; struct closure *cl = bio->bi_private; @@ -343,6 +352,13 @@ static void ec_block_endio(struct bio *bio) bch2_blk_status_to_str(bio->bi_status))) clear_bit(ec_bio->idx, ec_bio->buf->valid); + if (ptr_stale(ca, ptr)) { + bch_err_ratelimited(ca->fs, + "error %s stripe: stale pointer after io", + bio_data_dir(bio) == READ ? "reading from" : "writing to"); + clear_bit(ec_bio->idx, ec_bio->buf->valid); + } + bio_put(&ec_bio->bio); percpu_ref_put(&ca->io_ref); closure_put(cl); @@ -652,7 +668,6 @@ void bch2_stripes_heap_update(struct bch_fs *c, static int ec_stripe_delete(struct bch_fs *c, size_t idx) { - //pr_info("deleting stripe %zu", idx); return bch2_btree_delete_range(c, BTREE_ID_EC, POS(0, idx), POS(0, idx + 1), @@ -795,6 +810,7 @@ static void extent_stripe_ptr_add(struct bkey_s_extent e, *dst = (struct bch_extent_stripe_ptr) { .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, .block = block, + .redundancy = s->key.v.nr_redundant, .idx = s->key.k.p.offset, }; } @@ -1054,8 +1070,6 @@ void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, if (!ob) return; - //pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset); - ec = ob->ec; mutex_lock(&ec->lock); @@ -1348,12 +1362,14 @@ static s64 get_existing_stripe(struct bch_fs *c, struct stripe *m; size_t heap_idx; u64 stripe_idx; + s64 ret = -1; if (may_create_new_stripe(c)) return -1; spin_lock(&c->ec_stripes_heap_lock); for (heap_idx = 0; heap_idx < h->used; heap_idx++) { + /* No blocks worth reusing, stripe will just be deleted: */ if (!h->data[heap_idx].blocks_nonempty) continue; @@ -1365,13 +1381,12 @@ static s64 get_existing_stripe(struct bch_fs *c, m->sectors == head->blocksize && m->blocks_nonempty < m->nr_blocks - m->nr_redundant) { bch2_stripes_heap_del(c, m, stripe_idx); - spin_unlock(&c->ec_stripes_heap_lock); - return stripe_idx; + ret = stripe_idx; + break; } } - spin_unlock(&c->ec_stripes_heap_lock); - return -1; + return ret; } struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 67ba2c21..4a3a3291 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -704,14 +704,8 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) if (p.ptr.cached) continue; - if (p.has_ec) { - struct stripe *s = - genradix_ptr(&c->stripes[0], p.ec.idx); - - WARN_ON(!s); - if (s) - replicas += s->nr_redundant; - } + if (p.has_ec) + replicas += p.ec.redundancy; replicas++; @@ -734,16 +728,9 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c, if (ca->mi.state != BCH_MEMBER_STATE_FAILED) durability = max_t(unsigned, durability, ca->mi.durability); - if (p.has_ec) { - struct stripe *s = - genradix_ptr(&c->stripes[0], p.ec.idx); + if (p.has_ec) + durability += 
p.ec.redundancy; - if (WARN_ON(!s)) - goto out; - - durability += s->nr_redundant; - } -out: return durability; } diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index a7c5f5fd..e41f0277 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -1121,6 +1121,9 @@ int bch2_fs_journal_init(struct journal *j) j->entry_u64s_reserved += BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX); + j->entry_u64s_reserved += + 2 * (sizeof(struct jset_entry_clock) / sizeof(u64)); + atomic64_set(&j->reservations.counter, ((union journal_res_state) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index eacc9b2c..2abca164 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -5,6 +5,7 @@ #include "btree_update_interior.h" #include "buckets.h" #include "checksum.h" +#include "disk_groups.h" #include "error.h" #include "io.h" #include "journal.h" @@ -426,6 +427,69 @@ fsck_err: return ret; } +static int journal_entry_validate_clock(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + int ret = 0; + + if (journal_entry_err_on(bytes != sizeof(*clock), + c, "invalid journal entry clock: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + if (journal_entry_err_on(clock->rw > 1, + c, "invalid journal entry clock: bad rw")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + +fsck_err: + return ret; +} + +static int journal_entry_validate_dev_usage(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + unsigned expected = sizeof(*u) + sizeof(u->d[0]) * 7; /* Current value of BCH_DATA_NR */ + unsigned dev; + int ret = 0; + + if (journal_entry_err_on(bytes < expected, + c, "invalid journal entry dev usage: bad size (%u < %u)", + bytes, expected)) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + dev = le32_to_cpu(u->dev); + + if (journal_entry_err_on(!bch2_dev_exists2(c, dev), + c, "invalid journal entry dev usage: bad dev")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + if (journal_entry_err_on(u->pad, + c, "invalid journal entry dev usage: bad pad")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + +fsck_err: + return ret; +} + struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, int); @@ -937,6 +1001,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, for (ptr = 0; ptr < i->nr_ptrs; ptr++) replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; + bch2_replicas_entry_sort(&replicas.e); + /* * If we're mounting in degraded mode - if we didn't read all * the devices - this is wrong: @@ -1032,16 +1098,20 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, unsigned sectors) { struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_devs_mask devs; struct journal_device *ja; struct bch_dev *ca; struct dev_alloc_list devs_sorted; + unsigned target = c->opts.metadata_target ?: + c->opts.foreground_target; unsigned i, replicas = 0, replicas_want = 
READ_ONCE(c->opts.metadata_replicas); rcu_read_lock(); +retry: + devs = target_rw_devs(c, BCH_DATA_journal, target); - devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, - &c->rw_devs[BCH_DATA_journal]); + devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); @@ -1073,6 +1143,12 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); + + if (replicas < replicas_want && target) { + /* Retry from all devices: */ + target = 0; + goto retry; + } done: rcu_read_unlock(); @@ -1278,6 +1354,9 @@ static void do_journal_write(struct closure *cl) bio->bi_private = ca; bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META; + BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); + ca->prev_journal_sector = bio->bi_iter.bi_sector; + if (!JSET_NO_FLUSH(w->data)) bio->bi_opf |= REQ_FUA; if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) @@ -1348,8 +1427,8 @@ void bch2_journal_write(struct closure *cl) end = bch2_btree_roots_to_journal_entries(c, jset->start, end); - end = bch2_journal_super_entries_add_common(c, end, - le64_to_cpu(jset->seq)); + bch2_journal_super_entries_add_common(c, &end, + le64_to_cpu(jset->seq)); u64s = (u64 *) end - (u64 *) start; BUG_ON(u64s > j->entry_u64s_reserved); @@ -1358,10 +1437,7 @@ void bch2_journal_write(struct closure *cl) journal_write_compact(jset); - jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); - jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); jset->magic = cpu_to_le64(jset_magic(c)); - jset->version = c->sb.version < bcachefs_metadata_version_new_versioning ? cpu_to_le32(BCH_JSET_VERSION_OLD) : cpu_to_le32(c->sb.version); diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index d0acc1ee..f915b30a 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -92,11 +92,8 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; data_opts->rewrite_dev = p.ptr.dev; - if (p.has_ec) { - struct stripe *m = genradix_ptr(&c->stripes[0], p.ec.idx); - - data_opts->nr_replicas += m->nr_redundant; - } + if (p.has_ec) + data_opts->nr_replicas += p.ec.redundancy; return DATA_REWRITE; } @@ -179,12 +176,12 @@ static int bch2_copygc(struct bch_fs *c) bucket_sectors_used(m) >= ca->mi.bucket_size) continue; - WARN_ON(m.stripe && !g->ec_redundancy); + WARN_ON(m.stripe && !g->stripe_redundancy); e = (struct copygc_heap_entry) { .dev = dev_idx, .gen = m.gen, - .replicas = 1 + g->ec_redundancy, + .replicas = 1 + g->stripe_redundancy, .fragmentation = bucket_sectors_used(m) * (1U << 15) / ca->mi.bucket_size, .sectors = bucket_sectors_used(m), @@ -301,7 +298,7 @@ static int bch2_copygc_thread(void *arg) { struct bch_fs *c = arg; struct io_clock *clock = &c->io_clock[WRITE]; - unsigned long last, wait; + u64 last, wait; set_freezable(); @@ -309,7 +306,7 @@ static int bch2_copygc_thread(void *arg) if (kthread_wait_freezable(c->copy_gc_enabled)) break; - last = atomic_long_read(&clock->now); + last = atomic64_read(&clock->now); wait = bch2_copygc_wait_amount(c); if (wait > clock->max_slop) { diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 710a7ee6..d835a853 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -136,6 +136,11 @@ enum opt_type { OPT_STR(bch2_str_hash_types), \ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_SIPHASH, \ NULL, "Hash function for directory entries and xattrs")\ + 
x(metadata_target, u16, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_METADATA_TARGET, 0, \ + "(target)", "Device or disk group for metadata writes") \ x(foreground_target, u16, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ OPT_FN(bch2_opt_target), \ diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c index c3373c48..d89920b8 100644 --- a/libbcachefs/rebalance.c +++ b/libbcachefs/rebalance.c @@ -169,12 +169,12 @@ static int bch2_rebalance_thread(void *arg) unsigned long start, prev_start; unsigned long prev_run_time, prev_run_cputime; unsigned long cputime, prev_cputime; - unsigned long io_start; + u64 io_start; long throttle; set_freezable(); - io_start = atomic_long_read(&clock->now); + io_start = atomic64_read(&clock->now); p = rebalance_work(c); prev_start = jiffies; prev_cputime = curr_cputime(); @@ -210,7 +210,7 @@ static int bch2_rebalance_thread(void *arg) (20 - w.dev_most_full_percent), 50); - if (atomic_long_read(&clock->now) + clock->max_slop < + if (atomic64_read(&clock->now) + clock->max_slop < r->throttled_until_iotime) { r->throttled_until_cputime = start + throttle; r->state = REBALANCE_THROTTLED; @@ -229,7 +229,7 @@ static int bch2_rebalance_thread(void *arg) max(p.dev_most_full_percent, 1U) / max(w.dev_most_full_percent, 1U)); - io_start = atomic_long_read(&clock->now); + io_start = atomic64_read(&clock->now); p = w; prev_start = start; prev_cputime = cputime; @@ -274,7 +274,7 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) case REBALANCE_THROTTLED: bch2_hprint(&PBUF(h1), (r->throttled_until_iotime - - atomic_long_read(&c->io_clock[WRITE].now)) << 9); + atomic64_read(&c->io_clock[WRITE].now)) << 9); pr_buf(out, "throttled for %lu sec or %s io\n", (r->throttled_until_cputime - jiffies) / HZ, h1); diff --git a/libbcachefs/rebalance_types.h b/libbcachefs/rebalance_types.h index 192c6be2..2f62a643 100644 --- a/libbcachefs/rebalance_types.h +++ b/libbcachefs/rebalance_types.h @@ -17,7 +17,7 @@ struct bch_fs_rebalance { atomic64_t work_unknown_dev; enum rebalance_state state; - unsigned long throttled_until_iotime; + u64 throttled_until_iotime; unsigned long throttled_until_cputime; struct bch_move_stats move_stats; diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index f470e0e2..7ba098ad 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -825,10 +825,31 @@ static int journal_replay_entry_early(struct bch_fs *c, case BCH_JSET_ENTRY_data_usage: { struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); + ret = bch2_replicas_set_usage(c, &u->r, le64_to_cpu(u->v)); break; } + case BCH_JSET_ENTRY_dev_usage: { + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); + struct bch_dev *ca = bch_dev_bkey_exists(c, u->dev); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) / + sizeof(struct jset_entry_dev_usage_type); + unsigned i; + + ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); + ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable); + + for (i = 0; i < nr_types; i++) { + ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); + ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); + ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented); + } + + break; + } case BCH_JSET_ENTRY_blacklist: { struct jset_entry_blacklist *bl_entry = container_of(entry, struct 
jset_entry_blacklist, entry); @@ -847,6 +868,12 @@ static int journal_replay_entry_early(struct bch_fs *c, le64_to_cpu(bl_entry->end) + 1); break; } + case BCH_JSET_ENTRY_clock: { + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); + + atomic64_set(&c->io_clock[clock->rw].now, clock->time); + } } return ret; @@ -861,9 +888,6 @@ static int journal_replay_early(struct bch_fs *c, int ret; if (clean) { - c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock); - c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock); - for (entry = clean->start; entry != vstruct_end(&clean->field); entry = vstruct_next(entry)) { @@ -876,9 +900,6 @@ static int journal_replay_early(struct bch_fs *c, if (i->ignore) continue; - c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); - c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); - vstruct_for_each(&i->j, entry) { ret = journal_replay_entry_early(c, entry); if (ret) @@ -942,13 +963,6 @@ static int verify_superblock_clean(struct bch_fs *c, return 0; } - mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, - "superblock read clock %u doesn't match journal %u after clean shutdown", - clean->read_clock, j->read_clock); - mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, - "superblock write clock %u doesn't match journal %u after clean shutdown", - clean->write_clock, j->write_clock); - for (i = 0; i < BTREE_ID_NR; i++) { char buf1[200], buf2[200]; struct bkey_i *k1, *k2; diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index ce8b7355..3970c442 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -26,7 +26,7 @@ static void verify_replicas_entry(struct bch_replicas_entry *e) #endif } -static void replicas_entry_sort(struct bch_replicas_entry *e) +void bch2_replicas_entry_sort(struct bch_replicas_entry *e) { bubble_sort(e->devs, e->nr_devs, u8_cmp); } @@ -122,7 +122,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e, break; } - replicas_entry_sort(e); + bch2_replicas_entry_sort(e); } void bch2_devlist_to_replicas(struct bch_replicas_entry *e, @@ -142,7 +142,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e, for (i = 0; i < devs.nr; i++) e->devs[e->nr_devs++] = devs.devs[i]; - replicas_entry_sort(e); + bch2_replicas_entry_sort(e); } static struct bch_replicas_cpu @@ -197,7 +197,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, int bch2_replicas_entry_idx(struct bch_fs *c, struct bch_replicas_entry *search) { - replicas_entry_sort(search); + bch2_replicas_entry_sort(search); return __replicas_entry_idx(&c->replicas, search); } @@ -681,7 +681,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, for_each_replicas_entry(sb_r, e) { dst = cpu_replicas_entry(cpu_r, idx++); memcpy(dst, e, replicas_entry_bytes(e)); - replicas_entry_sort(dst); + bch2_replicas_entry_sort(dst); } return 0; @@ -718,7 +718,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, dst->nr_devs = e->nr_devs; dst->nr_required = 1; memcpy(dst->devs, e->devs, e->nr_devs); - replicas_entry_sort(dst); + bch2_replicas_entry_sort(dst); } return 0; diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h index 8b95164f..a16ef23b 100644 --- a/libbcachefs/replicas.h +++ b/libbcachefs/replicas.h @@ -5,6 +5,7 @@ #include "eytzinger.h" #include "replicas_types.h" +void bch2_replicas_entry_sort(struct bch_replicas_entry *); void bch2_replicas_entry_to_text(struct printbuf *, struct bch_replicas_entry *); void 
bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 751efd28..a510a25e 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -963,31 +963,28 @@ int bch2_fs_mark_dirty(struct bch_fs *c) return ret; } -static void -entry_init_u64s(struct jset_entry *entry, unsigned u64s) +static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) { - memset(entry, 0, u64s * sizeof(u64)); + struct jset_entry *entry = *end; + unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); + memset(entry, 0, u64s * sizeof(u64)); /* * The u64s field counts from the start of data, ignoring the shared * fields. */ entry->u64s = u64s - 1; + + *end = vstruct_next(*end); + return entry; } -static void -entry_init_size(struct jset_entry *entry, size_t size) +void bch2_journal_super_entries_add_common(struct bch_fs *c, + struct jset_entry **end, + u64 journal_seq) { - unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); - entry_init_u64s(entry, u64s); -} - -struct jset_entry * -bch2_journal_super_entries_add_common(struct bch_fs *c, - struct jset_entry *entry, - u64 journal_seq) -{ - unsigned i; + struct bch_dev *ca; + unsigned i, dev; percpu_down_write(&c->mark_lock); @@ -1000,58 +997,77 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, { struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); - entry_init_size(entry, sizeof(*u)); u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_INODES; u->v = cpu_to_le64(c->usage_base->nr_inodes); - - entry = vstruct_next(entry); } { struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); - entry_init_size(entry, sizeof(*u)); u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_KEY_VERSION; u->v = cpu_to_le64(atomic64_read(&c->key_version)); - - entry = vstruct_next(entry); } for (i = 0; i < BCH_REPLICAS_MAX; i++) { struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); - entry_init_size(entry, sizeof(*u)); u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_RESERVED; u->entry.level = i; u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); - - entry = vstruct_next(entry); } for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); struct jset_entry_data_usage *u = - container_of(entry, struct jset_entry_data_usage, entry); + container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), + struct jset_entry_data_usage, entry); - entry_init_size(entry, sizeof(*u) + e->nr_devs); u->entry.type = BCH_JSET_ENTRY_data_usage; u->v = cpu_to_le64(c->usage_base->replicas[i]); memcpy(&u->r, e, replicas_entry_bytes(e)); + } - entry = vstruct_next(entry); + for_each_member_device(ca, c, dev) { + unsigned b = sizeof(struct jset_entry_dev_usage) + + sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; + struct jset_entry_dev_usage *u = + container_of(jset_entry_init(end, b), + struct jset_entry_dev_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_dev_usage; + u->dev = cpu_to_le32(dev); + u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); + u->buckets_unavailable = cpu_to_le64(ca->usage_base->buckets_unavailable); + + for (i = 0; i < BCH_DATA_NR; i++) 
{ + u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); + u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); + u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); + } } percpu_up_write(&c->mark_lock); - return entry; + for (i = 0; i < 2; i++) { + struct jset_entry_clock *clock = + container_of(jset_entry_init(end, sizeof(*clock)), + struct jset_entry_clock, entry); + + clock->entry.type = BCH_JSET_ENTRY_clock; + clock->rw = i; + clock->time = atomic64_read(&c->io_clock[i].now); + } } void bch2_fs_mark_clean(struct bch_fs *c) @@ -1080,15 +1096,13 @@ void bch2_fs_mark_clean(struct bch_fs *c) } sb_clean->flags = 0; - sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); - sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1); /* Trying to catch outstanding bug: */ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); entry = sb_clean->start; - entry = bch2_journal_super_entries_add_common(c, entry, 0); + bch2_journal_super_entries_add_common(c, &entry, 0); entry = bch2_btree_roots_to_journal_entries(c, entry, entry); BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h index 7a068158..1a35124f 100644 --- a/libbcachefs/super-io.h +++ b/libbcachefs/super-io.h @@ -122,9 +122,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) /* BCH_SB_FIELD_clean: */ -struct jset_entry * -bch2_journal_super_entries_add_common(struct bch_fs *, - struct jset_entry *, u64); +void bch2_journal_super_entries_add_common(struct bch_fs *, + struct jset_entry **, u64); void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int); diff --git a/libbcachefs/super.c b/libbcachefs/super.c index f3c12d89..ac277df8 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -148,6 +148,22 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid) return c; } +static void bch2_dev_usage_journal_reserve(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i, nr = 0, u64s = + (sizeof(struct jset_entry_dev_usage) + + sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) + nr++; + rcu_read_unlock(); + + bch2_journal_entry_res_resize(&c->journal, + &c->dev_usage_journal_res, u64s * nr); +} + /* Filesystem RO/RW: */ /* @@ -174,9 +190,6 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch2_copygc_stop(c); bch2_gc_thread_stop(c); - bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale); - bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); - /* * Flush journal before stopping allocators, because flushing journal * blacklist entries involves allocating new btree nodes: @@ -399,9 +412,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale); - bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); - for_each_rw_member(ca, c, i) { ret = bch2_dev_allocator_start(ca); if (ret) { @@ -779,6 +789,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_fsio_init(c)) goto err; + bch2_dev_usage_journal_reserve(c); + mi = bch2_sb_get_members(c->disk_sb.sb); for (i = 0; i < c->sb.nr_devices; i++) if (bch2_dev_exists(c->disk_sb.sb, mi, i) && @@ -1521,6 +1533,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) 
mutex_unlock(&c->sb_lock); up_write(&c->state_lock); + + bch2_dev_usage_journal_reserve(c); return 0; err: if (ca->mi.state == BCH_MEMBER_STATE_RW && @@ -1530,19 +1544,6 @@ err: return ret; } -static void dev_usage_clear(struct bch_dev *ca) -{ - struct bucket_array *buckets; - - percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0])); - - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets); - up_read(&ca->bucket_lock); -} - /* Add new device to running filesystem: */ int bch2_dev_add(struct bch_fs *c, const char *path) { @@ -1600,8 +1601,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (ret) goto err; - dev_usage_clear(ca); - down_write(&c->state_lock); mutex_lock(&c->sb_lock); @@ -1655,6 +1654,8 @@ have_slot: bch2_write_super(c); mutex_unlock(&c->sb_lock); + bch2_dev_usage_journal_reserve(c); + err = "error marking superblock"; ret = bch2_trans_mark_dev_sb(c, NULL, ca); if (ret) diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 80964bdf..f934f12b 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -705,7 +705,7 @@ static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca, { int rw = (private ? 1 : 0); - return bucket_last_io(c, bucket(ca, b), rw); + return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw]; } static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, @@ -718,7 +718,7 @@ static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, size_t b, void *private) { - return bucket_gc_gen(ca, b); + return bucket_gc_gen(bucket(ca, b)); } static int unsigned_cmp(const void *_l, const void *_r)
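
Note on the buckets.c/buckets.h hunks above: per-device usage is now split into a base copy (ca->usage_base) plus one percpu delta set per journal buffer (the ca->usage[] array), written out through the new BCH_JSET_ENTRY_dev_usage journal entries and read back in journal_replay_entry_early(). The helper below is only an illustrative sketch of how a reader would fold the two halves together (locking omitted, helper name invented, acc_u64s_percpu() assumed from util.h); it is not code from this patch:

static struct bch_dev_usage sketch_dev_usage_read(struct bch_dev *ca)
{
	/* Start from the base copy, then fold in each percpu delta set: */
	struct bch_dev_usage ret = *ca->usage_base;
	unsigned i, u64s = sizeof(ret) / sizeof(u64);

	for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
		acc_u64s_percpu((u64 *) &ret,
				(u64 __percpu *) ca->usage[i], u64s);
	return ret;
}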
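Note on the clock conversion (clock.c, movinggc.c, rebalance.c and the sysfs hunk above): the 16-bit bucket clock hands are gone, bucket->io_time[] is widened to u64, and a bucket's "age" is now just the difference between the 64-bit io_clock and its io_time, with the clock persisted across shutdown via the new BCH_JSET_ENTRY_clock entries. A hedged sketch of that calculation, mirroring the sysfs hunk (the helper name is invented for illustration):

/* Illustrative only: sectors of filesystem IO since this bucket last saw IO. */
static u64 sketch_bucket_io_age(struct bch_fs *c, struct bch_dev *ca,
				size_t b, int rw)
{
	return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw];
}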
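Note on the extents.c/movinggc.c hunks: the erasure-coding redundancy is now carried in the extent's stripe pointer (p.ec.redundancy) rather than looked up in the c->stripes[] radix tree, so replica and durability counts no longer need the stripe to still exist in memory. A sketch of the resulting per-pointer durability logic, written out for illustration only (not the patch's exact code):

/* Illustrative only: durability contributed by one decoded pointer. */
static unsigned sketch_ptr_durability(struct bch_fs *c,
				      struct extent_ptr_decoded p)
{
	struct bch_dev *ca;
	unsigned durability = 0;

	if (p.ptr.cached)
		return 0;

	ca = bch_dev_bkey_exists(c, p.ptr.dev);
	if (ca->mi.state != BCH_MEMBER_STATE_FAILED)
		durability = ca->mi.durability;

	if (p.has_ec)
		durability += p.ec.redundancy;

	return durability;
}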