From 85ee972555948337bb1a58f0702a4da95db6758f Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Thu, 5 Oct 2017 14:41:44 -0800
Subject: [PATCH] Update bcachefs sources to e82e656279 bcachefs: Cleanups for
 building in userspace

---
 .bcachefs_revision                  |   2 +-
 include/trace/events/bcachefs.h     | 117 ++--------
 libbcachefs/acl.c                   |   9 +-
 libbcachefs/acl.h                   |  15 +-
 libbcachefs/alloc.c                 | 338 +++++++++++-----------------
 libbcachefs/alloc.h                 |  37 ++-
 libbcachefs/alloc_types.h           |  30 +--
 libbcachefs/bcachefs.h              |  29 +--
 libbcachefs/bcachefs_format.h       |  89 ++++----
 libbcachefs/bkey.c                  |   2 +-
 libbcachefs/bkey.h                  |  11 +-
 libbcachefs/bkey_methods.h          |  10 +-
 libbcachefs/bset.c                  |  15 +-
 libbcachefs/bset.h                  |  10 +-
 libbcachefs/btree_cache.c           |   4 +-
 libbcachefs/btree_cache.h           |  18 +-
 libbcachefs/btree_gc.c              |  23 +-
 libbcachefs/btree_gc.h              |   6 +-
 libbcachefs/btree_io.c              |   9 +-
 libbcachefs/btree_io.h              |   6 +-
 libbcachefs/btree_iter.c            |  21 +-
 libbcachefs/btree_iter.h            |   6 +-
 libbcachefs/btree_locking.h         |   8 +-
 libbcachefs/btree_types.h           |   6 +-
 libbcachefs/btree_update.h          |   7 +-
 libbcachefs/btree_update_interior.c |  16 +-
 libbcachefs/btree_update_interior.h |   6 +-
 libbcachefs/buckets.c               |   5 +-
 libbcachefs/buckets.h               |  22 --
 libbcachefs/buckets_types.h         |   2 +-
 libbcachefs/chardev.c               |   4 +
 libbcachefs/chardev.h               |  10 +-
 libbcachefs/checksum.h              |  48 +++-
 libbcachefs/clock.h                 |   6 +-
 libbcachefs/clock_types.h           |   7 +-
 libbcachefs/compress.c              | 281 ++++++++++++-----------
 libbcachefs/compress.h              |   6 +-
 libbcachefs/debug.h                 |   6 +-
 libbcachefs/dirent.h                |   7 +-
 libbcachefs/error.h                 |   6 +-
 libbcachefs/extents.c               |  13 +-
 libbcachefs/extents.h               |   6 +-
 libbcachefs/eytzinger.h             |  48 ++--
 libbcachefs/fifo.h                  |   7 +-
 libbcachefs/fs-io.c                 |  38 ++--
 libbcachefs/fs-io.h                 |   6 +-
 libbcachefs/fs.c                    |  57 ++++-
 libbcachefs/fs.h                    |  12 +-
 libbcachefs/fsck.c                  |  25 +-
 libbcachefs/fsck.h                  |   6 +-
 libbcachefs/inode.c                 | 110 +++++++--
 libbcachefs/inode.h                 |   9 +-
 libbcachefs/io.c                    |  28 ++-
 libbcachefs/io.h                    |  16 +-
 libbcachefs/io_types.h              |   6 +-
 libbcachefs/journal.c               |  65 +++---
 libbcachefs/journal.h               |   6 +-
 libbcachefs/journal_types.h         |   9 +-
 libbcachefs/keylist.h               |   6 +-
 libbcachefs/keylist_types.h         |   6 +-
 libbcachefs/lz4.h                   |  88 +-------
 libbcachefs/lz4_compress.c          | 228 -------------------
 libbcachefs/lz4_decompress.c        | 251 +++++++++------------
 libbcachefs/lz4defs.h               | 182 ---------------
 libbcachefs/migrate.h               |   6 +-
 libbcachefs/move.c                  |   6 +-
 libbcachefs/move.h                  |   7 +-
 libbcachefs/move_types.h            |   4 -
 libbcachefs/movinggc.h              |   6 +-
 libbcachefs/opts.c                  |  15 +-
 libbcachefs/opts.h                  |   7 +-
 libbcachefs/six.h                   |   7 +-
 libbcachefs/str_hash.h              |   6 +-
 libbcachefs/super-io.c              |  61 +++--
 libbcachefs/super-io.h              |  26 ++-
 libbcachefs/super.c                 |  61 ++---
 libbcachefs/super.h                 |  60 +++--
 libbcachefs/super_types.h           |   6 +-
 libbcachefs/sysfs.c                 | 103 +++++----
 libbcachefs/sysfs.h                 |  10 +-
 libbcachefs/tier.c                  |  79 ++-----
 libbcachefs/tier.h                  |   6 +-
 libbcachefs/util.c                  |  81 +++++--
 libbcachefs/util.h                  |  14 +-
 libbcachefs/xattr.h                 |   6 +-
 85 files changed, 1300 insertions(+), 1765 deletions(-)
 delete mode 100644 libbcachefs/lz4_compress.c
 delete mode 100644 libbcachefs/lz4defs.h
 delete mode 100644 libbcachefs/move_types.h

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 7d1a4e6d..2279c3ab 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-6a25f7a00d08c45b35bed3d649c05286ec60f7f6
+e82e65627960a46945b78a5e5e946b23b8f08972
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index e5052b8d..0c9f3de5 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -197,24 +197,22 @@ DECLARE_EVENT_CLASS(btree_node,

 	TP_STRUCT__entry(
 		__array(char,		uuid,	16 )
-		__field(u64,		bucket			)
 		__field(u8,		level			)
 		__field(u8,		id			)
-		__field(u32,		inode			)
+		__field(u64,		inode			)
 		__field(u64,		offset			)
 	),

 	TP_fast_assign(
 		memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
-		__entry->bucket		= PTR_BUCKET_NR_TRACE(c, &b->key, 0);
 		__entry->level		= b->level;
 		__entry->id		= b->btree_id;
 		__entry->inode		= b->key.k.p.inode;
 		__entry->offset		= b->key.k.p.offset;
 	),

-	TP_printk("%pU bucket %llu(%u) id %u: %u:%llu",
-		  __entry->uuid, __entry->bucket, __entry->level, __entry->id,
+	TP_printk("%pU %u id %u %llu:%llu",
+		  __entry->uuid, __entry->level, __entry->id,
 		  __entry->inode, __entry->offset)
 );

@@ -253,21 +251,9 @@ DEFINE_EVENT(btree_node, btree_node_free,
 	TP_ARGS(c, b)
 );

-TRACE_EVENT(btree_node_reap,
-	TP_PROTO(struct bch_fs *c, struct btree *b, int ret),
-	TP_ARGS(c, b, ret),
-
-	TP_STRUCT__entry(
-		__field(u64,		bucket			)
-		__field(int,		ret			)
-	),
-
-	TP_fast_assign(
-		__entry->bucket	= PTR_BUCKET_NR_TRACE(c, &b->key, 0);
-		__entry->ret = ret;
-	),
-
-	TP_printk("bucket %llu ret %d", __entry->bucket, __entry->ret)
+DEFINE_EVENT(btree_node, btree_node_reap,
+	TP_PROTO(struct bch_fs *c, struct btree *b),
+	TP_ARGS(c, b)
 );

 DECLARE_EVENT_CLASS(btree_node_cannibalize_lock,
@@ -330,68 +316,31 @@ TRACE_EVENT(btree_insert_key,
 	TP_ARGS(c, b, k),

 	TP_STRUCT__entry(
-		__field(u64,		b_bucket		)
-		__field(u64,		b_offset		)
-		__field(u64,		offset			)
-		__field(u32,		b_inode			)
-		__field(u32,		inode			)
-		__field(u32,		size			)
-		__field(u8,		level			)
 		__field(u8,		id			)
+		__field(u64,		inode			)
+		__field(u64,		offset			)
+		__field(u32,		size			)
 	),

 	TP_fast_assign(
-		__entry->b_bucket	= PTR_BUCKET_NR_TRACE(c, &b->key, 0);
-		__entry->level		= b->level;
 		__entry->id		= b->btree_id;
-		__entry->b_inode	= b->key.k.p.inode;
-		__entry->b_offset	= b->key.k.p.offset;
 		__entry->inode		= k->k.p.inode;
 		__entry->offset		= k->k.p.offset;
 		__entry->size		= k->k.size;
 	),

-	TP_printk("bucket %llu(%u) id %u: %u:%llu %u:%llu len %u",
-		  __entry->b_bucket, __entry->level, __entry->id,
-		  __entry->b_inode, __entry->b_offset,
+	TP_printk("btree %u: %llu:%llu len %u", __entry->id,
 		  __entry->inode, __entry->offset, __entry->size)
 );

-DECLARE_EVENT_CLASS(btree_split,
-	TP_PROTO(struct bch_fs *c, struct btree *b, unsigned keys),
-	TP_ARGS(c, b, keys),
-
-	TP_STRUCT__entry(
-		__field(u64,		bucket			)
-		__field(u8,		level			)
-		__field(u8,		id			)
-		__field(u32,		inode			)
-		__field(u64,		offset			)
-		__field(u32,		keys			)
-	),
-
-	TP_fast_assign(
-		__entry->bucket		= PTR_BUCKET_NR_TRACE(c, &b->key, 0);
-		__entry->level		= b->level;
-		__entry->id		= b->btree_id;
-		__entry->inode		= b->key.k.p.inode;
-		__entry->offset		= b->key.k.p.offset;
-		__entry->keys		= keys;
-	),
-
-	TP_printk("bucket %llu(%u) id %u: %u:%llu keys %u",
-		  __entry->bucket, __entry->level, __entry->id,
-		  __entry->inode, __entry->offset, __entry->keys)
+DEFINE_EVENT(btree_node, btree_split,
+	TP_PROTO(struct bch_fs *c, struct btree *b),
+	TP_ARGS(c, b)
 );

-DEFINE_EVENT(btree_split, btree_node_split,
-	TP_PROTO(struct bch_fs *c, struct btree *b, unsigned keys),
-	TP_ARGS(c, b, keys)
-);
-
-DEFINE_EVENT(btree_split, btree_node_compact,
-	TP_PROTO(struct bch_fs *c, struct btree *b, unsigned keys),
-	TP_ARGS(c, b, keys)
+DEFINE_EVENT(btree_node, btree_compact,
+	TP_PROTO(struct bch_fs *c, struct btree *b),
+	TP_ARGS(c, b)
 );

 DEFINE_EVENT(btree_node, btree_set_root,
@@ -401,31 +350,9 @@ DEFINE_EVENT(btree_node, btree_set_root,

 /* Garbage collection */

-TRACE_EVENT(btree_gc_coalesce,
-	TP_PROTO(struct bch_fs *c, struct btree *b, unsigned nodes),
-	TP_ARGS(c, b, nodes),
-
-	TP_STRUCT__entry(
-		__field(u64,		bucket			)
-		__field(u8,		level			)
-		__field(u8,		id			)
-		__field(u32,		inode			)
-		__field(u64,		offset			)
-		__field(unsigned,	nodes			)
-	),
-
-	TP_fast_assign(
-		__entry->bucket		= PTR_BUCKET_NR_TRACE(c, &b->key, 0);
-		__entry->level		= b->level;
-		__entry->id		= b->btree_id;
-		__entry->inode		= b->key.k.p.inode;
-		__entry->offset		= b->key.k.p.offset;
-		__entry->nodes		= nodes;
-	),
-
-	TP_printk("bucket %llu(%u) id %u: %u:%llu nodes %u",
-		  __entry->bucket, __entry->level, __entry->id,
-		  __entry->inode, __entry->offset, __entry->nodes)
+DEFINE_EVENT(btree_node, btree_gc_coalesce,
+	TP_PROTO(struct bch_fs *c, struct btree *b),
+	TP_ARGS(c, b)
 );

 TRACE_EVENT(btree_gc_coalesce_fail,
@@ -523,8 +450,8 @@ DEFINE_EVENT(bch_dev, prio_write_end,
 );

 TRACE_EVENT(invalidate,
-	TP_PROTO(struct bch_dev *ca, size_t bucket, unsigned sectors),
-	TP_ARGS(ca, bucket, sectors),
+	TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors),
+	TP_ARGS(ca, offset, sectors),

 	TP_STRUCT__entry(
 		__field(unsigned,	sectors			)
@@ -534,7 +461,7 @@ TRACE_EVENT(invalidate,

 	TP_fast_assign(
 		__entry->dev		= ca->disk_sb.bdev->bd_dev;
-		__entry->offset		= bucket << ca->bucket_bits;
+		__entry->offset		= offset,
 		__entry->sectors	= sectors;
 	),
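[Note: the invalidate tracepoint now takes a sector offset, with callers converting from bucket indices (see bch2_invalidate_bucket() later in this patch). A rough userspace sketch of the conversion this series moves to, assuming bucket_to_sector()/sector_to_bucket() are a plain multiply/divide by the bucket size - so bucket sizes no longer need to be powers of two and the old ca->bucket_bits shift goes away. The struct and names below are illustrative, not the kernel definitions:

#include <stdint.h>
#include <stdio.h>

struct dev { uint64_t bucket_size; };	/* in 512-byte sectors */

static uint64_t bucket_to_sector(const struct dev *ca, uint64_t b)
{
	return b * ca->bucket_size;	/* old code: b << ca->bucket_bits */
}

static uint64_t sector_to_bucket(const struct dev *ca, uint64_t s)
{
	return s / ca->bucket_size;
}

int main(void)
{
	struct dev ca = { .bucket_size = 1024 };	/* 512k buckets */

	/* trace_invalidate() now reports this, not the raw bucket index: */
	printf("bucket 7 starts at sector %llu\n",
	       (unsigned long long) bucket_to_sector(&ca, 7));
	printf("sector 9000 is in bucket %llu\n",
	       (unsigned long long) sector_to_bucket(&ca, 9000));
	return 0;
}
]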
diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c
index 6fcac72c..690f4b5b 100644
--- a/libbcachefs/acl.c
+++ b/libbcachefs/acl.c
@@ -1,9 +1,12 @@
+#ifndef NO_BCACHEFS_FS
+
 #include "bcachefs.h"

-#include 
+#include 
+#include 
+#include 
 #include 
 #include 
-#include 

 #include "xattr.h"
 #include "acl.h"
@@ -223,3 +226,5 @@ int bch2_set_acl(struct inode *inode, struct posix_acl *acl, int type)

 	return ret;
 }
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/libbcachefs/acl.h b/libbcachefs/acl.h
index 2e51726f..539bca7e 100644
--- a/libbcachefs/acl.h
+++ b/libbcachefs/acl.h
@@ -1,10 +1,7 @@
-/*
-  File: fs/bch/acl.h
+#ifndef _BCACHEFS_ACL_H
+#define _BCACHEFS_ACL_H

-  (C) 2001 Andreas Gruenbacher,
-*/
-
-#include 
+#ifndef NO_BCACHEFS_FS

 #define BCH_ACL_VERSION	0x0001

@@ -52,5 +49,11 @@ static inline int bch2_acl_count(size_t size)
 	}
 }

+struct posix_acl;
+
 extern struct posix_acl *bch2_get_acl(struct inode *, int);
 extern int bch2_set_acl(struct inode *, struct posix_acl *, int);
+
+#endif /* NO_BCACHEFS_FS */
+
+#endif /* _BCACHEFS_ACL_H */
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index 953c6b3b..1c5b2e49 100644
--- a/libbcachefs/alloc.c
+++ b/libbcachefs/alloc.c
@@ -77,42 +77,6 @@

 static void bch2_recalc_min_prio(struct bch_dev *, int);

-/* Allocation groups: */
-
-void bch2_dev_group_remove(struct dev_group *grp, struct bch_dev *ca)
-{
-	unsigned i;
-
-	spin_lock(&grp->lock);
-
-	for (i = 0; i < grp->nr; i++)
-		if (grp->d[i].dev == ca) {
-			grp->nr--;
-			memmove(&grp->d[i],
-				&grp->d[i + 1],
-				(grp->nr - i) * sizeof(grp->d[0]));
-			break;
-		}
-
-	spin_unlock(&grp->lock);
-}
-
-void bch2_dev_group_add(struct dev_group *grp, struct bch_dev *ca)
-{
-	unsigned i;
-
-	spin_lock(&grp->lock);
-	for (i = 0; i < grp->nr; i++)
-		if (grp->d[i].dev == ca)
-			goto out;
-
-	BUG_ON(grp->nr >= BCH_SB_MEMBERS_MAX);
-
-	grp->d[grp->nr++].dev = ca;
-out:
-	spin_unlock(&grp->lock);
-}
-
 /* Ratelimiting/PD controllers */

 static void pd_controllers_update(struct work_struct *work)
@@ -139,24 +103,24 @@ static void pd_controllers_update(struct work_struct *work)
 					      faster_tiers_dirty,
 					      -1);

-		spin_lock(&c->tiers[i].devs.lock);
-		group_for_each_dev(ca, &c->tiers[i].devs, iter) {
+		for_each_member_device_rcu(ca, c, iter, &c->tiers[i].devs) {
 			struct bch_dev_usage stats = bch2_dev_usage_read(ca);
-			unsigned bucket_bits = ca->bucket_bits + 9;

-			u64 size = (ca->mi.nbuckets -
-				    ca->mi.first_bucket) << bucket_bits;
-			u64 dirty = stats.buckets[S_DIRTY] << bucket_bits;
-			u64 free = __dev_buckets_free(ca, stats) << bucket_bits;
+			u64 size = bucket_to_sector(ca, ca->mi.nbuckets -
+					ca->mi.first_bucket) << 9;
+			u64 dirty = bucket_to_sector(ca,
+					stats.buckets[S_DIRTY]) << 9;
+			u64 free = bucket_to_sector(ca,
+					__dev_buckets_free(ca, stats)) << 9;
 			/*
 			 * Bytes of internal fragmentation, which can be
 			 * reclaimed by copy GC
 			 */
-			s64 fragmented = ((stats.buckets[S_DIRTY] +
-					   stats.buckets_cached) <<
-					  bucket_bits) -
-				((stats.sectors[S_DIRTY] +
-				  stats.sectors_cached) << 9);
+			s64 fragmented = (bucket_to_sector(ca,
+						stats.buckets[S_DIRTY] +
+						stats.buckets_cached) -
+					  (stats.sectors[S_DIRTY] +
+					   stats.sectors_cached)) << 9;

 			fragmented = max(0LL, fragmented);

@@ -174,7 +138,6 @@ static void pd_controllers_update(struct work_struct *work)

 			copygc_can_free += fragmented;
 		}
-		spin_unlock(&c->tiers[i].devs.lock);
 	}

 	rcu_read_unlock();
@@ -427,19 +390,22 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
 	return ret;
 }

-int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca, u64 *journal_seq)
+static int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca, u64 *journal_seq)
 {
 	struct btree_iter iter;
-	struct bucket *g;
+	unsigned long bucket;
 	int ret = 0;

 	bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
 			     BTREE_ITER_INTENT);

-	for_each_bucket(g, ca) {
-		ret = __bch2_alloc_write_key(c, ca, g, &iter, journal_seq);
+	for_each_set_bit(bucket, ca->bucket_dirty, ca->mi.nbuckets) {
+		ret = __bch2_alloc_write_key(c, ca, ca->buckets + bucket,
+					     &iter, journal_seq);
 		if (ret)
 			break;
+
+		clear_bit(bucket, ca->bucket_dirty);
 	}

 	bch2_btree_iter_unlock(&iter);
@@ -926,8 +892,10 @@ static int bch2_allocator_thread(void *arg)

 		ca->nr_invalidated = ret;

-		if (ca->nr_invalidated == fifo_used(&ca->free_inc))
+		if (ca->nr_invalidated == fifo_used(&ca->free_inc)) {
 			ca->alloc_thread_started = true;
+			bch2_alloc_write(c, ca, &journal_seq);
+		}

 		if (ca->allocator_invalidating_data)
 			bch2_journal_flush_seq(&c->journal, journal_seq);
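[Note: bch2_alloc_write() now walks a per-device dirty bitmap instead of every bucket, clearing each bit only after the corresponding key is persisted. A minimal standalone sketch of that pattern, with hand-rolled bitmap helpers standing in for the kernel's for_each_set_bit()/clear_bit():

#include <stdint.h>
#include <stdio.h>

#define NBUCKETS 64

static int next_set_bit(const uint64_t *map, int from, int size)
{
	for (int i = from; i < size; i++)
		if (map[i / 64] & (1ULL << (i % 64)))
			return i;
	return size;
}

static void clear_bit64(uint64_t *map, int i)
{
	map[i / 64] &= ~(1ULL << (i % 64));
}

static int write_key(int bucket)	/* stand-in for __bch2_alloc_write_key() */
{
	printf("flushing alloc info for bucket %d\n", bucket);
	return 0;
}

int main(void)
{
	uint64_t dirty[NBUCKETS / 64] = { (1ULL << 3) | (1ULL << 17) | (1ULL << 42) };

	/* equivalent of for_each_set_bit(bucket, ca->bucket_dirty, ...): */
	for (int b = next_set_bit(dirty, 0, NBUCKETS);
	     b < NBUCKETS;
	     b = next_set_bit(dirty, b + 1, NBUCKETS)) {
		if (write_key(b))
			break;
		clear_bit64(dirty, b);	/* only clear once written */
	}
	return 0;
}
]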
@@ -996,6 +964,21 @@ static int bch2_allocator_thread(void *arg)

 /* Allocation */

+/*
+ * XXX: allocation on startup is still sketchy. There is insufficient
+ * synchronization for bch2_bucket_alloc_startup() to work correctly after
+ * bch2_alloc_write() has been called, and we aren't currently doing anything
+ * to guarantee that this won't happen.
+ *
+ * Even aside from that, it's really difficult to avoid situations where on
+ * startup we write out a pointer to a freshly allocated bucket before the
+ * corresponding gen - when we're still digging ourself out of the "i need to
+ * allocate to write bucket gens, but i need to write bucket gens to allocate"
+ * hole.
+ *
+ * Fortunately, bch2_btree_mark_key_initial() will detect and repair this
+ * easily enough...
+ */
 static long bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
 {
 	struct bucket *g;
@@ -1012,6 +995,7 @@ static long bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
 		    is_available_bucket(g->mark) &&
 		    bch2_mark_alloc_bucket_startup(ca, g)) {
 			r = g - ca->buckets;
+			set_bit(r, ca->bucket_dirty);
 			break;
 		}
 out:
@@ -1055,6 +1039,7 @@ long bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
 	spin_unlock(&ca->freelist_lock);

 	if (unlikely(!ca->alloc_thread_started) &&
+	    (reserve == RESERVE_ALLOC) &&
 	    (r = bch2_bucket_alloc_startup(c, ca)) >= 0) {
 		verify_not_on_freelist(ca, r);
 		goto out2;
@@ -1081,92 +1066,87 @@ enum bucket_alloc_ret {
 	FREELIST_EMPTY,		/* Allocator thread not keeping up */
 };

-static void recalc_alloc_group_weights(struct bch_fs *c,
-				       struct dev_group *devs)
+struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
+					 struct write_point *wp,
+					 struct bch_devs_mask *devs)
 {
-	struct bch_dev *ca;
-	u64 available_buckets = 1; /* avoid a divide by zero... */
-	unsigned i;
+	struct dev_alloc_list ret = { .nr = 0 };
+	struct bch_dev *ca, *ca2;
+	unsigned i, j;

-	for (i = 0; i < devs->nr; i++) {
-		ca = devs->d[i].dev;
+	for_each_member_device_rcu(ca, c, i, devs) {
+		for (j = 0; j < ret.nr; j++) {
+			unsigned idx = ret.devs[j];

-		devs->d[i].weight = dev_buckets_free(ca);
-		available_buckets += devs->d[i].weight;
+			ca2 = rcu_dereference(c->devs[idx]);
+			if (!ca2)
+				break;
+
+			if (ca->mi.tier < ca2->mi.tier)
+				break;
+
+			if (ca->mi.tier == ca2->mi.tier &&
+			    wp->next_alloc[i] < wp->next_alloc[idx])
+				break;
+		}
+
+		memmove(&ret.devs[j + 1],
+			&ret.devs[j],
+			sizeof(ret.devs[0]) * (ret.nr - j));
+		ret.nr++;
+		ret.devs[j] = i;
 	}

-	for (i = 0; i < devs->nr; i++) {
-		const unsigned min_weight = U32_MAX >> 4;
-		const unsigned max_weight = U32_MAX;
-
-		devs->d[i].weight =
-			min_weight +
-			div64_u64(devs->d[i].weight *
-				  devs->nr *
-				  (max_weight - min_weight),
-				  available_buckets);
-		devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight);
-	}
+	return ret;
 }

-static enum bucket_alloc_ret bch2_bucket_alloc_group(struct bch_fs *c,
-						     struct open_bucket *ob,
-						     enum alloc_reserve reserve,
-						     unsigned nr_replicas,
-						     struct dev_group *devs,
-						     long *devs_used)
+void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
+		     struct write_point *wp)
 {
-	enum bucket_alloc_ret ret;
-	unsigned fail_idx = -1, i;
-	unsigned available = 0;
+	unsigned i;
+
+	for (i = 0; i < ARRAY_SIZE(wp->next_alloc); i++)
+		wp->next_alloc[i] >>= 1;
+}
+
+static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
+					struct write_point *wp,
+					struct open_bucket *ob,
+					unsigned nr_replicas,
+					enum alloc_reserve reserve,
+					struct bch_devs_mask *devs)
+{
+	enum bucket_alloc_ret ret = NO_DEVICES;
+	struct dev_alloc_list devs_sorted;
+	unsigned i;

 	BUG_ON(nr_replicas > ARRAY_SIZE(ob->ptrs));

 	if (ob->nr_ptrs >= nr_replicas)
 		return ALLOC_SUCCESS;

-	spin_lock(&devs->lock);
+	rcu_read_lock();
+	devs_sorted = bch2_wp_alloc_list(c, wp, devs);

-	for (i = 0; i < devs->nr; i++)
-		available += !test_bit(devs->d[i].dev->dev_idx,
-				       devs_used);
-
-	recalc_alloc_group_weights(c, devs);
-
-	i = devs->cur_device;
-
-	while (ob->nr_ptrs < nr_replicas) {
-		struct bch_dev *ca;
+	for (i = 0; i < devs_sorted.nr; i++) {
+		struct bch_dev *ca =
+			rcu_dereference(c->devs[devs_sorted.devs[i]]);
 		long bucket;

-		if (!available) {
-			ret = NO_DEVICES;
-			goto err;
-		}
-
-		i++;
-		i %= devs->nr;
-
-		ret = FREELIST_EMPTY;
-		if (i == fail_idx)
-			goto err;
-
-		ca = devs->d[i].dev;
-
-		if (test_bit(ca->dev_idx, devs_used))
-			continue;
-
-		if (fail_idx == -1 &&
-		    get_random_int() > devs->d[i].weight)
+		if (!ca)
 			continue;

 		bucket = bch2_bucket_alloc(c, ca, reserve);
 		if (bucket < 0) {
-			if (fail_idx == -1)
-				fail_idx = i;
+			ret = FREELIST_EMPTY;
 			continue;
 		}

+		wp->next_alloc[ca->dev_idx] +=
+			div64_u64(U64_MAX, dev_buckets_free(ca) *
+				  ca->mi.bucket_size);
+		bch2_wp_rescale(c, ca, wp);
+
 		/*
 		 * open_bucket_add_buckets expects new pointers at the head of
 		 * the list:
@@ -1185,56 +1165,28 @@ static enum bucket_alloc_ret bch2_bucket_alloc_group(struct bch_fs *c,
 		};
 		ob->ptr_offset[0] = 0;

-		__set_bit(ca->dev_idx, devs_used);
-		available--;
-		devs->cur_device = i;
+		if (ob->nr_ptrs == nr_replicas) {
+			ret = ALLOC_SUCCESS;
+			break;
+		}
 	}

-	ret = ALLOC_SUCCESS;
-err:
 	EBUG_ON(ret != ALLOC_SUCCESS && reserve == RESERVE_MOVINGGC);
-	spin_unlock(&devs->lock);
+	rcu_read_unlock();
 	return ret;
 }

-static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
-						     struct write_point *wp,
-						     struct open_bucket *ob,
-						     unsigned nr_replicas,
-						     enum alloc_reserve reserve,
-						     long *devs_used)
-{
-	struct bch_tier *tier;
-	/*
-	 * this should implement policy - for a given type of allocation, decide
-	 * which devices to allocate from:
-	 *
-	 * XXX: switch off wp->type and do something more intelligent here
-	 */
-	if (wp->group)
-		return bch2_bucket_alloc_group(c, ob, reserve, nr_replicas,
-					       wp->group, devs_used);
-
-	/* foreground writes: prefer fastest tier: */
-	tier = READ_ONCE(c->fastest_tier);
-	if (tier)
-		bch2_bucket_alloc_group(c, ob, reserve, nr_replicas,
-					&tier->devs, devs_used);
-
-	return bch2_bucket_alloc_group(c, ob, reserve, nr_replicas,
-				       &c->all_devs, devs_used);
-}
-
 static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp,
 				 struct open_bucket *ob, unsigned nr_replicas,
-				 enum alloc_reserve reserve, long *devs_used,
+				 enum alloc_reserve reserve,
+				 struct bch_devs_mask *devs,
 				 struct closure *cl)
 {
 	bool waiting = false;

 	while (1) {
 		switch (__bch2_bucket_alloc_set(c, wp, ob, nr_replicas,
-						reserve, devs_used)) {
+						reserve, devs)) {
 		case ALLOC_SUCCESS:
 			if (waiting)
 				closure_wake_up(&c->freelist_wait);
@@ -1354,13 +1306,12 @@ static unsigned ob_ptr_sectors_free(struct bch_fs *c,
 {
 	struct bch_dev *ca = c->devs[ptr->dev];
 	unsigned i = ptr - ob->ptrs;
-	unsigned bucket_size = ca->mi.bucket_size;
-	unsigned used = (ptr->offset & (bucket_size - 1)) +
+	unsigned used = bucket_remainder(ca, ptr->offset) +
 		ob->ptr_offset[i];

-	BUG_ON(used > bucket_size);
+	BUG_ON(used > ca->mi.bucket_size);

-	return bucket_size - used;
+	return ca->mi.bucket_size - used;
 }

 static unsigned open_bucket_sectors_free(struct bch_fs *c,
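[Note: the new write-point logic replaces the randomized weights with a deterministic stride scheme: each successful allocation bumps next_alloc[dev] by roughly U64_MAX / free_space, so devices with more free space sort earlier and get picked proportionally more often, while bch2_wp_rescale() halves the counters to keep them bounded. A standalone sketch of the idea - device count and sizes invented for illustration:

#include <stdint.h>
#include <stdio.h>

#define NR_DEVS 3

int main(void)
{
	uint64_t free_sectors[NR_DEVS] = { 1000, 4000, 500 };
	uint64_t next_alloc[NR_DEVS] = { 0 };
	int hits[NR_DEVS] = { 0 };

	for (int n = 0; n < 1100; n++) {
		/* pick the device with smallest next_alloc (ties: lowest idx) */
		int best = 0;
		for (int i = 1; i < NR_DEVS; i++)
			if (next_alloc[i] < next_alloc[best])
				best = i;

		hits[best]++;
		/* smaller devices get a larger stride -> chosen less often */
		next_alloc[best] += UINT64_MAX / free_sectors[best];

		/* bch2_wp_rescale(): halve everything to avoid overflow */
		for (int i = 0; i < NR_DEVS; i++)
			next_alloc[i] >>= 1;
	}

	for (int i = 0; i < NR_DEVS; i++)
		printf("dev %d (free %4llu): %d allocations\n", i,
		       (unsigned long long) free_sectors[i], hits[i]);
	return 0;
}
]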
@@ -1432,28 +1383,22 @@ static int open_bucket_add_buckets(struct bch_fs *c,
 				   enum alloc_reserve reserve,
 				   struct closure *cl)
 {
-	long devs_used[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
+	struct bch_devs_mask devs = c->rw_devs[wp->type];
 	unsigned i;
 	int ret;

-	/*
-	 * We might be allocating pointers to add to an existing extent
-	 * (tiering/copygc/migration) - if so, some of the pointers in our
-	 * existing open bucket might duplicate devices we already have. This is
-	 * moderately annoying.
-	 */
-
-	/* Short circuit all the fun stuff if posssible: */
 	if (ob->nr_ptrs >= nr_replicas)
 		return 0;

-	memset(devs_used, 0, sizeof(devs_used));
-
+	/* Don't allocate from devices we already have pointers to: */
 	for (i = 0; i < ob->nr_ptrs; i++)
-		__set_bit(ob->ptrs[i].dev, devs_used);
+		__clear_bit(ob->ptrs[i].dev, devs.d);
+
+	if (wp->group)
+		bitmap_and(devs.d, devs.d, wp->group->d, BCH_SB_MEMBERS_MAX);

 	ret = bch2_bucket_alloc_set(c, wp, ob, nr_replicas,
-				    reserve, devs_used, cl);
+				    reserve, &devs, cl);

 	if (ret == -EROFS &&
 	    ob->nr_ptrs >= nr_replicas_required)
@@ -1568,8 +1513,6 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e,
 		extent_ptr_append(e, tmp);

 		ob->ptr_offset[i] += sectors;
-
-		this_cpu_add(*c->devs[tmp.dev]->sectors_written, sectors);
 	}
 }

@@ -1651,6 +1594,8 @@ void bch2_recalc_capacity(struct bch_fs *c)
 	unsigned long ra_pages = 0;
 	unsigned i, j;

+	lockdep_assert_held(&c->state_lock);
+
 	for_each_online_member(ca, c, i) {
 		struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi;

@@ -1663,7 +1608,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
 	for (tier = c->tiers;
 	     tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) {
-		if (!tier->devs.nr)
+		if (!dev_mask_nr(&tier->devs))
 			continue;
 		if (!fastest_tier)
 			fastest_tier = tier;
@@ -1681,8 +1626,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
 	 * Capacity of the filesystem is the capacity of all the devices in the
 	 * slowest (highest) tier - we don't include lower tier devices.
 	 */
-	spin_lock(&slowest_tier->devs.lock);
-	group_for_each_dev(ca, &slowest_tier->devs, i) {
+	for_each_member_device_rcu(ca, c, i, &slowest_tier->devs) {
 		size_t reserve = 0;

 		/*
@@ -1712,13 +1656,11 @@ void bch2_recalc_capacity(struct bch_fs *c)
 		reserve += 1;		/* tiering write point */
 		reserve += 1;		/* btree write point */

-		reserved_sectors += reserve << ca->bucket_bits;
+		reserved_sectors += bucket_to_sector(ca, reserve);

-		capacity += (ca->mi.nbuckets -
-			     ca->mi.first_bucket) <<
-			ca->bucket_bits;
+		capacity += bucket_to_sector(ca, ca->mi.nbuckets -
+					     ca->mi.first_bucket);
 	}
-	spin_unlock(&slowest_tier->devs.lock);
 set_capacity:
 	total_capacity = capacity;

@@ -1795,7 +1737,6 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
 /* device goes ro: */
 void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
 {
-	struct dev_group *tier = &c->tiers[ca->mi.tier].devs;
 	struct closure cl;
 	unsigned i;

@@ -1805,9 +1746,9 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)

 	/* First, remove device from allocation groups: */

-	bch2_dev_group_remove(&c->journal.devs, ca);
-	bch2_dev_group_remove(tier, ca);
-	bch2_dev_group_remove(&c->all_devs, ca);
+	clear_bit(ca->dev_idx, c->tiers[ca->mi.tier].devs.d);
+	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+		clear_bit(ca->dev_idx, c->rw_devs[i].d);

 	/*
 	 * Capacity is calculated based off of devices in allocation groups:
@@ -1820,7 +1761,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)

 	bch2_stop_write_point(c, ca, &ca->copygc_write_point);
 	bch2_stop_write_point(c, ca, &c->promote_write_point);
-	bch2_stop_write_point(c, ca, &ca->tiering_write_point);
+	bch2_stop_write_point(c, ca, &c->tiers[ca->mi.tier].wp);
 	bch2_stop_write_point(c, ca, &c->migration_write_point);
 	bch2_stop_write_point(c, ca, &c->btree_write_point);

@@ -1862,21 +1803,12 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
 /* device goes rw: */
 void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
 {
-	struct dev_group *tier = &c->tiers[ca->mi.tier].devs;
-	struct bch_sb_field_journal *journal_buckets;
-	bool has_journal;
+	unsigned i;

-	bch2_dev_group_add(&c->all_devs, ca);
-	bch2_dev_group_add(tier, ca);
-
-	mutex_lock(&c->sb_lock);
-	journal_buckets = bch2_sb_get_journal(ca->disk_sb.sb);
-	has_journal = bch2_nr_journal_buckets(journal_buckets) >=
-		BCH_JOURNAL_BUCKETS_MIN;
-	mutex_unlock(&c->sb_lock);
-
-	if (has_journal)
-		bch2_dev_group_add(&c->journal.devs, ca);
+	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+		if (ca->mi.data_allowed & (1 << i))
+			set_bit(ca->dev_idx, c->rw_devs[i].d);
+	set_bit(ca->dev_idx, c->tiers[ca->mi.tier].devs.d);
 }

 /* stop allocator thread: */
@@ -1942,13 +1874,17 @@ void bch2_fs_allocator_init(struct bch_fs *c)
 		list_add(&c->open_buckets[i].list, &c->open_buckets_free);
 	}

-	spin_lock_init(&c->all_devs.lock);
+	c->journal.wp.type	= BCH_DATA_JOURNAL;
+	c->btree_write_point.type = BCH_DATA_BTREE;

 	for (i = 0; i < ARRAY_SIZE(c->tiers); i++)
-		spin_lock_init(&c->tiers[i].devs.lock);
+		c->tiers[i].wp.type = BCH_DATA_USER;

 	for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
-		c->write_points[i].throttle = true;
+		c->write_points[i].type = BCH_DATA_USER;
+
+	c->promote_write_point.type = BCH_DATA_USER;
+	c->migration_write_point.type = BCH_DATA_USER;

 	c->pd_controllers_update_seconds = 5;
 	INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h
index cfd1c8ef..f07f1bfc 100644
--- a/libbcachefs/alloc.h
+++ b/libbcachefs/alloc.h
@@ -1,5 +1,5 @@
-#ifndef _BCACHE_ALLOC_H
-#define _BCACHE_ALLOC_H
+#ifndef _BCACHEFS_ALLOC_H
+#define _BCACHEFS_ALLOC_H

 #include "bcachefs.h"
 #include "alloc_types.h"
@@ -10,11 +10,18 @@ struct bch_dev;
 struct bch_fs;
 struct dev_group;

-void bch2_dev_group_remove(struct dev_group *, struct bch_dev *);
-void bch2_dev_group_add(struct dev_group *, struct bch_dev *);
+struct dev_alloc_list {
+	unsigned	nr;
+	u8		devs[BCH_SB_MEMBERS_MAX];
+};
+
+struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *,
+					 struct write_point *,
+					 struct bch_devs_mask *);
+void bch2_wp_rescale(struct bch_fs *, struct bch_dev *,
+		     struct write_point *);

 int bch2_alloc_read(struct bch_fs *, struct list_head *);
-int bch2_alloc_write(struct bch_fs *, struct bch_dev *, u64 *);
 int bch2_alloc_replay_key(struct bch_fs *, struct bpos);

 long bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve);
@@ -46,24 +53,6 @@ static inline void bch2_wake_allocator(struct bch_dev *ca)
 	rcu_read_unlock();
 }

-static inline struct bch_dev *dev_group_next(struct dev_group *devs,
-					     unsigned *iter)
-{
-	struct bch_dev *ret = NULL;
-
-	while (*iter < devs->nr &&
-	       !(ret = rcu_dereference_check(devs->d[*iter].dev,
-					     lockdep_is_held(&devs->lock))))
-		(*iter)++;
-
-	return ret;
-}
-
-#define group_for_each_dev(ca, devs, iter)				\
-	for ((iter) = 0;						\
-	     ((ca) = dev_group_next((devs), &(iter)));			\
-	     (iter)++)
-
 #define open_bucket_for_each_ptr(_ob, _ptr)				\
 	for ((_ptr) = (_ob)->ptrs;					\
 	     (_ptr) < (_ob)->ptrs + (_ob)->nr_ptrs;			\
@@ -81,4 +70,4 @@ void bch2_fs_allocator_init(struct bch_fs *);

 extern const struct bkey_ops bch2_bkey_alloc_ops;

-#endif /* _BCACHE_ALLOC_H */
+#endif /* _BCACHEFS_ALLOC_H */
diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h
index ce3a919e..d297430c 100644
--- a/libbcachefs/alloc_types.h
+++ b/libbcachefs/alloc_types.h
@@ -1,5 +1,5 @@
-#ifndef _BCACHE_ALLOC_TYPES_H
-#define _BCACHE_ALLOC_TYPES_H
+#ifndef _BCACHEFS_ALLOC_TYPES_H
+#define _BCACHEFS_ALLOC_TYPES_H

 #include 

@@ -42,16 +42,6 @@ enum alloc_reserve {
 	RESERVE_NR		= 3,
 };

-struct dev_group {
-	spinlock_t		lock;
-	unsigned		nr;
-	unsigned		cur_device;
-	struct {
-		u64		weight;
-		struct bch_dev	*dev;
-	} d[BCH_SB_MEMBERS_MAX];
-};
-
 /* Enough for 16 cache devices, 2 tiers and some left over for pipelining */
 #define OPEN_BUCKETS_COUNT	256

@@ -74,22 +64,18 @@ struct open_bucket {

 struct write_point {
 	struct open_bucket	*b;
-
-	/*
-	 * Throttle writes to this write point if tier 0 is full?
-	 */
-	bool			throttle;
+	enum bch_data_type	type;

 	/*
 	 * If not NULL, cache group for tiering, promotion and moving GC -
 	 * always allocates a single replica
-	 */
-	struct dev_group	*group;
-
-	/*
+	 *
 	 * Otherwise do a normal replicated bucket allocation that could come
 	 * from any device in tier 0 (foreground write)
 	 */
+	struct bch_devs_mask	*group;
+
+	u64			next_alloc[BCH_SB_MEMBERS_MAX];
 };

 struct alloc_heap_entry {
@@ -99,4 +85,4 @@ struct alloc_heap_entry {

 typedef HEAP(struct alloc_heap_entry) alloc_heap;

-#endif /* _BCACHE_ALLOC_TYPES_H */
+#endif /* _BCACHEFS_ALLOC_TYPES_H */
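[Note: allocation groups are now plain device bitmasks (struct bch_devs_mask) rather than the locked dev_group array, so "r/w devices for this data type, minus devices the open bucket already points at, intersected with the write point's group" becomes a couple of cheap bitmap operations, as in open_bucket_add_buckets() above. A self-contained sketch of that set arithmetic - the one-word mask layout and values here are invented for illustration:

#include <stdint.h>
#include <stdio.h>

#define MEMBERS_MAX 64	/* one word is plenty for a sketch */

struct devs_mask { uint64_t d; };

int main(void)
{
	struct devs_mask rw_user  = { 0x2F };	/* devs allowed for user data */
	struct devs_mask wp_group = { 0x0E };	/* write point's group, if any */
	uint8_t ob_devs[] = { 1 };	/* open bucket already has dev 1 */

	struct devs_mask devs = rw_user;

	/* don't allocate from devices we already have pointers to: */
	for (unsigned i = 0; i < sizeof(ob_devs); i++)
		devs.d &= ~(1ULL << ob_devs[i]);

	/* if the write point has a group, intersect with it: */
	devs.d &= wp_group.d;

	for (unsigned i = 0; i < MEMBERS_MAX; i++)
		if (devs.d & (1ULL << i))
			printf("may allocate from dev %u\n", i);
	return 0;
}
]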
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 96956e1a..dce8714b 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -284,7 +284,6 @@ do {									\
 #include "clock_types.h"
 #include "journal_types.h"
 #include "keylist_types.h"
-#include "move_types.h"
 #include "super_types.h"

 /* 256k, in sectors */
@@ -330,6 +329,7 @@ struct bch_member_cpu {
 	u8			tier;
 	u8			replacement;
 	u8			discard;
+	u8			data_allowed;
 	u8			valid;
 };

@@ -345,6 +345,10 @@ struct bch_replicas_cpu {
 	struct bch_replicas_cpu_entry entries[];
 };

+struct io_count {
+	u64			sectors[2][BCH_DATA_NR];
+};
+
 struct bch_dev {
 	struct kobject		kobj;
 	struct percpu_ref	ref;
@@ -366,7 +370,7 @@ struct bch_dev {
 	struct bcache_superblock disk_sb;
 	int			sb_write_error;

-	struct dev_group	self;
+	struct bch_devs_mask	self;

 	/* biosets used in cloned bios for replicas and moving_gc */
 	struct bio_set		replica_set;
@@ -387,7 +391,6 @@ struct bch_dev {
 	spinlock_t		freelist_lock;
 	unsigned		nr_invalidated;
 	bool			alloc_thread_started;
-	bool			need_alloc_write;

 	size_t			fifo_last_bucket;

@@ -396,7 +399,7 @@ struct bch_dev {
 	/* most out of date gen in the btree */
 	u8			*oldest_gens;
 	struct bucket		*buckets;
-	unsigned short		bucket_bits;	/* ilog2(bucket_size) */
+	unsigned long		*bucket_dirty;

 	/* last calculated minimum prio */
 	u16			min_prio[2];
@@ -423,9 +426,6 @@ struct bch_dev {

 	struct bch_pd_controller moving_gc_pd;

-	/* Tiering: */
-	struct write_point	tiering_write_point;
-
 	struct write_point	copygc_write_point;

 	struct journal_device	journal;
@@ -433,9 +433,7 @@ struct bch_dev {
 	struct work_struct	io_error_work;

 	/* The rest of this all shows up in sysfs */
-	atomic64_t		meta_sectors_written;
-	atomic64_t		btree_sectors_written;
-	u64 __percpu		*sectors_written;
+	struct io_count __percpu *io_done;
 };

 /*
@@ -472,7 +470,8 @@ struct bch_tier {
 	struct task_struct	*migrate;
 	struct bch_pd_controller pd;

-	struct dev_group	devs;
+	struct bch_devs_mask	devs;
+	struct write_point	wp;
 };

 enum bch_fs_state {
@@ -520,6 +519,7 @@ struct bch_fs {

 	u16			block_size;
 	u16			btree_node_size;
+	u16			encoded_extent_max;

 	u8			nr_devices;
 	u8			clean;
@@ -621,7 +621,7 @@ struct bch_fs {
 	 * These contain all r/w devices - i.e. devices we can currently
 	 * allocate from:
 	 */
-	struct dev_group	all_devs;
+	struct bch_devs_mask	rw_devs[BCH_DATA_NR];
 	struct bch_tier		tiers[BCH_TIER_MAX];
 	/* NULL if we only have devices in one tier: */
 	struct bch_tier		*fastest_tier;
@@ -789,11 +789,6 @@ static inline bool bch2_fs_running(struct bch_fs *c)
 	return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
 }

-static inline unsigned bucket_pages(const struct bch_dev *ca)
-{
-	return ca->mi.bucket_size / PAGE_SECTORS;
-}
-
 static inline unsigned bucket_bytes(const struct bch_dev *ca)
 {
 	return ca->mi.bucket_size << 9;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 125b6fab..463789d6 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -77,6 +77,7 @@ struct bpos {
 #define KEY_INODE_MAX			((__u64)~0ULL)
 #define KEY_OFFSET_MAX			((__u64)~0ULL)
 #define KEY_SNAPSHOT_MAX		((__u32)~0U)
+#define KEY_SIZE_MAX			((__u32)~0U)

 static inline struct bpos POS(__u64 inode, __u64 offset)
 {
@@ -177,8 +178,6 @@ struct bkey_packed {
 #define BKEY_U64s			(sizeof(struct bkey) / sizeof(__u64))
 #define KEY_PACKED_BITS_START		24

-#define KEY_SIZE_MAX			((__u32)~0U)
-
 #define KEY_FORMAT_LOCAL_BTREE		0
 #define KEY_FORMAT_CURRENT		1

@@ -359,14 +358,16 @@ struct bch_csum {
 	__le64			hi;
 } __attribute__((packed, aligned(8)));

-#define BCH_CSUM_NONE			0U
-#define BCH_CSUM_CRC32C			1U
-#define BCH_CSUM_CRC64			2U
-#define BCH_CSUM_CHACHA20_POLY1305_80	3U
-#define BCH_CSUM_CHACHA20_POLY1305_128	4U
-#define BCH_CSUM_NR			5U
+enum bch_csum_type {
+	BCH_CSUM_NONE			= 0,
+	BCH_CSUM_CRC32C			= 1,
+	BCH_CSUM_CRC64			= 2,
+	BCH_CSUM_CHACHA20_POLY1305_80	= 3,
+	BCH_CSUM_CHACHA20_POLY1305_128	= 4,
+	BCH_CSUM_NR			= 5,
+};

-static inline _Bool bch2_csum_type_is_encryption(unsigned type)
+static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
 {
 	switch (type) {
 	case BCH_CSUM_CHACHA20_POLY1305_80:
@@ -377,6 +378,14 @@ static inline _Bool bch2_csum_type_is_encryption(unsigned type)
 	}
 }

+enum bch_compression_type {
+	BCH_COMPRESSION_NONE		= 0,
+	BCH_COMPRESSION_LZ4_OLD		= 1,
+	BCH_COMPRESSION_GZIP		= 2,
+	BCH_COMPRESSION_LZ4		= 3,
+	BCH_COMPRESSION_NR		= 4,
+};
+
 enum bch_extent_entry_type {
 	BCH_EXTENT_ENTRY_ptr		= 0,
 	BCH_EXTENT_ENTRY_crc32		= 1,
@@ -462,12 +471,6 @@ struct bch_extent_crc128 {
 #define CRC128_SIZE_MAX		(1U << 13)
 #define CRC128_NONCE_MAX	((1U << 13) - 1)

-/*
- * Max size of an extent that may require bouncing to read or write
- * (checksummed, compressed): 64k
- */
-#define BCH_ENCODED_EXTENT_MAX	128U
-
 /*
  * @reservation - pointer hasn't been written to, just reserved
  */
@@ -578,11 +581,12 @@ BKEY_VAL_TYPE(reservation,	BCH_RESERVATION);

 #define BLOCKDEV_INODE_MAX	4096

-#define BCACHE_ROOT_INO		4096
+#define BCACHEFS_ROOT_INO	4096

 enum bch_inode_types {
 	BCH_INODE_FS		= 128,
 	BCH_INODE_BLOCKDEV	= 129,
+	BCH_INODE_GENERATION	= 130,
 };

 struct bch_inode {
@@ -595,6 +599,15 @@ struct bch_inode {
 } __attribute__((packed, aligned(8)));
 BKEY_VAL_TYPE(inode,		BCH_INODE_FS);

+struct bch_inode_generation {
+	struct bch_val		v;
+
+	__le32			i_generation;
+	__le32			pad;
+} __attribute__((packed, aligned(8)));
+BKEY_VAL_TYPE(inode_generation,	BCH_INODE_GENERATION);
+
+
 #define BCH_INODE_FIELDS()				\
 	BCH_INODE_FIELD(i_atime,	64)		\
 	BCH_INODE_FIELD(i_ctime,	64)		\
@@ -735,24 +748,14 @@ BKEY_VAL_TYPE(alloc,	BCH_ALLOC);

 /* Superblock */

-/* Version 0: Cache device
- * Version 1: Backing device
- * Version 2: Seed pointer into btree node checksum
- * Version 3: Cache device with new UUID format
- * Version 4: Backing device with data offset
- * Version 5: All the incompat changes
- * Version 6: Cache device UUIDs all in superblock, another incompat bset change
- * Version 7: Encryption (expanded checksum fields), other random things
+/*
+ * Version 8: BCH_SB_ENCODED_EXTENT_MAX_BITS
+ *	      BCH_MEMBER_DATA_ALLOWED
  */
-#define BCACHE_SB_VERSION_CDEV_V0	0
-#define BCACHE_SB_VERSION_BDEV		1
-#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3
-#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4
-#define BCACHE_SB_VERSION_CDEV_V2	5
-#define BCACHE_SB_VERSION_CDEV_V3	6
-#define BCACHE_SB_VERSION_CDEV_V4	7
-#define BCACHE_SB_VERSION_CDEV		7
-#define BCACHE_SB_MAX_VERSION		7
+
+#define BCH_SB_VERSION_MIN		7
+#define BCH_SB_VERSION_EXTENT_MAX	8
+#define BCH_SB_VERSION_MAX		8

 #define BCH_SB_SECTOR			8
 #define BCH_SB_LABEL_SIZE		32
@@ -774,6 +777,7 @@ LE64_BITMASK(BCH_MEMBER_TIER,		struct bch_member, flags[0],  4,  8)
 /* 8-10 unused, was HAS_(META)DATA */
 LE64_BITMASK(BCH_MEMBER_REPLACEMENT,	struct bch_member, flags[0], 10, 14)
 LE64_BITMASK(BCH_MEMBER_DISCARD,	struct bch_member, flags[0], 14, 15);
+LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED,	struct bch_member, flags[0], 15, 20);

 #if 0
 LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS,	struct bch_member, flags[1], 0, 20);
@@ -880,7 +884,7 @@ LE64_BITMASK(BCH_KDF_SCRYPT_N,	struct bch_sb_field_crypt, kdf_flags,  0, 16);
 LE64_BITMASK(BCH_KDF_SCRYPT_R,	struct bch_sb_field_crypt, kdf_flags, 16, 32);
 LE64_BITMASK(BCH_KDF_SCRYPT_P,	struct bch_sb_field_crypt, kdf_flags, 32, 48);

-enum bch_data_types {
+enum bch_data_type {
 	BCH_DATA_NONE		= 0,
 	BCH_DATA_SB		= 1,
 	BCH_DATA_JOURNAL	= 2,
@@ -981,7 +985,12 @@ LE64_BITMASK(BCH_SB_INODE_32BIT,	struct bch_sb, flags[1],  8,  9);
 LE64_BITMASK(BCH_SB_128_BIT_MACS,	struct bch_sb, flags[1],  9, 10);
 LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE,	struct bch_sb, flags[1], 10, 14);

-/* 14-20 unused, was JOURNAL_ENTRY_SIZE */
+/*
+ * Max size of an extent that may require bouncing to read or write
+ * (checksummed, compressed): 64k
+ */
+LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS,
+					struct bch_sb, flags[1], 14, 20);

 LE64_BITMASK(BCH_SB_META_REPLICAS_REQ,	struct bch_sb, flags[1], 20, 24);
 LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ,	struct bch_sb, flags[1], 24, 28);
@@ -1032,10 +1041,10 @@ enum bch_str_hash_opts {
 };

 enum bch_compression_opts {
-	BCH_COMPRESSION_NONE		= 0,
-	BCH_COMPRESSION_LZ4		= 1,
-	BCH_COMPRESSION_GZIP		= 2,
-	BCH_COMPRESSION_NR		= 3,
+	BCH_COMPRESSION_OPT_NONE	= 0,
+	BCH_COMPRESSION_OPT_LZ4		= 1,
+	BCH_COMPRESSION_OPT_GZIP	= 2,
+	BCH_COMPRESSION_OPT_NR		= 3,
 };

 /*
@@ -1049,7 +1058,7 @@ enum bch_compression_opts {
 	UUID_LE(0xf67385c6, 0x1a4e, 0xca45,				\
 		0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81)

-#define BCACHE_STATFS_MAGIC		0xca451a4e
+#define BCACHEFS_STATFS_MAGIC		0xca451a4e

 #define JSET_MAGIC		__cpu_to_le64(0x245235c1a3625032ULL)
 #define BSET_MAGIC		__cpu_to_le64(0x90135c78b99e07f5ULL)
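[Note: BCH_SB_ENCODED_EXTENT_MAX_BITS packs the maximum encoded-extent size into bits 14-20 of an existing little-endian superblock flags word; the LE64_BITMASK() macros generate get/set accessors for such bit ranges. A freestanding sketch of the same get/set arithmetic - the helper names are invented, endian conversion is omitted, and the assumption (consistent with the 64k comment and the new u16 c->sb.encoded_extent_max) is that the field stores log2 of the size in sectors:

#include <stdint.h>
#include <stdio.h>

/* get/set a bit range [lo, hi) of a 64-bit flags word: */
static uint64_t bits_get(uint64_t flags, unsigned lo, unsigned hi)
{
	return (flags << (64 - hi)) >> (64 - (hi - lo));
}

static uint64_t bits_set(uint64_t flags, unsigned lo, unsigned hi, uint64_t v)
{
	uint64_t mask = (~0ULL >> (64 - (hi - lo))) << lo;

	return (flags & ~mask) | ((v << lo) & mask);
}

int main(void)
{
	uint64_t flags = 0;

	/* BCH_SB_ENCODED_EXTENT_MAX_BITS lives in bits 14-20: */
	flags = bits_set(flags, 14, 20, 7);	/* log2(128 sectors) = 64k */

	printf("encoded_extent_max = %llu sectors\n",
	       (unsigned long long) (1ULL << bits_get(flags, 14, 20)));
	return 0;
}
]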
diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c
index cc76257e..19bf1b8f 100644
--- a/libbcachefs/bkey.c
+++ b/libbcachefs/bkey.c
@@ -287,7 +287,7 @@ struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format,
 	return out;
 }

-#ifndef HAVE_BCACHE_COMPILED_UNPACK
+#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
 struct bpos __bkey_unpack_pos(const struct bkey_format *format,
 			      const struct bkey_packed *in)
 {
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index 0511e1fa..dc0b88f7 100644
--- a/libbcachefs/bkey.h
+++ b/libbcachefs/bkey.h
@@ -1,5 +1,5 @@
-#ifndef _BCACHE_BKEY_H
-#define _BCACHE_BKEY_H
+#ifndef _BCACHEFS_BKEY_H
+#define _BCACHEFS_BKEY_H

 #include 
 #include "bcachefs_format.h"
@@ -345,7 +345,7 @@ bool bch2_bkey_transform(const struct bkey_format *,
 struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,
 				   const struct bkey_packed *);

-#ifndef HAVE_BCACHE_COMPILED_UNPACK
+#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
 struct bpos __bkey_unpack_pos(const struct bkey_format *,
 			      const struct bkey_packed *);
 #endif
@@ -382,7 +382,7 @@ static inline u64 bkey_field_max(const struct bkey_format *f,
 }

 #ifdef CONFIG_X86_64
-#define HAVE_BCACHE_COMPILED_UNPACK	1
+#define HAVE_BCACHEFS_COMPILED_UNPACK	1

 int bch2_compile_bkey_format(const struct bkey_format *, void *);

@@ -575,6 +575,7 @@ BKEY_VAL_ACCESSORS(reservation,		BCH_RESERVATION);

 BKEY_VAL_ACCESSORS(inode,		BCH_INODE_FS);
 BKEY_VAL_ACCESSORS(inode_blockdev,	BCH_INODE_BLOCKDEV);
+BKEY_VAL_ACCESSORS(inode_generation,	BCH_INODE_GENERATION);

 BKEY_VAL_ACCESSORS(dirent,		BCH_DIRENT);

@@ -612,4 +613,4 @@ void bch2_bkey_pack_test(void);
 static inline void bch2_bkey_pack_test(void) {}
 #endif

-#endif /* _BCACHE_BKEY_H */
+#endif /* _BCACHEFS_BKEY_H */
diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h
index 2d526f56..29c1abd3 100644
--- a/libbcachefs/bkey_methods.h
+++ b/libbcachefs/bkey_methods.h
@@ -1,5 +1,5 @@
-#ifndef _BCACHE_BKEY_METHODS_H
-#define _BCACHE_BKEY_METHODS_H
+#ifndef _BCACHEFS_BKEY_METHODS_H
+#define _BCACHEFS_BKEY_METHODS_H

 #include "bkey.h"

@@ -10,6 +10,8 @@ enum bkey_type {
 	BKEY_TYPE_BTREE,
 };

+#undef DEF_BTREE_ID
+
 /* Type of a key in btree @id at level @level: */
 static inline enum bkey_type bkey_type(unsigned level, enum btree_id id)
 {
@@ -77,6 +79,4 @@ void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,

 extern const struct bkey_ops *bch2_bkey_ops[];

-#undef DEF_BTREE_ID
-
-#endif /* _BCACHE_BKEY_METHODS_H */
+#endif /* _BCACHEFS_BKEY_METHODS_H */
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index 53627380..10f3f3f3 100644
--- a/libbcachefs/bset.c
+++ b/libbcachefs/bset.c
@@ -691,7 +691,7 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
 	struct bkey_packed *l, *r;
 	unsigned bits = j < BFLOAT_32BIT_NR ? 32 : 16;
 	unsigned mantissa;
-	int shift, exponent;
+	int shift, exponent, high_bit;

 	EBUG_ON(bkey_next(p) != m);

@@ -737,7 +737,8 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
 	 */

 	if (!bkey_packed(l) || !bkey_packed(r) ||
-	    !bkey_packed(p) || !bkey_packed(m)) {
+	    !bkey_packed(p) || !bkey_packed(m) ||
+	    !b->nr_key_bits) {
 		f->exponent = BFLOAT_FAILED_UNPACKED;
 		return;
 	}
@@ -752,7 +753,9 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
 	 * Note that this may be negative - we may be running off the low end
 	 * of the key: we handle this later:
 	 */
-	exponent = (int) bch2_bkey_greatest_differing_bit(b, l, r) - (bits - 1);
+	high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r),
+		       min_t(unsigned, bits, b->nr_key_bits) - 1);
+	exponent = high_bit - (bits - 1);

 	/*
 	 * Then we calculate the actual shift value, from the start of the key
@@ -761,16 +764,16 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
 #ifdef __LITTLE_ENDIAN
 	shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;

-	EBUG_ON(shift + bits > b->format.key_u64s * 64);
+	BUG_ON(shift + bits > b->format.key_u64s * 64);
 #else
 	shift = high_bit_offset +
 		b->nr_key_bits -
 		exponent -
 		bits;

-	EBUG_ON(shift < KEY_PACKED_BITS_START);
+	BUG_ON(shift < KEY_PACKED_BITS_START);
 #endif

-	EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
+	BUG_ON(shift < 0 || shift >= BFLOAT_FAILED);

 	f->exponent = shift;
 	mantissa = bkey_mantissa(m, f, j);
diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h
index 660a7283..a1337bf8 100644
--- a/libbcachefs/bset.h
+++ b/libbcachefs/bset.h
@@ -1,5 +1,5 @@
-#ifndef _BCACHE_BSET_H
-#define _BCACHE_BSET_H
+#ifndef _BCACHEFS_BSET_H
+#define _BCACHEFS_BSET_H

 #include 
 #include 
@@ -183,7 +183,7 @@ bkey_unpack_key_format_checked(const struct btree *b,
 {
 	struct bkey dst;

-#ifdef HAVE_BCACHE_COMPILED_UNPACK
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
 	{
 		compiled_unpack_fn unpack_fn = b->aux_data;
 		unpack_fn(&dst, src);
@@ -221,7 +221,7 @@ static inline struct bpos
 bkey_unpack_pos_format_checked(const struct btree *b,
 			       const struct bkey_packed *src)
 {
-#ifdef HAVE_BCACHE_COMPILED_UNPACK
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
 	return bkey_unpack_key_format_checked(b, src).p;
 #else
 	return __bkey_unpack_pos(&b->format, src);
@@ -618,4 +618,4 @@ static inline void bch2_verify_btree_nr_keys(struct btree *b)
 		__bch2_verify_btree_nr_keys(b);
 }

-#endif
+#endif /* _BCACHEFS_BSET_H */
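[Note: make_bfloat() computes, per lookup-tree node, how far to shift a packed key so its most significant differing bits land in a 16- or 32-bit mantissa; the new high_bit clamp keeps the shift in range when keys have fewer significant bits than the mantissa is wide. A worked little-endian example of the same arithmetic, with all the geometry numbers invented:

#include <stdio.h>

int main(void)
{
	unsigned key_u64s = 2;		/* packed keys are 128 bits   */
	unsigned nr_key_bits = 40;	/* of which 40 bits are key   */
	unsigned bits = 16;		/* 16-bit mantissa this level */

	/* pretend adjacent keys first differ at bit 20 of the key: */
	unsigned differing_bit = 20;

	/* clamp: never below mantissa width - 1 (the new high_bit logic) */
	unsigned min_high_bit = (bits < nr_key_bits ? bits : nr_key_bits) - 1;
	unsigned high_bit = differing_bit > min_high_bit
		? differing_bit : min_high_bit;

	int exponent = (int) high_bit - (int) (bits - 1);
	int shift = (int) (key_u64s * 64 - nr_key_bits) + exponent;

	printf("high_bit %u exponent %d shift %d\n", high_bit, exponent, shift);
	/* shift + bits must stay within the 128-bit packed key: */
	printf("shift + bits = %d <= %u\n", shift + bits, key_u64s * 64);
	return 0;
}
]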
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 03c77b4c..0be372c4 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -180,8 +180,8 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
 		btree_node_wait_on_io(b);
 	}
 out:
-	if (PTR_HASH(&b->key))
-		trace_btree_node_reap(c, b, ret);
+	if (PTR_HASH(&b->key) && !ret)
+		trace_btree_node_reap(c, b);
 	return ret;
 out_unlock:
 	six_unlock_write(&b->lock);
diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h
index 31556044..ce10a4a9 100644
--- a/libbcachefs/btree_cache.h
+++ b/libbcachefs/btree_cache.h
@@ -1,5 +1,5 @@
-#ifndef _BCACHE_BTREE_CACHE_H
-#define _BCACHE_BTREE_CACHE_H
+#ifndef _BCACHEFS_BTREE_CACHE_H
+#define _BCACHEFS_BTREE_CACHE_H

 #include "bcachefs.h"
 #include "btree_types.h"
@@ -59,14 +59,14 @@ static inline size_t btree_max_u64s(struct bch_fs *c)
 	return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64);
 }

-static inline size_t btree_pages(struct bch_fs *c)
-{
-	return c->sb.btree_node_size >> (PAGE_SHIFT - 9);
-}
-
 static inline size_t btree_page_order(struct bch_fs *c)
 {
-	return ilog2(btree_pages(c));
+	return get_order(btree_bytes(c));
+}
+
+static inline size_t btree_pages(struct bch_fs *c)
+{
+	return 1 << btree_page_order(c);
 }

 static inline unsigned btree_blocks(struct bch_fs *c)
@@ -86,4 +86,4 @@ static inline unsigned btree_blocks(struct bch_fs *c)

 int bch2_print_btree_node(struct bch_fs *, struct btree *, char *, size_t);

-#endif /* _BCACHE_BTREE_CACHE_H */
+#endif /* _BCACHEFS_BTREE_CACHE_H */
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 212bb5f8..2bd2887a 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -129,7 +129,7 @@ static u8 bch2_btree_mark_key(struct bch_fs *c, enum bkey_type type,
 int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
 				struct bkey_s_c k)
 {
-	enum bch_data_types data_type = type == BKEY_TYPE_BTREE
+	enum bch_data_type data_type = type == BKEY_TYPE_BTREE
 		? BCH_DATA_BTREE : BCH_DATA_USER;
 	int ret = 0;

@@ -152,20 +152,23 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
 			struct bch_dev *ca = c->devs[ptr->dev];
 			struct bucket *g = PTR_BUCKET(ca, ptr);

-			if (!g->mark.gen_valid) {
+			if (mustfix_fsck_err_on(!g->mark.gen_valid, c,
+					"found ptr with missing gen in alloc btree,\n"
+					"type %s gen %u",
+					bch2_data_types[data_type],
+					ptr->gen)) {
 				g->_mark.gen = ptr->gen;
 				g->_mark.gen_valid = 1;
-				ca->need_alloc_write = true;
+				set_bit(g - ca->buckets, ca->bucket_dirty);
 			}

-			if (fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
+			if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
 					"%s ptr gen in the future: %u > %u",
-					type == BKEY_TYPE_BTREE
-					? "btree" : "data",
+					bch2_data_types[data_type],
 					ptr->gen, g->mark.gen)) {
 				g->_mark.gen = ptr->gen;
 				g->_mark.gen_valid = 1;
-				ca->need_alloc_write = true;
+				set_bit(g - ca->buckets, ca->bucket_dirty);
 				set_bit(BCH_FS_FIXED_GENS, &c->flags);
 			}

@@ -308,12 +311,12 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
 static void mark_metadata_sectors(struct bch_dev *ca, u64 start, u64 end,
 				  enum bucket_data_type type)
 {
-	u64 b = start >> ca->bucket_bits;
+	u64 b = sector_to_bucket(ca, start);

 	do {
 		bch2_mark_metadata_bucket(ca, ca->buckets + b, type, true);
 		b++;
-	} while (b < end >> ca->bucket_bits);
+	} while (b < sector_to_bucket(ca, end));
 }

 static void bch2_dev_mark_superblocks(struct bch_dev *ca)
@@ -608,7 +611,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
 		return;
 	}

-	trace_btree_gc_coalesce(c, parent, nr_old_nodes);
+	trace_btree_gc_coalesce(c, old_nodes[0]);

 	for (i = 0; i < nr_old_nodes; i++)
 		bch2_btree_interior_update_will_free_node(as, old_nodes[i]);
diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h
index 1d461518..27dcc06c 100644
--- a/libbcachefs/btree_gc.h
+++ b/libbcachefs/btree_gc.h
@@ -1,5 +1,5 @@
-#ifndef _BCACHE_GC_H
-#define _BCACHE_GC_H
+#ifndef _BCACHEFS_BTREE_GC_H
+#define _BCACHEFS_BTREE_GC_H

 #include "btree_types.h"

@@ -101,4 +101,4 @@ static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos)
 	return ret;
 }

-#endif
+#endif /* _BCACHEFS_BTREE_GC_H */
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 89724f32..0eb27eae 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1292,6 +1292,9 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
 	bio->bi_iter.bi_size	= btree_bytes(c);
 	bch2_bio_map(bio, b->data);

+	this_cpu_add(pick.ca->io_done->sectors[READ][BCH_DATA_BTREE],
+		     bio_sectors(bio));
+
 	set_btree_node_read_in_flight(b);

 	if (sync) {
@@ -1702,13 +1705,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 	extent_for_each_ptr(e, ptr)
 		ptr->offset += b->written;

-	extent_for_each_ptr(e, ptr)
-		atomic64_add(sectors_to_write,
-			     &c->devs[ptr->dev]->btree_sectors_written);
-
 	b->written += sectors_to_write;

-	bch2_submit_wbio_replicas(wbio, c, &k.key);
+	bch2_submit_wbio_replicas(wbio, c, BCH_DATA_BTREE, &k.key);
 	return;
 err:
 	set_btree_node_noevict(b);
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 877ada66..537b8e1d 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -1,5 +1,5 @@
-#ifndef _BCACHE_BTREE_IO_H
-#define _BCACHE_BTREE_IO_H
+#ifndef _BCACHEFS_BTREE_IO_H
+#define _BCACHEFS_BTREE_IO_H

 #include "extents.h"

@@ -109,4 +109,4 @@ do {									\

 void bch2_btree_verify_flushed(struct bch_fs *);

-#endif /* _BCACHE_BTREE_IO_H */
+#endif /* _BCACHEFS_BTREE_IO_H */
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 8ad08953..f4f73bfc 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -249,10 +249,10 @@ fail:

 static void __bch2_btree_iter_unlock(struct btree_iter *iter)
 {
+	iter->flags &= ~BTREE_ITER_UPTODATE;
+
 	while (iter->nodes_locked)
 		btree_node_unlock(iter, __ffs(iter->nodes_locked));
-
-	iter->flags &= ~BTREE_ITER_UPTODATE;
 }

 int bch2_btree_iter_unlock(struct btree_iter *iter)
@@ -627,9 +627,9 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
 	unsigned level = b->level;

 	if (iter->nodes[level] == b) {
+		iter->flags &= ~BTREE_ITER_UPTODATE;
 		btree_node_unlock(iter, level);
 		iter->nodes[level] = BTREE_ITER_NOT_END;
-		iter->flags &= ~BTREE_ITER_UPTODATE;
 	}
 }

@@ -840,6 +840,11 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
 {
 	unsigned depth_want = iter->level;

+	if (unlikely(!iter->nodes[iter->level]))
+		return 0;
+
+	iter->flags &= ~(BTREE_ITER_UPTODATE|BTREE_ITER_AT_END_OF_LEAF);
+
 	/* make sure we have all the intent locks we need - ugh */
 	if (unlikely(iter->nodes[iter->level] &&
 		     iter->level + 1 < iter->locks_want)) {
@@ -893,6 +898,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
 			: btree_iter_lock_root(iter, depth_want);
 		if (unlikely(ret)) {
 			iter->level = depth_want;
+			iter->nodes[iter->level] = BTREE_ITER_NOT_END;
 			return ret;
 		}
 	}
@@ -904,13 +910,6 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *iter)
 {
 	int ret;

-	iter->flags &= ~BTREE_ITER_UPTODATE;
-
-	if (unlikely(!iter->nodes[iter->level]))
-		return 0;
-
-	iter->flags &= ~BTREE_ITER_AT_END_OF_LEAF;
-
 	ret = __bch2_btree_iter_traverse(iter);
 	if (unlikely(ret))
 		ret = btree_iter_traverse_error(iter, ret);
@@ -1068,6 +1067,8 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
 			.v = bkeyp_val(&b->format, k)
 		};

+		EBUG_ON(!btree_node_locked(iter, 0));
+
 		if (debug_check_bkeys(iter->c))
 			bch2_bkey_debugcheck(iter->c, b, ret);
 		return ret;
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 34e5035e..a7fdba82 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -1,5 +1,5 @@
-#ifndef _BCACHE_BTREE_ITER_H
-#define _BCACHE_BTREE_ITER_H
+#ifndef _BCACHEFS_BTREE_ITER_H
+#define _BCACHEFS_BTREE_ITER_H

 #include "btree_types.h"

@@ -263,4 +263,4 @@ static inline void bch2_btree_iter_cond_resched(struct btree_iter *iter)
 	}
 }

-#endif /* _BCACHE_BTREE_ITER_H */
+#endif /* _BCACHEFS_BTREE_ITER_H */
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index 86c19540..0c174e4e 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -1,5 +1,5 @@
-#ifndef _BCACHE_BTREE_LOCKING_H
-#define _BCACHE_BTREE_LOCKING_H
+#ifndef _BCACHEFS_BTREE_LOCKING_H
+#define _BCACHEFS_BTREE_LOCKING_H

 /*
  * Only for internal btree use:
@@ -91,6 +91,8 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
 {
 	int lock_type = btree_node_locked_type(iter, level);

+	EBUG_ON(iter->flags & BTREE_ITER_UPTODATE);
+
 	if (lock_type != BTREE_NODE_UNLOCKED)
 		six_unlock_type(&iter->nodes[level]->lock, lock_type);
 	mark_btree_node_unlocked(iter, level);
@@ -113,4 +115,4 @@ bool bch2_btree_node_relock(struct btree_iter *, unsigned);
 void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
 void bch2_btree_node_lock_write(struct btree *, struct btree_iter *);

-#endif /* _BCACHE_BTREE_LOCKING_H */
+#endif /* _BCACHEFS_BTREE_LOCKING_H */
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index d3ba28bd..c0c16205 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -1,5 +1,5 @@
-#ifndef _BCACHE_BTREE_TYPES_H
-#define _BCACHE_BTREE_TYPES_H
+#ifndef _BCACHEFS_BTREE_TYPES_H
+#define _BCACHEFS_BTREE_TYPES_H

 #include 
 #include 
@@ -321,4 +321,4 @@ typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
 							struct btree *,
 							struct btree_node_iter *);

-#endif /* _BCACHE_BTREE_TYPES_H */
+#endif /* _BCACHEFS_BTREE_TYPES_H */
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index 584f0f5e..e11fcec9 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -1,5 +1,5 @@
-#ifndef _BCACHE_BTREE_UPDATE_H
-#define _BCACHE_BTREE_UPDATE_H
+#ifndef _BCACHEFS_BTREE_UPDATE_H
+#define _BCACHEFS_BTREE_UPDATE_H

 #include "btree_iter.h"
 #include "journal.h"

@@ -133,5 +133,4 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
 int bch2_btree_node_update_key(struct bch_fs *, struct btree *,
 			       struct bkey_i_extent *);

-#endif /* _BCACHE_BTREE_UPDATE_H */
-
+#endif /* _BCACHEFS_BTREE_UPDATE_H */
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 350e2f9b..98e85627 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -1310,7 +1310,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
 	btree_split_insert_keys(as, n1, iter, keys);

 	if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) {
-		trace_btree_node_split(c, b, b->nr.live_u64s);
+		trace_btree_split(c, b);

 		n2 = __btree_split_node(as, n1, iter);

@@ -1340,7 +1340,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
 			bch2_btree_node_write(c, n3, &as->cl, SIX_LOCK_intent);
 		}
 	} else {
-		trace_btree_node_compact(c, b, b->nr.live_u64s);
+		trace_btree_compact(c, b);

 		bch2_btree_build_aux_trees(n1);
 		six_unlock_write(&n1->lock);
@@ -1882,12 +1882,13 @@ retry:

 	if (new_hash) {
 		mutex_lock(&c->btree_cache_lock);
+		bch2_btree_node_hash_remove(c, new_hash);
+
 		bch2_btree_node_hash_remove(c, b);

 		bkey_copy(&b->key, &new_key->k_i);
-		__bch2_btree_node_hash_insert(c, b);
-
-		bch2_btree_node_hash_remove(c, new_hash);
+		ret = __bch2_btree_node_hash_insert(c, b);
+		BUG_ON(ret);
 		mutex_unlock(&c->btree_cache_lock);
 	} else {
 		bkey_copy(&b->key, &new_key->k_i);
@@ -1959,7 +1960,10 @@ int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,

 	while (1) {
 		/* XXX haven't calculated capacity yet :/ */
-		as = bch2_btree_update_start(c, id, 1, 0, &cl);
+		as = bch2_btree_update_start(c, id, 1,
+					     BTREE_INSERT_USE_RESERVE|
+					     BTREE_INSERT_USE_ALLOC_RESERVE,
+					     &cl);
 		if (!IS_ERR(as))
 			break;
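[Note: the node-update-key fix above reorders the hash-table dance: both the placeholder node and the node being rekeyed are removed from the btree node cache's hash table before the key - which is also the hash lookup key - is overwritten, and only then is the node reinserted, with the insert's return value checked. A toy standalone illustration of why rekeying an object in place requires remove-then-reinsert; the chained table and types here are invented, far simpler than the real cache:

#include <stdio.h>

#define NR_BUCKETS 8

struct node {
	unsigned	key;	/* hash key; mirrors the btree node's bkey */
	struct node	*next;
};

static struct node *table[NR_BUCKETS];

static void hash_insert(struct node *n)
{
	struct node **h = &table[n->key % NR_BUCKETS];

	n->next = *h;
	*h = n;
}

static void hash_remove(struct node *n)
{
	struct node **h = &table[n->key % NR_BUCKETS];

	for (; *h; h = &(*h)->next)
		if (*h == n) {
			*h = n->next;
			return;
		}
}

static struct node *hash_lookup(unsigned key)
{
	struct node *n = table[key % NR_BUCKETS];

	for (; n; n = n->next)
		if (n->key == key)
			return n;
	return NULL;
}

int main(void)
{
	struct node n = { .key = 10 };

	hash_insert(&n);

	/* Rekey: remove under the old key first - overwriting n.key while
	 * still hashed would strand the node in the wrong chain: */
	hash_remove(&n);
	n.key = 19;
	hash_insert(&n);

	printf("lookup(10) = %p, lookup(19) = %p\n",
	       (void *) hash_lookup(10), (void *) hash_lookup(19));
	return 0;
}
]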
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index b1fa06c6..8f75963b 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -1,5 +1,5 @@
-#ifndef _BCACHE_BTREE_UPDATE_INTERIOR_H
-#define _BCACHE_BTREE_UPDATE_INTERIOR_H
+#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H
+#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H

 #include "btree_cache.h"
 #include "btree_update.h"

@@ -309,4 +309,4 @@ static inline bool journal_res_insert_fits(struct btree_insert *trans,
 	return u64s <= trans->journal_res.u64s;
 }

-#endif /* _BCACHE_BTREE_UPDATE_INTERIOR_H */
+#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index a113d0d0..9be11217 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -314,7 +314,8 @@ bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
 	}));

 	if (!old->owned_by_allocator && old->cached_sectors)
-		trace_invalidate(ca, g - ca->buckets, old->cached_sectors);
+		trace_invalidate(ca, bucket_to_sector(ca, g - ca->buckets),
+				 old->cached_sectors);
 	return true;
 }

@@ -522,7 +523,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
 	if (saturated &&
 	    atomic_long_add_return(saturated,
 				   &ca->saturated_count) >=
-	    ca->free_inc.size << ca->bucket_bits) {
+	    bucket_to_sector(ca, ca->free_inc.size)) {
 		if (c->gc_thread) {
 			trace_gc_sectors_saturated(c);
 			wake_up_process(c->gc_thread);
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 618802c4..141aa4ad 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -45,28 +45,6 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
 	return sector_to_bucket(ca, ptr->offset);
 }

-/*
- * Returns 0 if no pointers or device offline - only for tracepoints!
- */
-static inline size_t PTR_BUCKET_NR_TRACE(const struct bch_fs *c,
-					 const struct bkey_i *k,
-					 unsigned ptr)
-{
-	size_t bucket = 0;
-#if 0
-	if (bkey_extent_is_data(&k->k)) {
-		const struct bch_extent_ptr *ptr;
-
-		extent_for_each_ptr(bkey_i_to_s_c_extent(k), ptr) {
-			const struct bch_dev *ca = c->devs[ptr->dev];
-			bucket = PTR_BUCKET_NR(ca, ptr);
-			break;
-		}
-	}
-#endif
-	return bucket;
-}
-
 static inline struct bucket *PTR_BUCKET(const struct bch_dev *ca,
 					const struct bch_extent_ptr *ptr)
 {
diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h
index 396d7703..63f1b27f 100644
--- a/libbcachefs/buckets_types.h
+++ b/libbcachefs/buckets_types.h
@@ -3,7 +3,7 @@

 #include "util.h"

-/* kill, switch to bch_data_types */
+/* kill, switch to bch_data_type */
 enum bucket_data_type {
 	BUCKET_DATA	= 0,
 	BUCKET_BTREE,
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index 47af7a25..d9a3212c 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -1,3 +1,5 @@
+#ifndef NO_BCACHEFS_CHARDEV
+
 #include "bcachefs.h"
 #include "bcachefs_ioctl.h"
 #include "super.h"
@@ -404,3 +406,5 @@ int __init bch2_chardev_init(void)

 	return 0;
 }
+
+#endif /* NO_BCACHEFS_CHARDEV */
diff --git a/libbcachefs/chardev.h b/libbcachefs/chardev.h
index e0e34e24..c3057b07 100644
--- a/libbcachefs/chardev.h
+++ b/libbcachefs/chardev.h
@@ -1,7 +1,7 @@
-#ifndef _BCACHE_CHARDEV_H
-#define _BCACHE_CHARDEV_H
+#ifndef _BCACHEFS_CHARDEV_H
+#define _BCACHEFS_CHARDEV_H

-#ifndef NO_BCACHE_CHARDEV
+#ifndef NO_BCACHEFS_FS

 long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *);

@@ -25,6 +25,6 @@ static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; }
 static inline void bch2_chardev_exit(void) {}
 static inline int __init bch2_chardev_init(void) { return 0; }

-#endif
+#endif /* NO_BCACHEFS_FS */

-#endif /* _BCACHE_CHARDEV_H */
+#endif /* _BCACHEFS_CHARDEV_H */
diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h
index f540e305..15d15b92 100644
--- a/libbcachefs/checksum.h
+++ b/libbcachefs/checksum.h
@@ -1,5 +1,5 @@
-#ifndef _BCACHE_CHECKSUM_H
-#define _BCACHE_CHECKSUM_H
+#ifndef _BCACHEFS_CHECKSUM_H
+#define _BCACHEFS_CHECKSUM_H

 #include "bcachefs.h"
 #include "super-io.h"
@@ -46,21 +46,51 @@ int bch2_enable_encryption(struct bch_fs *, bool);
 void bch2_fs_encryption_exit(struct bch_fs *);
 int bch2_fs_encryption_init(struct bch_fs *);

-static inline unsigned bch2_data_checksum_type(struct bch_fs *c)
+static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type)
+{
+	switch (type) {
+	case BCH_CSUM_OPT_NONE:
+		return BCH_CSUM_NONE;
+	case BCH_CSUM_OPT_CRC32C:
+		return BCH_CSUM_CRC32C;
+	case BCH_CSUM_OPT_CRC64:
+		return BCH_CSUM_CRC64;
+	default:
+		BUG();
+	}
+}
+
+static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c)
 {
 	if (c->sb.encryption_type)
 		return c->opts.wide_macs
 			? BCH_CSUM_CHACHA20_POLY1305_128
 			: BCH_CSUM_CHACHA20_POLY1305_80;

-	return c->opts.data_checksum;
+	return bch2_csum_opt_to_type(c->opts.data_checksum);
 }

-static inline unsigned bch2_meta_checksum_type(struct bch_fs *c)
+static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
 {
-	return c->sb.encryption_type
-		? BCH_CSUM_CHACHA20_POLY1305_128
-		: c->opts.metadata_checksum;
+	if (c->sb.encryption_type)
+		return BCH_CSUM_CHACHA20_POLY1305_128;
+
+	return bch2_csum_opt_to_type(c->opts.metadata_checksum);
+}
+
+static inline enum bch_compression_type
+bch2_compression_opt_to_type(enum bch_compression_opts type)
+{
+	switch (type) {
+	case BCH_COMPRESSION_OPT_NONE:
+		return BCH_COMPRESSION_NONE;
+	case BCH_COMPRESSION_OPT_LZ4:
+		return BCH_COMPRESSION_LZ4;
+	case BCH_COMPRESSION_OPT_GZIP:
+		return BCH_COMPRESSION_GZIP;
+	default:
+		BUG();
+	}
 }

 static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
@@ -130,4 +160,4 @@ static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c)
 	}};
 }

-#endif /* _BCACHE_CHECKSUM_H */
+#endif /* _BCACHEFS_CHECKSUM_H */
diff --git a/libbcachefs/clock.h b/libbcachefs/clock.h
index 061bf04a..af6b2b39 100644
--- a/libbcachefs/clock.h
+++ b/libbcachefs/clock.h
@@ -1,5 +1,5 @@
-#ifndef _BCACHE_CLOCK_H
-#define _BCACHE_CLOCK_H
+#ifndef _BCACHEFS_CLOCK_H
+#define _BCACHEFS_CLOCK_H

 void bch2_io_timer_add(struct io_clock *, struct io_timer *);
 void bch2_io_timer_del(struct io_clock *, struct io_timer *);
@@ -20,4 +20,4 @@ void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
 void bch2_io_clock_exit(struct io_clock *);
 int bch2_io_clock_init(struct io_clock *);

-#endif /* _BCACHE_CLOCK_H */
+#endif /* _BCACHEFS_CLOCK_H */
diff --git a/libbcachefs/clock_types.h b/libbcachefs/clock_types.h
index ae068c6d..bfd4b303 100644
--- a/libbcachefs/clock_types.h
+++ b/libbcachefs/clock_types.h
@@ -1,5 +1,5 @@
-#ifndef _BCACHE_CLOCK_TYPES_H
-#define _BCACHE_CLOCK_TYPES_H
+#ifndef _BCACHEFS_CLOCK_TYPES_H
+#define _BCACHEFS_CLOCK_TYPES_H

 #include "util.h"

@@ -32,5 +32,4 @@ struct io_clock {
 	io_timer_heap	timers;
 };

-#endif /* _BCACHE_CLOCK_TYPES_H */
-
+#endif /* _BCACHEFS_CLOCK_TYPES_H */
BOUNCED_KMALLOCED, - BOUNCED_VMALLOCED, - BOUNCED_MEMPOOLED, +/* Bounce buffer: */ +struct bbuf { + void *b; + enum { + BB_NONE, + BB_VMAP, + BB_KMALLOC, + BB_VMALLOC, + BB_MEMPOOL, + } type; + int rw; }; -static void *__bounce_alloc(struct bch_fs *c, unsigned size, - unsigned *bounced, int direction) +static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) { - void *data; + void *b; - *bounced = BOUNCED_KMALLOCED; - data = kmalloc(size, GFP_NOIO|__GFP_NOWARN); - if (data) - return data; + BUG_ON(size > c->sb.encoded_extent_max); - *bounced = BOUNCED_MEMPOOLED; - data = mempool_alloc(&c->compression_bounce[direction], GFP_NOWAIT); - if (data) - return page_address(data); + b = kmalloc(size, GFP_NOIO|__GFP_NOWARN); + if (b) + return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw }; - *bounced = BOUNCED_VMALLOCED; - data = vmalloc(size); - if (data) - return data; + b = mempool_alloc(&c->compression_bounce[rw], GFP_NOWAIT); + b = b ? page_address(b) : NULL; + if (b) + return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; - *bounced = BOUNCED_MEMPOOLED; - data = mempool_alloc(&c->compression_bounce[direction], GFP_NOIO); - return page_address(data); + b = vmalloc(size); + if (b) + return (struct bbuf) { .b = b, .type = BB_VMALLOC, .rw = rw }; + + b = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO); + b = b ? page_address(b) : NULL; + if (b) + return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; + + BUG(); } -static void *__bio_map_or_bounce(struct bch_fs *c, - struct bio *bio, struct bvec_iter start, - unsigned *bounced, int direction) +static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, + struct bvec_iter start, int rw) { + struct bbuf ret; struct bio_vec bv; struct bvec_iter iter; unsigned nr_pages = 0; @@ -53,18 +61,17 @@ static void *__bio_map_or_bounce(struct bch_fs *c, unsigned prev_end = PAGE_SIZE; void *data; - BUG_ON(bvec_iter_sectors(start) > BCH_ENCODED_EXTENT_MAX); + BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); #ifndef CONFIG_HIGHMEM - *bounced = BOUNCED_CONTIG; - __bio_for_each_contig_segment(bv, bio, iter, start) { if (bv.bv_len == start.bi_size) - return page_address(bv.bv_page) + bv.bv_offset; + return (struct bbuf) { + .b = page_address(bv.bv_page) + bv.bv_offset, + .type = BB_NONE, .rw = rw + }; } #endif - *bounced = BOUNCED_MAPPED; - __bio_for_each_segment(bv, bio, iter, start) { if ((!first && bv.bv_offset) || prev_end != PAGE_SIZE) @@ -90,41 +97,43 @@ static void *__bio_map_or_bounce(struct bch_fs *c, if (pages != stack_pages) kfree(pages); - return data + bio_iter_offset(bio, start); + if (data) + return (struct bbuf) { + .b = data + bio_iter_offset(bio, start), + .type = BB_VMAP, .rw = rw + }; bounce: - data = __bounce_alloc(c, start.bi_size, bounced, direction); + ret = __bounce_alloc(c, start.bi_size, rw); - if (direction == READ) - memcpy_from_bio(data, bio, start); + if (rw == READ) + memcpy_from_bio(ret.b, bio, start); - return data; + return ret; } -static void *bio_map_or_bounce(struct bch_fs *c, struct bio *bio, - unsigned *bounced, int direction) +static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw) { - return __bio_map_or_bounce(c, bio, bio->bi_iter, bounced, direction); + return __bio_map_or_bounce(c, bio, bio->bi_iter, rw); } -static void bio_unmap_or_unbounce(struct bch_fs *c, void *data, - unsigned bounced, int direction) +static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf) { - if (!data) - return; - - switch (bounced) { - case 
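/*
 * Note: the BB_VMAP case being introduced here stitches a multi-page bio
 * into one virtually contiguous mapping so the (de)compressor can treat it
 * as a flat buffer. The core of the technique with the stock kernel API,
 * assuming the bvecs cover whole pages (a simplified sketch, not the exact
 * code):
 *
 *	struct page *pages[16];		// assumes a small bio
 *	unsigned nr = 0;
 *	struct bio_vec bv;
 *	struct bvec_iter iter;
 *	void *data;
 *
 *	bio_for_each_segment(bv, bio, iter)
 *		pages[nr++] = bv.bv_page;
 *
 *	data = vmap(pages, nr, VM_MAP, PAGE_KERNEL);
 *	...
 *	vunmap(data);	// takes the page-aligned base, hence the
 *			// "& PAGE_MASK" in bio_unmap_or_unbounce()
 */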
BOUNCED_MAPPED: - vunmap((void *) ((unsigned long) data & PAGE_MASK)); - return; - case BOUNCED_KMALLOCED: - kfree(data); - return; - case BOUNCED_VMALLOCED: - vfree(data); - return; - case BOUNCED_MEMPOOLED: - mempool_free(virt_to_page(data), &c->compression_bounce[direction]); - return; + switch (buf.type) { + case BB_NONE: + break; + case BB_VMAP: + vunmap((void *) ((unsigned long) buf.b & PAGE_MASK)); + break; + case BB_KMALLOC: + kfree(buf.b); + break; + case BB_VMALLOC: + vfree(buf.b); + break; + case BB_MEMPOOL: + mempool_free(virt_to_page(buf.b), + &c->compression_bounce[buf.rw]); + break; } } @@ -138,23 +147,30 @@ static inline void zlib_set_workspace(z_stream *strm, void *workspace) static int __bio_uncompress(struct bch_fs *c, struct bio *src, void *dst_data, struct bch_extent_crc128 crc) { - void *src_data = NULL; - unsigned src_bounced; + struct bbuf src_data = { NULL }; size_t src_len = src->bi_iter.bi_size; size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9; int ret; - src_data = bio_map_or_bounce(c, src, &src_bounced, READ); + src_data = bio_map_or_bounce(c, src, READ); switch (crc.compression_type) { - case BCH_COMPRESSION_LZ4: - ret = lz4_decompress(src_data, &src_len, + case BCH_COMPRESSION_LZ4_OLD: + ret = bch2_lz4_decompress(src_data.b, &src_len, dst_data, dst_len); if (ret) { ret = -EIO; goto err; } break; + case BCH_COMPRESSION_LZ4: + ret = LZ4_decompress_safe(src_data.b, dst_data, + src_len, dst_len); + if (ret != dst_len) { + ret = -EIO; + goto err; + } + break; case BCH_COMPRESSION_GZIP: { void *workspace; z_stream strm; @@ -166,7 +182,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, workspace = c->zlib_workspace; } - strm.next_in = src_data; + strm.next_in = src_data.b; strm.avail_in = src_len; strm.next_out = dst_data; strm.avail_out = dst_len; @@ -191,7 +207,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, } ret = 0; err: - bio_unmap_or_unbounce(c, src_data, src_bounced, READ); + bio_unmap_or_unbounce(c, src_data); return ret; } @@ -199,21 +215,19 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, unsigned live_data_sectors, struct bch_extent_crc128 crc) { - void *dst_data = NULL; + struct bbuf dst_data = { NULL }; size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9; int ret = -ENOMEM; BUG_ON(DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS) > bio->bi_max_vecs); - /* XXX mempoolify */ - dst_data = kmalloc(dst_len, GFP_NOIO|__GFP_NOWARN); - if (!dst_data) { - dst_data = vmalloc(dst_len); - if (!dst_data) - goto err; - } + if (crc_uncompressed_size(NULL, &crc) > c->sb.encoded_extent_max || + crc_compressed_size(NULL, &crc) > c->sb.encoded_extent_max) + return -EIO; - ret = __bio_uncompress(c, bio, dst_data, crc); + dst_data = __bounce_alloc(c, dst_len, WRITE); + + ret = __bio_uncompress(c, bio, dst_data.b, crc); if (ret) goto err; @@ -231,9 +245,9 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, bio->bi_iter.bi_size = live_data_sectors << 9; copy_data: - memcpy_to_bio(bio, bio->bi_iter, dst_data + (crc.offset << 9)); + memcpy_to_bio(bio, bio->bi_iter, dst_data.b + (crc.offset << 9)); err: - kvfree(dst_data); + bio_unmap_or_unbounce(c, dst_data); return ret; use_mempool: /* @@ -251,67 +265,72 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, struct bio *dst, struct bvec_iter dst_iter, struct bch_extent_crc128 crc) { - void *dst_data = NULL; - unsigned dst_bounced; + struct bbuf dst_data = { NULL }; size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9; int ret = 
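/*
 * Note: with struct bbuf, callers no longer thread a separate "bounced"
 * flag next to the pointer; the buffer, how it was obtained, and the I/O
 * direction travel together, and one cleanup call frees it correctly.
 * Typical caller shape, condensed from __bio_uncompress() below:
 *
 *	struct bbuf src = bio_map_or_bounce(c, src_bio, READ);
 *
 *	ret = uncompress(src.b, src_len, dst, dst_len);
 *
 *	bio_unmap_or_unbounce(c, src);	// kfree/vfree/vunmap/mempool_free
 *					// selected by src.type
 *
 * One thing to flag: the new bounds check in bch2_bio_uncompress() below
 * rejects extents whose uncompressed size is less than ("<")
 * c->sb.encoded_extent_max, while bch2_bio_uncompress_inplace() rejects
 * greater (">"); the inplace form is the one that makes sense as a bounds
 * check.
 */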
-ENOMEM; - dst_data = dst_len == dst_iter.bi_size - ? __bio_map_or_bounce(c, dst, dst_iter, &dst_bounced, WRITE) - : __bounce_alloc(c, dst_len, &dst_bounced, WRITE); + if (crc_uncompressed_size(NULL, &crc) < c->sb.encoded_extent_max) + return -EIO; - ret = __bio_uncompress(c, src, dst_data, crc); + dst_data = dst_len == dst_iter.bi_size + ? __bio_map_or_bounce(c, dst, dst_iter, WRITE) + : __bounce_alloc(c, dst_len, WRITE); + + ret = __bio_uncompress(c, src, dst_data.b, crc); if (ret) goto err; - if (dst_bounced) - memcpy_to_bio(dst, dst_iter, dst_data + (crc.offset << 9)); + if (dst_data.type != BB_NONE) + memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9)); err: - bio_unmap_or_unbounce(c, dst_data, dst_bounced, WRITE); + bio_unmap_or_unbounce(c, dst_data); return ret; } static int __bio_compress(struct bch_fs *c, struct bio *dst, size_t *dst_len, struct bio *src, size_t *src_len, - unsigned compression_type) + unsigned *compression_type) { - void *src_data = NULL, *dst_data = NULL; - unsigned src_bounced, dst_bounced, pad; - int ret = -1; + struct bbuf src_data = { NULL }, dst_data = { NULL }; + unsigned pad; + int ret; - dst_data = bio_map_or_bounce(c, dst, &dst_bounced, WRITE); - src_data = bio_map_or_bounce(c, src, &src_bounced, READ); + dst_data = bio_map_or_bounce(c, dst, WRITE); + src_data = bio_map_or_bounce(c, src, READ); + + switch (*compression_type) { + case BCH_COMPRESSION_LZ4_OLD: + *compression_type = BCH_COMPRESSION_LZ4; - switch (compression_type) { case BCH_COMPRESSION_LZ4: { void *workspace; + int len = src->bi_iter.bi_size; - *dst_len = dst->bi_iter.bi_size; - *src_len = src->bi_iter.bi_size; + ret = 0; workspace = mempool_alloc(&c->lz4_workspace_pool, GFP_NOIO); - while (*src_len > block_bytes(c) && - (ret = lz4_compress(src_data, *src_len, - dst_data, dst_len, - workspace))) { + while (len > block_bytes(c) && + (!(ret = LZ4_compress_destSize( + src_data.b, dst_data.b, + &len, dst->bi_iter.bi_size, + workspace)) || + (len & (block_bytes(c) - 1)))) { /* * On error, the compressed data was bigger than - * dst_len, and -ret is the amount of data we were able - * to compress - round down to nearest block and try - * again: + * dst_len - round down to nearest block and try again: */ - BUG_ON(ret > 0); - BUG_ON(-ret >= *src_len); - - *src_len = round_down(-ret, block_bytes(c)); + len = round_down(len, block_bytes(c)); } mempool_free(workspace, &c->lz4_workspace_pool); - if (ret) + if (!ret) goto err; + + *src_len = len; + *dst_len = ret; break; } case BCH_COMPRESSION_GZIP: { @@ -326,10 +345,10 @@ static int __bio_compress(struct bch_fs *c, workspace = c->zlib_workspace; } - strm.next_in = src_data; + strm.next_in = src_data.b; strm.avail_in = min(src->bi_iter.bi_size, dst->bi_iter.bi_size); - strm.next_out = dst_data; + strm.next_out = dst_data.b; strm.avail_out = dst->bi_iter.bi_size; zlib_set_workspace(&strm, workspace); zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION, @@ -366,42 +385,37 @@ zlib_err: BUG(); } - BUG_ON(!*dst_len); - BUG_ON(*dst_len > dst->bi_iter.bi_size); - - BUG_ON(*src_len & (block_bytes(c) - 1)); - BUG_ON(*src_len > src->bi_iter.bi_size); - /* Didn't get smaller: */ - if (round_up(*dst_len, block_bytes(c)) >= *src_len) { - ret = -1; + if (round_up(*dst_len, block_bytes(c)) >= *src_len) goto err; - } pad = round_up(*dst_len, block_bytes(c)) - *dst_len; - memset(dst_data + *dst_len, 0, pad); + memset(dst_data.b + *dst_len, 0, pad); *dst_len += pad; - if (dst_bounced) - memcpy_to_bio(dst, dst->bi_iter, dst_data); -err: - bio_unmap_or_unbounce(c, 
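/*
 * Note: the compression rework switches from the forked lz4 copy to the
 * stock <linux/lz4.h> interface; only the decompressor for old-format
 * extents (BCH_COMPRESSION_LZ4_OLD) is kept, and the fall-through above
 * silently upgrades *compression_type so new writes use the new format.
 * Semantics of the two stock calls, for reference:
 *
 *	// Returns the compressed size, or 0 on failure; *src_size is
 *	// updated to how much input actually fit in dst_capacity. That is
 *	// why the loop above rounds the consumed length down to a block
 *	// boundary and retries until it divides evenly.
 *	n = LZ4_compress_destSize(src, dst, &src_size, dst_capacity,
 *				  wrkmem);	// LZ4_MEM_COMPRESS bytes
 *
 *	// Returns the number of bytes written to dst, or < 0 on corrupt
 *	// input; hence the "ret != dst_len" check on the read side.
 *	n = LZ4_decompress_safe(src, dst, src_len, dst_capacity);
 */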
src_data, src_bounced, READ); - bio_unmap_or_unbounce(c, dst_data, dst_bounced, WRITE); + if (dst_data.type != BB_NONE) + memcpy_to_bio(dst, dst->bi_iter, dst_data.b); +out: + bio_unmap_or_unbounce(c, src_data); + bio_unmap_or_unbounce(c, dst_data); return ret; +err: + ret = -1; + goto out; } void bch2_bio_compress(struct bch_fs *c, - struct bio *dst, size_t *dst_len, - struct bio *src, size_t *src_len, - unsigned *compression_type) + struct bio *dst, size_t *dst_len, + struct bio *src, size_t *src_len, + unsigned *compression_type) { unsigned orig_dst = dst->bi_iter.bi_size; unsigned orig_src = src->bi_iter.bi_size; /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ - src->bi_iter.bi_size = - min(src->bi_iter.bi_size, BCH_ENCODED_EXTENT_MAX << 9); + src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, + c->sb.encoded_extent_max << 9); /* Don't generate a bigger output than input: */ dst->bi_iter.bi_size = @@ -410,7 +424,7 @@ void bch2_bio_compress(struct bch_fs *c, /* If it's only one block, don't bother trying to compress: */ if (*compression_type != BCH_COMPRESSION_NONE && bio_sectors(src) > c->sb.block_size && - !__bio_compress(c, dst, dst_len, src, src_len, *compression_type)) + !__bio_compress(c, dst, dst_len, src, src_len, compression_type)) goto out; /* If compressing failed (didn't get smaller), just copy: */ @@ -420,6 +434,11 @@ void bch2_bio_compress(struct bch_fs *c, out: dst->bi_iter.bi_size = orig_dst; src->bi_iter.bi_size = orig_src; + + BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size); + BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size); + BUG_ON(*dst_len & (block_bytes(c) - 1)); + BUG_ON(*src_len & (block_bytes(c) - 1)); } /* doesn't write superblock: */ @@ -460,7 +479,7 @@ void bch2_fs_compress_exit(struct bch_fs *c) int bch2_fs_compress_init(struct bch_fs *c) { - unsigned order = get_order(BCH_ENCODED_EXTENT_MAX << 9); + unsigned order = get_order(c->sb.encoded_extent_max << 9); int ret; if (!bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) && diff --git a/libbcachefs/compress.h b/libbcachefs/compress.h index 05804f55..ad1ba25d 100644 --- a/libbcachefs/compress.h +++ b/libbcachefs/compress.h @@ -1,5 +1,5 @@ -#ifndef _BCACHE_COMPRESS_H -#define _BCACHE_COMPRESS_H +#ifndef _BCACHEFS_COMPRESS_H +#define _BCACHEFS_COMPRESS_H int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, unsigned, struct bch_extent_crc128); @@ -12,4 +12,4 @@ int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned); void bch2_fs_compress_exit(struct bch_fs *); int bch2_fs_compress_init(struct bch_fs *); -#endif /* _BCACHE_COMPRESS_H */ +#endif /* _BCACHEFS_COMPRESS_H */ diff --git a/libbcachefs/debug.h b/libbcachefs/debug.h index 77245045..b5de1a70 100644 --- a/libbcachefs/debug.h +++ b/libbcachefs/debug.h @@ -1,5 +1,5 @@ -#ifndef _BCACHE_DEBUG_H -#define _BCACHE_DEBUG_H +#ifndef _BCACHEFS_DEBUG_H +#define _BCACHEFS_DEBUG_H #include "bcachefs.h" @@ -59,4 +59,4 @@ static inline void bch2_fs_debug_init(struct bch_fs *c) {} void bch2_debug_exit(void); int bch2_debug_init(void); -#endif +#endif /* _BCACHEFS_DEBUG_H */ diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h index fb2950a3..9fe3d8f6 100644 --- a/libbcachefs/dirent.h +++ b/libbcachefs/dirent.h @@ -1,5 +1,5 @@ -#ifndef _BCACHE_DIRENT_H -#define _BCACHE_DIRENT_H +#ifndef _BCACHEFS_DIRENT_H +#define _BCACHEFS_DIRENT_H #include "str_hash.h" @@ -35,5 +35,4 @@ u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, int bch2_empty_dir(struct bch_fs *, u64); int bch2_readdir(struct 
bch_fs *, struct file *, struct dir_context *); -#endif /* _BCACHE_DIRENT_H */ - +#endif /* _BCACHEFS_DIRENT_H */ diff --git a/libbcachefs/error.h b/libbcachefs/error.h index f2032d9e..68635eee 100644 --- a/libbcachefs/error.h +++ b/libbcachefs/error.h @@ -1,5 +1,5 @@ -#ifndef _BCACHE_ERROR_H -#define _BCACHE_ERROR_H +#ifndef _BCACHEFS_ERROR_H +#define _BCACHEFS_ERROR_H #include @@ -220,4 +220,4 @@ do { \ (bio)->bi_error = -EIO; \ } while (0) -#endif /* _BCACHE_ERROR_H */ +#endif /* _BCACHEFS_ERROR_H */ diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 5819cefc..9936d0ff 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -435,13 +435,13 @@ static const char *extent_ptr_invalid(const struct bch_fs *c, if (ptr != ptr2 && ptr->dev == ptr2->dev) return "multiple pointers to same device"; - if (ptr->offset + size_ondisk > ca->mi.bucket_size * ca->mi.nbuckets) + if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) return "offset past end of device"; - if (ptr->offset < ca->mi.bucket_size * ca->mi.first_bucket) + if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) return "offset before first bucket"; - if ((ptr->offset & (ca->mi.bucket_size - 1)) + + if (bucket_remainder(ca, ptr->offset) + size_ondisk > ca->mi.bucket_size) return "spans multiple buckets"; @@ -2126,7 +2126,7 @@ static enum merge_result bch2_extent_merge(struct bch_fs *c, extent_for_each_entry(el, en_l) { struct bch_extent_ptr *lp, *rp; - unsigned bucket_size; + struct bch_dev *ca; en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data); @@ -2144,10 +2144,9 @@ static enum merge_result bch2_extent_merge(struct bch_fs *c, return BCH_MERGE_NOMERGE; /* We don't allow extents to straddle buckets: */ - bucket_size = c->devs[lp->dev]->mi.bucket_size; + ca = c->devs[lp->dev]; - if ((lp->offset & ~((u64) bucket_size - 1)) != - (rp->offset & ~((u64) bucket_size - 1))) + if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) return BCH_MERGE_NOMERGE; } diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index e49b9cfe..dc2fcbc1 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -1,5 +1,5 @@ -#ifndef _BCACHE_EXTENTS_H -#define _BCACHE_EXTENTS_H +#ifndef _BCACHEFS_EXTENTS_H +#define _BCACHEFS_EXTENTS_H #include "bcachefs.h" #include "bkey.h" @@ -565,4 +565,4 @@ bool bch2_cut_front(struct bpos, struct bkey_i *); bool bch2_cut_back(struct bpos, struct bkey *); void bch2_key_resize(struct bkey *, unsigned); -#endif /* _BCACHE_EXTENTS_H */ +#endif /* _BCACHEFS_EXTENTS_H */ diff --git a/libbcachefs/eytzinger.h b/libbcachefs/eytzinger.h index dc23e44d..04dcfc50 100644 --- a/libbcachefs/eytzinger.h +++ b/libbcachefs/eytzinger.h @@ -259,29 +259,31 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size)); } -#define eytzinger0_find(base, _nr, _size, _cmp, _search) \ -({ \ - void *_base = base; \ - size_t _i = 0; \ - int _res; \ - \ - while (_i < (_nr) && \ - (_res = _cmp(_search, _base + _i * (_size), _size))) \ - _i = eytzinger0_child(_i, _res > 0); \ - \ - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { \ - bool found1 = _i < _nr, found2 = false; \ - unsigned _j; \ - \ - for (_j = 0; _j < _nr; _j++) \ - if (!_cmp(_base + _j * (_size), _search, _size))\ - found2 = true; \ - \ - BUG_ON(found1 != found2); \ - } \ - \ - _i; \ -}) +typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); + +static inline size_t eytzinger0_find(void *base, size_t nr, size_t size, + eytzinger_cmp_fn cmp, void 
*search) +{ + size_t i = 0; + int res; + + while (i < nr && + (res = cmp(search, base + i * size, size))) + i = eytzinger0_child(i, res > 0); + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + bool found1 = i < nr, found2 = false; + size_t j; + + for (j = 0; j < nr; j++) + if (!cmp(base + j * size, search, size)) + found2 = true; + + BUG_ON(found1 != found2); + } + + return i; +} void eytzinger0_sort(void *, size_t, size_t, int (*cmp_func)(const void *, const void *, size_t), diff --git a/libbcachefs/fifo.h b/libbcachefs/fifo.h index 853815f8..0a9c0c9f 100644 --- a/libbcachefs/fifo.h +++ b/libbcachefs/fifo.h @@ -1,5 +1,5 @@ -#ifndef _BCACHE_FIFO_H -#define _BCACHE_FIFO_H +#ifndef _BCACHEFS_FIFO_H +#define _BCACHEFS_FIFO_H #include "util.h" @@ -111,5 +111,4 @@ do { \ (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \ _iter++) -#endif /* _BCACHE_FIFO_H */ - +#endif /* _BCACHEFS_FIFO_H */ diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 58456030..6828221a 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -1,3 +1,4 @@ +#ifndef NO_BCACHEFS_FS #include "bcachefs.h" #include "btree_update.h" @@ -520,7 +521,7 @@ int bch2_set_page_dirty(struct page *page) static bool bio_can_add_page_contig(struct bio *bio, struct page *page) { - sector_t offset = (sector_t) page->index << (PAGE_SHIFT - 9); + sector_t offset = (sector_t) page->index << PAGE_SECTOR_SHIFT; return bio->bi_vcnt < bio->bi_max_vecs && bio_end_sector(bio) == offset; @@ -539,7 +540,7 @@ static void __bio_add_page(struct bio *bio, struct page *page) static int bio_add_page_contig(struct bio *bio, struct page *page) { - sector_t offset = (sector_t) page->index << (PAGE_SHIFT - 9); + sector_t offset = (sector_t) page->index << PAGE_SECTOR_SHIFT; BUG_ON(!bio->bi_max_vecs); @@ -798,9 +799,10 @@ int bch2_readpages(struct file *file, struct address_space *mapping, pagecache_add_get(&mapping->add_lock); while ((page = readpage_iter_next(&readpages_iter))) { - unsigned n = max(min_t(unsigned, readpages_iter.nr_pages + 1, - BIO_MAX_PAGES), - BCH_ENCODED_EXTENT_MAX >> PAGE_SECTOR_SHIFT); + unsigned n = max_t(unsigned, + min_t(unsigned, readpages_iter.nr_pages + 1, + BIO_MAX_PAGES), + c->sb.encoded_extent_max >> PAGE_SECTOR_SHIFT); struct bch_read_bio *rbio = to_rbio(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read)); @@ -976,9 +978,10 @@ alloc_io: (struct disk_reservation) { .nr_replicas = c->opts.data_replicas, }, - foreground_write_point(c, inum), + foreground_write_point(c, ei->last_dirtied), POS(inum, 0), - &ei->journal_seq, 0); + &ei->journal_seq, + BCH_WRITE_THROTTLE); w->io->op.op.index_update_fn = bchfs_write_index_update; } @@ -1327,6 +1330,7 @@ int bch2_write_end(struct file *filp, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = page->mapping->host; + struct bch_inode_info *ei = to_bch_ei(inode); struct bch_fs *c = inode->i_sb->s_fs_info; lockdep_assert_held(&inode->i_rwsem); @@ -1350,6 +1354,8 @@ int bch2_write_end(struct file *filp, struct address_space *mapping, SetPageUptodate(page); if (!PageDirty(page)) set_page_dirty(page); + + ei->last_dirtied = (unsigned long) current; } else { bch2_put_page_reservation(c, page); } @@ -1546,9 +1552,10 @@ static void bch2_do_direct_IO_write(struct dio_write *dio) dio->iop.is_dio = true; dio->iop.new_i_size = U64_MAX; bch2_write_op_init(&dio->iop.op, dio->c, dio->res, - foreground_write_point(dio->c, inode->i_ino), + foreground_write_point(dio->c, (unsigned long) current), POS(inode->i_ino, (dio->offset + dio->written) >> 9), - 
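/*
 * Note on the eytzinger0_find() rework above: it is now a typed static
 * inline taking an eytzinger_cmp_fn instead of a statement-expression
 * macro, so comparators are prototype-checked and the debug
 * exhaustive-search cross-check stays in one place. Minimal usage sketch
 * (hypothetical u32 array, not from the source):
 *
 *	static int cmp_u32(const void *l, const void *r, size_t size)
 *	{
 *		u32 a = *(const u32 *) l, b = *(const u32 *) r;
 *
 *		return a < b ? -1 : a > b;
 *	}
 *
 *	size_t i = eytzinger0_find(tree, nr, sizeof(u32), cmp_u32, &key);
 *	if (i < nr)
 *		// found: tree[i] == key (tree is in eytzinger/BFS order)
 */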
&ei->journal_seq, flags); + &ei->journal_seq, + flags|BCH_WRITE_THROTTLE); dio->iop.op.index_update_fn = bchfs_write_index_update; dio->res.sectors -= bio_sectors(bio); @@ -1900,10 +1907,10 @@ static int __bch2_truncate_page(struct address_space *mapping, */ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(inode->i_ino, - index << (PAGE_SHIFT - 9)), 0, k) { + index << PAGE_SECTOR_SHIFT), 0, k) { if (bkey_cmp(bkey_start_pos(k.k), POS(inode->i_ino, - (index + 1) << (PAGE_SHIFT - 9))) >= 0) + (index + 1) << PAGE_SECTOR_SHIFT)) >= 0) break; if (k.k->type != KEY_TYPE_DISCARD && @@ -2022,17 +2029,12 @@ int bch2_truncate(struct inode *inode, struct iattr *iattr) mutex_lock(&ei->update_lock); setattr_copy(inode, iattr); inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); - +err: /* clear I_SIZE_DIRTY: */ i_size_dirty_put(ei); ret = bch2_write_inode_size(c, ei, inode->i_size); mutex_unlock(&ei->update_lock); - pagecache_block_put(&mapping->add_lock); - - return 0; -err: - i_size_dirty_put(ei); err_put_pagecache: pagecache_block_put(&mapping->add_lock); return ret; @@ -2566,3 +2568,5 @@ loff_t bch2_llseek(struct file *file, loff_t offset, int whence) return -EINVAL; } + +#endif /* NO_BCACHEFS_FS */ diff --git a/libbcachefs/fs-io.h b/libbcachefs/fs-io.h index 252a4039..9fdcb6b6 100644 --- a/libbcachefs/fs-io.h +++ b/libbcachefs/fs-io.h @@ -1,5 +1,5 @@ -#ifndef _BCACHE_FS_IO_H -#define _BCACHE_FS_IO_H +#ifndef _BCACHEFS_FS_IO_H +#define _BCACHEFS_FS_IO_H #include "buckets.h" #include @@ -91,4 +91,4 @@ struct dio_read { extern struct bio_set *bch2_dio_read_bioset; -#endif /* _BCACHE_FS_IO_H */ +#endif /* _BCACHEFS_FS_IO_H */ diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 6c9792e8..76829f49 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -1,3 +1,4 @@ +#ifndef NO_BCACHEFS_FS #include "bcachefs.h" #include "acl.h" @@ -18,8 +19,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -208,7 +211,6 @@ static struct inode *bch2_vfs_inode_create(struct bch_fs *c, struct posix_acl *default_acl = NULL, *acl = NULL; struct bch_inode_info *ei; struct bch_inode_unpacked inode_u; - struct bkey_inode_buf inode_p; int ret; inode = new_inode(parent->i_sb); @@ -227,9 +229,7 @@ static struct inode *bch2_vfs_inode_create(struct bch_fs *c, bch2_inode_init(c, &inode_u, i_uid_read(inode), i_gid_read(inode), inode->i_mode, rdev); - bch2_inode_pack(&inode_p, &inode_u); - - ret = bch2_inode_create(c, &inode_p.inode.k_i, + ret = bch2_inode_create(c, &inode_u, BLOCKDEV_INODE_MAX, 0, &c->unused_inode_hint); if (unlikely(ret)) { @@ -241,7 +241,6 @@ static struct inode *bch2_vfs_inode_create(struct bch_fs *c, goto err; } - inode_u.inum = inode_p.inode.k.p.inode; bch2_vfs_inode_init(c, ei, &inode_u); if (default_acl) { @@ -1022,6 +1021,45 @@ static const struct address_space_operations bch_address_space_operations = { .error_remove_page = generic_error_remove_page, }; +static struct inode *bch2_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + if (ino < BCACHEFS_ROOT_INO) + return ERR_PTR(-ESTALE); + + inode = bch2_vfs_inode_get(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + /* we didn't find the right inode.. 
*/ + iput(inode); + return ERR_PTR(-ESTALE); + } + return inode; +} + +static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + bch2_nfs_get_inode); +} + +static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + bch2_nfs_get_inode); +} + +static const struct export_operations bch_export_ops = { + .fh_to_dentry = bch2_fh_to_dentry, + .fh_to_parent = bch2_fh_to_parent, + //.get_parent = bch2_get_parent, +}; + static void bch2_vfs_inode_init(struct bch_fs *c, struct bch_inode_info *ei, struct bch_inode_unpacked *bi) @@ -1154,7 +1192,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) struct bch_fs *c = sb->s_fs_info; u64 fsid; - buf->f_type = BCACHE_STATFS_MAGIC; + buf->f_type = BCACHEFS_STATFS_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = c->capacity >> PAGE_SECTOR_SHIFT; buf->f_bfree = (c->capacity - bch2_fs_sectors_used(c)) >> PAGE_SECTOR_SHIFT; @@ -1371,8 +1409,9 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, sb->s_blocksize_bits = PAGE_SHIFT; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_op = &bch_super_operations; + sb->s_export_op = &bch_export_ops; sb->s_xattr = bch2_xattr_handlers; - sb->s_magic = BCACHE_STATFS_MAGIC; + sb->s_magic = BCACHEFS_STATFS_MAGIC; sb->s_time_gran = c->sb.time_precision; c->vfs_sb = sb; sb->s_bdi = &c->bdi; @@ -1393,7 +1432,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, else sb->s_flags |= opts.posix_acl ? MS_POSIXACL : 0; - inode = bch2_vfs_inode_get(sb, BCACHE_ROOT_INO); + inode = bch2_vfs_inode_get(sb, BCACHEFS_ROOT_INO); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto err_put_super; @@ -1480,3 +1519,5 @@ err: bch2_vfs_exit(); return ret; } + +#endif /* NO_BCACHEFS_FS */ diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index 38a349d0..5f2c39f0 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -1,5 +1,5 @@ -#ifndef _BCACHE_FS_H -#define _BCACHE_FS_H +#ifndef _BCACHEFS_FS_H +#define _BCACHEFS_FS_H #include "str_hash.h" @@ -25,6 +25,8 @@ struct bch_inode_info { atomic64_t i_sectors; struct bch_hash_info str_hash; + + unsigned long last_dirtied; }; #define to_bch_ei(_inode) \ @@ -42,7 +44,7 @@ static inline unsigned nlink_bias(umode_t mode) struct bch_inode_unpacked; -#ifndef NO_BCACHE_FS +#ifndef NO_BCACHEFS_FS /* returns 0 if we want to do the update, or error is passed up */ typedef int (*inode_set_fn)(struct bch_inode_info *, @@ -61,6 +63,6 @@ int bch2_vfs_init(void); static inline void bch2_vfs_exit(void) {} static inline int bch2_vfs_init(void) { return 0; } -#endif +#endif /* NO_BCACHEFS_FS */ -#endif /* _BCACHE_FS_H */ +#endif /* _BCACHEFS_FS_H */ diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 18d1d533..f137b730 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -251,7 +251,7 @@ static int check_extents(struct bch_fs *c) int ret = 0; for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, - POS(BCACHE_ROOT_INO, 0), 0, k) { + POS(BCACHEFS_ROOT_INO, 0), 0, k) { if (k.k->type == KEY_TYPE_DISCARD) continue; @@ -310,7 +310,7 @@ static int check_dirents(struct bch_fs *c) hash_check_init(bch2_dirent_hash_desc, &h, c); for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, - POS(BCACHE_ROOT_INO, 0), 0, k) { + POS(BCACHEFS_ROOT_INO, 0), 0, k) { struct bkey_s_c_dirent d; struct bch_inode_unpacked target; bool have_target; @@ -444,7 +444,7 @@ static int 
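/*
 * Note: these export ops are what make the i_generation plumbing added
 * elsewhere in this patch observable. An NFS file handle embeds
 * (inum, generation); if the inode number has since been reused, the
 * generation comparison in bch2_nfs_get_inode() is what turns the stale
 * handle into -ESTALE instead of silently resolving to the wrong file.
 * The wiring is the stock pattern:
 *
 *	sb->s_export_op = &bch_export_ops;	// set in bch2_mount() below
 *
 * with generic_fh_to_dentry()/generic_fh_to_parent() decoding the
 * FILEID_INO32_GEN-style handle and calling back into bch2_nfs_get_inode().
 */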
check_xattrs(struct bch_fs *c) hash_check_init(bch2_xattr_hash_desc, &h, c); for_each_btree_key(&iter, c, BTREE_ID_XATTRS, - POS(BCACHE_ROOT_INO, 0), 0, k) { + POS(BCACHEFS_ROOT_INO, 0), 0, k) { ret = walk_inode(c, &w, k.k->p.inode); if (ret) break; @@ -478,7 +478,7 @@ static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) struct bkey_inode_buf packed; int ret; - ret = bch2_inode_find_by_inum(c, BCACHE_ROOT_INO, root_inode); + ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, root_inode); if (ret && ret != -ENOENT) return ret; @@ -494,7 +494,7 @@ fsck_err: return ret; create_root: bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0); - root_inode->inum = BCACHE_ROOT_INO; + root_inode->inum = BCACHEFS_ROOT_INO; bch2_inode_pack(&packed, root_inode); @@ -514,7 +514,7 @@ static int check_lostfound(struct bch_fs *c, u64 inum; int ret; - inum = bch2_dirent_lookup(c, BCACHE_ROOT_INO, &root_hash_info, + inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info, &lostfound); if (!inum) { bch_notice(c, "creating lost+found"); @@ -546,16 +546,13 @@ create_lostfound: return ret; bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0); - bch2_inode_pack(&packed, lostfound_inode); - ret = bch2_inode_create(c, &packed.inode.k_i, BLOCKDEV_INODE_MAX, 0, + ret = bch2_inode_create(c, lostfound_inode, BLOCKDEV_INODE_MAX, 0, &c->unused_inode_hint); if (ret) return ret; - lostfound_inode->inum = packed.inode.k.p.inode; - - ret = bch2_dirent_create(c, BCACHE_ROOT_INO, &root_hash_info, DT_DIR, + ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR, &lostfound, lostfound_inode->inum, NULL, BTREE_INSERT_NOFAIL); if (ret) @@ -645,13 +642,13 @@ static int check_directory_structure(struct bch_fs *c, restart_dfs: had_unreachable = false; - ret = inode_bitmap_set(&dirs_done, BCACHE_ROOT_INO); + ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO); if (ret) { bch_err(c, "memory allocation failure in inode_bitmap_set()"); goto err; } - ret = path_down(&path, BCACHE_ROOT_INO); + ret = path_down(&path, BCACHEFS_ROOT_INO); if (ret) { return ret; } @@ -792,7 +789,7 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, u64 d_inum; int ret; - inc_link(c, links, range_start, range_end, BCACHE_ROOT_INO, false); + inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false); for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, 0, k) { switch (k.k->type) { diff --git a/libbcachefs/fsck.h b/libbcachefs/fsck.h index 4bde1bda..f9af1305 100644 --- a/libbcachefs/fsck.h +++ b/libbcachefs/fsck.h @@ -1,7 +1,7 @@ -#ifndef _BCACHE_FS_GC_H -#define _BCACHE_FS_GC_H +#ifndef _BCACHEFS_FSCK_H +#define _BCACHEFS_FSCK_H s64 bch2_count_inode_sectors(struct bch_fs *, u64); int bch2_fsck(struct bch_fs *, bool); -#endif /* _BCACHE_FS_GC_H */ +#endif /* _BCACHEFS_FSCK_H */ diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 18bc182a..1422cc24 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -206,6 +206,11 @@ static const char *bch2_inode_invalid(const struct bch_fs *c, if (k.k->p.inode >= BLOCKDEV_INODE_MAX) return "blockdev inode in fs range"; + return NULL; + case BCH_INODE_GENERATION: + if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) + return "incorrect value size"; + return NULL; default: return "invalid type"; @@ -257,9 +262,10 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, inode_u->i_otime = now; } -int bch2_inode_create(struct bch_fs *c, struct bkey_i 
*inode, +int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u, u64 min, u64 max, u64 *hint) { + struct bkey_inode_buf inode_p; struct btree_iter iter; bool searched_from_start = false; int ret; @@ -281,6 +287,7 @@ again: while (1) { struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter); + u32 i_generation = 0; ret = btree_iter_err(k); if (ret) { @@ -288,31 +295,51 @@ again: return ret; } - if (k.k->type < BCH_INODE_FS) { - inode->k.p = k.k->p; + switch (k.k->type) { + case BCH_INODE_BLOCKDEV: + case BCH_INODE_FS: + /* slot used */ + if (iter.pos.inode == max) + goto out; - pr_debug("inserting inode %llu (size %u)", - inode->k.p.inode, inode->k.u64s); + bch2_btree_iter_advance_pos(&iter); + break; + + case BCH_INODE_GENERATION: { + struct bkey_s_c_inode_generation g = + bkey_s_c_to_inode_generation(k); + i_generation = le32_to_cpu(g.v->i_generation); + /* fallthrough: */ + } + default: + inode_u->i_generation = i_generation; + + bch2_inode_pack(&inode_p, inode_u); + inode_p.inode.k.p = k.k->p; ret = bch2_btree_insert_at(c, NULL, NULL, NULL, BTREE_INSERT_ATOMIC, - BTREE_INSERT_ENTRY(&iter, inode)); + BTREE_INSERT_ENTRY(&iter, + &inode_p.inode.k_i)); + + if (ret != -EINTR) { + bch2_btree_iter_unlock(&iter); + + if (!ret) { + inode_u->inum = + inode_p.inode.k.p.inode; + *hint = inode_p.inode.k.p.inode + 1; + } + + return ret; + } if (ret == -EINTR) continue; - bch2_btree_iter_unlock(&iter); - if (!ret) - *hint = k.k->p.inode + 1; - - return ret; - } else { - if (iter.pos.inode == max) - break; - /* slot used */ - bch2_btree_iter_advance_pos(&iter); } } +out: bch2_btree_iter_unlock(&iter); if (!searched_from_start) { @@ -337,7 +364,8 @@ int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size, int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) { - struct bkey_i delete; + struct btree_iter iter; + struct bkey_i_inode_generation delete; int ret; ret = bch2_inode_truncate(c, inode_nr, 0, NULL, NULL); @@ -366,11 +394,51 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) if (ret < 0) return ret; - bkey_init(&delete.k); - delete.k.p.inode = inode_nr; + bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inode_nr, 0), + BTREE_ITER_INTENT); + do { + struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter); + u32 i_generation = 0; - return bch2_btree_insert(c, BTREE_ID_INODES, &delete, NULL, - NULL, NULL, BTREE_INSERT_NOFAIL); + ret = btree_iter_err(k); + if (ret) { + bch2_btree_iter_unlock(&iter); + return ret; + } + + switch (k.k->type) { + case BCH_INODE_FS: { + struct bch_inode_unpacked inode_u; + + if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) + i_generation = cpu_to_le32(inode_u.i_generation) + 1; + break; + } + case BCH_INODE_GENERATION: { + struct bkey_s_c_inode_generation g = + bkey_s_c_to_inode_generation(k); + i_generation = le32_to_cpu(g.v->i_generation); + break; + } + } + + if (!i_generation) { + bkey_init(&delete.k); + delete.k.p.inode = inode_nr; + } else { + bkey_inode_generation_init(&delete.k_i); + delete.k.p.inode = inode_nr; + delete.v.i_generation = cpu_to_le32(i_generation); + } + + ret = bch2_btree_insert_at(c, NULL, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL, + BTREE_INSERT_ENTRY(&iter, &delete.k_i)); + } while (ret == -EINTR); + + bch2_btree_iter_unlock(&iter); + return ret; } int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index 06e2ffda..22aac3e6 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -1,5 +1,5 @@ -#ifndef _BCACHE_INODE_H 
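/*
 * Note: the inode create/rm changes above amount to a small state machine
 * per inode number, keeping generations monotonic across delete and reuse
 * (summarized from the code, worth double-checking against it):
 *
 *	empty slot               -- create -->  BCH_INODE_FS, gen 0
 *	BCH_INODE_FS, gen g      -- rm     -->  BCH_INODE_GENERATION, g + 1
 *	BCH_INODE_GENERATION, g' -- create -->  BCH_INODE_FS, gen g'
 *
 * A plain whiteout is written on rm only when the resulting generation
 * would be 0; otherwise the generation key must persist, since it is the
 * only record that lets a reused inode number invalidate old NFS handles.
 */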
-#define _BCACHE_INODE_H +#ifndef _BCACHEFS_INODE_H +#define _BCACHEFS_INODE_H #include @@ -29,7 +29,8 @@ int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, uid_t, gid_t, umode_t, dev_t); -int bch2_inode_create(struct bch_fs *, struct bkey_i *, u64, u64, u64 *); +int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *, + u64, u64, u64 *); int bch2_inode_truncate(struct bch_fs *, u64, u64, struct extent_insert_hook *, u64 *); int bch2_inode_rm(struct bch_fs *, u64); @@ -60,4 +61,4 @@ void bch2_inode_pack_test(void); static inline void bch2_inode_pack_test(void) {} #endif -#endif +#endif /* _BCACHEFS_INODE_H */ diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 9eed97ba..946c75bb 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -79,6 +79,7 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, /* Bios with headers */ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, + enum bch_data_type type, const struct bkey_i *k) { struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); @@ -122,6 +123,9 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, n->bio.bi_opf |= REQ_FUA; if (likely(percpu_ref_tryget(&ca->io_ref))) { + this_cpu_add(ca->io_done->sectors[WRITE][type], + bio_sectors(&n->bio)); + n->have_io_ref = true; n->bio.bi_bdev = ca->disk_sb.bdev; submit_bio(&n->bio); @@ -423,17 +427,12 @@ static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob) orig, &src_len, &fragment_compression_type); - BUG_ON(!dst_len || dst_len > bio->bi_iter.bi_size); - BUG_ON(!src_len || src_len > orig->bi_iter.bi_size); - BUG_ON(dst_len & (block_bytes(c) - 1)); - BUG_ON(src_len & (block_bytes(c) - 1)); - - swap(bio->bi_iter.bi_size, dst_len); nonce = extent_nonce(op->version, crc_nonce, src_len >> 9, - fragment_compression_type), + fragment_compression_type); + swap(bio->bi_iter.bi_size, dst_len); bch2_encrypt_bio(c, csum_type, nonce, bio); csum = bch2_checksum_bio(c, csum_type, nonce, bio); @@ -496,7 +495,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob) closure_get(bio->bi_private); - bch2_submit_wbio_replicas(to_wbio(bio), c, key_to_write); + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER, + key_to_write); return more; } @@ -661,9 +661,9 @@ void bch2_write(struct closure *cl) /* Don't call bch2_next_delay() if rate is >= 1 GB/sec */ - if (c->foreground_write_ratelimit_enabled && - c->foreground_write_pd.rate.rate < (1 << 30) && - op->wp->throttle) { + if ((op->flags & BCH_WRITE_THROTTLE) && + c->foreground_write_ratelimit_enabled && + c->foreground_write_pd.rate.rate < (1 << 30)) { unsigned long flags; u64 delay; @@ -715,7 +715,8 @@ void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->error = 0; op->flags = flags; op->csum_type = bch2_data_checksum_type(c); - op->compression_type = c->opts.compression; + op->compression_type = + bch2_compression_opt_to_type(c->opts.compression); op->nr_replicas = res.nr_replicas; op->alloc_reserve = RESERVE_NONE; op->nonce = 0; @@ -1203,6 +1204,9 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, if (bounce) trace_read_bounce(&rbio->bio); + this_cpu_add(pick->ca->io_done->sectors[READ][BCH_DATA_USER], + bio_sectors(&rbio->bio)); + if (likely(!(flags & BCH_READ_IN_RETRY))) { submit_bio(&rbio->bio); } else { diff --git a/libbcachefs/io.h b/libbcachefs/io.h index 1aa0bfab..674cdf7a 100644 --- a/libbcachefs/io.h 
+++ b/libbcachefs/io.h @@ -1,5 +1,5 @@ -#ifndef _BCACHE_IO_H -#define _BCACHE_IO_H +#ifndef _BCACHEFS_IO_H +#define _BCACHEFS_IO_H #include #include "io_types.h" @@ -14,19 +14,19 @@ void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, - const struct bkey_i *); + enum bch_data_type, const struct bkey_i *); enum bch_write_flags { BCH_WRITE_ALLOC_NOWAIT = (1 << 0), BCH_WRITE_CACHED = (1 << 1), BCH_WRITE_FLUSH = (1 << 2), BCH_WRITE_DATA_COMPRESSED = (1 << 3), + BCH_WRITE_THROTTLE = (1 << 4), /* Internal: */ - BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 4), - BCH_WRITE_DONE = (1 << 5), - BCH_WRITE_LOOPED = (1 << 6), - __BCH_WRITE_KEYLIST_LOCKED = 8, + BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 5), + BCH_WRITE_DONE = (1 << 6), + BCH_WRITE_LOOPED = (1 << 7), }; static inline u64 *op_journal_seq(struct bch_write_op *op) @@ -105,4 +105,4 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio) return rbio; } -#endif /* _BCACHE_IO_H */ +#endif /* _BCACHEFS_IO_H */ diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h index 9842019d..ae4f8f3c 100644 --- a/libbcachefs/io_types.h +++ b/libbcachefs/io_types.h @@ -1,5 +1,5 @@ -#ifndef _BCACHE_IO_TYPES_H -#define _BCACHE_IO_TYPES_H +#ifndef _BCACHEFS_IO_TYPES_H +#define _BCACHEFS_IO_TYPES_H #include "btree_types.h" #include "buckets_types.h" @@ -148,4 +148,4 @@ struct bch_write_op { struct bch_write_bio wbio; }; -#endif /* _BCACHE_IO_TYPES_H */ +#endif /* _BCACHEFS_IO_TYPES_H */ diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 6dc14ff2..b22fc8d9 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -1274,10 +1274,15 @@ static int journal_entry_sectors(struct journal *j) lockdep_assert_held(&j->lock); - spin_lock(&j->devs.lock); - group_for_each_dev(ca, &j->devs, i) { + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, + &c->rw_devs[BCH_DATA_JOURNAL]) { + struct journal_device *ja = &ca->journal; unsigned buckets_required = 0; + if (!ja->nr) + continue; + sectors_available = min_t(unsigned, sectors_available, ca->mi.bucket_size); @@ -1288,11 +1293,11 @@ static int journal_entry_sectors(struct journal *j) * it too: */ if (bch2_extent_has_device(e.c, ca->dev_idx)) { - if (j->prev_buf_sectors > ca->journal.sectors_free) + if (j->prev_buf_sectors > ja->sectors_free) buckets_required++; if (j->prev_buf_sectors + sectors_available > - ca->journal.sectors_free) + ja->sectors_free) buckets_required++; } else { if (j->prev_buf_sectors + sectors_available > @@ -1306,7 +1311,7 @@ static int journal_entry_sectors(struct journal *j) nr_devs++; nr_online++; } - spin_unlock(&j->devs.lock); + rcu_read_unlock(); if (nr_online < c->opts.metadata_replicas_required) return -EROFS; @@ -1542,7 +1547,7 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, */ if (bch2_disk_reservation_get(c, &disk_res, - (nr - ja->nr) << ca->bucket_bits, 0)) + bucket_to_sector(ca, nr - ja->nr), 0)) return -ENOSPC; mutex_lock(&c->sb_lock); @@ -1566,7 +1571,7 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, while (ja->nr < nr) { /* must happen under journal lock, to avoid racing with gc: */ - long b = bch2_bucket_alloc(c, ca, RESERVE_NONE); + long b = bch2_bucket_alloc(c, ca, RESERVE_ALLOC); if (b < 0) { if (!closure_wait(&c->freelist_wait, &cl)) { spin_unlock(&j->lock); @@ -1969,7 +1974,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) 
struct bch_extent_ptr *ptr; struct journal_device *ja; struct bch_dev *ca; - bool swapped; + struct dev_alloc_list devs_sorted; unsigned i, replicas, replicas_want = READ_ONCE(c->opts.metadata_replicas); @@ -1996,26 +2001,18 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) replicas = bch2_extent_nr_ptrs(e.c); - spin_lock(&j->devs.lock); + rcu_read_lock(); + devs_sorted = bch2_wp_alloc_list(c, &j->wp, + &c->rw_devs[BCH_DATA_JOURNAL]); - /* Sort by tier: */ - do { - swapped = false; + for (i = 0; i < devs_sorted.nr; i++) { + ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); + if (!ca) + continue; - for (i = 0; i + 1 < j->devs.nr; i++) - if (j->devs.d[i + 0].dev->mi.tier > - j->devs.d[i + 1].dev->mi.tier) { - swap(j->devs.d[i], j->devs.d[i + 1]); - swapped = true; - } - } while (swapped); - - /* - * Pick devices for next journal write: - * XXX: sort devices by free journal space? - */ - group_for_each_dev(ca, &j->devs, i) { ja = &ca->journal; + if (!ja->nr) + continue; if (replicas >= replicas_want) break; @@ -2029,6 +2026,9 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) sectors > ca->mi.bucket_size) continue; + j->wp.next_alloc[ca->dev_idx] += U32_MAX; + bch2_wp_rescale(c, ca, &j->wp); + ja->sectors_free = ca->mi.bucket_size - sectors; ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ja->bucket_seq[ja->cur_idx] = atomic64_read(&j->seq); @@ -2041,7 +2041,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) }); replicas++; } - spin_unlock(&j->devs.lock); + rcu_read_unlock(); j->prev_buf_sectors = 0; spin_unlock(&j->lock); @@ -2280,7 +2280,8 @@ static void journal_write(struct closure *cl) continue; } - atomic64_add(sectors, &ca->meta_sectors_written); + this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_JOURNAL], + sectors); ca->journal.ptr_idx = ptr_idx++; bio = ca->journal.bio; @@ -2682,6 +2683,7 @@ int bch2_journal_flush(struct journal *j) ssize_t bch2_journal_print_debug(struct journal *j, char *buf) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); union journal_res_state *s = &j->reservations; struct bch_dev *ca; unsigned iter; @@ -2714,10 +2716,13 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) journal_entry_is_open(j), test_bit(JOURNAL_REPLAY_DONE, &j->flags)); - spin_lock(&j->devs.lock); - group_for_each_dev(ca, &j->devs, iter) { + for_each_member_device_rcu(ca, c, iter, + &c->rw_devs[BCH_DATA_JOURNAL]) { struct journal_device *ja = &ca->journal; + if (!ja->nr) + continue; + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "dev %u:\n" "\tnr\t\t%u\n" @@ -2727,7 +2732,6 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) ja->cur_idx, ja->bucket_seq[ja->cur_idx], ja->last_idx, ja->bucket_seq[ja->last_idx]); } - spin_unlock(&j->devs.lock); spin_unlock(&j->lock); rcu_read_unlock(); @@ -2911,7 +2915,6 @@ int bch2_fs_journal_init(struct journal *j) INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work); mutex_init(&j->blacklist_lock); INIT_LIST_HEAD(&j->seq_blacklist); - spin_lock_init(&j->devs.lock); mutex_init(&j->reclaim_lock); lockdep_init_map(&j->res_map, "journal res", &res_key, 0); diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index d785a0cb..9d6c79c6 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -1,5 +1,5 @@ -#ifndef _BCACHE_JOURNAL_H -#define _BCACHE_JOURNAL_H +#ifndef _BCACHEFS_JOURNAL_H +#define _BCACHEFS_JOURNAL_H /* * THE JOURNAL: @@ -402,4 +402,4 @@ int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); void bch2_fs_journal_exit(struct 
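/*
 * Note: journal device selection now goes through the same write-point
 * machinery as data allocation, replacing the ad-hoc bubble sort of
 * j->devs by tier. Reduced to its essentials (bch2_wp_alloc_list()
 * appears to return device indices in preference order for this write
 * point):
 *
 *	rcu_read_lock();
 *	devs_sorted = bch2_wp_alloc_list(c, &j->wp,
 *					 &c->rw_devs[BCH_DATA_JOURNAL]);
 *
 *	for (i = 0; i < devs_sorted.nr; i++) {
 *		ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
 *		if (!ca || !ca->journal.nr)
 *			continue;	// absent, or no journal here
 *
 *		// ...claim a bucket on ca, then penalize it so the next
 *		// journal write spreads to a different device:
 *		j->wp.next_alloc[ca->dev_idx] += U32_MAX;
 *		bch2_wp_rescale(c, ca, &j->wp);
 *	}
 *	rcu_read_unlock();
 *
 * A side effect visible above: journal writes are now charged to the
 * per-device BCH_DATA_JOURNAL counters instead of the old
 * meta_sectors_written atomic.
 */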
journal *); int bch2_fs_journal_init(struct journal *); -#endif /* _BCACHE_JOURNAL_H */ +#endif /* _BCACHEFS_JOURNAL_H */ diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index e3342453..55b41c56 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -1,10 +1,11 @@ -#ifndef _BCACHE_JOURNAL_TYPES_H -#define _BCACHE_JOURNAL_TYPES_H +#ifndef _BCACHEFS_JOURNAL_TYPES_H +#define _BCACHEFS_JOURNAL_TYPES_H #include #include #include "alloc_types.h" +#include "super_types.h" #include "fifo.h" struct journal_res; @@ -176,7 +177,7 @@ struct journal { struct list_head seq_blacklist; BKEY_PADDED(key); - struct dev_group devs; + struct write_point wp; struct delayed_work reclaim_work; unsigned long last_flushed; @@ -234,4 +235,4 @@ struct journal_device { struct closure read; }; -#endif /* _BCACHE_JOURNAL_TYPES_H */ +#endif /* _BCACHEFS_JOURNAL_TYPES_H */ diff --git a/libbcachefs/keylist.h b/libbcachefs/keylist.h index 87388c97..ea65f8e0 100644 --- a/libbcachefs/keylist.h +++ b/libbcachefs/keylist.h @@ -1,5 +1,5 @@ -#ifndef _BCACHE_KEYLIST_H -#define _BCACHE_KEYLIST_H +#ifndef _BCACHEFS_KEYLIST_H +#define _BCACHEFS_KEYLIST_H #include "keylist_types.h" @@ -65,4 +65,4 @@ void bch2_verify_keylist_sorted(struct keylist *); static inline void bch2_verify_keylist_sorted(struct keylist *l) {} #endif -#endif /* _BCACHE_KEYLIST_H */ +#endif /* _BCACHEFS_KEYLIST_H */ diff --git a/libbcachefs/keylist_types.h b/libbcachefs/keylist_types.h index 195785bf..48a17d7a 100644 --- a/libbcachefs/keylist_types.h +++ b/libbcachefs/keylist_types.h @@ -1,5 +1,5 @@ -#ifndef _BCACHE_KEYLIST_TYPES_H -#define _BCACHE_KEYLIST_TYPES_H +#ifndef _BCACHEFS_KEYLIST_TYPES_H +#define _BCACHEFS_KEYLIST_TYPES_H struct keylist { union { @@ -12,4 +12,4 @@ struct keylist { }; }; -#endif /* _BCACHE_KEYLIST_TYPES_H */ +#endif /* _BCACHEFS_KEYLIST_TYPES_H */ diff --git a/libbcachefs/lz4.h b/libbcachefs/lz4.h index 6b784c59..22e7859c 100644 --- a/libbcachefs/lz4.h +++ b/libbcachefs/lz4.h @@ -1,87 +1,7 @@ -#ifndef __LZ4_H__ -#define __LZ4_H__ -/* - * LZ4 Kernel Interface - * - * Copyright (C) 2013, LG Electronics, Kyungsik Lee - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#define LZ4_MEM_COMPRESS (16384) -#define LZ4HC_MEM_COMPRESS (262144 + (2 * sizeof(unsigned char *))) +#ifndef __BCH_LZ4_H__ +#define __BCH_LZ4_H__ -/* - * lz4_compressbound() - * Provides the maximum size that LZ4 may output in a "worst case" scenario - * (input data not compressible) - */ -static inline size_t lz4_compressbound(size_t isize) -{ - return isize + (isize / 255) + 16; -} +int bch2_lz4_decompress(const unsigned char *src, size_t *src_len, + unsigned char *dest, size_t actual_dest_len); -/* - * lz4_compress() - * src : source address of the original data - * src_len : size of the original data - * dst : output buffer address of the compressed data - * This requires 'dst' of size LZ4_COMPRESSBOUND. - * dst_len : is the output size, which is returned after compress done - * workmem : address of the working memory. - * This requires 'workmem' of size LZ4_MEM_COMPRESS. - * return : Success if return 0 - * Error if return (< 0) - * note : Destination buffer and workmem must be already allocated with - * the defined size. 
- */ -int lz4_compress(const unsigned char *src, size_t src_len, - unsigned char *dst, size_t *dst_len, void *wrkmem); - - /* - * lz4hc_compress() - * src : source address of the original data - * src_len : size of the original data - * dst : output buffer address of the compressed data - * This requires 'dst' of size LZ4_COMPRESSBOUND. - * dst_len : is the output size, which is returned after compress done - * workmem : address of the working memory. - * This requires 'workmem' of size LZ4HC_MEM_COMPRESS. - * return : Success if return 0 - * Error if return (< 0) - * note : Destination buffer and workmem must be already allocated with - * the defined size. - */ -int lz4hc_compress(const unsigned char *src, size_t src_len, - unsigned char *dst, size_t *dst_len, void *wrkmem); - -/* - * lz4_decompress() - * src : source address of the compressed data - * src_len : is the input size, whcih is returned after decompress done - * dest : output buffer address of the decompressed data - * actual_dest_len: is the size of uncompressed data, supposing it's known - * return : Success if return 0 - * Error if return (< 0) - * note : Destination buffer must be already allocated. - * slightly faster than lz4_decompress_unknownoutputsize() - */ -int lz4_decompress(const unsigned char *src, size_t *src_len, - unsigned char *dest, size_t actual_dest_len); - -/* - * lz4_decompress_unknownoutputsize() - * src : source address of the compressed data - * src_len : is the input size, therefore the compressed size - * dest : output buffer address of the decompressed data - * dest_len: is the max size of the destination buffer, which is - * returned with actual size of decompressed data after - * decompress done - * return : Success if return 0 - * Error if return (< 0) - * note : Destination buffer must be already allocated. - */ -int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, - unsigned char *dest, size_t *dest_len); #endif diff --git a/libbcachefs/lz4_compress.c b/libbcachefs/lz4_compress.c deleted file mode 100644 index de33acf3..00000000 --- a/libbcachefs/lz4_compress.c +++ /dev/null @@ -1,228 +0,0 @@ -/* - * LZ4 - Fast LZ compression algorithm - * Copyright (C) 2011-2012, Yann Collet. - * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * You can contact the author at : - * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html - * - LZ4 source repository : http://code.google.com/p/lz4/ - * - * Changed for kernel use by: - * Chanho Min - */ - -#include -#include -#include -#include "lz4.h" -#include "lz4defs.h" - -#define LZ4_HASH_VALUE(p, _table) \ - __HASH_VALUE(p, MEMORY_USAGE - ilog2(sizeof(_table[0]))) - -struct lz4_hash_table { - const u8 *(*add)(const struct lz4_hash_table, const u8 *); - void *ctx; - const u8 *base; -}; - -#if __SIZEOF_POINTER__ == 4 -static inline const u8 *hash_table_add32(const struct lz4_hash_table hash, - const u8 *ip) -{ - const u8 **table = hash.ctx; - - swap(table[LZ4_HASH_VALUE(ip, table)], ip); - return ip; -} -#else -static inline const u8 *hash_table_add32(const struct lz4_hash_table hash, - const u8 *ip) -{ - u32 *table = hash.ctx; - size_t offset = ip - hash.base; - - swap(table[LZ4_HASH_VALUE(ip, table)], offset); - return hash.base + offset; -} -#endif - -static inline const u8 *hash_table_add16(const struct lz4_hash_table hash, - const u8 *ip) -{ - u16 *table = hash.ctx; - size_t offset = ip - hash.base; - - swap(table[LZ4_HASH_VALUE(ip, table)], offset); - return hash.base + offset; -} - -static inline const u8 *find_match(const struct lz4_hash_table hash, - const u8 **ip, const u8 *anchor, - const u8 *start, const u8 *mflimit) -{ - int findmatchattempts = (1U << SKIPSTRENGTH) + 3; - - while (*ip <= mflimit) { - const u8 *ref = hash.add(hash, *ip); - - if (ref >= *ip - MAX_DISTANCE && A32(ref) == A32(*ip)) { - /* found match: */ - while (*ip > anchor && - ref > start && - unlikely((*ip)[-1] == ref[-1])) { - (*ip)--; - ref--; - } - - return ref; - } - - *ip += findmatchattempts++ >> SKIPSTRENGTH; - } - - return NULL; -} - -static inline int length_len(unsigned length) -{ - return length / 255 + 1; -} - -/* - * LZ4_compressCtx : - * ----------------- - * Compress 'isize' bytes from 'source' into an output buffer 'dest' of - * maximum size 'maxOutputSize'. * If it cannot achieve it, compression - * will stop, and result of the function will be zero. - * return : the number of bytes written in buffer 'dest', or 0 if the - * compression fails - */ -static inline int lz4_compressctx(const struct lz4_hash_table hash, - const u8 *src, size_t src_len, - u8 *dst, size_t *dst_len) -{ - const u8 *ip = src, *anchor = ip, *ref; - const u8 *const iend = ip + src_len; - const u8 *const mflimit = iend - MFLIMIT; - const u8 *const matchlimit = iend - LASTLITERALS; - u8 *op = dst, *token; - u8 *const oend = op + *dst_len; - size_t literal_len, match_len, match_offset; - - /* Init */ - memset(hash.ctx, 0, LZ4_MEM_COMPRESS); - hash.add(hash, ip); - - /* Always start with a literal: */ - ip++; - - while ((ref = find_match(hash, &ip, anchor, src, mflimit))) { - /* - * We found a match; @ip now points to the match and @ref points - * to the prior part of the input we matched with. 
Everything up - * to @anchor has been encoded; the range from @anchor to @ip - * didn't match and now has to be encoded as a literal: - */ - literal_len = ip - anchor; - match_offset = ip - ref; - - /* MINMATCH bytes already matched from find_match(): */ - ip += MINMATCH; - ref += MINMATCH; - match_len = common_length(ip, ref, matchlimit); - ip += match_len; - - /* check output limit */ - if (unlikely(op + - 1 + /* token */ - 2 + /* match ofset */ - literal_len + - length_len(literal_len) + - length_len(match_len) + - LASTLITERALS > oend)) - break; - - token = op++; - *token = encode_length(&op, literal_len) << ML_BITS; - MEMCPY_ADVANCE_CHUNKED(op, anchor, literal_len); - PUT_LE16_ADVANCE(op, match_offset); - *token += encode_length(&op, match_len); - - anchor = ip; - } - - /* Encode remaining input as literal: */ - literal_len = iend - anchor; - if (unlikely(op + - 1 + - literal_len + - length_len(literal_len) > oend)) { - /* Return how much would be able to fit: */ - ssize_t remaining = oend - op; - ssize_t encoded = anchor - src; - - remaining -= length_len(remaining) + 1; - - return -max(encoded + remaining, 1L); - } - - token = op++; - *token = encode_length(&op, literal_len) << ML_BITS; - MEMCPY_ADVANCE(op, anchor, literal_len); - - /* End */ - BUG_ON(op > oend); - *dst_len = op - dst; - return 0; -} - -__attribute__((flatten)) -int lz4_compress(const unsigned char *src, size_t src_len, - unsigned char *dst, size_t *dst_len, void *wrkmem) -{ - if (src_len < LZ4_64KLIMIT) { - const struct lz4_hash_table hash = { - .add = hash_table_add16, - .ctx = wrkmem, - .base = src, - }; - - return lz4_compressctx(hash, src, src_len, dst, dst_len); - } else { - const struct lz4_hash_table hash = { - .add = hash_table_add32, - .ctx = wrkmem, - .base = src, - }; - - return lz4_compressctx(hash, src, src_len, dst, dst_len); - } -} -EXPORT_SYMBOL(lz4_compress); - -MODULE_LICENSE("Dual BSD/GPL"); -MODULE_DESCRIPTION("LZ4 compressor"); diff --git a/libbcachefs/lz4_decompress.c b/libbcachefs/lz4_decompress.c index 77c9c391..9e809f97 100644 --- a/libbcachefs/lz4_decompress.c +++ b/libbcachefs/lz4_decompress.c @@ -43,7 +43,110 @@ #endif #include "lz4.h" -#include "lz4defs.h" + +/* + * Detects 64 bits mode + */ +#if defined(CONFIG_64BIT) +#define LZ4_ARCH64 1 +#else +#define LZ4_ARCH64 0 +#endif + +#include +#include +#include + +#define A32(_p) get_unaligned((u32 *) (_p)) +#define A16(_p) get_unaligned((u16 *) (_p)) + +#define GET_LE16_ADVANCE(_src) \ +({ \ + u16 _r = get_unaligned_le16(_src); \ + (_src) += 2; \ + _r; \ +}) + +#define PUT_LE16_ADVANCE(_dst, _v) \ +do { \ + put_unaligned_le16((_v), (_dst)); \ + (_dst) += 2; \ +} while (0) + +#define LENGTH_LONG 15 +#define COPYLENGTH 8 +#define ML_BITS 4 +#define ML_MASK ((1U << ML_BITS) - 1) +#define RUN_BITS (8 - ML_BITS) +#define RUN_MASK ((1U << RUN_BITS) - 1) +#define MEMORY_USAGE 14 +#define MINMATCH 4 +#define SKIPSTRENGTH 6 +#define LASTLITERALS 5 +#define MFLIMIT (COPYLENGTH + MINMATCH) +#define MINLENGTH (MFLIMIT + 1) +#define MAXD_LOG 16 +#define MAXD (1 << MAXD_LOG) +#define MAXD_MASK (u32)(MAXD - 1) +#define MAX_DISTANCE (MAXD - 1) +#define HASH_LOG (MAXD_LOG - 1) +#define HASHTABLESIZE (1 << HASH_LOG) +#define MAX_NB_ATTEMPTS 256 +#define OPTIMAL_ML (int)((ML_MASK-1)+MINMATCH) +#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT - 1)) + +#define __HASH_VALUE(p, bits) \ + (((A32(p)) * 2654435761U) >> (32 - (bits))) + +#define HASH_VALUE(p) __HASH_VALUE(p, HASH_LOG) + +#define MEMCPY_ADVANCE(_dst, _src, length) \ +do { \ + typeof(length) _length = 
(length); \ + memcpy(_dst, _src, _length); \ + _src += _length; \ + _dst += _length; \ +} while (0) + +#define MEMCPY_ADVANCE_BYTES(_dst, _src, _length) \ +do { \ + const u8 *_end = (_src) + (_length); \ + while ((_src) < _end) \ + *_dst++ = *_src++; \ +} while (0) + +#define STEPSIZE __SIZEOF_LONG__ + +#define LZ4_COPYPACKET(_src, _dst) \ +do { \ + MEMCPY_ADVANCE(_dst, _src, STEPSIZE); \ + MEMCPY_ADVANCE(_dst, _src, COPYLENGTH - STEPSIZE);\ +} while (0) + +/* + * Equivalent to MEMCPY_ADVANCE - except may overrun @_dst and @_src by + * COPYLENGTH: + * + * Note: src and dst may overlap (with src < dst) - we must do the copy in + * STEPSIZE chunks for correctness + * + * Note also: length may be negative - we must not call memcpy if length is + * negative, but still adjust dst and src by length + */ +#define MEMCPY_ADVANCE_CHUNKED(_dst, _src, _length) \ +do { \ + u8 *_end = (_dst) + (_length); \ + while ((_dst) < _end) \ + LZ4_COPYPACKET(_src, _dst); \ + _src -= (_dst) - _end; \ + _dst = _end; \ +} while (0) + +#define MEMCPY_ADVANCE_CHUNKED_NOFIXUP(_dst, _src, _end)\ +do { \ + while ((_dst) < (_end)) \ + LZ4_COPYPACKET((_src), (_dst)); \ +} while (0) static const int dec32table[8] = {0, 3, 2, 3, 0, 0, 0, 0}; #if LZ4_ARCH64 @@ -157,124 +260,8 @@ _output_error: return -1; } -static inline ssize_t get_length_safe(const u8 **ip, ssize_t length) -{ - if (length == 15) { - size_t len; - - do { - length += (len = *(*ip)++); - if (unlikely((ssize_t) length < 0)) - return -1; - - length += len; - } while (len == 255); - } - - return length; -} - -static int lz4_uncompress_unknownoutputsize(const u8 *source, u8 *dest, - int isize, size_t maxoutputsize) -{ - const u8 *ip = source; - const u8 *const iend = ip + isize; - const u8 *ref; - u8 *op = dest; - u8 * const oend = op + maxoutputsize; - u8 *cpy; - unsigned token, offset; - size_t length; - - /* Main Loop */ - while (ip < iend) { - /* get runlength */ - token = *ip++; - length = get_length_safe(&ip, token >> ML_BITS); - if (unlikely((ssize_t) length < 0)) - goto _output_error; - - /* copy literals */ - if ((op + length > oend - COPYLENGTH) || - (ip + length > iend - COPYLENGTH)) { - - if (op + length > oend) - goto _output_error;/* writes beyond buffer */ - - if (ip + length != iend) - goto _output_error;/* - * Error: LZ4 format requires - * to consume all input - * at this stage - */ - MEMCPY_ADVANCE(op, ip, length); - break;/* Necessarily EOF, due to parsing restrictions */ - } - MEMCPY_ADVANCE_CHUNKED(op, ip, length); - - /* get match offset */ - offset = GET_LE16_ADVANCE(ip); - ref = op - offset; - - /* Error: offset create reference outside destination buffer */ - if (ref < (u8 * const) dest) - goto _output_error; - - /* get match length */ - length = get_length_safe(&ip, token & ML_MASK); - if (unlikely((ssize_t) length < 0)) - goto _output_error; - - length += MINMATCH; - - /* copy first STEPSIZE bytes of match: */ - if (unlikely(offset < STEPSIZE)) { - MEMCPY_ADVANCE_BYTES(op, ref, 4); - ref -= dec32table[offset]; - - memcpy(op, ref, 4); - op += STEPSIZE - 4; - ref -= dec64table[offset]; - } else { - MEMCPY_ADVANCE(op, ref, STEPSIZE); - } - length -= STEPSIZE; - - /* copy rest of match: */ - cpy = op + length; - if (cpy > oend - COPYLENGTH) { - /* Error: request to write beyond destination buffer */ - if (cpy > oend || - ref + COPYLENGTH > oend) - goto _output_error; -#if !LZ4_ARCH64 - if (op + COPYLENGTH > oend) - goto _output_error; -#endif - MEMCPY_ADVANCE_CHUNKED_NOFIXUP(op, ref, oend - COPYLENGTH); - while (op < cpy) - *op++ = *ref++; 
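/* (Byte-at-a-time here both to stay inside the output buffer near @oend and
 * because the match may overlap its own output (ref < op): with a small
 * offset the loop re-reads bytes it has only just written - offset 1 with a
 * long length replicates one byte, RLE-style - which a block memcpy() could
 * not do safely. Same rationale as the MEMCPY_ADVANCE_CHUNKED comment above.) */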
- op = cpy; - /* - * Check EOF (should never happen, since last 5 bytes - * are supposed to be literals) - */ - if (op == oend) - goto _output_error; - } else { - MEMCPY_ADVANCE_CHUNKED(op, ref, length); - } - } - /* end of decoding */ - return op - dest; - - /* write overflow error detected */ -_output_error: - return -1; -} - -int lz4_decompress(const unsigned char *src, size_t *src_len, - unsigned char *dest, size_t actual_dest_len) +int bch2_lz4_decompress(const unsigned char *src, size_t *src_len, + unsigned char *dest, size_t actual_dest_len) { int ret = -1; int input_len = 0; @@ -288,29 +275,3 @@ int lz4_decompress(const unsigned char *src, size_t *src_len, exit_0: return ret; } -#ifndef STATIC -EXPORT_SYMBOL(lz4_decompress); -#endif - -int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, - unsigned char *dest, size_t *dest_len) -{ - int ret = -1; - int out_len = 0; - - out_len = lz4_uncompress_unknownoutputsize(src, dest, src_len, - *dest_len); - if (out_len < 0) - goto exit_0; - *dest_len = out_len; - - return 0; -exit_0: - return ret; -} -#ifndef STATIC -EXPORT_SYMBOL(lz4_decompress_unknownoutputsize); - -MODULE_LICENSE("Dual BSD/GPL"); -MODULE_DESCRIPTION("LZ4 Decompressor"); -#endif diff --git a/libbcachefs/lz4defs.h b/libbcachefs/lz4defs.h deleted file mode 100644 index 29f70f91..00000000 --- a/libbcachefs/lz4defs.h +++ /dev/null @@ -1,182 +0,0 @@ -/* - * lz4defs.h -- architecture specific defines - * - * Copyright (C) 2013, LG Electronics, Kyungsik Lee - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -/* - * Detects 64 bits mode - */ -#if defined(CONFIG_64BIT) -#define LZ4_ARCH64 1 -#else -#define LZ4_ARCH64 0 -#endif - -#include -#include -#include - -#define A32(_p) get_unaligned((u32 *) (_p)) -#define A16(_p) get_unaligned((u16 *) (_p)) - -#define GET_LE16_ADVANCE(_src) \ -({ \ - u16 _r = get_unaligned_le16(_src); \ - (_src) += 2; \ - _r; \ -}) - -#define PUT_LE16_ADVANCE(_dst, _v) \ -do { \ - put_unaligned_le16((_v), (_dst)); \ - (_dst) += 2; \ -} while (0) - -#define LENGTH_LONG 15 -#define COPYLENGTH 8 -#define ML_BITS 4 -#define ML_MASK ((1U << ML_BITS) - 1) -#define RUN_BITS (8 - ML_BITS) -#define RUN_MASK ((1U << RUN_BITS) - 1) -#define MEMORY_USAGE 14 -#define MINMATCH 4 -#define SKIPSTRENGTH 6 -#define LASTLITERALS 5 -#define MFLIMIT (COPYLENGTH + MINMATCH) -#define MINLENGTH (MFLIMIT + 1) -#define MAXD_LOG 16 -#define MAXD (1 << MAXD_LOG) -#define MAXD_MASK (u32)(MAXD - 1) -#define MAX_DISTANCE (MAXD - 1) -#define HASH_LOG (MAXD_LOG - 1) -#define HASHTABLESIZE (1 << HASH_LOG) -#define MAX_NB_ATTEMPTS 256 -#define OPTIMAL_ML (int)((ML_MASK-1)+MINMATCH) -#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT - 1)) - -#define __HASH_VALUE(p, bits) \ - (((A32(p)) * 2654435761U) >> (32 - (bits))) - -#define HASH_VALUE(p) __HASH_VALUE(p, HASH_LOG) - -#define MEMCPY_ADVANCE(_dst, _src, length) \ -do { \ - typeof(length) _length = (length); \ - memcpy(_dst, _src, _length); \ - _src += _length; \ - _dst += _length; \ -} while (0) - -#define MEMCPY_ADVANCE_BYTES(_dst, _src, _length) \ -do { \ - const u8 *_end = (_src) + (_length); \ - while ((_src) < _end) \ - *_dst++ = *_src++; \ -} while (0) - -#define STEPSIZE __SIZEOF_LONG__ - -#define LZ4_COPYPACKET(_src, _dst) \ -do { \ - MEMCPY_ADVANCE(_dst, _src, STEPSIZE); \ - MEMCPY_ADVANCE(_dst, _src, COPYLENGTH - STEPSIZE);\ -} while (0) - -/* - * Equivalent to 
MEMCPY_ADVANCE - except may overrun @_dst and @_src by - * COPYLENGTH: - * - * Note: src and dst may overlap (with src < dst) - we must do the copy in - * STEPSIZE chunks for correctness - * - * Note also: length may be negative - we must not call memcpy if length is - * negative, but still adjust dst and src by length - */ -#define MEMCPY_ADVANCE_CHUNKED(_dst, _src, _length) \ -do { \ - u8 *_end = (_dst) + (_length); \ - while ((_dst) < _end) \ - LZ4_COPYPACKET(_src, _dst); \ - _src -= (_dst) - _end; \ - _dst = _end; \ -} while (0) - -#define MEMCPY_ADVANCE_CHUNKED_NOFIXUP(_dst, _src, _end)\ -do { \ - while ((_dst) < (_end)) \ - LZ4_COPYPACKET((_src), (_dst)); \ -} while (0) - -struct lz4_hashtable { -#if LZ4_ARCH64 - const u8 * const base; - u32 *table; -#else - const int base; - const u8 *table; -#endif -}; - -#if LZ4_ARCH64 -#define HTYPE u32 -#else /* 32-bit */ -#define HTYPE const u8* -#endif - -#ifdef __BIG_ENDIAN -#define LZ4_NBCOMMONBYTES(val) (__builtin_clzl(val) >> 3) -#else -#define LZ4_NBCOMMONBYTES(val) (__builtin_ctzl(val) >> 3) -#endif - -static inline unsigned common_length(const u8 *l, const u8 *r, - const u8 *const l_end) -{ - const u8 *l_start = l; - - while (likely(l <= l_end - sizeof(long))) { - unsigned long diff = - get_unaligned((unsigned long *) l) ^ - get_unaligned((unsigned long *) r); - - if (diff) - return l + LZ4_NBCOMMONBYTES(diff) - l_start; - - l += sizeof(long); - r += sizeof(long); - } -#if LZ4_ARCH64 - if (l <= l_end - 4 && A32(r) == A32(l)) { - l += 4; - r += 4; - } -#endif - if (l <= l_end - 2 && A16(r) == A16(l)) { - l += 2; - r += 2; - } - if (l <= l_end - 1 && *r == *l) { - l++; - r++; - } - - return l - l_start; -} - -static inline unsigned encode_length(u8 **op, unsigned length) -{ - if (length >= LENGTH_LONG) { - length -= LENGTH_LONG; - - for (; length > 254 ; length -= 255) - *(*op)++ = 255; - *(*op)++ = length; - return LENGTH_LONG; - } else - return length; -} diff --git a/libbcachefs/migrate.h b/libbcachefs/migrate.h index 81776bdc..9bdaa792 100644 --- a/libbcachefs/migrate.h +++ b/libbcachefs/migrate.h @@ -1,8 +1,8 @@ -#ifndef _BCACHE_MIGRATE_H -#define _BCACHE_MIGRATE_H +#ifndef _BCACHEFS_MIGRATE_H +#define _BCACHEFS_MIGRATE_H int bch2_move_data_off_device(struct bch_dev *); int bch2_move_metadata_off_device(struct bch_dev *); int bch2_flag_data_bad(struct bch_dev *); -#endif /* _BCACHE_MIGRATE_H */ +#endif /* _BCACHEFS_MIGRATE_H */ diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 73132a0d..f78cd72f 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -17,13 +17,13 @@ static struct bch_extent_ptr *bkey_find_ptr(struct bch_fs *c, struct bch_extent_ptr ptr) { struct bch_extent_ptr *ptr2; - unsigned bucket_bits = c->devs[ptr.dev]->bucket_bits; + struct bch_dev *ca = c->devs[ptr.dev]; extent_for_each_ptr(e, ptr2) if (ptr2->dev == ptr.dev && ptr2->gen == ptr.gen && - (ptr2->offset >> bucket_bits) == - (ptr.offset >> bucket_bits)) + PTR_BUCKET_NR(ca, ptr2) == + PTR_BUCKET_NR(ca, &ptr)) return ptr2; return NULL; diff --git a/libbcachefs/move.h b/libbcachefs/move.h index ed0b24c9..71edcf13 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -1,9 +1,8 @@ -#ifndef _BCACHE_MOVE_H -#define _BCACHE_MOVE_H +#ifndef _BCACHEFS_MOVE_H +#define _BCACHEFS_MOVE_H #include "buckets.h" #include "io_types.h" -#include "move_types.h" enum moving_flag_bitnos { MOVING_FLAG_BITNO_READ = 0, @@ -83,4 +82,4 @@ void bch2_move_ctxt_exit(struct moving_context *); void bch2_move_ctxt_init(struct moving_context *, struct bch_ratelimit *, unsigned); 
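Returning to the lz4defs.h removal above (its surviving pieces were folded into lz4_decompress.c now that the in-tree compressor is gone): the deleted common_length() found the length of the shared prefix of two buffers a machine word at a time - XOR two unaligned loads, and the lowest nonzero byte of the difference marks the first mismatch, located with ctz/8 on little endian (clz/8 on big endian, via LZ4_NBCOMMONBYTES()). A minimal standalone userspace sketch of the same scan, little endian assumed; the name common_prefix_len and the gcc/clang builtin are illustrative, not the kernel helper itself:

#include <stdint.h>
#include <string.h>

/* Hypothetical stand-in for the deleted common_length(): returns how many
 * leading bytes of l[0..max) and r[0..max) are equal. */
static unsigned common_prefix_len(const uint8_t *l, const uint8_t *r,
				  unsigned max)
{
	unsigned n = 0;

	while (n + sizeof(unsigned long) <= max) {
		unsigned long a, b;

		/* memcpy stands in for the kernel's get_unaligned() */
		memcpy(&a, l + n, sizeof(a));
		memcpy(&b, r + n, sizeof(b));
		if (a != b)
			return n + __builtin_ctzl(a ^ b) / 8;
		n += sizeof(unsigned long);
	}
	while (n < max && l[n] == r[n])
		n++;
	return n;
}

The deleted kernel version additionally special-cased 4-, 2- and 1-byte tails after the word loop rather than finishing byte by byte.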
-#endif /* _BCACHE_MOVE_H */ +#endif /* _BCACHEFS_MOVE_H */ diff --git a/libbcachefs/move_types.h b/libbcachefs/move_types.h deleted file mode 100644 index 0e2275e2..00000000 --- a/libbcachefs/move_types.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef _BCACHE_MOVE_TYPES_H -#define _BCACHE_MOVE_TYPES_H - -#endif /* _BCACHE_MOVE_TYPES_H */ diff --git a/libbcachefs/movinggc.h b/libbcachefs/movinggc.h index e27ccc35..d835d138 100644 --- a/libbcachefs/movinggc.h +++ b/libbcachefs/movinggc.h @@ -1,5 +1,5 @@ -#ifndef _BCACHE_MOVINGGC_H -#define _BCACHE_MOVINGGC_H +#ifndef _BCACHEFS_MOVINGGC_H +#define _BCACHEFS_MOVINGGC_H /* * We can't use the entire copygc reserve in one iteration of copygc: we may @@ -27,4 +27,4 @@ void bch2_moving_gc_stop(struct bch_dev *); int bch2_moving_gc_start(struct bch_dev *); void bch2_dev_moving_gc_init(struct bch_dev *); -#endif +#endif /* _BCACHEFS_MOVINGGC_H */ diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index b1bbf092..b5ae5aeb 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -32,6 +32,15 @@ const char * const bch2_str_hash_types[] = { NULL }; +const char * const bch2_data_types[] = { + "none", + "sb", + "journal", + "btree", + "data", + NULL +}; + const char * const bch2_cache_replacement_policies[] = { "lru", "fifo", @@ -224,7 +233,7 @@ enum bch_opt_id bch2_parse_sysfs_opt(const char *name, const char *val, } ssize_t bch2_opt_show(struct bch_opts *opts, const char *name, - char *buf, size_t size) + char *buf, size_t size) { int id = bch2_opt_lookup(name); const struct bch_option *opt; @@ -237,6 +246,6 @@ ssize_t bch2_opt_show(struct bch_opts *opts, const char *name, opt = &bch2_opt_table[id]; return opt->type == BCH_OPT_STR - ? bch2_snprint_string_list(buf, size, opt->choices, v) - : snprintf(buf, size, "%lli\n", v); + ? 
bch2_scnprint_string_list(buf, size, opt->choices, v) + : scnprintf(buf, size, "%lli", v); } diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 53eb15ad..667f629e 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -1,5 +1,5 @@ -#ifndef _BCACHE_OPTS_H -#define _BCACHE_OPTS_H +#ifndef _BCACHEFS_OPTS_H +#define _BCACHEFS_OPTS_H #include #include @@ -10,6 +10,7 @@ extern const char * const bch2_error_actions[]; extern const char * const bch2_csum_types[]; extern const char * const bch2_compression_types[]; extern const char * const bch2_str_hash_types[]; +extern const char * const bch2_data_types[]; extern const char * const bch2_cache_replacement_policies[]; extern const char * const bch2_cache_modes[]; extern const char * const bch2_dev_state[]; @@ -167,4 +168,4 @@ enum bch_opt_id bch2_parse_sysfs_opt(const char *, const char *, u64 *); ssize_t bch2_opt_show(struct bch_opts *, const char *, char *, size_t); -#endif /* _BCACHE_OPTS_H */ +#endif /* _BCACHEFS_OPTS_H */ diff --git a/libbcachefs/six.h b/libbcachefs/six.h index 01ed3385..0f319df6 100644 --- a/libbcachefs/six.h +++ b/libbcachefs/six.h @@ -1,6 +1,5 @@ - -#ifndef _BCACHE_SIX_H -#define _BCACHE_SIX_H +#ifndef _BCACHEFS_SIX_H +#define _BCACHEFS_SIX_H #include #include @@ -133,4 +132,4 @@ __SIX_LOCK(read) __SIX_LOCK(intent) __SIX_LOCK(write) -#endif /* _BCACHE_SIX_H */ +#endif /* _BCACHEFS_SIX_H */ diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index ab28b07a..d91fbdf1 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -1,5 +1,5 @@ -#ifndef _BCACHE_STR_HASH_H -#define _BCACHE_STR_HASH_H +#ifndef _BCACHEFS_STR_HASH_H +#define _BCACHEFS_STR_HASH_H #include "btree_iter.h" #include "btree_update.h" @@ -404,4 +404,4 @@ err: return ret; } -#endif /* _BCACHE_STR_HASH_H */ +#endif /* _BCACHEFS_STR_HASH_H */ diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index abcc933d..482ab572 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -314,16 +314,12 @@ const char *bch2_sb_validate(struct bcache_superblock *disk_sb) const char *err; u16 block_size; - switch (le64_to_cpu(sb->version)) { - case BCACHE_SB_VERSION_CDEV_V4: - break; - default: + if (le64_to_cpu(sb->version) < BCH_SB_VERSION_MIN || + le64_to_cpu(sb->version) > BCH_SB_VERSION_MAX) return"Unsupported superblock version"; - } - if (BCH_SB_INITIALIZED(sb) && - le64_to_cpu(sb->version) != BCACHE_SB_VERSION_CDEV_V4) - return "Unsupported superblock version"; + if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX) + SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb, 7); block_size = le16_to_cpu(sb->block_size); @@ -397,15 +393,22 @@ const char *bch2_sb_validate(struct bcache_superblock *disk_sb) sb_mi = bch2_sb_get_members(sb); mi = bch2_mi_to_cpu(sb_mi->members + sb->dev_idx); + if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX) { + struct bch_member *m; + + for (m = sb_mi->members; + m < sb_mi->members + sb->nr_devices; + m++) + SET_BCH_MEMBER_DATA_ALLOWED(m, ~0); + } + if (mi.nbuckets > LONG_MAX) return "Too many buckets"; if (mi.nbuckets - mi.first_bucket < 1 << 10) return "Not enough buckets"; - if (!is_power_of_2(mi.bucket_size) || - mi.bucket_size < PAGE_SECTORS || - mi.bucket_size < block_size) + if (mi.bucket_size < block_size) return "Bad bucket size"; if (get_capacity(disk_sb->bdev->bd_disk) < @@ -420,6 +423,8 @@ const char *bch2_sb_validate(struct bcache_superblock *disk_sb) if (err) return err; + sb->version = cpu_to_le64(BCH_SB_VERSION_MAX); + return NULL; } @@ -463,6 +468,7 @@ static void 
bch2_sb_update(struct bch_fs *c) c->sb.clean = BCH_SB_CLEAN(src); c->sb.str_hash_type = BCH_SB_STR_HASH_TYPE(src); c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); + c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src); c->sb.time_base_lo = le64_to_cpu(src->time_base_lo); c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); c->sb.time_precision = le32_to_cpu(src->time_precision); @@ -570,8 +576,9 @@ reread: if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) return "Not a bcachefs superblock"; - if (le64_to_cpu(sb->sb->version) != BCACHE_SB_VERSION_CDEV_V4) - return "Unsupported superblock version"; + if (le64_to_cpu(sb->sb->version) < BCH_SB_VERSION_MIN || + le64_to_cpu(sb->sb->version) > BCH_SB_VERSION_MAX) + return"Unsupported superblock version"; bytes = vstruct_bytes(sb->sb); @@ -729,6 +736,9 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META); bch2_bio_map(bio, sb); + this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_SB], + bio_sectors(bio)); + percpu_ref_get(&ca->io_ref); closure_bio_submit(bio, &c->sb_write); } @@ -784,7 +794,7 @@ void bch2_write_super(struct bch_fs *c) if (ca->sb_write_error) __clear_bit(ca->dev_idx, sb_written.d); - nr_wrote = bitmap_weight(sb_written.d, BCH_SB_MEMBERS_MAX); + nr_wrote = dev_mask_nr(&sb_written); can_mount_with_written = bch2_have_enough_devs(c, @@ -824,17 +834,6 @@ cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) return (void *) r->entries + r->entry_size * i; } -static inline struct bch_replicas_entry * -replicas_entry_next(struct bch_replicas_entry *i) -{ - return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr; -} - -#define for_each_replicas_entry(_r, _i) \ - for (_i = (_r)->entries; \ - (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ - (_i) = replicas_entry_next(_i)) - static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e, unsigned dev) { @@ -939,7 +938,7 @@ static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) } static void bkey_to_replicas(struct bkey_s_c_extent e, - enum bch_data_types data_type, + enum bch_data_type data_type, struct bch_replicas_cpu_entry *r, unsigned *max_dev) { @@ -967,7 +966,7 @@ static void bkey_to_replicas(struct bkey_s_c_extent e, static int bch2_update_gc_replicas(struct bch_fs *c, struct bch_replicas_cpu *gc_r, struct bkey_s_c_extent e, - enum bch_data_types data_type) + enum bch_data_type data_type) { struct bch_replicas_cpu_entry new_e; struct bch_replicas_cpu *new; @@ -1009,7 +1008,7 @@ static int bch2_update_gc_replicas(struct bch_fs *c, static bool replicas_has_extent(struct bch_replicas_cpu *r, struct bkey_s_c_extent e, - enum bch_data_types data_type) + enum bch_data_type data_type) { struct bch_replicas_cpu_entry search; unsigned max_dev; @@ -1023,7 +1022,7 @@ static bool replicas_has_extent(struct bch_replicas_cpu *r, } bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e, - enum bch_data_types data_type) + enum bch_data_type data_type) { bool ret; @@ -1038,7 +1037,7 @@ bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e, noinline static int bch2_check_mark_super_slowpath(struct bch_fs *c, struct bkey_s_c_extent e, - enum bch_data_types data_type) + enum bch_data_type data_type) { struct bch_replicas_cpu *gc_r; const struct bch_extent_ptr *ptr; @@ -1103,7 +1102,7 @@ err: } int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e, - enum bch_data_types data_type) + enum bch_data_type data_type) { 
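	/* (On the replicas_entry_next()/for_each_replicas_entry pair moved
	 * out of this file into super-io.h: struct bch_replicas_entry is
	 * variable length - after the fixed header come i->nr one-byte
	 * device indexes - so the next entry starts
	 * offsetof(struct bch_replicas_entry, devs) + i->nr bytes in, and
	 * the walk stops at the first entry whose data_type is zero.) */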
struct bch_replicas_cpu *gc_r; bool marked; diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h index e5e865a9..ed27dd0f 100644 --- a/libbcachefs/super-io.h +++ b/libbcachefs/super-io.h @@ -1,5 +1,5 @@ -#ifndef _BCACHE_SUPER_IO_H -#define _BCACHE_SUPER_IO_H +#ifndef _BCACHEFS_SUPER_IO_H +#define _BCACHEFS_SUPER_IO_H #include "extents.h" #include "eytzinger.h" @@ -104,6 +104,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) .tier = BCH_MEMBER_TIER(mi), .replacement = BCH_MEMBER_REPLACEMENT(mi), .discard = BCH_MEMBER_DISCARD(mi), + .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), .valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)), }; } @@ -122,10 +123,25 @@ const char *bch2_read_super(struct bcache_superblock *, struct bch_opts, const char *); void bch2_write_super(struct bch_fs *); +/* replicas: */ + +/* iterate over bch_sb_field_replicas: */ + +static inline struct bch_replicas_entry * +replicas_entry_next(struct bch_replicas_entry *i) +{ + return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr; +} + +#define for_each_replicas_entry(_r, _i) \ + for (_i = (_r)->entries; \ + (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ + (_i) = replicas_entry_next(_i)) + bool bch2_sb_has_replicas(struct bch_fs *, struct bkey_s_c_extent, - enum bch_data_types); + enum bch_data_type); int bch2_check_mark_super(struct bch_fs *, struct bkey_s_c_extent, - enum bch_data_types); + enum bch_data_type); struct replicas_status { struct { @@ -145,4 +161,4 @@ unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); int bch2_replicas_gc_end(struct bch_fs *, int); int bch2_replicas_gc_start(struct bch_fs *, unsigned); -#endif /* _BCACHE_SUPER_IO_H */ +#endif /* _BCACHEFS_SUPER_IO_H */ diff --git a/libbcachefs/super.c b/libbcachefs/super.c index ad388425..951053f7 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -100,7 +100,7 @@ struct bch_fs *bch2_bdev_to_fs(struct block_device *bdev) rcu_read_lock(); list_for_each_entry(c, &bch_fs_list, list) - for_each_member_device_rcu(ca, c, i) + for_each_member_device_rcu(ca, c, i, NULL) if (ca->disk_sb.bdev == bdev) { closure_get(&c->cl); goto found; @@ -159,10 +159,11 @@ int bch2_congested(struct bch_fs *c, int bdi_bits) } else { /* Writes prefer fastest tier: */ struct bch_tier *tier = READ_ONCE(c->fastest_tier); - struct dev_group *grp = tier ? &tier->devs : &c->all_devs; + struct bch_devs_mask *devs = + tier ? 
&tier->devs : &c->rw_devs[BCH_DATA_USER]; rcu_read_lock(); - group_for_each_dev(ca, grp, i) { + for_each_member_device_rcu(ca, c, i, devs) { bdi = ca->disk_sb.bdev->bd_bdi; if (bdi_congested(bdi, bdi_bits)) { @@ -554,6 +555,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto err; } + c->block_bits = ilog2(c->sb.block_size); + mutex_unlock(&c->sb_lock); scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid); @@ -564,8 +567,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->opts.nochanges |= c->opts.noreplay; c->opts.read_only |= c->opts.nochanges; - c->block_bits = ilog2(c->sb.block_size); - if (bch2_fs_init_fault("fs_alloc")) goto err; @@ -590,7 +591,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mempool_init_page_pool(&c->bio_bounce_pages, max_t(unsigned, c->sb.btree_node_size, - BCH_ENCODED_EXTENT_MAX) / + c->sb.encoded_extent_max) / PAGE_SECTORS, 0) || !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) || lg_lock_init(&c->usage_lock) || @@ -662,7 +663,7 @@ static const char *__bch2_fs_online(struct bch_fs *c) mutex_lock(&c->state_lock); err = "error creating sysfs objects"; - __for_each_member_device(ca, c, i) + __for_each_member_device(ca, c, i, NULL) if (bch2_dev_sysfs_online(ca)) goto err; @@ -692,7 +693,6 @@ static const char *__bch2_fs_start(struct bch_fs *c) LIST_HEAD(journal); struct jset *j; struct closure cl; - u64 journal_seq = 0; time64_t now; unsigned i; int ret = -EINVAL; @@ -790,17 +790,6 @@ static const char *__bch2_fs_start(struct bch_fs *c) if (ret) goto err; bch_verbose(c, "fsck done"); - - for_each_rw_member(ca, c, i) - if (ca->need_alloc_write) { - ret = bch2_alloc_write(c, ca, &journal_seq); - if (ret) { - percpu_ref_put(&ca->io_ref); - goto err; - } - } - - bch2_journal_flush_seq(&c->journal, journal_seq); } else { struct bch_inode_unpacked inode; struct bkey_inode_buf packed_inode; @@ -842,7 +831,7 @@ static const char *__bch2_fs_start(struct bch_fs *c) bch2_inode_init(c, &inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0); - inode.inum = BCACHE_ROOT_INO; + inode.inum = BCACHEFS_ROOT_INO; bch2_inode_pack(&packed_inode, &inode); @@ -878,7 +867,6 @@ recovery_done: SET_BCH_SB_INITIALIZED(c->disk_sb, true); SET_BCH_SB_CLEAN(c->disk_sb, false); - c->disk_sb->version = BCACHE_SB_VERSION_CDEV; bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -988,9 +976,10 @@ static void bch2_dev_free(struct bch_dev *ca) bch2_free_super(&ca->disk_sb); bch2_dev_journal_exit(ca); - free_percpu(ca->sectors_written); + free_percpu(ca->io_done); bioset_exit(&ca->replica_set); free_percpu(ca->usage_percpu); + kvpfree(ca->bucket_dirty, BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); kvpfree(ca->buckets, ca->mi.nbuckets * sizeof(struct bucket)); kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8)); free_heap(&ca->copygc_heap); @@ -1108,10 +1097,10 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) init_completion(&ca->stop_complete); init_completion(&ca->offline_complete); - spin_lock_init(&ca->self.lock); - ca->self.nr = 1; - rcu_assign_pointer(ca->self.d[0].dev, ca); ca->dev_idx = dev_idx; + __set_bit(ca->dev_idx, ca->self.d); + + ca->copygc_write_point.type = BCH_DATA_USER; spin_lock_init(&ca->freelist_lock); bch2_dev_moving_gc_init(ca); @@ -1125,7 +1114,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ca->mi = bch2_mi_to_cpu(member); ca->uuid = member->uuid; - ca->bucket_bits = ilog2(ca->mi.bucket_size); scnprintf(ca->name, 
sizeof(ca->name), "dev-%u", dev_idx); /* XXX: tune these */ @@ -1161,10 +1149,13 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) !(ca->buckets = kvpmalloc(ca->mi.nbuckets * sizeof(struct bucket), GFP_KERNEL|__GFP_ZERO)) || + !(ca->bucket_dirty = kvpmalloc(BITS_TO_LONGS(ca->mi.nbuckets) * + sizeof(unsigned long), + GFP_KERNEL|__GFP_ZERO)) || !(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) || bioset_init(&ca->replica_set, 4, offsetof(struct bch_write_bio, bio)) || - !(ca->sectors_written = alloc_percpu(*ca->sectors_written))) + !(ca->io_done = alloc_percpu(*ca->io_done))) goto err; total_reserve = ca->free_inc.size; @@ -1172,7 +1163,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) total_reserve += ca->free[i].size; ca->copygc_write_point.group = &ca->self; - ca->tiering_write_point.group = &ca->self; ca->fs = c; rcu_assign_pointer(c->devs[ca->dev_idx], ca); @@ -1238,19 +1228,8 @@ static int __bch2_dev_online(struct bch_fs *c, struct bcache_superblock *sb) bch2_mark_dev_metadata(c, ca); lg_local_unlock(&c->usage_lock); - if (ca->mi.state == BCH_MEMBER_STATE_RW) { - struct bch_sb_field_journal *journal_buckets = - bch2_sb_get_journal(ca->disk_sb.sb); - bool has_journal = - bch2_nr_journal_buckets(journal_buckets) >= - BCH_JOURNAL_BUCKETS_MIN; - - bch2_dev_group_add(&c->tiers[ca->mi.tier].devs, ca); - bch2_dev_group_add(&c->all_devs, ca); - - if (has_journal) - bch2_dev_group_add(&c->journal.devs, ca); - } + if (ca->mi.state == BCH_MEMBER_STATE_RW) + bch2_dev_allocator_add(c, ca); percpu_ref_reinit(&ca->io_ref); return 0; diff --git a/libbcachefs/super.h b/libbcachefs/super.h index 54f60c62..18e36c08 100644 --- a/libbcachefs/super.h +++ b/libbcachefs/super.h @@ -1,35 +1,28 @@ -#ifndef _BCACHE_SUPER_H -#define _BCACHE_SUPER_H +#ifndef _BCACHEFS_SUPER_H +#define _BCACHEFS_SUPER_H #include "extents.h" #include "bcachefs_ioctl.h" +#include + static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) { - return s >> ca->bucket_bits; + return div_u64(s, ca->mi.bucket_size); } static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) { - return ((sector_t) b) << ca->bucket_bits; + return ((sector_t) b) * ca->mi.bucket_size; } static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) { - return s & (ca->mi.bucket_size - 1); -} + u32 remainder; -static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter) -{ - struct bch_dev *ca = NULL; - - while (*iter < c->sb.nr_devices && - !(ca = rcu_dereference_check(c->devs[*iter], - lockdep_is_held(&c->state_lock)))) - (*iter)++; - - return ca; + div_u64_rem(s, ca->mi.bucket_size, &remainder); + return remainder; } static inline bool bch2_dev_is_online(struct bch_dev *ca) @@ -37,18 +30,38 @@ static inline bool bch2_dev_is_online(struct bch_dev *ca) return !percpu_ref_is_zero(&ca->io_ref); } -#define __for_each_member_device(ca, c, iter) \ - for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter))); (iter)++) +static inline unsigned dev_mask_nr(struct bch_devs_mask *devs) +{ + return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); +} -#define for_each_member_device_rcu(ca, c, iter) \ - __for_each_member_device(ca, c, iter) +static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, + struct bch_devs_mask *mask) +{ + struct bch_dev *ca = NULL; + + while ((*iter = mask + ? 
find_next_bit(mask->d, c->sb.nr_devices, *iter) + : *iter) < c->sb.nr_devices && + !(ca = rcu_dereference_check(c->devs[*iter], + lockdep_is_held(&c->state_lock)))) + (*iter)++; + + return ca; +} + +#define __for_each_member_device(ca, c, iter, mask) \ + for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) + +#define for_each_member_device_rcu(ca, c, iter, mask) \ + __for_each_member_device(ca, c, iter, mask) static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) { struct bch_dev *ca; rcu_read_lock(); - if ((ca = __bch2_next_dev(c, iter))) + if ((ca = __bch2_next_dev(c, iter, NULL))) percpu_ref_get(&ca->ref); rcu_read_unlock(); @@ -70,7 +83,7 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, struct bch_dev *ca; rcu_read_lock(); - while ((ca = __bch2_next_dev(c, iter)) && + while ((ca = __bch2_next_dev(c, iter, NULL)) && (!((1 << ca->mi.state) & state_mask) || !percpu_ref_tryget(&ca->io_ref))) (*iter)++; @@ -94,6 +107,7 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, __for_each_online_member(ca, c, iter, \ (1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO)) +/* XXX kill, move to struct bch_fs */ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) { struct bch_devs_mask devs; @@ -135,4 +149,4 @@ const char *bch2_fs_open(char * const *, unsigned, struct bch_opts, struct bch_fs **); const char *bch2_fs_open_incremental(const char *path); -#endif /* _BCACHE_SUPER_H */ +#endif /* _BCACHEFS_SUPER_H */ diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h index 9f79d8a1..579929ac 100644 --- a/libbcachefs/super_types.h +++ b/libbcachefs/super_types.h @@ -1,5 +1,5 @@ -#ifndef _BCACHE_SUPER_TYPES_H -#define _BCACHE_SUPER_TYPES_H +#ifndef _BCACHEFS_SUPER_TYPES_H +#define _BCACHEFS_SUPER_TYPES_H struct bcache_superblock { struct bch_sb *sb; @@ -13,4 +13,4 @@ struct bch_devs_mask { unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)]; }; -#endif /* _BCACHE_SUPER_TYPES_H */ +#endif /* _BCACHEFS_SUPER_TYPES_H */ diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 9e9ef4cd..ff3deba8 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -5,6 +5,8 @@ * Copyright 2012 Google, Inc. */ +#ifndef NO_BCACHEFS_SYSFS + #include "bcachefs.h" #include "alloc.h" #include "compress.h" @@ -53,7 +55,7 @@ static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ #define sysfs_printf(file, fmt, ...) 
\ do { \ if (attr == &sysfs_ ## file) \ - return snprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__); \ + return scnprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);\ } while (0) #define sysfs_print(file, var) \ @@ -134,6 +136,7 @@ read_attribute(block_size); read_attribute(btree_node_size); read_attribute(first_bucket); read_attribute(nbuckets); +read_attribute(iostats); read_attribute(read_priority_stats); read_attribute(write_priority_stats); read_attribute(fragmentation_stats); @@ -141,9 +144,6 @@ read_attribute(oldest_gen_stats); read_attribute(reserve_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); -read_attribute(written); -read_attribute(btree_written); -read_attribute(metadata_written); read_attribute(journal_debug); read_attribute(journal_pins); @@ -160,7 +160,6 @@ read_attribute(cached_buckets); read_attribute(meta_buckets); read_attribute(alloc_buckets); read_attribute(has_data); -read_attribute(has_metadata); read_attribute(alloc_debug); read_attribute(read_realloc_races); @@ -301,7 +300,7 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) } bch2_btree_iter_unlock(&iter); - return snprintf(buf, PAGE_SIZE, + return scnprintf(buf, PAGE_SIZE, "uncompressed data:\n" " nr extents: %llu\n" " size (bytes): %llu\n" @@ -527,9 +526,13 @@ struct attribute *bch2_fs_internal_files[] = { SHOW(bch2_fs_opts_dir) { + char *out = buf, *end = buf + PAGE_SIZE; struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); - return bch2_opt_show(&c->opts, attr->name, buf, PAGE_SIZE); + out += bch2_opt_show(&c->opts, attr->name, out, end - out); + out += scnprintf(out, end - out, "\n"); + + return out - buf; } STORE(bch2_fs_opts_dir) @@ -728,15 +731,32 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) c->open_buckets_wait.list.first ? 
"waiting" : "empty"); } -static u64 sectors_written(struct bch_dev *ca) +const char * const bch2_rw[] = { + "read", + "write", + NULL +}; + +static ssize_t show_dev_iostats(struct bch_dev *ca, char *buf) { - u64 ret = 0; - int cpu; + char *out = buf, *end = buf + PAGE_SIZE; + int rw, i, cpu; - for_each_possible_cpu(cpu) - ret += *per_cpu_ptr(ca->sectors_written, cpu); + for (rw = 0; rw < 2; rw++) { + out += scnprintf(out, end - out, "%s:\n", bch2_rw[rw]); - return ret; + for (i = 1; i < BCH_DATA_NR; i++) { + u64 n = 0; + + for_each_possible_cpu(cpu) + n += per_cpu_ptr(ca->io_done, cpu)->sectors[rw][i]; + + out += scnprintf(out, end - out, "%-12s:%12llu\n", + bch2_data_types[i], n << 9); + } + } + + return out - buf; } SHOW(bch2_dev) @@ -744,6 +764,7 @@ SHOW(bch2_dev) struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); struct bch_fs *c = ca->fs; struct bch_dev_usage stats = bch2_dev_usage_read(ca); + char *out = buf, *end = buf + PAGE_SIZE; sysfs_printf(uuid, "%pU\n", ca->uuid.b); @@ -752,12 +773,6 @@ SHOW(bch2_dev) sysfs_print(first_bucket, ca->mi.first_bucket); sysfs_print(nbuckets, ca->mi.nbuckets); sysfs_print(discard, ca->mi.discard); - sysfs_hprint(written, sectors_written(ca) << 9); - sysfs_hprint(btree_written, - atomic64_read(&ca->btree_sectors_written) << 9); - sysfs_hprint(metadata_written, - (atomic64_read(&ca->meta_sectors_written) + - atomic64_read(&ca->btree_sectors_written)) << 9); sysfs_hprint(dirty_data, stats.sectors[S_DIRTY] << 9); sysfs_print(dirty_bytes, stats.sectors[S_DIRTY] << 9); @@ -769,26 +784,37 @@ SHOW(bch2_dev) sysfs_print(alloc_buckets, stats.buckets_alloc); sysfs_print(available_buckets, dev_buckets_available(ca)); sysfs_print(free_buckets, dev_buckets_free(ca)); - sysfs_print(has_data, bch2_dev_has_data(c, ca) & - (1 << BCH_DATA_USER)); - sysfs_print(has_metadata, bch2_dev_has_data(c, ca) & - ((1 << BCH_DATA_JOURNAL)| - (1 << BCH_DATA_BTREE))); + + if (attr == &sysfs_has_data) { + out += bch2_scnprint_flag_list(out, end - out, + bch2_data_types, + bch2_dev_has_data(c, ca)); + out += scnprintf(out, end - out, "\n"); + return out - buf; + } sysfs_pd_controller_show(copy_gc, &ca->moving_gc_pd); - if (attr == &sysfs_cache_replacement_policy) - return bch2_snprint_string_list(buf, PAGE_SIZE, - bch2_cache_replacement_policies, - ca->mi.replacement); + if (attr == &sysfs_cache_replacement_policy) { + out += bch2_scnprint_string_list(out, end - out, + bch2_cache_replacement_policies, + ca->mi.replacement); + out += scnprintf(out, end - out, "\n"); + return out - buf; + } sysfs_print(tier, ca->mi.tier); - if (attr == &sysfs_state_rw) - return bch2_snprint_string_list(buf, PAGE_SIZE, - bch2_dev_state, - ca->mi.state); + if (attr == &sysfs_state_rw) { + out += bch2_scnprint_string_list(out, end - out, + bch2_dev_state, + ca->mi.state); + out += scnprintf(out, end - out, "\n"); + return out - buf; + } + if (attr == &sysfs_iostats) + return show_dev_iostats(ca, buf); if (attr == &sysfs_read_priority_stats) return show_quantiles(ca, buf, bucket_priority_fn, (void *) 0); if (attr == &sysfs_write_priority_stats) @@ -859,8 +885,8 @@ STORE(bch2_dev) SET_BCH_MEMBER_TIER(mi, v); bch2_write_super(c); - bch2_dev_group_remove(&c->tiers[prev_tier].devs, ca); - bch2_dev_group_add(&c->tiers[ca->mi.tier].devs, ca); + clear_bit(ca->dev_idx, c->tiers[prev_tier].devs.d); + set_bit(ca->dev_idx, c->tiers[ca->mi.tier].devs.d); mutex_unlock(&c->sb_lock); bch2_recalc_capacity(c); @@ -885,12 +911,7 @@ struct attribute *bch2_dev_files[] = { &sysfs_state_rw, &sysfs_has_data, - 
&sysfs_has_metadata, - - /* io stats: */ - &sysfs_written, - &sysfs_btree_written, - &sysfs_metadata_written, + &sysfs_iostats, /* alloc info - data: */ &sysfs_dirty_data, @@ -919,3 +940,5 @@ struct attribute *bch2_dev_files[] = { sysfs_pd_controller_files(copy_gc), NULL }; + +#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/libbcachefs/sysfs.h b/libbcachefs/sysfs.h index c0b8034e..a4825056 100644 --- a/libbcachefs/sysfs.h +++ b/libbcachefs/sysfs.h @@ -1,9 +1,9 @@ -#ifndef _BCACHE_SYSFS_H_ -#define _BCACHE_SYSFS_H_ +#ifndef _BCACHEFS_SYSFS_H_ +#define _BCACHEFS_SYSFS_H_ #include -#ifndef NO_BCACHE_SYSFS +#ifndef NO_BCACHEFS_SYSFS struct attribute; struct sysfs_ops; @@ -34,6 +34,6 @@ static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; static const struct sysfs_ops bch2_dev_sysfs_ops; -#endif +#endif /* NO_BCACHEFS_SYSFS */ -#endif /* _BCACHE_SYSFS_H_ */ +#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/libbcachefs/tier.c b/libbcachefs/tier.c index 6bc20845..b68cae75 100644 --- a/libbcachefs/tier.c +++ b/libbcachefs/tier.c @@ -24,7 +24,7 @@ struct tiering_state { }; static bool tiering_pred(struct bch_fs *c, - struct tiering_state *s, + struct bch_tier *tier, struct bkey_s_c k) { if (bkey_extent_is_data(k.k)) { @@ -38,7 +38,7 @@ static bool tiering_pred(struct bch_fs *c, return false; extent_for_each_ptr(e, ptr) - if (c->devs[ptr->dev]->mi.tier >= s->tier->idx) + if (c->devs[ptr->dev]->mi.tier >= tier->idx) replicas++; return replicas < c->opts.data_replicas; @@ -47,49 +47,18 @@ static bool tiering_pred(struct bch_fs *c, return false; } -static void tier_put_device(struct tiering_state *s) -{ - if (s->ca) - percpu_ref_put(&s->ca->io_ref); - s->ca = NULL; -} - -/** - * refill_next - move on to refilling the next cache's tiering keylist - */ -static void tier_next_device(struct bch_fs *c, struct tiering_state *s) -{ - if (!s->ca || s->sectors > s->stripe_size) { - tier_put_device(s); - s->sectors = 0; - s->dev_idx++; - - spin_lock(&s->tier->devs.lock); - if (s->dev_idx >= s->tier->devs.nr) - s->dev_idx = 0; - - if (s->tier->devs.nr) { - s->ca = s->tier->devs.d[s->dev_idx].dev; - percpu_ref_get(&s->ca->io_ref); - } - spin_unlock(&s->tier->devs.lock); - } -} - static int issue_tiering_move(struct bch_fs *c, - struct tiering_state *s, + struct bch_tier *tier, struct moving_context *ctxt, struct bkey_s_c k) { int ret; - ret = bch2_data_move(c, ctxt, &s->ca->tiering_write_point, k, NULL); - if (!ret) { + ret = bch2_data_move(c, ctxt, &tier->wp, k, NULL); + if (!ret) trace_tiering_copy(k.k); - s->sectors += k.k->size; - } else { + else trace_tiering_alloc_fail(c, k.k->size); - } return ret; } @@ -101,10 +70,9 @@ static int issue_tiering_move(struct bch_fs *c, static s64 read_tiering(struct bch_fs *c, struct bch_tier *tier) { struct moving_context ctxt; - struct tiering_state s; struct btree_iter iter; struct bkey_s_c k; - unsigned nr_devices = READ_ONCE(tier->devs.nr); + unsigned nr_devices = dev_mask_nr(&tier->devs); int ret; if (!nr_devices) @@ -112,10 +80,6 @@ static s64 read_tiering(struct bch_fs *c, struct bch_tier *tier) trace_tiering_start(c); - memset(&s, 0, sizeof(s)); - s.tier = tier; - s.stripe_size = 2048; /* 1 mb for now */ - bch2_move_ctxt_init(&ctxt, &tier->pd.rate, nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE); bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, @@ -125,14 +89,10 @@ static s64 read_tiering(struct bch_fs *c, struct bch_tier *tier) !bch2_move_ctxt_wait(&ctxt) && (k = bch2_btree_iter_peek(&iter)).k && 
!btree_iter_err(k)) { - if (!tiering_pred(c, &s, k)) + if (!tiering_pred(c, tier, k)) goto next; - tier_next_device(c, &s); - if (!s.ca) - break; - - ret = issue_tiering_move(c, &s, &ctxt, k); + ret = issue_tiering_move(c, tier, &ctxt, k); if (ret) { bch2_btree_iter_unlock(&iter); @@ -150,7 +110,6 @@ next: } bch2_btree_iter_unlock(&iter); - tier_put_device(&s); bch2_move_ctxt_exit(&ctxt); trace_tiering_end(c, ctxt.sectors_moved, ctxt.keys_moved); @@ -171,7 +130,7 @@ static int bch2_tiering_thread(void *arg) while (!kthread_should_stop()) { if (kthread_wait_freezable(c->tiering_enabled && - tier->devs.nr)) + dev_mask_nr(&tier->devs))) break; while (1) { @@ -183,15 +142,18 @@ static int bch2_tiering_thread(void *arg) for (faster_tier = c->tiers; faster_tier != tier; faster_tier++) { - spin_lock(&faster_tier->devs.lock); - group_for_each_dev(ca, &faster_tier->devs, i) { + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, + &faster_tier->devs) { tier_capacity += - (ca->mi.nbuckets - - ca->mi.first_bucket) << ca->bucket_bits; + bucket_to_sector(ca, + ca->mi.nbuckets - + ca->mi.first_bucket); available_sectors += - dev_buckets_available(ca) << ca->bucket_bits; + bucket_to_sector(ca, + dev_buckets_available(ca)); } - spin_unlock(&faster_tier->devs.lock); + rcu_read_unlock(); } if (available_sectors < (tier_capacity >> 1)) @@ -255,7 +217,7 @@ int bch2_tiering_start(struct bch_fs *c) return 0; for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) { - if (!tier->devs.nr) + if (!dev_mask_nr(&tier->devs)) continue; if (have_faster_tier) { @@ -279,5 +241,6 @@ void bch2_fs_tiering_init(struct bch_fs *c) for (i = 0; i < ARRAY_SIZE(c->tiers); i++) { c->tiers[i].idx = i; bch2_pd_controller_init(&c->tiers[i].pd); + c->tiers[i].wp.group = &c->tiers[i].devs; } } diff --git a/libbcachefs/tier.h b/libbcachefs/tier.h index a4fd6225..f8eaa9b0 100644 --- a/libbcachefs/tier.h +++ b/libbcachefs/tier.h @@ -1,8 +1,8 @@ -#ifndef _BCACHE_TIER_H -#define _BCACHE_TIER_H +#ifndef _BCACHEFS_TIER_H +#define _BCACHEFS_TIER_H void bch2_tiering_stop(struct bch_fs *); int bch2_tiering_start(struct bch_fs *); void bch2_fs_tiering_init(struct bch_fs *); -#endif +#endif /* _BCACHEFS_TIER_H */ diff --git a/libbcachefs/util.c b/libbcachefs/util.c index 9a958543..2eb8ca72 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -98,44 +98,95 @@ ssize_t bch2_hprint(char *buf, s64 v) * to turn it into [-9, 9] */ if (v < 100 && v > -100) - snprintf(dec, sizeof(dec), ".%i", t / 103); + scnprintf(dec, sizeof(dec), ".%i", t / 103); return sprintf(buf, "%lli%s%c", v, dec, units[u]); } -ssize_t bch2_snprint_string_list(char *buf, size_t size, const char * const list[], - size_t selected) +ssize_t bch2_scnprint_string_list(char *buf, size_t size, + const char * const list[], + size_t selected) { char *out = buf; size_t i; - for (i = 0; list[i]; i++) - out += snprintf(out, buf + size - out, - i == selected ? "[%s] " : "%s ", list[i]); + if (size) + *out = '\0'; + + for (i = 0; list[i]; i++) + out += scnprintf(out, buf + size - out, + i == selected ? 
"[%s] " : "%s ", list[i]); + + if (out != buf) + *--out = '\0'; - out[-1] = '\n'; return out - buf; } ssize_t bch2_read_string_list(const char *buf, const char * const list[]) { - size_t i; - char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL); + size_t i, len; + + buf = skip_spaces(buf); + + len = strlen(buf); + while (len && isspace(buf[len - 1])) + --len; + + for (i = 0; list[i]; i++) + if (strlen(list[i]) == len && + !memcmp(buf, list[i], len)) + break; + + return list[i] ? i : -EINVAL; +} + +ssize_t bch2_scnprint_flag_list(char *buf, size_t size, + const char * const list[], u64 flags) +{ + char *out = buf, *end = buf + size; + unsigned bit, nr = 0; + + while (list[nr]) + nr++; + + if (size) + *out = '\0'; + + while (flags && (bit = __ffs(flags)) < nr) { + out += scnprintf(out, end - out, "%s,", list[bit]); + flags ^= 1 << bit; + } + + if (out != buf) + *--out = '\0'; + + return out - buf; +} + +u64 bch2_read_flag_list(char *opt, const char * const list[]) +{ + u64 ret = 0; + char *p, *s, *d = kstrndup(opt, PAGE_SIZE - 1, GFP_KERNEL); + if (!d) return -ENOMEM; s = strim(d); - for (i = 0; list[i]; i++) - if (!strcmp(list[i], s)) + while ((p = strsep(&s, ","))) { + int flag = bch2_read_string_list(p, list); + if (flag < 0) { + ret = -1; break; + } + + ret |= 1 << flag; + } kfree(d); - if (!list[i]) - return -EINVAL; - - return i; + return ret; } bool bch2_is_zero(const void *_p, size_t n) diff --git a/libbcachefs/util.h b/libbcachefs/util.h index 99ad359f..b91b2dc8 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -1,5 +1,5 @@ -#ifndef _BCACHE_UTIL_H -#define _BCACHE_UTIL_H +#ifndef _BCACHEFS_UTIL_H +#define _BCACHEFS_UTIL_H #include #include @@ -356,10 +356,12 @@ ssize_t bch2_hprint(char *buf, s64 v); bool bch2_is_zero(const void *, size_t); -ssize_t bch2_snprint_string_list(char *buf, size_t size, const char * const list[], - size_t selected); +ssize_t bch2_scnprint_string_list(char *, size_t, const char * const[], size_t); -ssize_t bch2_read_string_list(const char *buf, const char * const list[]); +ssize_t bch2_read_string_list(const char *, const char * const[]); + +ssize_t bch2_scnprint_flag_list(char *, size_t, const char * const[], u64); +u64 bch2_read_flag_list(char *, const char * const[]); struct time_stats { spinlock_t lock; @@ -787,4 +789,4 @@ void sort_cmp_size(void *base, size_t num, size_t size, int (*cmp_func)(const void *, const void *, size_t), void (*swap_func)(void *, void *, size_t)); -#endif /* _BCACHE_UTIL_H */ +#endif /* _BCACHEFS_UTIL_H */ diff --git a/libbcachefs/xattr.h b/libbcachefs/xattr.h index 9bc5376f..16310d89 100644 --- a/libbcachefs/xattr.h +++ b/libbcachefs/xattr.h @@ -1,5 +1,5 @@ -#ifndef _BCACHE_XATTR_H -#define _BCACHE_XATTR_H +#ifndef _BCACHEFS_XATTR_H +#define _BCACHEFS_XATTR_H #include "str_hash.h" @@ -20,4 +20,4 @@ ssize_t bch2_xattr_list(struct dentry *, char *, size_t); extern const struct xattr_handler *bch2_xattr_handlers[]; -#endif /* _BCACHE_XATTR_H */ +#endif /* _BCACHEFS_XATTR_H */