From 22291ae84a029d65334d1a90b67b5031f45cd540 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 22 Nov 2017 00:42:55 -0500 Subject: [PATCH] Update bcachefs sources to 9e7ae5219c bcachefs: Make write points more dynamic --- .bcachefs_revision | 2 +- cmd_migrate.c | 2 +- include/linux/jiffies.h | 9 +- include/linux/rculist.h | 3 - libbcachefs/alloc.c | 600 +++++++++++++++++----------- libbcachefs/alloc.h | 28 +- libbcachefs/alloc_types.h | 38 +- libbcachefs/bcachefs.h | 53 +-- libbcachefs/btree_cache.c | 253 ++++++------ libbcachefs/btree_cache.h | 15 +- libbcachefs/btree_gc.c | 20 +- libbcachefs/btree_io.c | 10 +- libbcachefs/btree_iter.c | 4 +- libbcachefs/btree_types.h | 36 ++ libbcachefs/btree_update_interior.c | 53 +-- libbcachefs/buckets.c | 7 +- libbcachefs/compress.c | 47 ++- libbcachefs/extents.c | 10 +- libbcachefs/fs-io.c | 18 +- libbcachefs/fs-io.h | 2 +- libbcachefs/io.c | 53 ++- libbcachefs/io.h | 18 +- libbcachefs/io_types.h | 5 +- libbcachefs/migrate.c | 5 +- libbcachefs/move.c | 12 +- libbcachefs/move.h | 17 +- libbcachefs/movinggc.c | 3 +- libbcachefs/super.c | 14 +- libbcachefs/sysfs.c | 8 +- libbcachefs/tier.c | 3 +- 30 files changed, 774 insertions(+), 574 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index 668fea75..04ebc308 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -661faf58dbcab87e512e64e7cb164905689e64c8 +192d759a491f50d92c89c2e842639d2307c815a5 diff --git a/cmd_migrate.c b/cmd_migrate.c index 519e85da..ec6c8314 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -265,7 +265,7 @@ static void write_data(struct bch_fs *c, if (ret) die("error reserving space in new filesystem: %s", strerror(-ret)); - bch2_write_op_init(&op, c, res, c->write_points, + bch2_write_op_init(&op, c, res, NULL, 0, POS(dst_inode->bi_inum, dst_offset >> 9), NULL, 0); closure_call(&op.cl, bch2_write, NULL, &cl); closure_sync(&cl); diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 00abaee1..e0dadcf0 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -70,7 +70,7 @@ extern int register_refined_jiffies(long clock_tick_rate); /* TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ #define TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) -static inline u64 local_clock(void) +static inline u64 sched_clock(void) { struct timespec ts; @@ -79,6 +79,11 @@ static inline u64 local_clock(void) return ((s64) ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec; } +static inline u64 local_clock(void) +{ + return sched_clock(); +} + extern unsigned long clock_t_to_jiffies(unsigned long x); extern u64 jiffies_64_to_clock_t(u64 x); extern u64 nsec_to_clock_t(u64 x); @@ -87,7 +92,7 @@ extern unsigned long nsecs_to_jiffies(u64 n); static inline u64 get_jiffies_64(void) { - return nsecs_to_jiffies64(local_clock()); + return nsecs_to_jiffies64(sched_clock()); } #define jiffies_64 get_jiffies_64() diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 8beb98dc..b6c61e12 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -1,8 +1,6 @@ #ifndef _LINUX_RCULIST_H #define _LINUX_RCULIST_H -#ifdef __KERNEL__ - /* * RCU-protected list version */ @@ -671,5 +669,4 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member)) -#endif /* __KERNEL__ */ #endif diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c index a1086576..dc7348fc 100644 --- a/libbcachefs/alloc.c +++ b/libbcachefs/alloc.c @@ -70,6 
+70,7 @@ #include #include #include +#include #include #include #include @@ -1118,6 +1119,7 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c, { enum bucket_alloc_ret ret = NO_DEVICES; struct dev_alloc_list devs_sorted; + u64 buckets_free; unsigned i; BUG_ON(nr_replicas > ARRAY_SIZE(ob->ptrs)); @@ -1127,46 +1129,55 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c, rcu_read_lock(); devs_sorted = bch2_wp_alloc_list(c, wp, devs); + spin_lock(&ob->lock); for (i = 0; i < devs_sorted.nr; i++) { struct bch_dev *ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); - long bucket; + struct open_bucket_ptr ptr; if (!ca) continue; - bucket = bch2_bucket_alloc(c, ca, reserve); - if (bucket < 0) { - ret = FREELIST_EMPTY; - continue; + if (wp->type == BCH_DATA_USER && + ca->open_buckets_partial_nr) { + ptr = ca->open_buckets_partial[--ca->open_buckets_partial_nr]; + } else { + long bucket = bch2_bucket_alloc(c, ca, reserve); + if (bucket < 0) { + ret = FREELIST_EMPTY; + continue; + } + + ptr = (struct open_bucket_ptr) { + .ptr.gen = ca->buckets[bucket].mark.gen, + .ptr.offset = bucket_to_sector(ca, bucket), + .ptr.dev = ca->dev_idx, + .sectors_free = ca->mi.bucket_size, + }; } - wp->next_alloc[ca->dev_idx] += - div64_u64(U64_MAX, dev_buckets_free(ca) * - ca->mi.bucket_size); - bch2_wp_rescale(c, ca, wp); - - __clear_bit(ca->dev_idx, devs->d); - /* * open_bucket_add_buckets expects new pointers at the head of * the list: */ - BUG_ON(ob->nr_ptrs >= BCH_REPLICAS_MAX); + BUG_ON(ob->nr_ptrs >= ARRAY_SIZE(ob->ptrs)); memmove(&ob->ptrs[1], &ob->ptrs[0], ob->nr_ptrs * sizeof(ob->ptrs[0])); - memmove(&ob->ptr_offset[1], - &ob->ptr_offset[0], - ob->nr_ptrs * sizeof(ob->ptr_offset[0])); ob->nr_ptrs++; - ob->ptrs[0] = (struct bch_extent_ptr) { - .gen = ca->buckets[bucket].mark.gen, - .offset = bucket_to_sector(ca, bucket), - .dev = ca->dev_idx, - }; - ob->ptr_offset[0] = 0; + ob->ptrs[0] = ptr; + + buckets_free = U64_MAX, dev_buckets_free(ca); + if (buckets_free) + wp->next_alloc[ca->dev_idx] += + div64_u64(U64_MAX, buckets_free * + ca->mi.bucket_size); + else + wp->next_alloc[ca->dev_idx] = U64_MAX; + bch2_wp_rescale(c, ca, wp); + + __clear_bit(ca->dev_idx, devs->d); if (ob->nr_ptrs == nr_replicas) { ret = ALLOC_SUCCESS; @@ -1175,6 +1186,7 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c, } EBUG_ON(ret != ALLOC_SUCCESS && reserve == RESERVE_MOVINGGC); + spin_unlock(&ob->lock); rcu_read_unlock(); return ret; } @@ -1242,24 +1254,45 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp, void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) { - const struct bch_extent_ptr *ptr; + const struct open_bucket_ptr *ptr; u8 new_ob; if (!atomic_dec_and_test(&ob->pin)) return; - spin_lock(&c->open_buckets_lock); + down_read(&c->alloc_gc_lock); + spin_lock(&ob->lock); + open_bucket_for_each_ptr(ob, ptr) { - struct bch_dev *ca = c->devs[ptr->dev]; + struct bch_dev *ca = c->devs[ptr->ptr.dev]; - bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), false); + if (ptr->sectors_free) { + /* + * This is a ptr to a bucket that still has free space, + * but we don't want to use it + */ + BUG_ON(ca->open_buckets_partial_nr >= + ARRAY_SIZE(ca->open_buckets_partial)); + + spin_lock(&ca->freelist_lock); + ca->open_buckets_partial[ca->open_buckets_partial_nr++] + = *ptr; + spin_unlock(&ca->freelist_lock); + } else { + bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), false); + } } - ob->nr_ptrs = 0; + + spin_unlock(&ob->lock); + 
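The hunk above is the heart of the new allocation policy: after each bucket allocation, wp->next_alloc[dev] is advanced by U64_MAX divided by the device's free space, so devices with less room fall back in the rotation. The standalone sketch below only models that weighting; none of these names come from the patch, and it assumes (since bch2_wp_alloc_list() is not shown in this hunk) that devices are tried in ascending next_alloc order.

/* Free-space-weighted device rotation, modelled in userspace. */
#include <stdint.h>
#include <stdio.h>

#define NR_DEVS 3

struct fake_dev {
	uint64_t free_sectors;		/* buckets_free * bucket_size */
	uint64_t next_alloc;		/* virtual "pass" value */
};

static unsigned pick_dev(struct fake_dev *devs, unsigned nr)
{
	unsigned i, best = 0;

	for (i = 1; i < nr; i++)
		if (devs[i].next_alloc < devs[best].next_alloc)
			best = i;

	/* Same update as in __bch2_bucket_alloc_set() above: */
	devs[best].next_alloc += devs[best].free_sectors
		? UINT64_MAX / devs[best].free_sectors
		: UINT64_MAX;
	return best;
}

int main(void)
{
	struct fake_dev devs[NR_DEVS] = {
		{ .free_sectors = 1000 },
		{ .free_sectors = 500 },
		{ .free_sectors = 250 },
	};
	unsigned hits[NR_DEVS] = { 0 }, i;

	for (i = 0; i < 7000; i++)
		hits[pick_dev(devs, NR_DEVS)]++;

	/* Expect roughly 4000/2000/1000: allocations track free space. */
	for (i = 0; i < NR_DEVS; i++)
		printf("dev %u: %u allocations\n", i, hits[i]);
	return 0;
}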
up_read(&c->alloc_gc_lock); + new_ob = ob->new_ob; ob->new_ob = 0; - list_move(&ob->list, &c->open_buckets_free); + spin_lock(&c->open_buckets_lock); + ob->freelist = c->open_buckets_freelist; + c->open_buckets_freelist = ob - c->open_buckets; c->open_buckets_nr_free++; spin_unlock(&c->open_buckets_lock); @@ -1270,22 +1303,19 @@ void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) } static struct open_bucket *bch2_open_bucket_get(struct bch_fs *c, - unsigned nr_reserved, - struct closure *cl) + unsigned nr_reserved, + struct closure *cl) { struct open_bucket *ret; spin_lock(&c->open_buckets_lock); if (c->open_buckets_nr_free > nr_reserved) { - BUG_ON(list_empty(&c->open_buckets_free)); - ret = list_first_entry(&c->open_buckets_free, - struct open_bucket, list); - list_move(&ret->list, &c->open_buckets_open); - BUG_ON(ret->nr_ptrs); + BUG_ON(!c->open_buckets_freelist); + ret = c->open_buckets + c->open_buckets_freelist; + c->open_buckets_freelist = ret->freelist; atomic_set(&ret->pin, 1); /* XXX */ - ret->has_full_ptrs = false; BUG_ON(ret->new_ob); BUG_ON(ret->nr_ptrs); @@ -1307,148 +1337,259 @@ static struct open_bucket *bch2_open_bucket_get(struct bch_fs *c, return ret; } -static unsigned ob_ptr_sectors_free(struct bch_fs *c, - struct open_bucket *ob, - struct bch_extent_ptr *ptr) -{ - struct bch_dev *ca = c->devs[ptr->dev]; - unsigned i = ptr - ob->ptrs; - unsigned used = bucket_remainder(ca, ptr->offset) + - ob->ptr_offset[i]; - - BUG_ON(used > ca->mi.bucket_size); - - return ca->mi.bucket_size - used; -} - static unsigned open_bucket_sectors_free(struct bch_fs *c, struct open_bucket *ob, unsigned nr_replicas) { - unsigned i, sectors_free = UINT_MAX; + unsigned sectors_free = UINT_MAX; + struct open_bucket_ptr *ptr; - for (i = 0; i < min(nr_replicas, ob->nr_ptrs); i++) - sectors_free = min(sectors_free, - ob_ptr_sectors_free(c, ob, &ob->ptrs[i])); + open_bucket_for_each_ptr(ob, ptr) + sectors_free = min(sectors_free, ptr->sectors_free); return sectors_free != UINT_MAX ? 
sectors_free : 0; } -static void open_bucket_copy_unused_ptrs(struct bch_fs *c, - struct open_bucket *new, - struct open_bucket *old) +static void open_bucket_move_ptrs(struct bch_fs *c, + struct open_bucket *dst, + struct open_bucket *src, + struct bch_devs_mask *devs, + unsigned nr_ptrs_dislike) { bool moved_ptr = false; int i; - for (i = old->nr_ptrs - 1; i >= 0; --i) - if (ob_ptr_sectors_free(c, old, &old->ptrs[i])) { - BUG_ON(new->nr_ptrs >= BCH_REPLICAS_MAX); + down_read(&c->alloc_gc_lock); - new->ptrs[new->nr_ptrs] = old->ptrs[i]; - new->ptr_offset[new->nr_ptrs] = old->ptr_offset[i]; - new->nr_ptrs++; + if (dst < src) { + spin_lock(&dst->lock); + spin_lock_nested(&src->lock, 1); + } else { + spin_lock(&src->lock); + spin_lock_nested(&dst->lock, 1); + } - old->nr_ptrs--; - memmove(&old->ptrs[i], - &old->ptrs[i + 1], - (old->nr_ptrs - i) * sizeof(old->ptrs[0])); - memmove(&old->ptr_offset[i], - &old->ptr_offset[i + 1], - (old->nr_ptrs - i) * sizeof(old->ptr_offset[0])); + for (i = src->nr_ptrs - 1; i >= 0; --i) { + if (!src->ptrs[i].sectors_free) { + /* + * Don't do anything: leave the ptr on the old + * open_bucket for gc to find + */ + } else if (nr_ptrs_dislike && + !test_bit(src->ptrs[i].ptr.dev, devs->d)) { + /* + * We don't want this pointer; bch2_open_bucket_put() + * will stick it on ca->open_buckets_partial to be + * reused + */ + --nr_ptrs_dislike; + } else { + BUG_ON(dst->nr_ptrs >= ARRAY_SIZE(dst->ptrs)); + + dst->ptrs[dst->nr_ptrs++] = src->ptrs[i]; + + src->nr_ptrs--; + memmove(&src->ptrs[i], + &src->ptrs[i + 1], + (src->nr_ptrs - i) * sizeof(src->ptrs[0])); moved_ptr = true; } + } if (moved_ptr) { - BUG_ON(old->new_ob); + BUG_ON(src->new_ob); - atomic_inc(&new->pin); - old->new_ob = new - c->open_buckets; + atomic_inc(&dst->pin); + src->new_ob = dst - c->open_buckets; } + + spin_unlock(&dst->lock); + spin_unlock(&src->lock); + up_read(&c->alloc_gc_lock); } static void verify_not_stale(struct bch_fs *c, const struct open_bucket *ob) { #ifdef CONFIG_BCACHEFS_DEBUG - const struct bch_extent_ptr *ptr; + const struct open_bucket_ptr *ptr; open_bucket_for_each_ptr(ob, ptr) { - struct bch_dev *ca = c->devs[ptr->dev]; + struct bch_dev *ca = c->devs[ptr->ptr.dev]; - BUG_ON(ptr_stale(ca, ptr)); + BUG_ON(ptr_stale(ca, &ptr->ptr)); } #endif } /* Sector allocator */ -static struct open_bucket *lock_writepoint(struct bch_fs *c, - struct write_point *wp) -{ - struct open_bucket *ob; - - while ((ob = ACCESS_ONCE(wp->b))) { - mutex_lock(&ob->lock); - if (wp->b == ob) - break; - - mutex_unlock(&ob->lock); - } - - return ob; -} - static int open_bucket_add_buckets(struct bch_fs *c, struct write_point *wp, + struct bch_devs_mask *_devs, struct open_bucket *ob, unsigned nr_replicas, - unsigned nr_replicas_required, enum alloc_reserve reserve, struct closure *cl) { struct bch_devs_mask devs = c->rw_devs[wp->type]; - unsigned i; - int ret; + struct open_bucket_ptr *ptr; if (ob->nr_ptrs >= nr_replicas) return 0; + if (_devs) + bitmap_and(devs.d, devs.d, _devs->d, BCH_SB_MEMBERS_MAX); + /* Don't allocate from devices we already have pointers to: */ - for (i = 0; i < ob->nr_ptrs; i++) - __clear_bit(ob->ptrs[i].dev, devs.d); + open_bucket_for_each_ptr(ob, ptr) + if (ptr->sectors_free) + __clear_bit(ptr->ptr.dev, devs.d); - if (wp->group) - bitmap_and(devs.d, devs.d, wp->group->d, BCH_SB_MEMBERS_MAX); + return bch2_bucket_alloc_set(c, wp, ob, nr_replicas, + reserve, &devs, cl); +} - ret = bch2_bucket_alloc_set(c, wp, ob, nr_replicas, - reserve, &devs, cl); +static struct write_point 
*__writepoint_find(struct hlist_head *head, + unsigned long write_point) +{ + struct write_point *wp; - if (ret == -EROFS && - ob->nr_ptrs >= nr_replicas_required) - ret = 0; + hlist_for_each_entry_rcu(wp, head, node) { + if (wp->write_point == write_point) + continue; - return ret; + mutex_lock(&wp->lock); + if (wp->write_point == write_point) + return wp; + mutex_unlock(&wp->lock); + } + + return NULL; +} + +static struct hlist_head *writepoint_hash(struct bch_fs *c, + unsigned long write_point) +{ + unsigned hash = + hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash))); + + return &c->write_points_hash[hash]; +} + +static struct write_point *writepoint_find(struct bch_fs *c, + enum bch_data_type data_type, + unsigned long write_point) +{ + struct write_point *wp, *oldest = NULL; + struct hlist_head *head; + + switch (data_type) { + case BCH_DATA_BTREE: + wp = &c->btree_write_point; + mutex_lock(&wp->lock); + return wp; + case BCH_DATA_USER: + break; + default: + BUG(); + } + + head = writepoint_hash(c, write_point); + wp = __writepoint_find(head, write_point); + if (wp) + goto out; + + mutex_lock(&c->write_points_hash_lock); + wp = __writepoint_find(head, write_point); + if (wp) + goto out_unlock; + + for (wp = c->write_points; + wp < c->write_points + ARRAY_SIZE(c->write_points); + wp++) + if (!oldest || time_before64(wp->last_used, oldest->last_used)) + oldest = wp; + + wp = oldest; + BUG_ON(!wp); + + mutex_lock(&wp->lock); + hlist_del_rcu(&wp->node); + wp->write_point = write_point; + hlist_add_head_rcu(&wp->node, head); +out_unlock: + mutex_unlock(&c->write_points_hash_lock); +out: + wp->last_used = sched_clock(); + return wp; } /* * Get us an open_bucket we can allocate from, return with it locked: */ -struct open_bucket *bch2_alloc_sectors_start(struct bch_fs *c, - struct write_point *wp, +struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, + enum bch_data_type data_type, + struct bch_devs_mask *devs, + unsigned long write_point, unsigned nr_replicas, unsigned nr_replicas_required, enum alloc_reserve reserve, + unsigned flags, struct closure *cl) { struct open_bucket *ob; - unsigned open_buckets_reserved = wp == &c->btree_write_point + struct write_point *wp; + struct open_bucket_ptr *ptr; + unsigned open_buckets_reserved = data_type == BCH_DATA_BTREE ? 0 : BTREE_NODE_RESERVE; + unsigned nr_ptrs_empty = 0, nr_ptrs_dislike = 0; int ret; BUG_ON(!nr_replicas); -retry: - ob = lock_writepoint(c, wp); + + wp = writepoint_find(c, data_type, write_point); + BUG_ON(wp->type != data_type); + + wp->last_used = sched_clock(); + + ob = wp->ob; + + /* does ob have ptrs we don't need? */ + open_bucket_for_each_ptr(ob, ptr) { + if (!ptr->sectors_free) + nr_ptrs_empty++; + else if (devs && !test_bit(ptr->ptr.dev, devs->d)) + nr_ptrs_dislike++; + } + + ret = open_bucket_add_buckets(c, wp, devs, ob, + nr_replicas + nr_ptrs_empty + nr_ptrs_dislike, + reserve, cl); + if (ret && ret != -EROFS) + goto err; + + if (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) + goto alloc_done; + + /* + * XXX: + * Should this allocation be _forced_ to used the specified device (e.g. + * internal migration), or should we fall back to allocating from all + * devices? + */ + ret = open_bucket_add_buckets(c, wp, NULL, ob, + nr_replicas + nr_ptrs_empty, + reserve, cl); + if (ret && ret != -EROFS) + goto err; +alloc_done: + if (ob->nr_ptrs - nr_ptrs_empty - + ((flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) ? 
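writepoint_find() above is what makes write points dynamic: a write carries an opaque unsigned long cookie (an inode number, a task pointer, ...), the cookie is hashed into write_points_hash, and on a miss the least recently used slot in the fixed write_points[] array is re-keyed to the new cookie. The sketch below is only an illustrative userspace model of that binding and LRU steal; it scans a flat array instead of per-bucket RCU hash lists, omits all locking, and uses invented names throughout.

#include <stdint.h>
#include <stdio.h>

#define WRITE_POINT_COUNT	32

struct model_write_point {
	unsigned long	write_point;	/* cookie this slot is bound to */
	uint64_t	last_used;	/* for LRU stealing on a miss */
};

static struct model_write_point write_points[WRITE_POINT_COUNT];
static uint64_t now;

static struct model_write_point *writepoint_lookup(unsigned long cookie)
{
	struct model_write_point *wp = NULL, *oldest = NULL;
	unsigned i;

	for (i = 0; i < WRITE_POINT_COUNT; i++) {
		if (write_points[i].write_point == cookie)
			wp = &write_points[i];
		if (!oldest || write_points[i].last_used < oldest->last_used)
			oldest = &write_points[i];
	}

	if (!wp) {
		/* Miss: steal the least recently used slot and re-key it. */
		oldest->write_point = cookie;
		wp = oldest;
	}

	wp->last_used = ++now;
	return wp;
}

int main(void)
{
	struct model_write_point *a = writepoint_lookup(0x1000);
	struct model_write_point *b = writepoint_lookup(0x2000);

	/* Repeated writes with the same cookie land on the same slot. */
	printf("stable: %d, distinct: %d\n",
	       a == writepoint_lookup(0x1000), a != b);
	return 0;
}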
nr_ptrs_dislike : 0) + < nr_replicas_required) { + ret = -EROFS; + goto err; + } /* * If ob->sectors_free == 0, one or more of the buckets ob points to is @@ -1456,53 +1597,34 @@ retry: * still needs to find them; instead, we must allocate a new open bucket * and copy any pointers to non-full buckets into the new open bucket. */ - if (!ob || ob->has_full_ptrs) { - struct open_bucket *new_ob; + BUG_ON(ob->nr_ptrs - nr_ptrs_empty - nr_replicas > nr_ptrs_dislike); + nr_ptrs_dislike = ob->nr_ptrs - nr_ptrs_empty - nr_replicas; - new_ob = bch2_open_bucket_get(c, open_buckets_reserved, cl); - if (IS_ERR(new_ob)) - return new_ob; - - mutex_lock(&new_ob->lock); - - /* - * We point the write point at the open_bucket before doing the - * allocation to avoid a race with shutdown: - */ - if (race_fault() || - cmpxchg(&wp->b, ob, new_ob) != ob) { - /* We raced: */ - mutex_unlock(&new_ob->lock); - bch2_open_bucket_put(c, new_ob); - - if (ob) - mutex_unlock(&ob->lock); - goto retry; + if (nr_ptrs_empty || nr_ptrs_dislike) { + ob = bch2_open_bucket_get(c, open_buckets_reserved, cl); + if (IS_ERR(ob)) { + ret = PTR_ERR(ob); + goto err; } - if (ob) { - open_bucket_copy_unused_ptrs(c, new_ob, ob); - mutex_unlock(&ob->lock); - bch2_open_bucket_put(c, ob); - } + /* Remove pointers we don't want to use: */ - ob = new_ob; + open_bucket_move_ptrs(c, ob, wp->ob, devs, nr_ptrs_dislike); + bch2_open_bucket_put(c, wp->ob); + wp->ob = ob; } - ret = open_bucket_add_buckets(c, wp, ob, nr_replicas, - nr_replicas_required, - reserve, cl); - if (ret) { - mutex_unlock(&ob->lock); - return ERR_PTR(ret); - } + BUG_ON(ob->nr_ptrs < nr_replicas_required); - ob->sectors_free = open_bucket_sectors_free(c, ob, nr_replicas); + wp->sectors_free = open_bucket_sectors_free(c, ob, nr_replicas); - BUG_ON(!ob->sectors_free); + BUG_ON(!wp->sectors_free); verify_not_stale(c, ob); - return ob; + return wp; +err: + mutex_unlock(&wp->lock); + return ERR_PTR(ret); } /* @@ -1514,29 +1636,26 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e, unsigned sectors) { struct bch_extent_ptr tmp; - bool has_data = false; - unsigned i; + struct open_bucket_ptr *ptr; /* * We're keeping any existing pointer k has, and appending new pointers: * __bch2_write() will only write to the pointers we add here: */ - BUG_ON(sectors > ob->sectors_free); + for (ptr = ob->ptrs; + ptr < ob->ptrs + min_t(u8, ob->nr_ptrs, nr_replicas); ptr++) { + struct bch_dev *ca = c->devs[ptr->ptr.dev]; - /* didn't use all the ptrs: */ - if (nr_replicas < ob->nr_ptrs) - has_data = true; + EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ptr->ptr.dev)); - for (i = 0; i < min(ob->nr_ptrs, nr_replicas); i++) { - EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev)); - - tmp = ob->ptrs[i]; + tmp = ptr->ptr; tmp.cached = bkey_extent_is_cached(&e->k); - tmp.offset += ob->ptr_offset[i]; + tmp.offset += ca->mi.bucket_size - ptr->sectors_free; extent_ptr_append(e, tmp); - ob->ptr_offset[i] += sectors; + BUG_ON(sectors > ptr->sectors_free); + ptr->sectors_free -= sectors; } } @@ -1544,25 +1663,27 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e, * Append pointers to the space we just allocated to @k, and mark @sectors space * as allocated out of @ob */ -void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp, - struct open_bucket *ob) +void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) { - bool has_data = false; - unsigned i; + struct open_bucket *ob = wp->ob, *new_ob = NULL; + struct 
open_bucket_ptr *ptr; + bool empty = false; - for (i = 0; i < ob->nr_ptrs; i++) { - if (!ob_ptr_sectors_free(c, ob, &ob->ptrs[i])) - ob->has_full_ptrs = true; - else - has_data = true; + open_bucket_for_each_ptr(ob, ptr) + empty |= !ptr->sectors_free; + + if (empty) + new_ob = bch2_open_bucket_get(c, 0, NULL); + + if (!IS_ERR_OR_NULL(new_ob)) { + /* writepoint's ref becomes our ref: */ + wp->ob = new_ob; + open_bucket_move_ptrs(c, new_ob, ob, 0, 0); + } else { + atomic_inc(&ob->pin); } - if (likely(has_data)) - atomic_inc(&ob->pin); - else - BUG_ON(xchg(&wp->b, NULL) != ob); - - mutex_unlock(&ob->lock); + mutex_unlock(&wp->lock); } /* @@ -1583,27 +1704,33 @@ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp, * @cl - closure to wait for a bucket */ struct open_bucket *bch2_alloc_sectors(struct bch_fs *c, - struct write_point *wp, + enum bch_data_type data_type, + struct bch_devs_mask *devs, + unsigned long write_point, struct bkey_i_extent *e, unsigned nr_replicas, unsigned nr_replicas_required, enum alloc_reserve reserve, + unsigned flags, struct closure *cl) { + struct write_point *wp; struct open_bucket *ob; - ob = bch2_alloc_sectors_start(c, wp, nr_replicas, - nr_replicas_required, - reserve, cl); - if (IS_ERR_OR_NULL(ob)) - return ob; + wp = bch2_alloc_sectors_start(c, data_type, devs, write_point, + nr_replicas, nr_replicas_required, + reserve, flags, cl); + if (IS_ERR_OR_NULL(wp)) + return ERR_CAST(wp); - if (e->k.size > ob->sectors_free) - bch2_key_resize(&e->k, ob->sectors_free); + ob = wp->ob; + + if (e->k.size > wp->sectors_free) + bch2_key_resize(&e->k, wp->sectors_free); bch2_alloc_sectors_append_ptrs(c, e, nr_replicas, ob, e->k.size); - bch2_alloc_sectors_done(c, wp, ob); + bch2_alloc_sectors_done(c, wp); return ob; } @@ -1640,8 +1767,7 @@ void bch2_recalc_capacity(struct bch_fs *c) } c->fastest_tier = fastest_tier != slowest_tier ? fastest_tier : NULL; - - c->promote_write_point.group = &fastest_tier->devs; + c->fastest_devs = fastest_tier != slowest_tier ? 
&fastest_tier->devs : NULL; if (!fastest_tier) goto set_capacity; @@ -1713,49 +1839,61 @@ set_capacity: closure_wake_up(&c->freelist_wait); } +static bool open_bucket_has_device(struct open_bucket *ob, + struct bch_dev *ca) +{ + struct open_bucket_ptr *ptr; + bool ret = false; + + spin_lock(&ob->lock); + open_bucket_for_each_ptr(ob, ptr) + ret |= ptr->ptr.dev == ca->dev_idx; + spin_unlock(&ob->lock); + + return ret; +} + static void bch2_stop_write_point(struct bch_fs *c, struct bch_dev *ca, struct write_point *wp) { struct open_bucket *ob; - struct bch_extent_ptr *ptr; + struct closure cl; - ob = lock_writepoint(c, wp); - if (!ob) + closure_init_stack(&cl); +retry: + mutex_lock(&wp->lock); + if (!open_bucket_has_device(wp->ob, ca)) { + mutex_unlock(&wp->lock); return; + } - for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++) - if (ptr->dev == ca->dev_idx) - goto found; + ob = bch2_open_bucket_get(c, 0, &cl); + if (IS_ERR(ob)) { + mutex_unlock(&wp->lock); + closure_sync(&cl); + goto retry; - mutex_unlock(&ob->lock); - return; -found: - BUG_ON(xchg(&wp->b, NULL) != ob); - mutex_unlock(&ob->lock); + } - /* Drop writepoint's ref: */ - bch2_open_bucket_put(c, ob); + open_bucket_move_ptrs(c, ob, wp->ob, &ca->self, ob->nr_ptrs); + bch2_open_bucket_put(c, wp->ob); + wp->ob = ob; + + mutex_unlock(&wp->lock); } static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) { - struct bch_extent_ptr *ptr; struct open_bucket *ob; + bool ret = false; for (ob = c->open_buckets; ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) - if (atomic_read(&ob->pin)) { - mutex_lock(&ob->lock); - for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++) - if (ptr->dev == ca->dev_idx) { - mutex_unlock(&ob->lock); - return true; - } - mutex_unlock(&ob->lock); - } + if (atomic_read(&ob->pin)) + ret |= open_bucket_has_device(ob, ca); - return false; + return ret; } /* device goes ro: */ @@ -1782,11 +1920,6 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) /* Next, close write points that point to this device... 
*/ for (i = 0; i < ARRAY_SIZE(c->write_points); i++) bch2_stop_write_point(c, ca, &c->write_points[i]); - - bch2_stop_write_point(c, ca, &ca->copygc_write_point); - bch2_stop_write_point(c, ca, &c->promote_write_point); - bch2_stop_write_point(c, ca, &c->tiers[ca->mi.tier].wp); - bch2_stop_write_point(c, ca, &c->migration_write_point); bch2_stop_write_point(c, ca, &c->btree_write_point); mutex_lock(&c->btree_reserve_cache_lock); @@ -1880,35 +2013,44 @@ int bch2_dev_allocator_start(struct bch_dev *ca) void bch2_fs_allocator_init(struct bch_fs *c) { - unsigned i; + struct open_bucket *ob; + struct write_point *wp; - INIT_LIST_HEAD(&c->open_buckets_open); - INIT_LIST_HEAD(&c->open_buckets_free); + mutex_init(&c->write_points_hash_lock); + init_rwsem(&c->alloc_gc_lock); spin_lock_init(&c->open_buckets_lock); bch2_prio_timer_init(c, READ); bch2_prio_timer_init(c, WRITE); /* open bucket 0 is a sentinal NULL: */ - mutex_init(&c->open_buckets[0].lock); - INIT_LIST_HEAD(&c->open_buckets[0].list); + spin_lock_init(&c->open_buckets[0].lock); - for (i = 1; i < ARRAY_SIZE(c->open_buckets); i++) { - mutex_init(&c->open_buckets[i].lock); + for (ob = c->open_buckets + 1; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { + spin_lock_init(&ob->lock); c->open_buckets_nr_free++; - list_add(&c->open_buckets[i].list, &c->open_buckets_free); + + ob->freelist = c->open_buckets_freelist; + c->open_buckets_freelist = ob - c->open_buckets; } - c->journal.wp.type = BCH_DATA_JOURNAL; + mutex_init(&c->btree_write_point.lock); c->btree_write_point.type = BCH_DATA_BTREE; + c->btree_write_point.ob = bch2_open_bucket_get(c, 0, NULL); + BUG_ON(IS_ERR(c->btree_write_point.ob)); - for (i = 0; i < ARRAY_SIZE(c->tiers); i++) - c->tiers[i].wp.type = BCH_DATA_USER; + for (wp = c->write_points; + wp < c->write_points + ARRAY_SIZE(c->write_points); wp++) { + mutex_init(&wp->lock); + wp->type = BCH_DATA_USER; + wp->ob = bch2_open_bucket_get(c, 0, NULL); + wp->last_used = sched_clock(); - for (i = 0; i < ARRAY_SIZE(c->write_points); i++) - c->write_points[i].type = BCH_DATA_USER; + wp->write_point = (unsigned long) wp; + hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); - c->promote_write_point.type = BCH_DATA_USER; - c->migration_write_point.type = BCH_DATA_USER; + BUG_ON(IS_ERR(wp->ob)); + } c->pd_controllers_update_seconds = 5; INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h index f07f1bfc..1ea747d2 100644 --- a/libbcachefs/alloc.h +++ b/libbcachefs/alloc.h @@ -28,20 +28,28 @@ long bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve); void bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); -struct open_bucket *bch2_alloc_sectors_start(struct bch_fs *, - struct write_point *, - unsigned, unsigned, - enum alloc_reserve, - struct closure *); +struct write_point *bch2_alloc_sectors_start(struct bch_fs *, + enum bch_data_type, + struct bch_devs_mask *, + unsigned long, + unsigned, unsigned, + enum alloc_reserve, + unsigned, + struct closure *); void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct bkey_i_extent *, unsigned, struct open_bucket *, unsigned); -void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *, - struct open_bucket *); +void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); -struct open_bucket *bch2_alloc_sectors(struct bch_fs *, struct write_point *, - struct bkey_i_extent *, unsigned, unsigned, - enum alloc_reserve, struct closure *); +struct 
open_bucket *bch2_alloc_sectors(struct bch_fs *, + enum bch_data_type, + struct bch_devs_mask *, + unsigned long, + struct bkey_i_extent *, + unsigned, unsigned, + enum alloc_reserve, + unsigned, + struct closure *); static inline void bch2_wake_allocator(struct bch_dev *ca) { diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h index bee6d28d..c48d0aaa 100644 --- a/libbcachefs/alloc_types.h +++ b/libbcachefs/alloc_types.h @@ -2,6 +2,7 @@ #define _BCACHEFS_ALLOC_TYPES_H #include +#include #include "clock_types.h" @@ -44,39 +45,34 @@ enum alloc_reserve { /* Enough for 16 cache devices, 2 tiers and some left over for pipelining */ #define OPEN_BUCKETS_COUNT 256 +#define WRITE_POINT_COUNT 32 -#define WRITE_POINT_COUNT 16 +struct open_bucket_ptr { + struct bch_extent_ptr ptr; + unsigned sectors_free; +}; struct open_bucket { - struct list_head list; - struct mutex lock; + spinlock_t lock; atomic_t pin; - bool has_full_ptrs; + u8 freelist; u8 new_ob; + u8 nr_ptrs; - /* - * recalculated every time we allocate from this open_bucket based on - * how many pointers we're actually going to use: - */ - unsigned sectors_free; - unsigned nr_ptrs; - struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX]; - unsigned ptr_offset[BCH_REPLICAS_MAX]; + struct open_bucket_ptr ptrs[BCH_REPLICAS_MAX * 2]; }; struct write_point { - struct open_bucket *b; + struct hlist_node node; + struct mutex lock; + u64 last_used; + unsigned long write_point; enum bch_data_type type; - /* - * If not NULL, cache group for tiering, promotion and moving GC - - * always allocates a single replica - * - * Otherwise do a normal replicated bucket allocation that could come - * from any device in tier 0 (foreground write) - */ - struct bch_devs_mask *group; + /* calculated based on how many pointers we're actually going to use: */ + unsigned sectors_free; + struct open_bucket *ob; u64 next_alloc[BCH_SB_MEMBERS_MAX]; }; diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 1828bfdf..58d4723e 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -392,6 +392,9 @@ struct bch_dev { unsigned nr_invalidated; bool alloc_thread_started; + struct open_bucket_ptr open_buckets_partial[BCH_REPLICAS_MAX * WRITE_POINT_COUNT]; + unsigned open_buckets_partial_nr; + size_t fifo_last_bucket; /* Allocation stuff: */ @@ -426,8 +429,6 @@ struct bch_dev { struct bch_pd_controller moving_gc_pd; - struct write_point copygc_write_point; - struct journal_device journal; struct work_struct io_error_work; @@ -472,7 +473,6 @@ struct bch_tier { struct bch_pd_controller pd; struct bch_devs_mask devs; - struct write_point wp; }; enum bch_fs_state { @@ -546,40 +546,7 @@ struct bch_fs { struct btree_root btree_roots[BTREE_ID_NR]; struct mutex btree_root_lock; - bool btree_cache_table_init_done; - struct rhashtable btree_cache_table; - - /* - * We never free a struct btree, except on shutdown - we just put it on - * the btree_cache_freed list and reuse it later. This simplifies the - * code, and it doesn't cost us much memory as the memory usage is - * dominated by buffers that hold the actual btree node data and those - * can be freed - and the number of struct btrees allocated is - * effectively bounded. - * - * btree_cache_freeable effectively is a small cache - we use it because - * high order page allocations can be rather expensive, and it's quite - * common to delete and allocate btree nodes in quick succession. It - * should never grow past ~2-3 nodes in practice. 
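The reworked struct open_bucket above drops the list_head/mutex pair for a spinlock plus a u8 freelist index; bch_fs then only needs a single u8 head index, with open bucket 0 reserved as a NULL sentinel (see the allocator init later in this patch). A minimal userspace model of that index-based freelist, with made-up names, looks like this:

#include <assert.h>
#include <stdint.h>

#define OPEN_BUCKETS_COUNT	256

struct model_open_bucket {
	uint8_t	freelist;	/* index of next free open_bucket, 0 == none */
	uint8_t	nr_ptrs;
};

static struct model_open_bucket open_buckets[OPEN_BUCKETS_COUNT];
static uint8_t open_buckets_freelist;	/* head index, 0 == list empty */
static uint8_t open_buckets_nr_free;

static void ob_push(struct model_open_bucket *ob)
{
	ob->freelist = open_buckets_freelist;
	open_buckets_freelist = ob - open_buckets;
	open_buckets_nr_free++;
}

static struct model_open_bucket *ob_pop(void)
{
	struct model_open_bucket *ob;

	if (!open_buckets_freelist)
		return NULL;	/* caller would wait on open_buckets_wait */

	ob = open_buckets + open_buckets_freelist;
	open_buckets_freelist = ob->freelist;
	open_buckets_nr_free--;
	return ob;
}

int main(void)
{
	unsigned i;

	/* Bucket 0 stays off the freelist, as in bch2_fs_allocator_init(). */
	for (i = 1; i < OPEN_BUCKETS_COUNT; i++)
		ob_push(&open_buckets[i]);

	assert(ob_pop() == &open_buckets[OPEN_BUCKETS_COUNT - 1]);
	return 0;
}

Storing byte-sized indices instead of pointers keeps struct open_bucket small and lets the freelist head live in a single u8 under open_buckets_lock.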
- */ - struct mutex btree_cache_lock; - struct list_head btree_cache; - struct list_head btree_cache_freeable; - struct list_head btree_cache_freed; - - /* Number of elements in btree_cache + btree_cache_freeable lists */ - unsigned btree_cache_used; - unsigned btree_cache_reserve; - struct shrinker btree_cache_shrink; - - /* - * If we need to allocate memory for a new btree node and that - * allocation fails, we can cannibalize another node in the btree cache - * to satisfy the allocation - lock to guarantee only one thread does - * this at a time: - */ - struct closure_waitlist mca_wait; - struct task_struct *btree_cache_alloc_lock; + struct btree_cache btree_cache; mempool_t btree_reserve_pool; @@ -606,6 +573,7 @@ struct bch_fs { struct workqueue_struct *copygc_wq; /* ALLOCATION */ + struct rw_semaphore alloc_gc_lock; struct bch_pd_controller foreground_write_pd; struct delayed_work pd_controllers_update; unsigned pd_controllers_update_seconds; @@ -622,6 +590,7 @@ struct bch_fs { struct bch_devs_mask rw_devs[BCH_DATA_NR]; struct bch_tier tiers[BCH_TIER_MAX]; /* NULL if we only have devices in one tier: */ + struct bch_devs_mask *fastest_devs; struct bch_tier *fastest_tier; u64 capacity; /* sectors */ @@ -654,17 +623,17 @@ struct bch_fs { struct io_clock io_clock[2]; /* SECTOR ALLOCATOR */ - struct list_head open_buckets_open; - struct list_head open_buckets_free; - unsigned open_buckets_nr_free; - struct closure_waitlist open_buckets_wait; spinlock_t open_buckets_lock; + u8 open_buckets_freelist; + u8 open_buckets_nr_free; + struct closure_waitlist open_buckets_wait; struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; struct write_point btree_write_point; struct write_point write_points[WRITE_POINT_COUNT]; - struct write_point promote_write_point; + struct hlist_head write_points_hash[WRITE_POINT_COUNT]; + struct mutex write_points_hash_lock; /* * This write point is used for migrating data off a device diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 4147545d..22846d8a 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -31,13 +31,15 @@ void bch2_recalc_btree_reserve(struct bch_fs *c) reserve += min_t(unsigned, 1, c->btree_roots[i].b->level) * 8; - c->btree_cache_reserve = reserve; + c->btree_cache.reserve = reserve; } -#define mca_can_free(c) \ - max_t(int, 0, c->btree_cache_used - c->btree_cache_reserve) +static inline unsigned btree_cache_can_free(struct btree_cache *bc) +{ + return max_t(int, 0, bc->used - bc->reserve); +} -static void __mca_data_free(struct bch_fs *c, struct btree *b) +static void __btree_node_data_free(struct bch_fs *c, struct btree *b) { EBUG_ON(btree_node_write_in_flight(b)); @@ -46,11 +48,13 @@ static void __mca_data_free(struct bch_fs *c, struct btree *b) bch2_btree_keys_free(b); } -static void mca_data_free(struct bch_fs *c, struct btree *b) +static void btree_node_data_free(struct bch_fs *c, struct btree *b) { - __mca_data_free(c, b); - c->btree_cache_used--; - list_move(&b->list, &c->btree_cache_freed); + struct btree_cache *bc = &c->btree_cache; + + __btree_node_data_free(c, b); + bc->used--; + list_move(&b->list, &bc->freed); } static const struct rhashtable_params bch_btree_cache_params = { @@ -59,8 +63,10 @@ static const struct rhashtable_params bch_btree_cache_params = { .key_len = sizeof(struct bch_extent_ptr), }; -static void mca_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) +static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) { + struct btree_cache *bc = 
&c->btree_cache; + b->data = kvpmalloc(btree_bytes(c), gfp); if (!b->data) goto err; @@ -68,16 +74,16 @@ static void mca_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp)) goto err; - c->btree_cache_used++; - list_move(&b->list, &c->btree_cache_freeable); + bc->used++; + list_move(&b->list, &bc->freeable); return; err: kvpfree(b->data, btree_bytes(c)); b->data = NULL; - list_move(&b->list, &c->btree_cache_freed); + list_move(&b->list, &bc->freed); } -static struct btree *mca_bucket_alloc(struct bch_fs *c, gfp_t gfp) +static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) { struct btree *b = kzalloc(sizeof(struct btree), gfp); if (!b) @@ -88,49 +94,48 @@ static struct btree *mca_bucket_alloc(struct bch_fs *c, gfp_t gfp) INIT_LIST_HEAD(&b->list); INIT_LIST_HEAD(&b->write_blocked); - mca_data_alloc(c, b, gfp); + btree_node_data_alloc(c, b, gfp); return b->data ? b : NULL; } /* Btree in memory cache - hash table */ -void bch2_btree_node_hash_remove(struct bch_fs *c, struct btree *b) +void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) { - rhashtable_remove_fast(&c->btree_cache_table, &b->hash, - bch_btree_cache_params); + rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); /* Cause future lookups for this node to fail: */ bkey_i_to_extent(&b->key)->v._data[0] = 0; } -int __bch2_btree_node_hash_insert(struct bch_fs *c, struct btree *b) +int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) { - return rhashtable_lookup_insert_fast(&c->btree_cache_table, &b->hash, + return rhashtable_lookup_insert_fast(&bc->table, &b->hash, bch_btree_cache_params); } -int bch2_btree_node_hash_insert(struct bch_fs *c, struct btree *b, - unsigned level, enum btree_id id) +int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, + unsigned level, enum btree_id id) { int ret; b->level = level; b->btree_id = id; - mutex_lock(&c->btree_cache_lock); - ret = __bch2_btree_node_hash_insert(c, b); + mutex_lock(&bc->lock); + ret = __bch2_btree_node_hash_insert(bc, b); if (!ret) - list_add(&b->list, &c->btree_cache); - mutex_unlock(&c->btree_cache_lock); + list_add(&b->list, &bc->live); + mutex_unlock(&bc->lock); return ret; } __flatten -static inline struct btree *mca_find(struct bch_fs *c, +static inline struct btree *btree_cache_find(struct btree_cache *bc, const struct bkey_i *k) { - return rhashtable_lookup_fast(&c->btree_cache_table, &PTR_HASH(k), + return rhashtable_lookup_fast(&bc->table, &PTR_HASH(k), bch_btree_cache_params); } @@ -140,9 +145,10 @@ static inline struct btree *mca_find(struct bch_fs *c, */ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) { + struct btree_cache *bc = &c->btree_cache; int ret = 0; - lockdep_assert_held(&c->btree_cache_lock); + lockdep_assert_held(&bc->lock); if (!six_trylock_intent(&b->lock)) return -ENOMEM; @@ -201,11 +207,12 @@ static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) return __btree_node_reclaim(c, b, true); } -static unsigned long bch2_mca_scan(struct shrinker *shrink, - struct shrink_control *sc) +static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) { struct bch_fs *c = container_of(shrink, struct bch_fs, - btree_cache_shrink); + btree_cache.shrink); + struct btree_cache *bc = &c->btree_cache; struct btree *b, *t; unsigned long nr = sc->nr_to_scan; unsigned long can_free; @@ -218,8 +225,8 @@ static unsigned long 
bch2_mca_scan(struct shrinker *shrink, /* Return -1 if we can't do anything right now */ if (sc->gfp_mask & __GFP_IO) - mutex_lock(&c->btree_cache_lock); - else if (!mutex_trylock(&c->btree_cache_lock)) + mutex_lock(&bc->lock); + else if (!mutex_trylock(&bc->lock)) return -1; /* @@ -230,11 +237,11 @@ static unsigned long bch2_mca_scan(struct shrinker *shrink, * IO can always make forward progress: */ nr /= btree_pages(c); - can_free = mca_can_free(c); + can_free = btree_cache_can_free(bc); nr = min_t(unsigned long, nr, can_free); i = 0; - list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { + list_for_each_entry_safe(b, t, &bc->freeable, list) { touched++; if (freed >= nr) @@ -242,34 +249,34 @@ static unsigned long bch2_mca_scan(struct shrinker *shrink, if (++i > 3 && !btree_node_reclaim(c, b)) { - mca_data_free(c, b); + btree_node_data_free(c, b); six_unlock_write(&b->lock); six_unlock_intent(&b->lock); freed++; } } restart: - list_for_each_entry_safe(b, t, &c->btree_cache, list) { + list_for_each_entry_safe(b, t, &bc->live, list) { touched++; if (freed >= nr) { /* Save position */ - if (&t->list != &c->btree_cache) - list_move_tail(&c->btree_cache, &t->list); + if (&t->list != &bc->live) + list_move_tail(&bc->live, &t->list); break; } if (!btree_node_accessed(b) && !btree_node_reclaim(c, b)) { - /* can't call bch2_btree_node_hash_remove under btree_cache_lock */ + /* can't call bch2_btree_node_hash_remove under lock */ freed++; - if (&t->list != &c->btree_cache) - list_move_tail(&c->btree_cache, &t->list); + if (&t->list != &bc->live) + list_move_tail(&bc->live, &t->list); - mca_data_free(c, b); - mutex_unlock(&c->btree_cache_lock); + btree_node_data_free(c, b); + mutex_unlock(&bc->lock); - bch2_btree_node_hash_remove(c, b); + bch2_btree_node_hash_remove(bc, b); six_unlock_write(&b->lock); six_unlock_intent(&b->lock); @@ -277,97 +284,97 @@ restart: goto out; if (sc->gfp_mask & __GFP_IO) - mutex_lock(&c->btree_cache_lock); - else if (!mutex_trylock(&c->btree_cache_lock)) + mutex_lock(&bc->lock); + else if (!mutex_trylock(&bc->lock)) goto out; goto restart; } else clear_btree_node_accessed(b); } - mutex_unlock(&c->btree_cache_lock); + mutex_unlock(&bc->lock); out: return (unsigned long) freed * btree_pages(c); } -static unsigned long bch2_mca_count(struct shrinker *shrink, - struct shrink_control *sc) +static unsigned long bch2_btree_cache_count(struct shrinker *shrink, + struct shrink_control *sc) { struct bch_fs *c = container_of(shrink, struct bch_fs, - btree_cache_shrink); + btree_cache.shrink); + struct btree_cache *bc = &c->btree_cache; if (btree_shrinker_disabled(c)) return 0; - return mca_can_free(c) * btree_pages(c); + return btree_cache_can_free(bc) * btree_pages(c); } -void bch2_fs_btree_exit(struct bch_fs *c) +void bch2_fs_btree_cache_exit(struct bch_fs *c) { + struct btree_cache *bc = &c->btree_cache; struct btree *b; unsigned i; - if (c->btree_cache_shrink.list.next) - unregister_shrinker(&c->btree_cache_shrink); + if (bc->shrink.list.next) + unregister_shrinker(&bc->shrink); - mutex_lock(&c->btree_cache_lock); + mutex_lock(&bc->lock); #ifdef CONFIG_BCACHEFS_DEBUG if (c->verify_data) - list_move(&c->verify_data->list, &c->btree_cache); + list_move(&c->verify_data->list, &bc->live); kvpfree(c->verify_ondisk, btree_bytes(c)); #endif for (i = 0; i < BTREE_ID_NR; i++) if (c->btree_roots[i].b) - list_add(&c->btree_roots[i].b->list, &c->btree_cache); + list_add(&c->btree_roots[i].b->list, &bc->live); - list_splice(&c->btree_cache_freeable, - &c->btree_cache); + 
list_splice(&bc->freeable, &bc->live); - while (!list_empty(&c->btree_cache)) { - b = list_first_entry(&c->btree_cache, struct btree, list); + while (!list_empty(&bc->live)) { + b = list_first_entry(&bc->live, struct btree, list); if (btree_node_dirty(b)) bch2_btree_complete_write(c, b, btree_current_write(b)); clear_btree_node_dirty(b); - mca_data_free(c, b); + btree_node_data_free(c, b); } - while (!list_empty(&c->btree_cache_freed)) { - b = list_first_entry(&c->btree_cache_freed, - struct btree, list); + while (!list_empty(&bc->freed)) { + b = list_first_entry(&bc->freed, struct btree, list); list_del(&b->list); kfree(b); } - mutex_unlock(&c->btree_cache_lock); + mutex_unlock(&bc->lock); - if (c->btree_cache_table_init_done) - rhashtable_destroy(&c->btree_cache_table); + if (bc->table_init_done) + rhashtable_destroy(&bc->table); } -int bch2_fs_btree_init(struct bch_fs *c) +int bch2_fs_btree_cache_init(struct bch_fs *c) { + struct btree_cache *bc = &c->btree_cache; unsigned i; int ret; - ret = rhashtable_init(&c->btree_cache_table, &bch_btree_cache_params); + ret = rhashtable_init(&bc->table, &bch_btree_cache_params); if (ret) return ret; - c->btree_cache_table_init_done = true; + bc->table_init_done = true; bch2_recalc_btree_reserve(c); - for (i = 0; i < c->btree_cache_reserve; i++) - if (!mca_bucket_alloc(c, GFP_KERNEL)) + for (i = 0; i < bc->reserve; i++) + if (!btree_node_mem_alloc(c, GFP_KERNEL)) return -ENOMEM; - list_splice_init(&c->btree_cache, - &c->btree_cache_freeable); + list_splice_init(&bc->live, &bc->freeable); #ifdef CONFIG_BCACHEFS_DEBUG mutex_init(&c->verify_lock); @@ -376,42 +383,53 @@ int bch2_fs_btree_init(struct bch_fs *c) if (!c->verify_ondisk) return -ENOMEM; - c->verify_data = mca_bucket_alloc(c, GFP_KERNEL); + c->verify_data = btree_node_mem_alloc(c, GFP_KERNEL); if (!c->verify_data) return -ENOMEM; list_del_init(&c->verify_data->list); #endif - c->btree_cache_shrink.count_objects = bch2_mca_count; - c->btree_cache_shrink.scan_objects = bch2_mca_scan; - c->btree_cache_shrink.seeks = 4; - c->btree_cache_shrink.batch = btree_pages(c) * 2; - register_shrinker(&c->btree_cache_shrink); + bc->shrink.count_objects = bch2_btree_cache_count; + bc->shrink.scan_objects = bch2_btree_cache_scan; + bc->shrink.seeks = 4; + bc->shrink.batch = btree_pages(c) * 2; + register_shrinker(&bc->shrink); return 0; } +void bch2_fs_btree_cache_init_early(struct btree_cache *bc) +{ + mutex_init(&bc->lock); + INIT_LIST_HEAD(&bc->live); + INIT_LIST_HEAD(&bc->freeable); + INIT_LIST_HEAD(&bc->freed); +} + /* * We can only have one thread cannibalizing other cached btree nodes at a time, * or we'll deadlock. We use an open coded mutex to ensure that, which a * cannibalize_bucket() will take. This means every time we unlock the root of * the btree, we need to release this lock if we have it held. 
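The comment above describes an open coded mutex: bch2_btree_cache_cannibalize_lock()/unlock(), which follow, claim bc->alloc_lock with a cmpxchg of the current task pointer and park waiters on a closure waitlist. Below is a rough userspace model of that pattern using C11 atomics and invented names; it yields instead of sleeping on a waitlist, so it is only a sketch of the idea.

#include <stdatomic.h>
#include <stdbool.h>
#include <sched.h>

struct task { int dummy; };

static _Atomic(struct task *) alloc_lock;

static bool cannibalize_trylock(struct task *current)
{
	struct task *expected = NULL;

	/* cmpxchg(NULL -> current); also succeeds if we already hold it. */
	return atomic_compare_exchange_strong(&alloc_lock, &expected, current) ||
	       expected == current;
}

static void cannibalize_lock_wait(struct task *current)
{
	while (!cannibalize_trylock(current))
		sched_yield();	/* the real code sleeps on bc->alloc_wait */
}

static void cannibalize_unlock(struct task *current)
{
	/* Only the owner drops the lock; the real code then wakes waiters. */
	if (atomic_load(&alloc_lock) == current)
		atomic_store(&alloc_lock, NULL);
}

int main(void)
{
	struct task me = { 0 };

	cannibalize_lock_wait(&me);
	cannibalize_unlock(&me);
	return 0;
}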
*/ -void bch2_btree_node_cannibalize_unlock(struct bch_fs *c) +void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c) { - if (c->btree_cache_alloc_lock == current) { + struct btree_cache *bc = &c->btree_cache; + + if (bc->alloc_lock == current) { trace_btree_node_cannibalize_unlock(c); - c->btree_cache_alloc_lock = NULL; - closure_wake_up(&c->mca_wait); + bc->alloc_lock = NULL; + closure_wake_up(&bc->alloc_wait); } } -int bch2_btree_node_cannibalize_lock(struct bch_fs *c, struct closure *cl) +int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) { + struct btree_cache *bc = &c->btree_cache; struct task_struct *old; - old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current); + old = cmpxchg(&bc->alloc_lock, NULL, current); if (old == NULL || old == current) goto success; @@ -420,13 +438,13 @@ int bch2_btree_node_cannibalize_lock(struct bch_fs *c, struct closure *cl) return -ENOMEM; } - closure_wait(&c->mca_wait, cl); + closure_wait(&bc->alloc_wait, cl); /* Try again, after adding ourselves to waitlist */ - old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current); + old = cmpxchg(&bc->alloc_lock, NULL, current); if (old == NULL || old == current) { /* We raced */ - closure_wake_up(&c->mca_wait); + closure_wake_up(&bc->alloc_wait); goto success; } @@ -438,16 +456,17 @@ success: return 0; } -static struct btree *mca_cannibalize(struct bch_fs *c) +static struct btree *btree_node_cannibalize(struct bch_fs *c) { + struct btree_cache *bc = &c->btree_cache; struct btree *b; - list_for_each_entry_reverse(b, &c->btree_cache, list) + list_for_each_entry_reverse(b, &bc->live, list) if (!btree_node_reclaim(c, b)) return b; while (1) { - list_for_each_entry_reverse(b, &c->btree_cache, list) + list_for_each_entry_reverse(b, &bc->live, list) if (!btree_node_write_and_reclaim(c, b)) return b; @@ -462,16 +481,17 @@ static struct btree *mca_cannibalize(struct bch_fs *c) struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) { + struct btree_cache *bc = &c->btree_cache; struct btree *b; u64 start_time = local_clock(); - mutex_lock(&c->btree_cache_lock); + mutex_lock(&bc->lock); /* * btree_free() doesn't free memory; it sticks the node on the end of * the list. Check if there's any freed nodes there: */ - list_for_each_entry(b, &c->btree_cache_freeable, list) + list_for_each_entry(b, &bc->freeable, list) if (!btree_node_reclaim(c, b)) goto out_unlock; @@ -479,9 +499,9 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) * We never free struct btree itself, just the memory that holds the on * disk node. 
Check the freed list before allocating a new one: */ - list_for_each_entry(b, &c->btree_cache_freed, list) + list_for_each_entry(b, &bc->freed, list) if (!btree_node_reclaim(c, b)) { - mca_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO); + btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO); if (b->data) goto out_unlock; @@ -490,7 +510,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) goto err; } - b = mca_bucket_alloc(c, __GFP_NOWARN|GFP_NOIO); + b = btree_node_mem_alloc(c, __GFP_NOWARN|GFP_NOIO); if (!b) goto err; @@ -501,7 +521,7 @@ out_unlock: BUG_ON(btree_node_write_in_flight(b)); list_del_init(&b->list); - mutex_unlock(&c->btree_cache_lock); + mutex_unlock(&bc->lock); out: b->flags = 0; b->written = 0; @@ -517,18 +537,18 @@ out: return b; err: /* Try to cannibalize another cached btree node: */ - if (c->btree_cache_alloc_lock == current) { - b = mca_cannibalize(c); + if (bc->alloc_lock == current) { + b = btree_node_cannibalize(c); list_del_init(&b->list); - mutex_unlock(&c->btree_cache_lock); + mutex_unlock(&bc->lock); - bch2_btree_node_hash_remove(c, b); + bch2_btree_node_hash_remove(bc, b); trace_btree_node_cannibalize(c); goto out; } - mutex_unlock(&c->btree_cache_lock); + mutex_unlock(&bc->lock); return ERR_PTR(-ENOMEM); } @@ -539,6 +559,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, unsigned level, enum six_lock_type lock_type) { + struct btree_cache *bc = &c->btree_cache; struct btree *b; /* @@ -552,15 +573,15 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, return b; bkey_copy(&b->key, k); - if (bch2_btree_node_hash_insert(c, b, level, iter->btree_id)) { + if (bch2_btree_node_hash_insert(bc, b, level, iter->btree_id)) { /* raced with another fill: */ /* mark as unhashed... */ bkey_i_to_extent(&b->key)->v._data[0] = 0; - mutex_lock(&c->btree_cache_lock); - list_add(&b->list, &c->btree_cache_freeable); - mutex_unlock(&c->btree_cache_lock); + mutex_lock(&bc->lock); + list_add(&b->list, &bc->freeable); + mutex_unlock(&bc->lock); six_unlock_write(&b->lock); six_unlock_intent(&b->lock); @@ -601,13 +622,14 @@ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, const struct bkey_i *k, unsigned level, enum six_lock_type lock_type) { + struct btree_cache *bc = &c->btree_cache; struct btree *b; struct bset_tree *t; BUG_ON(level >= BTREE_MAX_DEPTH); retry: rcu_read_lock(); - b = mca_find(c, k); + b = btree_cache_find(bc, k); rcu_read_unlock(); if (unlikely(!b)) { @@ -755,12 +777,13 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, void bch2_btree_node_prefetch(struct bch_fs *c, const struct bkey_i *k, unsigned level, enum btree_id btree_id) { + struct btree_cache *bc = &c->btree_cache; struct btree *b; BUG_ON(level >= BTREE_MAX_DEPTH); rcu_read_lock(); - b = mca_find(c, k); + b = btree_cache_find(bc, k); rcu_read_unlock(); if (b) @@ -771,15 +794,15 @@ void bch2_btree_node_prefetch(struct bch_fs *c, const struct bkey_i *k, return; bkey_copy(&b->key, k); - if (bch2_btree_node_hash_insert(c, b, level, btree_id)) { + if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { /* raced with another fill: */ /* mark as unhashed... 
*/ bkey_i_to_extent(&b->key)->v._data[0] = 0; - mutex_lock(&c->btree_cache_lock); - list_add(&b->list, &c->btree_cache_freeable); - mutex_unlock(&c->btree_cache_lock); + mutex_lock(&bc->lock); + list_add(&b->list, &bc->freeable); + mutex_unlock(&bc->lock); goto out; } diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index 5e836acd..46d536eb 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -11,13 +11,13 @@ extern const char * const bch2_btree_ids[]; void bch2_recalc_btree_reserve(struct bch_fs *); -void bch2_btree_node_hash_remove(struct bch_fs *, struct btree *); -int __bch2_btree_node_hash_insert(struct bch_fs *, struct btree *); -int bch2_btree_node_hash_insert(struct bch_fs *, struct btree *, +void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); +int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); +int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, unsigned, enum btree_id); -void bch2_btree_node_cannibalize_unlock(struct bch_fs *); -int bch2_btree_node_cannibalize_lock(struct bch_fs *, struct closure *); +void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); +int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); @@ -32,8 +32,9 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, void bch2_btree_node_prefetch(struct bch_fs *, const struct bkey_i *, unsigned, enum btree_id); -void bch2_fs_btree_exit(struct bch_fs *); -int bch2_fs_btree_init(struct bch_fs *); +void bch2_fs_btree_cache_exit(struct bch_fs *); +int bch2_fs_btree_cache_init(struct bch_fs *); +void bch2_fs_btree_cache_init_early(struct btree_cache *); #define PTR_HASH(_k) (bkey_i_to_extent_c(_k)->v._data[0]) diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index e5cc00cc..b0901965 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -278,9 +278,12 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) { struct bch_dev *ca; struct open_bucket *ob; + const struct open_bucket_ptr *ptr; size_t i, j, iter; unsigned ci; + down_write(&c->alloc_gc_lock); + for_each_member_device(ca, c, ci) { spin_lock(&ca->freelist_lock); @@ -291,21 +294,26 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) fifo_for_each_entry(i, &ca->free[j], iter) bch2_mark_alloc_bucket(ca, &ca->buckets[i], true); + for (ptr = ca->open_buckets_partial; + ptr < ca->open_buckets_partial + ca->open_buckets_partial_nr; + ptr++) + bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), true); + spin_unlock(&ca->freelist_lock); } for (ob = c->open_buckets; ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { - const struct bch_extent_ptr *ptr; - - mutex_lock(&ob->lock); + spin_lock(&ob->lock); open_bucket_for_each_ptr(ob, ptr) { - ca = c->devs[ptr->dev]; - bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), true); + ca = c->devs[ptr->ptr.dev]; + bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), true); } - mutex_unlock(&ob->lock); + spin_unlock(&ob->lock); } + + up_write(&c->alloc_gc_lock); } static void mark_metadata_sectors(struct bch_dev *ca, u64 start, u64 end, diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 507a6a9d..d50e9e8e 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1364,17 +1364,17 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, closure_init_stack(&cl); do { - ret = bch2_btree_node_cannibalize_lock(c, &cl); + ret = 
bch2_btree_cache_cannibalize_lock(c, &cl); closure_sync(&cl); } while (ret); b = bch2_btree_node_mem_alloc(c); - bch2_btree_node_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(c); BUG_ON(IS_ERR(b)); bkey_copy(&b->key, k); - BUG_ON(bch2_btree_node_hash_insert(c, b, level, id)); + BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id)); bch2_btree_node_read(c, b, true); six_unlock_write(&b->lock); @@ -1844,8 +1844,8 @@ void bch2_btree_verify_flushed(struct bch_fs *c) unsigned i; rcu_read_lock(); - tbl = rht_dereference_rcu(c->btree_cache_table.tbl, - &c->btree_cache_table); + tbl = rht_dereference_rcu(c->btree_cache.table.tbl, + &c->btree_cache.table); for (i = 0; i < tbl->size; i++) rht_for_each_entry_rcu(b, pos, tbl, i, hash) diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index b1b62339..b0e64957 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -769,7 +769,7 @@ retry_all: closure_init_stack(&cl); do { - ret = bch2_btree_node_cannibalize_lock(c, &cl); + ret = bch2_btree_cache_cannibalize_lock(c, &cl); closure_sync(&cl); } while (ret); } @@ -817,7 +817,7 @@ retry: ret = btree_iter_linked(iter) ? -EINTR : 0; out: - bch2_btree_node_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(c); return ret; io_error: BUG_ON(ret != -EIO); diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index c0c16205..8b4df034 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -130,6 +130,42 @@ struct btree { #endif }; +struct btree_cache { + struct rhashtable table; + bool table_init_done; + /* + * We never free a struct btree, except on shutdown - we just put it on + * the btree_cache_freed list and reuse it later. This simplifies the + * code, and it doesn't cost us much memory as the memory usage is + * dominated by buffers that hold the actual btree node data and those + * can be freed - and the number of struct btrees allocated is + * effectively bounded. + * + * btree_cache_freeable effectively is a small cache - we use it because + * high order page allocations can be rather expensive, and it's quite + * common to delete and allocate btree nodes in quick succession. It + * should never grow past ~2-3 nodes in practice. 
+ */ + struct mutex lock; + struct list_head live; + struct list_head freeable; + struct list_head freed; + + /* Number of elements in live + freeable lists */ + unsigned used; + unsigned reserve; + struct shrinker shrink; + + /* + * If we need to allocate memory for a new btree node and that + * allocation fails, we can cannibalize another node in the btree cache + * to satisfy the allocation - lock to guarantee only one thread does + * this at a time: + */ + struct task_struct *alloc_lock; + struct closure_waitlist alloc_wait; +}; + #define BTREE_FLAG(flag) \ static inline bool btree_node_ ## flag(struct btree *b) \ { return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 922a4863..2efb01c1 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -237,11 +237,11 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b, six_lock_write(&b->lock); - bch2_btree_node_hash_remove(c, b); + bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_lock(&c->btree_cache_lock); - list_move(&b->list, &c->btree_cache_freeable); - mutex_unlock(&c->btree_cache_lock); + mutex_lock(&c->btree_cache.lock); + list_move(&b->list, &c->btree_cache.freeable); + mutex_unlock(&c->btree_cache.lock); /* * By using six_unlock_write() directly instead of @@ -339,11 +339,11 @@ retry: bkey_extent_init(&tmp.k); tmp.k.k.size = c->opts.btree_node_size, - ob = bch2_alloc_sectors(c, &c->btree_write_point, - bkey_i_to_extent(&tmp.k), - res->nr_replicas, - c->opts.metadata_replicas_required, - alloc_reserve, cl); + ob = bch2_alloc_sectors(c, BCH_DATA_BTREE, 0, 0, + bkey_i_to_extent(&tmp.k), + res->nr_replicas, + c->opts.metadata_replicas_required, + alloc_reserve, 0, cl); if (IS_ERR(ob)) return ERR_CAST(ob); @@ -374,7 +374,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev b = as->reserve->b[--as->reserve->nr]; - BUG_ON(bch2_btree_node_hash_insert(c, b, level, as->btree_id)); + BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id)); set_btree_node_accessed(b); set_btree_node_dirty(b); @@ -515,7 +515,7 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c, * Protects reaping from the btree node cache and using the btree node * open bucket reserve: */ - ret = bch2_btree_node_cannibalize_lock(c, cl); + ret = bch2_btree_cache_cannibalize_lock(c, cl); if (ret) { bch2_disk_reservation_put(c, &disk_res); return ERR_PTR(ret); @@ -543,11 +543,11 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c, reserve->b[reserve->nr++] = b; } - bch2_btree_node_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(c); return reserve; err_free: bch2_btree_reserve_put(c, reserve); - bch2_btree_node_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(c); trace_btree_reserve_get_fail(c, nr_nodes, cl); return ERR_PTR(ret); } @@ -1015,9 +1015,9 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id, static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) { /* Root nodes cannot be reaped */ - mutex_lock(&c->btree_cache_lock); + mutex_lock(&c->btree_cache.lock); list_del_init(&b->list); - mutex_unlock(&c->btree_cache_lock); + mutex_unlock(&c->btree_cache.lock); mutex_lock(&c->btree_root_lock); btree_node_root(c, b) = b; @@ -1802,7 +1802,7 @@ retry: PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) { /* bch2_btree_reserve_get will unlock */ do { - ret = bch2_btree_node_cannibalize_lock(c, 
&cl); + ret = bch2_btree_cache_cannibalize_lock(c, &cl); closure_sync(&cl); } while (ret == -EAGAIN); @@ -1873,23 +1873,24 @@ retry: if (parent) { if (new_hash) { bkey_copy(&new_hash->key, &new_key->k_i); - BUG_ON(bch2_btree_node_hash_insert(c, new_hash, - b->level, b->btree_id)); + ret = bch2_btree_node_hash_insert(&c->btree_cache, + new_hash, b->level, b->btree_id); + BUG_ON(ret); } bch2_btree_insert_node(as, parent, &iter, &keylist_single(&new_key->k_i)); if (new_hash) { - mutex_lock(&c->btree_cache_lock); - bch2_btree_node_hash_remove(c, new_hash); + mutex_lock(&c->btree_cache.lock); + bch2_btree_node_hash_remove(&c->btree_cache, new_hash); - bch2_btree_node_hash_remove(c, b); + bch2_btree_node_hash_remove(&c->btree_cache, b); bkey_copy(&b->key, &new_key->k_i); - ret = __bch2_btree_node_hash_insert(c, b); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); BUG_ON(ret); - mutex_unlock(&c->btree_cache_lock); + mutex_unlock(&c->btree_cache.lock); } else { bkey_copy(&b->key, &new_key->k_i); } @@ -1918,9 +1919,9 @@ retry: bch2_btree_update_done(as); out: if (new_hash) { - mutex_lock(&c->btree_cache_lock); - list_move(&new_hash->list, &c->btree_cache_freeable); - mutex_unlock(&c->btree_cache_lock); + mutex_lock(&c->btree_cache.lock); + list_move(&new_hash->list, &c->btree_cache.freeable); + mutex_unlock(&c->btree_cache.lock); six_unlock_write(&new_hash->lock); six_unlock_intent(&new_hash->lock); diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index fbc31012..6fdbb464 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -407,8 +407,11 @@ void bch2_mark_metadata_bucket(struct bch_dev *ca, struct bucket *g, static int __disk_sectors(const union bch_extent_crc *crc, unsigned sectors) { - return sectors * crc_compressed_size(NULL, crc) / - crc_uncompressed_size(NULL, crc); + if (!sectors) + return 0; + + return max(1U, DIV_ROUND_UP(sectors * crc_compressed_size(NULL, crc), + crc_uncompressed_size(NULL, crc))); } /* diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c index c8a03c7f..7b45bb78 100644 --- a/libbcachefs/compress.c +++ b/libbcachefs/compress.c @@ -25,7 +25,7 @@ static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) { void *b; - BUG_ON(size > c->sb.encoded_extent_max); + BUG_ON(size > c->sb.encoded_extent_max << 9); b = kmalloc(size, GFP_NOIO|__GFP_NOWARN); if (b) @@ -164,8 +164,8 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, } break; case BCH_COMPRESSION_LZ4: - ret = LZ4_decompress_safe(src_data.b, dst_data, - src_len, dst_len); + ret = LZ4_decompress_safe_partial(src_data.b, dst_data, + src_len, dst_len, dst_len); if (ret != dst_len) { ret = -EIO; goto err; @@ -269,7 +269,8 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9; int ret = -ENOMEM; - if (crc_uncompressed_size(NULL, &crc) < c->sb.encoded_extent_max) + if (crc_uncompressed_size(NULL, &crc) > c->sb.encoded_extent_max || + crc_compressed_size(NULL, &crc) > c->sb.encoded_extent_max) return -EIO; dst_data = dst_len == dst_iter.bi_size @@ -294,7 +295,7 @@ static int __bio_compress(struct bch_fs *c, { struct bbuf src_data = { NULL }, dst_data = { NULL }; unsigned pad; - int ret; + int ret = 0; dst_data = bio_map_or_bounce(c, dst, WRITE); src_data = bio_map_or_bounce(c, src, READ); @@ -307,23 +308,28 @@ static int __bio_compress(struct bch_fs *c, void *workspace; int len = src->bi_iter.bi_size; - ret = 0; - workspace = mempool_alloc(&c->lz4_workspace_pool, GFP_NOIO); - while (len > 
block_bytes(c) && - (!(ret = LZ4_compress_destSize( + while (1) { + if (len <= block_bytes(c)) { + ret = 0; + break; + } + + ret = LZ4_compress_destSize( src_data.b, dst_data.b, &len, dst->bi_iter.bi_size, - workspace)) || - (len & (block_bytes(c) - 1)))) { - /* - * On error, the compressed data was bigger than - * dst_len - round down to nearest block and try again: - */ + workspace); + if (ret >= len) { + /* uncompressible: */ + ret = 0; + break; + } + + if (!(len & (block_bytes(c) - 1))) + break; len = round_down(len, block_bytes(c)); } - mempool_free(workspace, &c->lz4_workspace_pool); if (!ret) @@ -331,6 +337,7 @@ static int __bio_compress(struct bch_fs *c, *src_len = len; *dst_len = ret; + ret = 0; break; } case BCH_COMPRESSION_GZIP: { @@ -446,20 +453,22 @@ int bch2_check_set_has_compressed_data(struct bch_fs *c, unsigned compression_type) { switch (compression_type) { - case BCH_COMPRESSION_NONE: + case BCH_COMPRESSION_OPT_NONE: return 0; - case BCH_COMPRESSION_LZ4: + case BCH_COMPRESSION_OPT_LZ4: if (bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4)) return 0; bch2_sb_set_feature(c->disk_sb, BCH_FEATURE_LZ4); break; - case BCH_COMPRESSION_GZIP: + case BCH_COMPRESSION_OPT_GZIP: if (bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) return 0; bch2_sb_set_feature(c->disk_sb, BCH_FEATURE_GZIP); break; + default: + BUG(); } return bch2_fs_compress_init(c); diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 1937f4cb..7d2f5ccb 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -511,19 +511,19 @@ static void extent_pick_read_device(struct bch_fs *c, struct bch_dev *ca = c->devs[ptr->dev]; if (ptr->cached && ptr_stale(ca, ptr)) - return; + continue; if (ca->mi.state == BCH_MEMBER_STATE_FAILED) - return; + continue; if (avoid && test_bit(ca->dev_idx, avoid->d)) - return; + continue; if (pick->ca && pick->ca->mi.tier < ca->mi.tier) - return; + continue; if (!percpu_ref_tryget(&ca->io_ref)) - return; + continue; if (pick->ca) percpu_ref_put(&pick->ca->io_ref); diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 5eb62f9d..8b41be87 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -974,7 +974,8 @@ alloc_io: (struct disk_reservation) { .nr_replicas = c->opts.data_replicas, }, - foreground_write_point(c, inode->ei_last_dirtied), + c->fastest_devs, + inode->ei_last_dirtied, POS(inum, 0), &inode->ei_journal_seq, BCH_WRITE_THROTTLE); @@ -1545,10 +1546,11 @@ static void bch2_do_direct_IO_write(struct dio_write *dio) dio->iop.is_dio = true; dio->iop.new_i_size = U64_MAX; bch2_write_op_init(&dio->iop.op, dio->c, dio->res, - foreground_write_point(dio->c, (unsigned long) current), - POS(inode->v.i_ino, (dio->offset + dio->written) >> 9), - &inode->ei_journal_seq, - flags|BCH_WRITE_THROTTLE); + dio->c->fastest_devs, + (unsigned long) dio->task, + POS(inode->v.i_ino, (dio->offset + dio->written) >> 9), + &inode->ei_journal_seq, + flags|BCH_WRITE_THROTTLE); dio->iop.op.index_update_fn = bchfs_write_index_update; dio->res.sectors -= bio_sectors(bio); @@ -1568,13 +1570,13 @@ static void bch2_dio_write_loop_async(struct closure *cl) bch2_dio_write_done(dio); if (dio->iter.count && !dio->error) { - use_mm(dio->mm); + use_mm(dio->task->mm); pagecache_block_get(&mapping->add_lock); bch2_do_direct_IO_write(dio); pagecache_block_put(&mapping->add_lock); - unuse_mm(dio->mm); + unuse_mm(dio->task->mm); continue_at(&dio->cl, bch2_dio_write_loop_async, NULL); } else { @@ -1617,7 +1619,7 @@ static int bch2_direct_IO_write(struct bch_fs *c, dio->offset = offset; 
dio->iovec = NULL; dio->iter = *iter; - dio->mm = current->mm; + dio->task = current; closure_init(&dio->cl, NULL); if (offset + iter->count > inode->v.i_size) diff --git a/libbcachefs/fs-io.h b/libbcachefs/fs-io.h index dfdc9b52..505cea73 100644 --- a/libbcachefs/fs-io.h +++ b/libbcachefs/fs-io.h @@ -74,7 +74,7 @@ struct dio_write { struct iovec inline_vecs[UIO_FASTIOV]; struct iov_iter iter; - struct mm_struct *mm; + struct task_struct *task; /* must be last: */ struct bchfs_write_op iop; diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 946c75bb..e5fc72da 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -350,7 +350,7 @@ static void init_append_extent(struct bch_write_op *op, bch2_keylist_push(&op->insert_keys); } -static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob) +static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) { struct bch_fs *c = op->c; struct bio *orig = &op->wbio.bio; @@ -371,7 +371,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob) /* Need to decompress data? */ if ((op->flags & BCH_WRITE_DATA_COMPRESSED) && (crc_uncompressed_size(NULL, &op->crc) != op->size || - crc_compressed_size(NULL, &op->crc) > ob->sectors_free)) { + crc_compressed_size(NULL, &op->crc) > wp->sectors_free)) { int ret; ret = bch2_bio_uncompress_inplace(c, orig, op->size, op->crc); @@ -389,7 +389,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob) op->crc.nonce, op->crc.csum, op->crc.csum_type, - ob); + wp->ob); bio = orig; wbio = wbio_init(bio); @@ -398,7 +398,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob) compression_type != BCH_COMPRESSION_NONE) { /* all units here in bytes */ unsigned total_output = 0, output_available = - min(ob->sectors_free << 9, orig->bi_iter.bi_size); + min(wp->sectors_free << 9, orig->bi_iter.bi_size); unsigned crc_nonce = bch2_csum_type_is_encryption(csum_type) ? op->nonce : 0; struct bch_csum csum; @@ -441,7 +441,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob) init_append_extent(op, dst_len >> 9, src_len >> 9, fragment_compression_type, - crc_nonce, csum, csum_type, ob); + crc_nonce, csum, csum_type, wp->ob); total_output += dst_len; bio_advance(bio, dst_len); @@ -468,14 +468,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob) more = orig->bi_iter.bi_size != 0; } else { - bio = bio_next_split(orig, ob->sectors_free, GFP_NOIO, + bio = bio_next_split(orig, wp->sectors_free, GFP_NOIO, &c->bio_write); wbio = wbio_init(bio); wbio->put_bio = bio != orig; init_append_extent(op, bio_sectors(bio), bio_sectors(bio), compression_type, 0, - (struct bch_csum) { 0 }, csum_type, ob); + (struct bch_csum) { 0 }, csum_type, wp->ob); more = bio != orig; } @@ -505,7 +505,8 @@ static void __bch2_write(struct closure *cl) struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; unsigned open_bucket_nr = 0; - struct open_bucket *b; + struct write_point *wp; + struct open_bucket *ob; int ret; do { @@ -519,16 +520,19 @@ static void __bch2_write(struct closure *cl) BKEY_EXTENT_U64s_MAX)) continue_at(cl, bch2_write_index, index_update_wq(op)); - b = bch2_alloc_sectors_start(c, op->wp, + wp = bch2_alloc_sectors_start(c, BCH_DATA_USER, + op->devs, + op->write_point, op->nr_replicas, c->opts.data_replicas_required, op->alloc_reserve, + op->flags, (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? 
NULL : cl); - EBUG_ON(!b); + EBUG_ON(!wp); - if (unlikely(IS_ERR(b))) { - if (unlikely(PTR_ERR(b) != -EAGAIN)) { - ret = PTR_ERR(b); + if (unlikely(IS_ERR(wp))) { + if (unlikely(PTR_ERR(wp) != -EAGAIN)) { + ret = PTR_ERR(wp); goto err; } @@ -561,13 +565,15 @@ static void __bch2_write(struct closure *cl) continue; } - BUG_ON(b - c->open_buckets == 0 || - b - c->open_buckets > U8_MAX); - op->open_buckets[open_bucket_nr++] = b - c->open_buckets; + ob = wp->ob; - ret = bch2_write_extent(op, b); + BUG_ON(ob - c->open_buckets == 0 || + ob - c->open_buckets > U8_MAX); + op->open_buckets[open_bucket_nr++] = ob - c->open_buckets; - bch2_alloc_sectors_done(c, op->wp, b); + ret = bch2_write_extent(op, wp); + + bch2_alloc_sectors_done(c, wp); if (ret < 0) goto err; @@ -704,7 +710,9 @@ void bch2_write(struct closure *cl) void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, struct disk_reservation res, - struct write_point *wp, struct bpos pos, + struct bch_devs_mask *devs, + unsigned long write_point, + struct bpos pos, u64 *journal_seq, unsigned flags) { EBUG_ON(res.sectors && !res.nr_replicas); @@ -723,7 +731,8 @@ void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->pos = pos; op->version = ZERO_VERSION; op->res = res; - op->wp = wp; + op->devs = devs; + op->write_point = write_point; if (journal_seq) { op->journal_seq_p = journal_seq; @@ -826,6 +835,7 @@ static struct promote_op *promote_alloc(struct bch_fs *c, * Adjust bio to correspond to _live_ portion of @k - * which might be less than what we're actually reading: */ + bio->bi_iter.bi_size = sectors << 9; bio_advance(bio, pick->crc.offset << 9); BUG_ON(bio_sectors(bio) < k.k->size); bio->bi_iter.bi_size = k.k->size << 9; @@ -836,7 +846,8 @@ static struct promote_op *promote_alloc(struct bch_fs *c, */ op->write.op.pos.offset = iter.bi_sector; } - bch2_migrate_write_init(c, &op->write, &c->promote_write_point, + bch2_migrate_write_init(c, &op->write, + c->fastest_devs, k, NULL, BCH_WRITE_ALLOC_NOWAIT| BCH_WRITE_CACHED); diff --git a/libbcachefs/io.h b/libbcachefs/io.h index 674cdf7a..658c15a5 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -22,11 +22,12 @@ enum bch_write_flags { BCH_WRITE_FLUSH = (1 << 2), BCH_WRITE_DATA_COMPRESSED = (1 << 3), BCH_WRITE_THROTTLE = (1 << 4), + BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 5), /* Internal: */ - BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 5), - BCH_WRITE_DONE = (1 << 6), - BCH_WRITE_LOOPED = (1 << 7), + BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 6), + BCH_WRITE_DONE = (1 << 7), + BCH_WRITE_LOOPED = (1 << 8), }; static inline u64 *op_journal_seq(struct bch_write_op *op) @@ -35,15 +36,10 @@ static inline u64 *op_journal_seq(struct bch_write_op *op) ? 
op->journal_seq_p : &op->journal_seq; } -static inline struct write_point *foreground_write_point(struct bch_fs *c, - unsigned long v) -{ - return c->write_points + - hash_long(v, ilog2(ARRAY_SIZE(c->write_points))); -} - void bch2_write_op_init(struct bch_write_op *, struct bch_fs *, - struct disk_reservation, struct write_point *, + struct disk_reservation, + struct bch_devs_mask *, + unsigned long, struct bpos, u64 *, unsigned); void bch2_write(struct closure *); diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h index ae4f8f3c..f77106be 100644 --- a/libbcachefs/io_types.h +++ b/libbcachefs/io_types.h @@ -116,9 +116,10 @@ struct bch_write_op { struct bch_extent_crc128 crc; unsigned size; - struct disk_reservation res; + struct bch_devs_mask *devs; + unsigned long write_point; - struct write_point *wp; + struct disk_reservation res; union { u8 open_buckets[16]; diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index c6659259..d7f27a3d 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -15,6 +15,7 @@ static int issue_migration_move(struct bch_dev *ca, struct moving_context *ctxt, + struct bch_devs_mask *devs, struct bkey_s_c k) { struct bch_fs *c = ca->fs; @@ -33,7 +34,7 @@ static int issue_migration_move(struct bch_dev *ca, found: /* XXX: we need to be doing something with the disk reservation */ - ret = bch2_data_move(c, ctxt, &c->migration_write_point, k, ptr); + ret = bch2_data_move(c, ctxt, devs, k, ptr); if (ret) bch2_disk_reservation_put(c, &res); return ret; @@ -110,7 +111,7 @@ int bch2_move_data_off_device(struct bch_dev *ca) ca->dev_idx)) goto next; - ret = issue_migration_move(ca, &ctxt, k); + ret = issue_migration_move(ca, &ctxt, NULL, k); if (ret == -ENOMEM) { bch2_btree_iter_unlock(&iter); diff --git a/libbcachefs/move.c b/libbcachefs/move.c index f78cd72f..0c5b924c 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -139,7 +139,7 @@ out: void bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, - struct write_point *wp, + struct bch_devs_mask *devs, struct bkey_s_c k, const struct bch_extent_ptr *move_ptr, unsigned flags) @@ -155,8 +155,10 @@ void bch2_migrate_write_init(struct bch_fs *c, (move_ptr && move_ptr->cached)) flags |= BCH_WRITE_CACHED; - bch2_write_op_init(&m->op, c, (struct disk_reservation) { 0 }, wp, - bkey_start_pos(k.k), NULL, flags); + bch2_write_op_init(&m->op, c, (struct disk_reservation) { 0 }, + devs, (unsigned long) current, + bkey_start_pos(k.k), NULL, + flags|BCH_WRITE_ONLY_SPECIFIED_DEVS); if (m->move) m->op.alloc_reserve = RESERVE_MOVINGGC; @@ -249,7 +251,7 @@ static void read_moving_endio(struct bio *bio) int bch2_data_move(struct bch_fs *c, struct moving_context *ctxt, - struct write_point *wp, + struct bch_devs_mask *devs, struct bkey_s_c k, const struct bch_extent_ptr *move_ptr) { @@ -280,7 +282,7 @@ int bch2_data_move(struct bch_fs *c, migrate_bio_init(io, &io->write.op.wbio.bio, k.k->size); - bch2_migrate_write_init(c, &io->write, wp, k, move_ptr, 0); + bch2_migrate_write_init(c, &io->write, devs, k, move_ptr, 0); trace_move_read(&io->write.key.k); diff --git a/libbcachefs/move.h b/libbcachefs/move.h index 71edcf13..a756a462 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -20,12 +20,9 @@ struct migrate_write { struct bch_write_op op; }; -void bch2_migrate_write_init(struct bch_fs *, - struct migrate_write *, - struct write_point *, - struct bkey_s_c, - const struct bch_extent_ptr *, - unsigned); +void bch2_migrate_write_init(struct bch_fs *, struct migrate_write *, + struct 
bch_devs_mask *, struct bkey_s_c, + const struct bch_extent_ptr *, unsigned); #define SECTORS_IN_FLIGHT_PER_DEVICE 2048 @@ -69,11 +66,9 @@ struct moving_io { struct bio_vec bi_inline_vecs[0]; }; -int bch2_data_move(struct bch_fs *, - struct moving_context *, - struct write_point *, - struct bkey_s_c, - const struct bch_extent_ptr *); +int bch2_data_move(struct bch_fs *, struct moving_context *, + struct bch_devs_mask *, struct bkey_s_c, + const struct bch_extent_ptr *); int bch2_move_ctxt_wait(struct moving_context *); void bch2_move_ctxt_wait_for_io(struct moving_context *); diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 72cbb9d5..125159ee 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -14,6 +14,7 @@ #include "keylist.h" #include "move.h" #include "movinggc.h" +#include "super-io.h" #include #include @@ -72,7 +73,7 @@ static int issue_moving_gc_move(struct bch_dev *ca, if (!ptr) /* We raced - bucket's been reused */ return 0; - ret = bch2_data_move(c, ctxt, &ca->copygc_write_point, k, ptr); + ret = bch2_data_move(c, ctxt, &ca->self, k, ptr); if (!ret) trace_gc_copy(k.k); else diff --git a/libbcachefs/super.c b/libbcachefs/super.c index dfb95d0d..0342778d 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -376,7 +376,7 @@ err: static void bch2_fs_free(struct bch_fs *c) { bch2_fs_encryption_exit(c); - bch2_fs_btree_exit(c); + bch2_fs_btree_cache_exit(c); bch2_fs_journal_exit(&c->journal); bch2_io_clock_exit(&c->io_clock[WRITE]); bch2_io_clock_exit(&c->io_clock[READ]); @@ -491,7 +491,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mutex_init(&c->state_lock); mutex_init(&c->sb_lock); mutex_init(&c->replicas_gc_lock); - mutex_init(&c->btree_cache_lock); mutex_init(&c->bucket_lock); mutex_init(&c->btree_root_lock); INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); @@ -507,9 +506,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_tiering_init(c); INIT_LIST_HEAD(&c->list); - INIT_LIST_HEAD(&c->btree_cache); - INIT_LIST_HEAD(&c->btree_cache_freeable); - INIT_LIST_HEAD(&c->btree_cache_freed); INIT_LIST_HEAD(&c->btree_interior_update_list); mutex_init(&c->btree_reserve_cache_lock); @@ -546,6 +542,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->journal.blocked_time = &c->journal_blocked_time; c->journal.flush_seq_time = &c->journal_flush_seq_time; + bch2_fs_btree_cache_init_early(&c->btree_cache); + mutex_lock(&c->sb_lock); if (bch2_sb_to_fs(c, sb)) { @@ -599,7 +597,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_io_clock_init(&c->io_clock[READ]) || bch2_io_clock_init(&c->io_clock[WRITE]) || bch2_fs_journal_init(&c->journal) || - bch2_fs_btree_init(c) || + bch2_fs_btree_cache_init(c) || bch2_fs_encryption_init(c) || bch2_fs_compress_init(c) || bch2_check_set_has_compressed_data(c, c->opts.compression)) @@ -1107,8 +1105,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ca->dev_idx = dev_idx; __set_bit(ca->dev_idx, ca->self.d); - ca->copygc_write_point.type = BCH_DATA_USER; - spin_lock_init(&ca->freelist_lock); bch2_dev_moving_gc_init(ca); @@ -1169,8 +1165,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) for (i = 0; i < RESERVE_NR; i++) total_reserve += ca->free[i].size; - ca->copygc_write_point.group = &ca->self; - ca->fs = c; rcu_assign_pointer(c->devs[ca->dev_idx], ca); diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 07d9be75..c20769b7 100644 --- 
a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -209,11 +209,11 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) size_t ret = 0; struct btree *b; - mutex_lock(&c->btree_cache_lock); - list_for_each_entry(b, &c->btree_cache, list) + mutex_lock(&c->btree_cache.lock); + list_for_each_entry(b, &c->btree_cache.live, list) ret += btree_bytes(c); - mutex_unlock(&c->btree_cache_lock); + mutex_unlock(&c->btree_cache.lock); return ret; } @@ -436,7 +436,7 @@ STORE(__bch2_fs) sc.gfp_mask = GFP_KERNEL; sc.nr_to_scan = strtoul_or_return(buf); - c->btree_cache_shrink.scan_objects(&c->btree_cache_shrink, &sc); + c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); } return size; diff --git a/libbcachefs/tier.c b/libbcachefs/tier.c index b68cae75..cbfcfccc 100644 --- a/libbcachefs/tier.c +++ b/libbcachefs/tier.c @@ -54,7 +54,7 @@ static int issue_tiering_move(struct bch_fs *c, { int ret; - ret = bch2_data_move(c, ctxt, &tier->wp, k, NULL); + ret = bch2_data_move(c, ctxt, &tier->devs, k, NULL); if (!ret) trace_tiering_copy(k.k); else @@ -241,6 +241,5 @@ void bch2_fs_tiering_init(struct bch_fs *c) for (i = 0; i < ARRAY_SIZE(c->tiers); i++) { c->tiers[i].idx = i; bch2_pd_controller_init(&c->tiers[i].pd); - c->tiers[i].wp.group = &c->tiers[i].devs; } }
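
The sketches that follow are illustrative user-space models of a few mechanisms touched by this patch; none of this code is part of the patch itself, and any name not appearing in the diff above is invented for the example.

The struct btree_cache comment in btree_types.h describes the live/freeable/freed scheme: a struct btree is never truly freed, and the freeable list acts as a small cache of node buffers so that deleting and reallocating btree nodes in quick succession does not repeatedly pay for expensive high-order allocations. A minimal sketch of that reuse pattern, assuming nothing about the real locking or shrinker:

#include <stdlib.h>

/*
 * Toy model of the live/freeable idea (toy_* names are made up):
 * freeing a node keeps its buffer on a freeable list, and the next
 * allocation reuses that buffer instead of calling malloc() again.
 */
struct toy_node {
	struct toy_node	*next;
	void		*data;		/* stands in for the btree node buffer */
};

struct toy_cache {
	struct toy_node	*live;
	struct toy_node	*freeable;
	size_t		buf_size;
};

static struct toy_node *toy_node_alloc(struct toy_cache *c)
{
	struct toy_node *n = c->freeable;

	if (n) {
		/* cheap path: reuse a node and buffer we already have */
		c->freeable = n->next;
	} else {
		/* expensive path: allocate a new node and buffer */
		n = malloc(sizeof(*n));
		if (!n)
			return NULL;
		n->data = malloc(c->buf_size);
		if (!n->data) {
			free(n);
			return NULL;
		}
	}

	n->next = c->live;
	c->live = n;
	return n;
}

static void toy_node_free(struct toy_cache *c, struct toy_node *n)
{
	struct toy_node **p;

	/* unlink from the live list... */
	for (p = &c->live; *p; p = &(*p)->next)
		if (*p == n) {
			*p = n->next;
			break;
		}

	/* ...and stash it on freeable instead of freeing the buffer */
	n->next = c->freeable;
	c->freeable = n;
}

int main(void)
{
	struct toy_cache c = { .buf_size = 4096 };
	struct toy_node *a = toy_node_alloc(&c);

	toy_node_free(&c, a);
	/* reuses a's node and buffer rather than allocating again: */
	toy_node_alloc(&c);
	return 0;
}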
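
The __disk_sectors() change in buckets.c is a rounding fix: sectors * compressed_size / uncompressed_size truncates toward zero, so a small live slice of a compressed extent could be accounted as occupying zero disk sectors. A tiny standalone before/after comparison (the macros are local stand-ins, not the kernel ones):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define MAX(a, b)		((a) > (b) ? (a) : (b))

/* old behaviour: integer division truncates toward zero */
static unsigned disk_sectors_old(unsigned sectors, unsigned compressed,
				 unsigned uncompressed)
{
	return sectors * compressed / uncompressed;
}

/* new behaviour: a non-empty range never accounts to zero sectors */
static unsigned disk_sectors_new(unsigned sectors, unsigned compressed,
				 unsigned uncompressed)
{
	if (!sectors)
		return 0;

	return MAX(1U, DIV_ROUND_UP(sectors * compressed, uncompressed));
}

int main(void)
{
	/* 1 live sector of an extent compressed 4:1 (128 -> 32 sectors) */
	printf("old: %u\n", disk_sectors_old(1, 32, 128));	/* 0 */
	printf("new: %u\n", disk_sectors_new(1, 32, 128));	/* 1 */
	return 0;
}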
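
The rewritten LZ4 branch of __bio_compress() loops on a destSize-style compressor: if the number of source bytes consumed is not block aligned it rounds down and retries, and if the output would not actually be smaller it falls back to writing the data uncompressed. A rough user-space analogue against liblz4's LZ4_compress_destSize() is below; the kernel variant used in the patch additionally takes a workspace argument, and BLOCK_BYTES/compress_bounded are invented names:

/* build with: cc destsize.c -llz4 */
#include <lz4.h>
#include <stdio.h>
#include <string.h>

#define BLOCK_BYTES	512		/* stand-in for block_bytes(c) */

/*
 * Compress at most *src_len bytes into dst (dst_capacity bytes).
 * Returns the compressed size with *src_len updated to the block-aligned
 * number of source bytes consumed, or 0 if the data should be written
 * uncompressed.
 */
static int compress_bounded(const char *src, int *src_len,
			    char *dst, int dst_capacity)
{
	int len = *src_len;

	while (1) {
		int consumed, ret;

		if (len <= BLOCK_BYTES)
			return 0;		/* not worth compressing */

		consumed = len;
		ret = LZ4_compress_destSize(src, dst, &consumed, dst_capacity);
		if (ret <= 0 || ret >= consumed)
			return 0;		/* error, or incompressible */

		if (!(consumed & (BLOCK_BYTES - 1))) {
			*src_len = consumed;
			return ret;
		}

		/* consumed a partial block: round down and retry */
		len = consumed & ~(BLOCK_BYTES - 1);
	}
}

int main(void)
{
	static char src[16384], dst[8192];
	int src_len = sizeof(src), ret;

	memset(src, 'x', sizeof(src));	/* highly compressible input */

	ret = compress_bounded(src, &src_len, dst, sizeof(dst));
	printf("compressed %d source bytes into %d bytes\n", src_len, ret);
	return 0;
}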
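
Several call sites above stop using foreground_write_point(c, v), which hashed an opaque unsigned long (a task pointer, or an inode's last-dirtied stamp) into a fixed array of write points, and instead pass that unsigned long plus a device mask straight to bch2_write_op_init(), so bch2_alloc_sectors_start() resolves the write point itself. The toy below only models the token-hashing idea, to show why values such as (unsigned long) dio->task make good tokens; toy_write_point and hash_token are invented, and the Fibonacci hash merely stands in for the kernel's hash_long():

#include <stdio.h>

#define WRITE_POINT_BITS	5
#define NR_WRITE_POINTS		(1UL << WRITE_POINT_BITS)

struct toy_write_point {
	unsigned long	last_token;	/* last writer seen on this slot */
};

static struct toy_write_point write_points[NR_WRITE_POINTS];

/* Fibonacci hashing, standing in for hash_long() */
static unsigned hash_token(unsigned long v)
{
	return (unsigned) ((v * 0x9E3779B97F4A7C15ULL) >> (64 - WRITE_POINT_BITS));
}

static struct toy_write_point *writepoint_find(unsigned long token)
{
	struct toy_write_point *wp = &write_points[hash_token(token)];

	wp->last_token = token;
	return wp;
}

int main(void)
{
	unsigned long task_a = 0x1000, task_b = 0x2460;

	printf("task_a -> slot %ld\n", (long) (writepoint_find(task_a) - write_points));
	printf("task_b -> slot %ld\n", (long) (writepoint_find(task_b) - write_points));
	/* same token again -> same slot, so writes from one task stay together */
	printf("task_a -> slot %ld\n", (long) (writepoint_find(task_a) - write_points));
	return 0;
}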