diff --git a/.bcachefs_revision b/.bcachefs_revision index b6371345..9f81e277 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -defaad6d47791d3e6285cba323f92847b6e4c226 +8bf4b038d41230504d3f0315a35e4d7a056e0a65 diff --git a/libbcachefs.c b/libbcachefs.c index 3ce69d1b..c8738f40 100644 --- a/libbcachefs.c +++ b/libbcachefs.c @@ -26,8 +26,6 @@ #define NSEC_PER_SEC 1000000000L -#define BCH_MIN_NR_NBUCKETS (1 << 10) - /* minimum size filesystem we can create, given a bucket size: */ static u64 min_size(unsigned bucket_size) { diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index c3efb435..9ff61deb 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -373,6 +373,11 @@ static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw) } } +static inline u64 bucket_clock_freq(u64 capacity) +{ + return max(capacity >> 10, 2028ULL); +} + static void bch2_inc_clock_hand(struct io_timer *timer) { struct bucket_clock *clock = container_of(timer, @@ -411,7 +416,7 @@ static void bch2_inc_clock_hand(struct io_timer *timer) * RW mode (that will be 0 when we're RO, yet we can still service * reads) */ - timer->expire += capacity >> 10; + timer->expire += bucket_clock_freq(capacity); bch2_io_timer_add(&c->io_clock[clock->rw], timer); } @@ -423,7 +428,7 @@ static void bch2_bucket_clock_init(struct bch_fs *c, int rw) clock->hand = 1; clock->rw = rw; clock->rescale.fn = bch2_inc_clock_hand; - clock->rescale.expire = c->capacity >> 10; + clock->rescale.expire = bucket_clock_freq(c->capacity); mutex_init(&clock->lock); } @@ -974,6 +979,7 @@ void bch2_recalc_capacity(struct bch_fs *c) { struct bch_dev *ca; u64 capacity = 0, reserved_sectors = 0, gc_reserve; + unsigned bucket_size_max = 0; unsigned long ra_pages = 0; unsigned i, j; @@ -1009,14 +1015,9 @@ void bch2_recalc_capacity(struct bch_fs *c) for (j = 0; j < RESERVE_NONE; j++) dev_reserve += ca->free[j].size; - dev_reserve += ca->free_inc.size; - - 
dev_reserve += ARRAY_SIZE(c->write_points); - dev_reserve += 1; /* btree write point */ dev_reserve += 1; /* copygc write point */ dev_reserve += 1; /* rebalance write point */ - dev_reserve += WRITE_POINT_COUNT; dev_reserve *= ca->mi.bucket_size; @@ -1026,6 +1027,9 @@ void bch2_recalc_capacity(struct bch_fs *c) ca->mi.first_bucket); reserved_sectors += dev_reserve * 2; + + bucket_size_max = max_t(unsigned, bucket_size_max, + ca->mi.bucket_size); } gc_reserve = c->opts.gc_reserve_bytes @@ -1038,6 +1042,8 @@ void bch2_recalc_capacity(struct bch_fs *c) c->capacity = capacity - reserved_sectors; + c->bucket_size_max = bucket_size_max; + if (c->capacity) { bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale); @@ -1329,8 +1335,6 @@ not_enough: * invalidated on disk: */ if (invalidating_data) { - BUG(); - pr_info("holding writes"); pr_debug("invalidating existing data"); set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); } else { @@ -1390,40 +1394,12 @@ int bch2_fs_allocator_start(struct bch_fs *c) return bch2_alloc_write(c); } -void bch2_fs_allocator_init(struct bch_fs *c) +void bch2_fs_allocator_background_init(struct bch_fs *c) { - struct open_bucket *ob; - struct write_point *wp; - - mutex_init(&c->write_points_hash_lock); spin_lock_init(&c->freelist_lock); bch2_bucket_clock_init(c, READ); bch2_bucket_clock_init(c, WRITE); - /* open bucket 0 is a sentinal NULL: */ - spin_lock_init(&c->open_buckets[0].lock); - - for (ob = c->open_buckets + 1; - ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { - spin_lock_init(&ob->lock); - c->open_buckets_nr_free++; - - ob->freelist = c->open_buckets_freelist; - c->open_buckets_freelist = ob - c->open_buckets; - } - - writepoint_init(&c->btree_write_point, BCH_DATA_BTREE); - writepoint_init(&c->rebalance_write_point, BCH_DATA_USER); - - for (wp = c->write_points; - wp < c->write_points + ARRAY_SIZE(c->write_points); wp++) { - writepoint_init(wp, BCH_DATA_USER); - - wp->last_used = sched_clock(); - 
wp->write_point = (unsigned long) wp; - hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); - } - c->pd_controllers_update_seconds = 5; INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); } diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index b5dbf7eb..ea07705b 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -5,7 +5,7 @@ #include "alloc_types.h" #include "debug.h" -#define ALLOC_SCAN_BATCH(ca) ((ca)->mi.nbuckets >> 9) +#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); int bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); @@ -56,6 +56,6 @@ int bch2_dev_allocator_start(struct bch_dev *); int bch2_alloc_write(struct bch_fs *); int bch2_fs_allocator_start(struct bch_fs *); -void bch2_fs_allocator_init(struct bch_fs *); +void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index be94196e..06859960 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -491,7 +491,7 @@ void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, mutex_lock(&wp->lock); open_bucket_for_each(c, &wp->ptrs, ob, i) - if (ob->ptr.dev == ca->dev_idx) + if (!ca || ob->ptr.dev == ca->dev_idx) open_bucket_free_unused(c, wp, ob); else ob_push(c, &ptrs, ob); @@ -500,6 +500,15 @@ void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, mutex_unlock(&wp->lock); } +static inline struct hlist_head *writepoint_hash(struct bch_fs *c, + unsigned long write_point) +{ + unsigned hash = + hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash))); + + return &c->write_points_hash[hash]; +} + static struct write_point *__writepoint_find(struct hlist_head *head, unsigned long write_point) { @@ -512,6 +521,53 @@ static struct write_point 
*__writepoint_find(struct hlist_head *head, return NULL; } +static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) +{ + u64 stranded = c->write_points_nr * c->bucket_size_max; + u64 free = bch2_fs_sectors_free(c, bch2_fs_usage_read(c)); + + return stranded * factor > free; +} + +static bool try_increase_writepoints(struct bch_fs *c) +{ + struct write_point *wp; + + if (c->write_points_nr == ARRAY_SIZE(c->write_points) || + too_many_writepoints(c, 32)) + return false; + + wp = c->write_points + c->write_points_nr++; + hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); + return true; +} + +static bool try_decrease_writepoints(struct bch_fs *c, + unsigned old_nr) +{ + struct write_point *wp; + + mutex_lock(&c->write_points_hash_lock); + if (c->write_points_nr < old_nr) { + mutex_unlock(&c->write_points_hash_lock); + return true; + } + + if (c->write_points_nr == 1 || + !too_many_writepoints(c, 8)) { + mutex_unlock(&c->write_points_hash_lock); + return false; + } + + wp = c->write_points + --c->write_points_nr; + + hlist_del_rcu(&wp->node); + mutex_unlock(&c->write_points_hash_lock); + + bch2_writepoint_stop(c, NULL, wp); + return true; +} + static struct write_point *writepoint_find(struct bch_fs *c, unsigned long write_point) { @@ -535,16 +591,22 @@ lock_wp: mutex_unlock(&wp->lock); goto restart_find; } - +restart_find_oldest: oldest = NULL; for (wp = c->write_points; - wp < c->write_points + ARRAY_SIZE(c->write_points); - wp++) + wp < c->write_points + c->write_points_nr; wp++) if (!oldest || time_before64(wp->last_used, oldest->last_used)) oldest = wp; mutex_lock(&oldest->lock); mutex_lock(&c->write_points_hash_lock); + if (oldest >= c->write_points + c->write_points_nr || + try_increase_writepoints(c)) { + mutex_unlock(&c->write_points_hash_lock); + mutex_unlock(&oldest->lock); + goto restart_find_oldest; + } + wp = __writepoint_find(head, write_point); if (wp && wp != oldest) { mutex_unlock(&c->write_points_hash_lock); @@ 
-580,10 +642,12 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, unsigned nr_effective = 0; struct open_buckets ptrs = { .nr = 0 }; bool have_cache = false; + unsigned write_points_nr; int ret = 0, i; BUG_ON(!nr_replicas || !nr_replicas_required); - +retry: + write_points_nr = c->write_points_nr; wp = writepoint_find(c, write_point.v); if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { @@ -636,6 +700,11 @@ err: wp->ptrs = ptrs; mutex_unlock(&wp->lock); + + if (ret == -ENOSPC && + try_decrease_writepoints(c, write_points_nr)) + goto retry; + return ERR_PTR(ret); } @@ -687,3 +756,37 @@ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) bch2_open_buckets_put(c, &ptrs); } + +void bch2_fs_allocator_foreground_init(struct bch_fs *c) +{ + struct open_bucket *ob; + struct write_point *wp; + + mutex_init(&c->write_points_hash_lock); + c->write_points_nr = ARRAY_SIZE(c->write_points); + + /* open bucket 0 is a sentinel NULL: */ + spin_lock_init(&c->open_buckets[0].lock); + + for (ob = c->open_buckets + 1; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { + spin_lock_init(&ob->lock); + c->open_buckets_nr_free++; + + ob->freelist = c->open_buckets_freelist; + c->open_buckets_freelist = ob - c->open_buckets; + } + + writepoint_init(&c->btree_write_point, BCH_DATA_BTREE); + writepoint_init(&c->rebalance_write_point, BCH_DATA_USER); + + for (wp = c->write_points; + wp < c->write_points + c->write_points_nr; wp++) { + writepoint_init(wp, BCH_DATA_USER); + + wp->last_used = sched_clock(); + wp->write_point = (unsigned long) wp; + hlist_add_head_rcu(&wp->node, + writepoint_hash(c, wp->write_point)); + } +} diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h index ae9844b5..729afc92 100644 --- a/libbcachefs/alloc_foreground.h +++ b/libbcachefs/alloc_foreground.h @@ -90,15 +90,6 @@ void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); void bch2_writepoint_stop(struct bch_fs *, struct 
bch_dev *, struct write_point *); -static inline struct hlist_head *writepoint_hash(struct bch_fs *c, - unsigned long write_point) -{ - unsigned hash = - hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash))); - - return &c->write_points_hash[hash]; -} - static inline struct write_point_specifier writepoint_hashed(unsigned long v) { return (struct write_point_specifier) { .v = v | 1 }; @@ -116,4 +107,6 @@ static inline void writepoint_init(struct write_point *wp, wp->type = type; } +void bch2_fs_allocator_foreground_init(struct bch_fs *); + #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h index 94c041d2..110663ff 100644 --- a/libbcachefs/alloc_types.h +++ b/libbcachefs/alloc_types.h @@ -45,7 +45,9 @@ typedef FIFO(long) alloc_fifo; /* Enough for 16 cache devices, 2 tiers and some left over for pipelining */ #define OPEN_BUCKETS_COUNT 256 -#define WRITE_POINT_COUNT 32 + +#define WRITE_POINT_HASH_NR 32 +#define WRITE_POINT_MAX 32 struct open_bucket { spinlock_t lock; diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 6d5c7d6b..e23f45e8 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -322,7 +322,7 @@ enum bch_time_stats { #define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1)) /* Size of the freelist we allocate btree nodes from: */ -#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4) +#define BTREE_NODE_RESERVE BTREE_RESERVE_MAX struct btree; @@ -598,6 +598,7 @@ struct bch_fs { * and forces them to be revalidated */ u32 capacity_gen; + unsigned bucket_size_max; atomic64_t sectors_available; @@ -627,9 +628,10 @@ struct bch_fs { struct write_point btree_write_point; struct write_point rebalance_write_point; - struct write_point write_points[WRITE_POINT_COUNT]; - struct hlist_head write_points_hash[WRITE_POINT_COUNT]; + struct write_point write_points[WRITE_POINT_MAX]; + struct hlist_head write_points_hash[WRITE_POINT_HASH_NR]; struct mutex 
write_points_hash_lock; + unsigned write_points_nr; /* GARBAGE COLLECTION */ struct task_struct *gc_thread; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index cdf392b3..7ad080bf 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -904,6 +904,8 @@ struct bch_sb_field_journal { /* BCH_SB_FIELD_members: */ +#define BCH_MIN_NR_NBUCKETS (1 << 6) + struct bch_member { uuid_le uuid; __le64 nbuckets; /* device size */ @@ -1381,7 +1383,7 @@ struct jset { LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); -#define BCH_JOURNAL_BUCKETS_MIN 20 +#define BCH_JOURNAL_BUCKETS_MIN 8 /* Btree: */ diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 271c02f1..15a07e36 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -299,11 +299,6 @@ u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats) return min(c->capacity, __bch2_fs_sectors_used(c, stats)); } -static u64 bch2_fs_sectors_free(struct bch_fs *c, struct bch_fs_usage stats) -{ - return c->capacity - bch2_fs_sectors_used(c, stats); -} - static inline int is_unavailable_bucket(struct bucket_mark m) { return !is_available_bucket(m); @@ -883,9 +878,9 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, ca->mi.bucket_size / c->opts.btree_node_size); /* XXX: these should be tunable */ - size_t reserve_none = max_t(size_t, 4, nbuckets >> 9); - size_t copygc_reserve = max_t(size_t, 16, nbuckets >> 7); - size_t free_inc_nr = max(max_t(size_t, 16, nbuckets >> 12), + size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); + size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7); + size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), btree_reserve); bool resize = ca->buckets != NULL, start_copygc = ca->copygc_thread != NULL; diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 
d9fe938a..17b82cd0 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -174,6 +174,12 @@ void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); +static inline u64 bch2_fs_sectors_free(struct bch_fs *c, + struct bch_fs_usage stats) +{ + return c->capacity - bch2_fs_sectors_used(c, stats); +} + static inline bool is_available_bucket(struct bucket_mark mark) { return (!mark.owned_by_allocator && diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 902f39f6..f530f202 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -277,7 +277,7 @@ out: return ret; err: fsck_err: - BUG_ON(!ret); + pr_err("Error in recovery: %s (%i)", err, ret); goto out; } @@ -380,6 +380,6 @@ int bch2_fs_initialize(struct bch_fs *c) return 0; err: - BUG_ON(!ret); + pr_err("Error initializing new filesystem: %s (%i)", err, ret); return ret; } diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 54de9fac..8ef5db3d 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -808,7 +808,7 @@ static const char *bch2_sb_validate_members(struct bch_sb *sb, return "Too many buckets"; if (le64_to_cpu(m->nbuckets) - - le16_to_cpu(m->first_bucket) < 1 << 10) + le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) return "Not enough buckets"; if (le16_to_cpu(m->bucket_size) < diff --git a/libbcachefs/super.c b/libbcachefs/super.c index be28d40f..b7a6f5fb 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -556,7 +556,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); - bch2_fs_allocator_init(c); + bch2_fs_allocator_background_init(c); + bch2_fs_allocator_foreground_init(c); bch2_fs_rebalance_init(c); bch2_fs_quota_init(c);