From c35fbbc025c6099969befb4dfaf065215cf40cf3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 23 Apr 2017 21:56:57 -0800 Subject: [PATCH] Update bcachefs sources to 2e70771b8d --- .bcachefs_revision | 2 +- cmd_debug.c | 3 +- cmd_migrate.c | 4 +- include/linux/backing-dev.h | 1 + include/linux/bio.h | 6 +- include/trace/events/bcachefs.h | 6 +- libbcachefs/alloc.c | 80 +++++----- libbcachefs/alloc.h | 1 + libbcachefs/bcachefs.h | 10 +- libbcachefs/bcachefs_format.h | 36 +---- libbcachefs/bset.c | 30 ++-- libbcachefs/btree_cache.c | 49 ++++++- libbcachefs/btree_cache.h | 3 + libbcachefs/btree_gc.c | 8 +- libbcachefs/btree_io.c | 80 +++++++--- libbcachefs/btree_io.h | 11 +- libbcachefs/btree_iter.c | 108 +++++++++----- libbcachefs/btree_iter.h | 124 +++++++--------- libbcachefs/btree_types.h | 2 + libbcachefs/btree_update.c | 13 +- libbcachefs/buckets.c | 1 - libbcachefs/buckets.h | 37 +---- libbcachefs/buckets_types.h | 11 +- libbcachefs/clock.c | 6 +- libbcachefs/clock_types.h | 4 +- libbcachefs/debug.c | 6 +- libbcachefs/dirent.c | 12 +- libbcachefs/extents.c | 39 ++--- libbcachefs/eytzinger.h | 252 ++++++++++++++++++++++---------- libbcachefs/fs-io.c | 39 ++--- libbcachefs/fs.c | 6 +- libbcachefs/fsck.c | 20 +-- libbcachefs/inode.c | 10 +- libbcachefs/io.c | 16 +- libbcachefs/journal.c | 8 +- libbcachefs/migrate.c | 10 +- libbcachefs/move.c | 10 +- libbcachefs/movinggc.c | 107 +++++++++----- libbcachefs/six.c | 4 +- libbcachefs/str_hash.h | 29 ++-- libbcachefs/super.c | 13 +- libbcachefs/sysfs.c | 4 +- libbcachefs/tier.c | 3 +- libbcachefs/util.c | 101 +++++++++++++ libbcachefs/util.h | 80 +++++----- libbcachefs/xattr.c | 2 +- linux/bio.c | 6 +- 47 files changed, 874 insertions(+), 539 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index 5f16f468..b04d44bc 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -846600a41b7853588796a5403b07347d36c5a65c +2e70771b8dc0d0f2d0356a5a7d16cab9430cd49e diff --git a/cmd_debug.c b/cmd_debug.c index d825753d..195e5885 100644 --- a/cmd_debug.c +++ b/cmd_debug.c @@ -160,7 +160,8 @@ static void list_keys(struct bch_fs *c, enum btree_id btree_id, struct bkey_s_c k; char buf[512]; - for_each_btree_key(&iter, c, btree_id, start, k) { + for_each_btree_key(&iter, c, btree_id, start, + BTREE_ITER_PREFETCH, k) { if (bkey_cmp(k.k->p, end) > 0) break; diff --git a/cmd_migrate.c b/cmd_migrate.c index a18aae10..72cc004d 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -259,9 +259,7 @@ static void write_data(struct bch_fs *c, closure_init_stack(&cl); - bio_init(&bio.bio); - bio.bio.bi_max_vecs = 1; - bio.bio.bi_io_vec = &bv; + bio_init(&bio.bio, &bv, 1); bio.bio.bi_iter.bi_size = len; bch2_bio_map(&bio.bio, buf); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index a68fca4b..01b2c153 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -9,6 +9,7 @@ enum wb_congested_state { }; struct backing_dev_info { + struct list_head bdi_list; unsigned ra_pages; unsigned capabilities; diff --git a/include/linux/bio.h b/include/linux/bio.h index 49d26b53..10cad5cc 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -451,11 +451,15 @@ static inline struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) return bio_clone_bioset(bio, gfp_mask, NULL); } -static inline void bio_init(struct bio *bio) +static inline void bio_init(struct bio *bio, struct bio_vec *table, + unsigned short max_vecs) { memset(bio, 0, sizeof(*bio)); atomic_set(&bio->__bi_remaining, 1); 
atomic_set(&bio->__bi_cnt, 1); + + bio->bi_io_vec = table; + bio->bi_max_vecs = max_vecs; } #endif /* __LINUX_BIO_H */ diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index 7dea9d63..06cb5ff3 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -90,8 +90,7 @@ DECLARE_EVENT_CLASS(bio, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u", @@ -156,8 +155,7 @@ TRACE_EVENT(write_throttle, __entry->inode = inode; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); __entry->delay = delay; ), diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c index d5d2679f..a4e412ea 100644 --- a/libbcachefs/alloc.c +++ b/libbcachefs/alloc.c @@ -233,11 +233,8 @@ static void pd_controllers_update(struct work_struct *work) static int prio_io(struct bch_dev *ca, uint64_t bucket, int op) { - bio_init(ca->bio_prio); - bio_set_op_attrs(ca->bio_prio, op, REQ_SYNC|REQ_META); - - ca->bio_prio->bi_max_vecs = bucket_pages(ca); - ca->bio_prio->bi_io_vec = ca->bio_prio->bi_inline_vecs; + bio_init(ca->bio_prio, ca->bio_prio->bi_inline_vecs, bucket_pages(ca)); + ca->bio_prio->bi_opf = op|REQ_SYNC|REQ_META; ca->bio_prio->bi_iter.bi_sector = bucket * ca->mi.bucket_size; ca->bio_prio->bi_bdev = ca->disk_sb.bdev; ca->bio_prio->bi_iter.bi_size = bucket_bytes(ca); @@ -636,9 +633,10 @@ static inline bool can_inc_bucket_gen(struct bch_dev *ca, struct bucket *g) return bucket_gc_gen(ca, g) < BUCKET_GC_GEN_MAX; } -static bool bch2_can_invalidate_bucket(struct bch_dev *ca, struct bucket *g) +static bool bch2_can_invalidate_bucket(struct bch_dev *ca, struct bucket *g, + struct bucket_mark mark) { - if (!is_available_bucket(READ_ONCE(g->mark))) + if (!is_available_bucket(mark)) return false; if (bucket_gc_gen(ca, g) >= BUCKET_GC_GEN_MAX - 1) @@ -679,24 +677,38 @@ static void bch2_invalidate_one_bucket(struct bch_dev *ca, struct bucket *g) * btree GC to rewrite nodes with stale pointers. 
*/ -#define bucket_sort_key(g) \ -({ \ - unsigned long prio = g->read_prio - ca->min_prio[READ]; \ - prio = (prio * 7) / (ca->fs->prio_clock[READ].hand - \ - ca->min_prio[READ]); \ - \ - (((prio + 1) * bucket_sectors_used(g)) << 8) | bucket_gc_gen(ca, g);\ -}) +static unsigned long bucket_sort_key(bucket_heap *h, + struct bucket_heap_entry e) +{ + struct bch_dev *ca = container_of(h, struct bch_dev, alloc_heap); + struct bucket *g = ca->buckets + e.bucket; + unsigned long prio = g->read_prio - ca->min_prio[READ]; + prio = (prio * 7) / (ca->fs->prio_clock[READ].hand - + ca->min_prio[READ]); + + return (prio + 1) * bucket_sectors_used(e.mark); +} + +static inline int bucket_alloc_cmp(bucket_heap *h, + struct bucket_heap_entry l, + struct bucket_heap_entry r) +{ + return bucket_sort_key(h, l) - bucket_sort_key(h, r); +} + +static inline long bucket_idx_cmp(bucket_heap *h, + struct bucket_heap_entry l, + struct bucket_heap_entry r) +{ + return l.bucket - r.bucket; +} static void invalidate_buckets_lru(struct bch_dev *ca) { struct bucket_heap_entry e; struct bucket *g; - unsigned i; - mutex_lock(&ca->heap_lock); - - ca->heap.used = 0; + ca->alloc_heap.used = 0; mutex_lock(&ca->fs->bucket_lock); bch2_recalc_min_prio(ca, READ); @@ -708,37 +720,32 @@ static void invalidate_buckets_lru(struct bch_dev *ca) * all buckets have been visited. */ for_each_bucket(g, ca) { - if (!bch2_can_invalidate_bucket(ca, g)) + struct bucket_mark m = READ_ONCE(g->mark); + struct bucket_heap_entry e = { g - ca->buckets, m }; + + if (!bch2_can_invalidate_bucket(ca, g, m)) continue; - bucket_heap_push(ca, g, bucket_sort_key(g)); + heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp); } /* Sort buckets by physical location on disk for better locality */ - for (i = 0; i < ca->heap.used; i++) { - struct bucket_heap_entry *e = &ca->heap.data[i]; - - e->val = e->g - ca->buckets; - } - - heap_resort(&ca->heap, bucket_max_cmp); + heap_resort(&ca->alloc_heap, bucket_idx_cmp); /* * If we run out of buckets to invalidate, bch2_allocator_thread() will * kick stuff and retry us */ while (!fifo_full(&ca->free_inc) && - heap_pop(&ca->heap, e, bucket_max_cmp)) { - BUG_ON(!bch2_can_invalidate_bucket(ca, e.g)); - bch2_invalidate_one_bucket(ca, e.g); - } + heap_pop(&ca->alloc_heap, e, bucket_idx_cmp)) + bch2_invalidate_one_bucket(ca, &ca->buckets[e.bucket]); mutex_unlock(&ca->fs->bucket_lock); - mutex_unlock(&ca->heap_lock); } static void invalidate_buckets_fifo(struct bch_dev *ca) { + struct bucket_mark m; struct bucket *g; size_t checked = 0; @@ -748,8 +755,9 @@ static void invalidate_buckets_fifo(struct bch_dev *ca) ca->fifo_last_bucket = ca->mi.first_bucket; g = ca->buckets + ca->fifo_last_bucket++; + m = READ_ONCE(g->mark); - if (bch2_can_invalidate_bucket(ca, g)) + if (bch2_can_invalidate_bucket(ca, g, m)) bch2_invalidate_one_bucket(ca, g); if (++checked >= ca->mi.nbuckets) @@ -759,6 +767,7 @@ static void invalidate_buckets_fifo(struct bch_dev *ca) static void invalidate_buckets_random(struct bch_dev *ca) { + struct bucket_mark m; struct bucket *g; size_t checked = 0; @@ -768,8 +777,9 @@ static void invalidate_buckets_random(struct bch_dev *ca) ca->mi.first_bucket; g = ca->buckets + n; + m = READ_ONCE(g->mark); - if (bch2_can_invalidate_bucket(ca, g)) + if (bch2_can_invalidate_bucket(ca, g, m)) bch2_invalidate_one_bucket(ca, g); if (++checked >= ca->mi.nbuckets / 2) diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h index c6b57fa1..195108c2 100644 --- a/libbcachefs/alloc.h +++ b/libbcachefs/alloc.h @@ -1,6 +1,7 @@ #ifndef 
_BCACHE_ALLOC_H #define _BCACHE_ALLOC_H +#include "bcachefs.h" #include "alloc_types.h" struct bkey; diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index b1f2528a..6259b50e 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -1,5 +1,5 @@ -#ifndef _BCACHE_H -#define _BCACHE_H +#ifndef _BCACHEFS_H +#define _BCACHEFS_H /* * SOME HIGH LEVEL CODE DOCUMENTATION: @@ -418,8 +418,8 @@ struct bch_dev { atomic_long_t saturated_count; size_t inc_gen_needs_gc; - struct mutex heap_lock; - DECLARE_HEAP(struct bucket_heap_entry, heap); + bucket_heap alloc_heap; + bucket_heap copygc_heap; /* Moving GC: */ struct task_struct *moving_gc_read; @@ -803,4 +803,4 @@ static inline unsigned block_bytes(const struct bch_fs *c) return c->sb.block_size << 9; } -#endif /* _BCACHE_H */ +#endif /* _BCACHEFS_H */ diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index a99d96cd..ef854fb1 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -1,15 +1,10 @@ -#ifndef _LINUX_BCACHE_H -#define _LINUX_BCACHE_H +#ifndef _BCACHEFS_FORMAT_H +#define _BCACHEFS_FORMAT_H /* * Bcache on disk data structures */ -#ifdef __cplusplus -typedef bool _Bool; -extern "C" { -#endif - #include #include #include @@ -230,8 +225,6 @@ struct bkey_i { }; }; -#ifndef __cplusplus - #define KEY(_inode, _offset, _size) \ ((struct bkey) { \ .u64s = BKEY_U64s, \ @@ -240,24 +233,6 @@ struct bkey_i { .size = _size, \ }) -#else - -static inline struct bkey KEY(__u64 inode, __u64 offset, __u64 size) -{ - struct bkey ret; - - memset(&ret, 0, sizeof(ret)); - ret.u64s = BKEY_U64s; - ret.format = KEY_FORMAT_CURRENT; - ret.p.inode = inode; - ret.p.offset = offset; - ret.size = size; - - return ret; -} - -#endif - static inline void bkey_init(struct bkey *k) { *k = KEY(0, 0, 0); @@ -1344,9 +1319,4 @@ struct btree_node_entry { }; } __attribute__((packed, aligned(8))); -#ifdef __cplusplus -} -#endif -#endif /* _LINUX_BCACHE_H */ - -/* vim: set foldnestmax=2: */ +#endif /* _BCACHEFS_FORMAT_H */ diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index 280dcf3e..53627380 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -473,7 +473,7 @@ void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks) * in one cacheline in t->set (BSET_CACHELINE bytes). * * This means we don't have to store the full index of the key that a node in - * the binary tree points to; eytzinger_to_inorder() gives us the cacheline, and + * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and * then bkey_float->m gives us the offset within that cacheline, in units of 8 * bytes. * @@ -534,7 +534,7 @@ static inline struct bkey_packed *tree_to_bkey(const struct btree *b, unsigned j) { return cacheline_to_bkey(b, t, - __eytzinger_to_inorder(j, t->size, t->extra), + __eytzinger1_to_inorder(j, t->size, t->extra), bkey_float(b, t, j)->key_offset); } @@ -882,7 +882,7 @@ retry: t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; /* First we figure out where the first key in each cacheline is */ - eytzinger_for_each(j, t->size) { + eytzinger1_for_each(j, t->size) { while (bkey_to_cacheline(b, t, k) < cacheline) prev = k, k = bkey_next(k); @@ -905,7 +905,7 @@ retry: t->max_key = bkey_unpack_pos(b, k); /* Then we build the tree */ - eytzinger_for_each(j, t->size) + eytzinger1_for_each(j, t->size) make_bfloat(b, t, j, &min_key, &max_key); } @@ -996,7 +996,7 @@ static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, do { p = j ? 
tree_to_bkey(b, t, - __inorder_to_eytzinger(j--, + __inorder_to_eytzinger1(j--, t->size, t->extra)) : btree_bkey_first(b, t); } while (p >= k); @@ -1087,30 +1087,30 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b, if (inorder && inorder < t->size) { - j = __inorder_to_eytzinger(inorder, t->size, t->extra); + j = __inorder_to_eytzinger1(inorder, t->size, t->extra); if (k == tree_to_bkey(b, t, j)) { /* Fix the node this key corresponds to */ make_bfloat(b, t, j, &min_key, &max_key); /* Children for which this key is the right boundary */ - for (j = eytzinger_left_child(j); + for (j = eytzinger1_left_child(j); j < t->size; - j = eytzinger_right_child(j)) + j = eytzinger1_right_child(j)) make_bfloat(b, t, j, &min_key, &max_key); } } if (inorder + 1 < t->size) { - j = __inorder_to_eytzinger(inorder + 1, t->size, t->extra); + j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra); if (k == tree_to_prev_bkey(b, t, j)) { make_bfloat(b, t, j, &min_key, &max_key); /* Children for which this key is the left boundary */ - for (j = eytzinger_right_child(j); + for (j = eytzinger1_right_child(j); j < t->size; - j = eytzinger_left_child(j)) + j = eytzinger1_left_child(j)) make_bfloat(b, t, j, &min_key, &max_key); } } @@ -1331,7 +1331,7 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, p = bkey_float_get(base, n << 4); prefetch(p); } else if (n << 3 < t->size) { - inorder = __eytzinger_to_inorder(n, t->size, t->extra); + inorder = __eytzinger1_to_inorder(n, t->size, t->extra); p = bset_cacheline(b, t, inorder); #ifdef CONFIG_X86_64 asm(".intel_syntax noprefix;" @@ -1362,7 +1362,7 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, &search, packed_search, n); } while (n < t->size); - inorder = __eytzinger_to_inorder(n >> 1, t->size, t->extra); + inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra); /* * n would have been the node we recursed to - the low bit tells us if @@ -1372,7 +1372,7 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, return cacheline_to_bkey(b, t, inorder, f->key_offset); } else { if (--inorder) { - n = eytzinger_prev(n >> 1, t->size); + n = eytzinger1_prev(n >> 1, t->size); f = bkey_float_get(base, n); return cacheline_to_bkey(b, t, inorder, f->key_offset); } else @@ -1790,7 +1790,7 @@ int bch2_bkey_print_bfloat(struct btree *b, struct bkey_packed *k, if (!bset_has_ro_aux_tree(t)) goto out; - j = __inorder_to_eytzinger(bkey_to_cacheline(b, t, k), t->size, t->extra); + j = __inorder_to_eytzinger1(bkey_to_cacheline(b, t, k), t->size, t->extra); if (j && j < t->size && k == tree_to_bkey(b, t, j)) diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index c37c8959..bdbe21ac 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -163,10 +163,14 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) goto out_unlock; if (btree_node_dirty(b) || - btree_node_write_in_flight(b)) { + btree_node_write_in_flight(b) || + btree_node_read_in_flight(b)) { if (!flush) goto out_unlock; + wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, + TASK_UNINTERRUPTIBLE); + /* * Using the underscore version because we don't want to compact * bsets after the write, since this node is about to be evicted @@ -582,7 +586,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_iter *iter, if (btree_node_read_locked(iter, level + 1)) btree_node_unlock(iter, level + 1); - bch2_btree_node_read(c, b); + bch2_btree_node_read(c, b, true); six_unlock_write(&b->lock); 
if (lock_type == SIX_LOCK_read) @@ -673,6 +677,9 @@ retry: } } + wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, + TASK_UNINTERRUPTIBLE); + prefetch(b->aux_data); for_each_bset(b, t) { @@ -700,6 +707,44 @@ retry: return b; } +void bch2_btree_node_prefetch(struct btree_iter *iter, + const struct bkey_i *k, unsigned level) +{ + struct bch_fs *c = iter->c; + struct btree *b; + + BUG_ON(level >= BTREE_MAX_DEPTH); + + rcu_read_lock(); + b = mca_find(c, k); + rcu_read_unlock(); + + if (b) + return; + + b = bch2_btree_node_mem_alloc(c); + if (IS_ERR(b)) + return; + + bkey_copy(&b->key, k); + if (bch2_btree_node_hash_insert(c, b, level, iter->btree_id)) { + /* raced with another fill: */ + + /* mark as unhashed... */ + bkey_i_to_extent(&b->key)->v._data[0] = 0; + + mutex_lock(&c->btree_cache_lock); + list_add(&b->list, &c->btree_cache_freeable); + mutex_unlock(&c->btree_cache_lock); + goto out; + } + + bch2_btree_node_read(c, b, false); +out: + six_unlock_write(&b->lock); + six_unlock_intent(&b->lock); +} + int bch2_print_btree_node(struct bch_fs *c, struct btree *b, char *buf, size_t len) { diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index 23f637ab..ca8e3195 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -22,6 +22,9 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); struct btree *bch2_btree_node_get(struct btree_iter *, const struct bkey_i *, unsigned, enum six_lock_type); +void bch2_btree_node_prefetch(struct btree_iter *, const struct bkey_i *, + unsigned); + void bch2_fs_btree_exit(struct bch_fs *); int bch2_fs_btree_init(struct bch_fs *); diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 88ae3967..99d28f64 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -225,7 +225,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id) btree_node_range_checks_init(&r, depth); - for_each_btree_node(&iter, c, btree_id, POS_MIN, depth, b) { + __for_each_btree_node(&iter, c, btree_id, POS_MIN, + 0, depth, BTREE_ITER_PREFETCH, b) { btree_node_range_checks(c, b, &r); bch2_verify_btree_nr_keys(b); @@ -779,7 +780,8 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) */ memset(merge, 0, sizeof(merge)); - __for_each_btree_node(&iter, c, btree_id, POS_MIN, 0, b, U8_MAX) { + __for_each_btree_node(&iter, c, btree_id, POS_MIN, + U8_MAX, 0, BTREE_ITER_PREFETCH, b) { memmove(merge + 1, merge, sizeof(merge) - sizeof(merge[0])); memmove(lock_seq + 1, lock_seq, @@ -952,7 +954,7 @@ static int bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id) * We have to hit every btree node before starting journal replay, in * order for the journal seq blacklist machinery to work: */ - for_each_btree_node(&iter, c, id, POS_MIN, 0, b) { + for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { btree_node_range_checks(c, b, &r); if (btree_node_has_ptrs(b)) { diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 82dd196d..541fffb6 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1196,6 +1196,8 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, btree_node_reset_sib_u64s(b); out: + clear_btree_node_read_in_flight(b); + wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); mempool_free(iter, &c->fill_iter); return; err: @@ -1206,13 +1208,48 @@ fsck_err: goto out; } -void bch2_btree_node_read(struct bch_fs *c, struct btree *b) +static void btree_node_read_work(struct work_struct *work) +{ + struct btree_read_bio *rb = + container_of(work, 
struct btree_read_bio, work); + + bch2_btree_node_read_done(rb->c, rb->bio.bi_private, + rb->pick.ca, &rb->pick.ptr); + + percpu_ref_put(&rb->pick.ca->io_ref); + bio_put(&rb->bio); +} + +static void btree_node_read_endio(struct bio *bio) +{ + struct btree *b = bio->bi_private; + struct btree_read_bio *rb = + container_of(bio, struct btree_read_bio, bio); + + if (bch2_dev_fatal_io_err_on(bio->bi_error, + rb->pick.ca, "IO error reading bucket %zu", + PTR_BUCKET_NR(rb->pick.ca, &rb->pick.ptr)) || + bch2_meta_read_fault("btree")) { + set_btree_node_read_error(b); + percpu_ref_put(&rb->pick.ca->io_ref); + bio_put(bio); + return; + } + + INIT_WORK(&rb->work, btree_node_read_work); + schedule_work(&rb->work); +} + +void bch2_btree_node_read(struct bch_fs *c, struct btree *b, + bool sync) { uint64_t start_time = local_clock(); - struct bio *bio; struct extent_pick_ptr pick; + struct btree_read_bio *rb; + struct bio *bio; trace_btree_read(c, b); + set_btree_node_read_in_flight(b); pick = bch2_btree_pick_ptr(c, b); if (bch2_fs_fatal_err_on(!pick.ca, c, @@ -1222,27 +1259,36 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b) } bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio); + rb = container_of(bio, struct btree_read_bio, bio); + rb->c = c; + rb->pick = pick; + bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; bio->bi_bdev = pick.ca->disk_sb.bdev; bio->bi_iter.bi_sector = pick.ptr.offset; bio->bi_iter.bi_size = btree_bytes(c); - bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC); bch2_bio_map(bio, b->data); - submit_bio_wait(bio); + if (sync) { + submit_bio_wait(bio); - if (bch2_dev_fatal_io_err_on(bio->bi_error, - pick.ca, "IO error reading bucket %zu", - PTR_BUCKET_NR(pick.ca, &pick.ptr)) || - bch2_meta_read_fault("btree")) { - set_btree_node_read_error(b); - goto out; - } + if (bch2_dev_fatal_io_err_on(bio->bi_error, + pick.ca, "IO error reading bucket %zu", + PTR_BUCKET_NR(pick.ca, &pick.ptr)) || + bch2_meta_read_fault("btree")) { + set_btree_node_read_error(b); + goto out; + } - bch2_btree_node_read_done(c, b, pick.ca, &pick.ptr); - bch2_time_stats_update(&c->btree_read_time, start_time); + bch2_btree_node_read_done(c, b, pick.ca, &pick.ptr); + bch2_time_stats_update(&c->btree_read_time, start_time); out: - bio_put(bio); - percpu_ref_put(&pick.ca->io_ref); + bio_put(bio); + percpu_ref_put(&pick.ca->io_ref); + } else { + bio->bi_end_io = btree_node_read_endio; + bio->bi_private = b; + submit_bio(bio); + } } int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, @@ -1267,7 +1313,7 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, bkey_copy(&b->key, k); BUG_ON(bch2_btree_node_hash_insert(c, b, level, id)); - bch2_btree_node_read(c, b); + bch2_btree_node_read(c, b, true); six_unlock_write(&b->lock); if (btree_node_read_error(b)) { @@ -1557,10 +1603,10 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, wbio->put_bio = true; wbio->order = order; wbio->used_mempool = used_mempool; + bio->bi_opf = REQ_OP_WRITE|REQ_META|REQ_FUA; bio->bi_iter.bi_size = sectors_to_write << 9; bio->bi_end_io = btree_node_write_endio; bio->bi_private = b; - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META|WRITE_SYNC|REQ_FUA); if (parent) closure_get(parent); diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index d023dfae..7333f305 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -1,11 +1,20 @@ #ifndef _BCACHE_BTREE_IO_H #define _BCACHE_BTREE_IO_H +#include "extents.h" + struct bch_fs; struct btree_write; struct btree; struct 
btree_iter; +struct btree_read_bio { + struct bch_fs *c; + struct extent_pick_ptr pick; + struct work_struct work; + struct bio bio; +}; + static inline void btree_node_io_unlock(struct btree *b) { EBUG_ON(!btree_node_write_in_flight(b)); @@ -64,7 +73,7 @@ void bch2_btree_init_next(struct bch_fs *, struct btree *, void bch2_btree_node_read_done(struct bch_fs *, struct btree *, struct bch_dev *, const struct bch_extent_ptr *); -void bch2_btree_node_read(struct bch_fs *, struct btree *); +void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); int bch2_btree_root_read(struct bch_fs *, enum btree_id, const struct bkey_i *, unsigned); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 0b28082e..e5da186b 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -161,8 +161,9 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, */ if (type == SIX_LOCK_intent && linked->nodes_locked != linked->nodes_intent_locked) { - linked->locks_want = max(linked->locks_want, - iter->locks_want); + linked->locks_want = max_t(unsigned, + linked->locks_want, + iter->locks_want); return false; } @@ -177,8 +178,9 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, */ if (linked->btree_id == iter->btree_id && level > __fls(linked->nodes_locked)) { - linked->locks_want = max(linked->locks_want, - iter->locks_want); + linked->locks_want = max_t(unsigned, + linked->locks_want, + iter->locks_want); return false; } } @@ -247,12 +249,10 @@ fail: static int __bch2_btree_iter_unlock(struct btree_iter *iter) { - BUG_ON(iter->error == -EINTR); - while (iter->nodes_locked) btree_node_unlock(iter, __ffs(iter->nodes_locked)); - return iter->error; + return iter->flags & BTREE_ITER_ERROR ? -EIO : 0; } int bch2_btree_iter_unlock(struct btree_iter *iter) @@ -285,7 +285,7 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, ? 
bch2_btree_node_iter_prev(&tmp, b) : bch2_btree_node_iter_prev_all(&tmp, b); if (k && btree_iter_pos_cmp_packed(b, &iter->pos, k, - iter->is_extents)) { + iter->flags & BTREE_ITER_IS_EXTENTS)) { char buf[100]; struct bkey uk = bkey_unpack_key(b, k); @@ -296,7 +296,7 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, k = bch2_btree_node_iter_peek_all(node_iter, b); if (k && !btree_iter_pos_cmp_packed(b, &iter->pos, k, - iter->is_extents)) { + iter->flags & BTREE_ITER_IS_EXTENTS)) { char buf[100]; struct bkey uk = bkey_unpack_key(b, k); @@ -340,7 +340,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, /* didn't find the bset in the iterator - might have to readd it: */ if (new_u64s && btree_iter_pos_cmp_packed(b, &iter->pos, where, - iter->is_extents)) + iter->flags & BTREE_ITER_IS_EXTENTS)) bch2_btree_node_iter_push(node_iter, b, where, end); return; found: @@ -352,7 +352,7 @@ found: if (new_u64s && btree_iter_pos_cmp_packed(b, &iter->pos, where, - iter->is_extents)) { + iter->flags & BTREE_ITER_IS_EXTENTS)) { set->k = offset; bch2_btree_node_iter_sort(node_iter, b); } else if (set->k < offset + clobber_u64s) { @@ -388,7 +388,7 @@ found: */ if (b->level && new_u64s && !bkey_deleted(where) && btree_iter_pos_cmp_packed(b, &iter->pos, where, - iter->is_extents)) { + iter->flags & BTREE_ITER_IS_EXTENTS)) { struct bset_tree *t; struct bkey_packed *k; @@ -535,9 +535,9 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) static inline void __btree_iter_init(struct btree_iter *iter, struct btree *b) { - bch2_btree_node_iter_init(&iter->node_iters[b->level], b, - iter->pos, iter->is_extents, - btree_node_is_extents(b)); + bch2_btree_node_iter_init(&iter->node_iters[b->level], b, iter->pos, + iter->flags & BTREE_ITER_IS_EXTENTS, + btree_node_is_extents(b)); /* Skip to first non whiteout: */ if (b->level) @@ -549,7 +549,8 @@ static inline bool btree_iter_pos_in_node(struct btree_iter *iter, { return iter->btree_id == b->btree_id && bkey_cmp(iter->pos, b->data->min_key) >= 0 && - btree_iter_pos_cmp(iter->pos, &b->key.k, iter->is_extents); + btree_iter_pos_cmp(iter->pos, &b->key.k, + iter->flags & BTREE_ITER_IS_EXTENTS); } static inline void btree_iter_node_set(struct btree_iter *iter, @@ -695,6 +696,26 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, } } +noinline +static void btree_iter_prefetch(struct btree_iter *iter) +{ + struct btree *b = iter->nodes[iter->level + 1]; + struct btree_node_iter node_iter = iter->node_iters[iter->level + 1]; + struct bkey_packed *k; + BKEY_PADDED(k) tmp; + unsigned nr = iter->level ? 
1 : 8; + + while (nr) { + bch2_btree_node_iter_advance(&node_iter, b); + k = bch2_btree_node_iter_peek(&node_iter, b); + if (!k) + break; + + bch2_bkey_unpack(b, &tmp.k, k); + bch2_btree_node_prefetch(iter, &tmp.k, iter->level); + } +} + static inline int btree_iter_down(struct btree_iter *iter) { struct btree *b; @@ -712,6 +733,10 @@ static inline int btree_iter_down(struct btree_iter *iter) iter->level = level; mark_btree_node_locked(iter, level, lock_type); btree_iter_node_set(iter, b); + + if (iter->flags & BTREE_ITER_PREFETCH) + btree_iter_prefetch(iter); + return 0; } @@ -791,7 +816,7 @@ out: io_error: BUG_ON(ret != -EIO); - iter->error = ret; + iter->flags |= BTREE_ITER_ERROR; iter->nodes[iter->level] = NULL; goto out; } @@ -834,7 +859,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) bch2_btree_node_relock(iter, iter->level) && btree_iter_pos_cmp(iter->pos, &iter->nodes[iter->level]->key.k, - iter->is_extents))) + iter->flags & BTREE_ITER_IS_EXTENTS))) btree_iter_up(iter); /* @@ -845,7 +870,8 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) struct bkey_s_c k; while ((k = __btree_iter_peek_all(iter)).k && - !btree_iter_pos_cmp(iter->pos, k.k, iter->is_extents)) + !btree_iter_pos_cmp(iter->pos, k.k, + iter->flags & BTREE_ITER_IS_EXTENTS)) __btree_iter_advance(iter); } @@ -875,7 +901,7 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *iter) if (unlikely(!iter->nodes[iter->level])) return 0; - iter->at_end_of_leaf = false; + iter->flags &= ~BTREE_ITER_AT_END_OF_LEAF; ret = __bch2_btree_iter_traverse(iter); if (unlikely(ret)) @@ -891,7 +917,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) struct btree *b; int ret; - EBUG_ON(iter->is_extents); + EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); ret = bch2_btree_iter_traverse(iter); if (ret) @@ -912,7 +938,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) struct btree *b; int ret; - EBUG_ON(iter->is_extents); + EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); btree_iter_up(iter); @@ -964,12 +990,13 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_ while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && !btree_iter_pos_cmp_packed(b, &new_pos, k, - iter->is_extents)) + iter->flags & BTREE_ITER_IS_EXTENTS)) bch2_btree_node_iter_advance(node_iter, b); if (!k && - !btree_iter_pos_cmp(new_pos, &b->key.k, iter->is_extents)) - iter->at_end_of_leaf = true; + !btree_iter_pos_cmp(new_pos, &b->key.k, + iter->flags & BTREE_ITER_IS_EXTENTS)) + iter->flags |= BTREE_ITER_AT_END_OF_LEAF; iter->pos = new_pos; } @@ -1006,6 +1033,9 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) struct bkey_s_c k; int ret; + EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != + (iter->btree_id == BTREE_ID_EXTENTS)); + while (1) { ret = bch2_btree_iter_traverse(iter); if (unlikely(ret)) { @@ -1019,7 +1049,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) * iter->pos should always be equal to the key we just * returned - except extents can straddle iter->pos: */ - if (!iter->is_extents || + if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); return k; @@ -1043,6 +1073,9 @@ struct bkey_s_c bch2_btree_iter_peek_with_holes(struct btree_iter *iter) struct bkey n; int ret; + EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != + (iter->btree_id == BTREE_ID_EXTENTS)); + while (1) { ret = 
bch2_btree_iter_traverse(iter); if (unlikely(ret)) { @@ -1057,7 +1090,7 @@ recheck: bkey_init(&n); n.p = iter->pos; - if (iter->is_extents) { + if (iter->flags & BTREE_ITER_IS_EXTENTS) { if (n.p.offset == KEY_OFFSET_MAX) { iter->pos = bkey_successor(iter->pos); goto recheck; @@ -1087,21 +1120,18 @@ recheck: } void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c, - enum btree_id btree_id, struct bpos pos, - unsigned locks_want, unsigned depth) + enum btree_id btree_id, struct bpos pos, + unsigned locks_want, unsigned depth, + unsigned flags) { - iter->level = depth; - /* bch2_bkey_ops isn't used much, this would be a cache miss */ - /* iter->is_extents = bch2_bkey_ops[btree_id]->is_extents; */ - iter->is_extents = btree_id == BTREE_ID_EXTENTS; - iter->nodes_locked = 0; - iter->nodes_intent_locked = 0; - iter->locks_want = min(locks_want, BTREE_MAX_DEPTH); - iter->btree_id = btree_id; - iter->at_end_of_leaf = 0; - iter->error = 0; iter->c = c; iter->pos = pos; + iter->flags = flags; + iter->btree_id = btree_id; + iter->level = depth; + iter->locks_want = min(locks_want, BTREE_MAX_DEPTH); + iter->nodes_locked = 0; + iter->nodes_intent_locked = 0; memset(iter->nodes, 0, sizeof(iter->nodes)); iter->nodes[iter->level] = BTREE_ITER_NOT_END; iter->next = iter; diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 7cf9bd63..57f38765 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -3,38 +3,39 @@ #include "btree_types.h" + +#define BTREE_ITER_INTENT (1 << 0) +#define BTREE_ITER_WITH_HOLES (1 << 1) +#define BTREE_ITER_PREFETCH (1 << 2) +/* + * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for + * @pos or the first key strictly greater than @pos + */ +#define BTREE_ITER_IS_EXTENTS (1 << 3) +/* + * indicates we need to call bch2_btree_iter_traverse() to revalidate iterator: + */ +#define BTREE_ITER_AT_END_OF_LEAF (1 << 4) +#define BTREE_ITER_ERROR (1 << 5) + +/* + * @pos - iterator's current position + * @level - current btree depth + * @locks_want - btree level below which we start taking intent locks + * @nodes_locked - bitmask indicating which nodes in @nodes are locked + * @nodes_intent_locked - bitmask indicating which locks are intent locks + */ struct btree_iter { - /* Current btree depth */ - u8 level; - - /* - * Used in bch2_btree_iter_traverse(), to indicate whether we're - * searching for @pos or the first key strictly greater than @pos - */ - u8 is_extents; - - /* Bitmasks for read/intent locks held per level */ - u8 nodes_locked; - u8 nodes_intent_locked; - - /* Btree level below which we start taking intent locks */ - u8 locks_want; - - enum btree_id btree_id:8; - - /* - * indicates we need to call bch2_btree_iter_traverse() to revalidate - * iterator: - */ - u8 at_end_of_leaf; - - s8 error; - - struct bch_fs *c; - - /* Current position of the iterator */ + struct bch_fs *c; struct bpos pos; + u8 flags; + enum btree_id btree_id:8; + unsigned level:4, + locks_want:4, + nodes_locked:4, + nodes_intent_locked:4; + u32 lock_seq[BTREE_MAX_DEPTH]; /* @@ -166,22 +167,17 @@ void bch2_btree_iter_advance_pos(struct btree_iter *); void bch2_btree_iter_rewind(struct btree_iter *, struct bpos); void __bch2_btree_iter_init(struct btree_iter *, struct bch_fs *, - enum btree_id, struct bpos, unsigned , unsigned); + enum btree_id, struct bpos, + unsigned , unsigned, unsigned); static inline void bch2_btree_iter_init(struct btree_iter *iter, - struct bch_fs *c, - enum btree_id btree_id, - struct bpos pos) + struct bch_fs *c, 
enum btree_id btree_id, + struct bpos pos, unsigned flags) { - __bch2_btree_iter_init(iter, c, btree_id, pos, 0, 0); -} - -static inline void bch2_btree_iter_init_intent(struct btree_iter *iter, - struct bch_fs *c, - enum btree_id btree_id, - struct bpos pos) -{ - __bch2_btree_iter_init(iter, c, btree_id, pos, 1, 0); + __bch2_btree_iter_init(iter, c, btree_id, pos, + flags & BTREE_ITER_INTENT ? 1 : 0, 0, + btree_id == BTREE_ID_EXTENTS + ? BTREE_ITER_IS_EXTENTS : 0); } void bch2_btree_iter_link(struct btree_iter *, struct btree_iter *); @@ -216,45 +212,25 @@ static inline int btree_iter_cmp(const struct btree_iter *l, return __btree_iter_cmp(l->btree_id, l->pos, r); } -#define __for_each_btree_node(_iter, _c, _btree_id, _start, _depth, \ - _b, _locks_want) \ - for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), \ - _start, _locks_want, _depth), \ - (_iter)->is_extents = false, \ +#define __for_each_btree_node(_iter, _c, _btree_id, _start, \ + _locks_want, _depth, _flags, _b) \ + for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), _start, \ + _locks_want, _depth, _flags), \ _b = bch2_btree_iter_peek_node(_iter); \ (_b); \ (_b) = bch2_btree_iter_next_node(_iter, _depth)) -#define for_each_btree_node(_iter, _c, _btree_id, _start, _depth, _b) \ - __for_each_btree_node(_iter, _c, _btree_id, _start, _depth, _b, 0) +#define for_each_btree_node(_iter, _c, _btree_id, _start, _flags, _b) \ + __for_each_btree_node(_iter, _c, _btree_id, _start, 0, 0, _flags, _b) -#define __for_each_btree_key(_iter, _c, _btree_id, _start, \ - _k, _locks_want) \ - for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), \ - _start, _locks_want, 0); \ - !IS_ERR_OR_NULL(((_k) = bch2_btree_iter_peek(_iter)).k); \ +#define for_each_btree_key(_iter, _c, _btree_id, _start, _flags, _k) \ + for (bch2_btree_iter_init((_iter), (_c), (_btree_id), \ + (_start), (_flags)); \ + !IS_ERR_OR_NULL(((_k) = (((_flags) & BTREE_ITER_WITH_HOLES)\ + ? bch2_btree_iter_peek_with_holes(_iter)\ + : bch2_btree_iter_peek(_iter))).k); \ bch2_btree_iter_advance_pos(_iter)) -#define for_each_btree_key(_iter, _c, _btree_id, _start, _k) \ - __for_each_btree_key(_iter, _c, _btree_id, _start, _k, 0) - -#define for_each_btree_key_intent(_iter, _c, _btree_id, _start, _k) \ - __for_each_btree_key(_iter, _c, _btree_id, _start, _k, 1) - -#define __for_each_btree_key_with_holes(_iter, _c, _btree_id, \ - _start, _k, _locks_want) \ - for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), \ - _start, _locks_want, 0); \ - !IS_ERR_OR_NULL(((_k) = bch2_btree_iter_peek_with_holes(_iter)).k);\ - bch2_btree_iter_advance_pos(_iter)) - -#define for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k) \ - __for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k, 0) - -#define for_each_btree_key_with_holes_intent(_iter, _c, _btree_id, \ - _start, _k) \ - __for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k, 1) - static inline int btree_iter_err(struct bkey_s_c k) { return IS_ERR(k.k) ? 
PTR_ERR(k.k) : 0; diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index a0f5b579..c613a7bc 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -141,6 +141,7 @@ static inline void clear_btree_node_ ## flag(struct btree *b) \ { clear_bit(BTREE_NODE_ ## flag, &b->flags); } enum btree_flags { + BTREE_NODE_read_in_flight, BTREE_NODE_read_error, BTREE_NODE_write_error, BTREE_NODE_dirty, @@ -152,6 +153,7 @@ enum btree_flags { BTREE_NODE_just_written, }; +BTREE_FLAG(read_in_flight); BTREE_FLAG(read_error); BTREE_FLAG(write_error); BTREE_FLAG(dirty); diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c index cfd2a455..2f67c092 100644 --- a/libbcachefs/btree_update.c +++ b/libbcachefs/btree_update.c @@ -2047,7 +2047,7 @@ unlock: * traversed again */ trans_for_each_entry(trans, i) - if (i->iter->at_end_of_leaf) + if (i->iter->flags & BTREE_ITER_AT_END_OF_LEAF) goto out; trans_for_each_entry(trans, i) @@ -2161,7 +2161,8 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct btree_iter iter; int ret, ret2; - bch2_btree_iter_init_intent(&iter, c, id, bkey_start_pos(&k->k)); + bch2_btree_iter_init(&iter, c, id, bkey_start_pos(&k->k), + BTREE_ITER_INTENT); ret = bch2_btree_iter_traverse(&iter); if (unlikely(ret)) @@ -2187,7 +2188,8 @@ int bch2_btree_update(struct bch_fs *c, enum btree_id id, EBUG_ON(id == BTREE_ID_EXTENTS); - bch2_btree_iter_init_intent(&iter, c, id, k->k.p); + bch2_btree_iter_init(&iter, c, id, k->k.p, + BTREE_ITER_INTENT); u = bch2_btree_iter_peek_with_holes(&iter); ret = btree_iter_err(u); @@ -2222,7 +2224,8 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, struct bkey_s_c k; int ret = 0; - bch2_btree_iter_init_intent(&iter, c, id, start); + bch2_btree_iter_init(&iter, c, id, start, + BTREE_ITER_INTENT); while ((k = bch2_btree_iter_peek(&iter)).k && !(ret = btree_iter_err(k))) { @@ -2248,7 +2251,7 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, delete.k.p = iter.pos; delete.k.version = version; - if (iter.is_extents) { + if (iter.flags & BTREE_ITER_IS_EXTENTS) { /* * The extents btree is special - KEY_TYPE_DISCARD is * used for deletions, not KEY_TYPE_DELETED. This is an diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 184a29f9..1c2f6921 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -317,7 +317,6 @@ void bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g) new.data_type = 0; new.cached_sectors = 0; new.dirty_sectors = 0; - new.copygc = 0; new.gen++; })); diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 3b82d7f3..f99a62bc 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -95,33 +95,6 @@ static inline u8 ptr_stale(const struct bch_dev *ca, return gen_after(PTR_BUCKET(ca, ptr)->mark.gen, ptr->gen); } -/* bucket heaps */ - -static inline bool bucket_min_cmp(struct bucket_heap_entry l, - struct bucket_heap_entry r) -{ - return l.val < r.val; -} - -static inline bool bucket_max_cmp(struct bucket_heap_entry l, - struct bucket_heap_entry r) -{ - return l.val > r.val; -} - -static inline void bucket_heap_push(struct bch_dev *ca, struct bucket *g, - unsigned long val) -{ - struct bucket_heap_entry new = { g, val }; - - if (!heap_full(&ca->heap)) - heap_add(&ca->heap, new, bucket_min_cmp); - else if (bucket_min_cmp(new, heap_peek(&ca->heap))) { - ca->heap.data[0] = new; - heap_sift(&ca->heap, 0, bucket_min_cmp); - } -} - /* bucket gc marks */ /* The dirty and cached sector counts saturate. 
If this occurs, @@ -129,14 +102,16 @@ static inline void bucket_heap_push(struct bch_dev *ca, struct bucket *g, * GC must be performed. */ #define GC_MAX_SECTORS_USED ((1U << 15) - 1) -static inline bool bucket_unused(struct bucket *g) +static inline unsigned bucket_sectors_used(struct bucket_mark mark) { - return !g->mark.counter; + return mark.dirty_sectors + mark.cached_sectors; } -static inline unsigned bucket_sectors_used(struct bucket *g) +static inline bool bucket_unused(struct bucket_mark mark) { - return g->mark.dirty_sectors + g->mark.cached_sectors; + return !mark.owned_by_allocator && + !mark.data_type && + !bucket_sectors_used(mark); } /* Per device stats: */ diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index ca187099..18bf1713 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -1,6 +1,8 @@ #ifndef _BUCKETS_TYPES_H #define _BUCKETS_TYPES_H +#include "util.h" + enum bucket_data_type { BUCKET_DATA = 0, BUCKET_BTREE, @@ -18,9 +20,6 @@ struct bucket_mark { struct { u8 gen; - /* generation copygc is going to move this bucket into */ - unsigned copygc:1; - unsigned journal_seq_valid:1; /* @@ -96,10 +95,12 @@ struct bch_fs_usage { }; struct bucket_heap_entry { - struct bucket *g; - unsigned long val; + size_t bucket; + struct bucket_mark mark; }; +typedef HEAP(struct bucket_heap_entry) bucket_heap; + /* * A reservation for space on disk: */ diff --git a/libbcachefs/clock.c b/libbcachefs/clock.c index 68ac62b4..650be8ce 100644 --- a/libbcachefs/clock.c +++ b/libbcachefs/clock.c @@ -5,9 +5,11 @@ #include #include -static inline bool io_timer_cmp(struct io_timer *l, struct io_timer *r) +static inline long io_timer_cmp(io_timer_heap *h, + struct io_timer *l, + struct io_timer *r) { - return time_after(l->expire, r->expire); + return l->expire - r->expire; } void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) diff --git a/libbcachefs/clock_types.h b/libbcachefs/clock_types.h index 4a02f467..ae068c6d 100644 --- a/libbcachefs/clock_types.h +++ b/libbcachefs/clock_types.h @@ -22,12 +22,14 @@ struct io_timer { /* Amount to buffer up on a percpu counter */ #define IO_CLOCK_PCPU_SECTORS 128 +typedef HEAP(struct io_timer *) io_timer_heap; + struct io_clock { atomic_long_t now; u16 __percpu *pcpu_buf; spinlock_t timer_lock; - DECLARE_HEAP(struct io_timer *, timers); + io_timer_heap timers; }; #endif /* _BCACHE_CLOCK_TYPES_H */ diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index bf160e0b..d4c8ce55 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -60,9 +60,9 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio); bio->bi_bdev = pick.ca->disk_sb.bdev; + bio->bi_opf = REQ_OP_READ|REQ_META; bio->bi_iter.bi_sector = pick.ptr.offset; bio->bi_iter.bi_size = btree_bytes(c); - bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC); bch2_bio_map(bio, n_sorted); submit_bio_wait(bio); @@ -212,7 +212,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, if (!i->size) return i->ret; - bch2_btree_iter_init(&iter, i->c, i->id, i->from); + bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH); while ((k = bch2_btree_iter_peek(&iter)).k && !(err = btree_iter_err(k))) { @@ -314,7 +314,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, if (!i->size) return i->ret; - bch2_btree_iter_init(&iter, i->c, i->id, i->from); + bch2_btree_iter_init(&iter, i->c, i->id, i->from, 
BTREE_ITER_PREFETCH); while ((k = bch2_btree_iter_peek(&iter)).k && !(err = btree_iter_err(k))) { diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index e2978bab..056715bc 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -214,11 +214,13 @@ int bch2_dirent_rename(struct bch_fs *c, bool need_whiteout; int ret = -ENOMEM; - bch2_btree_iter_init_intent(&src_iter, c, BTREE_ID_DIRENTS, src_pos); - bch2_btree_iter_init_intent(&dst_iter, c, BTREE_ID_DIRENTS, dst_pos); + bch2_btree_iter_init(&src_iter, c, BTREE_ID_DIRENTS, src_pos, + BTREE_ITER_INTENT); + bch2_btree_iter_init(&dst_iter, c, BTREE_ID_DIRENTS, dst_pos, + BTREE_ITER_INTENT); bch2_btree_iter_link(&src_iter, &dst_iter); - bch2_btree_iter_init(&whiteout_iter, c, BTREE_ID_DIRENTS, src_pos); + bch2_btree_iter_init(&whiteout_iter, c, BTREE_ID_DIRENTS, src_pos, 0); bch2_btree_iter_link(&src_iter, &whiteout_iter); if (mode == BCH_RENAME_EXCHANGE) { @@ -376,7 +378,7 @@ int bch2_empty_dir(struct bch_fs *c, u64 dir_inum) struct bkey_s_c k; int ret = 0; - for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(dir_inum, 0), k) { + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(dir_inum, 0), 0, k) { if (k.k->p.inode > dir_inum) break; @@ -405,7 +407,7 @@ int bch2_readdir(struct bch_fs *c, struct file *file, pr_debug("listing for %lu from %llu", inode->i_ino, ctx->pos); for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, - POS(inode->i_ino, ctx->pos), k) { + POS(inode->i_ino, ctx->pos), 0, k) { if (k.k->type != BCH_DIRENT) continue; diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index c80da362..219b60a3 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -41,13 +41,13 @@ static void sort_key_next(struct btree_node_iter *iter, * Necessary for btree_sort_fixup() - if there are multiple keys that compare * equal in different sets, we have to process them newest to oldest. */ -#define key_sort_cmp(l, r) \ +#define key_sort_cmp(h, l, r) \ ({ \ - int _c = bkey_cmp_packed(b, \ - __btree_node_offset_to_key(b, (l).k), \ - __btree_node_offset_to_key(b, (r).k)); \ + bkey_cmp_packed(b, \ + __btree_node_offset_to_key(b, (l).k), \ + __btree_node_offset_to_key(b, (r).k)) \ \ - _c ? _c > 0 : (l).k > (r).k; \ + ?: (l).k - (r).k; \ }) static inline bool should_drop_next_key(struct btree_node_iter *iter, @@ -63,7 +63,7 @@ static inline bool should_drop_next_key(struct btree_node_iter *iter, return false; if (iter->used > 2 && - key_sort_cmp(r[0], r[1])) + key_sort_cmp(iter, r[0], r[1]) >= 0) r++; /* @@ -98,7 +98,7 @@ struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst, } sort_key_next(iter, b, iter->data); - heap_sift(iter, 0, key_sort_cmp); + heap_sift_down(iter, 0, key_sort_cmp); } dst->u64s = cpu_to_le16((u64 *) out - dst->_data); @@ -754,27 +754,26 @@ static void extent_save(struct btree *b, struct btree_node_iter *iter, } /* - * Returns true if l > r - unless l == r, in which case returns true if l is - * older than r. + * If keys compare equal, compare by pointer order: * * Necessary for sort_fix_overlapping() - if there are multiple keys that * compare equal in different sets, we have to process them newest to oldest. */ -#define extent_sort_cmp(l, r) \ +#define extent_sort_cmp(h, l, r) \ ({ \ struct bkey _ul = bkey_unpack_key(b, \ __btree_node_offset_to_key(b, (l).k)); \ struct bkey _ur = bkey_unpack_key(b, \ __btree_node_offset_to_key(b, (r).k)); \ \ - int _c = bkey_cmp(bkey_start_pos(&_ul), bkey_start_pos(&_ur)); \ - _c ? 
_c > 0 : (l).k < (r).k; \ + bkey_cmp(bkey_start_pos(&_ul), \ + bkey_start_pos(&_ur)) ?: (r).k - (l).k; \ }) static inline void extent_sort_sift(struct btree_node_iter *iter, struct btree *b, size_t i) { - heap_sift(iter, i, extent_sort_cmp); + heap_sift_down(iter, i, extent_sort_cmp); } static inline void extent_sort_next(struct btree_node_iter *iter, @@ -782,7 +781,7 @@ static inline void extent_sort_next(struct btree_node_iter *iter, struct btree_node_iter_set *i) { sort_key_next(iter, b, i); - heap_sift(iter, i - iter->data, extent_sort_cmp); + heap_sift_down(iter, i - iter->data, extent_sort_cmp); } static void extent_sort_append(struct bch_fs *c, @@ -843,7 +842,7 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, _r = iter->data + 1; if (iter->used > 2 && - extent_sort_cmp(_r[0], _r[1])) + extent_sort_cmp(iter, _r[0], _r[1]) >= 0) _r++; rk = __btree_node_offset_to_key(b, _r->k); @@ -1433,11 +1432,12 @@ stop: gc_pos_btree_node(b)); EBUG_ON(bkey_cmp(iter->pos, s->committed)); - EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != iter->at_end_of_leaf); + EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != + !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF)); bch2_cut_front(iter->pos, insert); - if (insert->k.size && iter->at_end_of_leaf) + if (insert->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF)) ret = BTREE_INSERT_NEED_TRAVERSE; EBUG_ON(insert->k.size && ret == BTREE_INSERT_OK); @@ -1596,9 +1596,10 @@ stop: EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); EBUG_ON(bkey_cmp(iter->pos, s.committed)); - EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != iter->at_end_of_leaf); + EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != + !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF)); - if (insert->k->k.size && iter->at_end_of_leaf) + if (insert->k->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF)) ret = BTREE_INSERT_NEED_TRAVERSE; EBUG_ON(insert->k->k.size && ret == BTREE_INSERT_OK); diff --git a/libbcachefs/eytzinger.h b/libbcachefs/eytzinger.h index 13d54e5e..dc23e44d 100644 --- a/libbcachefs/eytzinger.h +++ b/libbcachefs/eytzinger.h @@ -9,160 +9,162 @@ /* * Traversal for trees in eytzinger layout - a full binary tree layed out in an * array - * - * We used one based indexing, not zero based: with one based indexing, each - * level of the tree starts at a power of two - leading to better alignment - - * and it's what you want for implementing next/prev and to/from inorder. - * - * To/from inorder also uses 1 based indexing. 
- * - * Size parameter is treated as if we were using 0 based indexing, however: - * valid nodes, and inorder indices, are in the range [1..size) */ -static inline unsigned eytzinger_child(unsigned j, unsigned child) +/* + * One based indexing version: + * + * With one based indexing each level of the tree starts at a power of two - + * good for cacheline alignment: + * + * Size parameter is treated as if we were using 0 based indexing, however: + * valid nodes, and inorder indices, are in the range [1..size) - that is, there + * are actually size - 1 elements + */ + +static inline unsigned eytzinger1_child(unsigned i, unsigned child) { EBUG_ON(child > 1); - return (j << 1) + child; + return (i << 1) + child; } -static inline unsigned eytzinger_left_child(unsigned j) +static inline unsigned eytzinger1_left_child(unsigned i) { - return eytzinger_child(j, 0); + return eytzinger1_child(i, 0); } -static inline unsigned eytzinger_right_child(unsigned j) +static inline unsigned eytzinger1_right_child(unsigned i) { - return eytzinger_child(j, 1); + return eytzinger1_child(i, 1); } -static inline unsigned eytzinger_first(unsigned size) +static inline unsigned eytzinger1_first(unsigned size) { return rounddown_pow_of_two(size - 1); } -static inline unsigned eytzinger_last(unsigned size) +static inline unsigned eytzinger1_last(unsigned size) { return rounddown_pow_of_two(size) - 1; } /* - * eytzinger_next() and eytzinger_prev() have the nice properties that + * eytzinger1_next() and eytzinger1_prev() have the nice properties that * - * eytzinger_next(0) == eytzinger_first()) - * eytzinger_prev(0) == eytzinger_last()) + * eytzinger1_next(0) == eytzinger1_first()) + * eytzinger1_prev(0) == eytzinger1_last()) * - * eytzinger_prev(eytzinger_first()) == 0 - * eytzinger_next(eytzinger_last()) == 0 + * eytzinger1_prev(eytzinger1_first()) == 0 + * eytzinger1_next(eytzinger1_last()) == 0 */ -static inline unsigned eytzinger_next(unsigned j, unsigned size) +static inline unsigned eytzinger1_next(unsigned i, unsigned size) { - EBUG_ON(j >= size); + EBUG_ON(i >= size); - if (eytzinger_right_child(j) < size) { - j = eytzinger_right_child(j); + if (eytzinger1_right_child(i) < size) { + i = eytzinger1_right_child(i); - j <<= __fls(size) - __fls(j); - j >>= j >= size; + i <<= __fls(size) - __fls(i); + i >>= i >= size; } else { - j >>= ffz(j) + 1; + i >>= ffz(i) + 1; } - return j; + return i; } -static inline unsigned eytzinger_prev(unsigned j, unsigned size) +static inline unsigned eytzinger1_prev(unsigned i, unsigned size) { - EBUG_ON(j >= size); + EBUG_ON(i >= size); - if (eytzinger_left_child(j) < size) { - j = eytzinger_left_child(j); + if (eytzinger1_left_child(i) < size) { + i = eytzinger1_left_child(i); - j <<= __fls(size) - __fls(j); - j -= 1; - j >>= j >= size; + i <<= __fls(size) - __fls(i); + i -= 1; + i >>= i >= size; } else { - j >>= __ffs(j) + 1; + i >>= __ffs(i) + 1; } - return j; + return i; } -static inline unsigned eytzinger_extra(unsigned size) +static inline unsigned eytzinger1_extra(unsigned size) { return (size - rounddown_pow_of_two(size - 1)) << 1; } -static inline unsigned __eytzinger_to_inorder(unsigned j, unsigned size, +static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, unsigned extra) { - unsigned b = __fls(j); + unsigned b = __fls(i); unsigned shift = __fls(size - 1) - b; int s; - EBUG_ON(!j || j >= size); + EBUG_ON(!i || i >= size); - j ^= 1U << b; - j <<= 1; - j |= 1; - j <<= shift; + i ^= 1U << b; + i <<= 1; + i |= 1; + i <<= shift; /* * sign bit trick: * - * 
if (j > extra) - * j -= (j - extra) >> 1; + * if (i > extra) + * i -= (i - extra) >> 1; */ - s = extra - j; - j += (s >> 1) & (s >> 31); + s = extra - i; + i += (s >> 1) & (s >> 31); - return j; + return i; } -static inline unsigned __inorder_to_eytzinger(unsigned j, unsigned size, - unsigned extra) +static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, + unsigned extra) { unsigned shift; int s; - EBUG_ON(!j || j >= size); + EBUG_ON(!i || i >= size); /* * sign bit trick: * - * if (j > extra) - * j += j - extra; + * if (i > extra) + * i += i - extra; */ - s = extra - j; - j -= s & (s >> 31); + s = extra - i; + i -= s & (s >> 31); - shift = __ffs(j); + shift = __ffs(i); - j >>= shift + 1; - j |= 1U << (__fls(size - 1) - shift); + i >>= shift + 1; + i |= 1U << (__fls(size - 1) - shift); - return j; + return i; } -static inline unsigned eytzinger_to_inorder(unsigned j, unsigned size) +static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size) { - return __eytzinger_to_inorder(j, size, eytzinger_extra(size)); + return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size)); } -static inline unsigned inorder_to_eytzinger(unsigned j, unsigned size) +static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) { - return __inorder_to_eytzinger(j, size, eytzinger_extra(size)); + return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size)); } -#define eytzinger_for_each(_i, _size) \ - for ((_i) = eytzinger_first((_size)); \ +#define eytzinger1_for_each(_i, _size) \ + for ((_i) = eytzinger1_first((_size)); \ (_i) != 0; \ - (_i) = eytzinger_next((_i), (_size))) + (_i) = eytzinger1_next((_i), (_size))) #if 0 -void eytzinger_test(void) +void eytzinger0_test(void) { unsigned i, j, size; @@ -172,20 +174,20 @@ void eytzinger_test(void) if (!(size % 4096)) printk(KERN_INFO "tree size %u\n", size); - assert(eytzinger_prev(0, size) == eytzinger_last(size)); - assert(eytzinger_next(0, size) == eytzinger_first(size)); + assert(eytzinger1_prev(0, size) == eytzinger1_last(size)); + assert(eytzinger1_next(0, size) == eytzinger1_first(size)); - assert(eytzinger_prev(eytzinger_first(size), size) == 0); - assert(eytzinger_next(eytzinger_last(size), size) == 0); + assert(eytzinger1_prev(eytzinger1_first(size), size) == 0); + assert(eytzinger1_next(eytzinger1_last(size), size) == 0); - eytzinger_for_each(j, size) { + eytzinger1_for_each(j, size) { assert(from_inorder(i, size) == j); assert(to_inorder(j, size) == i); - if (j != eytzinger_last(size)) { - unsigned next = eytzinger_next(j, size); + if (j != eytzinger1_last(size)) { + unsigned next = eytzinger1_next(j, size); - assert(eytzinger_prev(next, size) == j); + assert(eytzinger1_prev(next, size) == j); } } } @@ -193,4 +195,96 @@ void eytzinger_test(void) } #endif +/* Zero based indexing version: */ + +static inline unsigned eytzinger0_child(unsigned i, unsigned child) +{ + EBUG_ON(child > 1); + + return (i << 1) + 1 + child; +} + +static inline unsigned eytzinger0_left_child(unsigned i) +{ + return eytzinger0_child(i, 0); +} + +static inline unsigned eytzinger0_right_child(unsigned i) +{ + return eytzinger0_child(i, 1); +} + +#if 0 +static inline unsigned eytzinger0_first(unsigned size) +{ +} + +static inline unsigned eytzinger0_last(unsigned size) +{ +} + +static inline unsigned eytzinger0_next(unsigned i, unsigned size) +{ +} + +static inline unsigned eytzinger0_prev(unsigned i, unsigned size) +{ +} +#endif + +static inline unsigned eytzinger0_extra(unsigned size) +{ + return (size + 1 - rounddown_pow_of_two(size)) 
<< 1; +} + +static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, + unsigned extra) +{ + return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1; +} + +static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size, + unsigned extra) +{ + return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1; +} + +static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size) +{ + return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size)); +} + +static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) +{ + return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size)); +} + +#define eytzinger0_find(base, _nr, _size, _cmp, _search) \ +({ \ + void *_base = base; \ + size_t _i = 0; \ + int _res; \ + \ + while (_i < (_nr) && \ + (_res = _cmp(_search, _base + _i * (_size), _size))) \ + _i = eytzinger0_child(_i, _res > 0); \ + \ + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { \ + bool found1 = _i < _nr, found2 = false; \ + unsigned _j; \ + \ + for (_j = 0; _j < _nr; _j++) \ + if (!_cmp(_base + _j * (_size), _search, _size))\ + found2 = true; \ + \ + BUG_ON(found1 != found2); \ + } \ + \ + _i; \ +}) + +void eytzinger0_sort(void *, size_t, size_t, + int (*cmp_func)(const void *, const void *, size_t), + void (*swap_func)(void *, void *, size_t)); + #endif /* _EYTZINGER_H */ diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index dc5c7f4c..4a680ade 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -282,10 +282,12 @@ static int bchfs_write_index_update(struct bch_write_op *wop) BUG_ON(k->k.p.inode != op->ei->vfs_inode.i_ino); - bch2_btree_iter_init_intent(&extent_iter, wop->c, BTREE_ID_EXTENTS, - bkey_start_pos(&bch2_keylist_front(keys)->k)); - bch2_btree_iter_init_intent(&inode_iter, wop->c, BTREE_ID_INODES, - POS(extent_iter.pos.inode, 0)); + bch2_btree_iter_init(&extent_iter, wop->c, BTREE_ID_EXTENTS, + bkey_start_pos(&bch2_keylist_front(keys)->k), + BTREE_ITER_INTENT); + bch2_btree_iter_init(&inode_iter, wop->c, BTREE_ID_INODES, + POS(extent_iter.pos.inode, 0), + BTREE_ITER_INTENT); hook.op = op; hook.hook.fn = bchfs_extent_update_hook; @@ -786,7 +788,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping, .mapping = mapping, .nr_pages = nr_pages }; - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0); INIT_LIST_HEAD(&readpages_iter.pages); list_add(&readpages_iter.pages, pages); @@ -841,7 +843,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC); bio_add_page_contig(&rbio->bio, page); - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0); bchfs_read(c, &iter, rbio, inode, NULL); } @@ -1036,7 +1038,7 @@ do_io: w->io->op.new_i_size = i_size; if (wbc->sync_mode == WB_SYNC_ALL) - w->io->bio.bio.bi_opf |= WRITE_SYNC; + w->io->bio.bio.bi_opf |= REQ_SYNC; /* Before unlocking the page, transfer reservation to w->io: */ old = page_state_cmpxchg(page_state(page), new, { @@ -1448,7 +1450,7 @@ start: bio->bi_iter.bi_sector = offset >> 9; bio->bi_private = dio; - ret = bio_get_user_pages(bio, iter, 1); + ret = bio_iov_iter_get_pages(bio, iter); if (ret < 0) { /* XXX: fault inject this path */ bio->bi_error = ret; @@ -1537,7 +1539,7 @@ static void bch2_do_direct_IO_write(struct dio_write *dio) bio->bi_iter.bi_sector = (dio->offset + dio->written) >> 9; - ret = bio_get_user_pages(bio, &dio->iter, 0); + ret = 
bio_iov_iter_get_pages(bio, &dio->iter); if (ret < 0) { /* * these didn't get initialized, but bch2_dio_write_done() will @@ -1908,7 +1910,7 @@ static int __bch2_truncate_page(struct address_space *mapping, */ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(inode->i_ino, - index << (PAGE_SHIFT - 9)), k) { + index << (PAGE_SHIFT - 9)), 0, k) { if (bkey_cmp(bkey_start_pos(k.k), POS(inode->i_ino, (index + 1) << (PAGE_SHIFT - 9))) >= 0) @@ -2122,10 +2124,11 @@ static long bch2_fcollapse(struct inode *inode, loff_t offset, loff_t len) if ((offset | len) & (PAGE_SIZE - 1)) return -EINVAL; - bch2_btree_iter_init_intent(&dst, c, BTREE_ID_EXTENTS, - POS(inode->i_ino, offset >> 9)); + bch2_btree_iter_init(&dst, c, BTREE_ID_EXTENTS, + POS(inode->i_ino, offset >> 9), + BTREE_ITER_INTENT); /* position will be set from dst iter's position: */ - bch2_btree_iter_init(&src, c, BTREE_ID_EXTENTS, POS_MIN); + bch2_btree_iter_init(&src, c, BTREE_ID_EXTENTS, POS_MIN, 0); bch2_btree_iter_link(&src, &dst); /* @@ -2249,7 +2252,8 @@ static long bch2_fallocate(struct inode *inode, int mode, unsigned replicas = READ_ONCE(c->opts.data_replicas); int ret; - bch2_btree_iter_init_intent(&iter, c, BTREE_ID_EXTENTS, POS_MIN); + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, + BTREE_ITER_INTENT); inode_lock(inode); inode_dio_wait(inode); @@ -2459,7 +2463,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) return -ENXIO; for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, - POS(inode->i_ino, offset >> 9), k) { + POS(inode->i_ino, offset >> 9), 0, k) { if (k.k->p.inode != inode->i_ino) { break; } else if (bkey_extent_is_data(k.k)) { @@ -2527,8 +2531,9 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) if (offset >= isize) return -ENXIO; - for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS, - POS(inode->i_ino, offset >> 9), k) { + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + POS(inode->i_ino, offset >> 9), + BTREE_ITER_WITH_HOLES, k) { if (k.k->p.inode != inode->i_ino) { next_hole = bch2_next_pagecache_hole(inode, offset, MAX_LFS_FILESIZE); diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 3c02b0c6..201cdfcb 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -81,7 +81,8 @@ int __must_check __bch2_write_inode(struct bch_fs *c, lockdep_assert_held(&ei->update_lock); - bch2_btree_iter_init_intent(&iter, c, BTREE_ID_INODES, POS(inum, 0)); + bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inum, 0), + BTREE_ITER_INTENT); do { struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter); @@ -714,7 +715,7 @@ static int bch2_fiemap(struct inode *inode, struct fiemap_extent_info *info, return -EINVAL; for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, - POS(inode->i_ino, start >> 9), k) + POS(inode->i_ino, start >> 9), 0, k) if (bkey_extent_is_data(k.k) || k.k->type == BCH_RESERVATION) { if (bkey_cmp(bkey_start_pos(k.k), @@ -990,7 +991,6 @@ static const struct file_operations bch_dir_file_operations = { }; static const struct inode_operations bch_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = page_get_link, .setattr = bch2_setattr, .listxattr = bch2_xattr_list, diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index e5052070..18d1d533 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -134,8 +134,8 @@ struct hash_check { static void hash_check_init(const struct bch_hash_desc desc, struct hash_check *h, struct bch_fs *c) { - bch2_btree_iter_init(&h->chain, c, desc.btree_id, POS_MIN); - bch2_btree_iter_init(&h->iter, c, desc.btree_id, POS_MIN); + 
bch2_btree_iter_init(&h->chain, c, desc.btree_id, POS_MIN, 0); + bch2_btree_iter_init(&h->iter, c, desc.btree_id, POS_MIN, 0); } static void hash_check_set_inode(struct hash_check *h, struct bch_fs *c, @@ -251,7 +251,7 @@ static int check_extents(struct bch_fs *c) int ret = 0; for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, - POS(BCACHE_ROOT_INO, 0), k) { + POS(BCACHE_ROOT_INO, 0), 0, k) { if (k.k->type == KEY_TYPE_DISCARD) continue; @@ -310,7 +310,7 @@ static int check_dirents(struct bch_fs *c) hash_check_init(bch2_dirent_hash_desc, &h, c); for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, - POS(BCACHE_ROOT_INO, 0), k) { + POS(BCACHE_ROOT_INO, 0), 0, k) { struct bkey_s_c_dirent d; struct bch_inode_unpacked target; bool have_target; @@ -444,7 +444,7 @@ static int check_xattrs(struct bch_fs *c) hash_check_init(bch2_xattr_hash_desc, &h, c); for_each_btree_key(&iter, c, BTREE_ID_XATTRS, - POS(BCACHE_ROOT_INO, 0), k) { + POS(BCACHE_ROOT_INO, 0), 0, k) { ret = walk_inode(c, &w, k.k->p.inode); if (ret) break; @@ -664,7 +664,7 @@ next: goto up; for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, - POS(e->inum, e->offset + 1), k) { + POS(e->inum, e->offset + 1), 0, k) { if (k.k->p.inode != e->inum) break; @@ -712,7 +712,7 @@ up: path.nr--; } - for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, k) { + for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, 0, k) { if (k.k->type != BCH_INODE_FS || !S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->i_mode))) continue; @@ -794,7 +794,7 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, inc_link(c, links, range_start, range_end, BCACHE_ROOT_INO, false); - for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, k) { + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, 0, k) { switch (k.k->type) { case BCH_DIRENT: d = bkey_s_c_to_dirent(k); @@ -825,7 +825,7 @@ s64 bch2_count_inode_sectors(struct bch_fs *c, u64 inum) struct bkey_s_c k; u64 sectors = 0; - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(inum, 0), k) { + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(inum, 0), 0, k) { if (k.k->p.inode != inum) break; @@ -999,7 +999,7 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, int ret = 0, ret2 = 0; u64 nlinks_pos; - bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(range_start, 0)); + bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(range_start, 0), 0); genradix_iter_init(&nlinks_iter); while ((k = bch2_btree_iter_peek(&iter)).k && diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 5b56a628..0a37153d 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -276,7 +276,8 @@ int bch2_inode_create(struct bch_fs *c, struct bkey_i *inode, if (*hint == min) searched_from_start = true; again: - bch2_btree_iter_init_intent(&iter, c, BTREE_ID_INODES, POS(*hint, 0)); + bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(*hint, 0), + BTREE_ITER_INTENT); while (1) { struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter); @@ -376,8 +377,9 @@ int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, struct bkey_s_c k; int ret = -ENOENT; - for_each_btree_key_with_holes(&iter, c, BTREE_ID_INODES, - POS(inode_nr, 0), k) { + for_each_btree_key(&iter, c, BTREE_ID_INODES, + POS(inode_nr, 0), + BTREE_ITER_WITH_HOLES, k) { switch (k.k->type) { case BCH_INODE_FS: ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); @@ -400,7 +402,7 @@ int bch2_cached_dev_inode_find_by_uuid(struct bch_fs *c, uuid_le *uuid, struct btree_iter iter; struct bkey_s_c k; - for_each_btree_key(&iter, c, BTREE_ID_INODES, POS(0, 0), k) { + 
for_each_btree_key(&iter, c, BTREE_ID_INODES, POS(0, 0), 0, k) { if (k.k->p.inode >= BLOCKDEV_INODE_MAX) break; diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 0f27eaf6..d588f6ab 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -182,8 +182,9 @@ static int bch2_write_index_default(struct bch_write_op *op) struct btree_iter iter; int ret; - bch2_btree_iter_init_intent(&iter, op->c, BTREE_ID_EXTENTS, - bkey_start_pos(&bch2_keylist_front(keys)->k)); + bch2_btree_iter_init(&iter, op->c, BTREE_ID_EXTENTS, + bkey_start_pos(&bch2_keylist_front(keys)->k), + BTREE_ITER_INTENT); ret = bch2_btree_insert_list_at(&iter, keys, &op->res, NULL, op_journal_seq(op), @@ -1112,9 +1113,9 @@ void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig, if (promote_op) { struct bio *promote_bio = &promote_op->write.wbio.bio; - bio_init(promote_bio); - promote_bio->bi_max_vecs = pages; - promote_bio->bi_io_vec = promote_bio->bi_inline_vecs; + bio_init(promote_bio, + promote_bio->bi_inline_vecs, + pages); bounce = true; /* could also set read_full */ } @@ -1265,8 +1266,9 @@ static void bch2_read_iter(struct bch_fs *c, struct bch_read_bio *rbio, struct bkey_s_c k; int ret; - for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS, - POS(inode, bvec_iter.bi_sector), k) { + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + POS(inode, bvec_iter.bi_sector), + BTREE_ITER_WITH_HOLES, k) { BKEY_PADDED(k) tmp; struct extent_pick_ptr pick; unsigned bytes, sectors; diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index ca96330c..510066a2 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -163,8 +163,7 @@ static void journal_seq_blacklist_flush(struct journal *j, n = bl->entries[i]; mutex_unlock(&j->blacklist_lock); - bch2_btree_iter_init(&iter, c, n.btree_id, n.pos); - iter.is_extents = false; + __bch2_btree_iter_init(&iter, c, n.btree_id, n.pos, 0, 0, 0); redo_peek: b = bch2_btree_iter_peek_node(&iter); @@ -1921,6 +1920,9 @@ void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) struct journal_entry_pin *pin; u64 pin_seq; + if (!test_bit(JOURNAL_STARTED, &j->flags)) + return; + while ((pin = journal_get_next_pin(j, seq_to_flush, &pin_seq))) pin->flush(j, pin, pin_seq); @@ -2374,9 +2376,9 @@ static void journal_write(struct closure *cl) bio = ca->journal.bio; bio_reset(bio); bio->bi_bdev = ca->disk_sb.bdev; + bio->bi_opf = REQ_OP_FLUSH; bio->bi_end_io = journal_write_endio; bio->bi_private = ca; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); closure_bio_submit(bio, cl); } diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index f79b624d..8680b100 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -97,7 +97,8 @@ int bch2_move_data_off_device(struct bch_dev *ca) atomic_set(&ctxt.error_count, 0); atomic_set(&ctxt.error_flags, 0); - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, + BTREE_ITER_PREFETCH); while (!bch2_move_ctxt_wait(&ctxt) && (k = bch2_btree_iter_peek(&iter)).k && @@ -167,7 +168,7 @@ static int bch2_move_btree_off(struct bch_dev *ca, enum btree_id id) closure_init_stack(&cl); - for_each_btree_node(&iter, c, id, POS_MIN, 0, b) { + for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); retry: if (!bch2_extent_has_device(e, ca->dev_idx)) @@ -197,7 +198,7 @@ retry: return ret; /* btree IO error */ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { - for_each_btree_node(&iter, c, id, POS_MIN, 0, b) { + 
for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); BUG_ON(bch2_extent_has_device(e, ca->dev_idx)); @@ -341,7 +342,8 @@ int bch2_flag_data_bad(struct bch_dev *ca) struct bkey_s_c_extent e; struct btree_iter iter; - bch2_btree_iter_init(&iter, ca->fs, BTREE_ID_EXTENTS, POS_MIN); + bch2_btree_iter_init(&iter, ca->fs, BTREE_ID_EXTENTS, + POS_MIN, BTREE_ITER_PREFETCH); while ((k = bch2_btree_iter_peek(&iter)).k && !(ret = btree_iter_err(k))) { diff --git a/libbcachefs/move.c b/libbcachefs/move.c index f718f42a..8c9395de 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -54,8 +54,9 @@ static int bch2_migrate_index_update(struct bch_write_op *op) struct btree_iter iter; int ret = 0; - bch2_btree_iter_init_intent(&iter, c, BTREE_ID_EXTENTS, - bkey_start_pos(&bch2_keylist_front(keys)->k)); + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, + bkey_start_pos(&bch2_keylist_front(keys)->k), + BTREE_ITER_INTENT); while (1) { struct bkey_s_extent insert = @@ -171,13 +172,12 @@ void bch2_migrate_write_init(struct bch_fs *c, static void migrate_bio_init(struct moving_io *io, struct bio *bio, unsigned sectors) { - bio_init(bio); + bio_init(bio, io->bi_inline_vecs, + DIV_ROUND_UP(sectors, PAGE_SECTORS)); bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); bio->bi_iter.bi_size = sectors << 9; - bio->bi_max_vecs = DIV_ROUND_UP(sectors, PAGE_SECTORS); bio->bi_private = &io->cl; - bio->bi_io_vec = io->bi_inline_vecs; bch2_bio_map(bio, NULL); } diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index cc7d3f68..72cbb9d5 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -9,6 +9,7 @@ #include "buckets.h" #include "clock.h" #include "extents.h" +#include "eytzinger.h" #include "io.h" #include "keylist.h" #include "move.h" @@ -18,20 +19,43 @@ #include #include #include +#include #include /* Moving GC - IO loop */ +static int bucket_idx_cmp(const void *_l, const void *_r, size_t size) +{ + const struct bucket_heap_entry *l = _l; + const struct bucket_heap_entry *r = _r; + + if (l->bucket < r->bucket) + return -1; + if (l->bucket > r->bucket) + return 1; + return 0; +} + static const struct bch_extent_ptr *moving_pred(struct bch_dev *ca, struct bkey_s_c k) { + bucket_heap *h = &ca->copygc_heap; const struct bch_extent_ptr *ptr; if (bkey_extent_is_data(k.k) && (ptr = bch2_extent_has_device(bkey_s_c_to_extent(k), - ca->dev_idx)) && - PTR_BUCKET(ca, ptr)->mark.copygc) - return ptr; + ca->dev_idx))) { + struct bucket_heap_entry search = { + .bucket = PTR_BUCKET_NR(ca, ptr) + }; + + size_t i = eytzinger0_find(h->data, h->used, + sizeof(h->data[0]), + bucket_idx_cmp, &search); + + if (i < h->used) + return ptr; + } return NULL; } @@ -60,17 +84,19 @@ static void read_moving(struct bch_dev *ca, size_t buckets_to_move, u64 sectors_to_move) { struct bch_fs *c = ca->fs; - struct bucket *g; + bucket_heap *h = &ca->copygc_heap; struct moving_context ctxt; struct btree_iter iter; struct bkey_s_c k; u64 sectors_not_moved = 0; size_t buckets_not_moved = 0; + struct bucket_heap_entry *i; bch2_ratelimit_reset(&ca->moving_gc_pd.rate); bch2_move_ctxt_init(&ctxt, &ca->moving_gc_pd.rate, SECTORS_IN_FLIGHT_PER_DEVICE); - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, + BTREE_ITER_PREFETCH); while (1) { if (kthread_should_stop()) @@ -108,11 +134,14 @@ next: buckets_to_move); /* don't check this if we bailed out early: */ - for_each_bucket(g, ca) - if 
(g->mark.copygc && bucket_sectors_used(g)) { - sectors_not_moved += bucket_sectors_used(g); + for (i = h->data; i < h->data + h->used; i++) { + struct bucket_mark m = READ_ONCE(ca->buckets[i->bucket].mark); + + if (i->mark.gen == m.gen && bucket_sectors_used(m)) { + sectors_not_moved += bucket_sectors_used(m); buckets_not_moved++; } + } if (sectors_not_moved) bch_warn(c, "copygc finished but %llu/%llu sectors, %zu/%zu buckets not moved", @@ -138,15 +167,20 @@ static bool have_copygc_reserve(struct bch_dev *ca) return ret; } +static inline int sectors_used_cmp(bucket_heap *heap, + struct bucket_heap_entry l, + struct bucket_heap_entry r) +{ + return bucket_sectors_used(l.mark) - bucket_sectors_used(r.mark); +} + static void bch2_moving_gc(struct bch_dev *ca) { struct bch_fs *c = ca->fs; struct bucket *g; - struct bucket_mark new; - u64 sectors_to_move; + u64 sectors_to_move = 0; size_t buckets_to_move, buckets_unused = 0; - struct bucket_heap_entry e; - unsigned sectors_used, i; + struct bucket_heap_entry e, *i; int reserve_sectors; if (!have_copygc_reserve(ca)) { @@ -174,52 +208,47 @@ static void bch2_moving_gc(struct bch_dev *ca) */ /* - * We need bucket marks to be up to date, so gc can't be recalculating - * them, and we don't want the allocator invalidating a bucket after - * we've decided to evacuate it but before we set copygc: + * We need bucket marks to be up to date - gc can't be recalculating + * them: */ down_read(&c->gc_lock); - mutex_lock(&ca->heap_lock); - mutex_lock(&ca->fs->bucket_lock); - - ca->heap.used = 0; + ca->copygc_heap.used = 0; for_each_bucket(g, ca) { - bucket_cmpxchg(g, new, new.copygc = 0); + struct bucket_mark m = READ_ONCE(g->mark); + struct bucket_heap_entry e = { g - ca->buckets, m }; - if (bucket_unused(g)) { + if (bucket_unused(m)) { buckets_unused++; continue; } - if (g->mark.owned_by_allocator || - g->mark.data_type != BUCKET_DATA) + if (m.owned_by_allocator || + m.data_type != BUCKET_DATA) continue; - sectors_used = bucket_sectors_used(g); - - if (sectors_used >= ca->mi.bucket_size) + if (bucket_sectors_used(m) >= ca->mi.bucket_size) continue; - bucket_heap_push(ca, g, sectors_used); + heap_add_or_replace(&ca->copygc_heap, e, -sectors_used_cmp); } + up_read(&c->gc_lock); - sectors_to_move = 0; - for (i = 0; i < ca->heap.used; i++) - sectors_to_move += ca->heap.data[i].val; + for (i = ca->copygc_heap.data; + i < ca->copygc_heap.data + ca->copygc_heap.used; + i++) + sectors_to_move += bucket_sectors_used(i->mark); while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) { - BUG_ON(!heap_pop(&ca->heap, e, bucket_min_cmp)); - sectors_to_move -= e.val; + BUG_ON(!heap_pop(&ca->copygc_heap, e, -sectors_used_cmp)); + sectors_to_move -= bucket_sectors_used(e.mark); } - for (i = 0; i < ca->heap.used; i++) - bucket_cmpxchg(ca->heap.data[i].g, new, new.copygc = 1); + buckets_to_move = ca->copygc_heap.used; - buckets_to_move = ca->heap.used; - - mutex_unlock(&ca->fs->bucket_lock); - mutex_unlock(&ca->heap_lock); - up_read(&c->gc_lock); + eytzinger0_sort(ca->copygc_heap.data, + ca->copygc_heap.used, + sizeof(ca->copygc_heap.data[0]), + bucket_idx_cmp, NULL); read_moving(ca, buckets_to_move, sectors_to_move); } diff --git a/libbcachefs/six.c b/libbcachefs/six.c index 32837855..c60a6730 100644 --- a/libbcachefs/six.c +++ b/libbcachefs/six.c @@ -169,7 +169,7 @@ static bool six_spin_on_owner(struct six_lock *lock, struct task_struct *owner) break; } - cpu_relax_lowlatency(); + cpu_relax(); } rcu_read_unlock(); @@ -222,7 +222,7 @@ static bool six_optimistic_spin(struct 
six_lock *lock, enum six_lock_type type) * memory barriers as we'll eventually observe the right * values at the cost of a few extra spins. */ - cpu_relax_lowlatency(); + cpu_relax(); } osq_unlock(&lock->osq); diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index 02052165..b237b751 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -190,7 +190,7 @@ bch2_hash_lookup(const struct bch_hash_desc desc, struct btree_iter *iter, const void *key) { bch2_btree_iter_init(iter, c, desc.btree_id, - POS(inode, desc.hash_key(info, key))); + POS(inode, desc.hash_key(info, key)), 0); return bch2_hash_lookup_at(desc, info, iter, key); } @@ -201,8 +201,9 @@ bch2_hash_lookup_intent(const struct bch_hash_desc desc, struct bch_fs *c, u64 inode, struct btree_iter *iter, const void *key) { - bch2_btree_iter_init_intent(iter, c, desc.btree_id, - POS(inode, desc.hash_key(info, key))); + bch2_btree_iter_init(iter, c, desc.btree_id, + POS(inode, desc.hash_key(info, key)), + BTREE_ITER_INTENT); return bch2_hash_lookup_at(desc, info, iter, key); } @@ -232,8 +233,9 @@ static inline struct bkey_s_c bch2_hash_hole(const struct bch_hash_desc desc, struct btree_iter *iter, const void *key) { - bch2_btree_iter_init_intent(iter, c, desc.btree_id, - POS(inode, desc.hash_key(info, key))); + bch2_btree_iter_init(iter, c, desc.btree_id, + POS(inode, desc.hash_key(info, key)), + BTREE_ITER_INTENT); return bch2_hash_hole_at(desc, iter); } @@ -278,9 +280,11 @@ static inline int bch2_hash_set(const struct bch_hash_desc desc, struct bkey_s_c k; int ret; - bch2_btree_iter_init_intent(&hashed_slot, c, desc.btree_id, - POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert)))); - bch2_btree_iter_init_intent(&iter, c, desc.btree_id, hashed_slot.pos); + bch2_btree_iter_init(&hashed_slot, c, desc.btree_id, + POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), + BTREE_ITER_INTENT); + bch2_btree_iter_init(&iter, c, desc.btree_id, hashed_slot.pos, + BTREE_ITER_INTENT); bch2_btree_iter_link(&hashed_slot, &iter); retry: /* @@ -353,7 +357,7 @@ static inline int bch2_hash_delete_at(const struct bch_hash_desc desc, int ret = -ENOENT; bch2_btree_iter_init(&whiteout_iter, iter->c, desc.btree_id, - iter->pos); + iter->pos, 0); bch2_btree_iter_link(iter, &whiteout_iter); ret = bch2_hash_needs_whiteout(desc, info, &whiteout_iter, iter); @@ -382,10 +386,11 @@ static inline int bch2_hash_delete(const struct bch_hash_desc desc, struct bkey_s_c k; int ret = -ENOENT; - bch2_btree_iter_init_intent(&iter, c, desc.btree_id, - POS(inode, desc.hash_key(info, key))); + bch2_btree_iter_init(&iter, c, desc.btree_id, + POS(inode, desc.hash_key(info, key)), + BTREE_ITER_INTENT); bch2_btree_iter_init(&whiteout_iter, c, desc.btree_id, - POS(inode, desc.hash_key(info, key))); + POS(inode, desc.hash_key(info, key)), 0); bch2_btree_iter_link(&iter, &whiteout_iter); retry: k = bch2_hash_lookup_at(desc, info, &iter, key); diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 7a981360..528538b5 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -377,7 +377,8 @@ static void bch2_fs_free(struct bch_fs *c) bch2_io_clock_exit(&c->io_clock[WRITE]); bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); - bdi_destroy(&c->bdi); + if (c->bdi.bdi_list.next) + bdi_destroy(&c->bdi); lg_lock_free(&c->usage_lock); free_percpu(c->usage_percpu); mempool_exit(&c->btree_bounce_pool); @@ -572,7 +573,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) 
mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, sizeof(struct btree_interior_update)) || mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || - bioset_init(&c->btree_read_bio, 1, 0) || + bioset_init(&c->btree_read_bio, 1, + offsetof(struct btree_read_bio, bio)) || bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio)) || bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio)) || bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio)) || @@ -984,7 +986,8 @@ static void bch2_dev_free(struct bch_dev *ca) kfree(ca->bio_prio); kvpfree(ca->buckets, ca->mi.nbuckets * sizeof(struct bucket)); kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8)); - free_heap(&ca->heap); + free_heap(&ca->copygc_heap); + free_heap(&ca->alloc_heap); free_fifo(&ca->free_inc); for (i = 0; i < RESERVE_NR; i++) @@ -1105,7 +1108,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) spin_lock_init(&ca->freelist_lock); spin_lock_init(&ca->prio_buckets_lock); - mutex_init(&ca->heap_lock); mutex_init(&ca->prio_write_lock); bch2_dev_moving_gc_init(ca); @@ -1142,7 +1144,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) movinggc_reserve, GFP_KERNEL) || !init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) || !init_fifo(&ca->free_inc, free_inc_reserve, GFP_KERNEL) || - !init_heap(&ca->heap, heap_size, GFP_KERNEL) || + !init_heap(&ca->alloc_heap, heap_size, GFP_KERNEL) || + !init_heap(&ca->copygc_heap,heap_size, GFP_KERNEL) || !(ca->oldest_gens = kvpmalloc(ca->mi.nbuckets * sizeof(u8), GFP_KERNEL|__GFP_ZERO)) || diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index c34048a3..3c47f1cb 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -263,7 +263,7 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) if (!bch2_fs_running(c)) return -EPERM; - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, k) + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0, k) if (k.k->type == BCH_EXTENT) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const struct bch_extent_ptr *ptr; @@ -604,7 +604,7 @@ static unsigned bucket_priority_fn(struct bch_dev *ca, struct bucket *g, static unsigned bucket_sectors_used_fn(struct bch_dev *ca, struct bucket *g, void *private) { - return bucket_sectors_used(g); + return bucket_sectors_used(g->mark); } static unsigned bucket_oldest_gen_fn(struct bch_dev *ca, struct bucket *g, diff --git a/libbcachefs/tier.c b/libbcachefs/tier.c index 16d32928..6bc20845 100644 --- a/libbcachefs/tier.c +++ b/libbcachefs/tier.c @@ -118,7 +118,8 @@ static s64 read_tiering(struct bch_fs *c, struct bch_tier *tier) bch2_move_ctxt_init(&ctxt, &tier->pd.rate, nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE); - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, + BTREE_ITER_PREFETCH); while (!kthread_should_stop() && !bch2_move_ctxt_wait(&ctxt) && diff --git a/libbcachefs/util.c b/libbcachefs/util.c index 79a2aeb1..6ffc9811 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -431,3 +431,104 @@ size_t bch_scnmemcpy(char *buf, size_t size, const char *src, size_t len) return n; } + +#include "eytzinger.h" + +static int alignment_ok(const void *base, size_t align) +{ + return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || + ((unsigned long)base & (align - 1)) == 0; +} + +static void u32_swap(void *a, void *b, size_t size) +{ + u32 t = *(u32 *)a; + *(u32 *)a = *(u32 *)b; + *(u32 *)b = t; +} + +static void u64_swap(void *a, void *b, 
size_t size) +{ + u64 t = *(u64 *)a; + *(u64 *)a = *(u64 *)b; + *(u64 *)b = t; +} + +static void generic_swap(void *a, void *b, size_t size) +{ + char t; + + do { + t = *(char *)a; + *(char *)a++ = *(char *)b; + *(char *)b++ = t; + } while (--size > 0); +} + +static inline int do_cmp(void *base, size_t n, size_t size, + int (*cmp_func)(const void *, const void *, size_t), + size_t l, size_t r) +{ + return cmp_func(base + inorder_to_eytzinger0(l, n) * size, + base + inorder_to_eytzinger0(r, n) * size, + size); +} + +static inline void do_swap(void *base, size_t n, size_t size, + void (*swap_func)(void *, void *, size_t), + size_t l, size_t r) +{ + swap_func(base + inorder_to_eytzinger0(l, n) * size, + base + inorder_to_eytzinger0(r, n) * size, + size); +} + +void eytzinger0_sort(void *base, size_t n, size_t size, + int (*cmp_func)(const void *, const void *, size_t), + void (*swap_func)(void *, void *, size_t)) +{ + int i, c, r; + + if (!swap_func) { + if (size == 4 && alignment_ok(base, 4)) + swap_func = u32_swap; + else if (size == 8 && alignment_ok(base, 8)) + swap_func = u64_swap; + else + swap_func = generic_swap; + } + + /* heapify */ + for (i = n / 2 - 1; i >= 0; --i) { + for (r = i; r * 2 + 1 < n; r = c) { + c = r * 2 + 1; + + if (c + 1 < n && + do_cmp(base, n, size, cmp_func, c, c + 1) < 0) + c++; + + if (do_cmp(base, n, size, cmp_func, r, c) >= 0) + break; + + do_swap(base, n, size, swap_func, r, c); + } + } + + /* sort */ + for (i = n - 1; i > 0; --i) { + do_swap(base, n, size, swap_func, 0, i); + + for (r = 0; r * 2 + 1 < i; r = c) { + c = r * 2 + 1; + + if (c + 1 < i && + do_cmp(base, n, size, cmp_func, c, c + 1) < 0) + c++; + + if (do_cmp(base, n, size, cmp_func, r, c) >= 0) + break; + + do_swap(base, n, size, swap_func, r, c); + } + } +} diff --git a/libbcachefs/util.h b/libbcachefs/util.h index 8aa5c34b..d7511aeb 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -98,11 +98,13 @@ static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) ?: __vmalloc(size, gfp_mask, PAGE_KERNEL); } -#define DECLARE_HEAP(type, name) \ - struct { \ - size_t size, used; \ - type *data; \ - } name +#define HEAP(type) \ +struct { \ + size_t size, used; \ + type *data; \ +} + +#define DECLARE_HEAP(type, name) HEAP(type) name #define init_heap(heap, _size, gfp) \ ({ \ @@ -120,46 +122,62 @@ do { \ #define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j]) -#define heap_sift(h, i, cmp) \ -do { \ - size_t _r, _j = i; \ - \ - for (; _j * 2 + 1 < (h)->used; _j = _r) { \ - _r = _j * 2 + 1; \ - if (_r + 1 < (h)->used && \ - cmp((h)->data[_r], (h)->data[_r + 1])) \ - _r++; \ - \ - if (cmp((h)->data[_r], (h)->data[_j])) \ - break; \ - heap_swap(h, _r, _j); \ - } \ -} while (0) +#define heap_peek(h) \ +({ \ + EBUG_ON(!(h)->used); \ + (h)->data[0]; \ +}) + +#define heap_full(h) ((h)->used == (h)->size) #define heap_sift_down(h, i, cmp) \ +do { \ + size_t _c, _j = i; \ + \ + for (; _j * 2 + 1 < (h)->used; _j = _c) { \ + _c = _j * 2 + 1; \ + if (_c + 1 < (h)->used && \ + cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0) \ + _c++; \ + \ + if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \ + break; \ + heap_swap(h, _c, _j); \ + } \ +} while (0) + +#define heap_sift_up(h, i, cmp) \ do { \ while (i) { \ size_t p = (i - 1) / 2; \ - if (cmp((h)->data[i], (h)->data[p])) \ + if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \ break; \ heap_swap(h, i, p); \ i = p; \ } \ } while (0) -#define heap_add(h, d, cmp) \ +#define heap_add(h, new, cmp) \ ({ \ bool _r = !heap_full(h); \ if (_r) { \ size_t _i = (h)->used++; \ - 
(h)->data[_i] = d; \ + (h)->data[_i] = new; \ \ - heap_sift_down(h, _i, cmp); \ - heap_sift(h, _i, cmp); \ + heap_sift_up(h, _i, cmp); \ } \ _r; \ }) +#define heap_add_or_replace(h, new, cmp) \ +do { \ + if (!heap_add(h, new, cmp) && \ + cmp(h, new, heap_peek(h)) >= 0) { \ + (h)->data[0] = new; \ + heap_sift_down(h, 0, cmp); \ + } \ +} while (0) + #define heap_del(h, i, cmp) \ do { \ size_t _i = (i); \ @@ -167,8 +185,8 @@ do { \ BUG_ON(_i >= (h)->used); \ (h)->used--; \ heap_swap(h, _i, (h)->used); \ + heap_sift_up(h, _i, cmp); \ heap_sift_down(h, _i, cmp); \ - heap_sift(h, _i, cmp); \ } while (0) #define heap_pop(h, d, cmp) \ @@ -181,19 +199,11 @@ do { \ _r; \ }) -#define heap_peek(h) \ -({ \ - EBUG_ON(!(h)->used); \ - (h)->data[0]; \ -}) - -#define heap_full(h) ((h)->used == (h)->size) - #define heap_resort(heap, cmp) \ do { \ ssize_t _i; \ for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \ - heap_sift(heap, _i, cmp); \ + heap_sift_down(heap, _i, cmp); \ } while (0) /* diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c index 4e82e42c..b2075c2e 100644 --- a/libbcachefs/xattr.c +++ b/libbcachefs/xattr.c @@ -282,7 +282,7 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) ssize_t ret = 0; size_t len; - for_each_btree_key(&iter, c, BTREE_ID_XATTRS, POS(inum, 0), k) { + for_each_btree_key(&iter, c, BTREE_ID_XATTRS, POS(inum, 0), 0, k) { BUG_ON(k.k->p.inode < inum); if (k.k->p.inode > inum) diff --git a/linux/bio.c b/linux/bio.c index 8fb10ce4..f4356699 100644 --- a/linux/bio.c +++ b/linux/bio.c @@ -278,10 +278,8 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) return NULL; bio = p + front_pad; - bio_init(bio); - bio->bi_pool = bs; - bio->bi_max_vecs = nr_iovecs; - bio->bi_io_vec = bio->bi_inline_vecs; + bio_init(bio, bio->bi_inline_vecs, nr_iovecs); + bio->bi_pool = bs; return bio; }