diff --git a/.bcachefs_revision b/.bcachefs_revision
index 088f645c..8eca0593 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-bcca1c557b1897ecc3aeb1f89ab91865487d91ab
+99750eab4d583132cf61f071082c7cf21f5295c0
diff --git a/include/asm/page.h b/include/asm/page.h
new file mode 100644
index 00000000..e69de29b
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 7471bd97..38a364c0 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -37,6 +37,7 @@ typedef struct {
 #define xchg_acquire(p, v)		uatomic_xchg(p, v)
 #define cmpxchg(p, old, new)		uatomic_cmpxchg(p, old, new)
 #define cmpxchg_acquire(p, old, new)	uatomic_cmpxchg(p, old, new)
+#define cmpxchg_release(p, old, new)	uatomic_cmpxchg(p, old, new)
 
 #define smp_mb__before_atomic()	cmm_smp_mb__before_uatomic_add()
 #define smp_mb__after_atomic()	cmm_smp_mb__after_uatomic_add()
@@ -77,6 +78,16 @@ typedef struct {
 	__old;								\
 })
 
+#define cmpxchg_release(p, old, new)					\
+({									\
+	typeof(*(p)) __old = (old);					\
+									\
+	__atomic_compare_exchange_n((p), &__old, new, false,		\
+				    __ATOMIC_RELEASE,			\
+				    __ATOMIC_RELEASE);			\
+	__old;								\
+})
+
 #define smp_mb__before_atomic()	__atomic_thread_fence(__ATOMIC_SEQ_CST)
 #define smp_mb__after_atomic()	__atomic_thread_fence(__ATOMIC_SEQ_CST)
 #define smp_wmb()		__atomic_thread_fence(__ATOMIC_SEQ_CST)
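The new cmpxchg_release() maps to uatomic_cmpxchg() in the urcu build and to __atomic_compare_exchange_n() with release ordering in the builtin build. A standalone sketch of the release/acquire pairing it provides, not part of the patch, using only GCC builtins and pthreads (note the sketch uses a relaxed failure order, since C11 does not permit __ATOMIC_RELEASE there):

#include <pthread.h>
#include <stdio.h>

static int data;
static int ready;

static void *producer(void *arg)
{
	int expected = 0;

	data = 42;	/* plain store, ordered before the flag... */
	/* ...because the successful exchange is a release operation: */
	__atomic_compare_exchange_n(&ready, &expected, 1, false,
				    __ATOMIC_RELEASE,
				    __ATOMIC_RELAXED);
	return NULL;
}

static void *consumer(void *arg)
{
	while (!__atomic_load_n(&ready, __ATOMIC_ACQUIRE))
		;		/* acquire pairs with the release above */
	printf("%d\n", data);	/* guaranteed to print 42 */
	return NULL;
}

int main(void)
{
	pthread_t p, c;

	pthread_create(&p, NULL, producer, NULL);
	pthread_create(&c, NULL, consumer, NULL);
	pthread_join(p, NULL);
	pthread_join(c, NULL);
	return 0;
}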
diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
index 7f637e17..3a91130a 100644
--- a/include/linux/generic-radix-tree.h
+++ b/include/linux/generic-radix-tree.h
@@ -1,34 +1,60 @@
 #ifndef _LINUX_GENERIC_RADIX_TREE_H
 #define _LINUX_GENERIC_RADIX_TREE_H
 
-/*
- * Generic radix trees/sparse arrays:
+/**
+ * DOC: Generic radix trees/sparse arrays:
  *
- * A generic radix tree has all nodes of size PAGE_SIZE - both leaves and
- * interior nodes.
+ * Very simple and minimalistic, supporting arbitrary size entries up to
+ * PAGE_SIZE.
+ *
+ * A genradix is defined with the type it will store, like so:
+ *
+ * static GENRADIX(struct foo) foo_genradix;
+ *
+ * The main operations are:
+ *
+ * - genradix_init(radix) - initialize an empty genradix
+ *
+ * - genradix_free(radix) - free all memory owned by the genradix and
+ *   reinitialize it
+ *
+ * - genradix_ptr(radix, idx) - gets a pointer to the entry at idx, returning
+ *   NULL if that entry does not exist
+ *
+ * - genradix_ptr_alloc(radix, idx, gfp) - gets a pointer to an entry,
+ *   allocating it if necessary
+ *
+ * - genradix_for_each(radix, iter, p) - iterate over each entry in a genradix
+ *
+ * The radix tree allocates one page of entries at a time, so entries may
+ * exist that were never explicitly allocated - they will be initialized to
+ * all zeroes.
+ *
+ * Internally, a genradix is just a radix tree of pages, and indexing works in
+ * terms of byte offsets. The wrappers in this header file use sizeof on the
+ * type the radix contains to calculate a byte offset from the index - see
+ * __idx_to_offset.
+ */
 
+#include <asm/page.h>
 #include <linux/bug.h>
 #include <linux/kernel.h>
 #include <linux/log2.h>
 
-struct genradix_node;
+struct genradix_root;
 
 struct __genradix {
-	struct genradix_node		*root;
-	size_t				depth;
+	struct genradix_root __rcu	*root;
 };
 
 /*
- * NOTE: currently, sizeof(_type) must be a power of two and not larger than
- * PAGE_SIZE:
+ * NOTE: currently, sizeof(_type) must not be larger than PAGE_SIZE:
  */
 
 #define __GENRADIX_INITIALIZER					\
 	{							\
 		.tree = {					\
 			.root = NULL,				\
-			.depth = 0,				\
 		}						\
 	}
@@ -49,6 +75,12 @@ struct {						\
 #define DEFINE_GENRADIX(_name, _type)				\
 	GENRADIX(_type) _name = __GENRADIX_INITIALIZER
 
+/**
+ * genradix_init - initialize a genradix
+ * @_radix: genradix to initialize
+ *
+ * Does not fail
+ */
 #define genradix_init(_radix)					\
 do {								\
 	*(_radix) = (typeof(*_radix)) __GENRADIX_INITIALIZER;	\
@@ -56,11 +88,20 @@ do {								\
 
 void __genradix_free(struct __genradix *);
 
+/**
+ * genradix_free - free all memory owned by a genradix
+ * @_radix: the genradix to free
+ *
+ * After freeing, @_radix will be reinitialized and empty
+ */
 #define genradix_free(_radix)	__genradix_free(&(_radix)->tree)
 
 static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
 {
-	BUILD_BUG_ON(obj_size > PAGE_SIZE);
+	if (__builtin_constant_p(obj_size))
+		BUILD_BUG_ON(obj_size > PAGE_SIZE);
+	else
+		BUG_ON(obj_size > PAGE_SIZE);
 
 	if (!is_power_of_2(obj_size)) {
 		size_t objs_per_page = PAGE_SIZE / obj_size;
@@ -79,7 +120,13 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
 
 void *__genradix_ptr(struct __genradix *, size_t);
 
-/* Returns a pointer to element at @_idx */
+/**
+ * genradix_ptr - get a pointer to a genradix entry
+ * @_radix: genradix to access
+ * @_idx: index to fetch
+ *
+ * Returns a pointer to entry at @_idx, or NULL if that entry does not exist.
+ */
 #define genradix_ptr(_radix, _idx)				\
 	(__genradix_cast(_radix)				\
 	 __genradix_ptr(&(_radix)->tree,			\
@@ -87,7 +134,15 @@ void *__genradix_ptr(struct __genradix *, size_t);
 
 void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
 
-/* Returns a pointer to element at @_idx, allocating it if necessary */
+/**
+ * genradix_ptr_alloc - get a pointer to a genradix entry, allocating it
+ *			if necessary
+ * @_radix: genradix to access
+ * @_idx: index to fetch
+ * @_gfp: gfp mask
+ *
+ * Returns a pointer to entry at @_idx, or NULL on allocation failure
+ */
 #define genradix_ptr_alloc(_radix, _idx, _gfp)			\
 	(__genradix_cast(_radix)				\
 	 __genradix_ptr_alloc(&(_radix)->tree,			\
@@ -99,6 +154,11 @@ struct genradix_iter {
 	size_t			pos;
 };
 
+/**
+ * genradix_iter_init - initialize a genradix_iter
+ * @_radix: genradix that will be iterated over
+ * @_idx: index to start iterating from
+ */
 #define genradix_iter_init(_radix, _idx)			\
 	((struct genradix_iter) {				\
 		.pos	= (_idx),				\
@@ -107,6 +167,14 @@ struct genradix_iter {
 
 void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t);
 
+/**
+ * genradix_iter_peek - get first entry at or above iterator's current
+ *			position
+ * @_iter: a genradix_iter
+ * @_radix: genradix being iterated over
+ *
+ * If no more entries exist at or above @_iter's current position, returns NULL
+ */
 #define genradix_iter_peek(_iter, _radix)			\
 	(__genradix_cast(_radix)				\
 	 __genradix_iter_peek(_iter, &(_radix)->tree,		\
@@ -127,4 +195,37 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter,
 #define genradix_iter_advance(_iter, _radix)			\
 	__genradix_iter_advance(_iter, __genradix_obj_size(_radix))
 
+#define genradix_for_each_from(_radix, _iter, _p, _start)	\
+	for (_iter = genradix_iter_init(_radix, _start);	\
+	     (_p = genradix_iter_peek(&_iter, _radix)) != NULL;	\
+	     genradix_iter_advance(&_iter, _radix))
+
+/**
+ * genradix_for_each - iterate over each entry in a genradix
+ * @_radix: genradix to iterate over
+ * @_iter: a genradix_iter to track current position
+ * @_p: pointer to genradix entry type
+ *
+ * On every iteration, @_p will point to the current entry, and @_iter.pos
+ * will be the current entry's index.
+ */
+#define genradix_for_each(_radix, _iter, _p)			\
+	genradix_for_each_from(_radix, _iter, _p, 0)
+
+int __genradix_prealloc(struct __genradix *, size_t, gfp_t);
+
+/**
+ * genradix_prealloc - preallocate entries in a generic radix tree
+ * @_radix: genradix to preallocate
+ * @_nr: number of entries to preallocate
+ * @_gfp: gfp mask
+ *
+ * Returns 0 on success, -ENOMEM on failure
+ */
+#define genradix_prealloc(_radix, _nr, _gfp)			\
+	 __genradix_prealloc(&(_radix)->tree,			\
+			__genradix_idx_to_offset(_radix, _nr + 1),\
+			_gfp)
+
+
 #endif /* _LINUX_GENERIC_RADIX_TREE_H */
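A short usage sketch of the API documented above, assuming the kernel-style environment the header targets (GFP_KERNEL, pr_info(); struct foo is hypothetical):

struct foo {
	u64	seq;
	u32	flags;
};

static GENRADIX(struct foo) foos;

static int foo_example(void)
{
	struct genradix_iter iter;
	struct foo *f;

	genradix_init(&foos);

	/* allocates intermediate nodes and the backing page as needed: */
	f = genradix_ptr_alloc(&foos, 1000, GFP_KERNEL);
	if (!f)
		return -ENOMEM;
	f->seq = 1;

	/* entries on already-allocated pages read back as zeroes: */
	genradix_for_each(&foos, iter, f)
		if (f->seq)
			pr_info("idx %zu seq %llu\n", iter.pos,
				(unsigned long long) f->seq);

	genradix_free(&foos);
	return 0;
}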
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 6de6e263..2552d457 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -249,6 +249,9 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
 			bch2_alloc_read_key(c, bkey_i_to_s_c(k));
 	}
 
+	for_each_member_device(ca, c, i)
+		bch2_dev_usage_from_buckets(c, ca);
+
 	mutex_lock(&c->bucket_clock[READ].lock);
 	for_each_member_device(ca, c, i) {
 		down_read(&ca->bucket_lock);
@@ -280,35 +283,51 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
 #endif
 	struct bkey_i_alloc *a = bkey_alloc_init(&alloc_key.k);
 	struct bucket *g;
-	struct bucket_mark m;
+	struct bucket_mark m, new;
 	int ret;
 
 	BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
 
 	a->k.p = POS(ca->dev_idx, b);
 
+	bch2_btree_iter_set_pos(iter, a->k.p);
+
+	ret = bch2_btree_iter_traverse(iter);
+	if (ret)
+		return ret;
+
 	percpu_down_read_preempt_disable(&c->mark_lock);
 	g = bucket(ca, b);
-	m = bucket_cmpxchg(g, m, m.dirty = false);
+	m = READ_ONCE(g->mark);
+
+	if (!m.dirty) {
+		percpu_up_read_preempt_enable(&c->mark_lock);
+		return 0;
+	}
 
 	__alloc_write_key(a, g, m);
 	percpu_up_read_preempt_enable(&c->mark_lock);
 
 	bch2_btree_iter_cond_resched(iter);
 
-	bch2_btree_iter_set_pos(iter, a->k.p);
-
 	ret = bch2_btree_insert_at(c, NULL, journal_seq,
+				   BTREE_INSERT_NOCHECK_RW|
 				   BTREE_INSERT_NOFAIL|
 				   BTREE_INSERT_USE_RESERVE|
 				   BTREE_INSERT_USE_ALLOC_RESERVE|
 				   flags,
 				   BTREE_INSERT_ENTRY(iter, &a->k_i));
+	if (ret)
+		return ret;
 
-	if (!ret && ca->buckets_written)
+	new = m;
+	new.dirty = false;
+	atomic64_cmpxchg(&g->_mark.v, m.v.counter, new.v.counter);
+
+	if (ca->buckets_written)
 		set_bit(b, ca->buckets_written);
 
-	return ret;
+	return 0;
 }
 
 int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
@@ -898,10 +917,19 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t
 	for (i = 0; i < RESERVE_NR; i++)
 		if (fifo_push(&ca->free[i], bucket)) {
 			fifo_pop(&ca->free_inc, bucket);
+
 			closure_wake_up(&c->freelist_wait);
+			ca->allocator_blocked_full = false;
+
 			spin_unlock(&c->freelist_lock);
 			goto out;
 		}
+
+	if (!ca->allocator_blocked_full) {
+		ca->allocator_blocked_full = true;
+		closure_wake_up(&c->freelist_wait);
+	}
+
 	spin_unlock(&c->freelist_lock);
 
 	if ((current->flags & PF_KTHREAD) &&
@@ -1226,6 +1254,11 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
 			set_bit(ca->dev_idx, c->rw_devs[i].d);
 }
 
+void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
+{
+	closure_wait_event(&c->freelist_wait, ca->allocator_blocked_full);
+}
+
 /* stop allocator thread: */
 void bch2_dev_allocator_stop(struct bch_dev *ca)
 {
@@ -1333,6 +1366,24 @@ static void allocator_start_issue_discards(struct bch_fs *c)
 					ca->mi.bucket_size, GFP_NOIO, 0);
 }
 
+static int resize_free_inc(struct bch_dev *ca)
+{
+	alloc_fifo free_inc;
+
+	if (!fifo_full(&ca->free_inc))
+		return 0;
+
+	if (!init_fifo(&free_inc,
+		       ca->free_inc.size * 2,
+		       GFP_KERNEL))
+		return -ENOMEM;
+
+	fifo_move(&free_inc, &ca->free_inc);
+	swap(free_inc, ca->free_inc);
+	free_fifo(&free_inc);
+	return 0;
+}
+
 static int __bch2_fs_allocator_start(struct bch_fs *c)
 {
 	struct bch_dev *ca;
@@ -1408,6 +1459,12 @@ not_enough:
 
 		while (!fifo_full(&ca->free[RESERVE_BTREE]) &&
 		       (bu = next_alloc_bucket(ca)) >= 0) {
+			ret = resize_free_inc(ca);
+			if (ret) {
+				percpu_ref_put(&ca->io_ref);
+				return ret;
+			}
+
 			bch2_invalidate_one_bucket(c, ca, bu,
 						   &journal_seq);
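resize_free_inc() grows free_inc by doubling when the allocator start path fills it; init_fifo()/fifo_move()/free_fifo() do the work in the patch. The same strategy in self-contained form, with a stand-in ring type (nothing below is bcachefs API):

#include <stdlib.h>

struct ring {
	size_t	*data;
	size_t	size;		/* capacity, a power of two */
	size_t	front, back;	/* monotonically increasing indices */
};

static int ring_grow(struct ring *r)
{
	struct ring new = { .size = r->size * 2 };
	size_t i = 0;

	if (r->back - r->front < r->size)	/* not full, nothing to do */
		return 0;

	new.data = calloc(new.size, sizeof(*new.data));
	if (!new.data)
		return -1;

	/* re-index entries from the old ring into the new one: */
	while (r->front != r->back)
		new.data[i++] = r->data[r->front++ & (r->size - 1)];

	new.front = 0;
	new.back  = i;

	free(r->data);
	*r = new;
	return 0;
}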
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
index a0c08e34..26561b3b 100644
--- a/libbcachefs/alloc_background.h
+++ b/libbcachefs/alloc_background.h
@@ -51,6 +51,7 @@ void bch2_recalc_capacity(struct bch_fs *);
 void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
 void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
 
+void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
 void bch2_dev_allocator_stop(struct bch_dev *);
 int bch2_dev_allocator_start(struct bch_dev *);
 
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index 14e6453b..f2f9015d 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -106,6 +106,7 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
 	bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr),
 			       false, gc_pos_alloc(c, ob), 0);
 	ob->valid = false;
+	ob->type = 0;
 
 	spin_unlock(&ob->lock);
 	percpu_up_read_preempt_enable(&c->mark_lock);
@@ -141,6 +142,7 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
 	ob = c->open_buckets + c->open_buckets_freelist;
 	c->open_buckets_freelist = ob->freelist;
 	atomic_set(&ob->pin, 1);
+	ob->type = 0;
 
 	c->open_buckets_nr_free--;
 	return ob;
@@ -209,9 +211,9 @@ static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
 	case RESERVE_ALLOC:
 		return 0;
 	case RESERVE_BTREE:
-		return BTREE_NODE_RESERVE / 2;
+		return BTREE_NODE_OPEN_BUCKET_RESERVE;
 	default:
-		return BTREE_NODE_RESERVE;
+		return BTREE_NODE_OPEN_BUCKET_RESERVE * 2;
 	}
 }
 
@@ -837,15 +839,17 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
 {
 	struct write_point *wp;
 	struct open_bucket *ob;
-	unsigned nr_effective = 0;
-	struct open_buckets ptrs = { .nr = 0 };
-	bool have_cache = false;
-	unsigned write_points_nr;
-	int ret = 0, i;
+	struct open_buckets ptrs;
+	unsigned nr_effective, write_points_nr;
+	bool have_cache;
+	int ret, i;
 
 	BUG_ON(!nr_replicas || !nr_replicas_required);
retry:
+	ptrs.nr		= 0;
+	nr_effective	= 0;
 	write_points_nr = c->write_points_nr;
+	have_cache	= false;
 
 	wp = writepoint_find(c, write_point.v);
diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h
index b0e44f75..5224a52f 100644
--- a/libbcachefs/alloc_foreground.h
+++ b/libbcachefs/alloc_foreground.h
@@ -85,6 +85,7 @@ static inline void bch2_open_bucket_get(struct bch_fs *c,
 	unsigned i;
 
 	open_bucket_for_each(c, &wp->ptrs, ob, i) {
+		ob->type = wp->type;
 		atomic_inc(&ob->pin);
 		ob_push(c, ptrs, ob);
 	}
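The bch2_alloc_sectors_start() hunk is a retry fix: ptrs, nr_effective and have_cache are consumed by every attempt, so their initialization moves from the declarations to just after the retry label, where a goto retry lands. The shape of the bug in miniature (all names hypothetical):

#include <stdbool.h>

static bool try_collect(int attempt, int *out)
{
	*out += attempt + 1;	/* pretend to gather something */
	return attempt >= 1;	/* succeed on the second attempt */
}

static int collect_with_retry(void)
{
	int attempt = 0;
	int collected;		/* per-attempt state */
retry:
	collected = 0;		/* reset on every pass; initializing at the
				 * declaration instead would leak attempt 0's
				 * partial results into attempt 1 */
	if (!try_collect(attempt, &collected) && ++attempt < 3)
		goto retry;

	return collected;
}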
diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h
index 6f17f094..66457fc7 100644
--- a/libbcachefs/alloc_types.h
+++ b/libbcachefs/alloc_types.h
@@ -55,9 +55,10 @@ struct open_bucket {
 	spinlock_t		lock;
 	atomic_t		pin;
 	u8			freelist;
-	bool			valid;
-	bool			on_partial_list;
 	u8			ec_idx;
+	u8			type;
+	unsigned		valid:1;
+	unsigned		on_partial_list:1;
 	unsigned		sectors_free;
 	struct bch_extent_ptr	ptr;
 	struct ec_stripe_new	*ec;
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 449eb0c1..f42b2f90 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -330,6 +330,8 @@ enum bch_time_stats {
 /* Size of the freelist we allocate btree nodes from: */
 #define BTREE_NODE_RESERVE	BTREE_RESERVE_MAX
 
+#define BTREE_NODE_OPEN_BUCKET_RESERVE	(BTREE_RESERVE_MAX * BCH_REPLICAS_MAX)
+
 struct btree;
 
 enum gc_phase {
@@ -426,7 +428,13 @@ struct bch_dev {
 
 	size_t			inc_gen_needs_gc;
 	size_t			inc_gen_really_needs_gc;
+
+	/*
+	 * XXX: this should be an enum for allocator state, so as to include
+	 * error state
+	 */
 	bool			allocator_blocked;
+	bool			allocator_blocked_full;
 
 	alloc_heap		alloc_heap;
 
@@ -597,6 +605,7 @@ struct bch_fs {
 	struct workqueue_struct	*wq;
 	/* copygc needs its own workqueue for index updates.. */
 	struct workqueue_struct	*copygc_wq;
+	struct workqueue_struct	*journal_reclaim_wq;
 
 	/* ALLOCATION */
 	struct delayed_work	pd_controllers_update;
diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c
index 25725e42..40ce33a4 100644
--- a/libbcachefs/bkey.c
+++ b/libbcachefs/bkey.c
@@ -1010,11 +1010,8 @@ static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
 			nr_key_bits -= 64;
 		}
 
-		if (l_v != r_v)
-			return l_v < r_v ? -1 : 1;
-
-		if (!nr_key_bits)
-			return 0;
+		if (!nr_key_bits || l_v != r_v)
+			break;
 
 		l = next_word(l);
 		r = next_word(r);
@@ -1022,6 +1019,8 @@ static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
 		l_v = *l;
 		r_v = *r;
 	}
+
+	return (l_v > r_v) - (l_v < r_v);
 }
#endif
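__bkey_cmp_bits now funnels both exit conditions to a single branchless comparison at the end. The idiom, standalone; unlike "return l - r" it cannot overflow, and it always yields exactly -1, 0 or 1:

#include <stdint.h>

static inline int cmp_u64(uint64_t l, uint64_t r)
{
	return (l > r) - (l < r);	/* -1 if l < r, 0 if equal, 1 if l > r */
}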
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 23013fbb..433e8f22 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -483,31 +483,6 @@ static void bch2_gc_free(struct bch_fs *c)
 	percpu_up_write(&c->mark_lock);
 }
 
-/*
- * Accumulate percpu counters onto one cpu's copy - only valid when access
- * against any percpu counter is guarded against
- */
-static u64 *acc_percpu_u64s(u64 __percpu *p, unsigned nr)
-{
-	u64 *ret;
-	int cpu;
-
-	preempt_disable();
-	ret = this_cpu_ptr(p);
-	preempt_enable();
-
-	for_each_possible_cpu(cpu) {
-		u64 *i = per_cpu_ptr(p, cpu);
-
-		if (i != ret) {
-			acc_u64s(ret, i, nr);
-			memset(i, 0, nr * sizeof(u64));
-		}
-	}
-
-	return ret;
-}
-
 static void bch2_gc_done_nocheck(struct bch_fs *c)
 {
 	struct bch_dev *ca;
@@ -543,9 +518,9 @@ static void bch2_gc_done_nocheck(struct bch_fs *c)
 	for_each_member_device(ca, c, i) {
 		unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64);
 		struct bch_dev_usage *dst = (void *)
-			acc_percpu_u64s((void *) ca->usage[0], nr);
+			bch2_acc_percpu_u64s((void *) ca->usage[0], nr);
 		struct bch_dev_usage *src = (void *)
-			acc_percpu_u64s((void *) ca->usage[1], nr);
+			bch2_acc_percpu_u64s((void *) ca->usage[1], nr);
 
 		*dst = *src;
 	}
@@ -554,9 +529,9 @@ static void bch2_gc_done_nocheck(struct bch_fs *c)
 		unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
 			c->replicas.nr;
 		struct bch_fs_usage *dst = (void *)
-			acc_percpu_u64s((void *) c->usage[0], nr);
+			bch2_acc_percpu_u64s((void *) c->usage[0], nr);
 		struct bch_fs_usage *src = (void *)
-			acc_percpu_u64s((void *) c->usage[1], nr);
+			bch2_acc_percpu_u64s((void *) c->usage[1], nr);
 
 		memcpy(&dst->s.gc_start[0],
 		       &src->s.gc_start[0],
@@ -582,6 +557,7 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
 				dst_iter.pos, ##__VA_ARGS__,		\
 				dst->_f, src->_f);			\
 			dst->_f = src->_f;				\
+			dst->dirty = true;				\
 		}
#define copy_bucket_field(_f)						\
 	if (dst->b[b].mark._f != src->b[b].mark._f) {			\
@@ -612,16 +588,18 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
 		while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) &&
 		       (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) {
+			BUG_ON(src_iter.pos != dst_iter.pos);
+
 			copy_stripe_field(alive,	"alive");
 			copy_stripe_field(sectors,	"sectors");
 			copy_stripe_field(algorithm,	"algorithm");
 			copy_stripe_field(nr_blocks,	"nr_blocks");
 			copy_stripe_field(nr_redundant,	"nr_redundant");
-			copy_stripe_field(blocks_nonempty.counter,
+			copy_stripe_field(blocks_nonempty,
 					  "blocks_nonempty");
 
 			for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++)
-				copy_stripe_field(block_sectors[i].counter,
+				copy_stripe_field(block_sectors[i],
 						  "block_sectors[%u]", i);
 
 			if (dst->alive)
@@ -656,9 +634,9 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
 	for_each_member_device(ca, c, i) {
 		unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64);
 		struct bch_dev_usage *dst = (void *)
-			acc_percpu_u64s((void *) ca->usage[0], nr);
+			bch2_acc_percpu_u64s((void *) ca->usage[0], nr);
 		struct bch_dev_usage *src = (void *)
-			acc_percpu_u64s((void *) ca->usage[1], nr);
+			bch2_acc_percpu_u64s((void *) ca->usage[1], nr);
 		unsigned b;
 
 		for (b = 0; b < BCH_DATA_NR; b++)
@@ -678,9 +656,9 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
 		unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
 			c->replicas.nr;
 		struct bch_fs_usage *dst = (void *)
-			acc_percpu_u64s((void *) c->usage[0], nr);
+			bch2_acc_percpu_u64s((void *) c->usage[0], nr);
 		struct bch_fs_usage *src = (void *)
-			acc_percpu_u64s((void *) c->usage[1], nr);
+			bch2_acc_percpu_u64s((void *) c->usage[1], nr);
 
 		copy_fs_field(s.hidden,		"hidden");
 		copy_fs_field(s.data,		"data");
diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h
index 8af5f841..1905acfa 100644
--- a/libbcachefs/btree_gc.h
+++ b/libbcachefs/btree_gc.h
@@ -109,7 +109,7 @@ static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
 	do {
 		seq = read_seqcount_begin(&c->gc_pos_lock);
-		ret = gc_pos_cmp(pos, c->gc_pos) <= 0;
+		ret = gc_pos_cmp(pos, c->gc_pos) < 0;
 	} while (read_seqcount_retry(&c->gc_pos_lock, seq));
 
 	return ret;
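gc_visited() above is a lockless seqcount reader; the patch only tightens its comparison from <= to <. The reader pattern in isolation, a sketch assuming the kernel's <linux/seqlock.h> API and bcachefs's gc_pos type:

static seqcount_t	gc_pos_lock;
static struct gc_pos	gc_pos;

static struct gc_pos gc_pos_read(void)
{
	struct gc_pos ret;
	unsigned seq;

	do {
		seq = read_seqcount_begin(&gc_pos_lock);
		ret = gc_pos;		/* speculative, possibly torn copy */
	} while (read_seqcount_retry(&gc_pos_lock, seq));
	/* retry fires iff a writer ran between begin and here */

	return ret;
}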
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index dd9d2559..4bd07258 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -77,6 +77,7 @@ enum {
 	__BTREE_INSERT_ATOMIC,
 	__BTREE_INSERT_NOUNLOCK,
 	__BTREE_INSERT_NOFAIL,
+	__BTREE_INSERT_NOCHECK_RW,
 	__BTREE_INSERT_USE_RESERVE,
 	__BTREE_INSERT_USE_ALLOC_RESERVE,
 	__BTREE_INSERT_JOURNAL_REPLAY,
@@ -100,6 +101,8 @@ enum {
 /* Don't check for -ENOSPC: */
 #define BTREE_INSERT_NOFAIL	(1 << __BTREE_INSERT_NOFAIL)
 
+#define BTREE_INSERT_NOCHECK_RW	(1 << __BTREE_INSERT_NOCHECK_RW)
+
 /* for copygc, or when merging btree nodes */
 #define BTREE_INSERT_USE_RESERVE	(1 << __BTREE_INSERT_USE_RESERVE)
 #define BTREE_INSERT_USE_ALLOC_RESERVE	(1 << __BTREE_INSERT_USE_ALLOC_RESERVE)
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 7eca9203..0df894fc 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -628,7 +628,8 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
 	trans_for_each_entry(trans, i)
 		btree_insert_entry_checks(c, i);
 
-	if (unlikely(!percpu_ref_tryget(&c->writes)))
+	if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
+		     !percpu_ref_tryget(&c->writes)))
 		return -EROFS;
retry:
 	trans_for_each_iter(trans, i) {
@@ -658,7 +659,8 @@ retry:
 	trans_for_each_iter(trans, i)
 		bch2_btree_iter_downgrade(i->iter);
out:
-	percpu_ref_put(&c->writes);
+	if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
+		percpu_ref_put(&c->writes);
 
 	/* make sure we didn't drop or screw up locks: */
 	trans_for_each_iter(trans, i) {
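BTREE_INSERT_NOCHECK_RW exists so the new read-only path can keep issuing btree updates (alloc info and stripe writeback) after c->writes is being killed. The guard pattern in isolation, a sketch with do_update() standing in for the transaction body:

static int insert_guarded(struct bch_fs *c, unsigned flags)
{
	int ret;

	if (!(flags & BTREE_INSERT_NOCHECK_RW) &&
	    !percpu_ref_tryget(&c->writes))
		return -EROFS;	/* fs went (or is going) read-only */

	ret = do_update(c);	/* hypothetical transaction body */

	if (!(flags & BTREE_INSERT_NOCHECK_RW))
		percpu_ref_put(&c->writes);
	return ret;
}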
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index d33d0bf0..ea71acb5 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -151,7 +151,6 @@ retry:
 	acc_u64s_percpu((u64 *) ret,
 			(u64 __percpu *) c->usage[0],
 			sizeof(*ret) / sizeof(u64) + nr);
-	percpu_up_read_preempt_enable(&c->mark_lock);
 
 	return ret;
 }
@@ -223,13 +222,14 @@ static bool bucket_became_unavailable(struct bucket_mark old,
 	       !is_available_bucket(new);
 }
 
-void bch2_fs_usage_apply(struct bch_fs *c,
-			 struct bch_fs_usage *fs_usage,
-			 struct disk_reservation *disk_res,
-			 struct gc_pos gc_pos)
+int bch2_fs_usage_apply(struct bch_fs *c,
+			struct bch_fs_usage *fs_usage,
+			struct disk_reservation *disk_res,
+			struct gc_pos gc_pos)
 {
 	s64 added = fs_usage->s.data + fs_usage->s.reserved;
 	s64 should_not_have_added;
+	int ret = 0;
 
 	percpu_rwsem_assert_held(&c->mark_lock);
 
@@ -242,6 +242,7 @@ void bch2_fs_usage_apply(struct bch_fs *c,
 		      "disk usage increased without a reservation")) {
 		atomic64_sub(should_not_have_added, &c->sectors_available);
 		added -= should_not_have_added;
+		ret = -1;
 	}
 
 	if (added > 0) {
@@ -259,6 +260,8 @@ void bch2_fs_usage_apply(struct bch_fs *c,
 			       (u64 *) fs_usage,
 			       sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
 	}
+
+	return ret;
 }
 
 static inline void account_bucket(struct bch_fs_usage *fs_usage,
@@ -363,10 +366,7 @@ static inline void update_cached_sectors(struct bch_fs *c,
 {
 	struct bch_replicas_padded r;
 
-	r.e.data_type	= BCH_DATA_CACHED;
-	r.e.nr_devs	= 1;
-	r.e.nr_required	= 1;
-	r.e.devs[0]	= dev;
+	bch2_replicas_entry_cached(&r.e, dev);
 
 	update_replicas(c, fs_usage, &r.e, sectors);
 }
@@ -382,7 +382,8 @@ static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
 	*old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
 		BUG_ON(!is_available_bucket(new));
 
-		new.owned_by_allocator	= 1;
+		new.owned_by_allocator	= true;
+		new.dirty		= true;
 		new.data_type		= 0;
 		new.cached_sectors	= 0;
 		new.dirty_sectors	= 0;
@@ -455,6 +456,7 @@ static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
 	       type != BCH_DATA_JOURNAL);
 
 	bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
+		new.dirty	= true;
 		new.data_type	= type;
 		checked_add(new.dirty_sectors, sectors);
 	}));
@@ -480,13 +482,14 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
 					     true);
 	} else {
 		struct bucket *g;
-		struct bucket_mark old, new;
+		struct bucket_mark new;
 
 		rcu_read_lock();
 
 		g = bucket(ca, b);
-		old = bucket_cmpxchg(g, new, ({
-			new.data_type = type;
+		bucket_cmpxchg(g, new, ({
+			new.dirty	= true;
+			new.data_type	= type;
 			checked_add(new.dirty_sectors, sectors);
 		}));
 
@@ -537,6 +540,8 @@ static void bch2_mark_pointer(struct bch_fs *c,
 	do {
 		new.v.counter = old.v.counter = v;
 
+		new.dirty = true;
+
 		/*
 		 * Check this after reading bucket mark to guard against
 		 * the allocator invalidating a bucket after we've already
@@ -591,9 +596,14 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
 	int blocks_nonempty_delta;
 	s64 parity_sectors;
 
+	BUG_ON(!sectors);
+
 	m = genradix_ptr(&c->stripes[gc], p.idx);
 
+	spin_lock(&c->ec_stripes_heap_lock);
+
 	if (!m || !m->alive) {
+		spin_unlock(&c->ec_stripes_heap_lock);
 		bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
 				    (u64) p.idx);
 		return -1;
@@ -609,19 +619,21 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
 		parity_sectors = -parity_sectors;
 	sectors += parity_sectors;
 
-	new = atomic_add_return(sectors, &m->block_sectors[p.block]);
-	old = new - sectors;
+	old = m->block_sectors[p.block];
+	m->block_sectors[p.block] += sectors;
+	new = m->block_sectors[p.block];
 
 	blocks_nonempty_delta = (int) !!new - (int) !!old;
-	if (!blocks_nonempty_delta)
-		return 0;
+	if (blocks_nonempty_delta) {
+		m->blocks_nonempty += blocks_nonempty_delta;
 
-	atomic_add(blocks_nonempty_delta, &m->blocks_nonempty);
+		if (!gc)
+			bch2_stripes_heap_update(c, m, p.idx);
+	}
 
-	BUG_ON(atomic_read(&m->blocks_nonempty) < 0);
+	m->dirty = true;
 
-	if (!gc)
-		bch2_stripes_heap_update(c, m, p.idx);
+	spin_unlock(&c->ec_stripes_heap_lock);
 
 	update_replicas(c, fs_usage, &m->r.e, sectors);
 
@@ -629,8 +641,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
 }
 
 static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
-			    s64 sectors,
-			    enum bch_data_type data_type,
+			    s64 sectors, enum bch_data_type data_type,
 			    struct bch_fs_usage *fs_usage,
 			    unsigned journal_seq, unsigned flags,
 			    bool gc)
@@ -701,14 +712,13 @@ static void bucket_set_stripe(struct bch_fs *c,
 		BUG_ON(ptr_stale(ca, ptr));
 
 		old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
+			new.dirty			= true;
 			new.stripe			= enabled;
 			if (journal_seq) {
 				new.journal_seq_valid	= 1;
 				new.journal_seq		= journal_seq;
 			}
 		}));
-
-		BUG_ON(old.stripe == enabled);
 	}
 }
 
@@ -723,22 +733,19 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
 	struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
 	unsigned i;
 
+	spin_lock(&c->ec_stripes_heap_lock);
+
 	if (!m || (!inserting && !m->alive)) {
+		spin_unlock(&c->ec_stripes_heap_lock);
 		bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
 				    idx);
 		return -1;
 	}
 
-	if (inserting && m->alive) {
-		bch_err_ratelimited(c, "error marking stripe %zu: already exists",
-				    idx);
-		return -1;
-	}
+	if (m->alive)
+		bch2_stripes_heap_del(c, m, idx);
 
-	BUG_ON(atomic_read(&m->blocks_nonempty));
-
-	for (i = 0; i < EC_STRIPE_MAX; i++)
-		BUG_ON(atomic_read(&m->block_sectors[i]));
+	memset(m, 0, sizeof(*m));
 
 	if (inserting) {
 		m->sectors	= le16_to_cpu(s.v->sectors);
@@ -754,7 +761,6 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
 
 		for (i = 0; i < s.v->nr_blocks; i++)
 			m->r.e.devs[i] = s.v->ptrs[i].dev;
-	}
 
 	/*
 	 * XXX: account for stripes somehow here
@@ -763,15 +769,23 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
 	update_replicas(c, fs_usage, &m->r.e, stripe_sectors);
#endif
 
-	if (!gc) {
-		if (inserting)
+		/* gc recalculates these fields: */
+		if (!(flags & BCH_BUCKET_MARK_GC)) {
+			for (i = 0; i < s.v->nr_blocks; i++) {
+				m->block_sectors[i] =
+					stripe_blockcount_get(s.v, i);
+				m->blocks_nonempty += !!m->block_sectors[i];
+			}
+		}
+
+		if (!gc)
 			bch2_stripes_heap_insert(c, m, idx);
 		else
-			bch2_stripes_heap_del(c, m, idx);
-	} else {
-		m->alive = inserting;
+			m->alive = true;
 	}
 
+	spin_unlock(&c->ec_stripes_heap_lock);
+
 	bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc);
 	return 0;
 }
@@ -879,6 +893,8 @@ void bch2_mark_update(struct btree_insert *trans,
 	struct bch_fs_usage	*fs_usage;
 	struct gc_pos		pos = gc_pos_btree_node(b);
 	struct bkey_packed	*_k;
+	u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
+	static int warned_disk_usage = 0;
 
 	if (!btree_node_type_needs_gc(iter->btree_id))
 		return;
@@ -939,7 +955,37 @@ void bch2_mark_update(struct btree_insert *trans,
 		bch2_btree_node_iter_advance(&node_iter, b);
 	}
 
-	bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos);
+	if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos) &&
+	    !warned_disk_usage &&
+	    !xchg(&warned_disk_usage, 1)) {
+		char buf[200];
+
+		pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors);
+
+		pr_err("while inserting");
+		bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert->k));
+		pr_err("%s", buf);
+		pr_err("overlapping with");
+
+		node_iter = iter->l[0].iter;
+		while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
+							      KEY_TYPE_discard))) {
+			struct bkey		unpacked;
+			struct bkey_s_c		k;
+
+			k = bkey_disassemble(b, _k, &unpacked);
+
+			if (btree_node_is_extents(b)
+			    ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
+			    : bkey_cmp(insert->k->k.p, k.k->p))
+				break;
+
+			bch2_bkey_val_to_text(&PBUF(buf), c, k);
+			pr_err("%s", buf);
+
+			bch2_btree_node_iter_advance(&node_iter, b);
+		}
+	}
 
 	percpu_up_read_preempt_enable(&c->mark_lock);
 }
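bch2_mark_update() now reports reservation overruns with a warn-once guard: the plain load makes the common case free, and xchg() ensures exactly one racing thread prints. The idiom standalone, with the GCC builtin standing in for the kernel's xchg():

#include <stdio.h>

static int warned;

static void warn_once(const char *msg)
{
	/* first check is an unsynchronized fast path; the exchange
	 * arbitrates between racing callers that both saw 0: */
	if (!warned && !__atomic_exchange_n(&warned, 1, __ATOMIC_RELAXED))
		fprintf(stderr, "%s\n", msg);	/* printed exactly once */
}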
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index ebd39e85..6f368172 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -181,6 +181,8 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
 
 struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
 
+void bch2_dev_usage_from_buckets(struct bch_fs *, struct bch_dev *);
+
 static inline u64 __dev_buckets_available(struct bch_dev *ca,
 					  struct bch_dev_usage stats)
 {
@@ -264,8 +266,8 @@ int bch2_mark_key(struct bch_fs *, struct bkey_s_c,
 		  bool, s64, struct gc_pos,
 		  struct bch_fs_usage *, u64, unsigned);
 void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *);
-void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
-			 struct disk_reservation *, struct gc_pos);
+int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
+			struct disk_reservation *, struct gc_pos);
 
 /* disk reservations: */
 
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index 56ceb260..b84ae5c9 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -402,6 +402,8 @@ static long bch2_ioctl_usage(struct bch_fs *c,
 	if (!src)
 		return -ENOMEM;
 
+	percpu_up_read_preempt_enable(&c->mark_lock);
+
 	dst.used		= bch2_fs_sectors_used(c, *src);
 	dst.online_reserved	= src->s.online_reserved;
 
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 755a2603..8018c2bc 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -11,6 +11,7 @@
 #include "ec.h"
 #include "error.h"
 #include "io.h"
+#include "journal_io.h"
 #include "keylist.h"
 #include "super-io.h"
 #include "util.h"
@@ -98,40 +99,6 @@ struct ec_bio {
 
 /* Stripes btree keys: */
 
-static unsigned stripe_csums_per_device(const struct bch_stripe *s)
-{
-	return DIV_ROUND_UP(le16_to_cpu(s->sectors),
-			    1 << s->csum_granularity_bits);
-}
-
-static unsigned stripe_csum_offset(const struct bch_stripe *s,
-				   unsigned dev, unsigned csum_idx)
-{
-	unsigned csum_bytes = bch_crc_bytes[s->csum_type];
-
-	return sizeof(struct bch_stripe) +
-		sizeof(struct bch_extent_ptr) * s->nr_blocks +
-		(dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
-}
-
-static unsigned stripe_blockcount_offset(const struct bch_stripe *s,
-					 unsigned idx)
-{
-	return stripe_csum_offset(s, s->nr_blocks, 0) +
-		sizeof(16) * idx;
-}
-
-static unsigned stripe_val_u64s(const struct bch_stripe *s)
-{
-	return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks),
-			    sizeof(u64));
-}
-
-static void *stripe_csum(struct bch_stripe *s, unsigned dev, unsigned csum_idx)
-{
-	return (void *) s + stripe_csum_offset(s, dev, csum_idx);
-}
-
 const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
 	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
@@ -164,8 +131,9 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
 	       1U << s->csum_granularity_bits);
 
 	for (i = 0; i < s->nr_blocks; i++)
-		pr_buf(out, " %u:%llu", s->ptrs[i].dev,
-		       (u64) s->ptrs[i].offset);
+		pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev,
+		       (u64) s->ptrs[i].offset,
+		       stripe_blockcount_get(s, i));
 }
 
 static int ptr_matches_stripe(struct bch_fs *c,
@@ -609,29 +577,15 @@ static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
 	BUG_ON(h->data[m->heap_idx].idx != idx);
 }
 
-static inline unsigned stripe_entry_blocks(struct stripe *m)
-{
-	return atomic_read(&m->blocks_nonempty);
-}
-
 void bch2_stripes_heap_update(struct bch_fs *c,
 			      struct stripe *m, size_t idx)
 {
 	ec_stripes_heap *h = &c->ec_stripes_heap;
-	bool queue_delete;
 	size_t i;
 
-	spin_lock(&c->ec_stripes_heap_lock);
-
-	if (!m->alive) {
-		spin_unlock(&c->ec_stripes_heap_lock);
-		return;
-	}
-
 	heap_verify_backpointer(c, idx);
 
-	h->data[m->heap_idx].blocks_nonempty =
-		stripe_entry_blocks(m);
+	h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
 
 	i = m->heap_idx;
 	heap_sift_up(h,	  i, ec_stripes_heap_cmp,
@@ -641,44 +595,35 @@ void bch2_stripes_heap_update(struct bch_fs *c,
 
 	heap_verify_backpointer(c, idx);
 
-	queue_delete = stripe_idx_to_delete(c) >= 0;
-	spin_unlock(&c->ec_stripes_heap_lock);
-
-	if (queue_delete)
+	if (stripe_idx_to_delete(c) >= 0)
 		schedule_work(&c->ec_stripe_delete_work);
 }
 
 void bch2_stripes_heap_del(struct bch_fs *c,
			   struct stripe *m, size_t idx)
 {
-	spin_lock(&c->ec_stripes_heap_lock);
 	heap_verify_backpointer(c, idx);
 
 	m->alive = false;
 	heap_del(&c->ec_stripes_heap, m->heap_idx,
 		 ec_stripes_heap_cmp,
 		 ec_stripes_heap_set_backpointer);
-	spin_unlock(&c->ec_stripes_heap_lock);
 }
 
 void bch2_stripes_heap_insert(struct bch_fs *c,
			      struct stripe *m, size_t idx)
 {
-	spin_lock(&c->ec_stripes_heap_lock);
-
 	BUG_ON(heap_full(&c->ec_stripes_heap));
 
 	heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
 			.idx = idx,
-			.blocks_nonempty = stripe_entry_blocks(m),
+			.blocks_nonempty = m->blocks_nonempty,
 		}),
		 ec_stripes_heap_cmp,
		 ec_stripes_heap_set_backpointer);
 	m->alive = true;
 
 	heap_verify_backpointer(c, idx);
-
-	spin_unlock(&c->ec_stripes_heap_lock);
 }
 
 /* stripe deletion */
@@ -1217,6 +1162,116 @@ unlock:
 	mutex_unlock(&c->ec_new_stripe_lock);
 }
 
+static int __bch2_stripe_write_key(struct bch_fs *c,
+				   struct btree_iter *iter,
+				   struct stripe *m,
+				   size_t idx,
+				   struct bkey_i_stripe *new_key,
+				   unsigned flags)
+{
+	struct bkey_s_c k;
+	unsigned i;
+	int ret;
+
+	bch2_btree_iter_set_pos(iter, POS(0, idx));
+
+	k = bch2_btree_iter_peek_slot(iter);
+	ret = btree_iter_err(k);
+	if (ret)
+		return ret;
+
+	if (k.k->type != KEY_TYPE_stripe)
+		return -EIO;
+
+	bkey_reassemble(&new_key->k_i, k);
+
+	spin_lock(&c->ec_stripes_heap_lock);
+
+	for (i = 0; i < new_key->v.nr_blocks; i++)
+		stripe_blockcount_set(&new_key->v, i,
+				      m->block_sectors[i]);
+	m->dirty = false;
+
+	spin_unlock(&c->ec_stripes_heap_lock);
+
+	return bch2_btree_insert_at(c, NULL, NULL,
+				    BTREE_INSERT_NOFAIL|flags,
+				    BTREE_INSERT_ENTRY(iter, &new_key->k_i));
+}
+
+int bch2_stripes_write(struct bch_fs *c, bool *wrote)
+{
+	struct btree_iter iter;
+	struct genradix_iter giter;
+	struct bkey_i_stripe *new_key;
+	struct stripe *m;
+	int ret = 0;
+
+	new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL);
+	BUG_ON(!new_key);
+
+	bch2_btree_iter_init(&iter, c, BTREE_ID_EC, POS_MIN,
+			     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+	genradix_for_each(&c->stripes[0], giter, m) {
+		if (!m->dirty)
+			continue;
+
+		ret = __bch2_stripe_write_key(c, &iter, m, giter.pos,
+					      new_key, BTREE_INSERT_NOCHECK_RW);
+		if (ret)
+			break;
+
+		*wrote = true;
+	}
+
+	bch2_btree_iter_unlock(&iter);
+
+	kfree(new_key);
+
+	return ret;
+}
+
+static void bch2_stripe_read_key(struct bch_fs *c, struct bkey_s_c k)
+{
+
+	struct gc_pos pos = { 0 };
+
+	bch2_mark_key(c, k, true, 0, pos, NULL, 0, 0);
+}
+
+int bch2_stripes_read(struct bch_fs *c, struct list_head *journal_replay_list)
+{
+	struct journal_replay *r;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	ret = bch2_fs_ec_start(c);
+	if (ret)
+		return ret;
+
+	for_each_btree_key(&iter, c, BTREE_ID_EC, POS_MIN, 0, k) {
+		bch2_stripe_read_key(c, k);
+		bch2_btree_iter_cond_resched(&iter);
+	}
+
+	ret = bch2_btree_iter_unlock(&iter);
+	if (ret)
+		return ret;
+
+	list_for_each_entry(r, journal_replay_list, list) {
+		struct bkey_i *k, *n;
+		struct jset_entry *entry;
+
+		for_each_jset_key(k, n, entry, &r->j)
+			if (entry->btree_id == BTREE_ID_EC)
+				bch2_stripe_read_key(c, bkey_i_to_s_c(k));
+	}
+
+	return 0;
+}
+
 int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
 {
 	struct btree_iter iter;
diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h
index c728c52c..28178330 100644
--- a/libbcachefs/ec.h
+++ b/libbcachefs/ec.h
@@ -13,6 +13,55 @@ void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
 	.val_to_text	= bch2_stripe_to_text,		\
 }
 
+static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
+{
+	return DIV_ROUND_UP(le16_to_cpu(s->sectors),
+			    1 << s->csum_granularity_bits);
+}
+
+static inline unsigned stripe_csum_offset(const struct bch_stripe *s,
+					  unsigned dev, unsigned csum_idx)
+{
+	unsigned csum_bytes = bch_crc_bytes[s->csum_type];
+
+	return sizeof(struct bch_stripe) +
+		sizeof(struct bch_extent_ptr) * s->nr_blocks +
+		(dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
+}
+
+static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s,
+						unsigned idx)
+{
+	return stripe_csum_offset(s, s->nr_blocks, 0) +
+		sizeof(u16) * idx;
+}
+
+static inline unsigned stripe_blockcount_get(const struct bch_stripe *s,
+					     unsigned idx)
+{
+	return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx));
+}
+
+static inline void stripe_blockcount_set(struct bch_stripe *s,
+					 unsigned idx, unsigned v)
+{
+	__le16 *p = (void *) s + stripe_blockcount_offset(s, idx);
+
+	*p = cpu_to_le16(v);
+}
+
+static inline unsigned stripe_val_u64s(const struct bch_stripe *s)
+{
+	return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks),
+			    sizeof(u64));
+}
+
+static inline void *stripe_csum(struct bch_stripe *s,
+				unsigned dev, unsigned csum_idx)
+{
+	return (void *) s + stripe_csum_offset(s, dev, csum_idx);
+}
+
 struct bch_read_bio;
 
 struct ec_stripe_buf {
@@ -100,6 +149,9 @@ void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
 
 void bch2_ec_flush_new_stripes(struct bch_fs *);
 
+int bch2_stripes_read(struct bch_fs *, struct list_head *);
+int bch2_stripes_write(struct bch_fs *, bool *);
+
 int bch2_ec_mem_alloc(struct bch_fs *, bool);
 
 int bch2_fs_ec_start(struct bch_fs *);
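The helpers moved into ec.h define the bch_stripe value layout: fixed header, then per-device checksums, then one little-endian u16 block count per block, which is what this patch starts persisting. (The move also fixes the old stripe_blockcount_offset(), which scaled by sizeof(16), i.e. sizeof(int), instead of sizeof(u16).) A worked example with illustrative numbers, assuming the bcachefs types and DIV_ROUND_UP():

static unsigned stripe_layout_demo(void)
{
	/* illustrative parameters, not from the patch: */
	unsigned nr_blocks = 4, sectors = 128;
	unsigned granularity_bits = 5;	/* checksum every 32 sectors */
	unsigned csum_bytes = 4;	/* crc32c */

	unsigned csums_per_dev = DIV_ROUND_UP(sectors,
					      1U << granularity_bits);	/* 4 */
	unsigned csum_start = sizeof(struct bch_stripe) +
		nr_blocks * sizeof(struct bch_extent_ptr);
	unsigned counts_start = csum_start +
		nr_blocks * csums_per_dev * csum_bytes;	/* 64-byte csum area */

	/* one __le16 count per block; round the total up to whole u64s,
	 * as stripe_val_u64s() does: */
	return DIV_ROUND_UP(counts_start + nr_blocks * sizeof(u16),
			    sizeof(u64));
}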
diff --git a/libbcachefs/ec_types.h b/libbcachefs/ec_types.h
index 44c5d382..b4d37705 100644
--- a/libbcachefs/ec_types.h
+++ b/libbcachefs/ec_types.h
@@ -19,9 +19,10 @@ struct stripe {
 	u8			nr_blocks;
 	u8			nr_redundant;
 
-	u8			alive;
-	atomic_t		blocks_nonempty;
-	atomic_t		block_sectors[EC_STRIPE_MAX];
+	unsigned		alive:1;
+	unsigned		dirty:1;
+	u8			blocks_nonempty;
+	u16			block_sectors[EC_STRIPE_MAX];
 
 	struct bch_replicas_padded r;
 };
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 29804168..0f075fa1 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -1664,12 +1664,13 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
 	return ret == BCH_MERGE_MERGE;
 }
 
-int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
+bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
+				unsigned nr_replicas)
 {
 	struct btree_iter iter;
 	struct bpos end = pos;
 	struct bkey_s_c k;
-	int ret = 0;
+	bool ret = true;
 
 	end.offset += size;
 
@@ -1678,8 +1679,8 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
 		if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
 			break;
 
-		if (!bch2_extent_is_fully_allocated(k)) {
-			ret = -ENOSPC;
+		if (nr_replicas > bch2_bkey_nr_ptrs_allocated(k)) {
+			ret = false;
 			break;
 		}
 	}
@@ -1688,6 +1689,29 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
 	return ret;
 }
 
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
+{
+	unsigned ret = 0;
+
+	switch (k.k->type) {
+	case KEY_TYPE_extent: {
+		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+		const union bch_extent_entry *entry;
+		struct extent_ptr_decoded p;
+
+		extent_for_each_ptr_decode(e, p, entry)
+			ret += !p.ptr.cached &&
+				p.crc.compression_type == BCH_COMPRESSION_NONE;
+		break;
+	}
+	case KEY_TYPE_reservation:
+		ret = bkey_s_c_to_reservation(k).v->nr_replicas;
+		break;
+	}
+
+	return ret;
+}
+
 /* KEY_TYPE_reservation: */
 
 const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k)
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 0e6f4a0b..698b2581 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -571,6 +571,7 @@ static inline void extent_save(struct btree *b, struct bkey_packed *dst,
 	BUG_ON(!bch2_bkey_pack_key(dst, src, f));
 }
 
-int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64);
+bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned);
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
 
 #endif /* _BCACHEFS_EXTENTS_H */
diff --git a/libbcachefs/eytzinger.h b/libbcachefs/eytzinger.h
index 66fa227c..d19d809c 100644
--- a/libbcachefs/eytzinger.h
+++ b/libbcachefs/eytzinger.h
@@ -262,18 +262,20 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
 	}
 }
 
-static inline size_t eytzinger0_find(void *base, size_t nr, size_t size,
-				     eytzinger_cmp_fn cmp, const void *search)
-{
-	size_t i = 0;
-	int res;
-
-	while (i < nr &&
-	       (res = cmp(search, base + i * size, size)))
-		i = eytzinger0_child(i, res > 0);
-
-	return i;
-}
+#define eytzinger0_find(base, nr, size, _cmp, search)			\
+({									\
+	void *_base	= (base);					\
+	void *_search	= (search);					\
+	size_t _nr	= (nr);						\
+	size_t _size	= (size);					\
+	size_t _i	= 0;						\
+	int _res;							\
+									\
+	while (_i < _nr &&						\
+	       (_res = _cmp(_search, _base + _i * _size, _size)))	\
+		_i = eytzinger0_child(_i, _res > 0);			\
+	_i;								\
+})
 
 void eytzinger0_sort(void *, size_t, size_t,
		     int (*cmp_func)(const void *, const void *, size_t),
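eytzinger0_find() becoming a macro lets the comparator expand inline instead of going through a function pointer; __replicas_entry_idx() in replicas.c below exploits this with a local entry_cmp #define. Example use against an eytzinger-ordered u64 table, a sketch assuming the surrounding bcachefs headers:

static int cmp_u64_entry(const void *l, const void *r, size_t size)
{
	uint64_t a = *(const uint64_t *) l, b = *(const uint64_t *) r;

	return (a > b) - (a < b);
}

static bool table_contains(uint64_t *tbl, size_t nr, uint64_t v)
{
	size_t idx = eytzinger0_find(tbl, nr, sizeof(tbl[0]),
				     cmp_u64_entry, &v);

	/* idx == nr means the search walked off the tree: not found */
	return idx < nr;
}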
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index c1739f53..2cfc2d9e 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -253,7 +253,9 @@ static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter,
 		BUG_ON(btree_iter_err(old));
 
 		if (allocating &&
-		    !bch2_extent_is_fully_allocated(old))
+		    !*allocating &&
+		    bch2_bkey_nr_ptrs_allocated(old) <
+		    bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new)))
 			*allocating = true;
 
 		delta += (min(new->k.p.offset,
@@ -858,9 +860,7 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
 {
 	struct bvec_iter iter;
 	struct bio_vec bv;
-	unsigned nr_ptrs = !bch2_extent_is_compressed(k)
-		? bch2_bkey_nr_dirty_ptrs(k)
-		: 0;
+	unsigned nr_ptrs = bch2_bkey_nr_ptrs_allocated(k);
 
 	bio_for_each_segment(bv, bio, iter) {
 		/* brand new pages, don't need to be locked: */
@@ -1759,6 +1759,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
 	struct bch_inode_info *inode = dio->iop.inode;
 	struct bio *bio = &dio->iop.op.wbio.bio;
 	struct bio_vec *bv;
+	loff_t offset;
 	bool sync;
 	long ret;
 	int i;
@@ -1770,12 +1771,16 @@ static long bch2_dio_write_loop(struct dio_write *dio)
 	__pagecache_block_get(&mapping->add_lock);
 
 	/* Write and invalidate pagecache range that we're writing to: */
-	ret = write_invalidate_inode_pages_range(mapping, req->ki_pos,
-				req->ki_pos + iov_iter_count(&dio->iter) - 1);
+	offset = req->ki_pos + (dio->iop.op.written << 9);
+	ret = write_invalidate_inode_pages_range(mapping,
+				offset,
+				offset + iov_iter_count(&dio->iter) - 1);
 	if (unlikely(ret))
 		goto err;
 
 	while (1) {
+		offset = req->ki_pos + (dio->iop.op.written << 9);
+
 		BUG_ON(current->pagecache_lock);
 		current->pagecache_lock = &mapping->add_lock;
 		if (kthread)
@@ -1792,13 +1797,12 @@ static long bch2_dio_write_loop(struct dio_write *dio)
 
 		/* gup might have faulted pages back in: */
 		ret = write_invalidate_inode_pages_range(mapping,
-				req->ki_pos + (dio->iop.op.written << 9),
-				req->ki_pos + iov_iter_count(&dio->iter) - 1);
+				offset,
+				offset + bio->bi_iter.bi_size - 1);
 		if (unlikely(ret))
 			goto err;
 
-		dio->iop.op.pos = POS(inode->v.i_ino,
-				(req->ki_pos >> 9) + dio->iop.op.written);
+		dio->iop.op.pos = POS(inode->v.i_ino, offset >> 9);
 
 		task_io_account_write(bio->bi_iter.bi_size);
 
@@ -1878,7 +1882,6 @@ static int bch2_direct_IO_write(struct kiocb *req,
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
 	struct dio_write *dio;
 	struct bio *bio;
-	loff_t offset = req->ki_pos;
 	ssize_t ret;
 
 	lockdep_assert_held(&inode->v.i_rwsem);
@@ -1886,7 +1889,7 @@ static int bch2_direct_IO_write(struct kiocb *req,
 	if (unlikely(!iter->count))
 		return 0;
 
-	if (unlikely((offset|iter->count) & (block_bytes(c) - 1)))
+	if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
 		return -EINVAL;
 
 	bio = bio_alloc_bioset(GFP_KERNEL,
@@ -1898,7 +1901,7 @@ static int bch2_direct_IO_write(struct kiocb *req,
 	dio->mm			= current->mm;
 	dio->loop		= false;
 	dio->sync		= is_sync_kiocb(req) ||
-		offset + iter->count > inode->v.i_size;
+		req->ki_pos + iter->count > inode->v.i_size;
 	dio->free_iov		= false;
 	dio->quota_res.sectors	= 0;
 	dio->iter		= *iter;
@@ -1915,19 +1918,20 @@ static int bch2_direct_IO_write(struct kiocb *req,
 	if (unlikely(ret))
 		goto err;
 
+	dio->iop.op.nr_replicas	= dio->iop.op.opts.data_replicas;
+
 	ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9,
 					dio->iop.op.opts.data_replicas, 0);
 	if (unlikely(ret)) {
-		if (bch2_check_range_allocated(c, POS(inode->v.i_ino,
-						      offset >> 9),
-					       iter->count >> 9))
+		if (!bch2_check_range_allocated(c, POS(inode->v.i_ino,
+						       req->ki_pos >> 9),
+					       iter->count >> 9,
+					       dio->iop.op.opts.data_replicas))
 			goto err;
 
 		dio->iop.unalloc = true;
 	}
 
-	dio->iop.op.nr_replicas	= dio->iop.op.res.nr_replicas;
-
 	return bch2_dio_write_loop(dio);
err:
 	bch2_disk_reservation_put(c, &dio->iop.op.res);
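The dio rewrite derives every pagecache-invalidation range and write position from one rule: the current byte offset is the original ki_pos plus the sectors already written, converted with the 512-byte sector shift. As a helper (hypothetical; the patch open-codes it at each use):

static inline loff_t dio_cur_offset(loff_t ki_pos, u64 sectors_written)
{
	return ki_pos + (sectors_written << 9);	/* 1 sector = 512 bytes */
}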
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 52498627..5cc0651c 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -694,6 +694,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 	}
 
 	list_for_each_entry(i, list, list) {
+		struct bch_replicas_padded replicas;
+		char buf[80];
+
+		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs);
+
 		ret = jset_validate_entries(c, &i->j, READ);
 		if (ret)
 			goto fsck_err;
@@ -705,11 +710,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 
 		if (!degraded &&
 		    (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
-		     fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL,
-						       i->devs, false), c,
-				 "superblock not marked as containing replicas (type %u)",
-				 BCH_DATA_JOURNAL))) {
-			ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs);
+		     fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c,
+				 "superblock not marked as containing replicas %s",
+				 (bch2_replicas_entry_to_text(&PBUF(buf),
+							      &replicas.e), buf)))) {
+			ret = bch2_mark_replicas(c, &replicas.e);
 			if (ret)
 				return ret;
 		}
@@ -1108,6 +1113,7 @@ static void journal_write_done(struct closure *cl)
 	struct journal_buf *w = journal_prev_buf(j);
 	struct bch_devs_list devs =
 		bch2_bkey_devs(bkey_i_to_s_c(&w->key));
+	struct bch_replicas_padded replicas;
 	u64 seq = le64_to_cpu(w->data->seq);
 	u64 last_seq = le64_to_cpu(w->data->last_seq);
 
@@ -1118,7 +1124,9 @@ static void journal_write_done(struct closure *cl)
 		goto err;
 	}
 
-	if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs))
+	bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, devs);
+
+	if (bch2_mark_replicas(c, &replicas.e))
 		goto err;
 
 	spin_lock(&j->lock);
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index 4a997366..a795e888 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -335,7 +335,7 @@ void bch2_journal_reclaim_work(struct work_struct *work)
 	mutex_unlock(&j->reclaim_lock);
 
 	if (!test_bit(BCH_FS_RO, &c->flags))
-		queue_delayed_work(system_freezable_wq, &j->reclaim_work,
+		queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
 				   msecs_to_jiffies(j->reclaim_delay_ms));
 }
 
@@ -387,7 +387,6 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct journal_entry_pin_list *p;
-	struct bch_devs_list devs;
 	u64 iter, seq = 0;
 	int ret = 0;
 
@@ -412,12 +411,15 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
 
 	spin_lock(&j->lock);
 	while (!ret && seq < j->pin.back) {
+		struct bch_replicas_padded replicas;
+
 		seq = max(seq, journal_last_seq(j));
-		devs = journal_seq_pin(j, seq)->devs;
+		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL,
+					 journal_seq_pin(j, seq)->devs);
 		seq++;
 
 		spin_unlock(&j->lock);
-		ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
+		ret = bch2_mark_replicas(c, &replicas.e);
 		spin_lock(&j->lock);
 	}
 	spin_unlock(&j->lock);
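All three journal call sites now share one calling convention: build the replicas entry in a stack-allocated bch_replicas_padded (sized for BCH_REPLICAS_MAX devices) and hand &padded.e to the marking and query functions. The pattern reduced to its skeleton:

static int mark_journal_devs(struct bch_fs *c, struct bch_devs_list devs)
{
	struct bch_replicas_padded replicas;	/* max-size stack buffer */

	bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, devs);
	/* the entry comes back sorted, as bch2_replicas_marked() requires */
	return bch2_mark_replicas(c, &replicas.e);
}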
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index b2198651..bb425d88 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -4,6 +4,7 @@
 
 #include "bcachefs.h"
 #include "btree_update.h"
+#include "btree_update_interior.h"
 #include "buckets.h"
 #include "extents.h"
 #include "io.h"
@@ -152,6 +153,16 @@ retry:
 		bch2_btree_iter_unlock(&iter);
 	}
 
+	/* flush relevant btree updates */
+	while (1) {
+		closure_wait_event(&c->btree_interior_update_wait,
+				   !bch2_btree_interior_updates_nr_pending(c) ||
+				   c->btree_roots_dirty);
+		if (!bch2_btree_interior_updates_nr_pending(c))
+			break;
+		bch2_journal_meta(&c->journal);
+	}
+
 	ret = 0;
out:
 	ret = bch2_replicas_gc_end(c, ret);
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 80909ae4..98cfcefd 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -3,6 +3,7 @@
 #include "alloc_foreground.h"
 #include "btree_gc.h"
 #include "btree_update.h"
+#include "btree_update_interior.h"
 #include "buckets.h"
 #include "disk_groups.h"
 #include "inode.h"
@@ -763,6 +764,16 @@ int bch2_data_job(struct bch_fs *c,
 		ret = bch2_journal_flush_device_pins(&c->journal, -1);
 
 		ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
+
+		while (1) {
+			closure_wait_event(&c->btree_interior_update_wait,
+					   !bch2_btree_interior_updates_nr_pending(c) ||
+					   c->btree_roots_dirty);
+			if (!bch2_btree_interior_updates_nr_pending(c))
+				break;
+			bch2_journal_meta(&c->journal);
+		}
+
 		ret = bch2_gc_btree_replicas(c) ?: ret;
 
 		ret = bch2_move_data(c, NULL,
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index eae38ea7..f5f3f94e 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -214,12 +214,12 @@ int bch2_fs_recovery(struct bch_fs *c)
 	if (ret)
 		goto err;
 
-	set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
-
-	err = "cannot allocate memory";
-	ret = bch2_fs_ec_start(c);
+	ret = bch2_stripes_read(c, &journal);
 	if (ret)
 		goto err;
+	pr_info("stripes_read done");
+
+	set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
 
 	bch_verbose(c, "starting mark and sweep:");
 	err = "error in recovery";
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
index 66ca13aa..230f807b 100644
--- a/libbcachefs/replicas.c
+++ b/libbcachefs/replicas.c
@@ -13,6 +13,16 @@ static inline int u8_cmp(u8 l, u8 r)
 	return (l > r) - (l < r);
 }
 
+static void verify_replicas_entry_sorted(struct bch_replicas_entry *e)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+	unsigned i;
+
+	for (i = 0; i + 1 < e->nr_devs; i++)
+		BUG_ON(e->devs[i] >= e->devs[i + 1]);
+#endif
+}
+
 static void replicas_entry_sort(struct bch_replicas_entry *e)
 {
 	bubble_sort(e->devs, e->nr_devs, u8_cmp);
@@ -23,19 +33,13 @@ static void replicas_entry_sort(struct bch_replicas_entry *e)
 	     (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
 	     _i = (void *) (_i) + (_r)->entry_size)
 
-static inline struct bch_replicas_entry *
-cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
-{
-	return (void *) r->entries + r->entry_size * i;
-}
-
 static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
 {
 	eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
 }
 
-static void replicas_entry_to_text(struct printbuf *out,
-				   struct bch_replicas_entry *e)
+void bch2_replicas_entry_to_text(struct printbuf *out,
+				 struct bch_replicas_entry *e)
 {
 	unsigned i;
 
@@ -60,7 +64,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out,
 			pr_buf(out, " ");
 		first = false;
 
-		replicas_entry_to_text(out, e);
+		bch2_replicas_entry_to_text(out, e);
 	}
 }
 
@@ -100,8 +104,8 @@ static void stripe_to_replicas(struct bkey_s_c k,
 		r->devs[r->nr_devs++] = ptr->dev;
 }
 
-static void bkey_to_replicas(struct bkey_s_c k,
-			     struct bch_replicas_entry *e)
+static void bkey_to_replicas(struct bch_replicas_entry *e,
+			     struct bkey_s_c k)
 {
 	e->nr_devs = 0;
 
@@ -119,11 +123,13 @@ static void bkey_to_replicas(struct bkey_s_c k,
 		stripe_to_replicas(k, e);
 		break;
 	}
+
+	replicas_entry_sort(e);
 }
 
-static inline void devlist_to_replicas(struct bch_devs_list devs,
-				       enum bch_data_type data_type,
-				       struct bch_replicas_entry *e)
+void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
+			      enum bch_data_type data_type,
+			      struct bch_devs_list devs)
 {
 	unsigned i;
@@ -137,6 +143,8 @@ static inline void devlist_to_replicas(struct bch_devs_list devs,
 
 	for (i = 0; i < devs.nr; i++)
 		e->devs[e->nr_devs++] = devs.devs[i];
+
+	replicas_entry_sort(e);
 }
 
 static struct bch_replicas_cpu
@@ -150,6 +158,9 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old,
 			replicas_entry_bytes(new_entry)),
 	};
 
+	BUG_ON(!new_entry->data_type);
+	verify_replicas_entry_sorted(new_entry);
+
 	new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO);
 	if (!new.entries)
 		return new;
@@ -175,13 +186,12 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
 	if (unlikely(entry_size > r->entry_size))
 		return -1;
 
-	replicas_entry_sort(search);
-
-	while (entry_size < r->entry_size)
-		((char *) search)[entry_size++] = 0;
+	verify_replicas_entry_sorted(search);
 
+#define entry_cmp(_l, _r, size)	memcmp(_l, _r, entry_size)
 	idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
-			      memcmp, search);
+			      entry_cmp, search);
+#undef entry_cmp
 
 	return idx < r->nr ? idx : -1;
 }
@@ -189,6 +199,8 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
 int bch2_replicas_entry_idx(struct bch_fs *c,
 			    struct bch_replicas_entry *search)
 {
+	replicas_entry_sort(search);
+
 	return __replicas_entry_idx(&c->replicas, search);
 }
 
@@ -198,12 +210,17 @@ static bool __replicas_has_entry(struct bch_replicas_cpu *r,
 	return __replicas_entry_idx(r, search) >= 0;
 }
 
-static bool replicas_has_entry(struct bch_fs *c,
-			       struct bch_replicas_entry *search,
-			       bool check_gc_replicas)
+bool bch2_replicas_marked(struct bch_fs *c,
+			  struct bch_replicas_entry *search,
+			  bool check_gc_replicas)
 {
 	bool marked;
 
+	if (!search->nr_devs)
+		return true;
+
+	verify_replicas_entry_sorted(search);
+
 	percpu_down_read_preempt_disable(&c->mark_lock);
 	marked = __replicas_has_entry(&c->replicas, search) &&
 		(!check_gc_replicas ||
@@ -214,35 +231,31 @@ static bool replicas_has_entry(struct bch_fs *c,
 	return marked;
 }
 
-static void __replicas_table_update(struct bch_fs_usage __percpu *dst,
+static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p,
 				    struct bch_replicas_cpu *dst_r,
-				    struct bch_fs_usage __percpu *src,
+				    struct bch_fs_usage __percpu *src_p,
 				    struct bch_replicas_cpu *src_r)
 {
-	int src_idx, dst_idx, cpu;
+	unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
+	struct bch_fs_usage *dst, *src = (void *)
+		bch2_acc_percpu_u64s((void *) src_p, src_nr);
+	int src_idx, dst_idx;
+
+	preempt_disable();
+	dst = this_cpu_ptr(dst_p);
+	preempt_enable();
+
+	*dst = *src;
 
 	for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
-		u64 *dst_v, src_v = 0;
-
-		for_each_possible_cpu(cpu)
-			src_v += *per_cpu_ptr(&src->data[src_idx], cpu);
+		if (!src->data[src_idx])
+			continue;
 
 		dst_idx = __replicas_entry_idx(dst_r,
 				cpu_replicas_entry(src_r, src_idx));
+		BUG_ON(dst_idx < 0);
 
-		if (dst_idx < 0) {
-			BUG_ON(src_v);
-			continue;
-		}
-
-		preempt_disable();
-
-		dst_v = this_cpu_ptr(&dst->data[dst_idx]);
-		BUG_ON(*dst_v);
-
-		*dst_v = src_v;
-
-		preempt_enable();
+		dst->data[dst_idx] = src->data[src_idx];
 	}
 }
@@ -344,30 +357,32 @@ err:
 	return ret;
 }
 
-static int __bch2_mark_replicas(struct bch_fs *c,
-				struct bch_replicas_entry *devs)
+int bch2_mark_replicas(struct bch_fs *c,
+		       struct bch_replicas_entry *r)
 {
-	return likely(replicas_has_entry(c, devs, true))
+	return likely(bch2_replicas_marked(c, r, true))
 		? 0
-		: bch2_mark_replicas_slowpath(c, devs);
+		: bch2_mark_replicas_slowpath(c, r);
 }
 
-int bch2_mark_replicas(struct bch_fs *c,
-		       enum bch_data_type data_type,
-		       struct bch_devs_list devs)
+bool bch2_bkey_replicas_marked(struct bch_fs *c,
+			       struct bkey_s_c k,
+			       bool check_gc_replicas)
 {
 	struct bch_replicas_padded search;
+	struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+	unsigned i;
 
-	if (!devs.nr)
-		return 0;
+	for (i = 0; i < cached.nr; i++) {
+		bch2_replicas_entry_cached(&search.e, cached.devs[i]);
 
-	memset(&search, 0, sizeof(search));
+		if (!bch2_replicas_marked(c, &search.e, check_gc_replicas))
+			return false;
+	}
 
-	BUG_ON(devs.nr >= BCH_REPLICAS_MAX);
+	bkey_to_replicas(&search.e, k);
 
-	devlist_to_replicas(devs, data_type, &search.e);
-
-	return __bch2_mark_replicas(c, &search.e);
+	return bch2_replicas_marked(c, &search.e, check_gc_replicas);
 }
 
 int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
@@ -377,18 +392,17 @@ int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
 	unsigned i;
 	int ret;
 
-	memset(&search, 0, sizeof(search));
+	for (i = 0; i < cached.nr; i++) {
+		bch2_replicas_entry_cached(&search.e, cached.devs[i]);
 
-	for (i = 0; i < cached.nr; i++)
-		if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
-					      bch2_dev_list_single(cached.devs[i]))))
+		ret = bch2_mark_replicas(c, &search.e);
+		if (ret)
 			return ret;
+	}
 
-	bkey_to_replicas(k, &search.e);
+	bkey_to_replicas(&search.e, k);
 
-	return search.e.nr_devs
-		? __bch2_mark_replicas(c, &search.e)
-		: 0;
+	return bch2_mark_replicas(c, &search.e);
 }
 
 int bch2_replicas_gc_end(struct bch_fs *c, int ret)
@@ -749,7 +763,7 @@ static void bch2_sb_replicas_to_text(struct printbuf *out,
 			pr_buf(out, " ");
 		first = false;
 
-		replicas_entry_to_text(out, e);
+		bch2_replicas_entry_to_text(out, e);
 	}
 }
 
@@ -798,46 +812,6 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
 
 /* Query replicas: */
 
-bool bch2_replicas_marked(struct bch_fs *c,
-			  enum bch_data_type data_type,
-			  struct bch_devs_list devs,
-			  bool check_gc_replicas)
-{
-	struct bch_replicas_padded search;
-
-	if (!devs.nr)
-		return true;
-
-	memset(&search, 0, sizeof(search));
-
-	devlist_to_replicas(devs, data_type, &search.e);
-
-	return replicas_has_entry(c, &search.e, check_gc_replicas);
-}
-
-bool bch2_bkey_replicas_marked(struct bch_fs *c,
-			       struct bkey_s_c k,
-			       bool check_gc_replicas)
-{
-	struct bch_replicas_padded search;
-	struct bch_devs_list cached = bch2_bkey_cached_devs(k);
-	unsigned i;
-
-	memset(&search, 0, sizeof(search));
-
-	for (i = 0; i < cached.nr; i++)
-		if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
-					  bch2_dev_list_single(cached.devs[i]),
-					  check_gc_replicas))
-			return false;
-
-	bkey_to_replicas(k, &search.e);
-
-	return search.e.nr_devs
-		? replicas_has_entry(c, &search.e, check_gc_replicas)
-		: true;
-}
-
 struct replicas_status __bch2_replicas_status(struct bch_fs *c,
 					      struct bch_devs_mask online_devs)
 {
diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h
index fc833653..0ac2b8e0 100644
--- a/libbcachefs/replicas.h
+++ b/libbcachefs/replicas.h
@@ -4,17 +4,39 @@
 #include "eytzinger.h"
 #include "replicas_types.h"
 
+void bch2_replicas_entry_to_text(struct printbuf *,
+				 struct bch_replicas_entry *);
+void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
+
+static inline struct bch_replicas_entry *
+cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
+{
+	return (void *) r->entries + r->entry_size * i;
+}
+
 int bch2_replicas_entry_idx(struct bch_fs *,
 			    struct bch_replicas_entry *);
-bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type,
-			  struct bch_devs_list, bool);
+
+void bch2_devlist_to_replicas(struct bch_replicas_entry *,
+			      enum bch_data_type,
+			      struct bch_devs_list);
+bool bch2_replicas_marked(struct bch_fs *,
+			  struct bch_replicas_entry *, bool);
+int bch2_mark_replicas(struct bch_fs *,
+		       struct bch_replicas_entry *);
+
 bool bch2_bkey_replicas_marked(struct bch_fs *,
 			       struct bkey_s_c, bool);
-int bch2_mark_replicas(struct bch_fs *, enum bch_data_type,
-		       struct bch_devs_list);
 int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c);
 
-void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
+static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
+					      unsigned dev)
+{
+	e->data_type	= BCH_DATA_CACHED;
+	e->nr_devs	= 1;
+	e->nr_required	= 1;
+	e->devs[0]	= dev;
+}
 
 struct replicas_status {
 	struct {
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index a539f2a8..1835b535 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -205,7 +205,9 @@ int bch2_congested(void *data, int bdi_bits)
 static void __bch2_fs_read_only(struct bch_fs *c)
 {
 	struct bch_dev *ca;
+	bool wrote;
 	unsigned i;
+	int ret;
 
 	bch2_rebalance_stop(c);
 
@@ -220,23 +222,42 @@ static void __bch2_fs_read_only(struct bch_fs *c)
 	 */
 	bch2_journal_flush_all_pins(&c->journal);
 
+	do {
+		ret = bch2_alloc_write(c, false, &wrote);
+		if (ret) {
+			bch2_fs_inconsistent(c, "error writing out alloc info %i", ret);
+			break;
+		}
+
+		ret = bch2_stripes_write(c, &wrote);
+		if (ret) {
+			bch2_fs_inconsistent(c, "error writing out stripes");
+			break;
+		}
+
+		for_each_member_device(ca, c, i)
+			bch2_dev_allocator_quiesce(c, ca);
+
+		bch2_journal_flush_all_pins(&c->journal);
+
+		/*
+		 * We need to explicitly wait on btree interior updates to complete
+		 * before stopping the journal, flushing all journal pins isn't
+		 * sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree
+		 * interior updates have to drop their journal pin before they're
+		 * fully complete:
+		 */
+		closure_wait_event(&c->btree_interior_update_wait,
+				   !bch2_btree_interior_updates_nr_pending(c));
+	} while (wrote);
+
 	for_each_member_device(ca, c, i)
 		bch2_dev_allocator_stop(ca);
 
-	bch2_journal_flush_all_pins(&c->journal);
-
-	/*
-	 * We need to explicitly wait on btree interior updates to complete
-	 * before stopping the journal, flushing all journal pins isn't
-	 * sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree
-	 * interior updates have to drop their journal pin before they're
-	 * fully complete:
-	 */
-	closure_wait_event(&c->btree_interior_update_wait,
-			   !bch2_btree_interior_updates_nr_pending(c));
-
 	bch2_fs_journal_stop(&c->journal);
 
+	/* XXX: mark super that alloc info is persistent */
+
 	/*
 	 * the journal kicks off btree writes via reclaim - wait for in flight
 	 * writes after stopping journal:
 	 */
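The rewritten __bch2_fs_read_only() is now a fixpoint loop rather than a single flush: writing out alloc info and stripes can itself dirty btree nodes and take new journal pins, so the write/quiesce cycle repeats until an entire pass writes nothing, and only then are the allocators and the journal stopped. The control flow in isolation, with stubs standing in for bch2_alloc_write()/bch2_stripes_write() and the quiesce steps:

	#include <stdbool.h>

	/* Stub: write out dirty state, report whether anything was written: */
	static bool write_dirty_state(void)
	{
		return false;
	}

	/* Stub: flush journal pins, wait for interior btree updates: */
	static void quiesce(void)
	{
	}

	static void read_only_fixpoint(void)
	{
		bool wrote;

		do {
			wrote = write_dirty_state();
			quiesce();
		} while (wrote);	/* stop only after a pass that was a no-op */
	}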
@@ -420,6 +441,8 @@ static void bch2_fs_free(struct bch_fs *c)
 	kfree(c->replicas_gc.entries);
 	kfree(rcu_dereference_protected(c->disk_groups, 1));
 
+	if (c->journal_reclaim_wq)
+		destroy_workqueue(c->journal_reclaim_wq);
 	if (c->copygc_wq)
 		destroy_workqueue(c->copygc_wq);
 	if (c->wq)
@@ -638,6 +661,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 			WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
 	    !(c->copygc_wq = alloc_workqueue("bcache_copygc",
 			WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
+	    !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal",
+			WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
 	    percpu_ref_init(&c->writes, bch2_writes_disabled, 0, GFP_KERNEL) ||
 	    mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
 				      sizeof(struct btree_reserve)) ||
@@ -1297,8 +1322,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
 	if (data) {
 		char data_has_str[100];
 
-		bch2_string_opt_to_text(&PBUF(data_has_str),
-					bch2_data_types, data);
+		bch2_flags_to_text(&PBUF(data_has_str),
+				   bch2_data_types, data);
 		bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
 		ret = -EBUSY;
 		goto err;
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 2e6e9bd5..40384e7e 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -234,17 +234,17 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
 {
 	struct printbuf out = _PBUF(buf, PAGE_SIZE);
 	struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c);
-	unsigned replicas;
+	unsigned i;
 
 	if (!fs_usage)
 		return -ENOMEM;
 
 	pr_buf(&out, "capacity:\t\t%llu\n", c->capacity);
 
-	for (replicas = 0;
-	     replicas < ARRAY_SIZE(fs_usage->persistent_reserved);
-	     replicas++) {
-		pr_buf(&out, "%u replicas:\n", replicas + 1);
+	for (i = 0;
+	     i < ARRAY_SIZE(fs_usage->persistent_reserved);
+	     i++) {
+		pr_buf(&out, "%u replicas:\n", i + 1);
 #if 0
 		for (type = BCH_DATA_SB; type < BCH_DATA_NR; type++)
 			pr_buf(&out, "\t%s:\t\t%llu\n",
@@ -254,12 +254,23 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
 			       stats.replicas[replicas].ec_data);
 #endif
 		pr_buf(&out, "\treserved:\t%llu\n",
-		       fs_usage->persistent_reserved[replicas]);
+		       fs_usage->persistent_reserved[i]);
 	}
 
 	pr_buf(&out, "online reserved:\t%llu\n",
 	       fs_usage->s.online_reserved);
 
+	for (i = 0; i < c->replicas.nr; i++) {
+		struct bch_replicas_entry *e =
+			cpu_replicas_entry(&c->replicas, i);
+
+		pr_buf(&out, "\t");
+		bch2_replicas_entry_to_text(&out, e);
+		pr_buf(&out, ":\t%llu\n", fs_usage->data[i]);
+	}
+
+	percpu_up_read_preempt_enable(&c->mark_lock);
+
 	kfree(fs_usage);
 
 	return out.pos - buf;
@@ -797,6 +808,12 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
 {
 	struct bch_fs *c = ca->fs;
 	struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
+	unsigned i, nr[BCH_DATA_NR];
+
+	memset(nr, 0, sizeof(nr));
+
+	for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
+		nr[c->open_buckets[i].type]++;
 
 	return scnprintf(buf, PAGE_SIZE,
 		"free_inc:               %zu/%zu\n"
@@ -823,7 +840,10 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
 		"copygc threshold:       %llu\n"
 		"freelist_wait:          %s\n"
 		"open buckets:           %u/%u (reserved %u)\n"
-		"open_buckets_wait:      %s\n",
+		"open_buckets_wait:      %s\n"
+		"open_buckets_btree:     %u\n"
+		"open_buckets_user:      %u\n"
+		"btree reserve cache:    %u\n",
 		fifo_used(&ca->free_inc),		ca->free_inc.size,
 		fifo_used(&ca->free[RESERVE_BTREE]),	ca->free[RESERVE_BTREE].size,
 		fifo_used(&ca->free[RESERVE_MOVINGGC]),	ca->free[RESERVE_MOVINGGC].size,
@@ -845,8 +865,12 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
 		stats.sectors_fragmented,
 		ca->copygc_threshold,
 		c->freelist_wait.list.first		? "waiting" : "empty",
-		c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE,
-		c->open_buckets_wait.list.first		? "waiting" : "empty");
+		c->open_buckets_nr_free, OPEN_BUCKETS_COUNT,
+		BTREE_NODE_OPEN_BUCKET_RESERVE,
+		c->open_buckets_wait.list.first		? "waiting" : "empty",
+		nr[BCH_DATA_BTREE],
+		nr[BCH_DATA_USER],
+		c->btree_reserve_cache_nr);
 }
 
 static const char * const bch2_rw[] = {
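The new per-type open bucket counts in show_dev_alloc_debug() come from one pass over the fixed-size open_buckets array, bucketing by each open bucket's type into nr[BCH_DATA_NR]. The same histogram pattern in standalone form (the enum and array bound here are stand-ins, not bcachefs's):

	enum obj_type { TYPE_NONE, TYPE_BTREE, TYPE_USER, TYPE_NR };

	struct open_bucket_sketch {
		enum obj_type	type;
	};

	/* One counter per type, filled in a single pass: */
	static void count_by_type(const struct open_bucket_sketch *obs,
				  unsigned nr_obs, unsigned nr[TYPE_NR])
	{
		for (unsigned i = 0; i < TYPE_NR; i++)
			nr[i] = 0;

		for (unsigned i = 0; i < nr_obs; i++)
			nr[obs[i].type]++;
	}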
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 5c060e77..fea80e24 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -133,6 +133,7 @@ void bch2_flags_to_text(struct printbuf *out,
 			const char * const list[], u64 flags)
 {
 	unsigned bit, nr = 0;
+	bool first = true;
 
 	if (out->pos != out->end)
 		*out->pos = '\0';
@@ -141,7 +142,10 @@ void bch2_flags_to_text(struct printbuf *out,
 		nr++;
 
 	while (flags && (bit = __ffs(flags)) < nr) {
-		pr_buf(out, "%s,", list[bit]);
+		if (!first)
+			pr_buf(out, ",");
+		first = false;
+		pr_buf(out, "%s", list[bit]);
 		flags ^= 1 << bit;
 	}
 }
@@ -894,3 +898,28 @@ void eytzinger0_find_test(void)
 	kfree(test_array);
 }
 #endif
+
+/*
+ * Accumulate percpu counters onto one cpu's copy - only valid when
+ * concurrent access to the percpu counters is guarded against
+ */
+u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
+{
+	u64 *ret;
+	int cpu;
+
+	preempt_disable();
+	ret = this_cpu_ptr(p);
+	preempt_enable();
+
+	for_each_possible_cpu(cpu) {
+		u64 *i = per_cpu_ptr(p, cpu);
+
+		if (i != ret) {
+			acc_u64s(ret, i, nr);
+			memset(i, 0, nr * sizeof(u64));
+		}
+	}
+
+	return ret;
+}
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index 25d67509..fbfb2085 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -715,4 +715,6 @@ static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src,
 		acc_u64s(acc, per_cpu_ptr(src, cpu), nr);
 }
 
+u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned);
+
 #endif /* _BCACHEFS_UTIL_H */
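bch2_acc_percpu_u64s() treats a percpu allocation as nr u64s per cpu, folds every other cpu's copy into one cpu's copy, zeroes the others, and returns the copy holding the totals; __replicas_table_update() relies on this so it can copy usage counters into the resized table as plain u64s. A userspace analogue, with a flat [nr_cpus][nr] array standing in for per_cpu_ptr():

	#include <stdint.h>
	#include <string.h>

	/* Fold every cpu's counters into cpu 0's copy, zeroing the others,
	 * and return the copy holding the totals: */
	static uint64_t *acc_percpu_sketch(uint64_t *p, unsigned nr_cpus,
					   unsigned nr)
	{
		uint64_t *ret = &p[0];

		for (unsigned cpu = 1; cpu < nr_cpus; cpu++) {
			uint64_t *src = &p[cpu * nr];

			for (unsigned i = 0; i < nr; i++)
				ret[i] += src[i];
			memset(src, 0, nr * sizeof(*src));
		}

		return ret;
	}

As with the kernel version, this is only valid while nothing else is updating the counters; the caller has to provide that exclusion.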
diff --git a/linux/generic-radix-tree.c b/linux/generic-radix-tree.c
index 5c4a275e..4f43d0bb 100644
--- a/linux/generic-radix-tree.c
+++ b/linux/generic-radix-tree.c
@@ -1,4 +1,5 @@
 
+#include <linux/atomic.h>
 #include <linux/export.h>
 #include <linux/generic-radix-tree.h>
 #include <linux/gfp.h>
@@ -16,7 +17,7 @@ struct genradix_node {
 	};
 };
 
-static inline unsigned genradix_depth_shift(unsigned depth)
+static inline int genradix_depth_shift(unsigned depth)
 {
 	return PAGE_SHIFT + GENRADIX_ARY_SHIFT * depth;
 }
@@ -29,16 +30,34 @@ static inline size_t genradix_depth_size(unsigned depth)
 	return 1UL << genradix_depth_shift(depth);
 }
 
+/* depth that's needed for a genradix that can address up to ULONG_MAX: */
+#define GENRADIX_MAX_DEPTH \
+	DIV_ROUND_UP(BITS_PER_LONG - PAGE_SHIFT, GENRADIX_ARY_SHIFT)
+
+#define GENRADIX_DEPTH_MASK				\
+	((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
+
+unsigned genradix_root_to_depth(struct genradix_root *r)
+{
+	return (unsigned long) r & GENRADIX_DEPTH_MASK;
+}
+
+struct genradix_node *genradix_root_to_node(struct genradix_root *r)
+{
+	return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
+}
+
 /*
  * Returns pointer to the specified byte @offset within @radix, or NULL if not
  * allocated
  */
 void *__genradix_ptr(struct __genradix *radix, size_t offset)
 {
-	size_t level = radix->depth;
-	struct genradix_node *n = radix->root;
+	struct genradix_root *r = READ_ONCE(radix->root);
+	struct genradix_node *n = genradix_root_to_node(r);
+	unsigned level = genradix_root_to_depth(r);
 
-	if (offset >= genradix_depth_size(radix->depth))
+	if (ilog2(offset) >= genradix_depth_shift(level))
 		return NULL;
 
 	while (1) {
@@ -64,43 +83,60 @@ EXPORT_SYMBOL(__genradix_ptr);
 void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset,
 			   gfp_t gfp_mask)
 {
-	struct genradix_node **n;
-	size_t level;
+	struct genradix_root *v = READ_ONCE(radix->root);
+	struct genradix_node *n, *new_node = NULL;
+	unsigned level;
 
 	/* Increase tree depth if necessary: */
-
-	while (offset >= genradix_depth_size(radix->depth)) {
-		struct genradix_node *new_root =
-			(void *) __get_free_page(gfp_mask|__GFP_ZERO);
-
-		if (!new_root)
-			return NULL;
-
-		new_root->children[0] = radix->root;
-		radix->root = new_root;
-		radix->depth++;
-	}
-
-	n = &radix->root;
-	level = radix->depth;
-
 	while (1) {
-		if (!*n) {
-			*n = (void *) __get_free_page(gfp_mask|__GFP_ZERO);
-			if (!*n)
+		struct genradix_root *r = v, *new_root;
+
+		n = genradix_root_to_node(r);
+		level = genradix_root_to_depth(r);
+
+		if (n && ilog2(offset) < genradix_depth_shift(level))
+			break;
+
+		if (!new_node) {
+			new_node = (void *)
+				__get_free_page(gfp_mask|__GFP_ZERO);
+			if (!new_node)
 				return NULL;
 		}
 
-		if (!level)
-			break;
+		new_node->children[0] = n;
+		new_root = ((struct genradix_root *)
+			    ((unsigned long) new_node | (n ? level + 1 : 0)));
 
-		level--;
-
-		n = &(*n)->children[offset >> genradix_depth_shift(level)];
-		offset &= genradix_depth_size(level) - 1;
+		if ((v = cmpxchg_release(&radix->root, r, new_root)) == r) {
+			v = new_root;
+			new_node = NULL;
+		}
 	}
 
-	return &(*n)->data[offset];
+	while (level--) {
+		struct genradix_node **p =
+			&n->children[offset >> genradix_depth_shift(level)];
+		offset &= genradix_depth_size(level) - 1;
+
+		n = READ_ONCE(*p);
+		if (!n) {
+			if (!new_node) {
+				new_node = (void *)
+					__get_free_page(gfp_mask|__GFP_ZERO);
+				if (!new_node)
+					return NULL;
+			}
+
+			if (!(n = cmpxchg_release(p, NULL, new_node)))
+				swap(n, new_node);
+		}
+	}
+
+	if (new_node)
+		free_page((unsigned long) new_node);
+
+	return &n->data[offset];
 }
 EXPORT_SYMBOL(__genradix_ptr_alloc);
 
@@ -108,17 +144,19 @@ void *__genradix_iter_peek(struct genradix_iter *iter,
 			   struct __genradix *radix,
 			   size_t objs_per_page)
 {
+	struct genradix_root *r;
 	struct genradix_node *n;
-	size_t level, i;
-
-	if (!radix->root)
-		return NULL;
+	unsigned level, i;
 restart:
-	if (iter->offset >= genradix_depth_size(radix->depth))
+	r = READ_ONCE(radix->root);
+	if (!r)
 		return NULL;
 
-	n = radix->root;
-	level = radix->depth;
+	n = genradix_root_to_node(r);
+	level = genradix_root_to_depth(r);
+
+	if (ilog2(iter->offset) >= genradix_depth_shift(level))
+		return NULL;
 
 	while (level) {
 		level--;
@@ -157,11 +195,24 @@ static void genradix_free_recurse(struct genradix_node *n, unsigned level)
 		free_page((unsigned long) n);
 }
 
+int __genradix_prealloc(struct __genradix *radix, size_t size,
+			gfp_t gfp_mask)
+{
+	size_t offset;
+
+	for (offset = 0; offset < size; offset += PAGE_SIZE)
+		if (!__genradix_ptr_alloc(radix, offset, gfp_mask))
+			return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL(__genradix_prealloc);
+
 void __genradix_free(struct __genradix *radix)
 {
-	genradix_free_recurse(radix->root, radix->depth);
+	struct genradix_root *r = xchg(&radix->root, NULL);
 
-	radix->root = NULL;
-	radix->depth = 0;
+	genradix_free_recurse(genradix_root_to_node(r),
+			      genradix_root_to_depth(r));
 }
EXPORT_SYMBOL(__genradix_free);
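The new genradix root packs the tree depth into the low bits of the root node pointer: nodes come from __get_free_page() and are page-aligned, so the low PAGE_SHIFT bits are free, and by the GENRADIX_DEPTH_MASK formula above a 64-bit machine with 4k pages and 512-way nodes needs a mask of just 7 for a GENRADIX_MAX_DEPTH of 6. Packing node and depth into one word is what makes the lockless scheme work: readers snapshot both with a single READ_ONCE(), and writers publish a fully-zeroed new node together with its depth in one cmpxchg_release(), whose release ordering guarantees the node's contents are visible before the pointer is. A standalone illustration of just the packing (the names and the hard-coded mask are this sketch's, not the diff's):

	#include <assert.h>
	#include <stdint.h>
	#include <stdlib.h>

	#define PAGE_SIZE_SKETCH	4096UL
	#define DEPTH_MASK		7UL	/* roundup_pow_of_two(6 + 1) - 1 */

	static void *root_pack(void *node, unsigned depth)
	{
		/* node must be page-aligned so the low bits are free: */
		assert(((uintptr_t) node & DEPTH_MASK) == 0);
		assert(depth <= DEPTH_MASK);

		return (void *) ((uintptr_t) node | depth);
	}

	static void *root_to_node(void *root)
	{
		return (void *) ((uintptr_t) root & ~DEPTH_MASK);
	}

	static unsigned root_to_depth(void *root)
	{
		return (uintptr_t) root & DEPTH_MASK;
	}

	int main(void)
	{
		void *node;

		if (posix_memalign(&node, PAGE_SIZE_SKETCH, PAGE_SIZE_SKETCH))
			return 1;

		void *root = root_pack(node, 3);

		assert(root_to_node(root) == node);
		assert(root_to_depth(root) == 3);
		free(node);
		return 0;
	}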