From a4eb187a6f0af8041ae2128e6ee82ab7a43cb87c Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Thu, 28 Feb 2019 21:34:16 -0500
Subject: [PATCH] Update bcachefs sources to 75e8a078b8 bcachefs: improved
 flush_held_btree_writes()

---
 .bcachefs_revision                  |   2 +-
 cmd_migrate.c                       |   3 +-
 libbcachefs/alloc_background.c      | 255 ++++++++++++----
 libbcachefs/alloc_background.h      |   9 +
 libbcachefs/alloc_foreground.c      |   2 +-
 libbcachefs/bcachefs.h              |   2 -
 libbcachefs/bcachefs_format.h       |  15 +-
 libbcachefs/btree_gc.c              | 178 +++++------
 libbcachefs/btree_types.h           |   2 +-
 libbcachefs/btree_update.h          |   9 +-
 libbcachefs/btree_update_interior.c |  29 +-
 libbcachefs/btree_update_leaf.c     | 103 +++----
 libbcachefs/buckets.c               | 358 +++++++++++-----------
 libbcachefs/buckets.h               |  55 ++--
 libbcachefs/buckets_types.h         |  29 +-
 libbcachefs/chardev.c               |   6 +-
 libbcachefs/extents.c               |   8 +-
 libbcachefs/fifo.h                  |   2 +-
 libbcachefs/journal.c               | 446 +++++++++++++++-------------
 libbcachefs/journal.h               |  29 +-
 libbcachefs/journal_io.c            | 138 +++------
 libbcachefs/journal_io.h            |   1 -
 libbcachefs/journal_reclaim.c       | 402 +++++++++++++++----------
 libbcachefs/journal_reclaim.h       |   7 +-
 libbcachefs/journal_types.h         |  27 +-
 libbcachefs/recovery.c              |  15 +-
 libbcachefs/replicas.c              |  59 ++--
 libbcachefs/str_hash.h              |  40 +--
 libbcachefs/super-io.c              |   6 +-
 libbcachefs/super.c                 |   7 +-
 libbcachefs/sysfs.c                 |  14 +-
 31 files changed, 1221 insertions(+), 1037 deletions(-)

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 39d11479..6766622b 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-09a546543006b60d44c4c51e7b40cd3ec7837a5e
+75e8a078b85703322fcf558f75a6845c0ef5dbb0
diff --git a/cmd_migrate.c b/cmd_migrate.c
index e9594ab7..4b6ceaa7 100644
--- a/cmd_migrate.c
+++ b/cmd_migrate.c
@@ -319,6 +319,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
 	struct bkey_i_extent *e;
 	BKEY_PADDED(k) k;
 	u64 b = sector_to_bucket(ca, physical);
+	struct bucket_mark m;
 	struct disk_reservation res;
 	unsigned sectors;
 	int ret;
@@ -337,7 +338,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
 		.gen = bucket(ca, b)->mark.gen,
 	});
 
-	bucket_set_dirty(ca, b);
+	bucket_cmpxchg(bucket(ca, b), m, m.dirty = true);
 
 	ret = bch2_disk_reservation_get(c, &res, sectors, 1,
 					BCH_DISK_RESERVATION_NOFAIL);
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index ce42202f..f246319b 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -128,6 +128,34 @@ static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
 	*p += bytes;
 }
 
+struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *a)
+{
+	struct bkey_alloc_unpacked ret = { .gen = a->gen };
+	const void *d = a->data;
+	unsigned idx = 0;
+
+#define x(_name, _bits)	ret._name = get_alloc_field(a, &d, idx++);
+	BCH_ALLOC_FIELDS()
+#undef x
+	return ret;
+}
+
+static void bch2_alloc_pack(struct bkey_i_alloc *dst,
+			    const struct bkey_alloc_unpacked src)
+{
+	unsigned idx = 0;
+	void *d = dst->v.data;
+
+	dst->v.fields = 0;
+	dst->v.gen = src.gen;
+
+#define x(_name, _bits)	put_alloc_field(dst, &d, idx++, src._name);
+	BCH_ALLOC_FIELDS()
+#undef x
+
+	set_bkey_val_bytes(&dst->k, (void *) d - (void *) &dst->v);
+}
+
 static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
 {
 	unsigned i, bytes = offsetof(struct bch_alloc, data);
@@ -173,15 +201,24 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
 static void __alloc_read_key(struct bucket *g, const struct bch_alloc *a)
 {
 	const void *d = a->data;
-
unsigned idx = 0; + unsigned idx = 0, data_type, dirty_sectors, cached_sectors; + struct bucket_mark m; - g->_mark.gen = a->gen; - g->gen_valid = 1; g->io_time[READ] = get_alloc_field(a, &d, idx++); g->io_time[WRITE] = get_alloc_field(a, &d, idx++); - g->_mark.data_type = get_alloc_field(a, &d, idx++); - g->_mark.dirty_sectors = get_alloc_field(a, &d, idx++); - g->_mark.cached_sectors = get_alloc_field(a, &d, idx++); + data_type = get_alloc_field(a, &d, idx++); + dirty_sectors = get_alloc_field(a, &d, idx++); + cached_sectors = get_alloc_field(a, &d, idx++); + g->oldest_gen = get_alloc_field(a, &d, idx++); + + bucket_cmpxchg(g, m, ({ + m.gen = a->gen; + m.data_type = data_type; + m.dirty_sectors = dirty_sectors; + m.cached_sectors = cached_sectors; + })); + + g->gen_valid = 1; } static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g, @@ -199,6 +236,7 @@ static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g, put_alloc_field(a, &d, idx++, m.data_type); put_alloc_field(a, &d, idx++, m.dirty_sectors); put_alloc_field(a, &d, idx++, m.cached_sectors); + put_alloc_field(a, &d, idx++, g->oldest_gen); set_bkey_val_bytes(&a->k, (void *) d - (void *) &a->v); } @@ -315,6 +353,7 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| BTREE_INSERT_USE_ALLOC_RESERVE| + BTREE_INSERT_NOMARK| flags, BTREE_INSERT_ENTRY(iter, &a->k_i)); if (ret) @@ -358,7 +397,8 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) ? 0 : bch2_btree_insert_at(c, NULL, NULL, BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_REPLAY, + BTREE_INSERT_JOURNAL_REPLAY| + BTREE_INSERT_NOMARK, BTREE_INSERT_ENTRY(&iter, k)); err: bch2_btree_iter_unlock(&iter); @@ -824,6 +864,142 @@ static inline long next_alloc_bucket(struct bch_dev *ca) return -1; } +/* + * returns sequence number of most recent journal entry that updated this + * bucket: + */ +static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m) +{ + if (m.journal_seq_valid) { + u64 journal_seq = atomic64_read(&c->journal.seq); + u64 bucket_seq = journal_seq; + + bucket_seq &= ~((u64) U16_MAX); + bucket_seq |= m.journal_seq; + + if (bucket_seq > journal_seq) + bucket_seq -= 1 << 16; + + return bucket_seq; + } else { + return 0; + } +} + +static int bch2_invalidate_one_bucket2(struct bch_fs *c, struct bch_dev *ca, + struct btree_iter *iter, + u64 *journal_seq, unsigned flags) +{ +#if 0 + __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key; +#else + /* hack: */ + __BKEY_PADDED(k, 8) alloc_key; +#endif + struct bkey_i_alloc *a; + struct bkey_alloc_unpacked u; + struct bucket_mark m; + struct bkey_s_c k; + bool invalidating_cached_data; + size_t b; + int ret; + + BUG_ON(!ca->alloc_heap.used || + !ca->alloc_heap.data[0].nr); + b = ca->alloc_heap.data[0].bucket; + + /* first, put on free_inc and mark as owned by allocator: */ + percpu_down_read_preempt_disable(&c->mark_lock); + spin_lock(&c->freelist_lock); + + verify_not_on_freelist(c, ca, b); + + BUG_ON(!fifo_push(&ca->free_inc, b)); + + bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); + m = bucket(ca, b)->mark; + + spin_unlock(&c->freelist_lock); + percpu_up_read_preempt_enable(&c->mark_lock); + + bch2_btree_iter_cond_resched(iter); + + BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); + + bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); +retry: + k = bch2_btree_iter_peek_slot(iter); + ret = btree_iter_err(k); + if (ret) + return ret; + + if (k.k && k.k->type == KEY_TYPE_alloc) + u = 
bch2_alloc_unpack(bkey_s_c_to_alloc(k).v); + else + memset(&u, 0, sizeof(u)); + + invalidating_cached_data = u.cached_sectors != 0; + + //BUG_ON(u.dirty_sectors); + u.data_type = 0; + u.dirty_sectors = 0; + u.cached_sectors = 0; + u.read_time = c->bucket_clock[READ].hand; + u.write_time = c->bucket_clock[WRITE].hand; + u.gen++; + + a = bkey_alloc_init(&alloc_key.k); + a->k.p = iter->pos; + bch2_alloc_pack(a, u); + + ret = bch2_btree_insert_at(c, NULL, + invalidating_cached_data ? journal_seq : NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_USE_ALLOC_RESERVE| + flags, + BTREE_INSERT_ENTRY(iter, &a->k_i)); + if (ret == -EINTR) + goto retry; + + if (!ret) { + /* remove from alloc_heap: */ + struct alloc_heap_entry e, *top = ca->alloc_heap.data; + + top->bucket++; + top->nr--; + + if (!top->nr) + heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); + + /* + * Make sure we flush the last journal entry that updated this + * bucket (i.e. deleting the last reference) before writing to + * this bucket again: + */ + *journal_seq = max(*journal_seq, bucket_journal_seq(c, m)); + } else { + size_t b2; + + /* remove from free_inc: */ + percpu_down_read_preempt_disable(&c->mark_lock); + spin_lock(&c->freelist_lock); + + bch2_mark_alloc_bucket(c, ca, b, false, + gc_pos_alloc(c, NULL), 0); + + BUG_ON(!fifo_pop_back(&ca->free_inc, b2)); + BUG_ON(b != b2); + + spin_unlock(&c->freelist_lock); + percpu_up_read_preempt_enable(&c->mark_lock); + } + + return ret; +} + static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket, u64 *flush_seq) { @@ -844,18 +1020,7 @@ static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, percpu_up_read_preempt_enable(&c->mark_lock); - if (m.journal_seq_valid) { - u64 journal_seq = atomic64_read(&c->journal.seq); - u64 bucket_seq = journal_seq; - - bucket_seq &= ~((u64) U16_MAX); - bucket_seq |= m.journal_seq; - - if (bucket_seq > journal_seq) - bucket_seq -= 1 << 16; - - *flush_seq = max(*flush_seq, bucket_seq); - } + *flush_seq = max(*flush_seq, bucket_journal_seq(c, m)); return m.cached_sectors != 0; } @@ -868,7 +1033,6 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) struct btree_iter iter; u64 journal_seq = 0; int ret = 0; - long b; bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); @@ -876,14 +1040,11 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) /* Only use nowait if we've already invalidated at least one bucket: */ while (!ret && !fifo_full(&ca->free_inc) && - (b = next_alloc_bucket(ca)) >= 0) { - bool must_flush = - bch2_invalidate_one_bucket(c, ca, b, &journal_seq); - - ret = __bch2_alloc_write_key(c, ca, b, &iter, - must_flush ? &journal_seq : NULL, - !fifo_empty(&ca->free_inc) ? BTREE_INSERT_NOWAIT : 0); - } + ca->alloc_heap.used) + ret = bch2_invalidate_one_bucket2(c, ca, &iter, &journal_seq, + BTREE_INSERT_GC_LOCK_HELD| + (!fifo_empty(&ca->free_inc) + ? 
BTREE_INSERT_NOWAIT : 0)); bch2_btree_iter_unlock(&iter); @@ -1305,24 +1466,16 @@ int bch2_dev_allocator_start(struct bch_dev *ca) return 0; } -static void flush_held_btree_writes(struct bch_fs *c) +static bool flush_done(struct bch_fs *c) { struct bucket_table *tbl; struct rhash_head *pos; struct btree *b; - bool nodes_blocked; + bool nodes_unwritten; size_t i; - struct closure cl; - - closure_init_stack(&cl); - - clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); again: - pr_debug("flushing dirty btree nodes"); cond_resched(); - closure_wait(&c->btree_interior_update_wait, &cl); - - nodes_blocked = false; + nodes_unwritten = false; rcu_read_lock(); for_each_cached_btree(b, c, tbl, i, pos) @@ -1334,24 +1487,25 @@ again: six_unlock_read(&b->lock); goto again; } else { - nodes_blocked = true; + nodes_unwritten = true; } } rcu_read_unlock(); - if (c->btree_roots_dirty) + if (c->btree_roots_dirty) { bch2_journal_meta(&c->journal); - - if (nodes_blocked) { - closure_sync(&cl); goto again; } - closure_wake_up(&c->btree_interior_update_wait); - closure_sync(&cl); + return !nodes_unwritten && + !bch2_btree_interior_updates_nr_pending(c); +} - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c)); +static void flush_held_btree_writes(struct bch_fs *c) +{ + clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); + + closure_wait_event(&c->btree_interior_update_wait, flush_done(c)); } static void allocator_start_issue_discards(struct bch_fs *c) @@ -1470,7 +1624,6 @@ not_enough: &journal_seq); fifo_push(&ca->free[RESERVE_BTREE], bu); - bucket_set_dirty(ca, bu); } } @@ -1517,7 +1670,6 @@ int bch2_fs_allocator_start(struct bch_fs *c) { struct bch_dev *ca; unsigned i; - bool wrote; int ret; down_read(&c->gc_lock); @@ -1536,8 +1688,7 @@ int bch2_fs_allocator_start(struct bch_fs *c) } set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); - - return bch2_alloc_write(c, false, &wrote); + return 0; } void bch2_fs_allocator_background_init(struct bch_fs *c) diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index 26561b3b..65e9b373 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -5,6 +5,15 @@ #include "alloc_types.h" #include "debug.h" +struct bkey_alloc_unpacked { + u8 gen; +#define x(_name, _bits) u##_bits _name; + BCH_ALLOC_FIELDS() +#undef x +}; + +struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *); + #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index f2f9015d..6568e8ac 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -723,7 +723,7 @@ static struct write_point *__writepoint_find(struct hlist_head *head, static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) { u64 stranded = c->write_points_nr * c->bucket_size_max; - u64 free = bch2_fs_sectors_free(c); + u64 free = bch2_fs_usage_read_short(c).free; return stranded * factor > free; } diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 245d8322..052ec263 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -396,8 +396,6 @@ struct bch_dev { struct bucket_array __rcu *buckets[2]; unsigned long *buckets_nouse; unsigned long *buckets_written; - /* most out of date gen in the btree */ - u8 *oldest_gens; struct rw_semaphore bucket_lock; struct bch_dev_usage __percpu *usage[2]; diff --git 
a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index d020cf74..56bf69eb 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -821,11 +821,12 @@ struct bch_alloc { } __attribute__((packed, aligned(8))); #define BCH_ALLOC_FIELDS() \ - x(read_time, 2) \ - x(write_time, 2) \ - x(data_type, 1) \ - x(dirty_sectors, 2) \ - x(cached_sectors, 2) + x(read_time, 16) \ + x(write_time, 16) \ + x(data_type, 8) \ + x(dirty_sectors, 16) \ + x(cached_sectors, 16) \ + x(oldest_gen, 8) enum { #define x(name, bytes) BCH_ALLOC_FIELD_##name, @@ -835,12 +836,12 @@ enum { }; static const unsigned BCH_ALLOC_FIELD_BYTES[] = { -#define x(name, bytes) [BCH_ALLOC_FIELD_##name] = bytes, +#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8, BCH_ALLOC_FIELDS() #undef x }; -#define x(name, bytes) + bytes +#define x(name, bits) + (bits / 8) static const unsigned BKEY_ALLOC_VAL_U64s_MAX = DIV_ROUND_UP(offsetof(struct bch_alloc, data) BCH_ALLOC_FIELDS(), sizeof(u64)); diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index b1f5e8b1..5d6f6364 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -138,24 +138,24 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - size_t b = PTR_BUCKET_NR(ca, ptr); - struct bucket *g = PTR_BUCKET(ca, ptr); + struct bucket *g = PTR_BUCKET(ca, ptr, true); + struct bucket *g2 = PTR_BUCKET(ca, ptr, false); if (mustfix_fsck_err_on(!g->gen_valid, c, "found ptr with missing gen in alloc btree,\n" "type %u gen %u", k.k->type, ptr->gen)) { - g->_mark.gen = ptr->gen; - g->gen_valid = 1; - bucket_set_dirty(ca, b); + g2->_mark.gen = g->_mark.gen = ptr->gen; + g2->_mark.dirty = g->_mark.dirty = true; + g2->gen_valid = g->gen_valid = true; } if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, "%u ptr gen in the future: %u > %u", k.k->type, ptr->gen, g->mark.gen)) { - g->_mark.gen = ptr->gen; - g->gen_valid = 1; - bucket_set_dirty(ca, b); + g2->_mark.gen = g->_mark.gen = ptr->gen; + g2->_mark.dirty = g->_mark.dirty = true; + g2->gen_valid = g->gen_valid = true; set_bit(BCH_FS_FIXED_GENS, &c->flags); } } @@ -163,10 +163,10 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - size_t b = PTR_BUCKET_NR(ca, ptr); + struct bucket *g = PTR_BUCKET(ca, ptr, true); - if (gen_after(ca->oldest_gens[b], ptr->gen)) - ca->oldest_gens[b] = ptr->gen; + if (gen_after(g->oldest_gen, ptr->gen)) + g->oldest_gen = ptr->gen; *max_stale = max(*max_stale, ptr_stale(ca, ptr)); } @@ -230,12 +230,12 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, bch2_verify_btree_nr_keys(b); + gc_pos_set(c, gc_pos_btree_node(b)); + ret = btree_gc_mark_node(c, b, &max_stale, initial); if (ret) break; - gc_pos_set(c, gc_pos_btree_node(b)); - if (!initial) { if (max_stale > 64) bch2_btree_node_rewrite(c, &iter, @@ -483,88 +483,38 @@ static void bch2_gc_free(struct bch_fs *c) percpu_up_write(&c->mark_lock); } -static void bch2_gc_done_nocheck(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned i; - - { - struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); - struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); - struct stripe *dst, *src; - - c->ec_stripes_heap.used = 0; - - while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) && - (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) { - *dst = 
*src; - - if (dst->alive) - bch2_stripes_heap_insert(c, dst, dst_iter.pos); - - genradix_iter_advance(&dst_iter, &c->stripes[0]); - genradix_iter_advance(&src_iter, &c->stripes[1]); - } - } - - for_each_member_device(ca, c, i) { - struct bucket_array *src = __bucket_array(ca, 1); - - memcpy(__bucket_array(ca, 0), src, - sizeof(struct bucket_array) + - sizeof(struct bucket) * src->nbuckets); - }; - - for_each_member_device(ca, c, i) { - unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64); - struct bch_dev_usage *dst = (void *) - bch2_acc_percpu_u64s((void *) ca->usage[0], nr); - struct bch_dev_usage *src = (void *) - bch2_acc_percpu_u64s((void *) ca->usage[1], nr); - - *dst = *src; - } - - { - unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) + - c->replicas.nr; - struct bch_fs_usage *dst = (void *) - bch2_acc_percpu_u64s((void *) c->usage[0], nr); - struct bch_fs_usage *src = (void *) - bch2_acc_percpu_u64s((void *) c->usage[1], nr); - - memcpy(&dst->s.gc_start[0], - &src->s.gc_start[0], - nr * sizeof(u64) - offsetof(typeof(*dst), s.gc_start)); - } -} - static void bch2_gc_done(struct bch_fs *c, bool initial) { struct bch_dev *ca; + bool verify = !initial || + (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)); unsigned i; #define copy_field(_f, _msg, ...) \ if (dst->_f != src->_f) { \ - bch_err(c, _msg ": got %llu, should be %llu, fixing" \ - , ##__VA_ARGS__, dst->_f, src->_f); \ + if (verify) \ + bch_err(c, _msg ": got %llu, should be %llu, fixing"\ + , ##__VA_ARGS__, dst->_f, src->_f); \ dst->_f = src->_f; \ } #define copy_stripe_field(_f, _msg, ...) \ if (dst->_f != src->_f) { \ - bch_err_ratelimited(c, "stripe %zu has wrong "_msg \ - ": got %u, should be %u, fixing", \ - dst_iter.pos, ##__VA_ARGS__, \ - dst->_f, src->_f); \ + if (verify) \ + bch_err_ratelimited(c, "stripe %zu has wrong "_msg\ + ": got %u, should be %u, fixing", \ + dst_iter.pos, ##__VA_ARGS__, \ + dst->_f, src->_f); \ dst->_f = src->_f; \ dst->dirty = true; \ } #define copy_bucket_field(_f) \ if (dst->b[b].mark._f != src->b[b].mark._f) { \ - bch_err_ratelimited(c, "dev %u bucket %zu has wrong " #_f\ - ": got %u, should be %u, fixing", \ - i, b, dst->b[b].mark._f, src->b[b].mark._f); \ + if (verify) \ + bch_err_ratelimited(c, "dev %u bucket %zu has wrong " #_f\ + ": got %u, should be %u, fixing", i, b, \ + dst->b[b].mark._f, src->b[b].mark._f); \ dst->b[b]._mark._f = src->b[b].mark._f; \ + dst->b[b]._mark.dirty = true; \ } #define copy_dev_field(_f, _msg, ...) 
\ copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) @@ -573,12 +523,6 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) percpu_down_write(&c->mark_lock); - if (initial && - !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))) { - bch2_gc_done_nocheck(c); - goto out; - } - { struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); @@ -629,6 +573,11 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) copy_bucket_field(stripe); copy_bucket_field(dirty_sectors); copy_bucket_field(cached_sectors); + + if (dst->b[b].oldest_gen != src->b[b].oldest_gen) { + dst->b[b].oldest_gen = src->b[b].oldest_gen; + dst->b[b]._mark.dirty = true; + } } }; @@ -641,44 +590,46 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) unsigned b; for (b = 0; b < BCH_DATA_NR; b++) - copy_dev_field(buckets[b], - "buckets[%s]", bch2_data_types[b]); - copy_dev_field(buckets_alloc, "buckets_alloc"); - copy_dev_field(buckets_ec, "buckets_ec"); + copy_dev_field(buckets[b], "buckets[%s]", + bch2_data_types[b]); + copy_dev_field(buckets_alloc, "buckets_alloc"); + copy_dev_field(buckets_ec, "buckets_ec"); + copy_dev_field(buckets_unavailable, "buckets_unavailable"); for (b = 0; b < BCH_DATA_NR; b++) - copy_dev_field(sectors[b], - "sectors[%s]", bch2_data_types[b]); - copy_dev_field(sectors_fragmented, - "sectors_fragmented"); + copy_dev_field(sectors[b], "sectors[%s]", + bch2_data_types[b]); + copy_dev_field(sectors_fragmented, "sectors_fragmented"); } { - unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) + - c->replicas.nr; + unsigned nr = fs_usage_u64s(c); struct bch_fs_usage *dst = (void *) bch2_acc_percpu_u64s((void *) c->usage[0], nr); struct bch_fs_usage *src = (void *) bch2_acc_percpu_u64s((void *) c->usage[1], nr); - copy_fs_field(s.hidden, "hidden"); - copy_fs_field(s.data, "data"); - copy_fs_field(s.cached, "cached"); - copy_fs_field(s.reserved, "reserved"); - copy_fs_field(s.nr_inodes, "nr_inodes"); + copy_fs_field(hidden, "hidden"); + copy_fs_field(data, "data"); + copy_fs_field(cached, "cached"); + copy_fs_field(reserved, "reserved"); + copy_fs_field(nr_inodes, "nr_inodes"); for (i = 0; i < BCH_REPLICAS_MAX; i++) copy_fs_field(persistent_reserved[i], "persistent_reserved[%i]", i); for (i = 0; i < c->replicas.nr; i++) { - /* - * XXX: print out replicas entry - */ - copy_fs_field(data[i], "data[%i]", i); + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + char buf[80]; + + bch2_replicas_entry_to_text(&PBUF(buf), e); + + copy_fs_field(replicas[i], "%s", buf); } } -out: + percpu_up_write(&c->mark_lock); #undef copy_fs_field @@ -693,19 +644,18 @@ static int bch2_gc_start(struct bch_fs *c) struct bch_dev *ca; unsigned i; + percpu_down_write(&c->mark_lock); + /* * indicate to stripe code that we need to allocate for the gc stripes * radix tree, too */ gc_pos_set(c, gc_phase(GC_PHASE_START)); - percpu_down_write(&c->mark_lock); BUG_ON(c->usage[1]); - c->usage[1] = __alloc_percpu_gfp(sizeof(struct bch_fs_usage) + - sizeof(u64) * c->replicas.nr, - sizeof(u64), - GFP_KERNEL); + c->usage[1] = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), + sizeof(u64), GFP_KERNEL); percpu_up_write(&c->mark_lock); if (!c->usage[1]) @@ -740,8 +690,12 @@ static int bch2_gc_start(struct bch_fs *c) dst->first_bucket = src->first_bucket; dst->nbuckets = src->nbuckets; - for (b = 0; b < src->nbuckets; b++) - dst->b[b]._mark.gen = src->b[b].mark.gen; + for (b = 0; b < src->nbuckets; b++) { + 
dst->b[b]._mark.gen = + dst->b[b].oldest_gen = + src->b[b].mark.gen; + dst->b[b].gen_valid = src->b[b].gen_valid; + } }; percpu_up_write(&c->mark_lock); @@ -800,6 +754,8 @@ out: if (iter++ <= 2) { bch_info(c, "Fixed gens, restarting mark and sweep:"); clear_bit(BCH_FS_FIXED_GENS, &c->flags); + __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); + bch2_gc_free(c); goto again; } diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 18596dc8..b38722da 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -455,6 +455,7 @@ static inline bool btree_node_is_extents(struct btree *b) static inline bool btree_node_type_needs_gc(enum btree_node_type type) { switch (type) { + case BKEY_TYPE_ALLOC: case BKEY_TYPE_BTREE: case BKEY_TYPE_EXTENTS: case BKEY_TYPE_INODES: @@ -489,7 +490,6 @@ enum btree_insert_ret { /* leaf node needs to be split */ BTREE_INSERT_BTREE_NODE_FULL, BTREE_INSERT_ENOSPC, - BTREE_INSERT_NEED_GC_LOCK, BTREE_INSERT_NEED_MARK_REPLICAS, }; diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index 4bd07258..faacde9a 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -81,6 +81,7 @@ enum { __BTREE_INSERT_USE_RESERVE, __BTREE_INSERT_USE_ALLOC_RESERVE, __BTREE_INSERT_JOURNAL_REPLAY, + __BTREE_INSERT_NOMARK, __BTREE_INSERT_NOWAIT, __BTREE_INSERT_GC_LOCK_HELD, __BCH_HASH_SET_MUST_CREATE, @@ -107,12 +108,12 @@ enum { #define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE) #define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE) -/* - * Insert is for journal replay: don't get journal reservations, or mark extents - * (bch_mark_key) - */ +/* Insert is for journal replay - don't get journal reservations: */ #define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) +/* Don't call bch2_mark_key: */ +#define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK) + /* Don't block on allocation failure (for new btree nodes: */ #define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) #define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 33b5cf40..b1b858de 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -483,7 +483,7 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c, struct btree *b; struct disk_reservation disk_res = { 0, 0 }; unsigned sectors = nr_nodes * c->opts.btree_node_size; - int ret, disk_res_flags = BCH_DISK_RESERVATION_GC_LOCK_HELD; + int ret, disk_res_flags = 0; if (flags & BTREE_INSERT_NOFAIL) disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL; @@ -1086,8 +1086,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) bch2_btree_node_free_index(as, NULL, bkey_i_to_s_c(&old->key), fs_usage); - bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, - gc_pos_btree_root(b->btree_id)); + bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res); percpu_up_read_preempt_enable(&c->mark_lock); mutex_unlock(&c->btree_interior_update_lock); @@ -1188,8 +1187,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b bkey_disassemble(b, k, &tmp), fs_usage); - bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, - gc_pos_btree_node(b)); + bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res); percpu_up_read_preempt_enable(&c->mark_lock); mutex_unlock(&c->btree_interior_update_lock); @@ -1564,7 +1562,8 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct 
btree_iter *iter, closure_init_stack(&cl); /* Hack, because gc and splitting nodes doesn't mix yet: */ - if (!down_read_trylock(&c->gc_lock)) { + if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && + !down_read_trylock(&c->gc_lock)) { if (flags & BTREE_INSERT_NOUNLOCK) return -EINTR; @@ -1607,7 +1606,8 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, */ __bch2_btree_iter_downgrade(iter, 1); out: - up_read(&c->gc_lock); + if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) + up_read(&c->gc_lock); closure_sync(&cl); return ret; } @@ -1685,7 +1685,8 @@ retry: } /* We're changing btree topology, doesn't mix with gc: */ - if (!down_read_trylock(&c->gc_lock)) + if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && + !down_read_trylock(&c->gc_lock)) goto err_cycle_gc_lock; if (!bch2_btree_iter_upgrade(iter, U8_MAX, @@ -1745,7 +1746,8 @@ retry: bch2_btree_update_done(as); - up_read(&c->gc_lock); + if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) + up_read(&c->gc_lock); out: bch2_btree_iter_verify_locks(iter); @@ -1776,7 +1778,8 @@ err_cycle_gc_lock: err_unlock: six_unlock_intent(&m->lock); - up_read(&c->gc_lock); + if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) + up_read(&c->gc_lock); err: BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK)); @@ -1942,8 +1945,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, ret = bch2_disk_reservation_add(c, &as->reserve->disk_res, c->opts.btree_node_size * bch2_bkey_nr_ptrs(bkey_i_to_s_c(&new_key->k_i)), - BCH_DISK_RESERVATION_NOFAIL| - BCH_DISK_RESERVATION_GC_LOCK_HELD); + BCH_DISK_RESERVATION_NOFAIL); BUG_ON(ret); parent = btree_node_parent(iter, b); @@ -1989,8 +1991,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bch2_btree_node_free_index(as, NULL, bkey_i_to_s_c(&b->key), fs_usage); - bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, - gc_pos_btree_root(b->btree_id)); + bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res); percpu_up_read_preempt_enable(&c->mark_lock); mutex_unlock(&c->btree_interior_update_lock); diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 0df894fc..da8c6987 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -415,6 +415,25 @@ static inline int btree_trans_cmp(struct btree_insert_entry l, btree_iter_cmp(l.iter, r.iter); } +static bool btree_trans_relock(struct btree_insert *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_iter(trans, i) + return bch2_btree_iter_relock(i->iter); + return true; +} + +static void btree_trans_unlock(struct btree_insert *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_iter(trans, i) { + bch2_btree_iter_unlock(i->iter); + break; + } +} + /* Normal update interface: */ static enum btree_insert_ret @@ -466,49 +485,12 @@ static inline int do_btree_insert_at(struct btree_insert *trans, struct btree_iter *linked; unsigned u64s; int ret; - +retry: trans_for_each_iter(trans, i) BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); - /* reserve space for deferred updates */ - __trans_for_each_entry(trans, i, i->deferred) { - - } - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { - u64s = 0; - trans_for_each_entry(trans, i) - u64s += jset_u64s(i->k->k.u64s); - - while ((ret = bch2_journal_res_get(&c->journal, - &trans->journal_res, u64s, - JOURNAL_RES_GET_NONBLOCK)) == -EAGAIN) { - struct btree_iter *iter = NULL; - - trans_for_each_iter(trans, i) - iter = i->iter; - - if (iter) - bch2_btree_iter_unlock(iter); - - ret = 
bch2_journal_res_get(&c->journal, - &trans->journal_res, u64s, - JOURNAL_RES_GET_CHECK); - if (ret) - return ret; - - if (iter && !bch2_btree_iter_relock(iter)) { - trans_restart(" (iter relock after journal res get blocked)"); - return -EINTR; - } - } - - if (ret) - return ret; - } - multi_lock_write(c, trans); if (race_fault()) { @@ -536,6 +518,36 @@ static inline int do_btree_insert_at(struct btree_insert *trans, } } + if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + u64s = 0; + trans_for_each_entry(trans, i) + u64s += jset_u64s(i->k->k.u64s); + + ret = bch2_journal_res_get(&c->journal, + &trans->journal_res, u64s, + JOURNAL_RES_GET_NONBLOCK); + if (likely(!ret)) + goto got_journal_res; + if (ret != -EAGAIN) + goto out; + + multi_unlock_write(trans); + btree_trans_unlock(trans); + + ret = bch2_journal_res_get(&c->journal, + &trans->journal_res, u64s, + JOURNAL_RES_GET_CHECK); + if (ret) + return ret; + + if (!btree_trans_relock(trans)) { + trans_restart(" (iter relock after journal res get blocked)"); + return -EINTR; + } + + goto retry; + } +got_journal_res: if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { if (journal_seq_verify(c)) trans_for_each_entry(trans, i) @@ -623,6 +635,9 @@ int __bch2_btree_insert_at(struct btree_insert *trans) /* for the sake of sanity: */ BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC)); + if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) + lockdep_assert_held(&c->gc_lock); + bubble_sort(trans->entries, trans->nr, btree_trans_cmp); trans_for_each_entry(trans, i) @@ -715,18 +730,6 @@ err: ret = -EINTR; } break; - case BTREE_INSERT_NEED_GC_LOCK: - ret = -EINTR; - - if (!down_read_trylock(&c->gc_lock)) { - if (flags & BTREE_INSERT_NOUNLOCK) - goto out; - - bch2_btree_iter_unlock(trans->entries[0].iter); - down_read(&c->gc_lock); - } - up_read(&c->gc_lock); - break; case BTREE_INSERT_ENOSPC: ret = -ENOSPC; break; diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 9f4872a9..377a8b0f 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -116,14 +116,14 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c) void bch2_fs_usage_initialize(struct bch_fs *c) { struct bch_fs_usage *usage; - unsigned i, nr; + unsigned i; percpu_down_write(&c->mark_lock); - nr = sizeof(struct bch_fs_usage) / sizeof(u64) + c->replicas.nr; - usage = (void *) bch2_acc_percpu_u64s((void *) c->usage[0], nr); + usage = (void *) bch2_acc_percpu_u64s((void *) c->usage[0], + fs_usage_u64s(c)); for (i = 0; i < BCH_REPLICAS_MAX; i++) - usage->s.reserved += usage->persistent_reserved[i]; + usage->reserved += usage->persistent_reserved[i]; for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry *e = @@ -132,10 +132,10 @@ void bch2_fs_usage_initialize(struct bch_fs *c) switch (e->data_type) { case BCH_DATA_BTREE: case BCH_DATA_USER: - usage->s.data += usage->data[i]; + usage->data += usage->replicas[i]; break; case BCH_DATA_CACHED: - usage->s.cached += usage->data[i]; + usage->cached += usage->replicas[i]; break; } } @@ -143,44 +143,38 @@ void bch2_fs_usage_initialize(struct bch_fs *c) percpu_up_write(&c->mark_lock); } -#define bch2_usage_read_raw(_stats) \ -({ \ - typeof(*this_cpu_ptr(_stats)) _acc; \ - \ - memset(&_acc, 0, sizeof(_acc)); \ - acc_u64s_percpu((u64 *) &_acc, \ - (u64 __percpu *) _stats, \ - sizeof(_acc) / sizeof(u64)); \ - \ - _acc; \ -}) - struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca) { - return bch2_usage_read_raw(ca->usage[0]); + struct bch_dev_usage ret; + + memset(&ret, 0, sizeof(ret)); + 
acc_u64s_percpu((u64 *) &ret, + (u64 __percpu *) ca->usage[0], + sizeof(ret) / sizeof(u64)); + + return ret; } struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c) { struct bch_fs_usage *ret; - unsigned nr = READ_ONCE(c->replicas.nr); + unsigned v, u64s = fs_usage_u64s(c); retry: - ret = kzalloc(sizeof(*ret) + nr * sizeof(u64), GFP_NOFS); + ret = kzalloc(u64s * sizeof(u64), GFP_NOFS); if (unlikely(!ret)) return NULL; percpu_down_read_preempt_disable(&c->mark_lock); - if (unlikely(nr < c->replicas.nr)) { - nr = c->replicas.nr; + v = fs_usage_u64s(c); + if (unlikely(u64s != v)) { + u64s = v; percpu_up_read_preempt_enable(&c->mark_lock); kfree(ret); goto retry; } - acc_u64s_percpu((u64 *) ret, - (u64 __percpu *) c->usage[0], - sizeof(*ret) / sizeof(u64) + nr); + acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s); return ret; } @@ -197,27 +191,44 @@ static u64 avail_factor(u64 r) return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1); } -u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage) +u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage) { - return min(fs_usage.s.hidden + - fs_usage.s.data + - reserve_factor(fs_usage.s.reserved + - fs_usage.s.online_reserved), + return min(fs_usage->hidden + + fs_usage->data + + reserve_factor(fs_usage->reserved + + fs_usage->online_reserved), c->capacity); } +static struct bch_fs_usage_short +__bch2_fs_usage_read_short(struct bch_fs *c) +{ + struct bch_fs_usage_short ret; + u64 data, reserved; + + ret.capacity = c->capacity - + percpu_u64_get(&c->usage[0]->hidden); + + data = percpu_u64_get(&c->usage[0]->data); + reserved = percpu_u64_get(&c->usage[0]->reserved) + + percpu_u64_get(&c->usage[0]->online_reserved); + + ret.used = min(ret.capacity, data + reserve_factor(reserved)); + ret.free = ret.capacity - ret.used; + + ret.nr_inodes = percpu_u64_get(&c->usage[0]->nr_inodes); + + return ret; +} + struct bch_fs_usage_short bch2_fs_usage_read_short(struct bch_fs *c) { - struct bch_fs_usage_summarized usage = - bch2_usage_read_raw(&c->usage[0]->s); struct bch_fs_usage_short ret; - ret.capacity = READ_ONCE(c->capacity) - usage.hidden; - ret.used = min(ret.capacity, usage.data + - reserve_factor(usage.reserved + - usage.online_reserved)); - ret.nr_inodes = usage.nr_inodes; + percpu_down_read_preempt_disable(&c->mark_lock); + ret = __bch2_fs_usage_read_short(c); + percpu_up_read_preempt_enable(&c->mark_lock); return ret; } @@ -254,10 +265,9 @@ static bool bucket_became_unavailable(struct bucket_mark old, int bch2_fs_usage_apply(struct bch_fs *c, struct bch_fs_usage *fs_usage, - struct disk_reservation *disk_res, - struct gc_pos gc_pos) + struct disk_reservation *disk_res) { - s64 added = fs_usage->s.data + fs_usage->s.reserved; + s64 added = fs_usage->data + fs_usage->reserved; s64 should_not_have_added; int ret = 0; @@ -277,19 +287,11 @@ int bch2_fs_usage_apply(struct bch_fs *c, if (added > 0) { disk_res->sectors -= added; - fs_usage->s.online_reserved -= added; + fs_usage->online_reserved -= added; } acc_u64s((u64 *) this_cpu_ptr(c->usage[0]), - (u64 *) fs_usage, - sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr); - - if (gc_visited(c, gc_pos)) { - BUG_ON(!c->usage[1]); - acc_u64s((u64 *) this_cpu_ptr(c->usage[1]), - (u64 *) fs_usage, - sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr); - } + (u64 *) fs_usage, fs_usage_u64s(c)); return ret; } @@ -300,7 +302,7 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage, int nr, s64 size) { if (type == BCH_DATA_SB || type == 
BCH_DATA_JOURNAL) - fs_usage->s.hidden += size; + fs_usage->hidden += size; dev_usage->buckets[type] += nr; } @@ -384,10 +386,10 @@ static inline void update_replicas(struct bch_fs *c, BUG_ON(!sectors); if (r->data_type == BCH_DATA_CACHED) - fs_usage->s.cached += sectors; + fs_usage->cached += sectors; else - fs_usage->s.data += sectors; - fs_usage->data[idx] += sectors; + fs_usage->data += sectors; + fs_usage->replicas[idx] += sectors; } static inline void update_cached_sectors(struct bch_fs *c, @@ -401,15 +403,28 @@ static inline void update_cached_sectors(struct bch_fs *c, update_replicas(c, fs_usage, &r.e, sectors); } -static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct bucket_mark *old, - bool gc) +#define do_mark_fn(fn, c, pos, flags, ...) \ +({ \ + int gc, ret = 0; \ + \ + percpu_rwsem_assert_held(&c->mark_lock); \ + \ + for (gc = 0; gc < 2 && !ret; gc++) \ + if (!gc == !(flags & BCH_BUCKET_MARK_GC) || \ + (gc && gc_visited(c, pos))) \ + ret = fn(c, __VA_ARGS__, gc); \ + ret; \ +}) + +static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, struct bucket_mark *ret, + bool gc) { struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]); struct bucket *g = __bucket(ca, b, gc); - struct bucket_mark new; + struct bucket_mark old, new; - *old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ + old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ BUG_ON(!is_available_bucket(new)); new.owned_by_allocator = true; @@ -420,26 +435,29 @@ static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, new.gen++; })); - if (old->cached_sectors) + if (old.cached_sectors) update_cached_sectors(c, fs_usage, ca->dev_idx, - -old->cached_sectors); + -((s64) old.cached_sectors)); + + if (!gc) + *ret = old; + return 0; } void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, struct bucket_mark *old) { - percpu_rwsem_assert_held(&c->mark_lock); - - __bch2_invalidate_bucket(c, ca, b, old, false); + do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0, + ca, b, old); if (!old->owned_by_allocator && old->cached_sectors) trace_invalidate(ca, bucket_to_sector(ca, b), old->cached_sectors); } -static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, bool owned_by_allocator, - bool gc) +static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, bool owned_by_allocator, + bool gc) { struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]); struct bucket *g = __bucket(ca, b, gc); @@ -451,20 +469,70 @@ static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, BUG_ON(!gc && !owned_by_allocator && !old.owned_by_allocator); + + return 0; } void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, bool owned_by_allocator, struct gc_pos pos, unsigned flags) { - percpu_rwsem_assert_held(&c->mark_lock); + do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags, + ca, b, owned_by_allocator); +} - if (!(flags & BCH_BUCKET_MARK_GC)) - __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, false); +static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, + bool inserting, + struct bch_fs_usage *fs_usage, + unsigned journal_seq, unsigned flags, + bool gc) +{ + struct bkey_alloc_unpacked u; + struct bch_dev *ca; + struct bucket *g; + struct bucket_mark old, m; - if ((flags & BCH_BUCKET_MARK_GC) || - gc_visited(c, pos)) - __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, true); + if (!inserting) + return 0; + + 
/* + * alloc btree is read in by bch2_alloc_read, not gc: + */ + if (flags & BCH_BUCKET_MARK_GC) + return 0; + + u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v); + ca = bch_dev_bkey_exists(c, k.k->p.inode); + g = __bucket(ca, k.k->p.offset, gc); + + /* + * this should currently only be getting called from the bucket + * invalidate path: + */ + BUG_ON(u.dirty_sectors); + BUG_ON(u.cached_sectors); + BUG_ON(!g->mark.owned_by_allocator); + + old = bucket_data_cmpxchg(c, ca, fs_usage, g, m, ({ + m.gen = u.gen; + m.data_type = u.data_type; + m.dirty_sectors = u.dirty_sectors; + m.cached_sectors = u.cached_sectors; + })); + + g->io_time[READ] = u.read_time; + g->io_time[WRITE] = u.write_time; + g->oldest_gen = u.oldest_gen; + g->gen_valid = 1; + + if (old.cached_sectors) { + update_cached_sectors(c, fs_usage, ca->dev_idx, + -old.cached_sectors); + trace_invalidate(ca, bucket_to_sector(ca, k.k->p.offset), + old.cached_sectors); + } + + return 0; } #define checked_add(a, b) \ @@ -474,9 +542,9 @@ do { \ BUG_ON((a) != _res); \ } while (0) -static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, enum bch_data_type type, - unsigned sectors, bool gc) +static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, enum bch_data_type type, + unsigned sectors, bool gc) { struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]); struct bucket *g = __bucket(ca, b, gc); @@ -490,6 +558,8 @@ static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, new.data_type = type; checked_add(new.dirty_sectors, sectors); })); + + return 0; } void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, @@ -501,15 +571,8 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, type != BCH_DATA_JOURNAL); if (likely(c)) { - percpu_rwsem_assert_held(&c->mark_lock); - - if (!(flags & BCH_BUCKET_MARK_GC)) - __bch2_mark_metadata_bucket(c, ca, b, type, sectors, - false); - if ((flags & BCH_BUCKET_MARK_GC) || - gc_visited(c, pos)) - __bch2_mark_metadata_bucket(c, ca, b, type, sectors, - true); + do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags, + ca, b, type, sectors); } else { struct bucket *g; struct bucket_mark new; @@ -553,7 +616,7 @@ static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, * loop, to avoid racing with the start of gc clearing all the marks - GC does * that with the gc pos seqlock held. */ -static void bch2_mark_pointer(struct bch_fs *c, +static bool bch2_mark_pointer(struct bch_fs *c, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type, struct bch_fs_usage *fs_usage, @@ -581,7 +644,7 @@ static void bch2_mark_pointer(struct bch_fs *c, BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags)); EBUG_ON(!p.ptr.cached && test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)); - return; + return true; } if (!p.ptr.cached) @@ -612,6 +675,8 @@ static void bch2_mark_pointer(struct bch_fs *c, bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); BUG_ON(!gc && bucket_became_unavailable(old, new)); + + return false; } static int bch2_mark_stripe_ptr(struct bch_fs *c, @@ -694,13 +759,13 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, s64 disk_sectors = data_type == BCH_DATA_BTREE ? 
sectors : ptr_disk_sectors_delta(p, sectors); - - bch2_mark_pointer(c, p, disk_sectors, data_type, - fs_usage, journal_seq, flags, gc); + bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type, + fs_usage, journal_seq, flags, gc); if (p.ptr.cached) { - update_cached_sectors(c, fs_usage, p.ptr.dev, - disk_sectors); + if (disk_sectors && !stale) + update_cached_sectors(c, fs_usage, p.ptr.dev, + disk_sectors); } else if (!p.ec_nr) { dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; @@ -826,30 +891,31 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, unsigned journal_seq, unsigned flags, bool gc) { - int ret = 0; + if (!fs_usage || gc) + fs_usage = this_cpu_ptr(c->usage[gc]); switch (k.k->type) { + case KEY_TYPE_alloc: + return bch2_mark_alloc(c, k, inserting, + fs_usage, journal_seq, flags, gc); case KEY_TYPE_btree_ptr: - ret = bch2_mark_extent(c, k, inserting - ? c->opts.btree_node_size - : -c->opts.btree_node_size, - BCH_DATA_BTREE, - fs_usage, journal_seq, flags, gc); - break; + return bch2_mark_extent(c, k, inserting + ? c->opts.btree_node_size + : -c->opts.btree_node_size, + BCH_DATA_BTREE, + fs_usage, journal_seq, flags, gc); case KEY_TYPE_extent: - ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER, - fs_usage, journal_seq, flags, gc); - break; + return bch2_mark_extent(c, k, sectors, BCH_DATA_USER, + fs_usage, journal_seq, flags, gc); case KEY_TYPE_stripe: - ret = bch2_mark_stripe(c, k, inserting, - fs_usage, journal_seq, flags, gc); - break; + return bch2_mark_stripe(c, k, inserting, + fs_usage, journal_seq, flags, gc); case KEY_TYPE_inode: if (inserting) - fs_usage->s.nr_inodes++; + fs_usage->nr_inodes++; else - fs_usage->s.nr_inodes--; - break; + fs_usage->nr_inodes--; + return 0; case KEY_TYPE_reservation: { unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; @@ -857,15 +923,13 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, replicas = clamp_t(unsigned, replicas, 1, ARRAY_SIZE(fs_usage->persistent_reserved)); - fs_usage->s.reserved += sectors; + fs_usage->reserved += sectors; fs_usage->persistent_reserved[replicas - 1] += sectors; - break; + return 0; } default: - break; + return 0; } - - return ret; } int bch2_mark_key_locked(struct bch_fs *c, @@ -875,26 +939,9 @@ int bch2_mark_key_locked(struct bch_fs *c, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { - int ret; - - if (!(flags & BCH_BUCKET_MARK_GC)) { - ret = __bch2_mark_key(c, k, inserting, sectors, - fs_usage ?: this_cpu_ptr(c->usage[0]), - journal_seq, flags, false); - if (ret) - return ret; - } - - if ((flags & BCH_BUCKET_MARK_GC) || - gc_visited(c, pos)) { - ret = __bch2_mark_key(c, k, inserting, sectors, - this_cpu_ptr(c->usage[1]), - journal_seq, flags, true); - if (ret) - return ret; - } - - return 0; + return do_mark_fn(__bch2_mark_key, c, pos, flags, + k, inserting, sectors, fs_usage, + journal_seq, flags); } int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, @@ -932,7 +979,7 @@ void bch2_mark_update(struct btree_insert *trans, percpu_down_read_preempt_disable(&c->mark_lock); fs_usage = bch2_fs_usage_get_scratch(c); - if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) + if (!(trans->flags & BTREE_INSERT_NOMARK)) bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true, bpos_min(insert->k->k.p, b->key.k.p).offset - bkey_start_offset(&insert->k->k), @@ -985,7 +1032,7 @@ void bch2_mark_update(struct btree_insert *trans, bch2_btree_node_iter_advance(&node_iter, b); } - if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos) && 
+ if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res) && !warned_disk_usage && !xchg(&warned_disk_usage, 1)) { char buf[200]; @@ -1026,13 +1073,13 @@ static u64 bch2_recalc_sectors_available(struct bch_fs *c) { percpu_u64_set(&c->pcpu->sectors_available, 0); - return avail_factor(bch2_fs_sectors_free(c)); + return avail_factor(__bch2_fs_usage_read_short(c).free); } void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) { percpu_down_read_preempt_disable(&c->mark_lock); - this_cpu_sub(c->usage[0]->s.online_reserved, + this_cpu_sub(c->usage[0]->online_reserved, res->sectors); percpu_up_read_preempt_enable(&c->mark_lock); @@ -1071,38 +1118,22 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, out: pcpu->sectors_available -= sectors; - this_cpu_add(c->usage[0]->s.online_reserved, sectors); + this_cpu_add(c->usage[0]->online_reserved, sectors); res->sectors += sectors; percpu_up_read_preempt_enable(&c->mark_lock); return 0; recalculate: - /* - * GC recalculates sectors_available when it starts, so that hopefully - * we don't normally end up blocking here: - */ - - /* - * Piss fuck, we can be called from extent_insert_fixup() with btree - * locks held: - */ - - if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) { - if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD)) - down_read(&c->gc_lock); - else if (!down_read_trylock(&c->gc_lock)) - return -EINTR; - } - percpu_down_write(&c->mark_lock); + sectors_available = bch2_recalc_sectors_available(c); if (sectors <= sectors_available || (flags & BCH_DISK_RESERVATION_NOFAIL)) { atomic64_set(&c->sectors_available, max_t(s64, 0, sectors_available - sectors)); - this_cpu_add(c->usage[0]->s.online_reserved, sectors); + this_cpu_add(c->usage[0]->online_reserved, sectors); res->sectors += sectors; ret = 0; } else { @@ -1112,9 +1143,6 @@ recalculate: percpu_up_write(&c->mark_lock); - if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) - up_read(&c->gc_lock); - return ret; } @@ -1135,7 +1163,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) struct bucket_array *buckets = NULL, *old_buckets = NULL; unsigned long *buckets_nouse = NULL; unsigned long *buckets_written = NULL; - u8 *oldest_gens = NULL; alloc_fifo free[RESERVE_NR]; alloc_fifo free_inc; alloc_heap alloc_heap; @@ -1161,8 +1188,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + nbuckets * sizeof(struct bucket), GFP_KERNEL|__GFP_ZERO)) || - !(oldest_gens = kvpmalloc(nbuckets * sizeof(u8), - GFP_KERNEL|__GFP_ZERO)) || !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * sizeof(unsigned long), GFP_KERNEL|__GFP_ZERO)) || @@ -1197,9 +1222,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) memcpy(buckets->b, old_buckets->b, n * sizeof(struct bucket)); - memcpy(oldest_gens, - ca->oldest_gens, - n * sizeof(u8)); memcpy(buckets_nouse, ca->buckets_nouse, BITS_TO_LONGS(n) * sizeof(unsigned long)); @@ -1211,7 +1233,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) rcu_assign_pointer(ca->buckets[0], buckets); buckets = old_buckets; - swap(ca->oldest_gens, oldest_gens); swap(ca->buckets_nouse, buckets_nouse); swap(ca->buckets_written, buckets_written); @@ -1255,8 +1276,6 @@ err: BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); kvpfree(buckets_written, BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); - kvpfree(oldest_gens, - nbuckets * sizeof(u8)); if (buckets) 
call_rcu(&old_buckets->rcu, buckets_free_rcu); @@ -1276,7 +1295,6 @@ void bch2_dev_buckets_free(struct bch_dev *ca) BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); kvpfree(ca->buckets_nouse, BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); - kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8)); kvpfree(rcu_dereference_protected(ca->buckets[0], 1), sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket)); diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 19cf6525..0725aa94 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -16,13 +16,14 @@ #define bucket_cmpxchg(g, new, expr) \ ({ \ + struct bucket *_g = g; \ u64 _v = atomic64_read(&(g)->_mark.v); \ struct bucket_mark _old; \ \ do { \ (new).v.counter = _old.v.counter = _v; \ expr; \ - } while ((_v = atomic64_cmpxchg(&(g)->_mark.v, \ + } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v, \ _old.v.counter, \ (new).v.counter)) != _old.v.counter);\ _old; \ @@ -56,18 +57,6 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b) return __bucket(ca, b, false); } -static inline void bucket_set_dirty(struct bch_dev *ca, size_t b) -{ - struct bucket *g; - struct bucket_mark m; - - rcu_read_lock(); - g = bucket(ca, b); - bucket_cmpxchg(g, m, m.dirty = true); - rcu_read_unlock(); - -} - static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca, size_t b, int rw) { @@ -86,7 +75,9 @@ static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw) static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b) { - return bucket(ca, b)->mark.gen - ca->oldest_gens[b]; + struct bucket *g = bucket(ca, b); + + return g->mark.gen - g->oldest_gen; } static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, @@ -96,9 +87,10 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, } static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, - const struct bch_extent_ptr *ptr) + const struct bch_extent_ptr *ptr, + bool gc) { - return bucket(ca, PTR_BUCKET_NR(ca, ptr)); + return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc); } static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, @@ -219,31 +211,28 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca) /* Filesystem usage: */ +static inline unsigned fs_usage_u64s(struct bch_fs *c) +{ + + return sizeof(struct bch_fs_usage) / sizeof(u64) + + READ_ONCE(c->replicas.nr); +} + static inline struct bch_fs_usage *bch2_fs_usage_get_scratch(struct bch_fs *c) { - struct bch_fs_usage *ret; - - ret = this_cpu_ptr(c->usage_scratch); - - memset(ret, 0, sizeof(*ret) + c->replicas.nr * sizeof(u64)); + struct bch_fs_usage *ret = this_cpu_ptr(c->usage_scratch); + memset(ret, 0, fs_usage_u64s(c) * sizeof(u64)); return ret; } struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *); -u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); +u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage *); struct bch_fs_usage_short bch2_fs_usage_read_short(struct bch_fs *); -static inline u64 bch2_fs_sectors_free(struct bch_fs *c) -{ - struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); - - return usage.capacity - usage.used; -} - /* key/bucket marking: */ void bch2_bucket_seq_cleanup(struct bch_fs *); @@ -257,8 +246,8 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); -#define BCH_BUCKET_MARK_NOATOMIC (1 << 0) -#define BCH_BUCKET_MARK_GC (1 << 1) +#define BCH_BUCKET_MARK_GC (1 << 0) +#define 
BCH_BUCKET_MARK_NOATOMIC (1 << 1) int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, bool, s64, struct gc_pos, @@ -268,7 +257,7 @@ int bch2_mark_key(struct bch_fs *, struct bkey_s_c, struct bch_fs_usage *, u64, unsigned); void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *); int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, - struct disk_reservation *, struct gc_pos); + struct disk_reservation *); /* disk reservations: */ @@ -282,8 +271,6 @@ static inline void bch2_disk_reservation_put(struct bch_fs *c, } #define BCH_DISK_RESERVATION_NOFAIL (1 << 0) -#define BCH_DISK_RESERVATION_GC_LOCK_HELD (1 << 1) -#define BCH_DISK_RESERVATION_BTREE_LOCKS_HELD (1 << 2) int bch2_disk_reservation_add(struct bch_fs *, struct disk_reservation *, diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 56863c23..869a1314 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -38,6 +38,7 @@ struct bucket { }; u16 io_time[2]; + u8 oldest_gen; unsigned gen_valid:1; }; @@ -62,35 +63,33 @@ struct bch_dev_usage { struct bch_fs_usage { /* all fields are in units of 512 byte sectors: */ - /* summarized: */ - struct bch_fs_usage_summarized { - u64 online_reserved; + u64 online_reserved; - /* fields after online_reserved are cleared/recalculated by gc: */ - u64 gc_start[0]; + /* fields after online_reserved are cleared/recalculated by gc: */ + u64 gc_start[0]; - u64 hidden; - u64 data; - u64 cached; - u64 reserved; - u64 nr_inodes; + u64 hidden; + u64 data; + u64 cached; + u64 reserved; + u64 nr_inodes; - /* XXX: add stats for compression ratio */ + /* XXX: add stats for compression ratio */ #if 0 - u64 uncompressed; - u64 compressed; + u64 uncompressed; + u64 compressed; #endif - } s; /* broken out: */ u64 persistent_reserved[BCH_REPLICAS_MAX]; - u64 data[]; + u64 replicas[]; }; struct bch_fs_usage_short { u64 capacity; u64 used; + u64 free; u64 nr_inodes; }; diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index b84ae5c9..4e33e7b8 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -402,10 +402,10 @@ static long bch2_ioctl_usage(struct bch_fs *c, if (!src) return -ENOMEM; - percpu_up_read_preempt_enable(&c->mark_lock); + dst.used = bch2_fs_sectors_used(c, src); + dst.online_reserved = src->online_reserved; - dst.used = bch2_fs_sectors_used(c, *src); - dst.online_reserved = src->s.online_reserved; + percpu_up_read_preempt_enable(&c->mark_lock); for (i = 0; i < BCH_REPLICAS_MAX; i++) { dst.persistent_reserved[i] = diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 0f075fa1..369b100a 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -979,10 +979,8 @@ bch2_extent_can_insert(struct btree_insert *trans, if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && (sectors = bch2_extent_is_compressed(k))) { - int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD; - - if (trans->flags & BTREE_INSERT_NOFAIL) - flags |= BCH_DISK_RESERVATION_NOFAIL; + int flags = trans->flags & BTREE_INSERT_NOFAIL + ? 
BCH_DISK_RESERVATION_NOFAIL : 0; switch (bch2_disk_reservation_add(trans->c, trans->disk_res, @@ -991,8 +989,6 @@ bch2_extent_can_insert(struct btree_insert *trans, break; case -ENOSPC: return BTREE_INSERT_ENOSPC; - case -EINTR: - return BTREE_INSERT_NEED_GC_LOCK; default: BUG(); } diff --git a/libbcachefs/fifo.h b/libbcachefs/fifo.h index 9715ddbd..0982af02 100644 --- a/libbcachefs/fifo.h +++ b/libbcachefs/fifo.h @@ -100,7 +100,7 @@ do { \ ({ \ bool _r = !fifo_empty((fifo)); \ if (_r) \ - (i) = (fifo)->data[--(fifo)->back & (fifo)->mask] \ + (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \ _r; \ }) diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 8ff8cfa8..f108a282 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -17,23 +17,14 @@ #include -static bool journal_entry_is_open(struct journal *j) +static bool __journal_entry_is_open(union journal_res_state state) { - return j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; + return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; } -void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set) +static bool journal_entry_is_open(struct journal *j) { - struct journal_buf *w = journal_prev_buf(j); - - atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count); - - if (!need_write_just_set && - test_bit(JOURNAL_NEED_WRITE, &j->flags)) - bch2_time_stats_update(j->delay_time, - j->need_write_time); - - closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); + return __journal_entry_is_open(j->reservations); } static void journal_pin_new_entry(struct journal *j, int count) @@ -77,39 +68,71 @@ static inline bool journal_entry_empty(struct jset *j) return true; } -static enum { - JOURNAL_ENTRY_ERROR, - JOURNAL_ENTRY_INUSE, - JOURNAL_ENTRY_CLOSED, - JOURNAL_UNLOCKED, -} journal_buf_switch(struct journal *j, bool need_write_just_set) +void bch2_journal_halt(struct journal *j) +{ + union journal_res_state old, new; + u64 v = atomic64_read(&j->reservations.counter); + + do { + old.v = new.v = v; + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) + return; + + new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + + journal_wake(j); + closure_wake_up(&journal_cur_buf(j)->wait); +} + +/* journal entry close/open: */ + +void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set) +{ + if (!need_write_just_set && + test_bit(JOURNAL_NEED_WRITE, &j->flags)) + bch2_time_stats_update(j->delay_time, + j->need_write_time); + + clear_bit(JOURNAL_NEED_WRITE, &j->flags); + + closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); +} + +/* + * Returns true if journal entry is now closed: + */ +static bool __journal_entry_close(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf = journal_cur_buf(j); union journal_res_state old, new; u64 v = atomic64_read(&j->reservations.counter); + bool set_need_write = false; + unsigned sectors; lockdep_assert_held(&j->lock); do { old.v = new.v = v; if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL) - return JOURNAL_ENTRY_CLOSED; + return true; if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) { /* this entry will never be written: */ closure_wake_up(&buf->wait); - return JOURNAL_ENTRY_ERROR; + return true; + } + + if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) { + set_bit(JOURNAL_NEED_WRITE, &j->flags); + j->need_write_time = local_clock(); + set_need_write = true; } if 
(new.prev_buf_unwritten) - return JOURNAL_ENTRY_INUSE; - - /* - * avoid race between setting buf->data->u64s and - * journal_res_put starting write: - */ - journal_state_inc(&new); + return false; new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; new.idx++; @@ -119,15 +142,12 @@ static enum { } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - clear_bit(JOURNAL_NEED_WRITE, &j->flags); - buf->data->u64s = cpu_to_le32(old.cur_entry_offset); - j->prev_buf_sectors = - vstruct_blocks_plus(buf->data, c->block_bits, - buf->u64s_reserved) * - c->opts.block_size; - BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors); + sectors = vstruct_blocks_plus(buf->data, c->block_bits, + buf->u64s_reserved) << c->block_bits; + BUG_ON(sectors > buf->sectors); + buf->sectors = sectors; bkey_extent_init(&buf->key); @@ -150,7 +170,6 @@ static enum { * Hence, we want update/set last_seq on the current journal entry right * before we open a new one: */ - bch2_journal_reclaim_fast(j); buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); if (journal_entry_empty(buf->data)) @@ -163,32 +182,22 @@ static enum { bch2_journal_buf_init(j); cancel_delayed_work(&j->write_work); - spin_unlock(&j->lock); - /* ugh - might be called from __journal_res_get() under wait_event() */ - __set_current_state(TASK_RUNNING); - bch2_journal_buf_put(j, old.idx, need_write_just_set); + bch2_journal_space_available(j); - return JOURNAL_UNLOCKED; + bch2_journal_buf_put(j, old.idx, set_need_write); + return true; } -void bch2_journal_halt(struct journal *j) +static bool journal_entry_close(struct journal *j) { - union journal_res_state old, new; - u64 v = atomic64_read(&j->reservations.counter); + bool ret; - do { - old.v = new.v = v; - if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) - return; + spin_lock(&j->lock); + ret = __journal_entry_close(j); + spin_unlock(&j->lock); - new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL; - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); - - journal_wake(j); - closure_wake_up(&journal_cur_buf(j)->wait); - closure_wake_up(&journal_prev_buf(j)->wait); + return ret; } /* @@ -196,46 +205,39 @@ void bch2_journal_halt(struct journal *j) * journal reservation - journal entry is open means journal is dirty: * * returns: - * 1: success - * 0: journal currently full (must wait) - * -EROFS: insufficient rw devices - * -EIO: journal error + * 0: success + * -ENOSPC: journal currently full, must invoke reclaim + * -EAGAIN: journal blocked, must wait + * -EROFS: insufficient rw devices or journal error */ static int journal_entry_open(struct journal *j) { struct journal_buf *buf = journal_cur_buf(j); union journal_res_state old, new; - ssize_t u64s; - int sectors; + int u64s; u64 v; lockdep_assert_held(&j->lock); BUG_ON(journal_entry_is_open(j)); - if (!fifo_free(&j->pin)) - return 0; + if (j->blocked) + return -EAGAIN; - sectors = bch2_journal_entry_sectors(j); - if (sectors <= 0) - return sectors; + if (j->cur_entry_error) + return j->cur_entry_error; + + BUG_ON(!j->cur_entry_sectors); - buf->disk_sectors = sectors; buf->u64s_reserved = j->entry_u64s_reserved; + buf->disk_sectors = j->cur_entry_sectors; + buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); - sectors = min_t(unsigned, sectors, buf->size >> 9); - j->cur_buf_sectors = sectors; - - u64s = (sectors << 9) / sizeof(u64); - - /* Subtract the journal header */ - u64s -= sizeof(struct jset) / sizeof(u64); - u64s -= buf->u64s_reserved; - u64s = max_t(ssize_t, 0L, u64s); - - BUG_ON(u64s 
>= JOURNAL_ENTRY_CLOSED_VAL); + u64s = (int) (buf->sectors << 9) / sizeof(u64) - + journal_entry_overhead(j); + u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); if (u64s <= le32_to_cpu(buf->data->u64s)) - return 0; + return -ENOSPC; /* * Must be set before marking the journal entry as open: @@ -246,11 +248,14 @@ static int journal_entry_open(struct journal *j) do { old.v = new.v = v; + EBUG_ON(journal_state_count(new, new.idx)); + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) - return -EIO; + return -EROFS; /* Handle any already added entries */ new.cur_entry_offset = le32_to_cpu(buf->data->u64s); + journal_state_inc(&new); } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); @@ -263,37 +268,22 @@ static int journal_entry_open(struct journal *j) &j->write_work, msecs_to_jiffies(j->write_delay_ms)); journal_wake(j); - return 1; + return 0; } -static bool __journal_entry_close(struct journal *j) +static bool journal_quiesced(struct journal *j) { - bool set_need_write; + union journal_res_state state = READ_ONCE(j->reservations); + bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state); - if (!journal_entry_is_open(j)) { - spin_unlock(&j->lock); - return true; - } - - set_need_write = !test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags); - if (set_need_write) - j->need_write_time = local_clock(); - - switch (journal_buf_switch(j, set_need_write)) { - case JOURNAL_ENTRY_INUSE: - spin_unlock(&j->lock); - return false; - default: - spin_unlock(&j->lock); - case JOURNAL_UNLOCKED: - return true; - } + if (!ret) + journal_entry_close(j); + return ret; } -static bool journal_entry_close(struct journal *j) +static void journal_quiesce(struct journal *j) { - spin_lock(&j->lock); - return __journal_entry_close(j); + wait_event(j->wait, journal_quiesced(j)); } static void journal_write_work(struct work_struct *work) @@ -337,7 +327,11 @@ retry: if (journal_res_get_fast(j, res, flags)) return 0; + if (bch2_journal_error(j)) + return -EROFS; + spin_lock(&j->lock); + /* * Recheck after taking the lock, so we don't race with another thread * that just did journal_entry_open() and call journal_entry_close() @@ -355,56 +349,43 @@ retry: */ buf = journal_cur_buf(j); if (journal_entry_is_open(j) && - buf->size >> 9 < buf->disk_sectors && - buf->size < JOURNAL_ENTRY_SIZE_MAX) - j->buf_size_want = max(j->buf_size_want, buf->size << 1); + buf->buf_size >> 9 < buf->disk_sectors && + buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) + j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); - /* - * Close the current journal entry if necessary, then try to start a new - * one: - */ - switch (journal_buf_switch(j, false)) { - case JOURNAL_ENTRY_ERROR: - spin_unlock(&j->lock); - return -EROFS; - case JOURNAL_ENTRY_INUSE: + if (journal_entry_is_open(j) && + !__journal_entry_close(j)) { /* - * The current journal entry is still open, but we failed to get - * a journal reservation because there's not enough space in it, - * and we can't close it and start another because we haven't - * finished writing out the previous entry: + * We failed to get a reservation on the current open journal + * entry because it's full, and we can't close it because + * there's still a previous one in flight: */ - spin_unlock(&j->lock); trace_journal_entry_full(c); - goto blocked; - case JOURNAL_ENTRY_CLOSED: - break; - case JOURNAL_UNLOCKED: - goto retry; + ret = -EAGAIN; + } else { + ret = journal_entry_open(j); } - /* We now have a new, closed journal buf - see if we can open it: */ - 
ret = journal_entry_open(j); + if ((ret == -EAGAIN || ret == -ENOSPC) && + !j->res_get_blocked_start) + j->res_get_blocked_start = local_clock() ?: 1; + spin_unlock(&j->lock); - if (ret < 0) - return ret; - if (ret) + if (!ret) goto retry; + if (ret == -ENOSPC) { + /* + * Journal is full - can't rely on reclaim from work item due to + * freezing: + */ + trace_journal_full(c); + if (!(flags & JOURNAL_RES_GET_NONBLOCK)) + bch2_journal_reclaim_work(&j->reclaim_work.work); + ret = -EAGAIN; + } - /* Journal's full, we have to wait */ - - /* - * Direct reclaim - can't rely on reclaim from work item - * due to freezing.. - */ - bch2_journal_reclaim_work(&j->reclaim_work.work); - - trace_journal_full(c); -blocked: - if (!j->res_get_blocked_start) - j->res_get_blocked_start = local_clock() ?: 1; - return -EAGAIN; + return ret; } /* @@ -422,7 +403,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, { int ret; - wait_event(j->wait, + closure_wait_event(&j->async_wait, (ret = __journal_res_get(j, res, flags)) != -EAGAIN || (flags & JOURNAL_RES_GET_NONBLOCK)); return ret; @@ -441,9 +422,9 @@ void bch2_journal_entry_res_resize(struct journal *j, j->entry_u64s_reserved += d; if (d <= 0) - goto out_unlock; + goto out; - j->cur_entry_u64s -= d; + j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); smp_mb(); state = READ_ONCE(j->reservations); @@ -454,15 +435,12 @@ void bch2_journal_entry_res_resize(struct journal *j, * Not enough room in current journal entry, have to flush it: */ __journal_entry_close(j); - goto out; + } else { + journal_cur_buf(j)->u64s_reserved += d; } - - journal_cur_buf(j)->u64s_reserved += d; -out_unlock: - spin_unlock(&j->lock); out: + spin_unlock(&j->lock); res->u64s += d; - return; } /* journal flushing: */ @@ -492,47 +470,47 @@ int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl) { struct bch_fs *c = container_of(j, struct bch_fs, journal); int ret; -retry: + spin_lock(&j->lock); - if (seq < journal_cur_seq(j) || + /* + * Can't try to open more than one sequence number ahead: + */ + BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j)); + + if (journal_cur_seq(j) > seq || journal_entry_is_open(j)) { spin_unlock(&j->lock); return 0; } - if (journal_cur_seq(j) < seq) { - switch (journal_buf_switch(j, false)) { - case JOURNAL_ENTRY_ERROR: - spin_unlock(&j->lock); - return -EROFS; - case JOURNAL_ENTRY_INUSE: - /* haven't finished writing out the previous one: */ - trace_journal_entry_full(c); - goto blocked; - case JOURNAL_ENTRY_CLOSED: - break; - case JOURNAL_UNLOCKED: - goto retry; - } + if (journal_cur_seq(j) < seq && + !__journal_entry_close(j)) { + /* haven't finished writing out the previous one: */ + trace_journal_entry_full(c); + ret = -EAGAIN; + } else { + BUG_ON(journal_cur_seq(j) != seq); + + ret = journal_entry_open(j); } - BUG_ON(journal_cur_seq(j) < seq); - - ret = journal_entry_open(j); - if (ret) { - spin_unlock(&j->lock); - return ret < 0 ? 
ret : 0; - } -blocked: - if (!j->res_get_blocked_start) + if ((ret == -EAGAIN || ret == -ENOSPC) && + !j->res_get_blocked_start) j->res_get_blocked_start = local_clock() ?: 1; - closure_wait(&j->async_wait, cl); + if (ret == -EAGAIN || ret == -ENOSPC) + closure_wait(&j->async_wait, cl); + spin_unlock(&j->lock); - bch2_journal_reclaim_work(&j->reclaim_work.work); - return -EAGAIN; + if (ret == -ENOSPC) { + trace_journal_full(c); + bch2_journal_reclaim_work(&j->reclaim_work.work); + ret = -EAGAIN; + } + + return ret; } static int journal_seq_error(struct journal *j, u64 seq) @@ -615,8 +593,7 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, if (seq == journal_cur_seq(j)) __journal_entry_close(j); - else - spin_unlock(&j->lock); + spin_unlock(&j->lock); } static int journal_seq_flushed(struct journal *j, u64 seq) @@ -628,8 +605,7 @@ static int journal_seq_flushed(struct journal *j, u64 seq) if (seq == journal_cur_seq(j)) __journal_entry_close(j); - else - spin_unlock(&j->lock); + spin_unlock(&j->lock); return ret; } @@ -721,6 +697,26 @@ int bch2_journal_flush(struct journal *j) return bch2_journal_flush_seq(j, seq); } +/* block/unlock the journal: */ + +void bch2_journal_unblock(struct journal *j) +{ + spin_lock(&j->lock); + j->blocked--; + spin_unlock(&j->lock); + + journal_wake(j); +} + +void bch2_journal_block(struct journal *j) +{ + spin_lock(&j->lock); + j->blocked++; + spin_unlock(&j->lock); + + journal_quiesce(j); +} + /* allocate journal on a device: */ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, @@ -743,7 +739,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, goto err; journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, - nr + sizeof(*journal_buckets) / sizeof(u64)); + nr + sizeof(*journal_buckets) / sizeof(u64)); if (!journal_buckets) goto err; @@ -806,9 +802,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, ja->nr++; bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL, - ca->mi.bucket_size, - gc_phase(GC_PHASE_SB), - 0); + ca->mi.bucket_size, + gc_phase(GC_PHASE_SB), + 0); if (c) { spin_unlock(&c->journal.lock); @@ -859,7 +855,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, */ if (bch2_disk_reservation_get(c, &disk_res, - bucket_to_sector(ca, nr - ja->nr), 1, 0)) { + bucket_to_sector(ca, nr - ja->nr), 1, 0)) { mutex_unlock(&c->sb_lock); return -ENOSPC; } @@ -930,8 +926,7 @@ void bch2_fs_journal_stop(struct journal *j) c->btree_roots_dirty) bch2_journal_meta(j); - BUG_ON(journal_entry_is_open(j) || - j->reservations.prev_buf_unwritten); + journal_quiesce(j); BUG_ON(!bch2_journal_error(j) && test_bit(JOURNAL_NOT_EMPTY, &j->flags)); @@ -957,7 +952,7 @@ void bch2_fs_journal_start(struct journal *j) journal_pin_new_entry(j, 0); /* - * journal_buf_switch() only inits the next journal entry when it + * __journal_entry_close() only inits the next journal entry when it * closes an open journal entry - the very first journal entry gets * initialized here: */ @@ -966,6 +961,7 @@ void bch2_fs_journal_start(struct journal *j) c->last_bucket_seq_cleanup = journal_cur_seq(j); + bch2_journal_space_available(j); spin_unlock(&j->lock); /* @@ -975,7 +971,7 @@ void bch2_fs_journal_start(struct journal *j) */ bch2_journal_seq_blacklist_write(j); - queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0); + queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0); } /* init/exit: */ @@ -1021,8 +1017,8 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct 
bch_sb *sb) void bch2_fs_journal_exit(struct journal *j) { - kvpfree(j->buf[1].data, j->buf[1].size); - kvpfree(j->buf[0].data, j->buf[0].size); + kvpfree(j->buf[1].data, j->buf[1].buf_size); + kvpfree(j->buf[0].data, j->buf[0].buf_size); free_fifo(&j->pin); } @@ -1046,8 +1042,8 @@ int bch2_fs_journal_init(struct journal *j) lockdep_init_map(&j->res_map, "journal res", &res_key, 0); - j->buf[0].size = JOURNAL_ENTRY_SIZE_MIN; - j->buf[1].size = JOURNAL_ENTRY_SIZE_MIN; + j->buf[0].buf_size = JOURNAL_ENTRY_SIZE_MIN; + j->buf[1].buf_size = JOURNAL_ENTRY_SIZE_MIN; j->write_delay_ms = 1000; j->reclaim_delay_ms = 100; @@ -1060,8 +1056,8 @@ int bch2_fs_journal_init(struct journal *j) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || - !(j->buf[0].data = kvpmalloc(j->buf[0].size, GFP_KERNEL)) || - !(j->buf[1].data = kvpmalloc(j->buf[1].size, GFP_KERNEL))) { + !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) || + !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) { ret = -ENOMEM; goto out; } @@ -1078,35 +1074,54 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) { struct printbuf out = _PBUF(buf, PAGE_SIZE); struct bch_fs *c = container_of(j, struct bch_fs, journal); - union journal_res_state *s = &j->reservations; + union journal_res_state s; struct bch_dev *ca; unsigned iter; rcu_read_lock(); spin_lock(&j->lock); + s = READ_ONCE(j->reservations); pr_buf(&out, "active journal entries:\t%llu\n" "seq:\t\t\t%llu\n" "last_seq:\t\t%llu\n" "last_seq_ondisk:\t%llu\n" - "reservation count:\t%u\n" - "reservation offset:\t%u\n" - "current entry u64s:\t%u\n" - "io in flight:\t\t%i\n" - "need write:\t\t%i\n" - "dirty:\t\t\t%i\n" - "replay done:\t\t%i\n", + "current entry:\t\t", fifo_used(&j->pin), journal_cur_seq(j), journal_last_seq(j), - j->last_seq_ondisk, - journal_state_count(*s, s->idx), - s->cur_entry_offset, - j->cur_entry_u64s, - s->prev_buf_unwritten, + j->last_seq_ondisk); + + switch (s.cur_entry_offset) { + case JOURNAL_ENTRY_ERROR_VAL: + pr_buf(&out, "error\n"); + break; + case JOURNAL_ENTRY_CLOSED_VAL: + pr_buf(&out, "closed\n"); + break; + default: + pr_buf(&out, "%u/%u\n", + s.cur_entry_offset, + j->cur_entry_u64s); + break; + } + + pr_buf(&out, + "current entry refs:\t%u\n" + "prev entry unwritten:\t", + journal_state_count(s, s.idx)); + + if (s.prev_buf_unwritten) + pr_buf(&out, "yes, ref %u\n", + journal_state_count(s, !s.idx)); + else + pr_buf(&out, "no\n"); + + pr_buf(&out, + "need write:\t\t%i\n" + "replay done:\t\t%i\n", test_bit(JOURNAL_NEED_WRITE, &j->flags), - journal_entry_is_open(j), test_bit(JOURNAL_REPLAY_DONE, &j->flags)); for_each_member_device_rcu(ca, c, iter, @@ -1119,9 +1134,12 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) pr_buf(&out, "dev %u:\n" "\tnr\t\t%u\n" + "\tavailable\t%u:%u\n" "\tcur_idx\t\t%u (seq %llu)\n" "\tlast_idx\t%u (seq %llu)\n", iter, ja->nr, + bch2_journal_dev_buckets_available(j, ja), + ja->sectors_free, ja->cur_idx, ja->bucket_seq[ja->cur_idx], ja->last_idx, ja->bucket_seq[ja->last_idx]); } diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 50d864a3..71929bd6 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -178,6 +178,11 @@ static inline unsigned jset_u64s(unsigned u64s) return u64s + sizeof(struct jset_entry) / sizeof(u64); } +static inline int journal_entry_overhead(struct journal *j) +{ + return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved; +} + static inline struct jset_entry * 
bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) { @@ -222,7 +227,7 @@ static inline void bch2_journal_add_keys(struct journal *j, struct journal_res * id, 0, k, k->k.u64s); } -void bch2_journal_buf_put_slowpath(struct journal *, bool); +void __bch2_journal_buf_put(struct journal *, bool); static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, bool need_write_just_set) @@ -233,17 +238,10 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, .buf0_count = idx == 0, .buf1_count = idx == 1, }).v, &j->reservations.counter); - - EBUG_ON(s.idx != idx && !s.prev_buf_unwritten); - - /* - * Do not initiate a journal write if the journal is in an error state - * (previous journal entry write may have failed) - */ - if (s.idx != idx && - !journal_state_count(s, idx) && - s.cur_entry_offset != JOURNAL_ENTRY_ERROR_VAL) - bch2_journal_buf_put_slowpath(j, need_write_just_set); + if (!journal_state_count(s, idx)) { + EBUG_ON(s.idx == idx || !s.prev_buf_unwritten); + __bch2_journal_buf_put(j, need_write_just_set); + } } /* @@ -291,6 +289,8 @@ static inline int journal_res_get_fast(struct journal *j, if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) return 0; + EBUG_ON(!journal_state_count(new, new.idx)); + if (flags & JOURNAL_RES_GET_CHECK) return 1; @@ -330,6 +330,8 @@ out: return 0; } +/* journal_entry_res: */ + void bch2_journal_entry_res_resize(struct journal *, struct journal_entry_res *, unsigned); @@ -367,6 +369,9 @@ static inline void bch2_journal_set_replay_done(struct journal *j) set_bit(JOURNAL_REPLAY_DONE, &j->flags); } +void bch2_journal_unblock(struct journal *); +void bch2_journal_block(struct journal *); + ssize_t bch2_journal_print_debug(struct journal *, char *); ssize_t bch2_journal_print_pins(struct journal *, char *); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 0f1f8e15..16cb6be8 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -825,7 +825,6 @@ fsck_err: int bch2_journal_replay(struct bch_fs *c, struct list_head *list) { struct journal *j = &c->journal; - struct journal_entry_pin_list *pin_list; struct bkey_i *k, *_n; struct jset_entry *entry; struct journal_replay *i, *n; @@ -854,7 +853,8 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) ret = bch2_btree_insert(c, entry->btree_id, k, &disk_res, NULL, BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_REPLAY); + BTREE_INSERT_JOURNAL_REPLAY| + BTREE_INSERT_NOMARK); } if (ret) { @@ -866,10 +866,7 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) cond_resched(); } - pin_list = journal_seq_pin(j, j->replay_journal_seq); - - if (atomic_dec_and_test(&pin_list->count)) - journal_wake(j); + bch2_journal_pin_put(j, j->replay_journal_seq); } j->replay_journal_seq = 0; @@ -884,82 +881,6 @@ err: /* journal write: */ -static unsigned journal_dev_buckets_available(struct journal *j, - struct journal_device *ja) -{ - unsigned next = (ja->cur_idx + 1) % ja->nr; - unsigned available = (ja->last_idx + ja->nr - next) % ja->nr; - - /* - * Don't use the last bucket unless writing the new last_seq - * will make another bucket available: - */ - if (available && - journal_last_seq(j) <= ja->bucket_seq[ja->last_idx]) - --available; - - return available; -} - -/* returns number of sectors available for next journal entry: */ -int bch2_journal_entry_sectors(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; - unsigned sectors_available = UINT_MAX; - 
unsigned i, nr_online = 0, nr_devs = 0; - - lockdep_assert_held(&j->lock); - - rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_JOURNAL]) { - struct journal_device *ja = &ca->journal; - unsigned buckets_this_device, sectors_this_device; - - if (!ja->nr) - continue; - - buckets_this_device = journal_dev_buckets_available(j, ja); - sectors_this_device = ja->sectors_free; - - nr_online++; - - /* - * We that we don't allocate the space for a journal entry - * until we write it out - thus, account for it here: - */ - if (j->prev_buf_sectors >= sectors_this_device) { - if (!buckets_this_device) - continue; - - buckets_this_device--; - sectors_this_device = ca->mi.bucket_size; - } - - sectors_this_device -= j->prev_buf_sectors; - - if (buckets_this_device) - sectors_this_device = ca->mi.bucket_size; - - if (!sectors_this_device) - continue; - - sectors_available = min(sectors_available, - sectors_this_device); - nr_devs++; - } - rcu_read_unlock(); - - if (nr_online < c->opts.metadata_replicas_required) - return -EROFS; - - if (nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas)) - return 0; - - return sectors_available; -} - static void __journal_write_alloc(struct journal *j, struct journal_buf *w, struct dev_alloc_list *devs_sorted, @@ -1033,7 +954,6 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &c->rw_devs[BCH_DATA_JOURNAL]); - spin_lock(&j->lock); __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); @@ -1049,7 +969,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, if (sectors > ja->sectors_free && sectors <= ca->mi.bucket_size && - journal_dev_buckets_available(j, ja)) { + bch2_journal_dev_buckets_available(j, ja)) { ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ja->sectors_free = ca->mi.bucket_size; } @@ -1058,10 +978,6 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); done: - if (replicas >= replicas_want) - j->prev_buf_sectors = 0; - - spin_unlock(&j->lock); rcu_read_unlock(); return replicas >= c->opts.metadata_replicas_required ? 
0 : -EROFS; @@ -1116,17 +1032,17 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) unsigned new_size = READ_ONCE(j->buf_size_want); void *new_buf; - if (buf->size >= new_size) + if (buf->buf_size >= new_size) return; new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN); if (!new_buf) return; - memcpy(new_buf, buf->data, buf->size); - kvpfree(buf->data, buf->size); + memcpy(new_buf, buf->data, buf->buf_size); + kvpfree(buf->data, buf->buf_size); buf->data = new_buf; - buf->size = new_size; + buf->buf_size = new_size; } static void journal_write_done(struct closure *cl) @@ -1166,7 +1082,7 @@ static void journal_write_done(struct closure *cl) * Must come before signaling write completion, for * bch2_fs_journal_stop(): */ - mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0); + mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0); out: /* also must come before signalling write completion: */ closure_debug_destroy(cl); @@ -1220,20 +1136,22 @@ void bch2_journal_write(struct closure *cl) struct bch_extent_ptr *ptr; bool validate_before_checksum = false; unsigned i, sectors, bytes, u64s; + int ret; + + bch2_journal_pin_put(j, le64_to_cpu(w->data->seq)); journal_buf_realloc(j, w); jset = w->data; j->write_start_time = local_clock(); - start = vstruct_last(w->data); + start = vstruct_last(jset); end = bch2_journal_super_entries_add_common(c, start); u64s = (u64 *) end - (u64 *) start; BUG_ON(u64s > j->entry_u64s_reserved); - le32_add_cpu(&w->data->u64s, u64s); - BUG_ON(vstruct_sectors(jset, c->block_bits) > - w->disk_sectors); + le32_add_cpu(&jset->u64s, u64s); + BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors); journal_write_compact(jset); @@ -1271,12 +1189,28 @@ void bch2_journal_write(struct closure *cl) goto err; sectors = vstruct_sectors(jset, c->block_bits); - BUG_ON(sectors > j->prev_buf_sectors); + BUG_ON(sectors > w->sectors); - bytes = vstruct_bytes(w->data); - memset((void *) w->data + bytes, 0, (sectors << 9) - bytes); + bytes = vstruct_bytes(jset); + memset((void *) jset + bytes, 0, (sectors << 9) - bytes); - if (journal_write_alloc(j, w, sectors)) { + spin_lock(&j->lock); + ret = journal_write_alloc(j, w, sectors); + + /* + * write is allocated, no longer need to account for it in + * bch2_journal_space_available(): + */ + w->sectors = 0; + + /* + * journal entry has been compacted and allocated, recalculate space + * available: + */ + bch2_journal_space_available(j); + spin_unlock(&j->lock); + + if (ret) { bch2_journal_halt(j); bch_err(c, "Unable to allocate journal write"); bch2_fatal_error(c); @@ -1316,7 +1250,7 @@ void bch2_journal_write(struct closure *cl) trace_journal_write(bio); closure_bio_submit(bio, cl); - ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq); + ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq); } for_each_rw_member(ca, c, i) diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h index d0a652cf..ec7b49b8 100644 --- a/libbcachefs/journal_io.h +++ b/libbcachefs/journal_io.h @@ -39,7 +39,6 @@ int bch2_journal_read(struct bch_fs *, struct list_head *); void bch2_journal_entries_free(struct list_head *); int bch2_journal_replay(struct bch_fs *, struct list_head *); -int bch2_journal_entry_sectors(struct journal *); void bch2_journal_write(struct closure *); #endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index a795e888..b928b8c8 100644 --- a/libbcachefs/journal_reclaim.c +++ 
b/libbcachefs/journal_reclaim.c @@ -1,15 +1,213 @@ #include "bcachefs.h" #include "journal.h" +#include "journal_io.h" #include "journal_reclaim.h" #include "replicas.h" #include "super.h" +/* Free space calculations: */ + +unsigned bch2_journal_dev_buckets_available(struct journal *j, + struct journal_device *ja) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + unsigned next = (ja->cur_idx + 1) % ja->nr; + unsigned available = (ja->last_idx + ja->nr - next) % ja->nr; + + /* + * Allocator startup needs some journal space before we can do journal + * replay: + */ + if (available && + test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) + available--; + + /* + * Don't use the last bucket unless writing the new last_seq + * will make another bucket available: + */ + if (available && + journal_last_seq(j) <= ja->bucket_seq[ja->last_idx]) + --available; + + return available; +} + +void bch2_journal_space_available(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + unsigned sectors_next_entry = UINT_MAX; + unsigned sectors_total = UINT_MAX; + unsigned max_entry_size = min(j->buf[0].buf_size >> 9, + j->buf[1].buf_size >> 9); + unsigned i, nr_online = 0, nr_devs = 0; + unsigned unwritten_sectors = j->reservations.prev_buf_unwritten + ? journal_prev_buf(j)->sectors + : 0; + int ret = 0; + + lockdep_assert_held(&j->lock); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, + &c->rw_devs[BCH_DATA_JOURNAL]) { + struct journal_device *ja = &ca->journal; + unsigned buckets_this_device, sectors_this_device; + + if (!ja->nr) + continue; + + nr_online++; + + buckets_this_device = bch2_journal_dev_buckets_available(j, ja); + sectors_this_device = ja->sectors_free; + + /* + * We that we don't allocate the space for a journal entry + * until we write it out - thus, account for it here: + */ + if (unwritten_sectors >= sectors_this_device) { + if (!buckets_this_device) + continue; + + buckets_this_device--; + sectors_this_device = ca->mi.bucket_size; + } + + sectors_this_device -= unwritten_sectors; + + if (sectors_this_device < ca->mi.bucket_size && + buckets_this_device) { + buckets_this_device--; + sectors_this_device = ca->mi.bucket_size; + } + + if (!sectors_this_device) + continue; + + sectors_next_entry = min(sectors_next_entry, + sectors_this_device); + + sectors_total = min(sectors_total, + buckets_this_device * ca->mi.bucket_size + + sectors_this_device); + + max_entry_size = min_t(unsigned, max_entry_size, + ca->mi.bucket_size); + + nr_devs++; + } + rcu_read_unlock(); + + if (nr_online < c->opts.metadata_replicas_required) { + ret = -EROFS; + sectors_next_entry = 0; + } else if (!sectors_next_entry || + nr_devs < min_t(unsigned, nr_online, + c->opts.metadata_replicas)) { + ret = -ENOSPC; + sectors_next_entry = 0; + } else if (!fifo_free(&j->pin)) { + ret = -ENOSPC; + sectors_next_entry = 0; + } + + j->cur_entry_sectors = sectors_next_entry; + j->cur_entry_error = ret; + + if (!ret) + journal_wake(j); +} + +/* Discards - last part of journal reclaim: */ + +static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +{ + bool ret; + + spin_lock(&j->lock); + ret = ja->nr && + ja->last_idx != ja->cur_idx && + ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk; + spin_unlock(&j->lock); + + return ret; +} + +/* + * Advance ja->last_idx as long as it points to buckets that are no longer + * dirty, issuing discards if necessary: + */ +static void journal_do_discards(struct journal *j) +{ + struct bch_fs *c = 
container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + unsigned iter; + + mutex_lock(&j->reclaim_lock); + + for_each_rw_member(ca, c, iter) { + struct journal_device *ja = &ca->journal; + + while (should_discard_bucket(j, ja)) { + if (ca->mi.discard && + blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, + ja->buckets[ja->last_idx]), + ca->mi.bucket_size, GFP_NOIO, 0); + + spin_lock(&j->lock); + ja->last_idx = (ja->last_idx + 1) % ja->nr; + + bch2_journal_space_available(j); + spin_unlock(&j->lock); + } + } + + mutex_unlock(&j->reclaim_lock); +} + /* * Journal entry pinning - machinery for holding a reference on a given journal * entry, holding it open to ensure it gets replayed during recovery: */ +static void bch2_journal_reclaim_fast(struct journal *j) +{ + struct journal_entry_pin_list temp; + bool popped = false; + + lockdep_assert_held(&j->lock); + + /* + * Unpin journal entries whose reference counts reached zero, meaning + * all btree nodes got written out + */ + while (!fifo_empty(&j->pin) && + !atomic_read(&fifo_peek_front(&j->pin).count)) { + BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); + BUG_ON(!fifo_pop(&j->pin, temp)); + popped = true; + } + + if (popped) + bch2_journal_space_available(j); +} + +void bch2_journal_pin_put(struct journal *j, u64 seq) +{ + struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); + + if (atomic_dec_and_test(&pin_list->count)) { + spin_lock(&j->lock); + bch2_journal_reclaim_fast(j); + spin_unlock(&j->lock); + } +} + static inline void __journal_pin_add(struct journal *j, u64 seq, struct journal_entry_pin *pin, @@ -24,10 +222,7 @@ static inline void __journal_pin_add(struct journal *j, pin->seq = seq; pin->flush = flush_fn; - if (flush_fn) - list_add(&pin->list, &pin_list->list); - else - INIT_LIST_HEAD(&pin->list); + list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed); /* * If the journal is currently full, we might want to call flush_fn @@ -129,88 +324,55 @@ void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) * data off of a specific device: */ -/** - * bch2_journal_reclaim_fast - do the fast part of journal reclaim - * - * Called from IO submission context, does not block. Cleans up after btree - * write completions by advancing the journal pin and each cache's last_idx, - * kicking off discards and background reclaim as necessary. 
- */ -void bch2_journal_reclaim_fast(struct journal *j) -{ - struct journal_entry_pin_list temp; - bool popped = false; - - lockdep_assert_held(&j->lock); - - /* - * Unpin journal entries whose reference counts reached zero, meaning - * all btree nodes got written out - */ - while (!fifo_empty(&j->pin) && - !atomic_read(&fifo_peek_front(&j->pin).count)) { - BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); - BUG_ON(!fifo_pop(&j->pin, temp)); - popped = true; - } - - if (popped) - journal_wake(j); -} - -static void journal_pin_mark_flushing(struct journal *j, - struct journal_entry_pin *pin, - u64 seq) -{ - lockdep_assert_held(&j->reclaim_lock); - - list_move(&pin->list, &journal_seq_pin(j, seq)->flushed); - BUG_ON(j->flush_in_progress); - j->flush_in_progress = pin; -} - -static void journal_pin_flush(struct journal *j, - struct journal_entry_pin *pin, - u64 seq) -{ - pin->flush(j, pin, seq); - - BUG_ON(j->flush_in_progress != pin); - j->flush_in_progress = NULL; - wake_up(&j->pin_flush_wait); -} - static struct journal_entry_pin * -journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) +journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *ret = NULL; - /* no need to iterate over empty fifo entries: */ - bch2_journal_reclaim_fast(j); + spin_lock(&j->lock); + + BUG_ON(!atomic_read(&fifo_peek_front(&j->pin).count)); fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) - if (*seq > seq_to_flush || + if (*seq > max_seq || (ret = list_first_entry_or_null(&pin_list->list, struct journal_entry_pin, list))) break; - return ret; -} + if (ret) { + list_move(&ret->list, &pin_list->flushed); + BUG_ON(j->flush_in_progress); + j->flush_in_progress = ret; + j->last_flushed = jiffies; + } -static bool should_discard_bucket(struct journal *j, struct journal_device *ja) -{ - bool ret; - - spin_lock(&j->lock); - ret = ja->nr && - (ja->last_idx != ja->cur_idx && - ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk); spin_unlock(&j->lock); return ret; } +static void journal_flush_pins(struct journal *j, u64 seq_to_flush, + unsigned min_nr) +{ + struct journal_entry_pin *pin; + u64 seq; + + lockdep_assert_held(&j->reclaim_lock); + + while ((pin = journal_get_next_pin(j, min_nr + ? 
U64_MAX : seq_to_flush, &seq))) { + if (min_nr) + min_nr--; + + pin->flush(j, pin, seq); + + BUG_ON(j->flush_in_progress != pin); + j->flush_in_progress = NULL; + wake_up(&j->pin_flush_wait); + } +} + /** * bch2_journal_reclaim_work - free up journal buckets * @@ -235,104 +397,44 @@ void bch2_journal_reclaim_work(struct work_struct *work) struct bch_fs, journal.reclaim_work); struct journal *j = &c->journal; struct bch_dev *ca; - struct journal_entry_pin *pin; - u64 seq, seq_to_flush = 0; - unsigned iter, bucket_to_flush; - unsigned long next_flush; - bool reclaim_lock_held = false, need_flush; + unsigned iter, bucket_to_flush, min_nr = 0; + u64 seq_to_flush = 0; + + journal_do_discards(j); + + mutex_lock(&j->reclaim_lock); + spin_lock(&j->lock); - /* - * Advance last_idx to point to the oldest journal entry containing - * btree node updates that have not yet been written out - */ for_each_rw_member(ca, c, iter) { struct journal_device *ja = &ca->journal; if (!ja->nr) continue; - while (should_discard_bucket(j, ja)) { - if (!reclaim_lock_held) { - /* - * ugh: - * might be called from __journal_res_get() - * under wait_event() - have to go back to - * TASK_RUNNING before doing something that - * would block, but only if we're doing work: - */ - __set_current_state(TASK_RUNNING); - mutex_lock(&j->reclaim_lock); - reclaim_lock_held = true; - /* recheck under reclaim_lock: */ - continue; - } - - if (ca->mi.discard && - blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, - ja->buckets[ja->last_idx]), - ca->mi.bucket_size, GFP_NOIO, 0); - - spin_lock(&j->lock); - ja->last_idx = (ja->last_idx + 1) % ja->nr; - spin_unlock(&j->lock); - - journal_wake(j); - } - - /* - * Write out enough btree nodes to free up 50% journal - * buckets - */ - spin_lock(&j->lock); + /* Try to keep the journal at most half full: */ bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr; seq_to_flush = max_t(u64, seq_to_flush, ja->bucket_seq[bucket_to_flush]); - spin_unlock(&j->lock); } /* Also flush if the pin fifo is more than half full */ - spin_lock(&j->lock); seq_to_flush = max_t(s64, seq_to_flush, (s64) journal_cur_seq(j) - (j->pin.size >> 1)); + spin_unlock(&j->lock); /* * If it's been longer than j->reclaim_delay_ms since we last flushed, * make sure to flush at least one journal pin: */ - next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms); - need_flush = time_after(jiffies, next_flush); + if (time_after(jiffies, j->last_flushed + + msecs_to_jiffies(j->reclaim_delay_ms))) + min_nr = 1; - while ((pin = journal_get_next_pin(j, need_flush - ? 
U64_MAX - : seq_to_flush, &seq))) { - if (!reclaim_lock_held) { - spin_unlock(&j->lock); - __set_current_state(TASK_RUNNING); - mutex_lock(&j->reclaim_lock); - reclaim_lock_held = true; - spin_lock(&j->lock); - continue; - } + journal_flush_pins(j, seq_to_flush, min_nr); - journal_pin_mark_flushing(j, pin, seq); - spin_unlock(&j->lock); - - journal_pin_flush(j, pin, seq); - - need_flush = false; - j->last_flushed = jiffies; - - spin_lock(&j->lock); - } - - spin_unlock(&j->lock); - - if (reclaim_lock_held) - mutex_unlock(&j->reclaim_lock); + mutex_unlock(&j->reclaim_lock); if (!test_bit(BCH_FS_RO, &c->flags)) queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, @@ -341,8 +443,6 @@ void bch2_journal_reclaim_work(struct work_struct *work) static int journal_flush_done(struct journal *j, u64 seq_to_flush) { - struct journal_entry_pin *pin; - u64 pin_seq; int ret; ret = bch2_journal_error(j); @@ -350,16 +450,10 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush) return ret; mutex_lock(&j->reclaim_lock); + + journal_flush_pins(j, seq_to_flush, 0); + spin_lock(&j->lock); - - while ((pin = journal_get_next_pin(j, seq_to_flush, &pin_seq))) { - journal_pin_mark_flushing(j, pin, pin_seq); - spin_unlock(&j->lock); - - journal_pin_flush(j, pin, pin_seq); - - spin_lock(&j->lock); - } /* * If journal replay hasn't completed, the unreplayed journal entries * hold refs on their corresponding sequence numbers diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h index 287590cd..1d688d6f 100644 --- a/libbcachefs/journal_reclaim.h +++ b/libbcachefs/journal_reclaim.h @@ -3,6 +3,10 @@ #define JOURNAL_PIN (32 * 1024) +unsigned bch2_journal_dev_buckets_available(struct journal *, + struct journal_device *); +void bch2_journal_space_available(struct journal *); + static inline bool journal_pin_active(struct journal_entry_pin *pin) { return pin->seq != 0; @@ -16,6 +20,8 @@ journal_seq_pin(struct journal *j, u64 seq) return &j->pin.data[seq & j->pin.mask]; } +void bch2_journal_pin_put(struct journal *, u64); + void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, journal_pin_flush_fn); void bch2_journal_pin_update(struct journal *, u64, struct journal_entry_pin *, @@ -27,7 +33,6 @@ void bch2_journal_pin_add_if_older(struct journal *, journal_pin_flush_fn); void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); -void bch2_journal_reclaim_fast(struct journal *); void bch2_journal_reclaim_work(struct work_struct *); void bch2_journal_flush_pins(struct journal *, u64); diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index a91662f6..8772e53f 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -21,8 +21,10 @@ struct journal_buf { struct closure_waitlist wait; - unsigned size; - unsigned disk_sectors; + unsigned buf_size; /* size in bytes of @data */ + unsigned sectors; /* maximum size for current entry */ + unsigned disk_sectors; /* maximum size entry could have been, if + buf_size was bigger */ unsigned u64s_reserved; /* bloom filter: */ unsigned long has_inode[1024 / sizeof(unsigned long)]; @@ -128,9 +130,20 @@ struct journal { unsigned long flags; union journal_res_state reservations; + + /* Max size of current journal entry */ unsigned cur_entry_u64s; - unsigned prev_buf_sectors; - unsigned cur_buf_sectors; + unsigned cur_entry_sectors; + + /* + * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if + * insufficient devices: + */ + int cur_entry_error; + + /* Reserved space in 
journal entry to be used just prior to write */ + unsigned entry_u64s_reserved; + unsigned buf_size_want; /* @@ -141,6 +154,9 @@ struct journal { spinlock_t lock; + /* if nonzero, we may not open a new journal entry: */ + unsigned blocked; + /* Used when waiting because the journal was full */ wait_queue_head_t wait; struct closure_waitlist async_wait; @@ -155,9 +171,6 @@ struct journal { u64 seq_ondisk; u64 last_seq_ondisk; - /* Reserved space in journal entry to be used just prior to write */ - unsigned entry_u64s_reserved; - /* * FIFO of journal entries whose btree updates have not yet been * written out. diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 7e50547c..77ab464a 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -82,7 +82,7 @@ static int journal_replay_entry_early(struct bch_fs *c, le64_to_cpu(u->v)); break; case FS_USAGE_INODES: - percpu_u64_set(&c->usage[0]->s.nr_inodes, + percpu_u64_set(&c->usage[0]->nr_inodes, le64_to_cpu(u->v)); break; case FS_USAGE_KEY_VERSION: @@ -406,22 +406,19 @@ int bch2_fs_initialize(struct bch_fs *c) mutex_unlock(&c->sb_lock); set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); for (i = 0; i < BTREE_ID_NR; i++) bch2_btree_root_alloc(c, i); - ret = bch2_gc(c, &journal, true); - if (ret) - goto err; - - set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); - err = "unable to allocate journal buckets"; - for_each_online_member(ca, c, i) - if (bch2_dev_journal_alloc(ca)) { + for_each_online_member(ca, c, i) { + ret = bch2_dev_journal_alloc(ca); + if (ret) { percpu_ref_put(&ca->io_ref); goto err; } + } /* * journal_res_get() will crash if called before this has diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index 4d0c9718..99283b10 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -244,14 +244,14 @@ static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p, *dst = *src; for (src_idx = 0; src_idx < src_r->nr; src_idx++) { - if (!src->data[src_idx]) + if (!src->replicas[src_idx]) continue; dst_idx = __replicas_entry_idx(dst_r, cpu_replicas_entry(src_r, src_idx)); BUG_ON(dst_idx < 0); - dst->data[dst_idx] = src->data[src_idx]; + dst->replicas[dst_idx] = src->replicas[src_idx]; } } @@ -261,39 +261,37 @@ static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p, static int replicas_table_update(struct bch_fs *c, struct bch_replicas_cpu *new_r) { - struct bch_fs_usage __percpu *new_usage[3] = { NULL, NULL, NULL }; + struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL }; + struct bch_fs_usage __percpu *new_scratch = NULL; unsigned bytes = sizeof(struct bch_fs_usage) + sizeof(u64) * new_r->nr; - unsigned i; int ret = -ENOMEM; - for (i = 0; i < 3; i++) { - if (i < 2 && !c->usage[i]) - continue; + if (!(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64), + GFP_NOIO)) || + (c->usage[1] && + !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64), + GFP_NOIO))) || + !(new_scratch = __alloc_percpu_gfp(bytes, sizeof(u64), + GFP_NOIO))) + goto err; - new_usage[i] = __alloc_percpu_gfp(bytes, sizeof(u64), - GFP_NOIO); - if (!new_usage[i]) - goto err; - } + if (c->usage[0]) + __replicas_table_update(new_usage[0], new_r, + c->usage[0], &c->replicas); + if (c->usage[1]) + __replicas_table_update(new_usage[1], new_r, + c->usage[1], &c->replicas); - for (i = 0; i < 2; i++) { - if (!c->usage[i]) - continue; - - __replicas_table_update(new_usage[i], new_r, - c->usage[i], &c->replicas); - - swap(c->usage[i], new_usage[i]); - } - - 
swap(c->usage_scratch, new_usage[2]); - - swap(c->replicas, *new_r); + swap(c->usage[0], new_usage[0]); + swap(c->usage[1], new_usage[1]); + swap(c->usage_scratch, new_scratch); + swap(c->replicas, *new_r); ret = 0; err: - for (i = 0; i < 3; i++) - free_percpu(new_usage[i]); + free_percpu(new_scratch); + free_percpu(new_usage[1]); + free_percpu(new_usage[0]); return ret; } @@ -456,7 +454,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) if (__replicas_has_entry(&c->replicas_gc, e)) continue; - v = percpu_u64_get(&c->usage[0]->data[i]); + v = percpu_u64_get(&c->usage[0]->replicas[i]); if (!v) continue; @@ -557,7 +555,7 @@ int bch2_replicas_set_usage(struct bch_fs *c, BUG_ON(ret < 0); } - percpu_u64_set(&c->usage[0]->data[idx], sectors); + percpu_u64_set(&c->usage[0]->replicas[idx], sectors); return 0; } @@ -974,5 +972,6 @@ int bch2_fs_replicas_init(struct bch_fs *c) { c->journal.entry_u64s_reserved += reserve_journal_replicas(c, &c->replicas); - return 0; + + return replicas_table_update(c, &c->replicas); } diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index 1f343e64..a1ca837b 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -125,7 +125,7 @@ struct bch_hash_desc { bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); }; -static inline struct btree_iter * +static __always_inline struct btree_iter * bch2_hash_lookup(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, @@ -159,7 +159,7 @@ bch2_hash_lookup(struct btree_trans *trans, return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOENT); } -static inline struct btree_iter * +static __always_inline struct btree_iter * bch2_hash_hole(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, @@ -185,10 +185,11 @@ bch2_hash_hole(struct btree_trans *trans, return IS_ERR(k.k) ? 
ERR_CAST(k.k) : ERR_PTR(-ENOSPC); } -static inline int bch2_hash_needs_whiteout(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct btree_iter *start) +static __always_inline +int bch2_hash_needs_whiteout(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + struct btree_iter *start) { struct btree_iter *iter; struct bkey_s_c k; @@ -211,10 +212,11 @@ static inline int bch2_hash_needs_whiteout(struct btree_trans *trans, return btree_iter_err(k); } -static inline int __bch2_hash_set(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - u64 inode, struct bkey_i *insert, int flags) +static __always_inline +int __bch2_hash_set(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + u64 inode, struct bkey_i *insert, int flags) { struct btree_iter *iter, *slot = NULL; struct bkey_s_c k; @@ -276,10 +278,11 @@ static inline int bch2_hash_set(const struct bch_hash_desc desc, inode, insert, flags)); } -static inline int bch2_hash_delete_at(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct btree_iter *iter) +static __always_inline +int bch2_hash_delete_at(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + struct btree_iter *iter) { struct bkey_i *delete; int ret; @@ -300,10 +303,11 @@ static inline int bch2_hash_delete_at(struct btree_trans *trans, return 0; } -static inline int bch2_hash_delete(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - u64 inode, const void *key) +static __always_inline +int bch2_hash_delete(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + u64 inode, const void *key) { struct btree_iter *iter; diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index b88750ff..71d97c57 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -136,7 +136,7 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) sb->bio = bio; } - new_sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order); + new_sb = (void *) __get_free_pages(GFP_NOFS|__GFP_ZERO, order); if (!new_sb) return -ENOMEM; @@ -923,7 +923,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, percpu_down_read_preempt_disable(&c->mark_lock); { - u64 nr_inodes = percpu_u64_get(&c->usage[0]->s.nr_inodes); + u64 nr_inodes = percpu_u64_get(&c->usage[0]->nr_inodes); struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); @@ -970,7 +970,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); - u64 sectors = percpu_u64_get(&c->usage[0]->data[i]); + u64 sectors = percpu_u64_get(&c->usage[0]->replicas[i]); struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); diff --git a/libbcachefs/super.c b/libbcachefs/super.c index a8eb1615..1528f77e 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -567,7 +567,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) { struct bch_sb_field_members *mi; struct bch_fs *c; - unsigned i, iter_size, fs_usage_size; + unsigned i, iter_size; const char *err; pr_verbose_init(opts, ""); @@ -661,9 +661,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts 
opts) (btree_blocks(c) + 1) * 2 * sizeof(struct btree_node_iter_set); - fs_usage_size = sizeof(struct bch_fs_usage) + - sizeof(u64) * c->replicas.nr; - if (!(c->wq = alloc_workqueue("bcachefs", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || !(c->copygc_wq = alloc_workqueue("bcache_copygc", @@ -680,8 +677,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) max(offsetof(struct btree_read_bio, bio), offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || - !(c->usage[0] = __alloc_percpu(fs_usage_size, sizeof(u64))) || - !(c->usage_scratch = __alloc_percpu(fs_usage_size, sizeof(u64))) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 7e3aebed..b56db15d 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -243,17 +243,17 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) pr_buf(&out, "capacity:\t\t\t%llu\n", c->capacity); pr_buf(&out, "hidden:\t\t\t\t%llu\n", - fs_usage->s.hidden); + fs_usage->hidden); pr_buf(&out, "data:\t\t\t\t%llu\n", - fs_usage->s.data); + fs_usage->data); pr_buf(&out, "cached:\t\t\t\t%llu\n", - fs_usage->s.cached); + fs_usage->cached); pr_buf(&out, "reserved:\t\t\t%llu\n", - fs_usage->s.reserved); + fs_usage->reserved); pr_buf(&out, "nr_inodes:\t\t\t%llu\n", - fs_usage->s.nr_inodes); + fs_usage->nr_inodes); pr_buf(&out, "online reserved:\t\t%llu\n", - fs_usage->s.online_reserved); + fs_usage->online_reserved); for (i = 0; i < ARRAY_SIZE(fs_usage->persistent_reserved); @@ -269,7 +269,7 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) pr_buf(&out, "\t"); bch2_replicas_entry_to_text(&out, e); - pr_buf(&out, ":\t%llu\n", fs_usage->data[i]); + pr_buf(&out, ":\t%llu\n", fs_usage->replicas[i]); } percpu_up_read_preempt_enable(&c->mark_lock);
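
The bucket_cmpxchg() hunk in libbcachefs/buckets.h now evaluates its bucket argument exactly once, via the new _g local, before entering the compare-and-swap retry loop, so an expression like bucket(ca, b) passed as g is not recomputed on every retry. Below is a minimal userspace sketch of that pattern using C11 atomics and a GNU statement-expression macro in place of the kernel's atomic64_cmpxchg(); the union layout, field names and the demo main() are illustrative only, not the real bucket_mark.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for struct bucket_mark: all counters packed in one u64. */
union mark {
	struct {
		uint32_t gen;
		uint32_t dirty;
	};
	uint64_t v;
};

struct bucket {
	_Atomic uint64_t mark;
};

/*
 * Model of the bucket_cmpxchg() pattern: evaluate the bucket pointer once,
 * then retry until the whole mark is updated atomically. Evaluates to the
 * old mark, like the kernel macro.
 */
#define bucket_cmpxchg(g, new, expr)					\
({									\
	struct bucket *_g = (g);	/* evaluate argument once */	\
	uint64_t _v = atomic_load(&_g->mark);				\
	union mark _old;						\
									\
	do {								\
		(new).v = _old.v = _v;					\
		expr;							\
	} while (!atomic_compare_exchange_weak(&_g->mark, &_v,		\
					       (new).v));		\
	_old;								\
})

int main(void)
{
	struct bucket b = { .mark = 0 };
	union mark m;

	/* Same shape as the cmd_migrate.c call site in the patch: */
	union mark old = bucket_cmpxchg(&b, m, m.dirty = 1);
	union mark cur = { .v = atomic_load(&b.mark) };

	printf("old.dirty=%u cur.dirty=%u\n",
	       (unsigned) old.dirty, (unsigned) cur.dirty);
	return 0;
}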
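
Several hunks above (buckets.h, buckets_types.h, replicas.c, sysfs.c) flatten the old summarized sub-struct into bch_fs_usage and rename its flexible data[] tail to replicas[], with the new fs_usage_u64s() helper giving the struct's total size in units of u64. A self-contained sketch of that layout and of how a scratch buffer sized this way is cleared; the field set is trimmed and the allocation is plain calloc() rather than the kernel's percpu allocation:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef uint64_t u64;

struct fs_usage {
	u64 online_reserved;
	u64 hidden;
	u64 data;
	u64 cached;
	u64 reserved;
	u64 nr_inodes;
	u64 replicas[];		/* one counter per replicas table entry */
};

/* Total size of the struct, in u64s (mirrors fs_usage_u64s()). */
static unsigned fs_usage_u64s(unsigned nr_replicas)
{
	return sizeof(struct fs_usage) / sizeof(u64) + nr_replicas;
}

static struct fs_usage *fs_usage_alloc(unsigned nr_replicas)
{
	return calloc(fs_usage_u64s(nr_replicas), sizeof(u64));
}

/* Clearing a scratch buffer the way bch2_fs_usage_get_scratch() now does: */
static void fs_usage_clear(struct fs_usage *u, unsigned nr_replicas)
{
	memset(u, 0, fs_usage_u64s(nr_replicas) * sizeof(u64));
}

int main(void)
{
	unsigned nr = 4;
	struct fs_usage *u = fs_usage_alloc(nr);

	if (!u)
		return 1;

	u->data = 128;
	u->replicas[2] = 64;
	printf("u64s = %u, data = %llu\n", fs_usage_u64s(nr),
	       (unsigned long long) u->data);

	fs_usage_clear(u, nr);
	free(u);
	return 0;
}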
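
The journal.c rework replaces journal_buf_switch() and its four-way enum with __journal_entry_close(), which returns a plain bool and does all of its state transitions with one cmpxchg on the packed reservation word. A toy model of that state machine: the current offset, the buffer index and the previous-buffer-in-flight bit all live in one 64-bit word, with CLOSED and ERROR encoded as sentinel offsets. The field widths and sentinel values follow the idea rather than the exact kernel layout (u64 bitfields are a gcc/clang extension, as in the kernel), and the locking around the non-atomic parts is omitted:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ENTRY_OFFSET_MAX	((1U << 20) - 1)
#define ENTRY_CLOSED_VAL	(ENTRY_OFFSET_MAX - 1)
#define ENTRY_ERROR_VAL		ENTRY_OFFSET_MAX

union res_state {
	struct {
		uint64_t cur_entry_offset:20;	/* u64s consumed so far */
		uint64_t idx:1;			/* which buffer is current */
		uint64_t prev_buf_unwritten:1;	/* previous buf in flight */
		uint64_t unused:42;
	};
	uint64_t v;
};

struct journal {
	_Atomic uint64_t reservations;
};

static bool entry_is_open(union res_state s)
{
	return s.cur_entry_offset < ENTRY_CLOSED_VAL;
}

/*
 * Close the current entry: mark it CLOSED, flip to the other buffer and
 * note that the previous buffer now needs writing. Returns false only when
 * the previous buffer is still unwritten, mirroring __journal_entry_close().
 */
static bool journal_entry_close(struct journal *j)
{
	uint64_t v = atomic_load(&j->reservations);
	union res_state old, new;

	do {
		old.v = new.v = v;

		if (!entry_is_open(old))
			return true;	/* already closed, or error */
		if (old.prev_buf_unwritten)
			return false;	/* can't have two writes in flight */

		new.cur_entry_offset = ENTRY_CLOSED_VAL;
		new.idx ^= 1;
		new.prev_buf_unwritten = 1;
	} while (!atomic_compare_exchange_weak(&j->reservations, &v, new.v));

	return true;
}

int main(void)
{
	struct journal j = { .reservations = 0 };	/* open at offset 0 */

	printf("closed: %d\n", journal_entry_close(&j));
	printf("closed: %d\n", journal_entry_close(&j));/* already closed */
	return 0;
}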
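
bch2_journal_buf_put(), now backed by __bch2_journal_buf_put(), still relies on both buffers' reference counts being packed into that same reservation word, so dropping a reservation is a single atomic subtraction of a value with a 1 in the right bitfield. A stripped-down, single-threaded model of journal_state_count() and the put path; widths and names are illustrative:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

union res_state {
	struct {
		uint64_t cur_entry_offset:20;
		uint64_t idx:1;
		uint64_t prev_buf_unwritten:1;
		uint64_t buf0_count:21;
		uint64_t buf1_count:21;
	};
	uint64_t v;
};

static unsigned state_count(union res_state s, unsigned idx)
{
	return idx ? s.buf1_count : s.buf0_count;
}

/* Drop one reference on buffer @idx; report whether it was the last one. */
static bool buf_put(_Atomic uint64_t *state, unsigned idx)
{
	union res_state sub = {
		.buf0_count = idx == 0,
		.buf1_count = idx == 1,
	};
	union res_state s;

	/* old value minus what we subtracted == new value: */
	s.v = atomic_fetch_sub(state, sub.v) - sub.v;
	return !state_count(s, idx);
}

int main(void)
{
	union res_state init = { .buf0_count = 2 };
	_Atomic uint64_t state = init.v;

	printf("last ref? %d\n", buf_put(&state, 0));	/* 0: one ref left */
	printf("last ref? %d\n", buf_put(&state, 0));	/* 1: hit zero */
	return 0;
}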
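
journal_entry_open() now computes the usable payload as an int: sectors are converted to u64s, then the jset header plus any reserved u64s (the new journal_entry_overhead() helper) are subtracted, and the result is clamped (the real code also clamps the upper bound to JOURNAL_ENTRY_CLOSED_VAL - 1). A worked example of just that arithmetic; JSET_HEADER_U64S is a made-up stand-in for sizeof(struct jset) / sizeof(u64):

#include <stdint.h>
#include <stdio.h>

#define JSET_HEADER_U64S	4	/* illustrative, not the real value */

static int journal_entry_u64s(unsigned sectors, unsigned u64s_reserved)
{
	int u64s = (int) ((sectors << 9) / sizeof(uint64_t))
		 - (JSET_HEADER_U64S + u64s_reserved);

	return u64s > 0 ? u64s : 0;	/* lower clamp, as in the patch */
}

int main(void)
{
	/* 8 sectors = 4096 bytes = 512 u64s, minus 4 header + 16 reserved: */
	printf("8 sectors, 16 reserved -> %d u64s\n",
	       journal_entry_u64s(8, 16));
	return 0;
}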
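
journal_buf_realloc() is unchanged apart from the size -> buf_size rename: grow the buffer only if a larger size has been requested, copy the old contents across, and silently keep the old buffer if the allocation fails. The same logic with plain malloc()/free() standing in for kvpmalloc()/kvpfree():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct journal_buf {
	void	 *data;
	unsigned  buf_size;
};

static void journal_buf_realloc(struct journal_buf *buf, unsigned new_size)
{
	void *new_buf;

	if (buf->buf_size >= new_size)
		return;

	new_buf = malloc(new_size);
	if (!new_buf)
		return;		/* keep the old, smaller buffer */

	memcpy(new_buf, buf->data, buf->buf_size);
	free(buf->data);
	buf->data     = new_buf;
	buf->buf_size = new_size;
}

int main(void)
{
	struct journal_buf buf = { .data = calloc(1, 512), .buf_size = 512 };

	journal_buf_realloc(&buf, 4096);
	printf("buf_size = %u\n", buf.buf_size);
	free(buf.data);
	return 0;
}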
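
bch2_journal_dev_buckets_available(), moved from journal_io.c into journal_reclaim.c, treats a device's journal buckets as a ring: cur_idx is being written, last_idx is the oldest still-dirty bucket, and the free count is the modular distance between them. The core arithmetic, without the allocator-startup and last_seq reserves the real function also subtracts:

#include <stdio.h>

static unsigned journal_dev_buckets_available(unsigned nr,
					      unsigned cur_idx,
					      unsigned last_idx)
{
	unsigned next = (cur_idx + 1) % nr;	/* bucket we'd write next */

	return (last_idx + nr - next) % nr;
}

int main(void)
{
	/* 8 buckets, writing bucket 5, bucket 2 is the oldest dirty one: */
	printf("%u buckets free\n", journal_dev_buckets_available(8, 5, 2));
	return 0;
}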
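
Journal pin reclaim (bch2_journal_pin_put() plus the now-static bch2_journal_reclaim_fast()) only ever frees entries from the front of the pin FIFO, and only once their refcount has dropped to zero. A small single-threaded model of that behaviour; the real code additionally holds j->lock and recomputes the available journal space after popping:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define PIN_FIFO_SIZE	8		/* must be a power of two */

struct pin_fifo {
	atomic_uint	counts[PIN_FIFO_SIZE];
	unsigned	front, back;	/* back - front = entries in use */
};

static bool fifo_empty(struct pin_fifo *f)
{
	return f->front == f->back;
}

/* Drop a reference on journal sequence number @seq: */
static void pin_put(struct pin_fifo *f, unsigned seq)
{
	atomic_fetch_sub(&f->counts[seq & (PIN_FIFO_SIZE - 1)], 1);
}

/* Advance the front past every entry whose refcount reached zero: */
static unsigned reclaim_fast(struct pin_fifo *f)
{
	unsigned popped = 0;

	while (!fifo_empty(f) &&
	       !atomic_load(&f->counts[f->front & (PIN_FIFO_SIZE - 1)])) {
		f->front++;
		popped++;
	}
	return popped;
}

int main(void)
{
	struct pin_fifo f = { .front = 0, .back = 3 };

	/* three dirty journal entries, seq 0..2, one reference each: */
	for (unsigned seq = 0; seq < 3; seq++)
		atomic_store(&f.counts[seq], 1);

	pin_put(&f, 1);
	printf("reclaimed %u\n", reclaim_fast(&f));	/* 0: front pinned */

	pin_put(&f, 0);
	printf("reclaimed %u\n", reclaim_fast(&f));	/* 2: seq 0 and 1 */
	return 0;
}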