From 018de5aa899937a9dc3bc8cb9819cb218a59abf3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 4 May 2018 14:04:31 -0400 Subject: [PATCH] Update bcachefs sources to ed4aea2ad4 bcachefs: fix gcc warning --- .bcachefs_revision | 2 +- libbcachefs/alloc.c | 1 + libbcachefs/bcachefs.h | 35 +- libbcachefs/bcachefs_format.h | 53 +- libbcachefs/btree_io.c | 120 +- libbcachefs/btree_io.h | 2 +- libbcachefs/btree_iter.c | 4 +- libbcachefs/btree_update_interior.c | 1 + libbcachefs/btree_update_leaf.c | 13 +- libbcachefs/buckets.h | 3 +- libbcachefs/debug.c | 12 +- libbcachefs/error.c | 10 +- libbcachefs/error.h | 6 +- libbcachefs/extents.c | 89 +- libbcachefs/extents.h | 12 +- libbcachefs/extents_types.h | 1 - libbcachefs/fs-io.c | 271 ++- libbcachefs/fs.c | 1 + libbcachefs/fs.h | 2 + libbcachefs/io.c | 924 ++++++--- libbcachefs/io.h | 33 +- libbcachefs/io_types.h | 21 +- libbcachefs/journal.c | 2974 +++++---------------------- libbcachefs/journal.h | 128 +- libbcachefs/journal_io.c | 1423 +++++++++++++ libbcachefs/journal_io.h | 45 + libbcachefs/journal_reclaim.c | 411 ++++ libbcachefs/journal_reclaim.h | 36 + libbcachefs/journal_seq_blacklist.c | 358 ++++ libbcachefs/journal_seq_blacklist.h | 13 + libbcachefs/journal_types.h | 8 +- libbcachefs/move.c | 31 +- libbcachefs/super.c | 49 +- libbcachefs/super.h | 21 +- libbcachefs/sysfs.c | 100 +- libbcachefs/util.c | 201 +- libbcachefs/util.h | 101 +- 37 files changed, 4216 insertions(+), 3299 deletions(-) create mode 100644 libbcachefs/journal_io.c create mode 100644 libbcachefs/journal_io.h create mode 100644 libbcachefs/journal_reclaim.c create mode 100644 libbcachefs/journal_reclaim.h create mode 100644 libbcachefs/journal_seq_blacklist.c create mode 100644 libbcachefs/journal_seq_blacklist.h diff --git a/.bcachefs_revision b/.bcachefs_revision index a7c36b9e..37d51b2f 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -edf5f38218f699e53913a549465f35d36c4418f7 +ed4aea2ad4fa1b3891684cbd071d1a1ae9094342 diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c index 16bdc48c..256adb51 100644 --- a/libbcachefs/alloc.c +++ b/libbcachefs/alloc.c @@ -69,6 +69,7 @@ #include "extents.h" #include "io.h" #include "journal.h" +#include "journal_io.h" #include "super-io.h" #include diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index bc10324f..206c30f4 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -271,17 +271,19 @@ do { \ #define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() #endif -/* name, frequency_units, duration_units */ -#define BCH_TIME_STATS() \ - BCH_TIME_STAT(btree_node_mem_alloc, sec, us) \ - BCH_TIME_STAT(btree_gc, sec, ms) \ - BCH_TIME_STAT(btree_split, sec, us) \ - BCH_TIME_STAT(btree_sort, ms, us) \ - BCH_TIME_STAT(btree_read, ms, us) \ - BCH_TIME_STAT(journal_write, us, us) \ - BCH_TIME_STAT(journal_delay, ms, us) \ - BCH_TIME_STAT(journal_blocked, sec, ms) \ - BCH_TIME_STAT(journal_flush_seq, us, us) +#define BCH_TIME_STATS() \ + BCH_TIME_STAT(btree_node_mem_alloc) \ + BCH_TIME_STAT(btree_gc) \ + BCH_TIME_STAT(btree_split) \ + BCH_TIME_STAT(btree_sort) \ + BCH_TIME_STAT(btree_read) \ + BCH_TIME_STAT(data_write) \ + BCH_TIME_STAT(data_read) \ + BCH_TIME_STAT(data_promote) \ + BCH_TIME_STAT(journal_write) \ + BCH_TIME_STAT(journal_delay) \ + BCH_TIME_STAT(journal_blocked) \ + BCH_TIME_STAT(journal_flush_seq) #include "alloc_types.h" #include "buckets_types.h" @@ -416,7 +418,12 @@ struct bch_dev { struct work_struct io_error_work; /* The rest of this all shows up in sysfs */ - atomic_t 
latency[2]; + atomic64_t cur_latency[2]; + struct time_stats io_latency[2]; + +#define CONGESTED_MAX 1024 + atomic_t congested; + u64 congested_last; struct io_count __percpu *io_done; }; @@ -644,6 +651,7 @@ struct bch_fs { struct bio_set bio_write; struct mutex bio_bounce_pages_lock; mempool_t bio_bounce_pages; + struct rhashtable promote_table; mempool_t compression_bounce[2]; mempool_t compress_workspace[BCH_COMPRESSION_NR]; @@ -708,12 +716,13 @@ struct bch_fs { unsigned copy_gc_enabled:1; unsigned rebalance_enabled:1; unsigned rebalance_percent; + bool promote_whole_extents; #define BCH_DEBUG_PARAM(name, description) bool name; BCH_DEBUG_PARAMS_ALL() #undef BCH_DEBUG_PARAM -#define BCH_TIME_STAT(name, frequency_units, duration_units) \ +#define BCH_TIME_STAT(name) \ struct time_stats name##_time; BCH_TIME_STATS() #undef BCH_TIME_STAT diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index eed6fb85..48d14a30 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -1088,13 +1088,14 @@ LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS, LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24); LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28); -LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, - struct bch_sb, flags[1], 28, 32); LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40); LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52); LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); +LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, + struct bch_sb, flags[2], 0, 4); + /* Features: */ enum bch_sb_features { BCH_FEATURE_LZ4 = 0, @@ -1193,29 +1194,41 @@ struct jset_entry { }; }; +#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) + +#define BCH_JSET_ENTRY_TYPES() \ + x(btree_keys, 0) \ + x(btree_root, 1) \ + x(prio_ptrs, 2) \ + x(blacklist, 3) \ + x(blacklist_v2, 4) + +enum { +#define x(f, nr) BCH_JSET_ENTRY_##f = nr, + BCH_JSET_ENTRY_TYPES() +#undef x + BCH_JSET_ENTRY_NR +}; + +/* + * Journal sequence numbers can be blacklisted: bsets record the max sequence + * number of all the journal entries they contain updates for, so that on + * recovery we can ignore those bsets that contain index updates newer that what + * made it into the journal. + * + * This means that we can't reuse that journal_seq - we have to skip it, and + * then record that we skipped it so that the next time we crash and recover we + * don't think there was a missing journal entry. + */ struct jset_entry_blacklist { struct jset_entry entry; __le64 seq; }; -#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) - -enum { - JOURNAL_ENTRY_BTREE_KEYS = 0, - JOURNAL_ENTRY_BTREE_ROOT = 1, - JOURNAL_ENTRY_PRIO_PTRS = 2, /* Obsolete */ - - /* - * Journal sequence numbers can be blacklisted: bsets record the max - * sequence number of all the journal entries they contain updates for, - * so that on recovery we can ignore those bsets that contain index - * updates newer that what made it into the journal. - * - * This means that we can't reuse that journal_seq - we have to skip it, - * and then record that we skipped it so that the next time we crash and - * recover we don't think there was a missing journal entry. 
- */ - JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED = 3, +struct jset_entry_blacklist_v2 { + struct jset_entry entry; + __le64 start; + __le64 end; }; /* diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 0525c3b8..1aa94229 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -13,7 +13,8 @@ #include "error.h" #include "extents.h" #include "io.h" -#include "journal.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" #include "super-io.h" #include @@ -947,6 +948,7 @@ enum btree_validate_ret { #define btree_err(type, c, b, i, msg, ...) \ ({ \ + __label__ out; \ char _buf[300], *out = _buf, *end = out + sizeof(_buf); \ \ out += btree_err_msg(c, b, i, b->written, write, out, end - out);\ @@ -956,7 +958,11 @@ enum btree_validate_ret { write == READ && \ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ mustfix_fsck_err(c, "%s", _buf); \ - } else { \ + goto out; \ + } \ + \ + switch (write) { \ + case READ: \ bch_err(c, "%s", _buf); \ \ switch (type) { \ @@ -976,7 +982,17 @@ enum btree_validate_ret { ret = BCH_FSCK_ERRORS_NOT_FIXED; \ goto fsck_err; \ } \ + break; \ + case WRITE: \ + bch_err(c, "corrupt metadata before write: %s", _buf); \ + \ + if (bch2_fs_inconsistent(c)) { \ + ret = BCH_FSCK_ERRORS_NOT_FIXED; \ + goto fsck_err; \ + } \ + break; \ } \ +out: \ true; \ }) @@ -1323,37 +1339,48 @@ static void btree_node_read_work(struct work_struct *work) struct btree_read_bio *rb = container_of(work, struct btree_read_bio, work); struct bch_fs *c = rb->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); struct btree *b = rb->bio.bi_private; struct bio *bio = &rb->bio; struct bch_devs_mask avoid; + bool can_retry; memset(&avoid, 0, sizeof(avoid)); goto start; - do { + while (1) { bch_info(c, "retrying read"); + ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + rb->have_ioref = bch2_dev_get_ioref(ca, READ); bio_reset(bio); - bio_set_dev(bio, rb->pick.ca->disk_sb.bdev); bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; bio->bi_iter.bi_sector = rb->pick.ptr.offset; bio->bi_iter.bi_size = btree_bytes(c); - submit_bio_wait(bio); -start: - bch2_dev_io_err_on(bio->bi_status, rb->pick.ca, "btree read"); - percpu_ref_put(&rb->pick.ca->io_ref); - __set_bit(rb->pick.ca->dev_idx, avoid.d); - rb->pick = bch2_btree_pick_ptr(c, b, &avoid); + if (rb->have_ioref) { + bio_set_dev(bio, ca->disk_sb.bdev); + submit_bio_wait(bio); + } else { + bio->bi_status = BLK_STS_REMOVED; + } +start: + bch2_dev_io_err_on(bio->bi_status, ca, "btree read"); + if (rb->have_ioref) + percpu_ref_put(&ca->io_ref); + rb->have_ioref = false; + + __set_bit(rb->pick.ptr.dev, avoid.d); + can_retry = bch2_btree_pick_ptr(c, b, &avoid, &rb->pick) > 0; if (!bio->bi_status && - !bch2_btree_node_read_done(c, b, !IS_ERR_OR_NULL(rb->pick.ca))) - goto out; - } while (!IS_ERR_OR_NULL(rb->pick.ca)); + !bch2_btree_node_read_done(c, b, can_retry)) + break; - set_btree_node_read_error(b); -out: - if (!IS_ERR_OR_NULL(rb->pick.ca)) - percpu_ref_put(&rb->pick.ca->io_ref); + if (!can_retry) { + set_btree_node_read_error(b); + break; + } + } bch2_time_stats_update(&c->btree_read_time, rb->start_time); bio_put(&rb->bio); @@ -1365,10 +1392,13 @@ static void btree_node_read_endio(struct bio *bio) { struct btree_read_bio *rb = container_of(bio, struct btree_read_bio, bio); + struct bch_fs *c = rb->c; - bch2_latency_acct(rb->pick.ca, rb->start_time >> 10, READ); + if (rb->have_ioref) { + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + bch2_latency_acct(ca, rb->start_time, READ); + } - 
INIT_WORK(&rb->work, btree_node_read_work); queue_work(system_unbound_wq, &rb->work); } @@ -1377,41 +1407,58 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, { struct extent_pick_ptr pick; struct btree_read_bio *rb; + struct bch_dev *ca; struct bio *bio; + int ret; trace_btree_read(c, b); - pick = bch2_btree_pick_ptr(c, b, NULL); - if (bch2_fs_fatal_err_on(!pick.ca, c, + ret = bch2_btree_pick_ptr(c, b, NULL, &pick); + if (bch2_fs_fatal_err_on(ret <= 0, c, "btree node read error: no device to read from")) { set_btree_node_read_error(b); return; } + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio); rb = container_of(bio, struct btree_read_bio, bio); rb->c = c; rb->start_time = local_clock(); + rb->have_ioref = bch2_dev_get_ioref(ca, READ); rb->pick = pick; - bio_set_dev(bio, pick.ca->disk_sb.bdev); + INIT_WORK(&rb->work, btree_node_read_work); bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; bio->bi_iter.bi_sector = pick.ptr.offset; bio->bi_iter.bi_size = btree_bytes(c); + bio->bi_end_io = btree_node_read_endio; + bio->bi_private = b; bch2_bio_map(bio, b->data); - this_cpu_add(pick.ca->io_done->sectors[READ][BCH_DATA_BTREE], - bio_sectors(bio)); - set_btree_node_read_in_flight(b); - if (sync) { - submit_bio_wait(bio); - bio->bi_private = b; - btree_node_read_work(&rb->work); + if (rb->have_ioref) { + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_BTREE], + bio_sectors(bio)); + bio_set_dev(bio, ca->disk_sb.bdev); + + if (sync) { + submit_bio_wait(bio); + + bio->bi_private = b; + btree_node_read_work(&rb->work); + } else { + submit_bio(bio); + } } else { - bio->bi_end_io = btree_node_read_endio; - bio->bi_private = b; - submit_bio(bio); + bio->bi_status = BLK_STS_REMOVED; + + if (sync) + btree_node_read_work(&rb->work); + else + queue_work(system_unbound_wq, &rb->work); + } } @@ -1593,20 +1640,21 @@ static void btree_node_write_endio(struct bio *bio) struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; struct bch_write_bio *orig = parent ?: wbio; struct bch_fs *c = wbio->c; - struct bch_dev *ca = wbio->ca; + struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); unsigned long flags; - bch2_latency_acct(ca, wbio->submit_time_us, WRITE); + if (wbio->have_ioref) + bch2_latency_acct(ca, wbio->submit_time, WRITE); if (bio->bi_status == BLK_STS_REMOVED || bch2_dev_io_err_on(bio->bi_status, ca, "btree write") || bch2_meta_write_fault("btree")) { spin_lock_irqsave(&c->btree_write_error_lock, flags); - bch2_dev_list_add_dev(&orig->failed, ca->dev_idx); + bch2_dev_list_add_dev(&orig->failed, wbio->dev); spin_unlock_irqrestore(&c->btree_write_error_lock, flags); } - if (wbio->have_io_ref) + if (wbio->have_ioref) percpu_ref_put(&ca->io_ref); if (parent) { diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index 01df817d..947685f9 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -12,8 +12,8 @@ struct btree_iter; struct btree_read_bio { struct bch_fs *c; - unsigned submit_time_us; u64 start_time; + unsigned have_ioref:1; struct extent_pick_ptr pick; struct work_struct work; struct bio bio; diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 465aadba..69cad3bb 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -748,7 +748,9 @@ static void btree_iter_prefetch(struct btree_iter *iter) struct btree_node_iter node_iter = l->iter; struct bkey_packed *k; BKEY_PADDED(k) tmp; - unsigned nr = iter->level > 1 ? 
1 : 8; + unsigned nr = test_bit(BCH_FS_STARTED, &iter->c->flags) + ? (iter->level > 1 ? 0 : 2) + : (iter->level > 1 ? 1 : 16); bool was_locked = btree_node_locked(iter, iter->level); while (nr) { diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 63696920..adba3092 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -12,6 +12,7 @@ #include "buckets.h" #include "extents.h" #include "journal.h" +#include "journal_reclaim.h" #include "keylist.h" #include "replicas.h" #include "super-io.h" diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 53b39de5..92fb5f61 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -8,6 +8,7 @@ #include "debug.h" #include "extents.h" #include "journal.h" +#include "journal_reclaim.h" #include "keylist.h" #include @@ -137,7 +138,7 @@ void bch2_btree_journal_key(struct btree_insert *trans, EBUG_ON(trans->journal_res.ref != !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); - if (likely(trans->journal_res.ref)) { + if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { u64 seq = trans->journal_res.seq; bool needs_whiteout = insert->k.needs_whiteout; @@ -155,12 +156,16 @@ void bch2_btree_journal_key(struct btree_insert *trans, btree_bset_last(b)->journal_seq = cpu_to_le64(seq); } - if (unlikely(!journal_pin_active(&w->journal))) - bch2_journal_pin_add(j, &trans->journal_res, - &w->journal, + if (unlikely(!journal_pin_active(&w->journal))) { + u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) + ? trans->journal_res.seq + : j->replay_journal_seq; + + bch2_journal_pin_add(j, seq, &w->journal, btree_node_write_idx(b) == 0 ? btree_node_flush0 : btree_node_flush1); + } if (unlikely(!btree_node_dirty(b))) set_btree_node_dirty(b); diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 399a853c..01f0b314 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -142,7 +142,8 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca, u64 total = ca->mi.nbuckets - ca->mi.first_bucket; if (WARN_ONCE(stats.buckets_unavailable > total, - "buckets_unavailable overflow\n")) + "buckets_unavailable overflow (%llu > %llu)\n", + stats.buckets_unavailable, total)) return 0; return total - stats.buckets_unavailable; diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index 7190990d..71f649bc 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -36,6 +36,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) struct btree_node *n_ondisk, *n_sorted, *n_inmemory; struct bset *sorted, *inmemory; struct extent_pick_ptr pick; + struct bch_dev *ca; struct bio *bio; if (c->opts.nochanges) @@ -54,12 +55,15 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) v->btree_id = b->btree_id; bch2_btree_keys_init(v, &c->expensive_debug_checks); - pick = bch2_btree_pick_ptr(c, b, NULL); - if (IS_ERR_OR_NULL(pick.ca)) + if (bch2_btree_pick_ptr(c, b, NULL, &pick) <= 0) + return; + + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + if (!bch2_dev_get_ioref(ca, READ)) return; bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio); - bio_set_dev(bio, pick.ca->disk_sb.bdev); + bio_set_dev(bio, ca->disk_sb.bdev); bio->bi_opf = REQ_OP_READ|REQ_META; bio->bi_iter.bi_sector = pick.ptr.offset; bio->bi_iter.bi_size = btree_bytes(c); @@ -68,7 +72,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) submit_bio_wait(bio); bio_put(bio); - percpu_ref_put(&pick.ca->io_ref); + 
percpu_ref_put(&ca->io_ref); memcpy(n_ondisk, n_sorted, btree_bytes(c)); diff --git a/libbcachefs/error.c b/libbcachefs/error.c index ca2a06e2..2a357fc3 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -3,20 +3,22 @@ #include "io.h" #include "super.h" -void bch2_inconsistent_error(struct bch_fs *c) +bool bch2_inconsistent_error(struct bch_fs *c) { set_bit(BCH_FS_ERROR, &c->flags); switch (c->opts.errors) { case BCH_ON_ERROR_CONTINUE: - break; + return false; case BCH_ON_ERROR_RO: if (bch2_fs_emergency_read_only(c)) bch_err(c, "emergency read only"); - break; + return true; case BCH_ON_ERROR_PANIC: panic(bch2_fmt(c, "panic after error")); - break; + return true; + default: + BUG(); } } diff --git a/libbcachefs/error.h b/libbcachefs/error.h index ac3e96d2..ababaee0 100644 --- a/libbcachefs/error.h +++ b/libbcachefs/error.h @@ -45,13 +45,13 @@ do { \ * BCH_ON_ERROR_CONTINUE mode */ -void bch2_inconsistent_error(struct bch_fs *); +bool bch2_inconsistent_error(struct bch_fs *); #define bch2_fs_inconsistent(c, ...) \ -do { \ +({ \ bch_err(c, __VA_ARGS__); \ bch2_inconsistent_error(c); \ -} while (0) +}) #define bch2_fs_inconsistent_on(cond, c, ...) \ ({ \ diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index c5d1e7cb..9efaa1ff 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -588,58 +588,51 @@ out: return out - buf; } -static inline bool dev_latency_better(struct bch_dev *dev1, - struct bch_dev *dev2) +static inline bool dev_latency_better(struct bch_fs *c, + const struct bch_extent_ptr *ptr1, + const struct bch_extent_ptr *ptr2) { - unsigned l1 = atomic_read(&dev1->latency[READ]); - unsigned l2 = atomic_read(&dev2->latency[READ]); + struct bch_dev *dev1 = bch_dev_bkey_exists(c, ptr1->dev); + struct bch_dev *dev2 = bch_dev_bkey_exists(c, ptr2->dev); + u64 l1 = atomic64_read(&dev1->cur_latency[READ]); + u64 l2 = atomic64_read(&dev2->cur_latency[READ]); /* Pick at random, biased in favor of the faster device: */ return bch2_rand_range(l1 + l2) > l1; } -static void extent_pick_read_device(struct bch_fs *c, - struct bkey_s_c_extent e, - struct bch_devs_mask *avoid, - struct extent_pick_ptr *pick) +static int extent_pick_read_device(struct bch_fs *c, + struct bkey_s_c_extent e, + struct bch_devs_mask *avoid, + struct extent_pick_ptr *pick) { const struct bch_extent_ptr *ptr; struct bch_extent_crc_unpacked crc; + struct bch_dev *ca; + int ret = 0; extent_for_each_ptr_crc(e, ptr, crc) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + ca = bch_dev_bkey_exists(c, ptr->dev); if (ptr->cached && ptr_stale(ca, ptr)) continue; - if (ca->mi.state == BCH_MEMBER_STATE_FAILED) + if (avoid && test_bit(ptr->dev, avoid->d)) continue; - if (avoid) { - if (test_bit(ca->dev_idx, avoid->d)) - continue; - - if (pick->ca && - test_bit(pick->ca->dev_idx, avoid->d)) - goto use; - } - - if (pick->ca && !dev_latency_better(ca, pick->ca)) + if (ret && !dev_latency_better(c, ptr, &pick->ptr)) continue; -use: - if (!percpu_ref_tryget(&ca->io_ref)) - continue; - - if (pick->ca) - percpu_ref_put(&pick->ca->io_ref); *pick = (struct extent_pick_ptr) { .ptr = *ptr, .crc = crc, - .ca = ca, }; + + ret = 1; } + + return ret; } /* Btree ptrs */ @@ -759,16 +752,12 @@ void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf, #undef p } -struct extent_pick_ptr -bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b, - struct bch_devs_mask *avoid) +int bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b, + struct bch_devs_mask *avoid, + struct extent_pick_ptr *pick) { - struct 
extent_pick_ptr pick = { .ca = NULL }; - - extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key), - avoid, &pick); - - return pick; + return extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key), + avoid, pick); } /* Extents */ @@ -2057,37 +2046,33 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c, * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to * other devices, it will still pick a pointer from avoid. */ -void bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k, - struct bch_devs_mask *avoid, - struct extent_pick_ptr *ret) +int bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k, + struct bch_devs_mask *avoid, + struct extent_pick_ptr *pick) { - struct bkey_s_c_extent e; + int ret; switch (k.k->type) { case KEY_TYPE_DELETED: case KEY_TYPE_DISCARD: case KEY_TYPE_COOKIE: - ret->ca = NULL; - return; + return 0; case KEY_TYPE_ERROR: - ret->ca = ERR_PTR(-EIO); - return; + return -EIO; case BCH_EXTENT: case BCH_EXTENT_CACHED: - e = bkey_s_c_to_extent(k); - ret->ca = NULL; + ret = extent_pick_read_device(c, bkey_s_c_to_extent(k), + avoid, pick); - extent_pick_read_device(c, bkey_s_c_to_extent(k), avoid, ret); + if (!ret && !bkey_extent_is_cached(k.k)) + ret = -EIO; - if (!ret->ca && !bkey_extent_is_cached(e.k)) - ret->ca = ERR_PTR(-EIO); - return; + return ret; case BCH_RESERVATION: - ret->ca = NULL; - return; + return 0; default: BUG(); diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 8dc15484..338e9e01 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -53,13 +53,13 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct btree *, struct btree_node_iter_large *); -struct extent_pick_ptr -bch2_btree_pick_ptr(struct bch_fs *, const struct btree *, - struct bch_devs_mask *avoid); +int bch2_btree_pick_ptr(struct bch_fs *, const struct btree *, + struct bch_devs_mask *avoid, + struct extent_pick_ptr *); -void bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c, - struct bch_devs_mask *, - struct extent_pick_ptr *); +int bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c, + struct bch_devs_mask *, + struct extent_pick_ptr *); enum btree_insert_ret bch2_insert_fixup_extent(struct btree_insert *, diff --git a/libbcachefs/extents_types.h b/libbcachefs/extents_types.h index 15805cd2..76139f93 100644 --- a/libbcachefs/extents_types.h +++ b/libbcachefs/extents_types.h @@ -21,7 +21,6 @@ struct bch_extent_crc_unpacked { struct extent_pick_ptr { struct bch_extent_ptr ptr; struct bch_extent_crc_unpacked crc; - struct bch_dev *ca; }; #endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index d1473f2a..a2455b42 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -124,13 +125,13 @@ static void bch2_quota_reservation_put(struct bch_fs *c, if (!res->sectors) return; - mutex_lock(&inode->ei_update_lock); + mutex_lock(&inode->ei_quota_lock); BUG_ON(res->sectors > inode->ei_quota_reserved); bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) res->sectors), BCH_QUOTA_PREALLOC); inode->ei_quota_reserved -= res->sectors; - mutex_unlock(&inode->ei_update_lock); + mutex_unlock(&inode->ei_quota_lock); res->sectors = 0; } @@ -143,14 +144,14 @@ static int bch2_quota_reservation_add(struct bch_fs *c, { int ret; - mutex_lock(&inode->ei_update_lock); + mutex_lock(&inode->ei_quota_lock); ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, check_enospc ? 
BCH_QUOTA_PREALLOC : BCH_QUOTA_NOCHECK); if (likely(!ret)) { inode->ei_quota_reserved += sectors; res->sectors += sectors; } - mutex_unlock(&inode->ei_update_lock); + mutex_unlock(&inode->ei_quota_lock); return ret; } @@ -195,9 +196,10 @@ static int __must_check bch2_write_inode_size(struct bch_fs *c, return __bch2_write_inode(c, inode, inode_set_size, &new_size); } -static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, - struct quota_res *quota_res, int sectors) +static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, + struct quota_res *quota_res, int sectors) { + mutex_lock(&inode->ei_quota_lock); #ifdef CONFIG_BCACHEFS_QUOTA if (quota_res && sectors > 0) { BUG_ON(sectors > quota_res->sectors); @@ -210,14 +212,7 @@ static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, } #endif inode->v.i_blocks += sectors; -} - -static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, - struct quota_res *quota_res, int sectors) -{ - mutex_lock(&inode->ei_update_lock); - __i_sectors_acct(c, inode, quota_res, sectors); - mutex_unlock(&inode->ei_update_lock); + mutex_unlock(&inode->ei_quota_lock); } /* i_sectors accounting: */ @@ -265,7 +260,7 @@ static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h) if (h->new_i_size != U64_MAX) i_size_write(&h->inode->v, h->new_i_size); - __i_sectors_acct(c, h->inode, &h->quota_res, h->sectors); + i_sectors_acct(c, h->inode, &h->quota_res, h->sectors); ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h); mutex_unlock(&h->inode->ei_update_lock); @@ -773,6 +768,7 @@ void bch2_invalidatepage(struct page *page, unsigned int offset, int bch2_releasepage(struct page *page, gfp_t gfp_mask) { + /* XXX: this can't take locks that are held while we allocate memory */ EBUG_ON(!PageLocked(page)); EBUG_ON(PageWriteback(page)); @@ -881,10 +877,12 @@ static int readpage_add_page(struct readpages_iter *iter, struct page *page) int ret; prefetchw(&page->flags); - page_state_init_for_read(page); ret = add_to_page_cache_lru(page, iter->mapping, page->index, GFP_NOFS); + if (!ret) + page_state_init_for_read(page); + put_page(page); return ret; } @@ -992,12 +990,13 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, int flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE; + rbio->c = c; + rbio->start_time = local_clock(); + while (1) { - struct extent_pick_ptr pick; BKEY_PADDED(k) tmp; struct bkey_s_c k; unsigned bytes; - bool is_last; bch2_btree_iter_set_pos(iter, POS(inum, bio->bi_iter.bi_sector)); @@ -1016,45 +1015,37 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, bch2_btree_iter_unlock(iter); k = bkey_i_to_s_c(&tmp.k); - bch2_extent_pick_ptr(c, k, NULL, &pick); - if (IS_ERR(pick.ca)) { - bcache_io_error(c, bio, "no device to read from"); - bio_endio(bio); - return; - } + if (readpages_iter) { + bool want_full_extent = false; + + if (bkey_extent_is_data(k.k)) { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + + extent_for_each_ptr_crc(e, ptr, crc) + want_full_extent |= !!crc.csum_type | + !!crc.compression_type; + } - if (readpages_iter) readpage_bio_extend(readpages_iter, bio, k.k->p.offset, - pick.ca && - (pick.crc.csum_type || - pick.crc.compression_type)); + want_full_extent); + } bytes = (min_t(u64, k.k->p.offset, bio_end_sector(bio)) - bio->bi_iter.bi_sector) << 9; - is_last = bytes == bio->bi_iter.bi_size; swap(bio->bi_iter.bi_size, bytes); + 
if (bytes == bio->bi_iter.bi_size) + flags |= BCH_READ_LAST_FRAGMENT; + if (bkey_extent_is_allocation(k.k)) bch2_add_page_sectors(bio, k); - if (pick.ca) { - if (!is_last) { - bio_inc_remaining(&rbio->bio); - flags |= BCH_READ_MUST_CLONE; - trace_read_split(&rbio->bio); - } + bch2_read_extent(c, rbio, k, flags); - bch2_read_extent(c, rbio, bkey_s_c_to_extent(k), - &pick, flags); - } else { - zero_fill_bio(bio); - - if (is_last) - bio_endio(bio); - } - - if (is_last) + if (flags & BCH_READ_LAST_FRAGMENT) return; swap(bio->bi_iter.bi_size, bytes); @@ -1487,6 +1478,194 @@ int bch2_write_end(struct file *file, struct address_space *mapping, return copied; } +#define WRITE_BATCH_PAGES 32 + +static int __bch2_buffered_write(struct bch_inode_info *inode, + struct address_space *mapping, + struct iov_iter *iter, + loff_t pos, unsigned len) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct page *pages[WRITE_BATCH_PAGES]; + unsigned long index = pos >> PAGE_SHIFT; + unsigned offset = pos & (PAGE_SIZE - 1); + unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); + unsigned i, copied = 0, nr_pages_copied = 0; + int ret = 0; + + BUG_ON(!len); + BUG_ON(nr_pages > ARRAY_SIZE(pages)); + + for (i = 0; i < nr_pages; i++) { + pages[i] = grab_cache_page_write_begin(mapping, index + i, 0); + if (!pages[i]) { + nr_pages = i; + ret = -ENOMEM; + goto out; + } + } + + if (offset && !PageUptodate(pages[0])) { + ret = bch2_read_single_page(pages[0], mapping); + if (ret) + goto out; + } + + if ((pos + len) & (PAGE_SIZE - 1) && + !PageUptodate(pages[nr_pages - 1])) { + if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) { + zero_user(pages[nr_pages - 1], 0, PAGE_SIZE); + } else { + ret = bch2_read_single_page(pages[nr_pages - 1], mapping); + if (ret) + goto out; + } + } + + for (i = 0; i < nr_pages; i++) { + ret = bch2_get_page_reservation(c, inode, pages[i], true); + + if (ret && !PageUptodate(pages[i])) { + ret = bch2_read_single_page(pages[i], mapping); + if (ret) + goto out; + + ret = bch2_get_page_reservation(c, inode, pages[i], true); + } + + if (ret) + goto out; + } + + if (mapping_writably_mapped(mapping)) + for (i = 0; i < nr_pages; i++) + flush_dcache_page(pages[i]); + + while (copied < len) { + struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; + unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1); + unsigned pg_bytes = min_t(unsigned, len - copied, + PAGE_SIZE - pg_offset); + unsigned pg_copied = iov_iter_copy_from_user_atomic(page, + iter, pg_offset, pg_bytes); + + if (!pg_copied) + break; + + flush_dcache_page(page); + iov_iter_advance(iter, pg_copied); + copied += pg_copied; + } + + if (!copied) + goto out; + + nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE); + inode->ei_last_dirtied = (unsigned long) current; + + if (pos + copied > inode->v.i_size) + i_size_write(&inode->v, pos + copied); + + if (copied < len && + ((offset + copied) & (PAGE_SIZE - 1))) { + struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; + + if (!PageUptodate(page)) { + zero_user(page, 0, PAGE_SIZE); + copied -= (offset + copied) & (PAGE_SIZE - 1); + } + } +out: + for (i = 0; i < nr_pages_copied; i++) { + if (!PageUptodate(pages[i])) + SetPageUptodate(pages[i]); + if (!PageDirty(pages[i])) + set_page_dirty(pages[i]); + unlock_page(pages[i]); + put_page(pages[i]); + } + + for (i = nr_pages_copied; i < nr_pages; i++) { + if (!PageDirty(pages[i])) + bch2_put_page_reservation(c, inode, pages[i]); + unlock_page(pages[i]); + put_page(pages[i]); + } + + return copied ?: ret; +} + 
+static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct bch_inode_info *inode = file_bch_inode(file); + loff_t pos = iocb->ki_pos; + ssize_t written = 0; + int ret = 0; + + pagecache_add_get(&mapping->add_lock); + + do { + unsigned offset = pos & (PAGE_SIZE - 1); + unsigned bytes = min_t(unsigned long, iov_iter_count(iter), + PAGE_SIZE * WRITE_BATCH_PAGES - offset); +again: + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. + */ + if (unlikely(iov_iter_fault_in_readable(iter, bytes))) { + bytes = min_t(unsigned long, iov_iter_count(iter), + PAGE_SIZE - offset); + + if (unlikely(iov_iter_fault_in_readable(iter, bytes))) { + ret = -EFAULT; + break; + } + } + + if (unlikely(fatal_signal_pending(current))) { + ret = -EINTR; + break; + } + + ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); + if (unlikely(ret < 0)) + break; + + cond_resched(); + + if (unlikely(ret == 0)) { + /* + * If we were unable to copy any data at all, we must + * fall back to a single segment length write. + * + * If we didn't fallback here, we could livelock + * because not all segments in the iov can be copied at + * once without a pagefault. + */ + bytes = min_t(unsigned long, PAGE_SIZE - offset, + iov_iter_single_seg_count(iter)); + goto again; + } + pos += ret; + written += ret; + + balance_dirty_pages_ratelimited(mapping); + } while (iov_iter_count(iter)); + + pagecache_add_put(&mapping->add_lock); + + return written ? written : ret; +} + /* O_DIRECT reads */ static void bch2_dio_read_complete(struct closure *cl) @@ -1822,7 +2001,7 @@ static ssize_t __bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = iocb->ki_flags & IOCB_DIRECT ? 
bch2_direct_write(iocb, from) - : generic_perform_write(file, from, iocb->ki_pos); + : bch2_buffered_write(iocb, from); if (likely(ret > 0)) iocb->ki_pos += ret; diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index c7e842ee..fb30f0d9 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -1028,6 +1028,7 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) inode_init_once(&inode->v); mutex_init(&inode->ei_update_lock); + mutex_init(&inode->ei_quota_lock); inode->ei_journal_seq = 0; return &inode->v; diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index fddfb2d2..fbbc7a3a 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -15,6 +15,8 @@ struct bch_inode_info { u64 ei_journal_seq; u64 ei_quota_reserved; unsigned long ei_last_dirtied; + + struct mutex ei_quota_lock; struct bch_qid ei_qid; struct bch_hash_info ei_str_hash; diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 27e45081..bb656522 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -14,6 +14,7 @@ #include "compress.h" #include "clock.h" #include "debug.h" +#include "disk_groups.h" #include "error.h" #include "extents.h" #include "io.h" @@ -30,14 +31,71 @@ #include -/* Allocate, free from mempool: */ - -void bch2_latency_acct(struct bch_dev *ca, unsigned submit_time_us, int rw) +static bool bch2_target_congested(struct bch_fs *c, u16 target) { + const struct bch_devs_mask *devs; + unsigned d, nr = 0, total = 0; + u64 now = local_clock(), last; + s64 congested; + struct bch_dev *ca; + + if (!target) + return false; + + rcu_read_lock(); + devs = bch2_target_to_mask(c, target); + for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { + ca = rcu_dereference(c->devs[d]); + if (!ca) + continue; + + congested = atomic_read(&ca->congested); + last = READ_ONCE(ca->congested_last); + if (time_after64(now, last)) + congested -= (now - last) >> 12; + + total += max(congested, 0LL); + nr++; + } + rcu_read_unlock(); + + return bch2_rand_range(nr * CONGESTED_MAX) < total; +} + +static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, + u64 now, int rw) +{ + u64 latency_capable = + ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; + /* ideally we'd be taking into account the device's variance here: */ + u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3); + s64 latency_over = io_latency - latency_threshold; + + if (latency_threshold && latency_over > 0) { + /* + * bump up congested by approximately latency_over * 4 / + * latency_threshold - we don't need much accuracy here so don't + * bother with the divide: + */ + if (atomic_read(&ca->congested) < CONGESTED_MAX) + atomic_add(latency_over >> + max_t(int, ilog2(latency_threshold) - 2, 0), + &ca->congested); + + ca->congested_last = now; + } else if (atomic_read(&ca->congested) > 0) { + atomic_dec(&ca->congested); + } +} + +void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) +{ + atomic64_t *latency = &ca->cur_latency[rw]; u64 now = local_clock(); - unsigned io_latency = (now >> 10) - submit_time_us; - atomic_t *latency = &ca->latency[rw]; - unsigned old, new, v = atomic_read(latency); + u64 io_latency = time_after64(now, submit_time) + ? 
now - submit_time + : 0; + u64 old, new, v = atomic64_read(latency); do { old = v; @@ -51,10 +109,16 @@ void bch2_latency_acct(struct bch_dev *ca, unsigned submit_time_us, int rw) now & ~(~0 << 5)) break; - new = ewma_add((u64) old, io_latency, 6); - } while ((v = atomic_cmpxchg(latency, old, new)) != old); + new = ewma_add(old, io_latency, 5); + } while ((v = atomic64_cmpxchg(latency, old, new)) != old); + + bch2_congested_acct(ca, io_latency, now, rw); + + __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); } +/* Allocate, free from mempool: */ + void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) { struct bio_vec *bv; @@ -169,22 +233,21 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, } n->c = c; - n->ca = ca; - n->submit_time_us = local_clock_us(); + n->dev = ptr->dev; + n->have_ioref = bch2_dev_get_ioref(ca, WRITE); + n->submit_time = local_clock(); n->bio.bi_iter.bi_sector = ptr->offset; if (!journal_flushes_device(ca)) n->bio.bi_opf |= REQ_FUA; - if (likely(percpu_ref_tryget(&ca->io_ref))) { + if (likely(n->have_ioref)) { this_cpu_add(ca->io_done->sectors[WRITE][type], bio_sectors(&n->bio)); - n->have_io_ref = true; bio_set_dev(&n->bio, ca->disk_sb.bdev); submit_bio(&n->bio); } else { - n->have_io_ref = false; n->bio.bi_status = BLK_STS_REMOVED; bio_endio(&n->bio); } @@ -196,15 +259,18 @@ static void __bch2_write(struct closure *); static void bch2_write_done(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_fs *c = op->c; if (!op->error && (op->flags & BCH_WRITE_FLUSH)) - op->error = bch2_journal_error(&op->c->journal); + op->error = bch2_journal_error(&c->journal); if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) - bch2_disk_reservation_put(op->c, &op->res); - percpu_ref_put(&op->c->writes); + bch2_disk_reservation_put(c, &op->res); + percpu_ref_put(&c->writes); bch2_keylist_free(&op->insert_keys, op->inline_keys); + bch2_time_stats_update(&c->data_write_time, op->start_time); + closure_return(cl); } @@ -318,15 +384,15 @@ static void bch2_write_endio(struct bio *bio) struct bch_write_bio *wbio = to_wbio(bio); struct bch_write_bio *parent = wbio->split ? 
wbio->parent : NULL; struct bch_fs *c = wbio->c; - struct bch_dev *ca = wbio->ca; - - bch2_latency_acct(ca, wbio->submit_time_us, WRITE); + struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); if (bch2_dev_io_err_on(bio->bi_status, ca, "data write")) - set_bit(ca->dev_idx, op->failed.d); + set_bit(wbio->dev, op->failed.d); - if (wbio->have_io_ref) + if (wbio->have_ioref) { + bch2_latency_acct(ca, wbio->submit_time, WRITE); percpu_ref_put(&ca->io_ref); + } if (wbio->bounce) bch2_bio_free_pages_pool(c, bio); @@ -821,6 +887,8 @@ void bch2_write(struct closure *cl) BUG_ON(!bkey_cmp(op->pos, POS_MAX)); BUG_ON(bio_sectors(&op->wbio.bio) > U16_MAX); + op->start_time = local_clock(); + memset(&op->failed, 0, sizeof(op->failed)); bch2_keylist_init(&op->insert_keys, op->inline_keys); @@ -844,92 +912,27 @@ void bch2_write(struct closure *cl) struct promote_op { struct closure cl; + u64 start_time; + + struct rhash_head hash; + struct bpos pos; + struct migrate_write write; struct bio_vec bi_inline_vecs[0]; /* must be last */ }; -static void promote_done(struct closure *cl) +static const struct rhashtable_params bch_promote_params = { + .head_offset = offsetof(struct promote_op, hash), + .key_offset = offsetof(struct promote_op, pos), + .key_len = sizeof(struct bpos), +}; + +static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, + struct bpos pos, + struct bch_io_opts opts, + unsigned flags) { - struct promote_op *op = - container_of(cl, struct promote_op, cl); - struct bch_fs *c = op->write.op.c; - - percpu_ref_put(&c->writes); - bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio); - kfree(op); -} - -static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) -{ - struct bch_fs *c = rbio->c; - struct closure *cl = &op->cl; - struct bio *bio = &op->write.op.wbio.bio; - - BUG_ON(!rbio->split || !rbio->bounce); - - if (!percpu_ref_tryget(&c->writes)) - return; - - trace_promote(&rbio->bio); - - /* we now own pages: */ - BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); - swap(bio->bi_vcnt, rbio->bio.bi_vcnt); - rbio->promote = NULL; - - bch2_migrate_read_done(&op->write, rbio); - - closure_init(cl, NULL); - closure_call(&op->write.op.cl, bch2_write, c->wq, cl); - closure_return_with_destructor(cl, promote_done); -} - -/* - * XXX: multiple promotes can race with each other, wastefully. Keep a list of - * outstanding promotes? 
- */ -static struct promote_op *promote_alloc(struct bch_read_bio *rbio, - struct bkey_s_c k) -{ - struct bch_fs *c = rbio->c; - struct promote_op *op; - struct bio *bio; - /* data might have to be decompressed in the write path: */ - unsigned pages = DIV_ROUND_UP(rbio->pick.crc.uncompressed_size, - PAGE_SECTORS); - int ret; - - BUG_ON(!rbio->bounce); - BUG_ON(pages < rbio->bio.bi_vcnt); - - op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, - GFP_NOIO); - if (!op) - return NULL; - - bio = &op->write.op.wbio.bio; - bio_init(bio, bio->bi_inline_vecs, pages); - - memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, - sizeof(struct bio_vec) * rbio->bio.bi_vcnt); - - ret = bch2_migrate_write_init(c, &op->write, - writepoint_hashed((unsigned long) current), - rbio->opts, - DATA_PROMOTE, - (struct data_opts) { - .target = rbio->opts.promote_target - }, - k); - BUG_ON(ret); - - return op; -} - -static bool should_promote(struct bch_fs *c, struct bkey_s_c_extent e, - unsigned flags, u16 target) -{ - if (!target) + if (!opts.promote_target) return false; if (!(flags & BCH_READ_MAY_PROMOTE)) @@ -938,15 +941,182 @@ static bool should_promote(struct bch_fs *c, struct bkey_s_c_extent e, if (percpu_ref_is_dying(&c->writes)) return false; - return bch2_extent_has_target(c, e, target) == NULL; + if (!bkey_extent_is_data(k.k)) + return false; + + if (bch2_extent_has_target(c, bkey_s_c_to_extent(k), opts.promote_target)) + return false; + + if (bch2_target_congested(c, opts.promote_target)) + return false; + + if (rhashtable_lookup_fast(&c->promote_table, &pos, + bch_promote_params)) + return false; + + return true; +} + +static void promote_free(struct bch_fs *c, struct promote_op *op) +{ + int ret; + + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); + percpu_ref_put(&c->writes); + kfree(op); +} + +static void promote_done(struct closure *cl) +{ + struct promote_op *op = + container_of(cl, struct promote_op, cl); + struct bch_fs *c = op->write.op.c; + + bch2_time_stats_update(&c->data_promote_time, op->start_time); + + bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio); + promote_free(c, op); +} + +static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) +{ + struct bch_fs *c = rbio->c; + struct closure *cl = &op->cl; + struct bio *bio = &op->write.op.wbio.bio; + + trace_promote(&rbio->bio); + + /* we now own pages: */ + BUG_ON(!rbio->bounce); + BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); + + memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, + sizeof(struct bio_vec) * rbio->bio.bi_vcnt); + swap(bio->bi_vcnt, rbio->bio.bi_vcnt); + + bch2_migrate_read_done(&op->write, rbio); + + closure_init(cl, NULL); + closure_call(&op->write.op.cl, bch2_write, c->wq, cl); + closure_return_with_destructor(cl, promote_done); +} + +noinline +static struct promote_op *__promote_alloc(struct bch_fs *c, + struct bpos pos, + struct extent_pick_ptr *pick, + struct bch_io_opts opts, + unsigned rbio_sectors, + struct bch_read_bio **rbio) +{ + struct promote_op *op = NULL; + struct bio *bio; + unsigned rbio_pages = DIV_ROUND_UP(rbio_sectors, PAGE_SECTORS); + /* data might have to be decompressed in the write path: */ + unsigned wbio_pages = DIV_ROUND_UP(pick->crc.uncompressed_size, + PAGE_SECTORS); + int ret; + + if (!percpu_ref_tryget(&c->writes)) + return NULL; + + op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * wbio_pages, + GFP_NOIO); + if (!op) + goto err; + + op->start_time = local_clock(); + op->pos = pos; + + /* + * promotes require bouncing, but if the 
extent isn't + * checksummed/compressed it might be too big for the mempool: + */ + if (rbio_sectors > c->sb.encoded_extent_max) { + *rbio = kzalloc(sizeof(struct bch_read_bio) + + sizeof(struct bio_vec) * rbio_pages, + GFP_NOIO); + if (!*rbio) + goto err; + + rbio_init(&(*rbio)->bio, opts); + bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, + rbio_pages); + + (*rbio)->bio.bi_iter.bi_size = rbio_sectors << 9; + bch2_bio_map(&(*rbio)->bio, NULL); + + if (bio_alloc_pages(&(*rbio)->bio, GFP_NOIO)) + goto err; + + (*rbio)->bounce = true; + (*rbio)->split = true; + (*rbio)->kmalloc = true; + } + + if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, + bch_promote_params)) + goto err; + + bio = &op->write.op.wbio.bio; + bio_init(bio, bio->bi_inline_vecs, wbio_pages); + + ret = bch2_migrate_write_init(c, &op->write, + writepoint_hashed((unsigned long) current), + opts, + DATA_PROMOTE, + (struct data_opts) { + .target = opts.promote_target + }, + bkey_s_c_null); + BUG_ON(ret); + + return op; +err: + if (*rbio) + bio_free_pages(&(*rbio)->bio); + kfree(*rbio); + *rbio = NULL; + kfree(op); + percpu_ref_put(&c->writes); + return NULL; +} + +static inline struct promote_op *promote_alloc(struct bch_fs *c, + struct bvec_iter iter, + struct bkey_s_c k, + struct extent_pick_ptr *pick, + struct bch_io_opts opts, + unsigned flags, + struct bch_read_bio **rbio, + bool *bounce, + bool *read_full) +{ + bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); + unsigned sectors = promote_full + ? pick->crc.compressed_size + : bvec_iter_sectors(iter); + struct bpos pos = promote_full + ? bkey_start_pos(k.k) + : POS(k.k->p.inode, iter.bi_sector); + struct promote_op *promote; + + if (!should_promote(c, k, pos, opts, flags)) + return NULL; + + promote = __promote_alloc(c, pos, pick, opts, sectors, rbio); + if (!promote) + return NULL; + + *bounce = true; + *read_full = promote_full; + return promote; } /* Read */ -static void bch2_read_nodecode_retry(struct bch_fs *, struct bch_read_bio *, - struct bvec_iter, u64, - struct bch_devs_mask *, unsigned); - #define READ_RETRY_AVOID 1 #define READ_RETRY 2 #define READ_ERR 3 @@ -979,38 +1149,144 @@ static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) { - struct bch_read_bio *parent = rbio->parent; - - BUG_ON(!rbio->split); + BUG_ON(rbio->bounce && !rbio->split); if (rbio->promote) - kfree(rbio->promote); + promote_free(rbio->c, rbio->promote); + rbio->promote = NULL; + if (rbio->bounce) bch2_bio_free_pages_pool(rbio->c, &rbio->bio); - bio_put(&rbio->bio); - return parent; + if (rbio->split) { + struct bch_read_bio *parent = rbio->parent; + + if (rbio->kmalloc) + kfree(rbio); + else + bio_put(&rbio->bio); + + rbio = parent; + } + + return rbio; } static void bch2_rbio_done(struct bch_read_bio *rbio) { - if (rbio->promote) - kfree(rbio->promote); - rbio->promote = NULL; - - if (rbio->split) - rbio = bch2_rbio_free(rbio); + bch2_time_stats_update(&rbio->c->data_read_time, rbio->start_time); bio_endio(&rbio->bio); } +static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, u64 inode, + struct bch_devs_mask *avoid, unsigned flags) +{ + struct btree_iter iter; + BKEY_PADDED(k) tmp; + struct bkey_s_c k; + int ret; + + flags &= ~BCH_READ_LAST_FRAGMENT; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, + rbio->pos, BTREE_ITER_SLOTS); +retry: + rbio->bio.bi_status = 0; + + k = 
bch2_btree_iter_peek_slot(&iter); + if (btree_iter_err(k)) { + bch2_btree_iter_unlock(&iter); + goto err; + } + + bkey_reassemble(&tmp.k, k); + k = bkey_i_to_s_c(&tmp.k); + bch2_btree_iter_unlock(&iter); + + if (!bkey_extent_is_data(k.k) || + !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k), + rbio->pick.ptr, + rbio->pos.offset - + rbio->pick.crc.offset)) { + /* extent we wanted to read no longer exists: */ + rbio->hole = true; + goto out; + } + + ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags); + if (ret == READ_RETRY) + goto retry; + if (ret) + goto err; + goto out; +err: + rbio->bio.bi_status = BLK_STS_IOERR; +out: + bch2_rbio_done(rbio); +} + +static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, u64 inode, + struct bch_devs_mask *avoid, unsigned flags) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; +retry: + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + POS(inode, bvec_iter.bi_sector), + BTREE_ITER_SLOTS, k) { + BKEY_PADDED(k) tmp; + unsigned bytes; + + bkey_reassemble(&tmp.k, k); + k = bkey_i_to_s_c(&tmp.k); + bch2_btree_iter_unlock(&iter); + + bytes = min_t(unsigned, bvec_iter.bi_size, + (k.k->p.offset - bvec_iter.bi_sector) << 9); + swap(bvec_iter.bi_size, bytes); + + ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags); + switch (ret) { + case READ_RETRY: + goto retry; + case READ_ERR: + goto err; + }; + + if (bytes == bvec_iter.bi_size) + goto out; + + swap(bvec_iter.bi_size, bytes); + bio_advance_iter(&rbio->bio, &bvec_iter, bytes); + } + + /* + * If we get here, it better have been because there was an error + * reading a btree node + */ + ret = bch2_btree_iter_unlock(&iter); + BUG_ON(!ret); + __bcache_io_error(c, "btree IO error %i", ret); +err: + rbio->bio.bi_status = BLK_STS_IOERR; +out: + bch2_rbio_done(rbio); +} + static void bch2_rbio_retry(struct work_struct *work) { struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bvec_iter iter = rbio->bvec_iter; - unsigned flags = rbio->flags; - u64 inode = rbio->pos.inode; + struct bch_fs *c = rbio->c; + struct bvec_iter iter = rbio->bvec_iter; + unsigned flags = rbio->flags; + u64 inode = rbio->pos.inode; struct bch_devs_mask avoid; trace_read_retry(&rbio->bio); @@ -1018,26 +1294,19 @@ static void bch2_rbio_retry(struct work_struct *work) memset(&avoid, 0, sizeof(avoid)); if (rbio->retry == READ_RETRY_AVOID) - __set_bit(rbio->pick.ca->dev_idx, avoid.d); + __set_bit(rbio->pick.ptr.dev, avoid.d); - if (rbio->promote) - kfree(rbio->promote); - rbio->promote = NULL; + rbio->bio.bi_status = 0; - if (rbio->split) - rbio = bch2_rbio_free(rbio); - else - rbio->bio.bi_status = 0; + rbio = bch2_rbio_free(rbio); - if (!(flags & BCH_READ_NODECODE)) - flags |= BCH_READ_MUST_CLONE; flags |= BCH_READ_IN_RETRY; flags &= ~BCH_READ_MAY_PROMOTE; if (flags & BCH_READ_NODECODE) - bch2_read_nodecode_retry(c, rbio, iter, inode, &avoid, flags); + bch2_read_retry_nodecode(c, rbio, iter, inode, &avoid, flags); else - __bch2_read(c, rbio, iter, inode, &avoid, flags); + bch2_read_retry(c, rbio, iter, inode, &avoid, flags); } static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, @@ -1049,7 +1318,9 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, return; if (retry == READ_ERR) { - bch2_rbio_parent(rbio)->bio.bi_status = error; + rbio = bch2_rbio_free(rbio); + + rbio->bio.bi_status = error; 
bch2_rbio_done(rbio); } else { bch2_rbio_punt(rbio, bch2_rbio_retry, @@ -1121,12 +1392,13 @@ out: bch2_btree_iter_unlock(&iter); } -static bool should_narrow_crcs(struct bkey_s_c_extent e, +static bool should_narrow_crcs(struct bkey_s_c k, struct extent_pick_ptr *pick, unsigned flags) { return !(flags & BCH_READ_IN_RETRY) && - bch2_can_narrow_extent_crcs(e, pick->crc); + bkey_extent_is_data(k.k) && + bch2_can_narrow_extent_crcs(bkey_s_c_to_extent(k), pick->crc); } /* Inner part that may run in process context */ @@ -1134,8 +1406,10 @@ static void __bch2_read_endio(struct work_struct *work) { struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bio *src = &rbio->bio, *dst = &bch2_rbio_parent(rbio)->bio; + struct bch_fs *c = rbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); + struct bio *src = &rbio->bio; + struct bio *dst = &bch2_rbio_parent(rbio)->bio; struct bvec_iter dst_iter = rbio->bvec_iter; struct bch_extent_crc_unpacked crc = rbio->pick.crc; struct nonce nonce = extent_nonce(rbio->version, crc); @@ -1191,10 +1465,13 @@ static void __bch2_read_endio(struct work_struct *work) */ bch2_encrypt_bio(c, crc.csum_type, nonce, src); promote_start(rbio->promote, rbio); + rbio->promote = NULL; } nodecode: - if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) + if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { + rbio = bch2_rbio_free(rbio); bch2_rbio_done(rbio); + } return; csum_err: /* @@ -1208,7 +1485,7 @@ csum_err: return; } - bch2_dev_io_error(rbio->pick.ca, + bch2_dev_io_error(ca, "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)", rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, @@ -1227,25 +1504,27 @@ static void bch2_read_endio(struct bio *bio) { struct bch_read_bio *rbio = container_of(bio, struct bch_read_bio, bio); - struct bch_fs *c = rbio->c; + struct bch_fs *c = rbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); struct workqueue_struct *wq = NULL; enum rbio_context context = RBIO_CONTEXT_NULL; - bch2_latency_acct(rbio->pick.ca, rbio->submit_time_us, READ); - - percpu_ref_put(&rbio->pick.ca->io_ref); + if (rbio->have_ioref) { + bch2_latency_acct(ca, rbio->submit_time, READ); + percpu_ref_put(&ca->io_ref); + } if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; - if (bch2_dev_io_err_on(bio->bi_status, rbio->pick.ca, "data read")) { + if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) { bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); return; } if (rbio->pick.ptr.cached && (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || - ptr_stale(rbio->pick.ca, &rbio->pick.ptr))) { + ptr_stale(ca, &rbio->pick.ptr))) { atomic_long_inc(&c->read_realloc_races); if (rbio->flags & BCH_READ_RETRY_IF_STALE) @@ -1266,76 +1545,97 @@ static void bch2_read_endio(struct bio *bio) } int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, - struct bvec_iter iter, struct bkey_s_c_extent e, - struct extent_pick_ptr *pick, unsigned flags) + struct bvec_iter iter, struct bkey_s_c k, + struct bch_devs_mask *avoid, unsigned flags) { - struct bch_read_bio *rbio; - bool split = false, bounce = false, read_full = false; - bool promote = false, narrow_crcs = false; - struct bpos pos = bkey_start_pos(e.k); - int ret = 0; + struct extent_pick_ptr pick; + struct bch_read_bio *rbio = NULL; + struct bch_dev *ca; + struct promote_op *promote = NULL; + bool bounce = false, read_full = false, 
narrow_crcs = false; + struct bpos pos = bkey_start_pos(k.k); + int pick_ret; - lg_local_lock(&c->usage_lock); - bucket_io_clock_reset(c, pick->ca, - PTR_BUCKET_NR(pick->ca, &pick->ptr), READ); - lg_local_unlock(&c->usage_lock); + pick_ret = bch2_extent_pick_ptr(c, k, avoid, &pick); - narrow_crcs = should_narrow_crcs(e, pick, flags); + /* hole or reservation - just zero fill: */ + if (!pick_ret) + goto hole; + + if (pick_ret < 0) + goto no_device; + + if (pick_ret > 0) + ca = bch_dev_bkey_exists(c, pick.ptr.dev); if (flags & BCH_READ_NODECODE) { - BUG_ON(iter.bi_size < pick->crc.compressed_size << 9); - iter.bi_size = pick->crc.compressed_size << 9; + /* + * can happen if we retry, and the extent we were going to read + * has been merged in the meantime: + */ + if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) + goto hole; + + iter.bi_sector = pos.offset; + iter.bi_size = pick.crc.compressed_size << 9; goto noclone; } + if (!(flags & BCH_READ_LAST_FRAGMENT) || + bio_flagged(&orig->bio, BIO_CHAIN)) + flags |= BCH_READ_MUST_CLONE; + + narrow_crcs = should_narrow_crcs(k, &pick, flags); + if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) flags |= BCH_READ_MUST_BOUNCE; - EBUG_ON(bkey_start_offset(e.k) > iter.bi_sector || - e.k->p.offset < bvec_iter_end_sector(iter)); + EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector || + k.k->p.offset < bvec_iter_end_sector(iter)); - if (pick->crc.compression_type != BCH_COMPRESSION_NONE || - (pick->crc.csum_type != BCH_CSUM_NONE && - (bvec_iter_sectors(iter) != pick->crc.uncompressed_size || - (bch2_csum_type_is_encryption(pick->crc.csum_type) && + if (pick.crc.compression_type != BCH_COMPRESSION_NONE || + (pick.crc.csum_type != BCH_CSUM_NONE && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + (bch2_csum_type_is_encryption(pick.crc.csum_type) && (flags & BCH_READ_USER_MAPPED)) || (flags & BCH_READ_MUST_BOUNCE)))) { read_full = true; bounce = true; } - promote = should_promote(c, e, flags, orig->opts.promote_target); - /* could also set read_full */ - if (promote) - bounce = true; + promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, + &rbio, &bounce, &read_full); if (!read_full) { - EBUG_ON(pick->crc.compression_type); - EBUG_ON(pick->crc.csum_type && - (bvec_iter_sectors(iter) != pick->crc.uncompressed_size || - bvec_iter_sectors(iter) != pick->crc.live_size || - pick->crc.offset || + EBUG_ON(pick.crc.compression_type); + EBUG_ON(pick.crc.csum_type && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + bvec_iter_sectors(iter) != pick.crc.live_size || + pick.crc.offset || iter.bi_sector != pos.offset)); - pick->ptr.offset += pick->crc.offset + + pick.ptr.offset += pick.crc.offset + (iter.bi_sector - pos.offset); - pick->crc.compressed_size = bvec_iter_sectors(iter); - pick->crc.uncompressed_size = bvec_iter_sectors(iter); - pick->crc.offset = 0; - pick->crc.live_size = bvec_iter_sectors(iter); + pick.crc.compressed_size = bvec_iter_sectors(iter); + pick.crc.uncompressed_size = bvec_iter_sectors(iter); + pick.crc.offset = 0; + pick.crc.live_size = bvec_iter_sectors(iter); pos.offset = iter.bi_sector; } - if (bounce) { - unsigned sectors = pick->crc.compressed_size; + if (rbio) { + /* promote already allocated bounce rbio */ + } else if (bounce) { + unsigned sectors = pick.crc.compressed_size; rbio = rbio_init(bio_alloc_bioset(GFP_NOIO, - DIV_ROUND_UP(sectors, PAGE_SECTORS), - &c->bio_read_split), + DIV_ROUND_UP(sectors, PAGE_SECTORS), + &c->bio_read_split), orig->opts); bch2_bio_alloc_pages_pool(c, &rbio->bio, 
sectors << 9); - split = true; + rbio->bounce = true; + rbio->split = true; } else if (flags & BCH_READ_MUST_CLONE) { /* * Have to clone if there were any splits, due to error @@ -1349,156 +1649,138 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, &c->bio_read_split), orig->opts); rbio->bio.bi_iter = iter; - split = true; + rbio->split = true; } else { noclone: rbio = orig; rbio->bio.bi_iter = iter; - split = false; BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); } - BUG_ON(bio_sectors(&rbio->bio) != pick->crc.compressed_size); + BUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); rbio->c = c; - if (split) + rbio->submit_time = local_clock(); + if (rbio->split) rbio->parent = orig; else rbio->end_io = orig->bio.bi_end_io; rbio->bvec_iter = iter; - rbio->submit_time_us = local_clock_us(); rbio->flags = flags; - rbio->bounce = bounce; - rbio->split = split; + rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); rbio->narrow_crcs = narrow_crcs; + rbio->hole = 0; rbio->retry = 0; rbio->context = 0; - rbio->devs_have = bch2_extent_devs(e); - rbio->pick = *pick; + rbio->devs_have = bch2_bkey_devs(k); + rbio->pick = pick; rbio->pos = pos; - rbio->version = e.k->version; - rbio->promote = promote ? promote_alloc(rbio, e.s_c) : NULL; + rbio->version = k.k->version; + rbio->promote = promote; INIT_WORK(&rbio->work, NULL); - bio_set_dev(&rbio->bio, pick->ca->disk_sb.bdev); rbio->bio.bi_opf = orig->bio.bi_opf; - rbio->bio.bi_iter.bi_sector = pick->ptr.offset; + rbio->bio.bi_iter.bi_sector = pick.ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; - if (bounce) + if (rbio->bounce) trace_read_bounce(&rbio->bio); bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - this_cpu_add(pick->ca->io_done->sectors[READ][BCH_DATA_USER], + + if (!rbio->have_ioref) + goto no_device_postclone; + + lg_local_lock(&c->usage_lock); + bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); + lg_local_unlock(&c->usage_lock); + + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER], bio_sectors(&rbio->bio)); + bio_set_dev(&rbio->bio, ca->disk_sb.bdev); + if (likely(!(flags & BCH_READ_IN_RETRY))) { + if (!(flags & BCH_READ_LAST_FRAGMENT)) { + bio_inc_remaining(&orig->bio); + trace_read_split(&orig->bio); + } + submit_bio(&rbio->bio); + return 0; } else { + int ret; + submit_bio_wait(&rbio->bio); rbio->context = RBIO_CONTEXT_UNBOUND; bch2_read_endio(&rbio->bio); ret = rbio->retry; - if (rbio->split) - rbio = bch2_rbio_free(rbio); - if (!ret) - bch2_rbio_done(rbio); + rbio = bch2_rbio_free(rbio); + + if (ret == READ_RETRY_AVOID) { + __set_bit(pick.ptr.dev, avoid->d); + ret = READ_RETRY; + } + + return ret; } - return ret; -} +no_device_postclone: + if (!rbio->split) + rbio->bio.bi_end_io = rbio->end_io; + bch2_rbio_free(rbio); +no_device: + __bcache_io_error(c, "no device to read from"); -static void bch2_read_nodecode_retry(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, - struct bch_devs_mask *avoid, unsigned flags) -{ - struct extent_pick_ptr pick; - struct btree_iter iter; - BKEY_PADDED(k) tmp; - struct bkey_s_c k; - int ret; + if (likely(!(flags & BCH_READ_IN_RETRY))) { + orig->bio.bi_status = BLK_STS_IOERR; - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, - POS(inode, bvec_iter.bi_sector), - BTREE_ITER_SLOTS); -retry: - k = bch2_btree_iter_peek_slot(&iter); - if (btree_iter_err(k)) { - bch2_btree_iter_unlock(&iter); - goto err; + if (flags & BCH_READ_LAST_FRAGMENT) + bch2_rbio_done(orig); + return 0; + } else { + return 
READ_ERR; } - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); - bch2_btree_iter_unlock(&iter); - - if (!bkey_extent_is_data(k.k) || - !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k), - rbio->pick.ptr, - rbio->pos.offset - - rbio->pick.crc.offset) || - bkey_start_offset(k.k) != bvec_iter.bi_sector) - goto err; - - bch2_extent_pick_ptr(c, k, avoid, &pick); - if (IS_ERR(pick.ca)) { - bcache_io_error(c, &rbio->bio, "no device to read from"); - bio_endio(&rbio->bio); - return; - } - - if (!pick.ca) - goto err; - - if (pick.crc.compressed_size > bvec_iter_sectors(bvec_iter)) { - percpu_ref_put(&pick.ca->io_ref); - goto err; - - } - - ret = __bch2_read_extent(c, rbio, bvec_iter, bkey_s_c_to_extent(k), - &pick, flags); - switch (ret) { - case READ_RETRY_AVOID: - __set_bit(pick.ca->dev_idx, avoid->d); - case READ_RETRY: - goto retry; - case READ_ERR: - bio_endio(&rbio->bio); - return; - }; - - return; -err: +hole: /* - * extent we wanted to read no longer exists, or - * was merged or partially overwritten (and thus - * possibly bigger than the memory that was - * originally allocated) + * won't normally happen in the BCH_READ_NODECODE + * (bch2_move_extent()) path, but if we retry and the extent we wanted + * to read no longer exists we have to signal that: */ - rbio->bio.bi_status = BLK_STS_AGAIN; - bio_endio(&rbio->bio); - return; + if (flags & BCH_READ_NODECODE) + orig->hole = true; + + zero_fill_bio_iter(&orig->bio, iter); + + if (flags & BCH_READ_LAST_FRAGMENT) + bch2_rbio_done(orig); + return 0; } -void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, - struct bch_devs_mask *avoid, unsigned flags) +void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) { struct btree_iter iter; struct bkey_s_c k; + unsigned flags = BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE| + BCH_READ_USER_MAPPED; int ret; - EBUG_ON(flags & BCH_READ_NODECODE); -retry: + BUG_ON(rbio->_state); + BUG_ON(flags & BCH_READ_NODECODE); + BUG_ON(flags & BCH_READ_IN_RETRY); + + rbio->c = c; + rbio->start_time = local_clock(); + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, - POS(inode, bvec_iter.bi_sector), + POS(inode, rbio->bio.bi_iter.bi_sector), BTREE_ITER_SLOTS, k) { BKEY_PADDED(k) tmp; - struct extent_pick_ptr pick; - struct bvec_iter fragment; + unsigned bytes; /* * Unlock the iterator while the btree node's lock is still in @@ -1508,49 +1790,20 @@ retry: k = bkey_i_to_s_c(&tmp.k); bch2_btree_iter_unlock(&iter); - bch2_extent_pick_ptr(c, k, avoid, &pick); - if (IS_ERR(pick.ca)) { - bcache_io_error(c, &rbio->bio, "no device to read from"); - bio_endio(&rbio->bio); - return; - } + bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size, + (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9); + swap(rbio->bio.bi_iter.bi_size, bytes); - fragment = bvec_iter; - fragment.bi_size = (min_t(u64, k.k->p.offset, - bvec_iter_end_sector(bvec_iter)) - - bvec_iter.bi_sector) << 9; + if (rbio->bio.bi_iter.bi_size == bytes) + flags |= BCH_READ_LAST_FRAGMENT; - if (pick.ca) { - if (fragment.bi_size != bvec_iter.bi_size) { - bio_inc_remaining(&rbio->bio); - flags |= BCH_READ_MUST_CLONE; - trace_read_split(&rbio->bio); - } + bch2_read_extent(c, rbio, k, flags); - ret = __bch2_read_extent(c, rbio, fragment, - bkey_s_c_to_extent(k), - &pick, flags); - switch (ret) { - case READ_RETRY_AVOID: - __set_bit(pick.ca->dev_idx, avoid->d); - case READ_RETRY: - goto retry; - case READ_ERR: - rbio->bio.bi_status = BLK_STS_IOERR; - bio_endio(&rbio->bio); - return; - }; - } else { 
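
The new bch2_read() loop above carves the request into one fragment per extent: it clamps the bio's size to the end of the current extent, tags the final piece with BCH_READ_LAST_FRAGMENT, and otherwise advances and keeps going. A rough userspace model of just that splitting logic (toy extent array, no bios or bkeys; all names here are hypothetical):

    /* Sketch: split one read into per-extent fragments, flagging the last. */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct extent { uint64_t start, end; };         /* [start, end) in sectors */

    static void read_fragment(uint64_t sector, uint64_t sectors, bool last)
    {
            printf("fragment at %llu, %llu sectors%s\n",
                   (unsigned long long) sector, (unsigned long long) sectors,
                   last ? " (last)" : "");
    }

    int main(void)
    {
            /* extents covering the keyspace, as BTREE_ITER_SLOTS would return: */
            const struct extent extents[] = { { 0, 8 }, { 8, 24 }, { 24, 64 } };
            uint64_t sector = 4, remaining = 40;    /* request: sectors 4..44 */

            for (unsigned i = 0;
                 i < sizeof(extents) / sizeof(extents[0]) && remaining; i++) {
                    const struct extent *e = &extents[i];
                    uint64_t frag;
                    bool last;

                    if (e->end <= sector)
                            continue;

                    frag = e->end - sector;
                    if (frag > remaining)
                            frag = remaining;

                    last = frag == remaining;       /* BCH_READ_LAST_FRAGMENT analogue */
                    read_fragment(sector, frag, last);

                    sector += frag;
                    remaining -= frag;
            }
            return 0;
    }

Only the last fragment is allowed to complete the parent bio; the earlier ones bump the parent's remaining count (bio_inc_remaining()), so per-fragment completions can arrive in any order.
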
- zero_fill_bio_iter(&rbio->bio, fragment); - - if (fragment.bi_size == bvec_iter.bi_size) - bio_endio(&rbio->bio); - } - - if (fragment.bi_size == bvec_iter.bi_size) + if (flags & BCH_READ_LAST_FRAGMENT) return; - bio_advance_iter(&rbio->bio, &bvec_iter, fragment.bi_size); + swap(rbio->bio.bi_iter.bi_size, bytes); + bio_advance(&rbio->bio, bytes); } /* @@ -1560,5 +1813,34 @@ retry: ret = bch2_btree_iter_unlock(&iter); BUG_ON(!ret); bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); - bio_endio(&rbio->bio); + bch2_rbio_done(rbio); +} + +void bch2_fs_io_exit(struct bch_fs *c) +{ + if (c->promote_table.tbl) + rhashtable_destroy(&c->promote_table); + mempool_exit(&c->bio_bounce_pages); + bioset_exit(&c->bio_write); + bioset_exit(&c->bio_read_split); + bioset_exit(&c->bio_read); +} + +int bch2_fs_io_init(struct bch_fs *c) +{ + if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS) || + bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS) || + bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), + BIOSET_NEED_BVECS) || + mempool_init_page_pool(&c->bio_bounce_pages, + max_t(unsigned, + c->opts.btree_node_size, + c->sb.encoded_extent_max) / + PAGE_SECTORS, 0) || + rhashtable_init(&c->promote_table, &bch_promote_params)) + return -ENOMEM; + + return 0; } diff --git a/libbcachefs/io.h b/libbcachefs/io.h index a0c795ab..8b1411c6 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -16,7 +16,7 @@ void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); void bch2_bio_alloc_more_pages_pool(struct bch_fs *, struct bio *, size_t); -void bch2_latency_acct(struct bch_dev *, unsigned, int); +void bch2_latency_acct(struct bch_dev *, u64, int); void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, enum bch_data_type, const struct bkey_i *); @@ -99,40 +99,28 @@ struct cache_promote_op; struct extent_pick_ptr; int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - struct bkey_s_c_extent e, struct extent_pick_ptr *, - unsigned); -void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - u64, struct bch_devs_mask *, unsigned); + struct bkey_s_c, struct bch_devs_mask *, unsigned); +void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); enum bch_read_flags { BCH_READ_RETRY_IF_STALE = 1 << 0, BCH_READ_MAY_PROMOTE = 1 << 1, BCH_READ_USER_MAPPED = 1 << 2, BCH_READ_NODECODE = 1 << 3, + BCH_READ_LAST_FRAGMENT = 1 << 4, /* internal: */ - BCH_READ_MUST_BOUNCE = 1 << 4, - BCH_READ_MUST_CLONE = 1 << 5, - BCH_READ_IN_RETRY = 1 << 6, + BCH_READ_MUST_BOUNCE = 1 << 5, + BCH_READ_MUST_CLONE = 1 << 6, + BCH_READ_IN_RETRY = 1 << 7, }; static inline void bch2_read_extent(struct bch_fs *c, struct bch_read_bio *rbio, - struct bkey_s_c_extent e, - struct extent_pick_ptr *pick, + struct bkey_s_c k, unsigned flags) { - __bch2_read_extent(c, rbio, rbio->bio.bi_iter, e, pick, flags); -} - -static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - u64 inode) -{ - BUG_ON(rbio->_state); - __bch2_read(c, rbio, rbio->bio.bi_iter, inode, NULL, - BCH_READ_RETRY_IF_STALE| - BCH_READ_MAY_PROMOTE| - BCH_READ_USER_MAPPED); + __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, NULL, flags); } static inline struct bch_read_bio *rbio_init(struct bio *bio, @@ -146,4 +134,7 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio, return rbio; } +void bch2_fs_io_exit(struct bch_fs *); +int 
bch2_fs_io_init(struct bch_fs *); + #endif /* _BCACHEFS_IO_H */ diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h index a022ab33..28281ea6 100644 --- a/libbcachefs/io_types.h +++ b/libbcachefs/io_types.h @@ -14,6 +14,8 @@ struct bch_read_bio { struct bch_fs *c; + u64 start_time; + u64 submit_time; /* * Reads will often have to be split, and if the extent being read from @@ -35,17 +37,19 @@ struct bch_read_bio { */ struct bvec_iter bvec_iter; - unsigned submit_time_us; - u8 flags; + u16 flags; union { struct { - u8 bounce:1, + u16 bounce:1, split:1, + kmalloc:1, + have_ioref:1, narrow_crcs:1, + hole:1, retry:2, context:2; }; - u8 _state; + u16 _state; }; struct bch_devs_list devs_have; @@ -66,20 +70,20 @@ struct bch_read_bio { struct bch_write_bio { struct bch_fs *c; - struct bch_dev *ca; struct bch_write_bio *parent; + u64 submit_time; + struct bch_devs_list failed; u8 order; + u8 dev; unsigned split:1, bounce:1, put_bio:1, - have_io_ref:1, + have_ioref:1, used_mempool:1; - unsigned submit_time_us; - struct bio bio; }; @@ -87,6 +91,7 @@ struct bch_write_op { struct closure cl; struct bch_fs *c; struct workqueue_struct *io_wq; + u64 start_time; unsigned written; /* sectors */ u16 flags; diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index b525a85c..ea67af3d 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -7,1160 +7,16 @@ #include "bcachefs.h" #include "alloc.h" #include "bkey_methods.h" -#include "buckets.h" #include "btree_gc.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_io.h" -#include "checksum.h" -#include "debug.h" -#include "error.h" -#include "extents.h" -#include "io.h" -#include "keylist.h" +#include "buckets.h" #include "journal.h" -#include "replicas.h" +#include "journal_io.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" #include "super-io.h" -#include "vstructs.h" #include -static void journal_write(struct closure *); -static void journal_reclaim_fast(struct journal *); -static void journal_pin_add_entry(struct journal *, - struct journal_entry_pin_list *, - struct journal_entry_pin *, - journal_pin_flush_fn); - -static inline void journal_wake(struct journal *j) -{ - wake_up(&j->wait); - closure_wake_up(&j->async_wait); -} - -static inline struct journal_buf *journal_cur_buf(struct journal *j) -{ - return j->buf + j->reservations.idx; -} - -static inline struct journal_buf *journal_prev_buf(struct journal *j) -{ - return j->buf + !j->reservations.idx; -} - -/* Sequence number of oldest dirty journal entry */ - -static inline u64 journal_last_seq(struct journal *j) -{ - return j->pin.front; -} - -static inline u64 journal_cur_seq(struct journal *j) -{ - BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); - - return j->pin.back - 1; -} - -static inline u64 journal_pin_seq(struct journal *j, - struct journal_entry_pin_list *pin_list) -{ - return fifo_entry_idx_abs(&j->pin, pin_list); -} - -u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin) -{ - u64 ret = 0; - - spin_lock(&j->lock); - if (journal_pin_active(pin)) - ret = journal_pin_seq(j, pin->pin_list); - spin_unlock(&j->lock); - - return ret; -} - -static inline void bch2_journal_add_entry_noreservation(struct journal_buf *buf, - unsigned type, enum btree_id id, - unsigned level, - const void *data, size_t u64s) -{ - struct jset *jset = buf->data; - - bch2_journal_add_entry_at(buf, le32_to_cpu(jset->u64s), - type, id, level, data, u64s); - le32_add_cpu(&jset->u64s, jset_u64s(u64s)); -} - -static struct 
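
The widened state bits in struct bch_read_bio above keep the union-with-an-aggregate-word idiom, which is what makes checks like BUG_ON(rbio->_state) cheap: a freshly initialized rbio has every flag clear exactly when _state reads zero. A compact standalone illustration with a hypothetical subset of the fields (C11 anonymous members; bitfields on a u16 are a compiler extension, as in the kernel):

    /* Sketch: flag bits plus an aggregate word view of the same storage. */
    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    struct rbio_state {
            union {
                    struct {
                            uint16_t bounce:1,
                                     split:1,
                                     have_ioref:1,
                                     hole:1,
                                     retry:2,
                                     context:2;
                    };
                    uint16_t _state;    /* all of the above as one word */
            };
    };

    int main(void)
    {
            struct rbio_state s;

            memset(&s, 0, sizeof(s));
            assert(!s._state);          /* "pristine" check, like BUG_ON(rbio->_state) */

            s.bounce = 1;
            s.retry  = 2;
            assert(s._state);           /* any set bit shows up in the aggregate view */

            s._state = 0;               /* clear every flag with one store */
            assert(!s.bounce && !s.retry);
            return 0;
    }
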
jset_entry *bch2_journal_find_entry(struct jset *j, unsigned type, - enum btree_id id) -{ - struct jset_entry *entry; - - for_each_jset_entry_type(entry, j, type) - if (entry->btree_id == id) - return entry; - - return NULL; -} - -struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *c, struct jset *j, - enum btree_id id, unsigned *level) -{ - struct bkey_i *k; - struct jset_entry *entry = - bch2_journal_find_entry(j, JOURNAL_ENTRY_BTREE_ROOT, id); - - if (!entry) - return NULL; - - if (!entry->u64s) - return ERR_PTR(-EINVAL); - - k = entry->start; - *level = entry->level; - *level = entry->level; - return k; -} - -static void bch2_journal_add_btree_root(struct journal_buf *buf, - enum btree_id id, struct bkey_i *k, - unsigned level) -{ - bch2_journal_add_entry_noreservation(buf, - JOURNAL_ENTRY_BTREE_ROOT, id, level, - k, k->k.u64s); -} - -static void journal_seq_blacklist_flush(struct journal *j, - struct journal_entry_pin *pin, u64 seq) -{ - struct bch_fs *c = - container_of(j, struct bch_fs, journal); - struct journal_seq_blacklist *bl = - container_of(pin, struct journal_seq_blacklist, pin); - struct blacklisted_node n; - struct closure cl; - unsigned i; - int ret; - - closure_init_stack(&cl); - - for (i = 0;; i++) { - struct btree_iter iter; - struct btree *b; - - mutex_lock(&j->blacklist_lock); - if (i >= bl->nr_entries) { - mutex_unlock(&j->blacklist_lock); - break; - } - n = bl->entries[i]; - mutex_unlock(&j->blacklist_lock); - - __bch2_btree_iter_init(&iter, c, n.btree_id, n.pos, 0, 0, 0); - - b = bch2_btree_iter_peek_node(&iter); - - /* The node might have already been rewritten: */ - - if (b->data->keys.seq == n.seq) { - ret = bch2_btree_node_rewrite(c, &iter, n.seq, 0); - if (ret) { - bch2_btree_iter_unlock(&iter); - bch2_fs_fatal_error(c, - "error %i rewriting btree node with blacklisted journal seq", - ret); - bch2_journal_halt(j); - return; - } - } - - bch2_btree_iter_unlock(&iter); - } - - for (i = 0;; i++) { - struct btree_update *as; - struct pending_btree_node_free *d; - - mutex_lock(&j->blacklist_lock); - if (i >= bl->nr_entries) { - mutex_unlock(&j->blacklist_lock); - break; - } - n = bl->entries[i]; - mutex_unlock(&j->blacklist_lock); -redo_wait: - mutex_lock(&c->btree_interior_update_lock); - - /* - * Is the node on the list of pending interior node updates - - * being freed? 
If so, wait for that to finish: - */ - for_each_pending_btree_node_free(c, as, d) - if (n.seq == d->seq && - n.btree_id == d->btree_id && - !d->level && - !bkey_cmp(n.pos, d->key.k.p)) { - closure_wait(&as->wait, &cl); - mutex_unlock(&c->btree_interior_update_lock); - closure_sync(&cl); - goto redo_wait; - } - - mutex_unlock(&c->btree_interior_update_lock); - } - - mutex_lock(&j->blacklist_lock); - - bch2_journal_pin_drop(j, &bl->pin); - list_del(&bl->list); - kfree(bl->entries); - kfree(bl); - - mutex_unlock(&j->blacklist_lock); -} - -static struct journal_seq_blacklist * -journal_seq_blacklist_find(struct journal *j, u64 seq) -{ - struct journal_seq_blacklist *bl; - - lockdep_assert_held(&j->blacklist_lock); - - list_for_each_entry(bl, &j->seq_blacklist, list) - if (seq == bl->seq) - return bl; - - return NULL; -} - -static struct journal_seq_blacklist * -bch2_journal_seq_blacklisted_new(struct journal *j, u64 seq) -{ - struct journal_seq_blacklist *bl; - - lockdep_assert_held(&j->blacklist_lock); - - /* - * When we start the journal, bch2_journal_start() will skip over @seq: - */ - - bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) - return NULL; - - bl->seq = seq; - list_add_tail(&bl->list, &j->seq_blacklist); - return bl; -} - -/* - * Returns true if @seq is newer than the most recent journal entry that got - * written, and data corresponding to @seq should be ignored - also marks @seq - * as blacklisted so that on future restarts the corresponding data will still - * be ignored: - */ -int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b) -{ - struct journal *j = &c->journal; - struct journal_seq_blacklist *bl = NULL; - struct blacklisted_node *n; - u64 journal_seq, i; - int ret = 0; - - if (!seq) - return 0; - - spin_lock(&j->lock); - journal_seq = journal_cur_seq(j); - spin_unlock(&j->lock); - - /* Interier updates aren't journalled: */ - BUG_ON(b->level); - BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)); - - /* - * Decrease this back to j->seq + 2 when we next rev the on disk format: - * increasing it temporarily to work around bug in old kernels - */ - bch2_fs_inconsistent_on(seq > journal_seq + 4, c, - "bset journal seq too far in the future: %llu > %llu", - seq, journal_seq); - - if (seq <= journal_seq && - list_empty_careful(&j->seq_blacklist)) - return 0; - - mutex_lock(&j->blacklist_lock); - - if (seq <= journal_seq) { - bl = journal_seq_blacklist_find(j, seq); - if (!bl) - goto out; - } else { - bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting", - b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq); - - for (i = journal_seq + 1; i <= seq; i++) { - bl = journal_seq_blacklist_find(j, i) ?: - bch2_journal_seq_blacklisted_new(j, i); - if (!bl) { - ret = -ENOMEM; - goto out; - } - } - } - - for (n = bl->entries; n < bl->entries + bl->nr_entries; n++) - if (b->data->keys.seq == n->seq && - b->btree_id == n->btree_id && - !bkey_cmp(b->key.k.p, n->pos)) - goto found_entry; - - if (!bl->nr_entries || - is_power_of_2(bl->nr_entries)) { - n = krealloc(bl->entries, - max(bl->nr_entries * 2, 8UL) * sizeof(*n), - GFP_KERNEL); - if (!n) { - ret = -ENOMEM; - goto out; - } - bl->entries = n; - } - - bl->entries[bl->nr_entries++] = (struct blacklisted_node) { - .seq = b->data->keys.seq, - .btree_id = b->btree_id, - .pos = b->key.k.p, - }; -found_entry: - ret = 1; -out: - mutex_unlock(&j->blacklist_lock); - return ret; -} - -/* - * Journal replay/recovery: - * - * This code is all driven from 
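
The bch2_journal_seq_should_ignore() logic above (being moved into journal_seq_blacklist.c) reduces to a small decision: a bset whose journal sequence number is newer than anything that actually reached the journal must be ignored, and that sequence number gets blacklisted so later mounts ignore it too; a sequence number at or below the current one is ignored only if a previous mount already blacklisted it. A rough standalone model of just that decision (the real code also records which btree node referenced each blacklisted seq; names and the fixed-size table here are hypothetical):

    /* Sketch: should data tagged with this journal seq be ignored? */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MAX_BLACKLIST 16

    struct blacklist {
            uint64_t seqs[MAX_BLACKLIST];
            unsigned nr;
    };

    static bool blacklisted(const struct blacklist *bl, uint64_t seq)
    {
            for (unsigned i = 0; i < bl->nr; i++)
                    if (bl->seqs[i] == seq)
                            return true;
            return false;
    }

    static bool should_ignore(struct blacklist *bl, uint64_t journal_seq,
                              uint64_t bset_seq)
    {
            if (bset_seq > journal_seq) {
                    /* never made it to the journal: blacklist the whole range */
                    for (uint64_t s = journal_seq + 1;
                         s <= bset_seq && bl->nr < MAX_BLACKLIST; s++)
                            if (!blacklisted(bl, s))
                                    bl->seqs[bl->nr++] = s;
                    return true;
            }

            return blacklisted(bl, bset_seq);
    }

    int main(void)
    {
            struct blacklist bl = { .nr = 0 };

            printf("%d\n", should_ignore(&bl, 100, 102)); /* 1: future seq, 101..102 blacklisted */
            printf("%d\n", should_ignore(&bl, 105, 101)); /* 1: previously blacklisted */
            printf("%d\n", should_ignore(&bl, 105, 99));  /* 0: ordinary, replayable */
            return 0;
    }
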
bch2_fs_start(); we first read the journal - * entries, do some other stuff, then we mark all the keys in the journal - * entries (same as garbage collection would), then we replay them - reinserting - * them into the cache in precisely the same order as they appear in the - * journal. - * - * We only journal keys that go in leaf nodes, which simplifies things quite a - * bit. - */ - -struct journal_list { - struct closure cl; - struct mutex lock; - struct list_head *head; - int ret; -}; - -#define JOURNAL_ENTRY_ADD_OK 0 -#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 - -/* - * Given a journal entry we just read, add it to the list of journal entries to - * be replayed: - */ -static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, - struct journal_list *jlist, struct jset *j) -{ - struct journal_replay *i, *pos; - struct list_head *where; - size_t bytes = vstruct_bytes(j); - __le64 last_seq; - int ret; - - last_seq = !list_empty(jlist->head) - ? list_last_entry(jlist->head, struct journal_replay, - list)->j.last_seq - : 0; - - /* Is this entry older than the range we need? */ - if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) { - ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; - goto out; - } - - /* Drop entries we don't need anymore */ - list_for_each_entry_safe(i, pos, jlist->head, list) { - if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) - break; - list_del(&i->list); - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); - } - - list_for_each_entry_reverse(i, jlist->head, list) { - /* Duplicate? */ - if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { - fsck_err_on(bytes != vstruct_bytes(&i->j) || - memcmp(j, &i->j, bytes), c, - "found duplicate but non identical journal entries (seq %llu)", - le64_to_cpu(j->seq)); - goto found; - } - - if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { - where = &i->list; - goto add; - } - } - - where = jlist->head; -add: - i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); - if (!i) { - ret = -ENOMEM; - goto out; - } - - list_add(&i->list, where); - i->devs.nr = 0; - memcpy(&i->j, j, bytes); -found: - if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) - bch2_dev_list_add_dev(&i->devs, ca->dev_idx); - else - fsck_err_on(1, c, "duplicate journal entries on same device"); - ret = JOURNAL_ENTRY_ADD_OK; -out: -fsck_err: - return ret; -} - -static struct nonce journal_nonce(const struct jset *jset) -{ - return (struct nonce) {{ - [0] = 0, - [1] = ((__le32 *) &jset->seq)[0], - [2] = ((__le32 *) &jset->seq)[1], - [3] = BCH_NONCE_JOURNAL, - }}; -} - -/* this fills in a range with empty jset_entries: */ -static void journal_entry_null_range(void *start, void *end) -{ - struct jset_entry *entry; - - for (entry = start; entry != end; entry = vstruct_next(entry)) - memset(entry, 0, sizeof(*entry)); -} - -static int journal_validate_key(struct bch_fs *c, struct jset *jset, - struct jset_entry *entry, - struct bkey_i *k, enum bkey_type key_type, - const char *type) -{ - void *next = vstruct_next(entry); - const char *invalid; - char buf[160]; - int ret = 0; - - if (mustfix_fsck_err_on(!k->k.u64s, c, - "invalid %s in journal: k->u64s 0", type)) { - entry->u64s = cpu_to_le16((u64 *) k - entry->_data); - journal_entry_null_range(vstruct_next(entry), next); - return 0; - } - - if (mustfix_fsck_err_on((void *) bkey_next(k) > - (void *) vstruct_next(entry), c, - "invalid %s in journal: extends past end of journal entry", - type)) { - entry->u64s = cpu_to_le16((u64 *) k - entry->_data); - 
journal_entry_null_range(vstruct_next(entry), next); - return 0; - } - - if (mustfix_fsck_err_on(k->k.format != KEY_FORMAT_CURRENT, c, - "invalid %s in journal: bad format %u", - type, k->k.format)) { - le16_add_cpu(&entry->u64s, -k->k.u64s); - memmove(k, bkey_next(k), next - (void *) bkey_next(k)); - journal_entry_null_range(vstruct_next(entry), next); - return 0; - } - - if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN) - bch2_bkey_swab(key_type, NULL, bkey_to_packed(k)); - - invalid = bch2_bkey_invalid(c, key_type, bkey_i_to_s_c(k)); - if (invalid) { - bch2_bkey_val_to_text(c, key_type, buf, sizeof(buf), - bkey_i_to_s_c(k)); - mustfix_fsck_err(c, "invalid %s in journal: %s\n%s", - type, invalid, buf); - - le16_add_cpu(&entry->u64s, -k->k.u64s); - memmove(k, bkey_next(k), next - (void *) bkey_next(k)); - journal_entry_null_range(vstruct_next(entry), next); - return 0; - } -fsck_err: - return ret; -} - -#define JOURNAL_ENTRY_REREAD 5 -#define JOURNAL_ENTRY_NONE 6 -#define JOURNAL_ENTRY_BAD 7 - -#define journal_entry_err(c, msg, ...) \ -({ \ - if (write == READ) { \ - mustfix_fsck_err(c, msg, ##__VA_ARGS__); \ - } else { \ - bch_err(c, "detected corrupt metadata before write:\n" \ - msg, ##__VA_ARGS__); \ - ret = BCH_FSCK_ERRORS_NOT_FIXED; \ - goto fsck_err; \ - } \ - true; \ -}) - -#define journal_entry_err_on(cond, c, msg, ...) \ - ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false) - -static int journal_entry_validate_entries(struct bch_fs *c, struct jset *jset, - int write) -{ - struct jset_entry *entry; - int ret = 0; - - vstruct_for_each(jset, entry) { - void *next = vstruct_next(entry); - struct bkey_i *k; - - if (journal_entry_err_on(vstruct_next(entry) > - vstruct_last(jset), c, - "journal entry extends past end of jset")) { - jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); - break; - } - - switch (entry->type) { - case JOURNAL_ENTRY_BTREE_KEYS: - vstruct_for_each(entry, k) { - ret = journal_validate_key(c, jset, entry, k, - bkey_type(entry->level, - entry->btree_id), - "key"); - if (ret) - goto fsck_err; - } - break; - - case JOURNAL_ENTRY_BTREE_ROOT: - k = entry->start; - - if (journal_entry_err_on(!entry->u64s || - le16_to_cpu(entry->u64s) != k->k.u64s, c, - "invalid btree root journal entry: wrong number of keys")) { - /* - * we don't want to null out this jset_entry, - * just the contents, so that later we can tell - * we were _supposed_ to have a btree root - */ - entry->u64s = 0; - journal_entry_null_range(vstruct_next(entry), next); - continue; - } - - ret = journal_validate_key(c, jset, entry, k, - BKEY_TYPE_BTREE, "btree root"); - if (ret) - goto fsck_err; - break; - - case JOURNAL_ENTRY_PRIO_PTRS: - break; - - case JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED: - if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c, - "invalid journal seq blacklist entry: bad size")) { - journal_entry_null_range(entry, - vstruct_next(entry)); - } - - break; - default: - journal_entry_err(c, "invalid journal entry type %u", - entry->type); - journal_entry_null_range(entry, vstruct_next(entry)); - break; - } - } - -fsck_err: - return ret; -} - -static int journal_entry_validate(struct bch_fs *c, - struct jset *jset, u64 sector, - unsigned bucket_sectors_left, - unsigned sectors_read, - int write) -{ - size_t bytes = vstruct_bytes(jset); - struct bch_csum csum; - int ret = 0; - - if (le64_to_cpu(jset->magic) != jset_magic(c)) - return JOURNAL_ENTRY_NONE; - - if (le32_to_cpu(jset->version) != BCACHE_JSET_VERSION) { - bch_err(c, "unknown journal entry version %u", - 
le32_to_cpu(jset->version)); - return BCH_FSCK_UNKNOWN_VERSION; - } - - if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, - "journal entry too big (%zu bytes), sector %lluu", - bytes, sector)) { - /* XXX: note we might have missing journal entries */ - return JOURNAL_ENTRY_BAD; - } - - if (bytes > sectors_read << 9) - return JOURNAL_ENTRY_REREAD; - - if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, - "journal entry with unknown csum type %llu sector %lluu", - JSET_CSUM_TYPE(jset), sector)) - return JOURNAL_ENTRY_BAD; - - csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); - if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, - "journal checksum bad, sector %llu", sector)) { - /* XXX: retry IO, when we start retrying checksum errors */ - /* XXX: note we might have missing journal entries */ - return JOURNAL_ENTRY_BAD; - } - - bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), - jset->encrypted_start, - vstruct_end(jset) - (void *) jset->encrypted_start); - - if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, - "invalid journal entry: last_seq > seq")) - jset->last_seq = jset->seq; - - return 0; -fsck_err: - return ret; -} - -struct journal_read_buf { - void *data; - size_t size; -}; - -static int journal_read_buf_realloc(struct journal_read_buf *b, - size_t new_size) -{ - void *n; - - /* the bios are sized for this many pages, max: */ - if (new_size > JOURNAL_ENTRY_SIZE_MAX) - return -ENOMEM; - - new_size = roundup_pow_of_two(new_size); - n = kvpmalloc(new_size, GFP_KERNEL); - if (!n) - return -ENOMEM; - - kvpfree(b->data, b->size); - b->data = n; - b->size = new_size; - return 0; -} - -static int journal_read_bucket(struct bch_dev *ca, - struct journal_read_buf *buf, - struct journal_list *jlist, - unsigned bucket, u64 *seq, bool *entries_found) -{ - struct bch_fs *c = ca->fs; - struct journal_device *ja = &ca->journal; - struct bio *bio = ja->bio; - struct jset *j = NULL; - unsigned sectors, sectors_read = 0; - u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), - end = offset + ca->mi.bucket_size; - bool saw_bad = false; - int ret = 0; - - pr_debug("reading %u", bucket); - - while (offset < end) { - if (!sectors_read) { -reread: sectors_read = min_t(unsigned, - end - offset, buf->size >> 9); - - bio_reset(bio); - bio_set_dev(bio, ca->disk_sb.bdev); - bio->bi_iter.bi_sector = offset; - bio->bi_iter.bi_size = sectors_read << 9; - bio_set_op_attrs(bio, REQ_OP_READ, 0); - bch2_bio_map(bio, buf->data); - - ret = submit_bio_wait(bio); - - if (bch2_dev_io_err_on(ret, ca, - "journal read from sector %llu", - offset) || - bch2_meta_read_fault("journal")) - return -EIO; - - j = buf->data; - } - - ret = journal_entry_validate(c, j, offset, - end - offset, sectors_read, - READ); - switch (ret) { - case BCH_FSCK_OK: - break; - case JOURNAL_ENTRY_REREAD: - if (vstruct_bytes(j) > buf->size) { - ret = journal_read_buf_realloc(buf, - vstruct_bytes(j)); - if (ret) - return ret; - } - goto reread; - case JOURNAL_ENTRY_NONE: - if (!saw_bad) - return 0; - sectors = c->opts.block_size; - goto next_block; - case JOURNAL_ENTRY_BAD: - saw_bad = true; - sectors = c->opts.block_size; - goto next_block; - default: - return ret; - } - - /* - * This happens sometimes if we don't have discards on - - * when we've partially overwritten a bucket with new - * journal entries. 
We don't need the rest of the - * bucket: - */ - if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) - return 0; - - ja->bucket_seq[bucket] = le64_to_cpu(j->seq); - - mutex_lock(&jlist->lock); - ret = journal_entry_add(c, ca, jlist, j); - mutex_unlock(&jlist->lock); - - switch (ret) { - case JOURNAL_ENTRY_ADD_OK: - *entries_found = true; - break; - case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: - break; - default: - return ret; - } - - if (le64_to_cpu(j->seq) > *seq) - *seq = le64_to_cpu(j->seq); - - sectors = vstruct_sectors(j, c->block_bits); -next_block: - pr_debug("next"); - offset += sectors; - sectors_read -= sectors; - j = ((void *) j) + (sectors << 9); - } - - return 0; -} - -static void bch2_journal_read_device(struct closure *cl) -{ -#define read_bucket(b) \ - ({ \ - bool entries_found = false; \ - ret = journal_read_bucket(ca, &buf, jlist, b, &seq, \ - &entries_found); \ - if (ret) \ - goto err; \ - __set_bit(b, bitmap); \ - entries_found; \ - }) - - struct journal_device *ja = - container_of(cl, struct journal_device, read); - struct bch_dev *ca = container_of(ja, struct bch_dev, journal); - struct journal_list *jlist = - container_of(cl->parent, struct journal_list, cl); - struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev); - struct journal_read_buf buf = { NULL, 0 }; - - DECLARE_BITMAP(bitmap, ja->nr); - unsigned i, l, r; - u64 seq = 0; - int ret; - - if (!ja->nr) - goto out; - - bitmap_zero(bitmap, ja->nr); - ret = journal_read_buf_realloc(&buf, PAGE_SIZE); - if (ret) - goto err; - - pr_debug("%u journal buckets", ja->nr); - - /* - * If the device supports discard but not secure discard, we can't do - * the fancy fibonacci hash/binary search because the live journal - * entries might not form a contiguous range: - */ - for (i = 0; i < ja->nr; i++) - read_bucket(i); - goto search_done; - - if (!blk_queue_nonrot(q)) - goto linear_scan; - - /* - * Read journal buckets ordered by golden ratio hash to quickly - * find a sequence of buckets with valid journal entries - */ - for (i = 0; i < ja->nr; i++) { - l = (i * 2654435769U) % ja->nr; - - if (test_bit(l, bitmap)) - break; - - if (read_bucket(l)) - goto bsearch; - } - - /* - * If that fails, check all the buckets we haven't checked - * already - */ - pr_debug("falling back to linear search"); -linear_scan: - for (l = find_first_zero_bit(bitmap, ja->nr); - l < ja->nr; - l = find_next_zero_bit(bitmap, ja->nr, l + 1)) - if (read_bucket(l)) - goto bsearch; - - /* no journal entries on this device? */ - if (l == ja->nr) - goto out; -bsearch: - /* Binary search */ - r = find_next_bit(bitmap, ja->nr, l + 1); - pr_debug("starting binary search, l %u r %u", l, r); - - while (l + 1 < r) { - unsigned m = (l + r) >> 1; - u64 cur_seq = seq; - - read_bucket(m); - - if (cur_seq != seq) - l = m; - else - r = m; - } - -search_done: - /* - * Find the journal bucket with the highest sequence number: - * - * If there's duplicate journal entries in multiple buckets (which - * definitely isn't supposed to happen, but...) 
- make sure to start - * cur_idx at the last of those buckets, so we don't deadlock trying to - * allocate - */ - seq = 0; - - for (i = 0; i < ja->nr; i++) - if (ja->bucket_seq[i] >= seq && - ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) { - /* - * When journal_next_bucket() goes to allocate for - * the first time, it'll use the bucket after - * ja->cur_idx - */ - ja->cur_idx = i; - seq = ja->bucket_seq[i]; - } - - /* - * Set last_idx to indicate the entire journal is full and needs to be - * reclaimed - journal reclaim will immediately reclaim whatever isn't - * pinned when it first runs: - */ - ja->last_idx = (ja->cur_idx + 1) % ja->nr; - - /* - * Read buckets in reverse order until we stop finding more journal - * entries: - */ - for (i = (ja->cur_idx + ja->nr - 1) % ja->nr; - i != ja->cur_idx; - i = (i + ja->nr - 1) % ja->nr) - if (!test_bit(i, bitmap) && - !read_bucket(i)) - break; -out: - kvpfree(buf.data, buf.size); - percpu_ref_put(&ca->io_ref); - closure_return(cl); -err: - mutex_lock(&jlist->lock); - jlist->ret = ret; - mutex_unlock(&jlist->lock); - goto out; -#undef read_bucket -} - -void bch2_journal_entries_free(struct list_head *list) -{ - - while (!list_empty(list)) { - struct journal_replay *i = - list_first_entry(list, struct journal_replay, list); - list_del(&i->list); - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); - } -} - -static int journal_seq_blacklist_read(struct journal *j, - struct journal_replay *i, - struct journal_entry_pin_list *p) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct jset_entry *entry; - struct journal_seq_blacklist *bl; - u64 seq; - - for_each_jset_entry_type(entry, &i->j, - JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED) { - struct jset_entry_blacklist *bl_entry = - container_of(entry, struct jset_entry_blacklist, entry); - seq = le64_to_cpu(bl_entry->seq); - - bch_verbose(c, "blacklisting existing journal seq %llu", seq); - - bl = bch2_journal_seq_blacklisted_new(j, seq); - if (!bl) - return -ENOMEM; - - journal_pin_add_entry(j, p, &bl->pin, - journal_seq_blacklist_flush); - bl->written = true; - } - - return 0; -} - -static inline bool journal_has_keys(struct list_head *list) -{ - struct journal_replay *i; - struct jset_entry *entry; - struct bkey_i *k, *_n; - - list_for_each_entry(i, list, list) - for_each_jset_key(k, _n, entry, &i->j) - return true; - - return false; -} - -int bch2_journal_read(struct bch_fs *c, struct list_head *list) -{ - struct journal *j = &c->journal; - struct journal_list jlist; - struct journal_replay *i; - struct journal_entry_pin_list *p; - struct bch_dev *ca; - u64 cur_seq, end_seq, seq; - unsigned iter, keys = 0, entries = 0; - size_t nr; - bool degraded = false; - int ret = 0; - - closure_init_stack(&jlist.cl); - mutex_init(&jlist.lock); - jlist.head = list; - jlist.ret = 0; - - for_each_member_device(ca, c, iter) { - if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL))) - continue; - - if ((ca->mi.state == BCH_MEMBER_STATE_RW || - ca->mi.state == BCH_MEMBER_STATE_RO) && - percpu_ref_tryget(&ca->io_ref)) - closure_call(&ca->journal.read, - bch2_journal_read_device, - system_unbound_wq, - &jlist.cl); - else - degraded = true; - } - - closure_sync(&jlist.cl); - - if (jlist.ret) - return jlist.ret; - - if (list_empty(list)){ - bch_err(c, "no journal entries found"); - return BCH_FSCK_REPAIR_IMPOSSIBLE; - } - - fsck_err_on(c->sb.clean && journal_has_keys(list), c, - "filesystem marked clean but journal has keys to replay"); - - list_for_each_entry(i, list, 
list) { - ret = journal_entry_validate_entries(c, &i->j, READ); - if (ret) - goto fsck_err; - - /* - * If we're mounting in degraded mode - if we didn't read all - * the devices - this is wrong: - */ - - if (!degraded && - (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL, - i->devs), c, - "superblock not marked as containing replicas (type %u)", - BCH_DATA_JOURNAL))) { - ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs); - if (ret) - return ret; - } - } - - i = list_last_entry(list, struct journal_replay, list); - - nr = le64_to_cpu(i->j.seq) - le64_to_cpu(i->j.last_seq) + 1; - - if (nr > j->pin.size) { - free_fifo(&j->pin); - init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); - if (!j->pin.data) { - bch_err(c, "error reallocating journal fifo (%zu open entries)", nr); - return -ENOMEM; - } - } - - atomic64_set(&j->seq, le64_to_cpu(i->j.seq)); - j->last_seq_ondisk = le64_to_cpu(i->j.last_seq); - - j->pin.front = le64_to_cpu(i->j.last_seq); - j->pin.back = le64_to_cpu(i->j.seq) + 1; - - fifo_for_each_entry_ptr(p, &j->pin, seq) { - INIT_LIST_HEAD(&p->list); - INIT_LIST_HEAD(&p->flushed); - atomic_set(&p->count, 0); - p->devs.nr = 0; - } - - mutex_lock(&j->blacklist_lock); - - list_for_each_entry(i, list, list) { - p = journal_seq_pin(j, le64_to_cpu(i->j.seq)); - - atomic_set(&p->count, 1); - p->devs = i->devs; - - if (journal_seq_blacklist_read(j, i, p)) { - mutex_unlock(&j->blacklist_lock); - return -ENOMEM; - } - } - - mutex_unlock(&j->blacklist_lock); - - cur_seq = journal_last_seq(j); - end_seq = le64_to_cpu(list_last_entry(list, - struct journal_replay, list)->j.seq); - - list_for_each_entry(i, list, list) { - struct jset_entry *entry; - struct bkey_i *k, *_n; - bool blacklisted; - - mutex_lock(&j->blacklist_lock); - while (cur_seq < le64_to_cpu(i->j.seq) && - journal_seq_blacklist_find(j, cur_seq)) - cur_seq++; - - blacklisted = journal_seq_blacklist_find(j, - le64_to_cpu(i->j.seq)); - mutex_unlock(&j->blacklist_lock); - - fsck_err_on(blacklisted, c, - "found blacklisted journal entry %llu", - le64_to_cpu(i->j.seq)); - - fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c, - "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", - cur_seq, le64_to_cpu(i->j.seq) - 1, - journal_last_seq(j), end_seq); - - cur_seq = le64_to_cpu(i->j.seq) + 1; - - for_each_jset_key(k, _n, entry, &i->j) - keys++; - entries++; - } - - bch_info(c, "journal read done, %i keys in %i entries, seq %llu", - keys, entries, journal_cur_seq(j)); -fsck_err: - return ret; -} - -int bch2_journal_mark(struct bch_fs *c, struct list_head *list) -{ - struct bkey_i *k, *n; - struct jset_entry *j; - struct journal_replay *r; - int ret; - - list_for_each_entry(r, list, list) - for_each_jset_key(k, n, j, &r->j) { - enum bkey_type type = bkey_type(j->level, j->btree_id); - struct bkey_s_c k_s_c = bkey_i_to_s_c(k); - - if (btree_type_has_ptrs(type)) { - ret = bch2_btree_mark_key_initial(c, type, k_s_c); - if (ret) - return ret; - } - } - - return 0; -} - static bool journal_entry_is_open(struct journal *j) { return j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; @@ -1174,15 +30,15 @@ void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set) if (!need_write_just_set && test_bit(JOURNAL_NEED_WRITE, &j->flags)) - __bch2_time_stats_update(j->delay_time, - j->need_write_time); + bch2_time_stats_update(j->delay_time, + j->need_write_time); #if 0 - closure_call(&j->io, journal_write, NULL, NULL); + closure_call(&j->io, bch2_journal_write, NULL, NULL); #else /* Shut sparse up: */ closure_init(&j->io, NULL); - set_closure_fn(&j->io, journal_write, NULL); - journal_write(&j->io); + set_closure_fn(&j->io, bch2_journal_write, NULL); + bch2_journal_write(&j->io); #endif } @@ -1269,8 +125,8 @@ static enum { c->opts.block_size; BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors); - journal_reclaim_fast(j); - /* XXX: why set this here, and not in journal_write()? */ + bch2_journal_reclaim_fast(j); + /* XXX: why set this here, and not in bch2_journal_write()? */ buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); journal_pin_new_entry(j, 1); @@ -1285,6 +141,8 @@ static enum { bch2_bucket_seq_cleanup(c); } + c->bucket_journal_seq++; + /* ugh - might be called from __journal_res_get() under wait_event() */ __set_current_state(TASK_RUNNING); bch2_journal_buf_put(j, old.idx, need_write_just_set); @@ -1311,96 +169,6 @@ void bch2_journal_halt(struct journal *j) closure_wake_up(&journal_prev_buf(j)->wait); } -static unsigned journal_dev_buckets_available(struct journal *j, - struct bch_dev *ca) -{ - struct journal_device *ja = &ca->journal; - unsigned next = (ja->cur_idx + 1) % ja->nr; - unsigned available = (ja->last_idx + ja->nr - next) % ja->nr; - - /* - * Hack to avoid a deadlock during journal replay: - * journal replay might require setting a new btree - * root, which requires writing another journal entry - - * thus, if the journal is full (and this happens when - * replaying the first journal bucket's entries) we're - * screwed. 
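
The journal_dev_buckets_available() helper above is ring arithmetic: cur_idx is the bucket currently being written, last_idx is the oldest bucket still holding dirty entries, and the space between the next write bucket and last_idx is what can still be allocated, minus a small reserve for the replay and last_seq corner cases described in the comment. A standalone sketch with hypothetical numbers and a collapsed reserve parameter:

    /* Sketch: free journal buckets on a ring of nr buckets. */
    #include <stdio.h>

    static unsigned buckets_available(unsigned nr, unsigned cur_idx,
                                      unsigned last_idx, int reserve)
    {
            unsigned next = (cur_idx + 1) % nr;
            int available = (int) ((last_idx + nr - next) % nr);

            available -= reserve;       /* replay / last_seq adjustments, simplified */

            return available > 0 ? (unsigned) available : 0;
    }

    int main(void)
    {
            /* 8 buckets, writing into bucket 5, oldest dirty bucket is 2: */
            printf("%u\n", buckets_available(8, 5, 2, 0));  /* 4 */
            printf("%u\n", buckets_available(8, 5, 2, 3));  /* 1 */

            /* full: the next write bucket is the oldest dirty one */
            printf("%u\n", buckets_available(8, 1, 2, 0));  /* 0 */
            return 0;
    }
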
- * - * So don't let the journal fill up unless we're in - * replay: - */ - if (test_bit(JOURNAL_REPLAY_DONE, &j->flags)) - available = max((int) available - 2, 0); - - /* - * Don't use the last bucket unless writing the new last_seq - * will make another bucket available: - */ - if (ja->bucket_seq[ja->last_idx] >= journal_last_seq(j)) - available = max((int) available - 1, 0); - - return available; -} - -/* returns number of sectors available for next journal entry: */ -static int journal_entry_sectors(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; - struct bkey_s_extent e = bkey_i_to_s_extent(&j->key); - unsigned sectors_available = UINT_MAX; - unsigned i, nr_online = 0, nr_devs = 0; - - lockdep_assert_held(&j->lock); - - rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_JOURNAL]) { - struct journal_device *ja = &ca->journal; - unsigned buckets_required = 0; - - if (!ja->nr) - continue; - - sectors_available = min_t(unsigned, sectors_available, - ca->mi.bucket_size); - - /* - * Note that we don't allocate the space for a journal entry - * until we write it out - thus, if we haven't started the write - * for the previous entry we have to make sure we have space for - * it too: - */ - if (bch2_extent_has_device(e.c, ca->dev_idx)) { - if (j->prev_buf_sectors > ja->sectors_free) - buckets_required++; - - if (j->prev_buf_sectors + sectors_available > - ja->sectors_free) - buckets_required++; - } else { - if (j->prev_buf_sectors + sectors_available > - ca->mi.bucket_size) - buckets_required++; - - buckets_required++; - } - - if (journal_dev_buckets_available(j, ca) >= buckets_required) - nr_devs++; - nr_online++; - } - rcu_read_unlock(); - - if (nr_online < c->opts.metadata_replicas_required) - return -EROFS; - - if (nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas)) - return 0; - - return sectors_available; -} - /* * should _only_ called from journal_res_get() - when we actually want a * journal reservation - journal entry is open means journal is dirty: @@ -1425,7 +193,7 @@ static int journal_entry_open(struct journal *j) if (!fifo_free(&j->pin)) return 0; - sectors = journal_entry_sectors(j); + sectors = bch2_journal_entry_sectors(j); if (sectors <= 0) return sectors; @@ -1468,8 +236,8 @@ static int journal_entry_open(struct journal *j) old.v, new.v)) != old.v); if (j->res_get_blocked_start) - __bch2_time_stats_update(j->blocked_time, - j->res_get_blocked_start); + bch2_time_stats_update(j->blocked_time, + j->res_get_blocked_start); j->res_get_blocked_start = 0; mod_delayed_work(system_freezable_wq, @@ -1479,110 +247,444 @@ static int journal_entry_open(struct journal *j) return 1; } -void bch2_journal_start(struct bch_fs *c) +/* + * returns true if there's nothing to flush and no journal write still in flight + */ +static bool journal_flush_write(struct journal *j) { - struct journal *j = &c->journal; - struct journal_seq_blacklist *bl; - u64 new_seq = 0; + bool ret; - list_for_each_entry(bl, &j->seq_blacklist, list) - new_seq = max(new_seq, bl->seq); + spin_lock(&j->lock); + ret = !j->reservations.prev_buf_unwritten; + + if (!journal_entry_is_open(j)) { + spin_unlock(&j->lock); + return ret; + } + + set_bit(JOURNAL_NEED_WRITE, &j->flags); + if (journal_buf_switch(j, false) == JOURNAL_UNLOCKED) + ret = false; + else + spin_unlock(&j->lock); + return ret; +} + +static void journal_write_work(struct work_struct *work) +{ + struct journal *j = container_of(work, struct journal, 
write_work.work); + + journal_flush_write(j); +} + +/* + * Given an inode number, if that inode number has data in the journal that + * hasn't yet been flushed, return the journal sequence number that needs to be + * flushed: + */ +u64 bch2_inode_journal_seq(struct journal *j, u64 inode) +{ + size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); + u64 seq = 0; + + if (!test_bit(h, j->buf[0].has_inode) && + !test_bit(h, j->buf[1].has_inode)) + return 0; + + spin_lock(&j->lock); + if (test_bit(h, journal_cur_buf(j)->has_inode)) + seq = journal_cur_seq(j); + else if (test_bit(h, journal_prev_buf(j)->has_inode)) + seq = journal_cur_seq(j) - 1; + spin_unlock(&j->lock); + + return seq; +} + +static int __journal_res_get(struct journal *j, struct journal_res *res, + unsigned u64s_min, unsigned u64s_max) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *buf; + int ret; +retry: + ret = journal_res_get_fast(j, res, u64s_min, u64s_max); + if (ret) + return ret; + + spin_lock(&j->lock); + /* + * Recheck after taking the lock, so we don't race with another thread + * that just did journal_entry_open() and call journal_entry_close() + * unnecessarily + */ + ret = journal_res_get_fast(j, res, u64s_min, u64s_max); + if (ret) { + spin_unlock(&j->lock); + return 1; + } + + /* + * If we couldn't get a reservation because the current buf filled up, + * and we had room for a bigger entry on disk, signal that we want to + * realloc the journal bufs: + */ + buf = journal_cur_buf(j); + if (journal_entry_is_open(j) && + buf->size >> 9 < buf->disk_sectors && + buf->size < JOURNAL_ENTRY_SIZE_MAX) + j->buf_size_want = max(j->buf_size_want, buf->size << 1); + + /* + * Close the current journal entry if necessary, then try to start a new + * one: + */ + switch (journal_buf_switch(j, false)) { + case JOURNAL_ENTRY_ERROR: + spin_unlock(&j->lock); + return -EROFS; + case JOURNAL_ENTRY_INUSE: + /* haven't finished writing out the previous one: */ + spin_unlock(&j->lock); + trace_journal_entry_full(c); + goto blocked; + case JOURNAL_ENTRY_CLOSED: + break; + case JOURNAL_UNLOCKED: + goto retry; + } + + /* We now have a new, closed journal buf - see if we can open it: */ + ret = journal_entry_open(j); + spin_unlock(&j->lock); + + if (ret < 0) + return ret; + if (ret) + goto retry; + + /* Journal's full, we have to wait */ + + /* + * Direct reclaim - can't rely on reclaim from work item + * due to freezing.. + */ + bch2_journal_reclaim_work(&j->reclaim_work.work); + + trace_journal_full(c); +blocked: + if (!j->res_get_blocked_start) + j->res_get_blocked_start = local_clock() ?: 1; + return 0; +} + +/* + * Essentially the entry function to the journaling code. When bcachefs is doing + * a btree insert, it calls this function to get the current journal write. + * Journal write is the structure used set up journal writes. The calling + * function will then add its keys to the structure, queuing them for the next + * write. + * + * To ensure forward progress, the current task must not be holding any + * btree node write locks. + */ +int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, + unsigned u64s_min, unsigned u64s_max) +{ + int ret; + + wait_event(j->wait, + (ret = __journal_res_get(j, res, u64s_min, + u64s_max))); + return ret < 0 ? 
ret : 0; +} + +u64 bch2_journal_last_unwritten_seq(struct journal *j) +{ + u64 seq; + + spin_lock(&j->lock); + seq = journal_cur_seq(j); + if (j->reservations.prev_buf_unwritten) + seq--; + spin_unlock(&j->lock); + + return seq; +} + +/** + * bch2_journal_open_seq_async - try to open a new journal entry if @seq isn't + * open yet, or wait if we cannot + * + * used by the btree interior update machinery, when it needs to write a new + * btree root - every journal entry contains the roots of all the btrees, so it + * doesn't need to bother with getting a journal reservation + */ +int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *parent) +{ + int ret; + + spin_lock(&j->lock); + BUG_ON(seq > journal_cur_seq(j)); + + if (seq < journal_cur_seq(j) || + journal_entry_is_open(j)) { + spin_unlock(&j->lock); + return 1; + } + + ret = journal_entry_open(j); + if (!ret) + closure_wait(&j->async_wait, parent); + spin_unlock(&j->lock); + + if (!ret) + bch2_journal_reclaim_work(&j->reclaim_work.work); + + return ret; +} + +/** + * bch2_journal_wait_on_seq - wait for a journal entry to be written + * + * does _not_ cause @seq to be written immediately - if there is no other + * activity to cause the relevant journal entry to be filled up or flushed it + * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is + * configurable). + */ +void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent) +{ + spin_lock(&j->lock); + + BUG_ON(seq > journal_cur_seq(j)); + + if (bch2_journal_error(j)) { + spin_unlock(&j->lock); + return; + } + + if (seq == journal_cur_seq(j)) { + if (!closure_wait(&journal_cur_buf(j)->wait, parent)) + BUG(); + } else if (seq + 1 == journal_cur_seq(j) && + j->reservations.prev_buf_unwritten) { + if (!closure_wait(&journal_prev_buf(j)->wait, parent)) + BUG(); + + smp_mb(); + + /* check if raced with write completion (or failure) */ + if (!j->reservations.prev_buf_unwritten || + bch2_journal_error(j)) + closure_wake_up(&journal_prev_buf(j)->wait); + } + + spin_unlock(&j->lock); +} + +/** + * bch2_journal_flush_seq_async - wait for a journal entry to be written + * + * like bch2_journal_wait_on_seq, except that it triggers a write immediately if + * necessary + */ +void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent) +{ + struct journal_buf *buf; spin_lock(&j->lock); - set_bit(JOURNAL_STARTED, &j->flags); + BUG_ON(seq > journal_cur_seq(j)); - while (journal_cur_seq(j) < new_seq) - journal_pin_new_entry(j, 0); + if (bch2_journal_error(j)) { + spin_unlock(&j->lock); + return; + } - /* - * journal_buf_switch() only inits the next journal entry when it - * closes an open journal entry - the very first journal entry gets - * initialized here: - */ - journal_pin_new_entry(j, 1); - bch2_journal_buf_init(j); + if (seq == journal_cur_seq(j)) { + bool set_need_write = false; + + buf = journal_cur_buf(j); + + if (parent && !closure_wait(&buf->wait, parent)) + BUG(); + + if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) { + j->need_write_time = local_clock(); + set_need_write = true; + } + + switch (journal_buf_switch(j, set_need_write)) { + case JOURNAL_ENTRY_ERROR: + if (parent) + closure_wake_up(&buf->wait); + break; + case JOURNAL_ENTRY_CLOSED: + /* + * Journal entry hasn't been opened yet, but caller + * claims it has something + */ + BUG(); + case JOURNAL_ENTRY_INUSE: + break; + case JOURNAL_UNLOCKED: + return; + } + } else if (parent && + seq + 1 == journal_cur_seq(j) && + 
j->reservations.prev_buf_unwritten) { + buf = journal_prev_buf(j); + + if (!closure_wait(&buf->wait, parent)) + BUG(); + + smp_mb(); + + /* check if raced with write completion (or failure) */ + if (!j->reservations.prev_buf_unwritten || + bch2_journal_error(j)) + closure_wake_up(&buf->wait); + } + + spin_unlock(&j->lock); +} + +static int journal_seq_flushed(struct journal *j, u64 seq) +{ + struct journal_buf *buf; + int ret = 1; + + spin_lock(&j->lock); + BUG_ON(seq > journal_cur_seq(j)); + + if (seq == journal_cur_seq(j)) { + bool set_need_write = false; + + ret = 0; + + buf = journal_cur_buf(j); + + if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) { + j->need_write_time = local_clock(); + set_need_write = true; + } + + switch (journal_buf_switch(j, set_need_write)) { + case JOURNAL_ENTRY_ERROR: + ret = -EIO; + break; + case JOURNAL_ENTRY_CLOSED: + /* + * Journal entry hasn't been opened yet, but caller + * claims it has something + */ + BUG(); + case JOURNAL_ENTRY_INUSE: + break; + case JOURNAL_UNLOCKED: + return 0; + } + } else if (seq + 1 == journal_cur_seq(j) && + j->reservations.prev_buf_unwritten) { + ret = bch2_journal_error(j); + } spin_unlock(&j->lock); - /* - * Adding entries to the next journal entry before allocating space on - * disk for the next journal entry - this is ok, because these entries - * only have to go down with the next journal entry we write: - */ - list_for_each_entry(bl, &j->seq_blacklist, list) - if (!bl->written) { - bch2_journal_add_entry_noreservation(journal_cur_buf(j), - JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED, - 0, 0, &bl->seq, 1); - - journal_pin_add_entry(j, - &fifo_peek_back(&j->pin), - &bl->pin, - journal_seq_blacklist_flush); - bl->written = true; - } - - queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0); -} - -int bch2_journal_replay(struct bch_fs *c, struct list_head *list) -{ - struct journal *j = &c->journal; - struct bkey_i *k, *_n; - struct jset_entry *entry; - struct journal_replay *i, *n; - int ret = 0; - - list_for_each_entry_safe(i, n, list, list) { - j->replay_pin_list = - journal_seq_pin(j, le64_to_cpu(i->j.seq)); - - for_each_jset_key(k, _n, entry, &i->j) { - - if (entry->btree_id == BTREE_ID_ALLOC) { - /* - * allocation code handles replay for - * BTREE_ID_ALLOC keys: - */ - ret = bch2_alloc_replay_key(c, k->k.p); - } else { - /* - * We might cause compressed extents to be - * split, so we need to pass in a - * disk_reservation: - */ - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - - ret = bch2_btree_insert(c, entry->btree_id, k, - &disk_res, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_REPLAY); - } - - if (ret) { - bch_err(c, "journal replay: error %d while replaying key", - ret); - goto err; - } - - cond_resched(); - } - - if (atomic_dec_and_test(&j->replay_pin_list->count)) - journal_wake(j); - } - - j->replay_pin_list = NULL; - - bch2_journal_set_replay_done(j); - ret = bch2_journal_flush_all_pins(j); -err: - bch2_journal_entries_free(list); return ret; } +int bch2_journal_flush_seq(struct journal *j, u64 seq) +{ + u64 start_time = local_clock(); + int ret, ret2; + + ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq))); + + bch2_time_stats_update(j->flush_seq_time, start_time); + + return ret ?: ret2 < 0 ? 
ret2 : 0; +} + +/** + * bch2_journal_meta_async - force a journal entry to be written + */ +void bch2_journal_meta_async(struct journal *j, struct closure *parent) +{ + struct journal_res res; + unsigned u64s = jset_u64s(0); + + memset(&res, 0, sizeof(res)); + + bch2_journal_res_get(j, &res, u64s, u64s); + bch2_journal_res_put(j, &res); + + bch2_journal_flush_seq_async(j, res.seq, parent); +} + +int bch2_journal_meta(struct journal *j) +{ + struct journal_res res; + unsigned u64s = jset_u64s(0); + int ret; + + memset(&res, 0, sizeof(res)); + + ret = bch2_journal_res_get(j, &res, u64s, u64s); + if (ret) + return ret; + + bch2_journal_res_put(j, &res); + + return bch2_journal_flush_seq(j, res.seq); +} + +/* + * bch2_journal_flush_async - if there is an open journal entry, or a journal + * still being written, write it and wait for the write to complete + */ +void bch2_journal_flush_async(struct journal *j, struct closure *parent) +{ + u64 seq, journal_seq; + + spin_lock(&j->lock); + journal_seq = journal_cur_seq(j); + + if (journal_entry_is_open(j)) { + seq = journal_seq; + } else if (journal_seq) { + seq = journal_seq - 1; + } else { + spin_unlock(&j->lock); + return; + } + spin_unlock(&j->lock); + + bch2_journal_flush_seq_async(j, seq, parent); +} + +int bch2_journal_flush(struct journal *j) +{ + u64 seq, journal_seq; + + spin_lock(&j->lock); + journal_seq = journal_cur_seq(j); + + if (journal_entry_is_open(j)) { + seq = journal_seq; + } else if (journal_seq) { + seq = journal_seq - 1; + } else { + spin_unlock(&j->lock); + return 0; + } + spin_unlock(&j->lock); + + return bch2_journal_flush_seq(j, seq); +} + +/* allocate journal on a device: */ + static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, bool new_fs, struct closure *cl) { @@ -1745,1161 +847,6 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) return __bch2_set_nr_journal_buckets(ca, nr, true, NULL); } -/* Journalling */ - -/** - * journal_reclaim_fast - do the fast part of journal reclaim - * - * Called from IO submission context, does not block. Cleans up after btree - * write completions by advancing the journal pin and each cache's last_idx, - * kicking off discards and background reclaim as necessary. 
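
The journal_reclaim_fast() helper described in the comment above (also moving to journal_reclaim.c) is the cheap half of reclaim: it only pops entries off the front of the pin FIFO once their reference counts have dropped to zero, which is what advances journal_last_seq(). A standalone model of that step, with a plain array standing in for the lockless fifo and atomics:

    /* Sketch: advance the journal pin FIFO past fully-unpinned entries. */
    #include <stdbool.h>
    #include <stdio.h>

    #define PIN_FIFO_SIZE 8     /* power of two, like the real pin fifo */

    struct pin_fifo {
            unsigned front, back;               /* absolute sequence numbers */
            int      count[PIN_FIFO_SIZE];      /* refs per journal entry */
    };

    static bool reclaim_fast(struct pin_fifo *f)
    {
            bool popped = false;

            while (f->front < f->back &&
                   !f->count[f->front & (PIN_FIFO_SIZE - 1)]) {
                    f->front++;                 /* journal_last_seq() advances */
                    popped = true;
            }

            return popped;                      /* caller wakes journal waiters */
    }

    int main(void)
    {
            struct pin_fifo f = { .front = 10, .back = 14 };

            f.count[10 & 7] = 0;        /* seq 10: all btree nodes written */
            f.count[11 & 7] = 0;
            f.count[12 & 7] = 2;        /* seq 12: still pinned */
            f.count[13 & 7] = 1;

            reclaim_fast(&f);
            printf("last_seq is now %u\n", f.front);    /* 12 */
            return 0;
    }
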
- */ -static void journal_reclaim_fast(struct journal *j) -{ - struct journal_entry_pin_list temp; - bool popped = false; - - lockdep_assert_held(&j->lock); - - /* - * Unpin journal entries whose reference counts reached zero, meaning - * all btree nodes got written out - */ - while (!atomic_read(&fifo_peek_front(&j->pin).count)) { - BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); - BUG_ON(!fifo_pop(&j->pin, temp)); - popped = true; - } - - if (popped) - journal_wake(j); -} - -/* - * Journal entry pinning - machinery for holding a reference on a given journal - * entry, marking it as dirty: - */ - -static inline void __journal_pin_add(struct journal *j, - struct journal_entry_pin_list *pin_list, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - BUG_ON(journal_pin_active(pin)); - BUG_ON(!atomic_read(&pin_list->count)); - - atomic_inc(&pin_list->count); - pin->pin_list = pin_list; - pin->flush = flush_fn; - - if (flush_fn) - list_add(&pin->list, &pin_list->list); - else - INIT_LIST_HEAD(&pin->list); - - /* - * If the journal is currently full, we might want to call flush_fn - * immediately: - */ - journal_wake(j); -} - -static void journal_pin_add_entry(struct journal *j, - struct journal_entry_pin_list *pin_list, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - spin_lock(&j->lock); - __journal_pin_add(j, pin_list, pin, flush_fn); - spin_unlock(&j->lock); -} - -void bch2_journal_pin_add(struct journal *j, - struct journal_res *res, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - struct journal_entry_pin_list *pin_list = res->ref - ? journal_seq_pin(j, res->seq) - : j->replay_pin_list; - - spin_lock(&j->lock); - __journal_pin_add(j, pin_list, pin, flush_fn); - spin_unlock(&j->lock); -} - -static inline void __journal_pin_drop(struct journal *j, - struct journal_entry_pin *pin) -{ - struct journal_entry_pin_list *pin_list = pin->pin_list; - - if (!journal_pin_active(pin)) - return; - - pin->pin_list = NULL; - list_del_init(&pin->list); - - /* - * Unpinning a journal entry make make journal_next_bucket() succeed, if - * writing a new last_seq will now make another bucket available: - */ - if (atomic_dec_and_test(&pin_list->count) && - pin_list == &fifo_peek_front(&j->pin)) - journal_reclaim_fast(j); -} - -void bch2_journal_pin_drop(struct journal *j, - struct journal_entry_pin *pin) -{ - spin_lock(&j->lock); - __journal_pin_drop(j, pin); - spin_unlock(&j->lock); -} - -void bch2_journal_pin_add_if_older(struct journal *j, - struct journal_entry_pin *src_pin, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - spin_lock(&j->lock); - - if (journal_pin_active(src_pin) && - (!journal_pin_active(pin) || - journal_pin_seq(j, src_pin->pin_list) < - journal_pin_seq(j, pin->pin_list))) { - __journal_pin_drop(j, pin); - __journal_pin_add(j, src_pin->pin_list, pin, flush_fn); - } - - spin_unlock(&j->lock); -} - -static struct journal_entry_pin * -__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) -{ - struct journal_entry_pin_list *pin_list; - struct journal_entry_pin *ret; - u64 iter; - - /* no need to iterate over empty fifo entries: */ - journal_reclaim_fast(j); - - fifo_for_each_entry_ptr(pin_list, &j->pin, iter) { - if (iter > seq_to_flush) - break; - - ret = list_first_entry_or_null(&pin_list->list, - struct journal_entry_pin, list); - if (ret) { - /* must be list_del_init(), see bch2_journal_pin_drop() */ - list_move(&ret->list, &pin_list->flushed); - *seq = iter; - return ret; - } - } - - 
return NULL; -} - -static struct journal_entry_pin * -journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) -{ - struct journal_entry_pin *ret; - - spin_lock(&j->lock); - ret = __journal_get_next_pin(j, seq_to_flush, seq); - spin_unlock(&j->lock); - - return ret; -} - -static int journal_flush_done(struct journal *j, u64 seq_to_flush, - struct journal_entry_pin **pin, - u64 *pin_seq) -{ - int ret; - - *pin = NULL; - - ret = bch2_journal_error(j); - if (ret) - return ret; - - spin_lock(&j->lock); - /* - * If journal replay hasn't completed, the unreplayed journal entries - * hold refs on their corresponding sequence numbers - */ - ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL || - !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || - journal_last_seq(j) > seq_to_flush || - (fifo_used(&j->pin) == 1 && - atomic_read(&fifo_peek_front(&j->pin).count) == 1); - spin_unlock(&j->lock); - - return ret; -} - -int bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_entry_pin *pin; - u64 pin_seq; - bool flush; - - if (!test_bit(JOURNAL_STARTED, &j->flags)) - return 0; -again: - wait_event(j->wait, journal_flush_done(j, seq_to_flush, &pin, &pin_seq)); - if (pin) { - /* flushing a journal pin might cause a new one to be added: */ - pin->flush(j, pin, pin_seq); - goto again; - } - - spin_lock(&j->lock); - flush = journal_last_seq(j) != j->last_seq_ondisk || - (seq_to_flush == U64_MAX && c->btree_roots_dirty); - spin_unlock(&j->lock); - - return flush ? bch2_journal_meta(j) : 0; -} - -int bch2_journal_flush_all_pins(struct journal *j) -{ - return bch2_journal_flush_pins(j, U64_MAX); -} - -static bool should_discard_bucket(struct journal *j, struct journal_device *ja) -{ - bool ret; - - spin_lock(&j->lock); - ret = ja->nr && - (ja->last_idx != ja->cur_idx && - ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk); - spin_unlock(&j->lock); - - return ret; -} - -/** - * journal_reclaim_work - free up journal buckets - * - * Background journal reclaim writes out btree nodes. It should be run - * early enough so that we never completely run out of journal buckets. - * - * High watermarks for triggering background reclaim: - * - FIFO has fewer than 512 entries left - * - fewer than 25% journal buckets free - * - * Background reclaim runs until low watermarks are reached: - * - FIFO has more than 1024 entries left - * - more than 50% journal buckets free - * - * As long as a reclaim can complete in the time it takes to fill up - * 512 journal entries or 25% of all journal buckets, then - * journal_next_bucket() should not stall. 
- */ -static void journal_reclaim_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(to_delayed_work(work), - struct bch_fs, journal.reclaim_work); - struct journal *j = &c->journal; - struct bch_dev *ca; - struct journal_entry_pin *pin; - u64 seq, seq_to_flush = 0; - unsigned iter, bucket_to_flush; - unsigned long next_flush; - bool reclaim_lock_held = false, need_flush; - - /* - * Advance last_idx to point to the oldest journal entry containing - * btree node updates that have not yet been written out - */ - for_each_rw_member(ca, c, iter) { - struct journal_device *ja = &ca->journal; - - if (!ja->nr) - continue; - - while (should_discard_bucket(j, ja)) { - if (!reclaim_lock_held) { - /* - * ugh: - * might be called from __journal_res_get() - * under wait_event() - have to go back to - * TASK_RUNNING before doing something that - * would block, but only if we're doing work: - */ - __set_current_state(TASK_RUNNING); - - mutex_lock(&j->reclaim_lock); - reclaim_lock_held = true; - /* recheck under reclaim_lock: */ - continue; - } - - if (ca->mi.discard && - blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, - ja->buckets[ja->last_idx]), - ca->mi.bucket_size, GFP_NOIO, 0); - - spin_lock(&j->lock); - ja->last_idx = (ja->last_idx + 1) % ja->nr; - spin_unlock(&j->lock); - - journal_wake(j); - } - - /* - * Write out enough btree nodes to free up 50% journal - * buckets - */ - spin_lock(&j->lock); - bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr; - seq_to_flush = max_t(u64, seq_to_flush, - ja->bucket_seq[bucket_to_flush]); - spin_unlock(&j->lock); - } - - if (reclaim_lock_held) - mutex_unlock(&j->reclaim_lock); - - /* Also flush if the pin fifo is more than half full */ - spin_lock(&j->lock); - seq_to_flush = max_t(s64, seq_to_flush, - (s64) journal_cur_seq(j) - - (j->pin.size >> 1)); - spin_unlock(&j->lock); - - /* - * If it's been longer than j->reclaim_delay_ms since we last flushed, - * make sure to flush at least one journal pin: - */ - next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms); - need_flush = time_after(jiffies, next_flush); - - while ((pin = journal_get_next_pin(j, need_flush - ? U64_MAX - : seq_to_flush, &seq))) { - __set_current_state(TASK_RUNNING); - pin->flush(j, pin, seq); - need_flush = false; - - j->last_flushed = jiffies; - } - - if (!test_bit(BCH_FS_RO, &c->flags)) - queue_delayed_work(system_freezable_wq, &j->reclaim_work, - msecs_to_jiffies(j->reclaim_delay_ms)); -} - -/** - * journal_next_bucket - move on to the next journal bucket if possible - */ -static int journal_write_alloc(struct journal *j, struct journal_buf *w, - unsigned sectors) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bkey_s_extent e; - struct bch_extent_ptr *ptr; - struct journal_device *ja; - struct bch_dev *ca; - struct dev_alloc_list devs_sorted; - unsigned i, replicas, replicas_want = - READ_ONCE(c->opts.metadata_replicas); - - spin_lock(&j->lock); - e = bkey_i_to_s_extent(&j->key); - - /* - * Drop any pointers to devices that have been removed, are no longer - * empty, or filled up their current journal bucket: - * - * Note that a device may have had a small amount of free space (perhaps - * one sector) that wasn't enough for the smallest possible journal - * entry - that's why we drop pointers to devices <= current free space, - * i.e. whichever device was limiting the current journal entry size. 
- */ - extent_for_each_ptr_backwards(e, ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); - - if (ca->mi.state != BCH_MEMBER_STATE_RW || - ca->journal.sectors_free <= sectors) - __bch2_extent_drop_ptr(e, ptr); - else - ca->journal.sectors_free -= sectors; - } - - replicas = bch2_extent_nr_ptrs(e.c); - - rcu_read_lock(); - devs_sorted = bch2_wp_alloc_list(c, &j->wp, - &c->rw_devs[BCH_DATA_JOURNAL]); - - for (i = 0; i < devs_sorted.nr; i++) { - ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); - if (!ca) - continue; - - if (!ca->mi.durability) - continue; - - ja = &ca->journal; - if (!ja->nr) - continue; - - if (replicas >= replicas_want) - break; - - /* - * Check that we can use this device, and aren't already using - * it: - */ - if (bch2_extent_has_device(e.c, ca->dev_idx) || - !journal_dev_buckets_available(j, ca) || - sectors > ca->mi.bucket_size) - continue; - - j->wp.next_alloc[ca->dev_idx] += U32_MAX; - bch2_wp_rescale(c, ca, &j->wp); - - ja->sectors_free = ca->mi.bucket_size - sectors; - ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); - - extent_ptr_append(bkey_i_to_extent(&j->key), - (struct bch_extent_ptr) { - .offset = bucket_to_sector(ca, - ja->buckets[ja->cur_idx]), - .dev = ca->dev_idx, - }); - - replicas += ca->mi.durability; - } - rcu_read_unlock(); - - j->prev_buf_sectors = 0; - - bkey_copy(&w->key, &j->key); - spin_unlock(&j->lock); - - if (replicas < c->opts.metadata_replicas_required) - return -EROFS; - - BUG_ON(!replicas); - - return 0; -} - -static void journal_write_compact(struct jset *jset) -{ - struct jset_entry *i, *next, *prev = NULL; - - /* - * Simple compaction, dropping empty jset_entries (from journal - * reservations that weren't fully used) and merging jset_entries that - * can be. - * - * If we wanted to be really fancy here, we could sort all the keys in - * the jset and drop keys that were overwritten - probably not worth it: - */ - vstruct_for_each_safe(jset, i, next) { - unsigned u64s = le16_to_cpu(i->u64s); - - /* Empty entry: */ - if (!u64s) - continue; - - /* Can we merge with previous entry? */ - if (prev && - i->btree_id == prev->btree_id && - i->level == prev->level && - i->type == prev->type && - i->type == JOURNAL_ENTRY_BTREE_KEYS && - le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { - memmove_u64s_down(vstruct_next(prev), - i->_data, - u64s); - le16_add_cpu(&prev->u64s, u64s); - continue; - } - - /* Couldn't merge, move i into new position (after prev): */ - prev = prev ? vstruct_next(prev) : jset->start; - if (i != prev) - memmove_u64s_down(prev, i, jset_u64s(u64s)); - } - - prev = prev ? 
vstruct_next(prev) : jset->start; - jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); -} - -static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) -{ - /* we aren't holding j->lock: */ - unsigned new_size = READ_ONCE(j->buf_size_want); - void *new_buf; - - if (buf->size >= new_size) - return; - - new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN); - if (!new_buf) - return; - - memcpy(new_buf, buf->data, buf->size); - kvpfree(buf->data, buf->size); - buf->data = new_buf; - buf->size = new_size; -} - -static void journal_write_done(struct closure *cl) -{ - struct journal *j = container_of(cl, struct journal, io); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *w = journal_prev_buf(j); - struct bch_devs_list devs = - bch2_extent_devs(bkey_i_to_s_c_extent(&w->key)); - - if (!devs.nr) { - bch_err(c, "unable to write journal to sufficient devices"); - goto err; - } - - if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs)) - goto err; -out: - __bch2_time_stats_update(j->write_time, j->write_start_time); - - spin_lock(&j->lock); - j->last_seq_ondisk = le64_to_cpu(w->data->last_seq); - - journal_seq_pin(j, le64_to_cpu(w->data->seq))->devs = devs; - - /* - * Updating last_seq_ondisk may let journal_reclaim_work() discard more - * buckets: - * - * Must come before signaling write completion, for - * bch2_fs_journal_stop(): - */ - mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0); - - /* also must come before signalling write completion: */ - closure_debug_destroy(cl); - - BUG_ON(!j->reservations.prev_buf_unwritten); - atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v, - &j->reservations.counter); - - closure_wake_up(&w->wait); - journal_wake(j); - - if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) - mod_delayed_work(system_freezable_wq, &j->write_work, 0); - spin_unlock(&j->lock); - return; -err: - bch2_fatal_error(c); - bch2_journal_halt(j); - goto out; -} - -static void journal_write_endio(struct bio *bio) -{ - struct bch_dev *ca = bio->bi_private; - struct journal *j = &ca->fs->journal; - - if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") || - bch2_meta_write_fault("journal")) { - struct journal_buf *w = journal_prev_buf(j); - unsigned long flags; - - spin_lock_irqsave(&j->err_lock, flags); - bch2_extent_drop_device(bkey_i_to_s_extent(&w->key), ca->dev_idx); - spin_unlock_irqrestore(&j->err_lock, flags); - } - - closure_put(&j->io); - percpu_ref_put(&ca->io_ref); -} - -static void journal_write(struct closure *cl) -{ - struct journal *j = container_of(cl, struct journal, io); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; - struct journal_buf *w = journal_prev_buf(j); - struct jset *jset; - struct bio *bio; - struct bch_extent_ptr *ptr; - unsigned i, sectors, bytes; - - journal_buf_realloc(j, w); - jset = w->data; - - j->write_start_time = local_clock(); - mutex_lock(&c->btree_root_lock); - for (i = 0; i < BTREE_ID_NR; i++) { - struct btree_root *r = &c->btree_roots[i]; - - if (r->alive) - bch2_journal_add_btree_root(w, i, &r->key, r->level); - } - c->btree_roots_dirty = false; - mutex_unlock(&c->btree_root_lock); - - journal_write_compact(jset); - - jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); - jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); - jset->magic = cpu_to_le64(jset_magic(c)); - jset->version = cpu_to_le32(BCACHE_JSET_VERSION); - - SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); - SET_JSET_CSUM_TYPE(jset, 
bch2_meta_checksum_type(c)); - - if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) && - journal_entry_validate_entries(c, jset, WRITE)) - goto err; - - bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), - jset->encrypted_start, - vstruct_end(jset) - (void *) jset->encrypted_start); - - jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), - journal_nonce(jset), jset); - - if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) && - journal_entry_validate_entries(c, jset, WRITE)) - goto err; - - sectors = vstruct_sectors(jset, c->block_bits); - BUG_ON(sectors > j->prev_buf_sectors); - - bytes = vstruct_bytes(w->data); - memset((void *) w->data + bytes, 0, (sectors << 9) - bytes); - - if (journal_write_alloc(j, w, sectors)) { - bch2_journal_halt(j); - bch_err(c, "Unable to allocate journal write"); - bch2_fatal_error(c); - continue_at(cl, journal_write_done, system_highpri_wq); - } - - /* - * XXX: we really should just disable the entire journal in nochanges - * mode - */ - if (c->opts.nochanges) - goto no_io; - - extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); - if (!percpu_ref_tryget(&ca->io_ref)) { - /* XXX: fix this */ - bch_err(c, "missing device for journal write\n"); - continue; - } - - this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_JOURNAL], - sectors); - - bio = ca->journal.bio; - bio_reset(bio); - bio_set_dev(bio, ca->disk_sb.bdev); - bio->bi_iter.bi_sector = ptr->offset; - bio->bi_iter.bi_size = sectors << 9; - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - bio_set_op_attrs(bio, REQ_OP_WRITE, - REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); - bch2_bio_map(bio, jset); - - trace_journal_write(bio); - closure_bio_submit(bio, cl); - - ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq); - } - - for_each_rw_member(ca, c, i) - if (journal_flushes_device(ca) && - !bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), i)) { - percpu_ref_get(&ca->io_ref); - - bio = ca->journal.bio; - bio_reset(bio); - bio_set_dev(bio, ca->disk_sb.bdev); - bio->bi_opf = REQ_OP_FLUSH; - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - closure_bio_submit(bio, cl); - } - -no_io: - extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) - ptr->offset += sectors; - - continue_at(cl, journal_write_done, system_highpri_wq); -err: - bch2_inconsistent_error(c); - continue_at(cl, journal_write_done, system_highpri_wq); -} - -/* - * returns true if there's nothing to flush and no journal write still in flight - */ -static bool journal_flush_write(struct journal *j) -{ - bool ret; - - spin_lock(&j->lock); - ret = !j->reservations.prev_buf_unwritten; - - if (!journal_entry_is_open(j)) { - spin_unlock(&j->lock); - return ret; - } - - set_bit(JOURNAL_NEED_WRITE, &j->flags); - if (journal_buf_switch(j, false) == JOURNAL_UNLOCKED) - ret = false; - else - spin_unlock(&j->lock); - return ret; -} - -static void journal_write_work(struct work_struct *work) -{ - struct journal *j = container_of(work, struct journal, write_work.work); - - journal_flush_write(j); -} - -/* - * Given an inode number, if that inode number has data in the journal that - * hasn't yet been flushed, return the journal sequence number that needs to be - * flushed: - */ -u64 bch2_inode_journal_seq(struct journal *j, u64 inode) -{ - size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); - u64 seq = 0; - - if (!test_bit(h, j->buf[0].has_inode) && - !test_bit(h, j->buf[1].has_inode)) - return 0; - - spin_lock(&j->lock); - if (test_bit(h, 
journal_cur_buf(j)->has_inode)) - seq = journal_cur_seq(j); - else if (test_bit(h, journal_prev_buf(j)->has_inode)) - seq = journal_cur_seq(j) - 1; - spin_unlock(&j->lock); - - return seq; -} - -static int __journal_res_get(struct journal *j, struct journal_res *res, - unsigned u64s_min, unsigned u64s_max) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *buf; - int ret; -retry: - ret = journal_res_get_fast(j, res, u64s_min, u64s_max); - if (ret) - return ret; - - spin_lock(&j->lock); - /* - * Recheck after taking the lock, so we don't race with another thread - * that just did journal_entry_open() and call journal_entry_close() - * unnecessarily - */ - ret = journal_res_get_fast(j, res, u64s_min, u64s_max); - if (ret) { - spin_unlock(&j->lock); - return 1; - } - - /* - * If we couldn't get a reservation because the current buf filled up, - * and we had room for a bigger entry on disk, signal that we want to - * realloc the journal bufs: - */ - buf = journal_cur_buf(j); - if (journal_entry_is_open(j) && - buf->size >> 9 < buf->disk_sectors && - buf->size < JOURNAL_ENTRY_SIZE_MAX) - j->buf_size_want = max(j->buf_size_want, buf->size << 1); - - /* - * Close the current journal entry if necessary, then try to start a new - * one: - */ - switch (journal_buf_switch(j, false)) { - case JOURNAL_ENTRY_ERROR: - spin_unlock(&j->lock); - return -EROFS; - case JOURNAL_ENTRY_INUSE: - /* haven't finished writing out the previous one: */ - spin_unlock(&j->lock); - trace_journal_entry_full(c); - goto blocked; - case JOURNAL_ENTRY_CLOSED: - break; - case JOURNAL_UNLOCKED: - goto retry; - } - - /* We now have a new, closed journal buf - see if we can open it: */ - ret = journal_entry_open(j); - spin_unlock(&j->lock); - - if (ret < 0) - return ret; - if (ret) - goto retry; - - /* Journal's full, we have to wait */ - - /* - * Direct reclaim - can't rely on reclaim from work item - * due to freezing.. - */ - journal_reclaim_work(&j->reclaim_work.work); - - trace_journal_full(c); -blocked: - if (!j->res_get_blocked_start) - j->res_get_blocked_start = local_clock() ?: 1; - return 0; -} - -/* - * Essentially the entry function to the journaling code. When bcachefs is doing - * a btree insert, it calls this function to get the current journal write. - * Journal write is the structure used set up journal writes. The calling - * function will then add its keys to the structure, queuing them for the next - * write. - * - * To ensure forward progress, the current task must not be holding any - * btree node write locks. - */ -int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, - unsigned u64s_min, unsigned u64s_max) -{ - int ret; - - wait_event(j->wait, - (ret = __journal_res_get(j, res, u64s_min, - u64s_max))); - return ret < 0 ? 
ret : 0; -} - -u64 bch2_journal_last_unwritten_seq(struct journal *j) -{ - u64 seq; - - spin_lock(&j->lock); - seq = journal_cur_seq(j); - if (j->reservations.prev_buf_unwritten) - seq--; - spin_unlock(&j->lock); - - return seq; -} - -int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *parent) -{ - int ret; - - spin_lock(&j->lock); - BUG_ON(seq > journal_cur_seq(j)); - - if (seq < journal_cur_seq(j) || - journal_entry_is_open(j)) { - spin_unlock(&j->lock); - return 1; - } - - ret = journal_entry_open(j); - if (!ret) - closure_wait(&j->async_wait, parent); - spin_unlock(&j->lock); - - if (!ret) - journal_reclaim_work(&j->reclaim_work.work); - - return ret; -} - -void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent) -{ - spin_lock(&j->lock); - - BUG_ON(seq > journal_cur_seq(j)); - - if (bch2_journal_error(j)) { - spin_unlock(&j->lock); - return; - } - - if (seq == journal_cur_seq(j)) { - if (!closure_wait(&journal_cur_buf(j)->wait, parent)) - BUG(); - } else if (seq + 1 == journal_cur_seq(j) && - j->reservations.prev_buf_unwritten) { - if (!closure_wait(&journal_prev_buf(j)->wait, parent)) - BUG(); - - smp_mb(); - - /* check if raced with write completion (or failure) */ - if (!j->reservations.prev_buf_unwritten || - bch2_journal_error(j)) - closure_wake_up(&journal_prev_buf(j)->wait); - } - - spin_unlock(&j->lock); -} - -void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent) -{ - struct journal_buf *buf; - - spin_lock(&j->lock); - - BUG_ON(seq > journal_cur_seq(j)); - - if (bch2_journal_error(j)) { - spin_unlock(&j->lock); - return; - } - - if (seq == journal_cur_seq(j)) { - bool set_need_write = false; - - buf = journal_cur_buf(j); - - if (parent && !closure_wait(&buf->wait, parent)) - BUG(); - - if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) { - j->need_write_time = local_clock(); - set_need_write = true; - } - - switch (journal_buf_switch(j, set_need_write)) { - case JOURNAL_ENTRY_ERROR: - if (parent) - closure_wake_up(&buf->wait); - break; - case JOURNAL_ENTRY_CLOSED: - /* - * Journal entry hasn't been opened yet, but caller - * claims it has something - */ - BUG(); - case JOURNAL_ENTRY_INUSE: - break; - case JOURNAL_UNLOCKED: - return; - } - } else if (parent && - seq + 1 == journal_cur_seq(j) && - j->reservations.prev_buf_unwritten) { - buf = journal_prev_buf(j); - - if (!closure_wait(&buf->wait, parent)) - BUG(); - - smp_mb(); - - /* check if raced with write completion (or failure) */ - if (!j->reservations.prev_buf_unwritten || - bch2_journal_error(j)) - closure_wake_up(&buf->wait); - } - - spin_unlock(&j->lock); -} - -static int journal_seq_flushed(struct journal *j, u64 seq) -{ - struct journal_buf *buf; - int ret = 1; - - spin_lock(&j->lock); - BUG_ON(seq > journal_cur_seq(j)); - - if (seq == journal_cur_seq(j)) { - bool set_need_write = false; - - ret = 0; - - buf = journal_cur_buf(j); - - if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) { - j->need_write_time = local_clock(); - set_need_write = true; - } - - switch (journal_buf_switch(j, set_need_write)) { - case JOURNAL_ENTRY_ERROR: - ret = -EIO; - break; - case JOURNAL_ENTRY_CLOSED: - /* - * Journal entry hasn't been opened yet, but caller - * claims it has something - */ - BUG(); - case JOURNAL_ENTRY_INUSE: - break; - case JOURNAL_UNLOCKED: - return 0; - } - } else if (seq + 1 == journal_cur_seq(j) && - j->reservations.prev_buf_unwritten) { - ret = bch2_journal_error(j); - } - - spin_unlock(&j->lock); - - return ret; -} - -int 
bch2_journal_flush_seq(struct journal *j, u64 seq) -{ - u64 start_time = local_clock(); - int ret, ret2; - - ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq))); - - bch2_time_stats_update(j->flush_seq_time, start_time); - - return ret ?: ret2 < 0 ? ret2 : 0; -} - -void bch2_journal_meta_async(struct journal *j, struct closure *parent) -{ - struct journal_res res; - unsigned u64s = jset_u64s(0); - - memset(&res, 0, sizeof(res)); - - bch2_journal_res_get(j, &res, u64s, u64s); - bch2_journal_res_put(j, &res); - - bch2_journal_flush_seq_async(j, res.seq, parent); -} - -int bch2_journal_meta(struct journal *j) -{ - struct journal_res res; - unsigned u64s = jset_u64s(0); - int ret; - - memset(&res, 0, sizeof(res)); - - ret = bch2_journal_res_get(j, &res, u64s, u64s); - if (ret) - return ret; - - bch2_journal_res_put(j, &res); - - return bch2_journal_flush_seq(j, res.seq); -} - -void bch2_journal_flush_async(struct journal *j, struct closure *parent) -{ - u64 seq, journal_seq; - - spin_lock(&j->lock); - journal_seq = journal_cur_seq(j); - - if (journal_entry_is_open(j)) { - seq = journal_seq; - } else if (journal_seq) { - seq = journal_seq - 1; - } else { - spin_unlock(&j->lock); - return; - } - spin_unlock(&j->lock); - - bch2_journal_flush_seq_async(j, seq, parent); -} - -int bch2_journal_flush(struct journal *j) -{ - u64 seq, journal_seq; - - spin_lock(&j->lock); - journal_seq = journal_cur_seq(j); - - if (journal_entry_is_open(j)) { - seq = journal_seq; - } else if (journal_seq) { - seq = journal_seq - 1; - } else { - spin_unlock(&j->lock); - return 0; - } - spin_unlock(&j->lock); - - return bch2_journal_flush_seq(j, seq); -} - -int bch2_journal_flush_device(struct journal *j, int dev_idx) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_entry_pin_list *p; - struct bch_devs_list devs; - u64 iter, seq = 0; - int ret = 0; - - spin_lock(&j->lock); - fifo_for_each_entry_ptr(p, &j->pin, iter) - if (dev_idx >= 0 - ? 
bch2_dev_list_has_dev(p->devs, dev_idx) - : p->devs.nr < c->opts.metadata_replicas) - seq = iter; - spin_unlock(&j->lock); - - ret = bch2_journal_flush_pins(j, seq); - if (ret) - return ret; - - mutex_lock(&c->replicas_gc_lock); - bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL); - - seq = 0; - - spin_lock(&j->lock); - while (!ret && seq < j->pin.back) { - seq = max(seq, journal_last_seq(j)); - devs = journal_seq_pin(j, seq)->devs; - seq++; - - spin_unlock(&j->lock); - ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs); - spin_lock(&j->lock); - } - spin_unlock(&j->lock); - - bch2_replicas_gc_end(c, ret); - mutex_unlock(&c->replicas_gc_lock); - - return ret; -} - /* startup/shutdown: */ static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) @@ -2936,6 +883,43 @@ void bch2_fs_journal_stop(struct journal *j) cancel_delayed_work_sync(&j->reclaim_work); } +void bch2_fs_journal_start(struct journal *j) +{ + struct journal_seq_blacklist *bl; + u64 blacklist = 0; + + list_for_each_entry(bl, &j->seq_blacklist, list) + blacklist = max(blacklist, bl->end); + + spin_lock(&j->lock); + + set_bit(JOURNAL_STARTED, &j->flags); + + while (journal_cur_seq(j) < blacklist) + journal_pin_new_entry(j, 0); + + /* + * journal_buf_switch() only inits the next journal entry when it + * closes an open journal entry - the very first journal entry gets + * initialized here: + */ + journal_pin_new_entry(j, 1); + bch2_journal_buf_init(j); + + spin_unlock(&j->lock); + + /* + * Adding entries to the next journal entry before allocating space on + * disk for the next journal entry - this is ok, because these entries + * only have to go down with the next journal entry we write: + */ + bch2_journal_seq_blacklist_write(j); + + queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0); +} + +/* init/exit: */ + void bch2_dev_journal_exit(struct bch_dev *ca) { kfree(ca->journal.bio); @@ -2994,7 +978,7 @@ int bch2_fs_journal_init(struct journal *j) spin_lock_init(&j->err_lock); init_waitqueue_head(&j->wait); INIT_DELAYED_WORK(&j->write_work, journal_write_work); - INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work); + INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work); mutex_init(&j->blacklist_lock); INIT_LIST_HEAD(&j->seq_blacklist); mutex_init(&j->reclaim_lock); diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index cf5cc9ba..4cec7bb5 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -112,72 +112,37 @@ #include "journal_types.h" -/* - * Only used for holding the journal entries we read in btree_journal_read() - * during cache_registration - */ -struct journal_replay { - struct list_head list; - struct bch_devs_list devs; - /* must be last: */ - struct jset j; -}; - -static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, - struct jset_entry *entry, unsigned type) -{ - while (entry < vstruct_last(jset)) { - if (entry->type == type) - return entry; - - entry = vstruct_next(entry); - } - - return NULL; -} - -#define for_each_jset_entry_type(entry, jset, type) \ - for (entry = (jset)->start; \ - (entry = __jset_entry_type_next(jset, entry, type)); \ - entry = vstruct_next(entry)) - -#define for_each_jset_key(k, _n, entry, jset) \ - for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \ - vstruct_for_each_safe(entry, k, _n) - -#define JOURNAL_PIN (32 * 1024) - -static inline bool journal_pin_active(struct journal_entry_pin *pin) -{ - return pin->pin_list != NULL; -} - -static inline struct journal_entry_pin_list * 
-journal_seq_pin(struct journal *j, u64 seq) -{ - return &j->pin.data[seq & j->pin.mask]; -} - -u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *); - -void bch2_journal_pin_add(struct journal *, struct journal_res *, - struct journal_entry_pin *, journal_pin_flush_fn); -void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); -void bch2_journal_pin_add_if_older(struct journal *, - struct journal_entry_pin *, - struct journal_entry_pin *, - journal_pin_flush_fn); -int bch2_journal_flush_pins(struct journal *, u64); -int bch2_journal_flush_all_pins(struct journal *); - -struct closure; struct bch_fs; -struct keylist; -struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *, struct jset *, - enum btree_id, unsigned *); +static inline void journal_wake(struct journal *j) +{ + wake_up(&j->wait); + closure_wake_up(&j->async_wait); +} -int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *); +static inline struct journal_buf *journal_cur_buf(struct journal *j) +{ + return j->buf + j->reservations.idx; +} + +static inline struct journal_buf *journal_prev_buf(struct journal *j) +{ + return j->buf + !j->reservations.idx; +} + +/* Sequence number of oldest dirty journal entry */ + +static inline u64 journal_last_seq(struct journal *j) +{ + return j->pin.front; +} + +static inline u64 journal_cur_seq(struct journal *j) +{ + BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); + + return j->pin.back - 1; +} u64 bch2_inode_journal_seq(struct journal *, u64); @@ -213,21 +178,18 @@ static inline unsigned jset_u64s(unsigned u64s) return u64s + sizeof(struct jset_entry) / sizeof(u64); } -static inline void bch2_journal_add_entry_at(struct journal_buf *buf, - unsigned offset, - unsigned type, enum btree_id id, - unsigned level, - const void *data, size_t u64s) +static inline struct jset_entry * +bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) { - struct jset_entry *entry = vstruct_idx(buf->data, offset); + struct jset *jset = buf->data; + struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s)); memset(entry, 0, sizeof(*entry)); - entry->u64s = cpu_to_le16(u64s); - entry->btree_id = id; - entry->level = level; - entry->type = type; + entry->u64s = cpu_to_le16(u64s); - memcpy_u64s(entry->_data, data, u64s); + le32_add_cpu(&jset->u64s, jset_u64s(u64s)); + + return entry; } static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res, @@ -236,21 +198,27 @@ static inline void bch2_journal_add_entry(struct journal *j, struct journal_res const void *data, unsigned u64s) { struct journal_buf *buf = &j->buf[res->idx]; + struct jset_entry *entry = vstruct_idx(buf->data, res->offset); unsigned actual = jset_u64s(u64s); EBUG_ON(!res->ref); EBUG_ON(actual > res->u64s); - bch2_journal_add_entry_at(buf, res->offset, type, - id, level, data, u64s); res->offset += actual; res->u64s -= actual; + + memset(entry, 0, sizeof(*entry)); + entry->u64s = cpu_to_le16(u64s); + entry->type = type; + entry->btree_id = id; + entry->level = level; + memcpy_u64s(entry->_data, data, u64s); } static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res, enum btree_id id, const struct bkey_i *k) { - bch2_journal_add_entry(j, res, JOURNAL_ENTRY_BTREE_KEYS, + bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys, id, 0, k, k->k.u64s); } @@ -292,7 +260,7 @@ static inline void bch2_journal_res_put(struct journal *j, while (res->u64s) bch2_journal_add_entry(j, res, - JOURNAL_ENTRY_BTREE_KEYS, + 
BCH_JSET_ENTRY_btree_keys, 0, 0, NULL, 0); bch2_journal_buf_put(j, res->idx, false); @@ -368,7 +336,6 @@ void bch2_journal_meta_async(struct journal *, struct closure *); int bch2_journal_flush_seq(struct journal *, u64); int bch2_journal_flush(struct journal *); int bch2_journal_meta(struct journal *); -int bch2_journal_flush_device(struct journal *, int); void bch2_journal_halt(struct journal *); @@ -385,10 +352,8 @@ static inline bool journal_flushes_device(struct bch_dev *ca) return true; } -void bch2_journal_start(struct bch_fs *); int bch2_journal_mark(struct bch_fs *, struct list_head *); void bch2_journal_entries_free(struct list_head *); -int bch2_journal_read(struct bch_fs *, struct list_head *); int bch2_journal_replay(struct bch_fs *, struct list_head *); static inline void bch2_journal_set_replay_done(struct journal *j) @@ -404,6 +369,7 @@ int bch2_dev_journal_alloc(struct bch_dev *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); void bch2_fs_journal_stop(struct journal *); +void bch2_fs_journal_start(struct journal *); void bch2_dev_journal_exit(struct bch_dev *); int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); void bch2_fs_journal_exit(struct journal *); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c new file mode 100644 index 00000000..2fd0d646 --- /dev/null +++ b/libbcachefs/journal_io.c @@ -0,0 +1,1423 @@ +#include "bcachefs.h" +#include "alloc.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "buckets.h" +#include "checksum.h" +#include "error.h" +#include "journal.h" +#include "journal_io.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" +#include "replicas.h" + +#include + +static struct jset_entry *bch2_journal_find_entry(struct jset *j, unsigned type, + enum btree_id id) +{ + struct jset_entry *entry; + + for_each_jset_entry_type(entry, j, type) + if (entry->btree_id == id) + return entry; + + return NULL; +} + +struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *c, struct jset *j, + enum btree_id id, unsigned *level) +{ + struct bkey_i *k; + struct jset_entry *entry = + bch2_journal_find_entry(j, BCH_JSET_ENTRY_btree_root, id); + + if (!entry) + return NULL; + + if (!entry->u64s) + return ERR_PTR(-EINVAL); + + k = entry->start; + *level = entry->level; + *level = entry->level; + return k; +} + +struct journal_list { + struct closure cl; + struct mutex lock; + struct list_head *head; + int ret; +}; + +#define JOURNAL_ENTRY_ADD_OK 0 +#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 + +/* + * Given a journal entry we just read, add it to the list of journal entries to + * be replayed: + */ +static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, + struct journal_list *jlist, struct jset *j) +{ + struct journal_replay *i, *pos; + struct list_head *where; + size_t bytes = vstruct_bytes(j); + __le64 last_seq; + int ret; + + last_seq = !list_empty(jlist->head) + ? list_last_entry(jlist->head, struct journal_replay, + list)->j.last_seq + : 0; + + /* Is this entry older than the range we need? */ + if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) { + ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; + goto out; + } + + /* Drop entries we don't need anymore */ + list_for_each_entry_safe(i, pos, jlist->head, list) { + if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) + break; + list_del(&i->list); + kvpfree(i, offsetof(struct journal_replay, j) + + vstruct_bytes(&i->j)); + } + + list_for_each_entry_reverse(i, jlist->head, list) { + /* Duplicate? 
*/ + if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { + fsck_err_on(bytes != vstruct_bytes(&i->j) || + memcmp(j, &i->j, bytes), c, + "found duplicate but non identical journal entries (seq %llu)", + le64_to_cpu(j->seq)); + goto found; + } + + if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { + where = &i->list; + goto add; + } + } + + where = jlist->head; +add: + i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); + if (!i) { + ret = -ENOMEM; + goto out; + } + + list_add(&i->list, where); + i->devs.nr = 0; + memcpy(&i->j, j, bytes); +found: + if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) + bch2_dev_list_add_dev(&i->devs, ca->dev_idx); + else + fsck_err_on(1, c, "duplicate journal entries on same device"); + ret = JOURNAL_ENTRY_ADD_OK; +out: +fsck_err: + return ret; +} + +static struct nonce journal_nonce(const struct jset *jset) +{ + return (struct nonce) {{ + [0] = 0, + [1] = ((__le32 *) &jset->seq)[0], + [2] = ((__le32 *) &jset->seq)[1], + [3] = BCH_NONCE_JOURNAL, + }}; +} + +/* this fills in a range with empty jset_entries: */ +static void journal_entry_null_range(void *start, void *end) +{ + struct jset_entry *entry; + + for (entry = start; entry != end; entry = vstruct_next(entry)) + memset(entry, 0, sizeof(*entry)); +} + +#define JOURNAL_ENTRY_REREAD 5 +#define JOURNAL_ENTRY_NONE 6 +#define JOURNAL_ENTRY_BAD 7 + +#define journal_entry_err(c, msg, ...) \ +({ \ + switch (write) { \ + case READ: \ + mustfix_fsck_err(c, msg, ##__VA_ARGS__); \ + break; \ + case WRITE: \ + bch_err(c, "corrupt metadata before write:\n" \ + msg, ##__VA_ARGS__); \ + if (bch2_fs_inconsistent(c)) { \ + ret = BCH_FSCK_ERRORS_NOT_FIXED; \ + goto fsck_err; \ + } \ + break; \ + } \ + true; \ +}) + +#define journal_entry_err_on(cond, c, msg, ...) \ + ((cond) ? 
journal_entry_err(c, msg, ##__VA_ARGS__) : false) + +static int journal_validate_key(struct bch_fs *c, struct jset *jset, + struct jset_entry *entry, + struct bkey_i *k, enum bkey_type key_type, + const char *type, int write) +{ + void *next = vstruct_next(entry); + const char *invalid; + char buf[160]; + int ret = 0; + + if (journal_entry_err_on(!k->k.u64s, c, + "invalid %s in journal: k->u64s 0", type)) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); + journal_entry_null_range(vstruct_next(entry), next); + return 0; + } + + if (journal_entry_err_on((void *) bkey_next(k) > + (void *) vstruct_next(entry), c, + "invalid %s in journal: extends past end of journal entry", + type)) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); + journal_entry_null_range(vstruct_next(entry), next); + return 0; + } + + if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, + "invalid %s in journal: bad format %u", + type, k->k.format)) { + le16_add_cpu(&entry->u64s, -k->k.u64s); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); + journal_entry_null_range(vstruct_next(entry), next); + return 0; + } + + if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN) + bch2_bkey_swab(key_type, NULL, bkey_to_packed(k)); + + invalid = bch2_bkey_invalid(c, key_type, bkey_i_to_s_c(k)); + if (invalid) { + bch2_bkey_val_to_text(c, key_type, buf, sizeof(buf), + bkey_i_to_s_c(k)); + mustfix_fsck_err(c, "invalid %s in journal: %s\n%s", + type, invalid, buf); + + le16_add_cpu(&entry->u64s, -k->k.u64s); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); + journal_entry_null_range(vstruct_next(entry), next); + return 0; + } +fsck_err: + return ret; +} + +static int journal_entry_validate_btree_keys(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + struct bkey_i *k; + + vstruct_for_each(entry, k) { + int ret = journal_validate_key(c, jset, entry, k, + bkey_type(entry->level, + entry->btree_id), + "key", write); + if (ret) + return ret; + } + + return 0; +} + +static int journal_entry_validate_btree_root(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + struct bkey_i *k = entry->start; + int ret = 0; + + if (journal_entry_err_on(!entry->u64s || + le16_to_cpu(entry->u64s) != k->k.u64s, c, + "invalid btree root journal entry: wrong number of keys")) { + void *next = vstruct_next(entry); + /* + * we don't want to null out this jset_entry, + * just the contents, so that later we can tell + * we were _supposed_ to have a btree root + */ + entry->u64s = 0; + journal_entry_null_range(vstruct_next(entry), next); + return 0; + } + + return journal_validate_key(c, jset, entry, k, BKEY_TYPE_BTREE, + "btree root", write); +fsck_err: + return ret; +} + +static int journal_entry_validate_prio_ptrs(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + /* obsolete, don't care: */ + return 0; +} + +static int journal_entry_validate_blacklist(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + int ret = 0; + + if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c, + "invalid journal seq blacklist entry: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + } +fsck_err: + return ret; +} + +static int journal_entry_validate_blacklist_v2(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + struct jset_entry_blacklist_v2 *bl_entry; + int ret = 0; + + if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c, + "invalid journal seq blacklist 
entry: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + } + + bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); + + if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > + le64_to_cpu(bl_entry->end), c, + "invalid journal seq blacklist entry: start > end")) { + journal_entry_null_range(entry, vstruct_next(entry)); + } + +fsck_err: + return ret; +} + +struct jset_entry_ops { + int (*validate)(struct bch_fs *, struct jset *, + struct jset_entry *, int); +}; + +const struct jset_entry_ops bch2_jset_entry_ops[] = { +#define x(f, nr) \ + [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ + .validate = journal_entry_validate_##f, \ + }, + BCH_JSET_ENTRY_TYPES() +#undef x +}; + +static int journal_entry_validate(struct bch_fs *c, struct jset *jset, + struct jset_entry *entry, int write) +{ + int ret = 0; + + if (entry->type >= BCH_JSET_ENTRY_NR) { + journal_entry_err(c, "invalid journal entry type %u", + entry->type); + journal_entry_null_range(entry, vstruct_next(entry)); + return 0; + } + + ret = bch2_jset_entry_ops[entry->type].validate(c, jset, entry, write); +fsck_err: + return ret; +} + +static int jset_validate_entries(struct bch_fs *c, struct jset *jset, + int write) +{ + struct jset_entry *entry; + int ret = 0; + + vstruct_for_each(jset, entry) { + if (journal_entry_err_on(vstruct_next(entry) > + vstruct_last(jset), c, + "journal entry extends past end of jset")) { + jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); + break; + } + + ret = journal_entry_validate(c, jset, entry, write); + if (ret) + break; + } +fsck_err: + return ret; +} + +static int jset_validate(struct bch_fs *c, + struct jset *jset, u64 sector, + unsigned bucket_sectors_left, + unsigned sectors_read, + int write) +{ + size_t bytes = vstruct_bytes(jset); + struct bch_csum csum; + int ret = 0; + + if (le64_to_cpu(jset->magic) != jset_magic(c)) + return JOURNAL_ENTRY_NONE; + + if (le32_to_cpu(jset->version) != BCACHE_JSET_VERSION) { + bch_err(c, "unknown journal entry version %u", + le32_to_cpu(jset->version)); + return BCH_FSCK_UNKNOWN_VERSION; + } + + if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, + "journal entry too big (%zu bytes), sector %lluu", + bytes, sector)) { + /* XXX: note we might have missing journal entries */ + return JOURNAL_ENTRY_BAD; + } + + if (bytes > sectors_read << 9) + return JOURNAL_ENTRY_REREAD; + + if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, + "journal entry with unknown csum type %llu sector %lluu", + JSET_CSUM_TYPE(jset), sector)) + return JOURNAL_ENTRY_BAD; + + csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); + if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, + "journal checksum bad, sector %llu", sector)) { + /* XXX: retry IO, when we start retrying checksum errors */ + /* XXX: note we might have missing journal entries */ + return JOURNAL_ENTRY_BAD; + } + + bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + jset->encrypted_start, + vstruct_end(jset) - (void *) jset->encrypted_start); + + if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, + "invalid journal entry: last_seq > seq")) + jset->last_seq = jset->seq; + + return 0; +fsck_err: + return ret; +} + +struct journal_read_buf { + void *data; + size_t size; +}; + +static int journal_read_buf_realloc(struct journal_read_buf *b, + size_t new_size) +{ + void *n; + + /* the bios are sized for this many pages, max: */ + if (new_size > JOURNAL_ENTRY_SIZE_MAX) + return -ENOMEM; + 
+ new_size = roundup_pow_of_two(new_size); + n = kvpmalloc(new_size, GFP_KERNEL); + if (!n) + return -ENOMEM; + + kvpfree(b->data, b->size); + b->data = n; + b->size = new_size; + return 0; +} + +static int journal_read_bucket(struct bch_dev *ca, + struct journal_read_buf *buf, + struct journal_list *jlist, + unsigned bucket, u64 *seq, bool *entries_found) +{ + struct bch_fs *c = ca->fs; + struct journal_device *ja = &ca->journal; + struct bio *bio = ja->bio; + struct jset *j = NULL; + unsigned sectors, sectors_read = 0; + u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), + end = offset + ca->mi.bucket_size; + bool saw_bad = false; + int ret = 0; + + pr_debug("reading %u", bucket); + + while (offset < end) { + if (!sectors_read) { +reread: sectors_read = min_t(unsigned, + end - offset, buf->size >> 9); + + bio_reset(bio); + bio_set_dev(bio, ca->disk_sb.bdev); + bio->bi_iter.bi_sector = offset; + bio->bi_iter.bi_size = sectors_read << 9; + bio_set_op_attrs(bio, REQ_OP_READ, 0); + bch2_bio_map(bio, buf->data); + + ret = submit_bio_wait(bio); + + if (bch2_dev_io_err_on(ret, ca, + "journal read from sector %llu", + offset) || + bch2_meta_read_fault("journal")) + return -EIO; + + j = buf->data; + } + + ret = jset_validate(c, j, offset, + end - offset, sectors_read, + READ); + switch (ret) { + case BCH_FSCK_OK: + break; + case JOURNAL_ENTRY_REREAD: + if (vstruct_bytes(j) > buf->size) { + ret = journal_read_buf_realloc(buf, + vstruct_bytes(j)); + if (ret) + return ret; + } + goto reread; + case JOURNAL_ENTRY_NONE: + if (!saw_bad) + return 0; + sectors = c->opts.block_size; + goto next_block; + case JOURNAL_ENTRY_BAD: + saw_bad = true; + sectors = c->opts.block_size; + goto next_block; + default: + return ret; + } + + /* + * This happens sometimes if we don't have discards on - + * when we've partially overwritten a bucket with new + * journal entries. 
We don't need the rest of the + * bucket: + */ + if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) + return 0; + + ja->bucket_seq[bucket] = le64_to_cpu(j->seq); + + mutex_lock(&jlist->lock); + ret = journal_entry_add(c, ca, jlist, j); + mutex_unlock(&jlist->lock); + + switch (ret) { + case JOURNAL_ENTRY_ADD_OK: + *entries_found = true; + break; + case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: + break; + default: + return ret; + } + + if (le64_to_cpu(j->seq) > *seq) + *seq = le64_to_cpu(j->seq); + + sectors = vstruct_sectors(j, c->block_bits); +next_block: + pr_debug("next"); + offset += sectors; + sectors_read -= sectors; + j = ((void *) j) + (sectors << 9); + } + + return 0; +} + +static void bch2_journal_read_device(struct closure *cl) +{ +#define read_bucket(b) \ + ({ \ + bool entries_found = false; \ + ret = journal_read_bucket(ca, &buf, jlist, b, &seq, \ + &entries_found); \ + if (ret) \ + goto err; \ + __set_bit(b, bitmap); \ + entries_found; \ + }) + + struct journal_device *ja = + container_of(cl, struct journal_device, read); + struct bch_dev *ca = container_of(ja, struct bch_dev, journal); + struct journal_list *jlist = + container_of(cl->parent, struct journal_list, cl); + struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev); + struct journal_read_buf buf = { NULL, 0 }; + + DECLARE_BITMAP(bitmap, ja->nr); + unsigned i, l, r; + u64 seq = 0; + int ret; + + if (!ja->nr) + goto out; + + bitmap_zero(bitmap, ja->nr); + ret = journal_read_buf_realloc(&buf, PAGE_SIZE); + if (ret) + goto err; + + pr_debug("%u journal buckets", ja->nr); + + /* + * If the device supports discard but not secure discard, we can't do + * the fancy fibonacci hash/binary search because the live journal + * entries might not form a contiguous range: + */ + for (i = 0; i < ja->nr; i++) + read_bucket(i); + goto search_done; + + if (!blk_queue_nonrot(q)) + goto linear_scan; + + /* + * Read journal buckets ordered by golden ratio hash to quickly + * find a sequence of buckets with valid journal entries + */ + for (i = 0; i < ja->nr; i++) { + l = (i * 2654435769U) % ja->nr; + + if (test_bit(l, bitmap)) + break; + + if (read_bucket(l)) + goto bsearch; + } + + /* + * If that fails, check all the buckets we haven't checked + * already + */ + pr_debug("falling back to linear search"); +linear_scan: + for (l = find_first_zero_bit(bitmap, ja->nr); + l < ja->nr; + l = find_next_zero_bit(bitmap, ja->nr, l + 1)) + if (read_bucket(l)) + goto bsearch; + + /* no journal entries on this device? */ + if (l == ja->nr) + goto out; +bsearch: + /* Binary search */ + r = find_next_bit(bitmap, ja->nr, l + 1); + pr_debug("starting binary search, l %u r %u", l, r); + + while (l + 1 < r) { + unsigned m = (l + r) >> 1; + u64 cur_seq = seq; + + read_bucket(m); + + if (cur_seq != seq) + l = m; + else + r = m; + } + +search_done: + /* + * Find the journal bucket with the highest sequence number: + * + * If there's duplicate journal entries in multiple buckets (which + * definitely isn't supposed to happen, but...) 
- make sure to start + * cur_idx at the last of those buckets, so we don't deadlock trying to + * allocate + */ + seq = 0; + + for (i = 0; i < ja->nr; i++) + if (ja->bucket_seq[i] >= seq && + ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) { + /* + * When journal_next_bucket() goes to allocate for + * the first time, it'll use the bucket after + * ja->cur_idx + */ + ja->cur_idx = i; + seq = ja->bucket_seq[i]; + } + + /* + * Set last_idx to indicate the entire journal is full and needs to be + * reclaimed - journal reclaim will immediately reclaim whatever isn't + * pinned when it first runs: + */ + ja->last_idx = (ja->cur_idx + 1) % ja->nr; + + /* + * Read buckets in reverse order until we stop finding more journal + * entries: + */ + for (i = (ja->cur_idx + ja->nr - 1) % ja->nr; + i != ja->cur_idx; + i = (i + ja->nr - 1) % ja->nr) + if (!test_bit(i, bitmap) && + !read_bucket(i)) + break; +out: + kvpfree(buf.data, buf.size); + percpu_ref_put(&ca->io_ref); + closure_return(cl); +err: + mutex_lock(&jlist->lock); + jlist->ret = ret; + mutex_unlock(&jlist->lock); + goto out; +#undef read_bucket +} + +void bch2_journal_entries_free(struct list_head *list) +{ + + while (!list_empty(list)) { + struct journal_replay *i = + list_first_entry(list, struct journal_replay, list); + list_del(&i->list); + kvpfree(i, offsetof(struct journal_replay, j) + + vstruct_bytes(&i->j)); + } +} + +static inline bool journal_has_keys(struct list_head *list) +{ + struct journal_replay *i; + struct jset_entry *entry; + struct bkey_i *k, *_n; + + list_for_each_entry(i, list, list) + for_each_jset_key(k, _n, entry, &i->j) + return true; + + return false; +} + +int bch2_journal_read(struct bch_fs *c, struct list_head *list) +{ + struct journal *j = &c->journal; + struct journal_list jlist; + struct journal_replay *i; + struct journal_entry_pin_list *p; + struct bch_dev *ca; + u64 cur_seq, end_seq, seq; + unsigned iter, keys = 0, entries = 0; + size_t nr; + bool degraded = false; + int ret = 0; + + closure_init_stack(&jlist.cl); + mutex_init(&jlist.lock); + jlist.head = list; + jlist.ret = 0; + + for_each_member_device(ca, c, iter) { + if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL))) + continue; + + if ((ca->mi.state == BCH_MEMBER_STATE_RW || + ca->mi.state == BCH_MEMBER_STATE_RO) && + percpu_ref_tryget(&ca->io_ref)) + closure_call(&ca->journal.read, + bch2_journal_read_device, + system_unbound_wq, + &jlist.cl); + else + degraded = true; + } + + closure_sync(&jlist.cl); + + if (jlist.ret) + return jlist.ret; + + if (list_empty(list)){ + bch_err(c, "no journal entries found"); + return BCH_FSCK_REPAIR_IMPOSSIBLE; + } + + fsck_err_on(c->sb.clean && journal_has_keys(list), c, + "filesystem marked clean but journal has keys to replay"); + + list_for_each_entry(i, list, list) { + ret = jset_validate_entries(c, &i->j, READ); + if (ret) + goto fsck_err; + + /* + * If we're mounting in degraded mode - if we didn't read all + * the devices - this is wrong: + */ + + if (!degraded && + (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || + fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL, + i->devs), c, + "superblock not marked as containing replicas (type %u)", + BCH_DATA_JOURNAL))) { + ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs); + if (ret) + return ret; + } + } + + i = list_last_entry(list, struct journal_replay, list); + + nr = le64_to_cpu(i->j.seq) - le64_to_cpu(i->j.last_seq) + 1; + + if (nr > j->pin.size) { + free_fifo(&j->pin); + init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); + if 
(!j->pin.data) { + bch_err(c, "error reallocating journal fifo (%zu open entries)", nr); + return -ENOMEM; + } + } + + atomic64_set(&j->seq, le64_to_cpu(i->j.seq)); + j->last_seq_ondisk = le64_to_cpu(i->j.last_seq); + + j->pin.front = le64_to_cpu(i->j.last_seq); + j->pin.back = le64_to_cpu(i->j.seq) + 1; + + fifo_for_each_entry_ptr(p, &j->pin, seq) { + INIT_LIST_HEAD(&p->list); + INIT_LIST_HEAD(&p->flushed); + atomic_set(&p->count, 0); + p->devs.nr = 0; + } + + mutex_lock(&j->blacklist_lock); + + list_for_each_entry(i, list, list) { + p = journal_seq_pin(j, le64_to_cpu(i->j.seq)); + + atomic_set(&p->count, 1); + p->devs = i->devs; + + if (bch2_journal_seq_blacklist_read(j, i)) { + mutex_unlock(&j->blacklist_lock); + return -ENOMEM; + } + } + + mutex_unlock(&j->blacklist_lock); + + cur_seq = journal_last_seq(j); + end_seq = le64_to_cpu(list_last_entry(list, + struct journal_replay, list)->j.seq); + + list_for_each_entry(i, list, list) { + struct jset_entry *entry; + struct bkey_i *k, *_n; + bool blacklisted; + + mutex_lock(&j->blacklist_lock); + while (cur_seq < le64_to_cpu(i->j.seq) && + bch2_journal_seq_blacklist_find(j, cur_seq)) + cur_seq++; + + blacklisted = bch2_journal_seq_blacklist_find(j, + le64_to_cpu(i->j.seq)); + mutex_unlock(&j->blacklist_lock); + + fsck_err_on(blacklisted, c, + "found blacklisted journal entry %llu", + le64_to_cpu(i->j.seq)); + + fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c, + "journal entries %llu-%llu missing! (replaying %llu-%llu)", + cur_seq, le64_to_cpu(i->j.seq) - 1, + journal_last_seq(j), end_seq); + + cur_seq = le64_to_cpu(i->j.seq) + 1; + + for_each_jset_key(k, _n, entry, &i->j) + keys++; + entries++; + } + + bch_info(c, "journal read done, %i keys in %i entries, seq %llu", + keys, entries, journal_cur_seq(j)); +fsck_err: + return ret; +} + +/* journal replay: */ + +int bch2_journal_mark(struct bch_fs *c, struct list_head *list) +{ + struct bkey_i *k, *n; + struct jset_entry *j; + struct journal_replay *r; + int ret; + + list_for_each_entry(r, list, list) + for_each_jset_key(k, n, j, &r->j) { + enum bkey_type type = bkey_type(j->level, j->btree_id); + struct bkey_s_c k_s_c = bkey_i_to_s_c(k); + + if (btree_type_has_ptrs(type)) { + ret = bch2_btree_mark_key_initial(c, type, k_s_c); + if (ret) + return ret; + } + } + + return 0; +} + +int bch2_journal_replay(struct bch_fs *c, struct list_head *list) +{ + struct journal *j = &c->journal; + struct journal_entry_pin_list *pin_list; + struct bkey_i *k, *_n; + struct jset_entry *entry; + struct journal_replay *i, *n; + int ret = 0; + + list_for_each_entry_safe(i, n, list, list) { + + j->replay_journal_seq = le64_to_cpu(i->j.seq); + + for_each_jset_key(k, _n, entry, &i->j) { + + if (entry->btree_id == BTREE_ID_ALLOC) { + /* + * allocation code handles replay for + * BTREE_ID_ALLOC keys: + */ + ret = bch2_alloc_replay_key(c, k->k.p); + } else { + /* + * We might cause compressed extents to be + * split, so we need to pass in a + * disk_reservation: + */ + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + + ret = bch2_btree_insert(c, entry->btree_id, k, + &disk_res, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_REPLAY); + } + + if (ret) { + bch_err(c, "journal replay: error %d while replaying key", + ret); + goto err; + } + + cond_resched(); + } + + pin_list = journal_seq_pin(j, j->replay_journal_seq); + + if (atomic_dec_and_test(&pin_list->count)) + journal_wake(j); + } + + j->replay_journal_seq = 0; + + bch2_journal_set_replay_done(j); + ret = 
bch2_journal_flush_all_pins(j); +err: + bch2_journal_entries_free(list); + return ret; +} + +/* journal write: */ + +static void bch2_journal_add_btree_root(struct journal_buf *buf, + enum btree_id id, struct bkey_i *k, + unsigned level) +{ + struct jset_entry *entry; + + entry = bch2_journal_add_entry_noreservation(buf, k->k.u64s); + entry->type = BCH_JSET_ENTRY_btree_root; + entry->btree_id = id; + entry->level = level; + memcpy_u64s(entry->_data, k, k->k.u64s); +} + +static unsigned journal_dev_buckets_available(struct journal *j, + struct bch_dev *ca) +{ + struct journal_device *ja = &ca->journal; + unsigned next = (ja->cur_idx + 1) % ja->nr; + unsigned available = (ja->last_idx + ja->nr - next) % ja->nr; + + /* + * Hack to avoid a deadlock during journal replay: + * journal replay might require setting a new btree + * root, which requires writing another journal entry - + * thus, if the journal is full (and this happens when + * replaying the first journal bucket's entries) we're + * screwed. + * + * So don't let the journal fill up unless we're in + * replay: + */ + if (test_bit(JOURNAL_REPLAY_DONE, &j->flags)) + available = max((int) available - 2, 0); + + /* + * Don't use the last bucket unless writing the new last_seq + * will make another bucket available: + */ + if (ja->bucket_seq[ja->last_idx] >= journal_last_seq(j)) + available = max((int) available - 1, 0); + + return available; +} + +/* returns number of sectors available for next journal entry: */ +int bch2_journal_entry_sectors(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + struct bkey_s_extent e = bkey_i_to_s_extent(&j->key); + unsigned sectors_available = UINT_MAX; + unsigned i, nr_online = 0, nr_devs = 0; + + lockdep_assert_held(&j->lock); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, + &c->rw_devs[BCH_DATA_JOURNAL]) { + struct journal_device *ja = &ca->journal; + unsigned buckets_required = 0; + + if (!ja->nr) + continue; + + sectors_available = min_t(unsigned, sectors_available, + ca->mi.bucket_size); + + /* + * Note that we don't allocate the space for a journal entry + * until we write it out - thus, if we haven't started the write + * for the previous entry we have to make sure we have space for + * it too: + */ + if (bch2_extent_has_device(e.c, ca->dev_idx)) { + if (j->prev_buf_sectors > ja->sectors_free) + buckets_required++; + + if (j->prev_buf_sectors + sectors_available > + ja->sectors_free) + buckets_required++; + } else { + if (j->prev_buf_sectors + sectors_available > + ca->mi.bucket_size) + buckets_required++; + + buckets_required++; + } + + if (journal_dev_buckets_available(j, ca) >= buckets_required) + nr_devs++; + nr_online++; + } + rcu_read_unlock(); + + if (nr_online < c->opts.metadata_replicas_required) + return -EROFS; + + if (nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas)) + return 0; + + return sectors_available; +} + +/** + * journal_next_bucket - move on to the next journal bucket if possible + */ +static int journal_write_alloc(struct journal *j, struct journal_buf *w, + unsigned sectors) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bkey_s_extent e; + struct bch_extent_ptr *ptr; + struct journal_device *ja; + struct bch_dev *ca; + struct dev_alloc_list devs_sorted; + unsigned i, replicas, replicas_want = + READ_ONCE(c->opts.metadata_replicas); + + spin_lock(&j->lock); + e = bkey_i_to_s_extent(&j->key); + + /* + * Drop any pointers to devices that have been removed, are no 
longer + * empty, or filled up their current journal bucket: + * + * Note that a device may have had a small amount of free space (perhaps + * one sector) that wasn't enough for the smallest possible journal + * entry - that's why we drop pointers to devices <= current free space, + * i.e. whichever device was limiting the current journal entry size. + */ + extent_for_each_ptr_backwards(e, ptr) { + ca = bch_dev_bkey_exists(c, ptr->dev); + + if (ca->mi.state != BCH_MEMBER_STATE_RW || + ca->journal.sectors_free <= sectors) + __bch2_extent_drop_ptr(e, ptr); + else + ca->journal.sectors_free -= sectors; + } + + replicas = bch2_extent_nr_ptrs(e.c); + + rcu_read_lock(); + devs_sorted = bch2_wp_alloc_list(c, &j->wp, + &c->rw_devs[BCH_DATA_JOURNAL]); + + for (i = 0; i < devs_sorted.nr; i++) { + ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); + if (!ca) + continue; + + if (!ca->mi.durability) + continue; + + ja = &ca->journal; + if (!ja->nr) + continue; + + if (replicas >= replicas_want) + break; + + /* + * Check that we can use this device, and aren't already using + * it: + */ + if (bch2_extent_has_device(e.c, ca->dev_idx) || + !journal_dev_buckets_available(j, ca) || + sectors > ca->mi.bucket_size) + continue; + + j->wp.next_alloc[ca->dev_idx] += U32_MAX; + bch2_wp_rescale(c, ca, &j->wp); + + ja->sectors_free = ca->mi.bucket_size - sectors; + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); + + extent_ptr_append(bkey_i_to_extent(&j->key), + (struct bch_extent_ptr) { + .offset = bucket_to_sector(ca, + ja->buckets[ja->cur_idx]), + .dev = ca->dev_idx, + }); + + replicas += ca->mi.durability; + } + rcu_read_unlock(); + + j->prev_buf_sectors = 0; + + bkey_copy(&w->key, &j->key); + spin_unlock(&j->lock); + + if (replicas < c->opts.metadata_replicas_required) + return -EROFS; + + BUG_ON(!replicas); + + return 0; +} + +static void journal_write_compact(struct jset *jset) +{ + struct jset_entry *i, *next, *prev = NULL; + + /* + * Simple compaction, dropping empty jset_entries (from journal + * reservations that weren't fully used) and merging jset_entries that + * can be. + * + * If we wanted to be really fancy here, we could sort all the keys in + * the jset and drop keys that were overwritten - probably not worth it: + */ + vstruct_for_each_safe(jset, i, next) { + unsigned u64s = le16_to_cpu(i->u64s); + + /* Empty entry: */ + if (!u64s) + continue; + + /* Can we merge with previous entry? */ + if (prev && + i->btree_id == prev->btree_id && + i->level == prev->level && + i->type == prev->type && + i->type == BCH_JSET_ENTRY_btree_keys && + le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { + memmove_u64s_down(vstruct_next(prev), + i->_data, + u64s); + le16_add_cpu(&prev->u64s, u64s); + continue; + } + + /* Couldn't merge, move i into new position (after prev): */ + prev = prev ? vstruct_next(prev) : jset->start; + if (i != prev) + memmove_u64s_down(prev, i, jset_u64s(u64s)); + } + + prev = prev ? 
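/* prev was the last entry kept: everything up to the end of it is the compacted size */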
vstruct_next(prev) : jset->start; + jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); +} + +static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) +{ + /* we aren't holding j->lock: */ + unsigned new_size = READ_ONCE(j->buf_size_want); + void *new_buf; + + if (buf->size >= new_size) + return; + + new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN); + if (!new_buf) + return; + + memcpy(new_buf, buf->data, buf->size); + kvpfree(buf->data, buf->size); + buf->data = new_buf; + buf->size = new_size; +} + +static void journal_write_done(struct closure *cl) +{ + struct journal *j = container_of(cl, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *w = journal_prev_buf(j); + struct bch_devs_list devs = + bch2_extent_devs(bkey_i_to_s_c_extent(&w->key)); + u64 seq = le64_to_cpu(w->data->seq); + + if (!devs.nr) { + bch_err(c, "unable to write journal to sufficient devices"); + goto err; + } + + if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs)) + goto err; +out: + bch2_time_stats_update(j->write_time, j->write_start_time); + + spin_lock(&j->lock); + j->last_seq_ondisk = seq; + if (seq >= j->pin.front) + journal_seq_pin(j, seq)->devs = devs; + + /* + * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard + * more buckets: + * + * Must come before signaling write completion, for + * bch2_fs_journal_stop(): + */ + mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0); + + /* also must come before signalling write completion: */ + closure_debug_destroy(cl); + + BUG_ON(!j->reservations.prev_buf_unwritten); + atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v, + &j->reservations.counter); + + closure_wake_up(&w->wait); + journal_wake(j); + + if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) + mod_delayed_work(system_freezable_wq, &j->write_work, 0); + spin_unlock(&j->lock); + return; +err: + bch2_fatal_error(c); + bch2_journal_halt(j); + goto out; +} + +static void journal_write_endio(struct bio *bio) +{ + struct bch_dev *ca = bio->bi_private; + struct journal *j = &ca->fs->journal; + + if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") || + bch2_meta_write_fault("journal")) { + struct journal_buf *w = journal_prev_buf(j); + unsigned long flags; + + spin_lock_irqsave(&j->err_lock, flags); + bch2_extent_drop_device(bkey_i_to_s_extent(&w->key), ca->dev_idx); + spin_unlock_irqrestore(&j->err_lock, flags); + } + + closure_put(&j->io); + percpu_ref_put(&ca->io_ref); +} + +void bch2_journal_write(struct closure *cl) +{ + struct journal *j = container_of(cl, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + struct journal_buf *w = journal_prev_buf(j); + struct jset *jset; + struct bio *bio; + struct bch_extent_ptr *ptr; + unsigned i, sectors, bytes; + + journal_buf_realloc(j, w); + jset = w->data; + + j->write_start_time = local_clock(); + mutex_lock(&c->btree_root_lock); + for (i = 0; i < BTREE_ID_NR; i++) { + struct btree_root *r = &c->btree_roots[i]; + + if (r->alive) + bch2_journal_add_btree_root(w, i, &r->key, r->level); + } + c->btree_roots_dirty = false; + mutex_unlock(&c->btree_root_lock); + + journal_write_compact(jset); + + jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); + jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); + jset->magic = cpu_to_le64(jset_magic(c)); + jset->version = cpu_to_le32(BCACHE_JSET_VERSION); + + SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); + SET_JSET_CSUM_TYPE(jset, 
bch2_meta_checksum_type(c)); + + if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) && + jset_validate_entries(c, jset, WRITE)) + goto err; + + bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + jset->encrypted_start, + vstruct_end(jset) - (void *) jset->encrypted_start); + + jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), + journal_nonce(jset), jset); + + if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) && + jset_validate_entries(c, jset, WRITE)) + goto err; + + sectors = vstruct_sectors(jset, c->block_bits); + BUG_ON(sectors > j->prev_buf_sectors); + + bytes = vstruct_bytes(w->data); + memset((void *) w->data + bytes, 0, (sectors << 9) - bytes); + + if (journal_write_alloc(j, w, sectors)) { + bch2_journal_halt(j); + bch_err(c, "Unable to allocate journal write"); + bch2_fatal_error(c); + continue_at(cl, journal_write_done, system_highpri_wq); + } + + /* + * XXX: we really should just disable the entire journal in nochanges + * mode + */ + if (c->opts.nochanges) + goto no_io; + + extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { + ca = bch_dev_bkey_exists(c, ptr->dev); + if (!percpu_ref_tryget(&ca->io_ref)) { + /* XXX: fix this */ + bch_err(c, "missing device for journal write\n"); + continue; + } + + this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_JOURNAL], + sectors); + + bio = ca->journal.bio; + bio_reset(bio); + bio_set_dev(bio, ca->disk_sb.bdev); + bio->bi_iter.bi_sector = ptr->offset; + bio->bi_iter.bi_size = sectors << 9; + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + bio_set_op_attrs(bio, REQ_OP_WRITE, + REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); + bch2_bio_map(bio, jset); + + trace_journal_write(bio); + closure_bio_submit(bio, cl); + + ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq); + } + + for_each_rw_member(ca, c, i) + if (journal_flushes_device(ca) && + !bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), i)) { + percpu_ref_get(&ca->io_ref); + + bio = ca->journal.bio; + bio_reset(bio); + bio_set_dev(bio, ca->disk_sb.bdev); + bio->bi_opf = REQ_OP_FLUSH; + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + closure_bio_submit(bio, cl); + } + +no_io: + extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) + ptr->offset += sectors; + + continue_at(cl, journal_write_done, system_highpri_wq); +err: + bch2_inconsistent_error(c); + continue_at(cl, journal_write_done, system_highpri_wq); +} diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h new file mode 100644 index 00000000..4236b7fc --- /dev/null +++ b/libbcachefs/journal_io.h @@ -0,0 +1,45 @@ +#ifndef _BCACHEFS_JOURNAL_IO_H +#define _BCACHEFS_JOURNAL_IO_H + +struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *, struct jset *, + enum btree_id, unsigned *); + +/* + * Only used for holding the journal entries we read in btree_journal_read() + * during cache_registration + */ +struct journal_replay { + struct list_head list; + struct bch_devs_list devs; + /* must be last: */ + struct jset j; +}; + +static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, + struct jset_entry *entry, unsigned type) +{ + while (entry < vstruct_last(jset)) { + if (entry->type == type) + return entry; + + entry = vstruct_next(entry); + } + + return NULL; +} + +#define for_each_jset_entry_type(entry, jset, type) \ + for (entry = (jset)->start; \ + (entry = __jset_entry_type_next(jset, entry, type)); \ + entry = vstruct_next(entry)) + +#define for_each_jset_key(k, _n, entry, jset) \ + for_each_jset_entry_type(entry, jset, 
BCH_JSET_ENTRY_btree_keys) \ + vstruct_for_each_safe(entry, k, _n) + +int bch2_journal_read(struct bch_fs *, struct list_head *); + +int bch2_journal_entry_sectors(struct journal *); +void bch2_journal_write(struct closure *); + +#endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c new file mode 100644 index 00000000..0e3e5b6a --- /dev/null +++ b/libbcachefs/journal_reclaim.c @@ -0,0 +1,411 @@ + +#include "bcachefs.h" +#include "journal.h" +#include "journal_reclaim.h" +#include "replicas.h" +#include "super.h" + +/* + * Journal entry pinning - machinery for holding a reference on a given journal + * entry, holding it open to ensure it gets replayed during recovery: + */ + +static inline u64 journal_pin_seq(struct journal *j, + struct journal_entry_pin_list *pin_list) +{ + return fifo_entry_idx_abs(&j->pin, pin_list); +} + +u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin) +{ + u64 ret = 0; + + spin_lock(&j->lock); + if (journal_pin_active(pin)) + ret = journal_pin_seq(j, pin->pin_list); + spin_unlock(&j->lock); + + return ret; +} + +static inline void __journal_pin_add(struct journal *j, + struct journal_entry_pin_list *pin_list, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + BUG_ON(journal_pin_active(pin)); + BUG_ON(!atomic_read(&pin_list->count)); + + atomic_inc(&pin_list->count); + pin->pin_list = pin_list; + pin->flush = flush_fn; + + if (flush_fn) + list_add(&pin->list, &pin_list->list); + else + INIT_LIST_HEAD(&pin->list); + + /* + * If the journal is currently full, we might want to call flush_fn + * immediately: + */ + journal_wake(j); +} + +void bch2_journal_pin_add(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + spin_lock(&j->lock); + __journal_pin_add(j, journal_seq_pin(j, seq), pin, flush_fn); + spin_unlock(&j->lock); +} + +static inline void __journal_pin_drop(struct journal *j, + struct journal_entry_pin *pin) +{ + struct journal_entry_pin_list *pin_list = pin->pin_list; + + if (!journal_pin_active(pin)) + return; + + pin->pin_list = NULL; + list_del_init(&pin->list); + + /* + * Unpinning a journal entry make make journal_next_bucket() succeed, if + * writing a new last_seq will now make another bucket available: + */ + if (atomic_dec_and_test(&pin_list->count) && + pin_list == &fifo_peek_front(&j->pin)) + bch2_journal_reclaim_fast(j); +} + +void bch2_journal_pin_drop(struct journal *j, + struct journal_entry_pin *pin) +{ + spin_lock(&j->lock); + __journal_pin_drop(j, pin); + spin_unlock(&j->lock); +} + +void bch2_journal_pin_add_if_older(struct journal *j, + struct journal_entry_pin *src_pin, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + spin_lock(&j->lock); + + if (journal_pin_active(src_pin) && + (!journal_pin_active(pin) || + journal_pin_seq(j, src_pin->pin_list) < + journal_pin_seq(j, pin->pin_list))) { + __journal_pin_drop(j, pin); + __journal_pin_add(j, src_pin->pin_list, pin, flush_fn); + } + + spin_unlock(&j->lock); +} + +/* + * Journal reclaim: flush references to open journal entries to reclaim space in + * the journal + * + * May be done by the journal code in the background as needed to free up space + * for more journal entries, or as part of doing a clean shutdown, or to migrate + * data off of a specific device: + */ + +/** + * bch2_journal_reclaim_fast - do the fast part of journal reclaim + * + * Called from IO submission context, does not block. 
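As a usage sketch of the pin API above (not from this patch; my_node, my_flush_fn() and my_write_node() are invented for illustration): a subsystem embeds a journal_entry_pin in whatever it has dirtied, registers it against the journal sequence number the update landed in, and drops it once that state is durable on its own, so that reclaim can invoke the flush hook when it needs the space back sooner:

struct my_node {
	struct journal_entry_pin	pin;
	// ... the dirty state itself ...
};

static void my_flush_fn(struct journal *j, struct journal_entry_pin *pin,
			u64 seq)
{
	struct my_node *n = container_of(pin, struct my_node, pin);

	// reclaim wants sequence number @seq freed: write n out now
	my_write_node(n);
}

// the update landed in journal entry @seq - keep that entry alive:
bch2_journal_pin_add(j, seq, &n->pin, my_flush_fn);

// once n itself has been written, the entry may be reclaimed:
bch2_journal_pin_drop(j, &n->pin);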
Cleans up after btree + * write completions by advancing the journal pin and each cache's last_idx, + * kicking off discards and background reclaim as necessary. + */ +void bch2_journal_reclaim_fast(struct journal *j) +{ + struct journal_entry_pin_list temp; + bool popped = false; + + lockdep_assert_held(&j->lock); + + /* + * Unpin journal entries whose reference counts reached zero, meaning + * all btree nodes got written out + */ + while (!atomic_read(&fifo_peek_front(&j->pin).count)) { + BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); + BUG_ON(!fifo_pop(&j->pin, temp)); + popped = true; + } + + if (popped) + journal_wake(j); +} + +static struct journal_entry_pin * +__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) +{ + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *ret; + u64 iter; + + /* no need to iterate over empty fifo entries: */ + bch2_journal_reclaim_fast(j); + + fifo_for_each_entry_ptr(pin_list, &j->pin, iter) { + if (iter > seq_to_flush) + break; + + ret = list_first_entry_or_null(&pin_list->list, + struct journal_entry_pin, list); + if (ret) { + /* must be list_del_init(), see bch2_journal_pin_drop() */ + list_move(&ret->list, &pin_list->flushed); + *seq = iter; + return ret; + } + } + + return NULL; +} + +static struct journal_entry_pin * +journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) +{ + struct journal_entry_pin *ret; + + spin_lock(&j->lock); + ret = __journal_get_next_pin(j, seq_to_flush, seq); + spin_unlock(&j->lock); + + return ret; +} + +static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +{ + bool ret; + + spin_lock(&j->lock); + ret = ja->nr && + (ja->last_idx != ja->cur_idx && + ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk); + spin_unlock(&j->lock); + + return ret; +} + +/** + * bch2_journal_reclaim_work - free up journal buckets + * + * Background journal reclaim writes out btree nodes. It should be run + * early enough so that we never completely run out of journal buckets. + * + * High watermarks for triggering background reclaim: + * - FIFO has fewer than 512 entries left + * - fewer than 25% journal buckets free + * + * Background reclaim runs until low watermarks are reached: + * - FIFO has more than 1024 entries left + * - more than 50% journal buckets free + * + * As long as a reclaim can complete in the time it takes to fill up + * 512 journal entries or 25% of all journal buckets, then + * journal_next_bucket() should not stall. 
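For a concrete feel for the ring arithmetic used by reclaim and by journal_dev_buckets_available() (a standalone sketch, not code from the patch): with nr = 8 journal buckets, cur_idx = 6 being written and last_idx = 3 the oldest bucket still holding pinned entries, the buckets free for new writes are those between them going forward:

// mirrors the core of journal_dev_buckets_available(), minus the
// replay and last_seq adjustments:
static unsigned example_buckets_available(unsigned cur_idx, unsigned last_idx,
					  unsigned nr)
{
	unsigned next = (cur_idx + 1) % nr;	// 7

	return (last_idx + nr - next) % nr;	// (3 + 8 - 7) % 8 == 4
}

The 50% target below uses the same arithmetic: bucket_to_flush = (cur_idx + nr/2) % nr picks the bucket halfway around the ring, and everything pinned at or before its sequence number gets flushed.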
+ */ +void bch2_journal_reclaim_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(to_delayed_work(work), + struct bch_fs, journal.reclaim_work); + struct journal *j = &c->journal; + struct bch_dev *ca; + struct journal_entry_pin *pin; + u64 seq, seq_to_flush = 0; + unsigned iter, bucket_to_flush; + unsigned long next_flush; + bool reclaim_lock_held = false, need_flush; + + /* + * Advance last_idx to point to the oldest journal entry containing + * btree node updates that have not yet been written out + */ + for_each_rw_member(ca, c, iter) { + struct journal_device *ja = &ca->journal; + + if (!ja->nr) + continue; + + while (should_discard_bucket(j, ja)) { + if (!reclaim_lock_held) { + /* + * ugh: + * might be called from __journal_res_get() + * under wait_event() - have to go back to + * TASK_RUNNING before doing something that + * would block, but only if we're doing work: + */ + __set_current_state(TASK_RUNNING); + + mutex_lock(&j->reclaim_lock); + reclaim_lock_held = true; + /* recheck under reclaim_lock: */ + continue; + } + + if (ca->mi.discard && + blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, + ja->buckets[ja->last_idx]), + ca->mi.bucket_size, GFP_NOIO, 0); + + spin_lock(&j->lock); + ja->last_idx = (ja->last_idx + 1) % ja->nr; + spin_unlock(&j->lock); + + journal_wake(j); + } + + /* + * Write out enough btree nodes to free up 50% journal + * buckets + */ + spin_lock(&j->lock); + bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr; + seq_to_flush = max_t(u64, seq_to_flush, + ja->bucket_seq[bucket_to_flush]); + spin_unlock(&j->lock); + } + + if (reclaim_lock_held) + mutex_unlock(&j->reclaim_lock); + + /* Also flush if the pin fifo is more than half full */ + spin_lock(&j->lock); + seq_to_flush = max_t(s64, seq_to_flush, + (s64) journal_cur_seq(j) - + (j->pin.size >> 1)); + spin_unlock(&j->lock); + + /* + * If it's been longer than j->reclaim_delay_ms since we last flushed, + * make sure to flush at least one journal pin: + */ + next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms); + need_flush = time_after(jiffies, next_flush); + + while ((pin = journal_get_next_pin(j, need_flush + ? 
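/* overdue: flush the oldest pin regardless of seq_to_flush */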
U64_MAX + : seq_to_flush, &seq))) { + __set_current_state(TASK_RUNNING); + pin->flush(j, pin, seq); + need_flush = false; + + j->last_flushed = jiffies; + } + + if (!test_bit(BCH_FS_RO, &c->flags)) + queue_delayed_work(system_freezable_wq, &j->reclaim_work, + msecs_to_jiffies(j->reclaim_delay_ms)); +} + +static int journal_flush_done(struct journal *j, u64 seq_to_flush, + struct journal_entry_pin **pin, + u64 *pin_seq) +{ + int ret; + + *pin = NULL; + + ret = bch2_journal_error(j); + if (ret) + return ret; + + spin_lock(&j->lock); + /* + * If journal replay hasn't completed, the unreplayed journal entries + * hold refs on their corresponding sequence numbers + */ + ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL || + !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || + journal_last_seq(j) > seq_to_flush || + (fifo_used(&j->pin) == 1 && + atomic_read(&fifo_peek_front(&j->pin).count) == 1); + spin_unlock(&j->lock); + + return ret; +} + +int bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_entry_pin *pin; + u64 pin_seq; + bool flush; + + if (!test_bit(JOURNAL_STARTED, &j->flags)) + return 0; +again: + wait_event(j->wait, journal_flush_done(j, seq_to_flush, &pin, &pin_seq)); + if (pin) { + /* flushing a journal pin might cause a new one to be added: */ + pin->flush(j, pin, pin_seq); + goto again; + } + + spin_lock(&j->lock); + flush = journal_last_seq(j) != j->last_seq_ondisk || + (seq_to_flush == U64_MAX && c->btree_roots_dirty); + spin_unlock(&j->lock); + + return flush ? bch2_journal_meta(j) : 0; +} + +int bch2_journal_flush_all_pins(struct journal *j) +{ + return bch2_journal_flush_pins(j, U64_MAX); +} + +int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_entry_pin_list *p; + struct bch_devs_list devs; + u64 iter, seq = 0; + int ret = 0; + + spin_lock(&j->lock); + fifo_for_each_entry_ptr(p, &j->pin, iter) + if (dev_idx >= 0 + ? 
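/* flushing one specific device, vs. flushing anything under-replicated */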
bch2_dev_list_has_dev(p->devs, dev_idx) + : p->devs.nr < c->opts.metadata_replicas) + seq = iter; + spin_unlock(&j->lock); + + ret = bch2_journal_flush_pins(j, seq); + if (ret) + return ret; + + mutex_lock(&c->replicas_gc_lock); + bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL); + + seq = 0; + + spin_lock(&j->lock); + while (!ret && seq < j->pin.back) { + seq = max(seq, journal_last_seq(j)); + devs = journal_seq_pin(j, seq)->devs; + seq++; + + spin_unlock(&j->lock); + ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs); + spin_lock(&j->lock); + } + spin_unlock(&j->lock); + + bch2_replicas_gc_end(c, ret); + mutex_unlock(&c->replicas_gc_lock); + + return ret; +} diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h new file mode 100644 index 00000000..7d460c35 --- /dev/null +++ b/libbcachefs/journal_reclaim.h @@ -0,0 +1,36 @@ +#ifndef _BCACHEFS_JOURNAL_RECLAIM_H +#define _BCACHEFS_JOURNAL_RECLAIM_H + +#define JOURNAL_PIN (32 * 1024) + +static inline bool journal_pin_active(struct journal_entry_pin *pin) +{ + return pin->pin_list != NULL; +} + +static inline struct journal_entry_pin_list * +journal_seq_pin(struct journal *j, u64 seq) +{ + BUG_ON(seq < j->pin.front || seq >= j->pin.back); + + return &j->pin.data[seq & j->pin.mask]; +} + +u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *); + +void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, + journal_pin_flush_fn); +void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); +void bch2_journal_pin_add_if_older(struct journal *, + struct journal_entry_pin *, + struct journal_entry_pin *, + journal_pin_flush_fn); + +void bch2_journal_reclaim_fast(struct journal *); +void bch2_journal_reclaim_work(struct work_struct *); + +int bch2_journal_flush_pins(struct journal *, u64); +int bch2_journal_flush_all_pins(struct journal *); +int bch2_journal_flush_device_pins(struct journal *, int); + +#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c new file mode 100644 index 00000000..b5301d96 --- /dev/null +++ b/libbcachefs/journal_seq_blacklist.c @@ -0,0 +1,358 @@ + +#include "bcachefs.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "error.h" +#include "journal.h" +#include "journal_io.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" + +/* + * journal_seq_blacklist machinery: + * + * To guarantee order of btree updates after a crash, we need to detect when a + * btree node entry (bset) is newer than the newest journal entry that was + * successfully written, and ignore it - effectively ignoring any btree updates + * that didn't make it into the journal. + * + * If we didn't do this, we might have two btree nodes, a and b, both with + * updates that weren't written to the journal yet: if b was updated after a, + * but b was flushed and not a - oops; on recovery we'll find that the updates + * to b happened, but not the updates to a that happened before it. + * + * Ignoring bsets that are newer than the newest journal entry is always safe, + * because everything they contain will also have been journalled - and must + * still be present in the journal on disk until a journal entry has been + * written _after_ that bset was written. 
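A sketch of where that check sits (simplified and not from this patch; the real caller is in the btree read path, and drop_bset() is a stand-in): while validating a btree node, each bset's journal_seq is handed to bch2_journal_seq_should_ignore(), defined below, and a positive return means its keys must be ignored:

// i is the struct bset being validated, b the btree node it came from
int ret = bch2_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b);
if (ret < 0)
	return ret;		// e.g. -ENOMEM
if (ret)
	drop_bset(b, i);	// newer than the newest journal entry: ignore its keys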
+ * + * To accomplish this, bsets record the newest journal sequence number they + * contain updates for; then, on startup, the btree code queries the journal + * code to ask "Is this sequence number newer than the newest journal entry? If + * so, ignore it." + * + * When this happens, we must blacklist that journal sequence number: the + * journal must not write any entries with that sequence number, and it must + * record that it was blacklisted so that a) on recovery we don't think we have + * missing journal entries and b) so that the btree code continues to ignore + * that bset, until that btree node is rewritten. + * + * Blacklisted journal sequence numbers are themselves recorded as entries in + * the journal. + */ + +/* + * Called when journal needs to evict a blacklist entry to reclaim space: find + * any btree nodes that refer to the blacklist journal sequence numbers, and + * rewrite them: + */ +static void journal_seq_blacklist_flush(struct journal *j, + struct journal_entry_pin *pin, u64 seq) +{ + struct bch_fs *c = + container_of(j, struct bch_fs, journal); + struct journal_seq_blacklist *bl = + container_of(pin, struct journal_seq_blacklist, pin); + struct blacklisted_node n; + struct closure cl; + unsigned i; + int ret; + + closure_init_stack(&cl); + + for (i = 0;; i++) { + struct btree_iter iter; + struct btree *b; + + mutex_lock(&j->blacklist_lock); + if (i >= bl->nr_entries) { + mutex_unlock(&j->blacklist_lock); + break; + } + n = bl->entries[i]; + mutex_unlock(&j->blacklist_lock); + + __bch2_btree_iter_init(&iter, c, n.btree_id, n.pos, 0, 0, 0); + + b = bch2_btree_iter_peek_node(&iter); + + /* The node might have already been rewritten: */ + + if (b->data->keys.seq == n.seq) { + ret = bch2_btree_node_rewrite(c, &iter, n.seq, 0); + if (ret) { + bch2_btree_iter_unlock(&iter); + bch2_fs_fatal_error(c, + "error %i rewriting btree node with blacklisted journal seq", + ret); + bch2_journal_halt(j); + return; + } + } + + bch2_btree_iter_unlock(&iter); + } + + for (i = 0;; i++) { + struct btree_update *as; + struct pending_btree_node_free *d; + + mutex_lock(&j->blacklist_lock); + if (i >= bl->nr_entries) { + mutex_unlock(&j->blacklist_lock); + break; + } + n = bl->entries[i]; + mutex_unlock(&j->blacklist_lock); +redo_wait: + mutex_lock(&c->btree_interior_update_lock); + + /* + * Is the node on the list of pending interior node updates - + * being freed? 
If so, wait for that to finish: + */ + for_each_pending_btree_node_free(c, as, d) + if (n.seq == d->seq && + n.btree_id == d->btree_id && + !d->level && + !bkey_cmp(n.pos, d->key.k.p)) { + closure_wait(&as->wait, &cl); + mutex_unlock(&c->btree_interior_update_lock); + closure_sync(&cl); + goto redo_wait; + } + + mutex_unlock(&c->btree_interior_update_lock); + } + + mutex_lock(&j->blacklist_lock); + + bch2_journal_pin_drop(j, &bl->pin); + list_del(&bl->list); + kfree(bl->entries); + kfree(bl); + + mutex_unlock(&j->blacklist_lock); +} + +/* + * Determine if a particular sequence number is blacklisted - if so, return + * blacklist entry: + */ +struct journal_seq_blacklist * +bch2_journal_seq_blacklist_find(struct journal *j, u64 seq) +{ + struct journal_seq_blacklist *bl; + + lockdep_assert_held(&j->blacklist_lock); + + list_for_each_entry(bl, &j->seq_blacklist, list) + if (seq >= bl->start && seq <= bl->end) + return bl; + + return NULL; +} + +/* + * Allocate a new, in memory blacklist entry: + */ +static struct journal_seq_blacklist * +bch2_journal_seq_blacklisted_new(struct journal *j, u64 start, u64 end) +{ + struct journal_seq_blacklist *bl; + + lockdep_assert_held(&j->blacklist_lock); + + /* + * When we start the journal, bch2_journal_start() will skip over @seq: + */ + + bl = kzalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) + return NULL; + + bl->start = start; + bl->end = end; + + list_add_tail(&bl->list, &j->seq_blacklist); + return bl; +} + +/* + * Returns true if @seq is newer than the most recent journal entry that got + * written, and data corresponding to @seq should be ignored - also marks @seq + * as blacklisted so that on future restarts the corresponding data will still + * be ignored: + */ +int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b) +{ + struct journal *j = &c->journal; + struct journal_seq_blacklist *bl = NULL; + struct blacklisted_node *n; + u64 journal_seq; + int ret = 0; + + if (!seq) + return 0; + + spin_lock(&j->lock); + journal_seq = journal_cur_seq(j); + spin_unlock(&j->lock); + + /* Interier updates aren't journalled: */ + BUG_ON(b->level); + BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)); + + /* + * Decrease this back to j->seq + 2 when we next rev the on disk format: + * increasing it temporarily to work around bug in old kernels + */ + fsck_err_on(seq > journal_seq + 4, c, + "bset journal seq too far in the future: %llu > %llu", + seq, journal_seq); + + if (seq <= journal_seq && + list_empty_careful(&j->seq_blacklist)) + return 0; + + mutex_lock(&j->blacklist_lock); + + if (seq <= journal_seq) { + bl = bch2_journal_seq_blacklist_find(j, seq); + if (!bl) + goto out; + } else { + bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting", + b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq); + + if (!j->new_blacklist) { + j->new_blacklist = bch2_journal_seq_blacklisted_new(j, + journal_seq + 1, + journal_seq + 1); + if (!j->new_blacklist) { + ret = -ENOMEM; + goto out; + } + } + bl = j->new_blacklist; + bl->end = max(bl->end, seq); + } + + for (n = bl->entries; n < bl->entries + bl->nr_entries; n++) + if (b->data->keys.seq == n->seq && + b->btree_id == n->btree_id && + !bkey_cmp(b->key.k.p, n->pos)) + goto found_entry; + + if (!bl->nr_entries || + is_power_of_2(bl->nr_entries)) { + n = krealloc(bl->entries, + max(bl->nr_entries * 2, 8UL) * sizeof(*n), + GFP_KERNEL); + if (!n) { + ret = -ENOMEM; + goto out; + } + bl->entries = n; + } + + bl->entries[bl->nr_entries++] = 
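/* remember this node so journal_seq_blacklist_flush() can find and rewrite it */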
(struct blacklisted_node) { + .seq = b->data->keys.seq, + .btree_id = b->btree_id, + .pos = b->key.k.p, + }; +found_entry: + ret = 1; +out: +fsck_err: + mutex_unlock(&j->blacklist_lock); + return ret; +} + +static int __bch2_journal_seq_blacklist_read(struct journal *j, + struct journal_replay *i, + u64 start, u64 end) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_seq_blacklist *bl; + + bch_verbose(c, "blacklisting existing journal seq %llu-%llu", + start, end); + + bl = bch2_journal_seq_blacklisted_new(j, start, end); + if (!bl) + return -ENOMEM; + + bch2_journal_pin_add(j, le64_to_cpu(i->j.seq), &bl->pin, + journal_seq_blacklist_flush); + return 0; +} + +/* + * After reading the journal, find existing journal seq blacklist entries and + * read them into memory: + */ +int bch2_journal_seq_blacklist_read(struct journal *j, + struct journal_replay *i) +{ + struct jset_entry *entry; + int ret = 0; + + vstruct_for_each(&i->j, entry) { + switch (entry->type) { + case BCH_JSET_ENTRY_blacklist: { + struct jset_entry_blacklist *bl_entry = + container_of(entry, struct jset_entry_blacklist, entry); + + ret = __bch2_journal_seq_blacklist_read(j, i, + le64_to_cpu(bl_entry->seq), + le64_to_cpu(bl_entry->seq)); + break; + } + case BCH_JSET_ENTRY_blacklist_v2: { + struct jset_entry_blacklist_v2 *bl_entry = + container_of(entry, struct jset_entry_blacklist_v2, entry); + + ret = __bch2_journal_seq_blacklist_read(j, i, + le64_to_cpu(bl_entry->start), + le64_to_cpu(bl_entry->end)); + break; + } + } + + if (ret) + break; + } + + return ret; +} + +/* + * After reading the journal and walking the btree, we might have new journal + * sequence numbers to blacklist - add entries to the next journal entry to be + * written: + */ +void bch2_journal_seq_blacklist_write(struct journal *j) +{ + struct journal_seq_blacklist *bl = j->new_blacklist; + struct jset_entry_blacklist_v2 *bl_entry; + struct jset_entry *entry; + + if (!bl) + return; + + entry = bch2_journal_add_entry_noreservation(journal_cur_buf(j), + (sizeof(*bl_entry) - sizeof(*entry)) / sizeof(u64)); + + bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); + bl_entry->entry.type = BCH_JSET_ENTRY_blacklist_v2; + bl_entry->start = cpu_to_le64(bl->start); + bl_entry->end = cpu_to_le64(bl->end); + + bch2_journal_pin_add(j, + journal_cur_seq(j), + &bl->pin, + journal_seq_blacklist_flush); + + j->new_blacklist = NULL; +} diff --git a/libbcachefs/journal_seq_blacklist.h b/libbcachefs/journal_seq_blacklist.h new file mode 100644 index 00000000..95ea6e90 --- /dev/null +++ b/libbcachefs/journal_seq_blacklist.h @@ -0,0 +1,13 @@ +#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H +#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H + +struct journal_replay; + +struct journal_seq_blacklist * +bch2_journal_seq_blacklist_find(struct journal *, u64); +int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *); +int bch2_journal_seq_blacklist_read(struct journal *, + struct journal_replay *); +void bch2_journal_seq_blacklist_write(struct journal *); + +#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index e39b18f2..a27e0548 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -59,8 +59,9 @@ struct blacklisted_node { struct journal_seq_blacklist { struct list_head list; - u64 seq; - bool written; + u64 start; + u64 end; + struct journal_entry_pin pin; struct blacklisted_node *entries; @@ -171,10 +172,11 @@ struct journal { u64 front, 
back, size, mask; struct journal_entry_pin_list *data; } pin; - struct journal_entry_pin_list *replay_pin_list; + u64 replay_journal_seq; struct mutex blacklist_lock; struct list_head seq_blacklist; + struct journal_seq_blacklist *new_blacklist; BKEY_PADDED(key); struct write_point wp; diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 87e6e80c..0431fb81 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -5,6 +5,7 @@ #include "buckets.h" #include "inode.h" #include "io.h" +#include "journal_reclaim.h" #include "move.h" #include "replicas.h" #include "super-io.h" @@ -22,7 +23,6 @@ struct moving_io { struct closure cl; bool read_completed; - unsigned read_dev; unsigned read_sectors; unsigned write_sectors; @@ -42,7 +42,7 @@ struct moving_context { struct list_head reads; /* in flight sectors: */ - atomic_t read_sectors[BCH_SB_MEMBERS_MAX]; + atomic_t read_sectors; atomic_t write_sectors; wait_queue_head_t wait; @@ -306,7 +306,8 @@ static void move_write(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); - if (likely(!io->rbio.bio.bi_status)) { + if (likely(!io->rbio.bio.bi_status && + !io->rbio.hole)) { bch2_migrate_read_done(&io->write, &io->rbio); atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); @@ -330,7 +331,7 @@ static void move_read_endio(struct bio *bio) struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); struct moving_context *ctxt = io->write.ctxt; - atomic_sub(io->read_sectors, &ctxt->read_sectors[io->read_dev]); + atomic_sub(io->read_sectors, &ctxt->read_sectors); io->read_completed = true; if (next_pending_write(ctxt)) @@ -376,7 +377,6 @@ static int bch2_move_extent(struct bch_fs *c, enum data_cmd data_cmd, struct data_opts data_opts) { - struct extent_pick_ptr pick; struct moving_io *io; const struct bch_extent_ptr *ptr; struct bch_extent_crc_unpacked crc; @@ -387,12 +387,8 @@ static int bch2_move_extent(struct bch_fs *c, atomic_read(&ctxt->write_sectors) < SECTORS_IN_FLIGHT_PER_DEVICE); - bch2_extent_pick_ptr(c, e.s_c, NULL, &pick); - if (IS_ERR_OR_NULL(pick.ca)) - return pick.ca ? 
PTR_ERR(pick.ca) : 0; - move_ctxt_wait_event(ctxt, - atomic_read(&ctxt->read_sectors[pick.ca->dev_idx]) < + atomic_read(&ctxt->read_sectors) < SECTORS_IN_FLIGHT_PER_DEVICE); /* write path might have to decompress data: */ @@ -406,8 +402,7 @@ static int bch2_move_extent(struct bch_fs *c, goto err; io->write.ctxt = ctxt; - io->read_dev = pick.ca->dev_idx; - io->read_sectors = pick.crc.uncompressed_size; + io->read_sectors = e.k->size; io->write_sectors = e.k->size; bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages); @@ -421,6 +416,7 @@ static int bch2_move_extent(struct bch_fs *c, io->rbio.opts = io_opts; bio_init(&io->rbio.bio, io->bi_inline_vecs, pages); + io->rbio.bio.bi_vcnt = pages; bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); io->rbio.bio.bi_iter.bi_size = sectors << 9; @@ -438,7 +434,7 @@ static int bch2_move_extent(struct bch_fs *c, trace_move_extent(e.k); - atomic_add(io->read_sectors, &ctxt->read_sectors[io->read_dev]); + atomic_add(io->read_sectors, &ctxt->read_sectors); list_add_tail(&io->list, &ctxt->reads); /* @@ -446,14 +442,15 @@ static int bch2_move_extent(struct bch_fs *c, * ctxt when doing wakeup */ closure_get(&ctxt->cl); - bch2_read_extent(c, &io->rbio, e, &pick, BCH_READ_NODECODE); + bch2_read_extent(c, &io->rbio, e.s_c, + BCH_READ_NODECODE| + BCH_READ_LAST_FRAGMENT); return 0; err_free_pages: bio_free_pages(&io->write.op.wbio.bio); err_free: kfree(io); err: - percpu_ref_put(&pick.ca->io_ref); trace_move_alloc_fail(e.k); return ret; } @@ -728,7 +725,7 @@ int bch2_data_job(struct bch_fs *c, switch (op.op) { case BCH_DATA_OP_REREPLICATE: stats->data_type = BCH_DATA_JOURNAL; - ret = bch2_journal_flush_device(&c->journal, -1); + ret = bch2_journal_flush_device_pins(&c->journal, -1); ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; ret = bch2_gc_btree_replicas(c) ?: ret; @@ -745,7 +742,7 @@ int bch2_data_job(struct bch_fs *c, return -EINVAL; stats->data_type = BCH_DATA_JOURNAL; - ret = bch2_journal_flush_device(&c->journal, op.migrate.dev); + ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret; ret = bch2_gc_btree_replicas(c) ?: ret; diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 05910c40..16b8cbfc 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -26,6 +26,8 @@ #include "inode.h" #include "io.h" #include "journal.h" +#include "journal_io.h" +#include "journal_reclaim.h" #include "keylist.h" #include "move.h" #include "migrate.h" @@ -396,9 +398,15 @@ err: static void bch2_fs_free(struct bch_fs *c) { +#define BCH_TIME_STAT(name) \ + bch2_time_stats_exit(&c->name##_time); + BCH_TIME_STATS() +#undef BCH_TIME_STAT + bch2_fs_quota_exit(c); bch2_fs_fsio_exit(c); bch2_fs_encryption_exit(c); + bch2_fs_io_exit(c); bch2_fs_btree_cache_exit(c); bch2_fs_journal_exit(&c->journal); bch2_io_clock_exit(&c->io_clock[WRITE]); @@ -407,10 +415,6 @@ static void bch2_fs_free(struct bch_fs *c) lg_lock_free(&c->usage_lock); free_percpu(c->usage_percpu); mempool_exit(&c->btree_bounce_pool); - mempool_exit(&c->bio_bounce_pages); - bioset_exit(&c->bio_write); - bioset_exit(&c->bio_read_split); - bioset_exit(&c->bio_read); bioset_exit(&c->btree_bio); mempool_exit(&c->btree_interior_update_pool); mempool_exit(&c->btree_reserve_pool); @@ -561,8 +565,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) init_rwsem(&c->gc_lock); -#define BCH_TIME_STAT(name, frequency_units, duration_units) \ - spin_lock_init(&c->name##_time.lock); 
+#define BCH_TIME_STAT(name) \ + bch2_time_stats_init(&c->name##_time); BCH_TIME_STATS() #undef BCH_TIME_STAT @@ -587,9 +591,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) seqcount_init(&c->gc_pos_lock); - c->copy_gc_enabled = 1; - c->rebalance_enabled = 1; - c->rebalance_percent = 10; + c->copy_gc_enabled = 1; + c->rebalance_enabled = 1; + c->rebalance_percent = 10; + c->promote_whole_extents = true; c->journal.write_time = &c->journal_write_time; c->journal.delay_time = &c->journal_delay_time; @@ -640,17 +645,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) max(offsetof(struct btree_read_bio, bio), offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || - bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), - BIOSET_NEED_BVECS) || - bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), - BIOSET_NEED_BVECS) || - bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), - BIOSET_NEED_BVECS) || - mempool_init_page_pool(&c->bio_bounce_pages, - max_t(unsigned, - c->opts.btree_node_size, - c->sb.encoded_extent_max) / - PAGE_SECTORS, 0) || !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) || lg_lock_init(&c->usage_lock) || mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || @@ -658,6 +652,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_io_clock_init(&c->io_clock[WRITE]) || bch2_fs_journal_init(&c->journal) || bch2_fs_btree_cache_init(c) || + bch2_fs_io_init(c) || bch2_fs_encryption_init(c) || bch2_fs_compress_init(c) || bch2_fs_fsio_init(c)) @@ -774,11 +769,11 @@ const char *bch2_fs_start(struct bch_fs *c) goto recovery_done; /* - * bch2_journal_start() can't happen sooner, or btree_gc_finish() + * bch2_fs_journal_start() can't happen sooner, or btree_gc_finish() * will give spurious errors about oldest_gen > bucket_gen - * this is a hack but oh well. 
*/ - bch2_journal_start(c); + bch2_fs_journal_start(&c->journal); err = "error starting allocator"; if (bch2_fs_allocator_start(c)) @@ -834,7 +829,7 @@ const char *bch2_fs_start(struct bch_fs *c) * journal_res_get() will crash if called before this has * set up the journal.pin FIFO and journal.cur pointer: */ - bch2_journal_start(c); + bch2_fs_journal_start(&c->journal); bch2_journal_set_replay_done(&c->journal); err = "error starting allocator"; @@ -993,6 +988,9 @@ static void bch2_dev_free(struct bch_dev *ca) bioset_exit(&ca->replica_set); bch2_dev_buckets_free(ca); + bch2_time_stats_exit(&ca->io_latency[WRITE]); + bch2_time_stats_exit(&ca->io_latency[READ]); + percpu_ref_exit(&ca->io_ref); percpu_ref_exit(&ca->ref); kobject_put(&ca->kobj); @@ -1089,6 +1087,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, INIT_WORK(&ca->io_error_work, bch2_io_error_work); + bch2_time_stats_init(&ca->io_latency[READ]); + bch2_time_stats_init(&ca->io_latency[WRITE]); + ca->mi = bch2_mi_to_cpu(member); ca->uuid = member->uuid; @@ -1421,7 +1422,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) goto err; } - ret = bch2_journal_flush_device(&c->journal, ca->dev_idx); + ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); if (ret) { bch_err(ca, "Remove failed: error %i flushing journal", ret); goto err; diff --git a/libbcachefs/super.h b/libbcachefs/super.h index a52ee3bb..231bc529 100644 --- a/libbcachefs/super.h +++ b/libbcachefs/super.h @@ -27,7 +27,26 @@ static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) static inline bool bch2_dev_is_online(struct bch_dev *ca) { - return ca->disk_sb.bdev != NULL; + return !percpu_ref_is_zero(&ca->io_ref); +} + +static inline bool bch2_dev_is_readable(struct bch_dev *ca) +{ + return bch2_dev_is_online(ca) && + ca->mi.state != BCH_MEMBER_STATE_FAILED; +} + +static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) +{ + if (!percpu_ref_tryget(&ca->io_ref)) + return false; + + if (ca->mi.state == BCH_MEMBER_STATE_RW || + (ca->mi.state == BCH_MEMBER_STATE_RO && rw == READ)) + return true; + + percpu_ref_put(&ca->io_ref); + return false; } static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index e8089db9..65345d80 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -141,11 +141,19 @@ read_attribute(btree_node_size); read_attribute(first_bucket); read_attribute(nbuckets); read_attribute(durability); -read_attribute(iostats); -read_attribute(last_read_quantiles); -read_attribute(last_write_quantiles); -read_attribute(fragmentation_quantiles); -read_attribute(oldest_gen_quantiles); +read_attribute(iodone); + +read_attribute(io_latency_read); +read_attribute(io_latency_write); +read_attribute(io_latency_stats_read); +read_attribute(io_latency_stats_write); +read_attribute(congested); + +read_attribute(bucket_quantiles_last_read); +read_attribute(bucket_quantiles_last_write); +read_attribute(bucket_quantiles_fragmentation); +read_attribute(bucket_quantiles_oldest_gen); + read_attribute(reserve_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); @@ -177,6 +185,7 @@ sysfs_pd_controller_attribute(copy_gc); rw_attribute(rebalance_enabled); rw_attribute(rebalance_percent); sysfs_pd_controller_attribute(rebalance); +rw_attribute(promote_whole_extents); rw_attribute(pd_controllers_update_seconds); @@ -189,8 +198,9 @@ read_attribute(data_replicas_have); BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM -#define 
BCH_TIME_STAT(name, frequency_units, duration_units) \ - sysfs_time_stats_attribute(name, frequency_units, duration_units); +#define BCH_TIME_STAT(_name) \ + static struct attribute sysfs_time_stat_##_name = \ + { .name = #_name, .mode = S_IRUGO }; BCH_TIME_STATS() #undef BCH_TIME_STAT @@ -332,9 +342,10 @@ SHOW(bch2_fs) sysfs_printf(rebalance_enabled, "%i", c->rebalance_enabled); sysfs_print(rebalance_percent, c->rebalance_percent); - sysfs_pd_controller_show(rebalance, &c->rebalance_pd); /* XXX */ + sysfs_print(promote_whole_extents, c->promote_whole_extents); + sysfs_printf(meta_replicas_have, "%u", bch2_replicas_online(c, true)); sysfs_printf(data_replicas_have, "%u", bch2_replicas_online(c, false)); @@ -406,6 +417,8 @@ STORE(__bch2_fs) sysfs_strtoul(rebalance_percent, c->rebalance_percent); sysfs_pd_controller_store(rebalance, &c->rebalance_pd); + sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); + /* Debugging: */ #define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name); @@ -462,6 +475,7 @@ struct attribute *bch2_fs_files[] = { &sysfs_journal_reclaim_delay_ms, &sysfs_rebalance_percent, + &sysfs_promote_whole_extents, &sysfs_compression_stats, NULL @@ -531,9 +545,16 @@ STORE(bch2_fs_opts_dir) struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); const struct bch_option *opt = container_of(attr, struct bch_option, attr); int ret, id = opt - bch2_opt_table; + char *tmp; u64 v; - ret = bch2_opt_parse(c, opt, buf, &v); + tmp = kstrdup(buf, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + ret = bch2_opt_parse(c, opt, strim(tmp), &v); + kfree(tmp); + if (ret < 0) return ret; @@ -592,9 +613,9 @@ SHOW(bch2_fs_time_stats) { struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); -#define BCH_TIME_STAT(name, frequency_units, duration_units) \ - sysfs_print_time_stats(&c->name##_time, name, \ - frequency_units, duration_units); +#define BCH_TIME_STAT(name) \ + if (attr == &sysfs_time_stat_##name) \ + return bch2_time_stats_print(&c->name##_time, buf, PAGE_SIZE); BCH_TIME_STATS() #undef BCH_TIME_STAT @@ -603,23 +624,15 @@ SHOW(bch2_fs_time_stats) STORE(bch2_fs_time_stats) { - struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); - -#define BCH_TIME_STAT(name, frequency_units, duration_units) \ - sysfs_clear_time_stats(&c->name##_time, name); - BCH_TIME_STATS() -#undef BCH_TIME_STAT - return size; } SYSFS_OPS(bch2_fs_time_stats); struct attribute *bch2_fs_time_stats_files[] = { -#define BCH_TIME_STAT(name, frequency_units, duration_units) \ - sysfs_time_stats_attribute_list(name, frequency_units, duration_units) +#define BCH_TIME_STAT(name) \ + &sysfs_time_stat_##name, BCH_TIME_STATS() #undef BCH_TIME_STAT - NULL }; @@ -774,7 +787,7 @@ static const char * const bch2_rw[] = { NULL }; -static ssize_t show_dev_iostats(struct bch_dev *ca, char *buf) +static ssize_t show_dev_iodone(struct bch_dev *ca, char *buf) { char *out = buf, *end = buf + PAGE_SIZE; int rw, i, cpu; @@ -851,16 +864,28 @@ SHOW(bch2_dev) return out - buf; } - if (attr == &sysfs_iostats) - return show_dev_iostats(ca, buf); + if (attr == &sysfs_iodone) + return show_dev_iodone(ca, buf); - if (attr == &sysfs_last_read_quantiles) + sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); + sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); + + if (attr == &sysfs_io_latency_stats_read) + return bch2_time_stats_print(&ca->io_latency[READ], buf, PAGE_SIZE); + if (attr == &sysfs_io_latency_stats_write) + return 
bch2_time_stats_print(&ca->io_latency[WRITE], buf, PAGE_SIZE); + + sysfs_printf(congested, "%u%%", + clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) + * 100 / CONGESTED_MAX); + + if (attr == &sysfs_bucket_quantiles_last_read) return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 0); - if (attr == &sysfs_last_write_quantiles) + if (attr == &sysfs_bucket_quantiles_last_write) return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 1); - if (attr == &sysfs_fragmentation_quantiles) + if (attr == &sysfs_bucket_quantiles_fragmentation) return show_quantiles(c, ca, buf, bucket_sectors_used_fn, NULL); - if (attr == &sysfs_oldest_gen_quantiles) + if (attr == &sysfs_bucket_quantiles_oldest_gen) return show_quantiles(c, ca, buf, bucket_oldest_gen_fn, NULL); if (attr == &sysfs_reserve_stats) @@ -944,13 +969,20 @@ struct attribute *bch2_dev_files[] = { &sysfs_label, &sysfs_has_data, - &sysfs_iostats, + &sysfs_iodone, + + &sysfs_io_latency_read, + &sysfs_io_latency_write, + &sysfs_io_latency_stats_read, + &sysfs_io_latency_stats_write, + &sysfs_congested, /* alloc info - other stats: */ - &sysfs_last_read_quantiles, - &sysfs_last_write_quantiles, - &sysfs_fragmentation_quantiles, - &sysfs_oldest_gen_quantiles, + &sysfs_bucket_quantiles_last_read, + &sysfs_bucket_quantiles_last_write, + &sysfs_bucket_quantiles_fragmentation, + &sysfs_bucket_quantiles_oldest_gen, + &sysfs_reserve_stats, /* debug: */ diff --git a/libbcachefs/util.c b/libbcachefs/util.c index fa853750..1f2c23b9 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -13,12 +13,15 @@ #include #include #include +#include +#include #include #include #include #include #include +#include "eytzinger.h" #include "util.h" #define simple_strtoint(c, end, base) simple_strtol(c, end, base) @@ -200,59 +203,189 @@ bool bch2_is_zero(const void *_p, size_t n) return true; } -void bch2_time_stats_clear(struct time_stats *stats) +void bch2_quantiles_update(struct quantiles *q, u64 v) { - spin_lock(&stats->lock); + unsigned i = 0; - stats->count = 0; - stats->last_duration = 0; - stats->max_duration = 0; - stats->average_duration = 0; - stats->average_frequency = 0; - stats->last = 0; + while (i < ARRAY_SIZE(q->entries)) { + struct quantile_entry *e = q->entries + i; - spin_unlock(&stats->lock); + if (unlikely(!e->step)) { + e->m = v; + e->step = max_t(unsigned, v / 2, 1024); + } else if (e->m > v) { + e->m = e->m >= e->step + ? e->m - e->step + : 0; + } else if (e->m < v) { + e->m = e->m + e->step > e->m + ? e->m + e->step + : U32_MAX; + } + + if ((e->m > v ? e->m - v : v - e->m) < e->step) + e->step = max_t(unsigned, e->step / 2, 1); + + if (v >= e->m) + break; + + i = eytzinger0_child(i, v > e->m); + } } -void __bch2_time_stats_update(struct time_stats *stats, u64 start_time) +/* time stats: */ + +static void bch2_time_stats_update_one(struct time_stats *stats, + u64 start, u64 end) { - u64 now, duration, last; + u64 duration, freq; + + duration = time_after64(end, start) + ? end - start : 0; + freq = time_after64(end, stats->last_event) + ? end - stats->last_event : 0; stats->count++; - now = local_clock(); - duration = time_after64(now, start_time) - ? now - start_time : 0; - last = time_after64(now, stats->last) - ? now - stats->last : 0; + stats->average_duration = stats->average_duration + ? ewma_add(stats->average_duration, duration, 6) + : duration; + + stats->average_frequency = stats->average_frequency + ? 
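/* the first event seeds the average, later ones decay into it */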
ewma_add(stats->average_frequency, freq, 6) + : freq; - stats->last_duration = duration; stats->max_duration = max(stats->max_duration, duration); - if (stats->last) { - stats->average_duration = ewma_add(stats->average_duration, - duration << 8, 3); + stats->last_event = end; - if (stats->average_frequency) - stats->average_frequency = - ewma_add(stats->average_frequency, - last << 8, 3); - else - stats->average_frequency = last << 8; + bch2_quantiles_update(&stats->quantiles, duration); +} + +void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end) +{ + unsigned long flags; + + if (!stats->buffer) { + spin_lock_irqsave(&stats->lock, flags); + bch2_time_stats_update_one(stats, start, end); + + if (stats->average_frequency < 32 && + stats->count > 1024) + stats->buffer = + alloc_percpu_gfp(struct time_stat_buffer, + GFP_ATOMIC); + spin_unlock_irqrestore(&stats->lock, flags); } else { - stats->average_duration = duration << 8; + struct time_stat_buffer_entry *i; + struct time_stat_buffer *b; + + preempt_disable(); + b = this_cpu_ptr(stats->buffer); + + BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); + b->entries[b->nr++] = (struct time_stat_buffer_entry) { + .start = start, + .end = end + }; + + if (b->nr == ARRAY_SIZE(b->entries)) { + spin_lock_irqsave(&stats->lock, flags); + for (i = b->entries; + i < b->entries + ARRAY_SIZE(b->entries); + i++) + bch2_time_stats_update_one(stats, i->start, i->end); + spin_unlock_irqrestore(&stats->lock, flags); + + b->nr = 0; + } + + preempt_enable(); + } +} + +static const struct time_unit { + const char *name; + u32 nsecs; +} time_units[] = { + { "ns", 1 }, + { "us", NSEC_PER_USEC }, + { "ms", NSEC_PER_MSEC }, + { "sec", NSEC_PER_SEC }, +}; + +static const struct time_unit *pick_time_units(u64 ns) +{ + const struct time_unit *u; + + for (u = time_units; + u + 1 < time_units + ARRAY_SIZE(time_units) && + ns >= u[1].nsecs << 1; + u++) + ; + + return u; +} + +static size_t pr_time_units(char *buf, size_t len, u64 ns) +{ + const struct time_unit *u = pick_time_units(ns); + + return scnprintf(buf, len, "%llu %s", div_u64(ns, u->nsecs), u->name); +} + +size_t bch2_time_stats_print(struct time_stats *stats, char *buf, size_t len) +{ + char *out = buf, *end = buf + len; + const struct time_unit *u; + u64 freq = READ_ONCE(stats->average_frequency); + u64 q, last_q = 0; + int i; + + out += scnprintf(out, end - out, "count:\t\t%llu\n", + stats->count); + out += scnprintf(out, end - out, "rate:\t\t%llu/sec\n", + freq ? div64_u64(NSEC_PER_SEC, freq) : 0); + + out += scnprintf(out, end - out, "frequency:\t"); + out += pr_time_units(out, end - out, freq); + + out += scnprintf(out, end - out, "\navg duration:\t"); + out += pr_time_units(out, end - out, stats->average_duration); + + out += scnprintf(out, end - out, "\nmax duration:\t"); + out += pr_time_units(out, end - out, stats->max_duration); + + i = eytzinger0_first(NR_QUANTILES); + u = pick_time_units(stats->quantiles.entries[i].m); + + out += scnprintf(out, end - out, "\nquantiles (%s):\t", u->name); + eytzinger0_for_each(i, NR_QUANTILES) { + bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; + + q = max(stats->quantiles.entries[i].m, last_q); + out += scnprintf(out, end - out, "%llu%s", + div_u64(q, u->nsecs), + is_last ? 
"\n" : " "); + last_q = q; } - stats->last = now ?: 1; + return out - buf; } -void bch2_time_stats_update(struct time_stats *stats, u64 start_time) +void bch2_time_stats_exit(struct time_stats *stats) { - spin_lock(&stats->lock); - __bch2_time_stats_update(stats, start_time); - spin_unlock(&stats->lock); + free_percpu(stats->buffer); } +void bch2_time_stats_init(struct time_stats *stats) +{ + memset(stats, 0, sizeof(*stats)); + spin_lock_init(&stats->lock); +} + +/* ratelimit: */ + /** * bch2_ratelimit_delay() - return how long to delay until the next time to do * some work @@ -310,6 +443,8 @@ int bch2_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *d) } } +/* pd controller: */ + /* * Updates pd_controller. Attempts to scale inputed values to units per second. * @target: desired value @@ -404,6 +539,8 @@ size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf) derivative, change, next_io); } +/* misc: */ + void bch2_bio_map(struct bio *bio, void *base) { size_t size = bio->bi_iter.bi_size; diff --git a/libbcachefs/util.h b/libbcachefs/util.h index cc89da1f..7c7264f4 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -371,87 +371,50 @@ ssize_t bch2_read_string_list(const char *, const char * const[]); ssize_t bch2_scnprint_flag_list(char *, size_t, const char * const[], u64); u64 bch2_read_flag_list(char *, const char * const[]); +#define NR_QUANTILES 15 +#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) +#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) +#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) + +struct quantiles { + struct quantile_entry { + u64 m; + u64 step; + } entries[NR_QUANTILES]; +}; + +struct time_stat_buffer { + unsigned nr; + struct time_stat_buffer_entry { + u64 start; + u64 end; + } entries[32]; +}; + struct time_stats { spinlock_t lock; u64 count; - /* - * all fields are in nanoseconds, averages are ewmas stored left shifted - * by 8 - */ - u64 last_duration; - u64 max_duration; + /* all fields are in nanoseconds */ u64 average_duration; u64 average_frequency; - u64 last; + u64 max_duration; + u64 last_event; + struct quantiles quantiles; + + struct time_stat_buffer __percpu *buffer; }; -void bch2_time_stats_clear(struct time_stats *stats); -void __bch2_time_stats_update(struct time_stats *stats, u64 time); -void bch2_time_stats_update(struct time_stats *stats, u64 time); +void __bch2_time_stats_update(struct time_stats *stats, u64, u64); -static inline unsigned local_clock_us(void) +static inline void bch2_time_stats_update(struct time_stats *stats, u64 start) { - return local_clock() >> 10; + __bch2_time_stats_update(stats, start, local_clock()); } -#define NSEC_PER_ns 1L -#define NSEC_PER_us NSEC_PER_USEC -#define NSEC_PER_ms NSEC_PER_MSEC -#define NSEC_PER_sec NSEC_PER_SEC +size_t bch2_time_stats_print(struct time_stats *, char *, size_t); -#define __print_time_stat(stats, name, stat, units) \ - sysfs_print(name ## _ ## stat ## _ ## units, \ - div_u64((stats)->stat >> 8, NSEC_PER_ ## units)) - -#define sysfs_print_time_stats(stats, name, \ - frequency_units, \ - duration_units) \ -do { \ - __print_time_stat(stats, name, \ - average_frequency, frequency_units); \ - __print_time_stat(stats, name, \ - average_duration, duration_units); \ - sysfs_print(name ## _ ##count, (stats)->count); \ - sysfs_print(name ## _ ##last_duration ## _ ## duration_units, \ - div_u64((stats)->last_duration, \ - NSEC_PER_ ## duration_units)); \ - sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \ - 
div_u64((stats)->max_duration, \ - NSEC_PER_ ## duration_units)); \ - \ - sysfs_print(name ## _last_ ## frequency_units, (stats)->last \ - ? div_s64(local_clock() - (stats)->last, \ - NSEC_PER_ ## frequency_units) \ - : -1LL); \ -} while (0) - -#define sysfs_clear_time_stats(stats, name) \ -do { \ - if (attr == &sysfs_ ## name ## _clear) \ - bch2_time_stats_clear(stats); \ -} while (0) - -#define sysfs_time_stats_attribute(name, \ - frequency_units, \ - duration_units) \ -write_attribute(name ## _clear); \ -read_attribute(name ## _count); \ -read_attribute(name ## _average_frequency_ ## frequency_units); \ -read_attribute(name ## _average_duration_ ## duration_units); \ -read_attribute(name ## _last_duration_ ## duration_units); \ -read_attribute(name ## _max_duration_ ## duration_units); \ -read_attribute(name ## _last_ ## frequency_units) - -#define sysfs_time_stats_attribute_list(name, \ - frequency_units, \ - duration_units) \ -&sysfs_ ## name ## _clear, \ -&sysfs_ ## name ## _count, \ -&sysfs_ ## name ## _average_frequency_ ## frequency_units, \ -&sysfs_ ## name ## _average_duration_ ## duration_units, \ -&sysfs_ ## name ## _last_duration_ ## duration_units, \ -&sysfs_ ## name ## _max_duration_ ## duration_units, \ -&sysfs_ ## name ## _last_ ## frequency_units, +void bch2_time_stats_exit(struct time_stats *); +void bch2_time_stats_init(struct time_stats *); #define ewma_add(ewma, val, weight) \ ({ \