diff --git a/.bcachefs_revision b/.bcachefs_revision index 29c360d6..e53d4d84 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -78c6c8127e21fe2c8bf5c1d6a5e6832e28136f8f +63bbe0ca416791095c994aba7bea388e947dd60a diff --git a/c_src/cmd_migrate.c b/c_src/cmd_migrate.c index a5b7786d..bce115b7 100644 --- a/c_src/cmd_migrate.c +++ b/c_src/cmd_migrate.c @@ -31,9 +31,6 @@ #include "libbcachefs/replicas.h" #include "libbcachefs/super.h" -/* XXX cut and pasted from fsck.c */ -#define QSTR(n) { { { .len = strlen(n) } }, .name = n } - static char *dev_t_to_path(dev_t dev) { char link[PATH_MAX], *p; diff --git a/c_src/posix_to_bcachefs.c b/c_src/posix_to_bcachefs.c index d4701263..9a681f7b 100644 --- a/c_src/posix_to_bcachefs.c +++ b/c_src/posix_to_bcachefs.c @@ -1,5 +1,6 @@ #include <dirent.h> #include <sys/xattr.h> +#include <linux/dcache.h> #include <linux/xattr.h> #include "posix_to_bcachefs.h" @@ -158,7 +159,7 @@ static void write_data(struct bch_fs *c, op.nr_replicas = 1; op.subvol = 1; op.pos = SPOS(dst_inode->bi_inum, dst_offset >> 9, U32_MAX); - op.flags |= BCH_WRITE_SYNC; + op.flags |= BCH_WRITE_sync; int ret = bch2_disk_reservation_get(c, &op.res, len >> 9, c->opts.data_replicas, 0); @@ -167,7 +168,7 @@ static void write_data(struct bch_fs *c, closure_call(&op.cl, bch2_write, NULL, NULL); - BUG_ON(!(op.flags & BCH_WRITE_SUBMITTED)); + BUG_ON(!(op.flags & BCH_WRITE_submitted)); dst_inode->bi_sectors += len >> 9; if (op.error) diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 7637854d..f395ce7f 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -9,4 +9,7 @@ struct dentry { struct inode *d_inode; }; +#define QSTR_INIT(n,l) { { { .len = l } }, .name = n } +#define QSTR(n) (struct qstr)QSTR_INIT(n, strlen(n)) + #endif /* __LINUX_DCACHE_H */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 8f1cbc2d..2e2406dc 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -11,6 +11,7 @@ #include <linux/bug.h> #include <linux/byteorder.h> #include <linux/compiler.h> +#include <linux/dcache.h> #include <linux/math.h> #include <linux/minmax.h> diff --git a/include/linux/sched.h b/include/linux/sched.h index 153bd73d..ef5f9c49 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -97,6 +97,11 @@ struct task_struct { struct signal_struct { struct rw_semaphore exec_update_lock; } *signal, _signal; + + struct { + u64 sum_exec_runtime; + u64 exec_start; + } se; }; extern __thread struct task_struct *current; diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index fc2ef33b..43c29b0d 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -1803,7 +1803,6 @@ struct discard_buckets_state { u64 open; u64 need_journal_commit; u64 discarded; - u64 need_journal_commit_this_dev; }; static int bch2_discard_one_bucket(struct btree_trans *trans, @@ -1827,11 +1826,11 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; } - if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, - pos.inode, pos.offset)) { - s->need_journal_commit++; - s->need_journal_commit_this_dev++; + u64 seq_ready = bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal, + pos.inode, pos.offset); + if (seq_ready > c->journal.flushed_seq_ondisk) { + if (seq_ready > c->journal.flushing_seq) + s->need_journal_commit++; goto out; } @@ -1865,23 +1864,24 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, discard_locked = true; } - if (!bkey_eq(*discard_pos_done, iter.pos) && - ca->mi.discard && !c->opts.nochanges) { - /* - * This works without any other locks because this is the only - * thread that removes items from the need_discard tree - */ - bch2_trans_unlock_long(trans); - blkdev_issue_discard(ca->disk_sb.bdev, - k.k->p.offset * ca->mi.bucket_size, - ca->mi.bucket_size, - GFP_KERNEL); - *discard_pos_done = iter.pos; + if (!bkey_eq(*discard_pos_done, iter.pos)) { s->discarded++; + *discard_pos_done = iter.pos; - ret = bch2_trans_relock_notrace(trans); - if (ret) - goto out; + if (ca->mi.discard && !c->opts.nochanges) { + /* + * This works without any other locks because this is the only + * thread that removes items from the need_discard tree + */ + bch2_trans_unlock_long(trans); + blkdev_issue_discard(ca->disk_sb.bdev, + k.k->p.offset * ca->mi.bucket_size, + ca->mi.bucket_size, + GFP_KERNEL); + ret = bch2_trans_relock_notrace(trans); + if (ret) + goto out; + } } SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); @@ -1897,7 +1897,10 @@ commit: if (ret) goto out; - count_event(c, bucket_discard); + if (!fastpath) + count_event(c, bucket_discard); + else + count_event(c, bucket_discard_fast); out: fsck_err: if (discard_locked) @@ -1929,6 +1932,9 @@ static void bch2_do_discards_work(struct work_struct *work) POS(ca->dev_idx, U64_MAX), 0, k, bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false))); + if (s.need_journal_commit > dev_buckets_available(ca, BCH_WATERMARK_normal)) + bch2_journal_flush_async(&c->journal, NULL); + trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); @@ -2024,7 +2030,7 @@ static void bch2_do_discards_fast_work(struct work_struct *work) break; } - trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); + trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); bch2_trans_put(trans); percpu_ref_put(&ca->io_ref); diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index 6df41c33..1759c15a 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -179,23 +179,6 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) closure_wake_up(&c->freelist_wait); } -static inline unsigned open_buckets_reserved(enum bch_watermark watermark) -{ - switch (watermark) { - case BCH_WATERMARK_interior_updates: - return 0; - case BCH_WATERMARK_reclaim: - return OPEN_BUCKETS_COUNT / 6; - case BCH_WATERMARK_btree: - case BCH_WATERMARK_btree_copygc: - return OPEN_BUCKETS_COUNT / 4; - case BCH_WATERMARK_copygc: - return OPEN_BUCKETS_COUNT / 3; - default: - return OPEN_BUCKETS_COUNT / 2; - } -} - static inline bool may_alloc_bucket(struct bch_fs *c, struct bpos bucket, struct bucket_alloc_state *s) @@ -205,8 +188,12 @@ static inline bool may_alloc_bucket(struct bch_fs *c, return false; } - if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, bucket.inode, bucket.offset)) { + u64 journal_seq_ready = + bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal, + bucket.inode, bucket.offset); + if (journal_seq_ready > c->journal.flushed_seq_ondisk) { + if (journal_seq_ready > c->journal.flushing_seq) + s->need_journal_commit++; s->skipped_need_journal_commit++; return false; } @@ -235,7 +222,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * spin_lock(&c->freelist_lock); - if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) { + if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark))) { if (cl) closure_wait(&c->open_buckets_wait, cl); @@ -570,7 +557,7 @@ alloc: ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl) : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl); - if (s.skipped_need_journal_commit * 2 > avail) + if (s.need_journal_commit * 2 > avail) bch2_journal_flush_async(&c->journal, NULL); if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) { @@ -724,7 +711,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, struct bch_dev_usage usage; struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, - cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); + cl, flags & BCH_WRITE_alloc_nowait, &usage); if (!IS_ERR(ob)) bch2_dev_stripe_increment_inlined(ca, stripe, &usage); bch2_dev_put(ca); @@ -1332,7 +1319,7 @@ retry: if (wp->data_type != BCH_DATA_user) have_cache = true; - if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { + if (target && !(flags & BCH_WRITE_only_specified_devs)) { ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, target, erasure_code, nr_replicas, &nr_effective, @@ -1422,7 +1409,7 @@ err: if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) ret = -BCH_ERR_bucket_alloc_blocked; - if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) && + if (cl && !(flags & BCH_WRITE_alloc_nowait) && bch2_err_matches(ret, BCH_ERR_freelist_empty)) ret = -BCH_ERR_bucket_alloc_blocked; diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h index f25481a0..baf5dc16 100644 --- a/libbcachefs/alloc_foreground.h +++ b/libbcachefs/alloc_foreground.h @@ -33,6 +33,23 @@ static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) return bch2_dev_have_ref(c, ob->dev); } +static inline unsigned bch2_open_buckets_reserved(enum bch_watermark watermark) +{ + switch (watermark) { + case BCH_WATERMARK_interior_updates: + return 0; + case BCH_WATERMARK_reclaim: + return OPEN_BUCKETS_COUNT / 6; + case BCH_WATERMARK_btree: + case BCH_WATERMARK_btree_copygc: + return OPEN_BUCKETS_COUNT / 4; + case BCH_WATERMARK_copygc: + return OPEN_BUCKETS_COUNT / 3; + default: + return OPEN_BUCKETS_COUNT / 2; + } +} + struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum bch_watermark, enum bch_data_type, struct closure *); diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h index 9bbb28e9..8f79f46c 100644 --- a/libbcachefs/alloc_types.h +++ b/libbcachefs/alloc_types.h @@ -18,6 +18,7 @@ struct bucket_alloc_state { u64 buckets_seen; u64 skipped_open; u64 skipped_need_journal_commit; + u64 need_journal_commit; u64 skipped_nocow; u64 skipped_nouse; u64 skipped_mi_btree_bitmap; @@ -89,6 +90,7 @@ struct dev_stripe_state { x(stopped) \ x(waiting_io) \ x(waiting_work) \ + x(runnable) \ x(running) enum write_point_state { @@ -124,6 +126,7 @@ struct write_point { enum write_point_state state; u64 last_state_change; u64 time[WRITE_POINT_STATE_NR]; + u64 last_runtime; } __aligned(SMP_CACHE_BYTES); }; diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c index ebeb6a5f..655be233 100644 --- a/libbcachefs/backpointers.c +++ b/libbcachefs/backpointers.c @@ -244,27 +244,31 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c))) return bkey_s_c_null; - if (likely(!bp.v->level)) { - bch2_trans_node_iter_init(trans, iter, - bp.v->btree_id, - bp.v->pos, - 0, 0, - iter_flags); - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); - if (bkey_err(k)) { - bch2_trans_iter_exit(trans, iter); - return k; - } - - if (k.k && - extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) - return k; - + bch2_trans_node_iter_init(trans, iter, + bp.v->btree_id, + bp.v->pos, + 0, + bp.v->level, + iter_flags); + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + if (bkey_err(k)) { bch2_trans_iter_exit(trans, iter); + return k; + } + + if (k.k && + extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) + return k; + + bch2_trans_iter_exit(trans, iter); + + if (!bp.v->level) { int ret = backpointer_target_not_found(trans, bp, k, last_flushed); return ret ? bkey_s_c_err(ret) : bkey_s_c_null; } else { struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed); + if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node)) + return bkey_s_c_null; if (IS_ERR_OR_NULL(b)) return ((struct bkey_s_c) { .k = ERR_CAST(b) }); diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 161cf2f0..13acfbf3 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -687,7 +687,8 @@ struct btree_trans_buf { x(gc_gens) \ x(snapshot_delete_pagecache) \ x(sysfs) \ - x(btree_write_buffer) + x(btree_write_buffer) \ + x(btree_node_scrub) enum bch_write_ref { #define x(n) BCH_WRITE_REF_##n, diff --git a/libbcachefs/bcachefs_ioctl.h b/libbcachefs/bcachefs_ioctl.h index 3c23bdf7..e8a89d37 100644 --- a/libbcachefs/bcachefs_ioctl.h +++ b/libbcachefs/bcachefs_ioctl.h @@ -87,6 +87,7 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline) #define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online) #define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting) +#define BCH_IOCTL_QUERY_COUNTERS _IOW(0xbc, 21, struct bch_ioctl_query_counters) /* ioctl below act on a particular file, not the filesystem as a whole: */ @@ -213,6 +214,10 @@ struct bch_ioctl_data { struct bpos end_pos; union { + struct { + __u32 dev; + __u32 data_types; + } scrub; struct { __u32 dev; __u32 pad; @@ -237,11 +242,19 @@ struct bch_ioctl_data_progress { __u64 sectors_done; __u64 sectors_total; + __u64 sectors_error_corrected; + __u64 sectors_error_uncorrected; } __packed __aligned(8); +enum bch_ioctl_data_event_ret { + BCH_IOCTL_DATA_EVENT_RET_done = 1, + BCH_IOCTL_DATA_EVENT_RET_device_offline = 2, +}; + struct bch_ioctl_data_event { __u8 type; - __u8 pad[7]; + __u8 ret; + __u8 pad[6]; union { struct bch_ioctl_data_progress p; __u64 pad2[15]; @@ -443,4 +456,13 @@ struct bch_ioctl_query_accounting { struct bkey_i_accounting accounting[]; }; +#define BCH_IOCTL_QUERY_COUNTERS_MOUNT (1 << 0) + +struct bch_ioctl_query_counters { + __u16 nr; + __u16 flags; + __u32 pad; + __u64 d[]; +}; + #endif /* _BCACHEFS_IOCTL_H */ diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 672ca2c1..ca755e8d 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -24,7 +24,10 @@ do { \ } while (0) const char * const bch2_btree_node_flags[] = { -#define x(f) #f, + "typebit", + "typebit", + "typebit", +#define x(f) [BTREE_NODE_##f] = #f, BTREE_FLAGS() #undef x NULL diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index e371e60e..13ab827d 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "bkey_methods.h" #include "bkey_sort.h" #include "btree_cache.h" @@ -1352,7 +1353,7 @@ start: can_retry = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), - &failed, &rb->pick) > 0; + &failed, &rb->pick, -1) > 0; if (!bio->bi_status && !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) { @@ -1697,7 +1698,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, return; ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), - NULL, &pick); + NULL, &pick, -1); if (ret <= 0) { struct printbuf buf = PRINTBUF; @@ -1811,6 +1812,190 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level)); } +struct btree_node_scrub { + struct bch_fs *c; + struct bch_dev *ca; + void *buf; + bool used_mempool; + unsigned written; + + enum btree_id btree; + unsigned level; + struct bkey_buf key; + __le64 seq; + + struct work_struct work; + struct bio bio; +}; + +static bool btree_node_scrub_check(struct bch_fs *c, struct btree_node *data, unsigned ptr_written, + struct printbuf *err) +{ + unsigned written = 0; + + if (le64_to_cpu(data->magic) != bset_magic(c)) { + prt_printf(err, "bad magic: want %llx, got %llx", + bset_magic(c), le64_to_cpu(data->magic)); + return false; + } + + while (written < (ptr_written ?: btree_sectors(c))) { + struct btree_node_entry *bne; + struct bset *i; + bool first = !written; + + if (first) { + bne = NULL; + i = &data->keys; + } else { + bne = (void *) data + (written << 9); + i = &bne->keys; + + if (!ptr_written && i->seq != data->keys.seq) + break; + } + + struct nonce nonce = btree_nonce(i, written << 9); + bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)); + + if (first) { + if (good_csum_type) { + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, data); + if (bch2_crc_cmp(data->csum, csum)) { + bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), data->csum, csum); + return false; + } + } + + written += vstruct_sectors(data, c->block_bits); + } else { + if (good_csum_type) { + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + if (bch2_crc_cmp(bne->csum, csum)) { + bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), bne->csum, csum); + return false; + } + } + + written += vstruct_sectors(bne, c->block_bits); + } + } + + return true; +} + +static void btree_node_scrub_work(struct work_struct *work) +{ + struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work); + struct bch_fs *c = scrub->c; + struct printbuf err = PRINTBUF; + + __bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level, + bkey_i_to_s_c(scrub->key.k)); + prt_newline(&err); + + if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) { + struct btree_trans *trans = bch2_trans_get(c); + + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, scrub->btree, + scrub->key.k->k.p, 0, scrub->level - 1, 0); + + struct btree *b; + int ret = lockrestart_do(trans, PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(&iter))); + if (ret) + goto err; + + if (bkey_i_to_btree_ptr_v2(&b->key)->v.seq == scrub->seq) { + bch_err(c, "error validating btree node during scrub on %s at btree %s", + scrub->ca->name, err.buf); + + ret = bch2_btree_node_rewrite(trans, &iter, b, 0); + } +err: + bch2_trans_iter_exit(trans, &iter); + bch2_trans_begin(trans); + bch2_trans_put(trans); + } + + printbuf_exit(&err); + bch2_bkey_buf_exit(&scrub->key, c);; + btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf); + percpu_ref_put(&scrub->ca->io_ref); + kfree(scrub); + bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub); +} + +static void btree_node_scrub_endio(struct bio *bio) +{ + struct btree_node_scrub *scrub = container_of(bio, struct btree_node_scrub, bio); + + queue_work(scrub->c->btree_read_complete_wq, &scrub->work); +} + +int bch2_btree_node_scrub(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c k, unsigned dev) +{ + if (k.k->type != KEY_TYPE_btree_ptr_v2) + return 0; + + struct bch_fs *c = trans->c; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_node_scrub)) + return -BCH_ERR_erofs_no_writes; + + struct extent_ptr_decoded pick; + int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev); + if (ret <= 0) + goto err; + + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + if (!ca) { + ret = -BCH_ERR_device_offline; + goto err; + } + + bool used_mempool = false; + void *buf = btree_bounce_alloc(c, c->opts.btree_node_size, &used_mempool); + + unsigned vecs = buf_pages(buf, c->opts.btree_node_size); + + struct btree_node_scrub *scrub = + kzalloc(sizeof(*scrub) + sizeof(struct bio_vec) * vecs, GFP_KERNEL); + if (!scrub) { + ret = -ENOMEM; + goto err_free; + } + + scrub->c = c; + scrub->ca = ca; + scrub->buf = buf; + scrub->used_mempool = used_mempool; + scrub->written = btree_ptr_sectors_written(k); + + scrub->btree = btree; + scrub->level = level; + bch2_bkey_buf_init(&scrub->key); + bch2_bkey_buf_reassemble(&scrub->key, c, k); + scrub->seq = bkey_s_c_to_btree_ptr_v2(k).v->seq; + + INIT_WORK(&scrub->work, btree_node_scrub_work); + + bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ); + bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size); + scrub->bio.bi_iter.bi_sector = pick.ptr.offset; + scrub->bio.bi_end_io = btree_node_scrub_endio; + submit_bio(&scrub->bio); + return 0; +err_free: + btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf); + percpu_ref_put(&ca->io_ref); +err: + bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub); + return ret; +} + static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, struct btree_write *w) { diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index 6f9e4a6d..75ead381 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -132,6 +132,9 @@ void bch2_btree_node_read(struct btree_trans *, struct btree *, bool); int bch2_btree_root_read(struct bch_fs *, enum btree_id, const struct bkey_i *, unsigned); +int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, unsigned); + bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); enum btree_write_flags { diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 367231ab..5988219c 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -2239,8 +2239,6 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos if (unlikely(ret)) return bkey_s_c_err(ret); - btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); - k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u); if (!k.k) return k; @@ -2251,6 +2249,7 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos iter->k = u; k.k = &iter->k; + btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); return k; } diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index 3b62296c..1821f40c 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -291,8 +291,10 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, struct btree_path *ck_path, unsigned flags) { - if (flags & BTREE_ITER_cached_nofill) + if (flags & BTREE_ITER_cached_nofill) { + ck_path->l[0].b = NULL; return 0; + } struct bch_fs *c = trans->c; struct btree_iter iter; @@ -746,7 +748,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) rcu_read_unlock(); mutex_lock(&bc->table.mutex); mutex_unlock(&bc->table.mutex); - rcu_read_lock(); continue; } for (i = 0; i < tbl->size; i++) diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c index 6b79b672..2760dd95 100644 --- a/libbcachefs/btree_trans_commit.c +++ b/libbcachefs/btree_trans_commit.c @@ -348,7 +348,7 @@ static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, unsigned flags) { return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, - trans->journal_u64s, flags); + trans->journal_u64s, flags, trans); } #define JSET_ENTRY_LOG_U64s 4 diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index f4aeadbe..ab111fec 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -2189,6 +2189,26 @@ err: goto out; } +int bch2_btree_node_rewrite_key(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bpos pos, unsigned flags) +{ + BUG_ON(!level); + + /* Traverse one depth lower to get a pointer to the node itself: */ + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0); + struct btree *b = bch2_btree_iter_peek_node(&iter); + int ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto err; + + ret = bch2_btree_node_rewrite(trans, &iter, b, flags); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + struct async_btree_rewrite { struct bch_fs *c; struct work_struct work; diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index 7930ffea..fa5a88f9 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -169,7 +169,11 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, struct btree *, unsigned); +int bch2_btree_node_rewrite_key(struct btree_trans *, + enum btree_id, unsigned, + struct bpos, unsigned); void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); + int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, struct btree *, struct bkey_i *, unsigned, bool); diff --git a/libbcachefs/buckets_waiting_for_journal.c b/libbcachefs/buckets_waiting_for_journal.c index f9fb150e..c8a488e6 100644 --- a/libbcachefs/buckets_waiting_for_journal.c +++ b/libbcachefs/buckets_waiting_for_journal.c @@ -22,23 +22,21 @@ static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_ memset(t->d, 0, sizeof(t->d[0]) << t->bits); } -bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, - u64 flushed_seq, - unsigned dev, u64 bucket) +u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *b, + unsigned dev, u64 bucket) { struct buckets_waiting_for_journal_table *t; u64 dev_bucket = (u64) dev << 56 | bucket; - bool ret = false; - unsigned i; + u64 ret = 0; mutex_lock(&b->lock); t = b->t; - for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { struct bucket_hashed *h = bucket_hash(t, i, dev_bucket); if (h->dev_bucket == dev_bucket) { - ret = h->journal_seq > flushed_seq; + ret = h->journal_seq; break; } } diff --git a/libbcachefs/buckets_waiting_for_journal.h b/libbcachefs/buckets_waiting_for_journal.h index d2ae19cb..365619ca 100644 --- a/libbcachefs/buckets_waiting_for_journal.h +++ b/libbcachefs/buckets_waiting_for_journal.h @@ -4,8 +4,8 @@ #include "buckets_waiting_for_journal_types.h" -bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, - u64, unsigned, u64); +u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *, + unsigned, u64); int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, u64, unsigned, u64, u64); diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index 46e9e321..b38a3c6f 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -11,6 +11,7 @@ #include "move.h" #include "recovery_passes.h" #include "replicas.h" +#include "sb-counters.h" #include "super-io.h" #include "thread_with_file.h" @@ -312,7 +313,10 @@ static int bch2_data_thread(void *arg) struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr); ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); - ctx->stats.data_type = U8_MAX; + if (ctx->thr.ret == -BCH_ERR_device_offline) + ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline; + else + ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done; return 0; } @@ -331,14 +335,30 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); struct bch_fs *c = ctx->c; struct bch_ioctl_data_event e = { - .type = BCH_DATA_EVENT_PROGRESS, - .p.data_type = ctx->stats.data_type, - .p.btree_id = ctx->stats.pos.btree, - .p.pos = ctx->stats.pos.pos, - .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), - .p.sectors_total = bch2_fs_usage_read_short(c).used, + .type = BCH_DATA_EVENT_PROGRESS, + .ret = ctx->stats.ret, + .p.data_type = ctx->stats.data_type, + .p.btree_id = ctx->stats.pos.btree, + .p.pos = ctx->stats.pos.pos, + .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), + .p.sectors_error_corrected = atomic64_read(&ctx->stats.sectors_error_corrected), + .p.sectors_error_uncorrected = atomic64_read(&ctx->stats.sectors_error_uncorrected), }; + if (ctx->arg.op == BCH_DATA_OP_scrub) { + struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev); + if (ca) { + struct bch_dev_usage u; + bch2_dev_usage_read_fast(ca, &u); + for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++) + if (ctx->arg.scrub.data_types & BIT(i)) + e.p.sectors_total += u.d[i].sectors; + bch2_dev_put(ca); + } + } else { + e.p.sectors_total = bch2_fs_usage_read_short(c).used; + } + if (len < sizeof(e)) return -EINVAL; @@ -710,6 +730,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online); case BCH_IOCTL_QUERY_ACCOUNTING: return bch2_ioctl_query_accounting(c, arg); + case BCH_IOCTL_QUERY_COUNTERS: + return bch2_ioctl_query_counters(c, arg); default: return -ENOTTY; } diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c index f99ff181..114bf2f3 100644 --- a/libbcachefs/compress.c +++ b/libbcachefs/compress.c @@ -4,6 +4,7 @@ #include "compress.h" #include "error.h" #include "extents.h" +#include "io_write.h" #include "opts.h" #include "super-io.h" @@ -254,11 +255,14 @@ err: goto out; } -int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, - struct bch_extent_crc_unpacked *crc) +int bch2_bio_uncompress_inplace(struct bch_write_op *op, + struct bio *bio) { + struct bch_fs *c = op->c; + struct bch_extent_crc_unpacked *crc = &op->crc; struct bbuf data = { NULL }; size_t dst_len = crc->uncompressed_size << 9; + int ret = 0; /* bio must own its pages: */ BUG_ON(!bio->bi_vcnt); @@ -266,17 +270,26 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || crc->compressed_size << 9 > c->opts.encoded_extent_max) { - bch_err(c, "error rewriting existing data: extent too big"); + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "error rewriting existing data: extent too big"); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); return -EIO; } data = __bounce_alloc(c, dst_len, WRITE); if (__bio_uncompress(c, bio, data.b, *crc)) { - if (!c->opts.no_data_io) - bch_err(c, "error rewriting existing data: decompression error"); - bio_unmap_or_unbounce(c, data); - return -EIO; + if (!c->opts.no_data_io) { + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "error rewriting existing data: decompression error"); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + } + ret = -EIO; + goto err; } /* @@ -293,9 +306,9 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, crc->uncompressed_size = crc->live_size; crc->offset = 0; crc->csum = (struct bch_csum) { 0, 0 }; - +err: bio_unmap_or_unbounce(c, data); - return 0; + return ret; } int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, diff --git a/libbcachefs/compress.h b/libbcachefs/compress.h index 607fd5e2..bec2f05b 100644 --- a/libbcachefs/compress.h +++ b/libbcachefs/compress.h @@ -47,8 +47,8 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; } -int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, - struct bch_extent_crc_unpacked *); +struct bch_write_op; +int bch2_bio_uncompress_inplace(struct bch_write_op *, struct bio *); int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, struct bvec_iter, struct bch_extent_crc_unpacked); unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c index 19ee424c..0c2f9e52 100644 --- a/libbcachefs/data_update.c +++ b/libbcachefs/data_update.c @@ -20,6 +20,8 @@ #include "subvolume.h" #include "trace.h" +#include <linux/ioprio.h> + static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -33,7 +35,7 @@ static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k) struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr) { - if (!bch2_dev_tryget(c, ptr->dev)) { + if (unlikely(!bch2_dev_tryget(c, ptr->dev))) { bkey_for_each_ptr(ptrs, ptr2) { if (ptr2 == ptr) break; @@ -91,15 +93,28 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc return true; } -static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k) +static noinline void trace_move_extent_finish2(struct data_update *u, + struct bkey_i *new, + struct bkey_i *insert) { - if (trace_move_extent_finish_enabled()) { - struct printbuf buf = PRINTBUF; + struct bch_fs *c = u->op.c; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, k); - trace_move_extent_finish(c, buf.buf); - printbuf_exit(&buf); - } + prt_newline(&buf); + + bch2_data_update_to_text(&buf, u); + prt_newline(&buf); + + prt_str_indented(&buf, "new replicas:\t"); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); + prt_newline(&buf); + + prt_str_indented(&buf, "insert:\t"); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + prt_newline(&buf); + + trace_move_extent_finish(c, buf.buf); + printbuf_exit(&buf); } static void trace_move_extent_fail2(struct data_update *m, @@ -372,7 +387,8 @@ restart_drop_extra_replicas: bch2_btree_iter_set_pos(&iter, next_pos); this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); - trace_move_extent_finish2(c, bkey_i_to_s_c(&new->k_i)); + if (trace_move_extent_finish_enabled()) + trace_move_extent_finish2(m, &new->k_i, insert); } err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -412,14 +428,17 @@ int bch2_data_update_index_update(struct bch_write_op *op) return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op)); } -void bch2_data_update_read_done(struct data_update *m, - struct bch_extent_crc_unpacked crc) +void bch2_data_update_read_done(struct data_update *m) { + m->read_done = true; + /* write bio must own pages: */ BUG_ON(!m->op.wbio.bio.bi_vcnt); - m->op.crc = crc; - m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9; + m->op.crc = m->rbio.pick.crc; + m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; + + this_cpu_add(m->op.c->counters[BCH_COUNTER_move_extent_write], m->k.k->k.size); closure_call(&m->op.cl, bch2_write, NULL, NULL); } @@ -429,31 +448,34 @@ void bch2_data_update_exit(struct data_update *update) struct bch_fs *c = update->op.c; struct bkey_s_c k = bkey_i_to_s_c(update->k.k); + bch2_bio_free_pages_pool(c, &update->op.wbio.bio); + kfree(update->bvecs); + update->bvecs = NULL; + if (c->opts.nocow_enabled) bkey_nocow_unlock(c, k); bkey_put_dev_refs(c, k); - bch2_bkey_buf_exit(&update->k, c); bch2_disk_reservation_put(c, &update->op.res); - bch2_bio_free_pages_pool(c, &update->op.wbio.bio); + bch2_bkey_buf_exit(&update->k, c); } -static void bch2_update_unwritten_extent(struct btree_trans *trans, - struct data_update *update) +static int bch2_update_unwritten_extent(struct btree_trans *trans, + struct data_update *update) { struct bch_fs *c = update->op.c; - struct bio *bio = &update->op.wbio.bio; struct bkey_i_extent *e; struct write_point *wp; struct closure cl; struct btree_iter iter; struct bkey_s_c k; - int ret; + int ret = 0; closure_init_stack(&cl); bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys); - while (bio_sectors(bio)) { - unsigned sectors = bio_sectors(bio); + while (bpos_lt(update->op.pos, update->k.k->k.p)) { + unsigned sectors = update->k.k->k.p.offset - + update->op.pos.offset; bch2_trans_begin(trans); @@ -489,7 +511,7 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, bch_err_fn_ratelimited(c, ret); if (ret) - return; + break; sectors = min(sectors, wp->sectors_free); @@ -499,7 +521,6 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); bch2_alloc_sectors_done(c, wp); - bio_advance(bio, sectors << 9); update->op.pos.offset += sectors; extent_for_each_ptr(extent_i_to_s(e), ptr) @@ -518,6 +539,8 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, bch2_trans_unlock(trans); closure_sync(&cl); } + + return ret; } void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, @@ -527,37 +550,47 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, if (!out->nr_tabstops) printbuf_tabstop_push(out, 20); - prt_printf(out, "rewrite ptrs:\t"); + prt_str_indented(out, "rewrite ptrs:\t"); bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); prt_newline(out); - prt_printf(out, "kill ptrs:\t"); + prt_str_indented(out, "kill ptrs:\t"); bch2_prt_u64_base2(out, data_opts->kill_ptrs); prt_newline(out); - prt_printf(out, "target:\t"); + prt_str_indented(out, "target:\t"); bch2_target_to_text(out, c, data_opts->target); prt_newline(out); - prt_printf(out, "compression:\t"); + prt_str_indented(out, "compression:\t"); bch2_compression_opt_to_text(out, io_opts->background_compression); prt_newline(out); - prt_printf(out, "opts.replicas:\t"); + prt_str_indented(out, "opts.replicas:\t"); prt_u64(out, io_opts->data_replicas); prt_newline(out); - prt_printf(out, "extra replicas:\t"); + prt_str_indented(out, "extra replicas:\t"); prt_u64(out, data_opts->extra_replicas); prt_newline(out); } void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) +{ + bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); + prt_newline(out); + + prt_str_indented(out, "old key:\t"); + bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); +} + +void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m) { bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); prt_newline(out); printbuf_indent_add(out, 2); bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); + prt_printf(out, "read_done:\t\%u\n", m->read_done); bch2_write_op_to_text(out, &m->op); printbuf_indent_sub(out, 2); } @@ -605,6 +638,40 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } +static bool can_allocate_without_blocking(struct bch_fs *c, + struct data_update *m) +{ + if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark))) + return false; + + unsigned target = m->op.flags & BCH_WRITE_only_specified_devs + ? m->op.target + : 0; + struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target); + + darray_for_each(m->op.devs_have, i) + __clear_bit(*i, devs.d); + + rcu_read_lock(); + unsigned nr_replicas = 0, i; + for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { + struct bch_dev *ca = bch2_dev_rcu(c, i); + + struct bch_dev_usage usage; + bch2_dev_usage_read_fast(ca, &usage); + + if (!dev_buckets_free(ca, usage, m->op.watermark)) + continue; + + nr_replicas += ca->mi.durability; + if (nr_replicas >= m->op.nr_replicas) + break; + } + rcu_read_unlock(); + + return nr_replicas >= m->op.nr_replicas; +} + int bch2_data_update_init(struct btree_trans *trans, struct btree_iter *iter, struct moving_context *ctxt, @@ -628,16 +695,7 @@ int bch2_data_update_init(struct btree_trans *trans, * snapshots table - just skip it, we can move it later. */ if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot))) - return -BCH_ERR_data_update_done; - - if (!bkey_get_dev_refs(c, k)) - return -BCH_ERR_data_update_done; - - if (c->opts.nocow_enabled && - !bkey_nocow_lock(c, ctxt, k)) { - bkey_put_dev_refs(c, k); - return -BCH_ERR_nocow_lock_blocked; - } + return -BCH_ERR_data_update_done_no_snapshot; bch2_bkey_buf_init(&m->k); bch2_bkey_buf_reassemble(&m->k, c, k); @@ -652,10 +710,10 @@ int bch2_data_update_init(struct btree_trans *trans, m->op.target = data_opts.target; m->op.write_point = wp; m->op.nr_replicas = 0; - m->op.flags |= BCH_WRITE_PAGES_STABLE| - BCH_WRITE_PAGES_OWNED| - BCH_WRITE_DATA_ENCODED| - BCH_WRITE_MOVE| + m->op.flags |= BCH_WRITE_pages_stable| + BCH_WRITE_pages_owned| + BCH_WRITE_data_encoded| + BCH_WRITE_move| m->data_opts.write_flags; m->op.compression_opt = io_opts.background_compression; m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; @@ -729,7 +787,15 @@ int bch2_data_update_init(struct btree_trans *trans, /* if iter == NULL, it's just a promote */ if (iter) ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts); - goto out; + if (!ret) + ret = -BCH_ERR_data_update_done_no_writes_needed; + goto out_bkey_buf_exit; + } + + if ((m->op.flags & BCH_WRITE_alloc_nowait) && + !can_allocate_without_blocking(c, m)) { + ret = -BCH_ERR_data_update_done_would_block; + goto out_bkey_buf_exit; } if (reserve_sectors) { @@ -738,18 +804,63 @@ int bch2_data_update_init(struct btree_trans *trans, ? 0 : BCH_DISK_RESERVATION_NOFAIL); if (ret) - goto out; + goto out_bkey_buf_exit; + } + + if (!bkey_get_dev_refs(c, k)) { + ret = -BCH_ERR_data_update_done_no_dev_refs; + goto out_put_disk_res; + } + + if (c->opts.nocow_enabled && + !bkey_nocow_lock(c, ctxt, k)) { + ret = -BCH_ERR_nocow_lock_blocked; + goto out_put_dev_refs; } if (bkey_extent_is_unwritten(k)) { - bch2_update_unwritten_extent(trans, m); - goto out; + ret = bch2_update_unwritten_extent(trans, m) ?: + -BCH_ERR_data_update_done_unwritten; + goto out_nocow_unlock; } + /* write path might have to decompress data: */ + unsigned buf_bytes = 0; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); + + unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE); + + m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL); + if (!m->bvecs) + goto enomem; + + bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ); + bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0); + + if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) + goto enomem; + + rbio_init(&m->rbio.bio, c, io_opts, NULL); + m->rbio.bio.bi_iter.bi_size = buf_bytes; + m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); + m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); + return 0; -out: - bch2_data_update_exit(m); - return ret ?: -BCH_ERR_data_update_done; +enomem: + ret = -ENOMEM; + kfree(m->bvecs); + m->bvecs = NULL; +out_nocow_unlock: + if (c->opts.nocow_enabled) + bkey_nocow_unlock(c, k); +out_put_dev_refs: + bkey_put_dev_refs(c, k); +out_put_disk_res: + bch2_disk_reservation_put(c, &m->op.res); +out_bkey_buf_exit: + bch2_bkey_buf_exit(&m->k, c); + return ret; } void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) diff --git a/libbcachefs/data_update.h b/libbcachefs/data_update.h index e4b50723..dcc60765 100644 --- a/libbcachefs/data_update.h +++ b/libbcachefs/data_update.h @@ -4,6 +4,7 @@ #define _BCACHEFS_DATA_UPDATE_H #include "bkey_buf.h" +#include "io_read.h" #include "io_write_types.h" struct moving_context; @@ -15,6 +16,9 @@ struct data_update_opts { u8 extra_replicas; unsigned btree_insert_flags; unsigned write_flags; + + int read_dev; + bool scrub; }; void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, @@ -22,20 +26,24 @@ void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, struct data_update { /* extent being updated: */ + bool read_done; enum btree_id btree_id; struct bkey_buf k; struct data_update_opts data_opts; struct moving_context *ctxt; struct bch_move_stats *stats; + + struct bch_read_bio rbio; struct bch_write_op op; + struct bio_vec *bvecs; }; void bch2_data_update_to_text(struct printbuf *, struct data_update *); +void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *); int bch2_data_update_index_update(struct bch_write_op *); -void bch2_data_update_read_done(struct data_update *, - struct bch_extent_crc_unpacked); +void bch2_data_update_read_done(struct data_update *); int bch2_extent_drop_ptrs(struct btree_trans *, struct btree_iter *, diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index b5de52a5..788af88f 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -7,6 +7,7 @@ */ #include "bcachefs.h" +#include "alloc_foreground.h" #include "bkey_methods.h" #include "btree_cache.h" #include "btree_io.h" @@ -20,6 +21,7 @@ #include "extents.h" #include "fsck.h" #include "inode.h" +#include "journal_reclaim.h" #include "super.h" #include <linux/console.h> @@ -189,7 +191,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, unsigned offset = 0; int ret; - if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) { + if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick, -1) <= 0) { prt_printf(out, "error getting device to read from: invalid device\n"); return; } @@ -843,8 +845,11 @@ restart: seqmutex_unlock(&c->btree_trans_lock); } -static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) +typedef void (*fs_to_text_fn)(struct printbuf *, struct bch_fs *); + +static ssize_t bch2_simple_print(struct file *file, char __user *buf, + size_t size, loff_t *ppos, + fs_to_text_fn fn) { struct dump_iter *i = file->private_data; struct bch_fs *c = i->c; @@ -855,7 +860,7 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, i->ret = 0; if (!i->iter) { - btree_deadlock_to_text(&i->buf, c); + fn(&i->buf, c); i->iter++; } @@ -868,6 +873,12 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, return ret ?: i->ret; } +static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + return bch2_simple_print(file, buf, size, ppos, btree_deadlock_to_text); +} + static const struct file_operations btree_deadlock_ops = { .owner = THIS_MODULE, .open = bch2_dump_open, @@ -875,6 +886,19 @@ static const struct file_operations btree_deadlock_ops = { .read = bch2_btree_deadlock_read, }; +static ssize_t bch2_write_points_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + return bch2_simple_print(file, buf, size, ppos, bch2_write_points_to_text); +} + +static const struct file_operations write_points_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_write_points_read, +}; + void bch2_fs_debug_exit(struct bch_fs *c) { if (!IS_ERR_OR_NULL(c->fs_debug_dir)) @@ -926,6 +950,9 @@ void bch2_fs_debug_init(struct bch_fs *c) debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir, c->btree_debug, &btree_deadlock_ops); + debugfs_create_file("write_points", 0400, c->fs_debug_dir, + c->btree_debug, &write_points_ops); + c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); if (IS_ERR_OR_NULL(c->btree_debug_dir)) return; diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index 4590cd0c..89df9781 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -180,6 +180,11 @@ x(EINVAL, not_in_recovery) \ x(EINVAL, cannot_rewind_recovery) \ x(0, data_update_done) \ + x(BCH_ERR_data_update_done, data_update_done_would_block) \ + x(BCH_ERR_data_update_done, data_update_done_unwritten) \ + x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \ + x(BCH_ERR_data_update_done, data_update_done_no_snapshot) \ + x(BCH_ERR_data_update_done, data_update_done_no_dev_refs) \ x(EINVAL, device_state_not_allowed) \ x(EINVAL, member_info_missing) \ x(EINVAL, mismatched_block_size) \ @@ -269,6 +274,7 @@ x(EIO, invalidate_stripe_to_dev) \ x(EIO, no_encryption_key) \ x(EIO, insufficient_journal_devices) \ + x(EIO, device_offline) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 05d5f71a..78a51d96 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -114,8 +114,9 @@ static inline bool ptr_better(struct bch_fs *c, * other devices, it will still pick a pointer from avoid. */ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_failures *failed, - struct extent_ptr_decoded *pick) + struct bch_io_failures *failed, + struct extent_ptr_decoded *pick, + int dev) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -137,6 +138,10 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, break; } + /* Are we being asked to read from a specific device? */ + if (dev >= 0 && p.ptr.dev != dev) + continue; + /* * If there are any dirty pointers it's an error if we can't * read: diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 620b284a..8fae6b23 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -404,7 +404,7 @@ void bch2_mark_io_failure(struct bch_io_failures *, struct extent_ptr_decoded *); int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, struct bch_io_failures *, - struct extent_ptr_decoded *); + struct extent_ptr_decoded *, int); /* KEY_TYPE_btree_ptr: */ diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c index ab1d5db2..83e15908 100644 --- a/libbcachefs/fs-io-buffered.c +++ b/libbcachefs/fs-io-buffered.c @@ -149,12 +149,10 @@ static void bchfs_read(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_buf sk; - int flags = BCH_READ_RETRY_IF_STALE| - BCH_READ_MAY_PROMOTE; + int flags = BCH_READ_retry_if_stale| + BCH_READ_may_promote; int ret = 0; - rbio->c = c; - rbio->start_time = local_clock(); rbio->subvol = inum.subvol; bch2_bkey_buf_init(&sk); @@ -211,14 +209,14 @@ static void bchfs_read(struct btree_trans *trans, swap(rbio->bio.bi_iter.bi_size, bytes); if (rbio->bio.bi_iter.bi_size == bytes) - flags |= BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_last_fragment; bch2_bio_page_state_set(&rbio->bio, k); bch2_read_extent(trans, rbio, iter.pos, data_btree, k, offset_into_extent, flags); - if (flags & BCH_READ_LAST_FRAGMENT) + if (flags & BCH_READ_last_fragment) break; swap(rbio->bio.bi_iter.bi_size, bytes); @@ -280,12 +278,13 @@ void bch2_readahead(struct readahead_control *ractl) struct bch_read_bio *rbio = rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, GFP_KERNEL, &c->bio_read), - opts); + c, + opts, + bch2_readpages_end_io); readpage_iter_advance(&readpages_iter); rbio->bio.bi_iter.bi_sector = folio_sector(folio); - rbio->bio.bi_end_io = bch2_readpages_end_io; BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); bchfs_read(trans, rbio, inode_inum(inode), @@ -323,10 +322,10 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) bch2_inode_opts_get(&opts, c, &inode->ei_inode); rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), - opts); + c, + opts, + bch2_read_single_folio_end_io); rbio->bio.bi_private = &done; - rbio->bio.bi_end_io = bch2_read_single_folio_end_io; - rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; rbio->bio.bi_iter.bi_sector = folio_sector(folio); BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); @@ -420,7 +419,7 @@ static void bch2_writepage_io_done(struct bch_write_op *op) } } - if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { + if (io->op.flags & BCH_WRITE_wrote_data_inline) { bio_for_each_folio_all(fi, bio) { struct bch_folio *s; diff --git a/libbcachefs/fs-io-direct.c b/libbcachefs/fs-io-direct.c index 2089c36b..535bc5fc 100644 --- a/libbcachefs/fs-io-direct.c +++ b/libbcachefs/fs-io-direct.c @@ -73,6 +73,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) struct blk_plug plug; loff_t offset = req->ki_pos; bool sync = is_sync_kiocb(req); + bool split = false; size_t shorten; ssize_t ret; @@ -99,8 +100,6 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) GFP_KERNEL, &c->dio_read_bioset); - bio->bi_end_io = bch2_direct_IO_read_endio; - dio = container_of(bio, struct dio_read, rbio.bio); closure_init(&dio->cl, NULL); @@ -133,12 +132,13 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) goto start; while (iter->count) { + split = true; + bio = bio_alloc_bioset(NULL, bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), REQ_OP_READ, GFP_KERNEL, &c->bio_read); - bio->bi_end_io = bch2_direct_IO_read_split_endio; start: bio->bi_opf = REQ_OP_READ|REQ_SYNC; bio->bi_iter.bi_sector = offset >> 9; @@ -160,7 +160,15 @@ start: if (iter->count) closure_get(&dio->cl); - bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); + struct bch_read_bio *rbio = + rbio_init(bio, + c, + opts, + split + ? bch2_direct_IO_read_split_endio + : bch2_direct_IO_read_endio); + + bch2_read(c, rbio, inode_inum(inode)); } blk_finish_plug(&plug); @@ -511,8 +519,8 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) dio->op.devs_need_flush = &inode->ei_devs_need_flush; if (sync) - dio->op.flags |= BCH_WRITE_SYNC; - dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; + dio->op.flags |= BCH_WRITE_sync; + dio->op.flags |= BCH_WRITE_check_enospc; ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, bio_sectors(bio), true); diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 8fcf7c8e..53a421ff 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -450,7 +450,7 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * return ret; struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound); - struct qstr name = (struct qstr) QSTR(name_buf); + struct qstr name = QSTR(name_buf); inode->bi_dir = lostfound.bi_inum; diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index d2e13452..428b9be6 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -285,12 +285,14 @@ void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, struct bch_inode_unpacked *); int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *); +#include "rebalance.h" + static inline struct bch_extent_rebalance bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode) { struct bch_io_opts io_opts; bch2_inode_opts_get(&io_opts, c, inode); - return io_opts_to_rebalance_opts(&io_opts); + return io_opts_to_rebalance_opts(c, &io_opts); } int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c index 8c7b2d3d..c8d0925f 100644 --- a/libbcachefs/io_read.c +++ b/libbcachefs/io_read.c @@ -80,6 +80,7 @@ struct promote_op { struct rhash_head hash; struct bpos pos; + struct work_struct work; struct data_update write; struct bio_vec bi_inline_vecs[]; /* must be last */ }; @@ -96,6 +97,26 @@ static inline bool have_io_error(struct bch_io_failures *failed) return failed && failed->nr; } +static bool ptr_being_rewritten(struct bch_read_bio *orig, + unsigned dev, + unsigned flags) +{ + if (!(flags & BCH_READ_data_update)) + return false; + + struct data_update *u = container_of(orig, struct data_update, rbio); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k)); + unsigned i = 0; + bkey_for_each_ptr(ptrs, ptr) { + if (ptr->dev == dev && + u->data_opts.rewrite_ptrs & BIT(i)) + return true; + i++; + } + + return false; +} + static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, struct bpos pos, struct bch_io_opts opts, @@ -105,7 +126,7 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, if (!have_io_error(failed)) { BUG_ON(!opts.promote_target); - if (!(flags & BCH_READ_MAY_PROMOTE)) + if (!(flags & BCH_READ_may_promote)) return -BCH_ERR_nopromote_may_not; if (bch2_bkey_has_target(c, k, opts.promote_target)) @@ -125,163 +146,138 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, return 0; } -static void promote_free(struct bch_fs *c, struct promote_op *op) +static noinline void promote_free(struct bch_read_bio *rbio) { - int ret; + struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); + struct bch_fs *c = rbio->c; + + int ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); bch2_data_update_exit(&op->write); - ret = rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params); - BUG_ON(ret); bch2_write_ref_put(c, BCH_WRITE_REF_promote); kfree_rcu(op, rcu); } static void promote_done(struct bch_write_op *wop) { - struct promote_op *op = - container_of(wop, struct promote_op, write.op); - struct bch_fs *c = op->write.op.c; + struct promote_op *op = container_of(wop, struct promote_op, write.op); + struct bch_fs *c = op->write.rbio.c; - bch2_time_stats_update(&c->times[BCH_TIME_data_promote], - op->start_time); - promote_free(c, op); + bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time); + promote_free(&op->write.rbio); } -static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) +static void promote_start_work(struct work_struct *work) { - struct bio *bio = &op->write.op.wbio.bio; + struct promote_op *op = container_of(work, struct promote_op, work); + + bch2_data_update_read_done(&op->write); +} + +static noinline void promote_start(struct bch_read_bio *rbio) +{ + struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); trace_and_count(op->write.op.c, read_promote, &rbio->bio); - /* we now own pages: */ - BUG_ON(!rbio->bounce); - BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); - - memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, - sizeof(struct bio_vec) * rbio->bio.bi_vcnt); - swap(bio->bi_vcnt, rbio->bio.bi_vcnt); - - bch2_data_update_read_done(&op->write, rbio->pick.crc); + INIT_WORK(&op->work, promote_start_work); + queue_work(rbio->c->write_ref_wq, &op->work); } -static struct promote_op *__promote_alloc(struct btree_trans *trans, - enum btree_id btree_id, - struct bkey_s_c k, - struct bpos pos, - struct extent_ptr_decoded *pick, - struct bch_io_opts opts, - unsigned sectors, - struct bch_read_bio **rbio, - struct bch_io_failures *failed) +static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, + enum btree_id btree_id, + struct bkey_s_c k, + struct bpos pos, + struct extent_ptr_decoded *pick, + unsigned sectors, + unsigned flags, + struct bch_read_bio *orig, + struct bch_io_failures *failed) { struct bch_fs *c = trans->c; - struct promote_op *op = NULL; - struct bio *bio; - unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); int ret; + struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait }; + + if (!have_io_error(failed)) { + update_opts.target = orig->opts.promote_target; + update_opts.extra_replicas = 1; + update_opts.write_flags |= BCH_WRITE_cached; + update_opts.write_flags |= BCH_WRITE_only_specified_devs; + } else { + update_opts.target = orig->opts.foreground_target; + + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + unsigned ptr_bit = 1; + bkey_for_each_ptr(ptrs, ptr) { + if (bch2_dev_io_failures(failed, ptr->dev) && + !ptr_being_rewritten(orig, ptr->dev, flags)) + update_opts.rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; + } + + if (!update_opts.rewrite_ptrs) + return NULL; + } + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) return ERR_PTR(-BCH_ERR_nopromote_no_writes); - op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL); + struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL); if (!op) { ret = -BCH_ERR_nopromote_enomem; - goto err; + goto err_put; } op->start_time = local_clock(); op->pos = pos; - /* - * We don't use the mempool here because extents that aren't - * checksummed or compressed can be too big for the mempool: - */ - *rbio = kzalloc(sizeof(struct bch_read_bio) + - sizeof(struct bio_vec) * pages, - GFP_KERNEL); - if (!*rbio) { - ret = -BCH_ERR_nopromote_enomem; - goto err; - } - - rbio_init(&(*rbio)->bio, opts); - bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); - - if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) { - ret = -BCH_ERR_nopromote_enomem; - goto err; - } - - (*rbio)->bounce = true; - (*rbio)->split = true; - (*rbio)->kmalloc = true; - if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, bch_promote_params)) { ret = -BCH_ERR_nopromote_in_flight; goto err; } - bio = &op->write.op.wbio.bio; - bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); - - struct data_update_opts update_opts = {}; - - if (!have_io_error(failed)) { - update_opts.target = opts.promote_target; - update_opts.extra_replicas = 1; - update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED; - } else { - update_opts.target = opts.foreground_target; - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned ptr_bit = 1; - bkey_for_each_ptr(ptrs, ptr) { - if (bch2_dev_io_failures(failed, ptr->dev)) - update_opts.rewrite_ptrs |= ptr_bit; - ptr_bit <<= 1; - } - } - ret = bch2_data_update_init(trans, NULL, NULL, &op->write, writepoint_hashed((unsigned long) current), - opts, + orig->opts, update_opts, btree_id, k); /* * possible errors: -BCH_ERR_nocow_lock_blocked, * -BCH_ERR_ENOSPC_disk_reservation: */ - if (ret) { - BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params)); - goto err; - } + if (ret) + goto err_remove_hash; + rbio_init_fragment(&op->write.rbio.bio, orig); + op->write.rbio.bounce = true; + op->write.rbio.promote = true; op->write.op.end_io = promote_done; - return op; + return &op->write.rbio; +err_remove_hash: + BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params)); err: - if (*rbio) - bio_free_pages(&(*rbio)->bio); - kfree(*rbio); - *rbio = NULL; + bio_free_pages(&op->write.op.wbio.bio); /* We may have added to the rhashtable and thus need rcu freeing: */ kfree_rcu(op, rcu); +err_put: bch2_write_ref_put(c, BCH_WRITE_REF_promote); return ERR_PTR(ret); } noinline -static struct promote_op *promote_alloc(struct btree_trans *trans, +static struct bch_read_bio *promote_alloc(struct btree_trans *trans, struct bvec_iter iter, struct bkey_s_c k, struct extent_ptr_decoded *pick, - struct bch_io_opts opts, unsigned flags, - struct bch_read_bio **rbio, + struct bch_read_bio *orig, bool *bounce, bool *read_full, struct bch_io_failures *failed) @@ -301,18 +297,21 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, struct bpos pos = promote_full ? bkey_start_pos(k.k) : POS(k.k->p.inode, iter.bi_sector); - struct promote_op *promote; int ret; - ret = should_promote(c, k, pos, opts, flags, failed); + ret = should_promote(c, k, pos, orig->opts, flags, failed); if (ret) goto nopromote; - promote = __promote_alloc(trans, - k.k->type == KEY_TYPE_reflink_v - ? BTREE_ID_reflink - : BTREE_ID_extents, - k, pos, pick, opts, sectors, rbio, failed); + struct bch_read_bio *promote = + __promote_alloc(trans, + k.k->type == KEY_TYPE_reflink_v + ? BTREE_ID_reflink + : BTREE_ID_extents, + k, pos, pick, sectors, flags, orig, failed); + if (!promote) + return NULL; + ret = PTR_ERR_OR_ZERO(promote); if (ret) goto nopromote; @@ -375,20 +374,20 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) { BUG_ON(rbio->bounce && !rbio->split); - if (rbio->promote) - promote_free(rbio->c, rbio->promote); - rbio->promote = NULL; - - if (rbio->bounce) - bch2_bio_free_pages_pool(rbio->c, &rbio->bio); - if (rbio->split) { struct bch_read_bio *parent = rbio->parent; - if (rbio->kmalloc) - kfree(rbio); - else + if (unlikely(rbio->promote)) { + if (!rbio->bio.bi_status) + promote_start(rbio); + else + promote_free(rbio); + } else { + if (rbio->bounce) + bch2_bio_free_pages_pool(rbio->c, &rbio->bio); + bio_put(&rbio->bio); + } rbio = parent; } @@ -408,61 +407,47 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) bio_endio(&rbio->bio); } -static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, +static noinline void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, struct bch_io_failures *failed, unsigned flags) { + struct data_update *u = container_of(rbio, struct data_update, rbio); struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_buf sk; - struct bkey_s_c k; - int ret; - - flags &= ~BCH_READ_LAST_FRAGMENT; - flags |= BCH_READ_MUST_CLONE; - - bch2_bkey_buf_init(&sk); - - bch2_trans_iter_init(trans, &iter, rbio->data_btree, - rbio->read_pos, BTREE_ITER_slots); retry: bch2_trans_begin(trans); - rbio->bio.bi_status = 0; - ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + struct btree_iter iter; + struct bkey_s_c k; + int ret = lockrestart_do(trans, + bkey_err(k = bch2_bkey_get_iter(trans, &iter, + u->btree_id, bkey_start_pos(&u->k.k->k), + 0))); if (ret) goto err; - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - - if (!bch2_bkey_matches_ptr(c, k, - rbio->pick.ptr, - rbio->data_pos.offset - - rbio->pick.crc.offset)) { + if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) { /* extent we wanted to read no longer exists: */ rbio->hole = true; - goto out; + goto err; } ret = __bch2_read_extent(trans, rbio, bvec_iter, - rbio->read_pos, - rbio->data_btree, - k, 0, failed, flags); + bkey_start_pos(&u->k.k->k), + u->btree_id, + bkey_i_to_s_c(u->k.k), + 0, failed, flags, -1); +err: + bch2_trans_iter_exit(trans, &iter); + if (ret == READ_RETRY) goto retry; if (ret) - goto err; -out: + rbio->bio.bi_status = BLK_STS_IOERR; + + BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1); bch2_rbio_done(rbio); - bch2_trans_iter_exit(trans, &iter); bch2_trans_put(trans); - bch2_bkey_buf_exit(&sk, c); - return; -err: - rbio->bio.bi_status = BLK_STS_IOERR; - goto out; } static void bch2_rbio_retry(struct work_struct *work) @@ -483,29 +468,29 @@ static void bch2_rbio_retry(struct work_struct *work) if (rbio->retry == READ_RETRY_AVOID) bch2_mark_io_failure(&failed, &rbio->pick); - rbio->bio.bi_status = 0; + if (!rbio->split) + rbio->bio.bi_status = 0; rbio = bch2_rbio_free(rbio); - flags |= BCH_READ_IN_RETRY; - flags &= ~BCH_READ_MAY_PROMOTE; + flags |= BCH_READ_in_retry; + flags &= ~BCH_READ_may_promote; + flags &= ~BCH_READ_last_fragment; + flags |= BCH_READ_must_clone; - if (flags & BCH_READ_NODECODE) { + if (flags & BCH_READ_data_update) bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); - } else { - flags &= ~BCH_READ_LAST_FRAGMENT; - flags |= BCH_READ_MUST_CLONE; - + else __bch2_read(c, rbio, iter, inum, &failed, flags); - } } static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, blk_status_t error) { rbio->retry = retry; + rbio->saw_error = true; - if (rbio->flags & BCH_READ_IN_RETRY) + if (rbio->flags & BCH_READ_in_retry) return; if (retry == READ_ERR) { @@ -712,32 +697,40 @@ static void __bch2_read_endio(struct work_struct *work) if (unlikely(rbio->narrow_crcs)) bch2_rbio_narrow_crcs(rbio); - if (rbio->flags & BCH_READ_NODECODE) - goto nodecode; + if (likely(!(rbio->flags & BCH_READ_data_update))) { + /* Adjust crc to point to subset of data we want: */ + crc.offset += rbio->offset_into_extent; + crc.live_size = bvec_iter_sectors(rbio->bvec_iter); - /* Adjust crc to point to subset of data we want: */ - crc.offset += rbio->offset_into_extent; - crc.live_size = bvec_iter_sectors(rbio->bvec_iter); + if (crc_is_compressed(crc)) { + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; - if (crc_is_compressed(crc)) { - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; + if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && + !c->opts.no_data_io) + goto decompression_err; + } else { + /* don't need to decrypt the entire bio: */ + nonce = nonce_add(nonce, crc.offset << 9); + bio_advance(src, crc.offset << 9); - if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && - !c->opts.no_data_io) - goto decompression_err; + BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); + src->bi_iter.bi_size = dst_iter.bi_size; + + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + + if (rbio->bounce) { + struct bvec_iter src_iter = src->bi_iter; + + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); + } + } } else { - /* don't need to decrypt the entire bio: */ - nonce = nonce_add(nonce, crc.offset << 9); - bio_advance(src, crc.offset << 9); - - BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); - src->bi_iter.bi_size = dst_iter.bi_size; - - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; + if (rbio->split) + rbio->parent->pick = rbio->pick; if (rbio->bounce) { struct bvec_iter src_iter = src->bi_iter; @@ -754,12 +747,9 @@ static void __bch2_read_endio(struct work_struct *work) ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); if (ret) goto decrypt_err; - - promote_start(rbio->promote, rbio); - rbio->promote = NULL; } -nodecode: - if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { + + if (likely(!(rbio->flags & BCH_READ_in_retry))) { rbio = bch2_rbio_free(rbio); bch2_rbio_done(rbio); } @@ -772,8 +762,8 @@ csum_err: * reading into buffers owned by userspace (that userspace can * scribble over) - retry the read, bouncing it this time: */ - if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { - rbio->flags |= BCH_READ_MUST_BOUNCE; + if (!rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) { + rbio->flags |= BCH_READ_must_bounce; bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); goto out; } @@ -810,11 +800,11 @@ static void bch2_read_endio(struct bio *bio) return; } - if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || + if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) || (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) { trace_and_count(c, read_reuse_race, &rbio->bio); - if (rbio->flags & BCH_READ_RETRY_IF_STALE) + if (rbio->flags & BCH_READ_retry_if_stale) bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); else bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); @@ -883,12 +873,11 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bvec_iter iter, struct bpos read_pos, enum btree_id data_btree, struct bkey_s_c k, unsigned offset_into_extent, - struct bch_io_failures *failed, unsigned flags) + struct bch_io_failures *failed, unsigned flags, int dev) { struct bch_fs *c = trans->c; struct extent_ptr_decoded pick; struct bch_read_bio *rbio = NULL; - struct promote_op *promote = NULL; bool bounce = false, read_full = false, narrow_crcs = false; struct bpos data_pos = bkey_start_pos(k.k); int pick_ret; @@ -905,7 +894,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, goto out_read_done; } retry_pick: - pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); + pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); /* hole or reservation - just zero fill: */ if (!pick_ret) @@ -941,7 +930,7 @@ retry_pick: * retry path, don't check here, it'll be caught in bch2_read_endio() * and we'll end up in the retry path: */ - if ((flags & BCH_READ_IN_RETRY) && + if ((flags & BCH_READ_in_retry) && !pick.ptr.cached && ca && unlikely(dev_ptr_stale(ca, &pick.ptr))) { @@ -955,48 +944,52 @@ retry_pick: * Unlock the iterator while the btree node's lock is still in * cache, before doing the IO: */ - bch2_trans_unlock(trans); + if (!(flags & BCH_READ_in_retry)) + bch2_trans_unlock(trans); + else + bch2_trans_unlock_long(trans); - if (flags & BCH_READ_NODECODE) { + if (!(flags & BCH_READ_data_update)) { + if (!(flags & BCH_READ_last_fragment) || + bio_flagged(&orig->bio, BIO_CHAIN)) + flags |= BCH_READ_must_clone; + + narrow_crcs = !(flags & BCH_READ_in_retry) && + bch2_can_narrow_extent_crcs(k, pick.crc); + + if (narrow_crcs && (flags & BCH_READ_user_mapped)) + flags |= BCH_READ_must_bounce; + + EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); + + if (crc_is_compressed(pick.crc) || + (pick.crc.csum_type != BCH_CSUM_none && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + (bch2_csum_type_is_encryption(pick.crc.csum_type) && + (flags & BCH_READ_user_mapped)) || + (flags & BCH_READ_must_bounce)))) { + read_full = true; + bounce = true; + } + } else { + read_full = true; /* * can happen if we retry, and the extent we were going to read * has been merged in the meantime: */ - if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) { + struct data_update *u = container_of(orig, struct data_update, rbio); + if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { if (ca) percpu_ref_put(&ca->io_ref); goto hole; } iter.bi_size = pick.crc.compressed_size << 9; - goto get_bio; - } - - if (!(flags & BCH_READ_LAST_FRAGMENT) || - bio_flagged(&orig->bio, BIO_CHAIN)) - flags |= BCH_READ_MUST_CLONE; - - narrow_crcs = !(flags & BCH_READ_IN_RETRY) && - bch2_can_narrow_extent_crcs(k, pick.crc); - - if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) - flags |= BCH_READ_MUST_BOUNCE; - - EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); - - if (crc_is_compressed(pick.crc) || - (pick.crc.csum_type != BCH_CSUM_none && - (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || - (bch2_csum_type_is_encryption(pick.crc.csum_type) && - (flags & BCH_READ_USER_MAPPED)) || - (flags & BCH_READ_MUST_BOUNCE)))) { - read_full = true; - bounce = true; } if (orig->opts.promote_target || have_io_error(failed)) - promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, - &rbio, &bounce, &read_full, failed); + rbio = promote_alloc(trans, iter, k, &pick, flags, orig, + &bounce, &read_full, failed); if (!read_full) { EBUG_ON(crc_is_compressed(pick.crc)); @@ -1015,7 +1008,7 @@ retry_pick: pick.crc.offset = 0; pick.crc.live_size = bvec_iter_sectors(iter); } -get_bio: + if (rbio) { /* * promote already allocated bounce rbio: @@ -1030,17 +1023,16 @@ get_bio: } else if (bounce) { unsigned sectors = pick.crc.compressed_size; - rbio = rbio_init(bio_alloc_bioset(NULL, + rbio = rbio_init_fragment(bio_alloc_bioset(NULL, DIV_ROUND_UP(sectors, PAGE_SECTORS), 0, GFP_NOFS, &c->bio_read_split), - orig->opts); + orig); bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); rbio->bounce = true; - rbio->split = true; - } else if (flags & BCH_READ_MUST_CLONE) { + } else if (flags & BCH_READ_must_clone) { /* * Have to clone if there were any splits, due to error * reporting issues (if a split errored, and retrying didn't @@ -1049,11 +1041,10 @@ get_bio: * from the whole bio, in which case we don't want to retry and * lose the error) */ - rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, + rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, &c->bio_read_split), - orig->opts); + orig); rbio->bio.bi_iter = iter; - rbio->split = true; } else { rbio = orig; rbio->bio.bi_iter = iter; @@ -1062,11 +1053,8 @@ get_bio: EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); - rbio->c = c; rbio->submit_time = local_clock(); - if (rbio->split) - rbio->parent = orig; - else + if (!rbio->split) rbio->end_io = orig->bio.bi_end_io; rbio->bvec_iter = iter; rbio->offset_into_extent= offset_into_extent; @@ -1076,20 +1064,14 @@ get_bio: rbio->hole = 0; rbio->retry = 0; rbio->context = 0; - /* XXX: only initialize this if needed */ - rbio->devs_have = bch2_bkey_devs(k); rbio->pick = pick; rbio->subvol = orig->subvol; rbio->read_pos = read_pos; rbio->data_btree = data_btree; rbio->data_pos = data_pos; rbio->version = k.k->bversion; - rbio->promote = promote; INIT_WORK(&rbio->work, NULL); - if (flags & BCH_READ_NODECODE) - orig->pick = pick; - rbio->bio.bi_opf = orig->bio.bi_opf; rbio->bio.bi_iter.bi_sector = pick.ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; @@ -1097,18 +1079,19 @@ get_bio: if (rbio->bounce) trace_and_count(c, read_bounce, &rbio->bio); - this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); + if (!(flags & BCH_READ_data_update)) + this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); /* * If it's being moved internally, we don't want to flag it as a cache * hit: */ - if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE)) + if (ca && pick.ptr.cached && !(flags & BCH_READ_data_update)) bch2_bucket_io_time_reset(trans, pick.ptr.dev, PTR_BUCKET_NR(ca, &pick.ptr), READ); - if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { + if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) { bio_inc_remaining(&orig->bio); trace_and_count(c, read_split, &orig->bio); } @@ -1132,10 +1115,10 @@ get_bio: bio_set_dev(&rbio->bio, ca->disk_sb.bdev); if (unlikely(c->opts.no_data_io)) { - if (likely(!(flags & BCH_READ_IN_RETRY))) + if (likely(!(flags & BCH_READ_in_retry))) bio_endio(&rbio->bio); } else { - if (likely(!(flags & BCH_READ_IN_RETRY))) + if (likely(!(flags & BCH_READ_in_retry))) submit_bio(&rbio->bio); else submit_bio_wait(&rbio->bio); @@ -1153,11 +1136,11 @@ get_bio: goto out; } - if (likely(!(flags & BCH_READ_IN_RETRY))) + if (likely(!(flags & BCH_READ_in_retry))) bio_endio(&rbio->bio); } out: - if (likely(!(flags & BCH_READ_IN_RETRY))) { + if (likely(!(flags & BCH_READ_in_retry))) { return 0; } else { int ret; @@ -1180,7 +1163,7 @@ out: } err: - if (flags & BCH_READ_IN_RETRY) + if (flags & BCH_READ_in_retry) return READ_ERR; orig->bio.bi_status = BLK_STS_IOERR; @@ -1188,16 +1171,16 @@ err: hole: /* - * won't normally happen in the BCH_READ_NODECODE + * won't normally happen in the BCH_READ_data_update * (bch2_move_extent()) path, but if we retry and the extent we wanted * to read no longer exists we have to signal that: */ - if (flags & BCH_READ_NODECODE) + if (flags & BCH_READ_data_update) orig->hole = true; zero_fill_bio_iter(&orig->bio, iter); out_read_done: - if (flags & BCH_READ_LAST_FRAGMENT) + if (flags & BCH_READ_last_fragment) bch2_rbio_done(orig); return 0; } @@ -1212,7 +1195,7 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, struct bkey_s_c k; int ret; - BUG_ON(flags & BCH_READ_NODECODE); + BUG_ON(flags & BCH_READ_data_update); bch2_bkey_buf_init(&sk); bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, @@ -1262,15 +1245,15 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, swap(bvec_iter.bi_size, bytes); if (bvec_iter.bi_size == bytes) - flags |= BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_last_fragment; ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos, data_btree, k, - offset_into_extent, failed, flags); + offset_into_extent, failed, flags, -1); if (ret) goto err; - if (flags & BCH_READ_LAST_FRAGMENT) + if (flags & BCH_READ_last_fragment) break; swap(bvec_iter.bi_size, bytes); diff --git a/libbcachefs/io_read.h b/libbcachefs/io_read.h index a82e8a94..73275da5 100644 --- a/libbcachefs/io_read.h +++ b/libbcachefs/io_read.h @@ -35,20 +35,19 @@ struct bch_read_bio { u16 flags; union { struct { - u16 bounce:1, + u16 promote:1, + bounce:1, split:1, - kmalloc:1, have_ioref:1, narrow_crcs:1, hole:1, + saw_error:1, retry:2, context:2; }; u16 _state; }; - struct bch_devs_list devs_have; - struct extent_ptr_decoded pick; /* @@ -65,8 +64,6 @@ struct bch_read_bio { struct bpos data_pos; struct bversion version; - struct promote_op *promote; - struct bch_io_opts opts; struct work_struct work; @@ -108,23 +105,32 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans, return 0; } -enum bch_read_flags { - BCH_READ_RETRY_IF_STALE = 1 << 0, - BCH_READ_MAY_PROMOTE = 1 << 1, - BCH_READ_USER_MAPPED = 1 << 2, - BCH_READ_NODECODE = 1 << 3, - BCH_READ_LAST_FRAGMENT = 1 << 4, +#define BCH_READ_FLAGS() \ + x(retry_if_stale) \ + x(may_promote) \ + x(user_mapped) \ + x(data_update) \ + x(last_fragment) \ + x(must_bounce) \ + x(must_clone) \ + x(in_retry) - /* internal: */ - BCH_READ_MUST_BOUNCE = 1 << 5, - BCH_READ_MUST_CLONE = 1 << 6, - BCH_READ_IN_RETRY = 1 << 7, +enum __bch_read_flags { +#define x(n) __BCH_READ_##n, + BCH_READ_FLAGS() +#undef x +}; + +enum bch_read_flags { +#define x(n) BCH_READ_##n = BIT(__BCH_READ_##n), + BCH_READ_FLAGS() +#undef x }; int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, struct bpos, enum btree_id, struct bkey_s_c, unsigned, - struct bch_io_failures *, unsigned); + struct bch_io_failures *, unsigned, int); static inline void bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, struct bpos read_pos, @@ -132,7 +138,7 @@ static inline void bch2_read_extent(struct btree_trans *trans, unsigned offset_into_extent, unsigned flags) { __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, - data_btree, k, offset_into_extent, NULL, flags); + data_btree, k, offset_into_extent, NULL, flags, -1); } void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, @@ -145,24 +151,39 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, BUG_ON(rbio->_state); - rbio->c = c; - rbio->start_time = local_clock(); rbio->subvol = inum.subvol; __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, - BCH_READ_RETRY_IF_STALE| - BCH_READ_MAY_PROMOTE| - BCH_READ_USER_MAPPED); + BCH_READ_retry_if_stale| + BCH_READ_may_promote| + BCH_READ_user_mapped); } -static inline struct bch_read_bio *rbio_init(struct bio *bio, - struct bch_io_opts opts) +static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, + struct bch_read_bio *orig) { struct bch_read_bio *rbio = to_rbio(bio); + rbio->c = orig->c; rbio->_state = 0; - rbio->promote = NULL; - rbio->opts = opts; + rbio->split = true; + rbio->parent = orig; + rbio->opts = orig->opts; + return rbio; +} + +static inline struct bch_read_bio *rbio_init(struct bio *bio, + struct bch_fs *c, + struct bch_io_opts opts, + bio_end_io_t end_io) +{ + struct bch_read_bio *rbio = to_rbio(bio); + + rbio->start_time = local_clock(); + rbio->c = c; + rbio->_state = 0; + rbio->opts = opts; + rbio->bio.bi_end_io = end_io; return rbio; } diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c index 3e71860f..0177198e 100644 --- a/libbcachefs/io_write.c +++ b/libbcachefs/io_write.c @@ -374,7 +374,7 @@ static int bch2_write_index_default(struct bch_write_op *op) bch2_extent_update(trans, inum, &iter, sk.k, &op->res, op->new_i_size, &op->i_sectors_delta, - op->flags & BCH_WRITE_CHECK_ENOSPC); + op->flags & BCH_WRITE_check_enospc); bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -403,10 +403,10 @@ static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, (subvol_inum) { op->subvol, op->pos.inode, }, offset << 9); prt_printf(out, "write error%s: ", - op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); + op->flags & BCH_WRITE_move ? "(internal move)" : ""); } -static void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) +void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) { __bch2_write_op_error(out, op, op->pos.offset); } @@ -483,7 +483,7 @@ static void bch2_write_done(struct closure *cl) bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); bch2_disk_reservation_put(c, &op->res); - if (!(op->flags & BCH_WRITE_MOVE)) + if (!(op->flags & BCH_WRITE_move)) bch2_write_ref_put(c, BCH_WRITE_REF_write); bch2_keylist_free(&op->insert_keys, op->inline_keys); @@ -529,7 +529,7 @@ static void __bch2_write_index(struct bch_write_op *op) unsigned dev; int ret = 0; - if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { + if (unlikely(op->flags & BCH_WRITE_io_error)) { ret = bch2_write_drop_io_error_ptrs(op); if (ret) goto err; @@ -538,7 +538,7 @@ static void __bch2_write_index(struct bch_write_op *op) if (!bch2_keylist_empty(keys)) { u64 sectors_start = keylist_sectors(keys); - ret = !(op->flags & BCH_WRITE_MOVE) + ret = !(op->flags & BCH_WRITE_move) ? bch2_write_index_default(op) : bch2_data_update_index_update(op); @@ -570,14 +570,22 @@ out: err: keys->top = keys->keys; op->error = ret; - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_submitted; goto out; } static inline void __wp_update_state(struct write_point *wp, enum write_point_state state) { if (state != wp->state) { + struct task_struct *p = current; u64 now = ktime_get_ns(); + u64 runtime = p->se.sum_exec_runtime + + (now - p->se.exec_start); + + if (state == WRITE_POINT_runnable) + wp->last_runtime = runtime; + else if (wp->state == WRITE_POINT_runnable) + wp->time[WRITE_POINT_running] += runtime - wp->last_runtime; if (wp->last_state_change && time_after64(now, wp->last_state_change)) @@ -591,7 +599,7 @@ static inline void wp_update_state(struct write_point *wp, bool running) { enum write_point_state state; - state = running ? WRITE_POINT_running : + state = running ? WRITE_POINT_runnable: !list_empty(&wp->writes) ? WRITE_POINT_waiting_io : WRITE_POINT_stopped; @@ -605,8 +613,8 @@ static CLOSURE_CALLBACK(bch2_write_index) struct workqueue_struct *wq = index_update_wq(op); unsigned long flags; - if ((op->flags & BCH_WRITE_SUBMITTED) && - (op->flags & BCH_WRITE_MOVE)) + if ((op->flags & BCH_WRITE_submitted) && + (op->flags & BCH_WRITE_move)) bch2_bio_free_pages_pool(op->c, &op->wbio.bio); spin_lock_irqsave(&wp->writes_lock, flags); @@ -644,11 +652,11 @@ void bch2_write_point_do_index_updates(struct work_struct *work) if (!op) break; - op->flags |= BCH_WRITE_IN_WORKER; + op->flags |= BCH_WRITE_in_worker; __bch2_write_index(op); - if (!(op->flags & BCH_WRITE_SUBMITTED)) + if (!(op->flags & BCH_WRITE_submitted)) __bch2_write(op); else bch2_write_done(&op->cl); @@ -672,7 +680,7 @@ static void bch2_write_endio(struct bio *bio) "data write error: %s", bch2_blk_status_to_str(bio->bi_status))) { set_bit(wbio->dev, op->failed.d); - op->flags |= BCH_WRITE_IO_ERROR; + op->flags |= BCH_WRITE_io_error; } if (wbio->nocow) { @@ -719,7 +727,7 @@ static void init_append_extent(struct bch_write_op *op, bch2_extent_crc_append(&e->k_i, crc); bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, - op->flags & BCH_WRITE_CACHED); + op->flags & BCH_WRITE_cached); bch2_keylist_push(&op->insert_keys); } @@ -836,7 +844,7 @@ static enum prep_encoded_ret { struct bch_fs *c = op->c; struct bio *bio = &op->wbio.bio; - if (!(op->flags & BCH_WRITE_DATA_ENCODED)) + if (!(op->flags & BCH_WRITE_data_encoded)) return PREP_ENCODED_OK; BUG_ON(bio_sectors(bio) != op->crc.compressed_size); @@ -873,7 +881,7 @@ static enum prep_encoded_ret { if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) return PREP_ENCODED_CHECKSUM_ERR; - if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) + if (bch2_bio_uncompress_inplace(op, bio)) return PREP_ENCODED_ERR; } @@ -944,9 +952,9 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, if (ec_buf || op->compression_opt || (op->csum_type && - !(op->flags & BCH_WRITE_PAGES_STABLE)) || + !(op->flags & BCH_WRITE_pages_stable)) || (bch2_csum_type_is_encryption(op->csum_type) && - !(op->flags & BCH_WRITE_PAGES_OWNED))) { + !(op->flags & BCH_WRITE_pages_owned))) { dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed, ec_buf); @@ -966,7 +974,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, break; BUG_ON(op->compression_opt && - (op->flags & BCH_WRITE_DATA_ENCODED) && + (op->flags & BCH_WRITE_data_encoded) && bch2_csum_type_is_encryption(op->crc.csum_type)); BUG_ON(op->compression_opt && !bounce); @@ -1004,7 +1012,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, } } - if ((op->flags & BCH_WRITE_DATA_ENCODED) && + if ((op->flags & BCH_WRITE_data_encoded) && !crc_is_compressed(crc) && bch2_csum_type_is_encryption(op->crc.csum_type) == bch2_csum_type_is_encryption(op->csum_type)) { @@ -1036,7 +1044,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, crc.compression_type = compression_type; crc.nonce = nonce; } else { - if ((op->flags & BCH_WRITE_DATA_ENCODED) && + if ((op->flags & BCH_WRITE_data_encoded) && bch2_rechecksum_bio(c, src, version, op->crc, NULL, &op->crc, src_len >> 9, @@ -1210,9 +1218,9 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) static void __bch2_nocow_write_done(struct bch_write_op *op) { - if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { + if (unlikely(op->flags & BCH_WRITE_io_error)) { op->error = -EIO; - } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) + } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten)) bch2_nocow_write_convert_unwritten(op); } @@ -1241,7 +1249,7 @@ static void bch2_nocow_write(struct bch_write_op *op) struct bucket_to_lock *stale_at; int stale, ret; - if (op->flags & BCH_WRITE_MOVE) + if (op->flags & BCH_WRITE_move) return; darray_init(&buckets); @@ -1299,7 +1307,7 @@ retry: }), GFP_KERNEL|__GFP_NOFAIL); if (ptr->unwritten) - op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; + op->flags |= BCH_WRITE_convert_unwritten; } /* Unlock before taking nocow locks, doing IO: */ @@ -1307,7 +1315,7 @@ retry: bch2_trans_unlock(trans); bch2_cut_front(op->pos, op->insert_keys.top); - if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) + if (op->flags & BCH_WRITE_convert_unwritten) bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); darray_for_each(buckets, i) { @@ -1332,7 +1340,7 @@ retry: wbio_init(bio)->put_bio = true; bio->bi_opf = op->wbio.bio.bi_opf; } else { - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_submitted; } op->pos.offset += bio_sectors(bio); @@ -1346,7 +1354,7 @@ retry: op->insert_keys.top, true); bch2_keylist_push(&op->insert_keys); - if (op->flags & BCH_WRITE_SUBMITTED) + if (op->flags & BCH_WRITE_submitted) break; bch2_btree_iter_advance(&iter); } @@ -1366,15 +1374,15 @@ err: bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); op->error = ret; - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_submitted; } /* fallback to cow write path? */ - if (!(op->flags & BCH_WRITE_SUBMITTED)) { + if (!(op->flags & BCH_WRITE_submitted)) { closure_sync(&op->cl); __bch2_nocow_write_done(op); op->insert_keys.top = op->insert_keys.keys; - } else if (op->flags & BCH_WRITE_SYNC) { + } else if (op->flags & BCH_WRITE_sync) { closure_sync(&op->cl); bch2_nocow_write_done(&op->cl.work); } else { @@ -1426,7 +1434,7 @@ static void __bch2_write(struct bch_write_op *op) if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { bch2_nocow_write(op); - if (op->flags & BCH_WRITE_SUBMITTED) + if (op->flags & BCH_WRITE_submitted) goto out_nofs_restore; } again: @@ -1456,7 +1464,7 @@ again: ret = bch2_trans_run(c, lockrestart_do(trans, bch2_alloc_sectors_start_trans(trans, op->target, - op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), + op->opts.erasure_code && !(op->flags & BCH_WRITE_cached), op->write_point, &op->devs_have, op->nr_replicas, @@ -1479,10 +1487,10 @@ again: bch2_alloc_sectors_done_inlined(c, wp); err: if (ret <= 0) { - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_submitted; if (unlikely(ret < 0)) { - if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) { + if (!(op->flags & BCH_WRITE_alloc_nowait)) { struct printbuf buf = PRINTBUF; bch2_write_op_error(&buf, op); prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret)); @@ -1514,14 +1522,14 @@ err: * synchronously here if we weren't able to submit all of the IO at * once, as that signals backpressure to the caller. */ - if ((op->flags & BCH_WRITE_SYNC) || - (!(op->flags & BCH_WRITE_SUBMITTED) && - !(op->flags & BCH_WRITE_IN_WORKER))) { + if ((op->flags & BCH_WRITE_sync) || + (!(op->flags & BCH_WRITE_submitted) && + !(op->flags & BCH_WRITE_in_worker))) { bch2_wait_on_allocator(c, &op->cl); __bch2_write_index(op); - if (!(op->flags & BCH_WRITE_SUBMITTED)) + if (!(op->flags & BCH_WRITE_submitted)) goto again; bch2_write_done(&op->cl); } else { @@ -1542,8 +1550,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) memset(&op->failed, 0, sizeof(op->failed)); - op->flags |= BCH_WRITE_WROTE_DATA_INLINE; - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_wrote_data_inline; + op->flags |= BCH_WRITE_submitted; bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); @@ -1606,8 +1614,8 @@ CLOSURE_CALLBACK(bch2_write) BUG_ON(!op->write_point.v); BUG_ON(bkey_eq(op->pos, POS_MAX)); - if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) - op->flags |= BCH_WRITE_ALLOC_NOWAIT; + if (op->flags & BCH_WRITE_only_specified_devs) + op->flags |= BCH_WRITE_alloc_nowait; op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas); op->start_time = local_clock(); @@ -1628,13 +1636,14 @@ CLOSURE_CALLBACK(bch2_write) goto err; } - if (!(op->flags & BCH_WRITE_MOVE) && + if (!(op->flags & BCH_WRITE_move) && !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { op->error = -BCH_ERR_erofs_no_writes; goto err; } - this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); + if (!(op->flags & BCH_WRITE_move)) + this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); bch2_increment_clock(c, bio_sectors(bio), WRITE); data_len = min_t(u64, bio->bi_iter.bi_size, diff --git a/libbcachefs/io_write.h b/libbcachefs/io_write.h index 5400ce94..02cca52b 100644 --- a/libbcachefs/io_write.h +++ b/libbcachefs/io_write.h @@ -20,22 +20,23 @@ static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, enum bch_data_type, const struct bkey_i *, bool); +void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op); + #define BCH_WRITE_FLAGS() \ - x(ALLOC_NOWAIT) \ - x(CACHED) \ - x(DATA_ENCODED) \ - x(PAGES_STABLE) \ - x(PAGES_OWNED) \ - x(ONLY_SPECIFIED_DEVS) \ - x(WROTE_DATA_INLINE) \ - x(FROM_INTERNAL) \ - x(CHECK_ENOSPC) \ - x(SYNC) \ - x(MOVE) \ - x(IN_WORKER) \ - x(SUBMITTED) \ - x(IO_ERROR) \ - x(CONVERT_UNWRITTEN) + x(alloc_nowait) \ + x(cached) \ + x(data_encoded) \ + x(pages_stable) \ + x(pages_owned) \ + x(only_specified_devs) \ + x(wrote_data_inline) \ + x(check_enospc) \ + x(sync) \ + x(move) \ + x(in_worker) \ + x(submitted) \ + x(io_error) \ + x(convert_unwritten) enum __bch_write_flags { #define x(f) __BCH_WRITE_##f, diff --git a/libbcachefs/io_write_types.h b/libbcachefs/io_write_types.h index 6e878a6f..3ef6df91 100644 --- a/libbcachefs/io_write_types.h +++ b/libbcachefs/io_write_types.h @@ -64,7 +64,7 @@ struct bch_write_op { struct bpos pos; struct bversion version; - /* For BCH_WRITE_DATA_ENCODED: */ + /* For BCH_WRITE_data_encoded: */ struct bch_extent_crc_unpacked crc; struct write_point_specifier write_point; diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 2cd20114..20b748f6 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -113,11 +113,10 @@ journal_seq_to_buf(struct journal *j, u64 seq) static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) { - unsigned i; - - for (i = 0; i < ARRAY_SIZE(p->list); i++) - INIT_LIST_HEAD(&p->list[i]); - INIT_LIST_HEAD(&p->flushed); + for (unsigned i = 0; i < ARRAY_SIZE(p->unflushed); i++) + INIT_LIST_HEAD(&p->unflushed[i]); + for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++) + INIT_LIST_HEAD(&p->flushed[i]); atomic_set(&p->count, count); p->devs.nr = 0; } @@ -307,7 +306,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t bch2_journal_space_available(j); - __bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq)); + __bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq)); } void bch2_journal_halt(struct journal *j) @@ -320,6 +319,16 @@ void bch2_journal_halt(struct journal *j) spin_unlock(&j->lock); } +void bch2_journal_halt_locked(struct journal *j) +{ + lockdep_assert_held(&j->lock); + + __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true); + if (!j->err_seq) + j->err_seq = journal_cur_seq(j); + journal_wake(j); +} + static bool journal_entry_want_write(struct journal *j) { bool ret = !journal_entry_is_open(j) || @@ -382,9 +391,12 @@ static int journal_entry_open(struct journal *j) if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) return JOURNAL_ERR_max_in_flight; - if (bch2_fs_fatal_err_on(journal_cur_seq(j) >= JOURNAL_SEQ_MAX, - c, "cannot start: journal seq overflow")) + if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) { + bch_err(c, "cannot start: journal seq overflow"); + if (bch2_fs_emergency_read_only_locked(c)) + bch_err(c, "fatal error - emergency read only"); return JOURNAL_ERR_insufficient_devices; /* -EROFS */ + } BUG_ON(!j->cur_entry_sectors); @@ -601,6 +613,16 @@ out: : -BCH_ERR_journal_res_get_blocked; } +static unsigned max_dev_latency(struct bch_fs *c) +{ + u64 nsecs = 0; + + for_each_rw_member(c, ca) + nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration); + + return nsecs_to_jiffies(nsecs); +} + /* * Essentially the entry function to the journaling code. When bcachefs is doing * a btree insert, it calls this function to get the current journal write. @@ -612,17 +634,31 @@ out: * btree node write locks. */ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, - unsigned flags) + unsigned flags, + struct btree_trans *trans) { int ret; if (closure_wait_event_timeout(&j->async_wait, (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || (flags & JOURNAL_RES_GET_NONBLOCK), - HZ * 10)) + HZ)) return ret; + if (trans) + bch2_trans_unlock_long(trans); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + int remaining_wait = max(max_dev_latency(c) * 2, HZ * 10); + + remaining_wait = max(0, remaining_wait - HZ); + + if (closure_wait_event_timeout(&j->async_wait, + (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + (flags & JOURNAL_RES_GET_NONBLOCK), + remaining_wait)) + return ret; + struct printbuf buf = PRINTBUF; bch2_journal_debug_to_text(&buf, j); bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s", @@ -727,7 +763,7 @@ recheck_need_open: * livelock: */ sched_annotate_sleep(); - ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); + ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); if (ret) return ret; @@ -760,6 +796,7 @@ recheck_need_open: } buf->must_flush = true; + j->flushing_seq = max(j->flushing_seq, seq); if (parent && !closure_wait(&buf->wait, parent)) BUG(); @@ -848,7 +885,7 @@ out: static int __bch2_journal_meta(struct journal *j) { struct journal_res res = {}; - int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); + int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); if (ret) return ret; @@ -1345,8 +1382,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) set_bit(JOURNAL_running, &j->flags); j->last_flush_write = jiffies; - j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); - j->reservations.unwritten_idx++; + j->reservations.idx = journal_cur_seq(j); c->last_bucket_seq_cleanup = journal_cur_seq(j); @@ -1602,54 +1638,3 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) __bch2_journal_debug_to_text(out, j); spin_unlock(&j->lock); } - -bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) -{ - struct journal_entry_pin_list *pin_list; - struct journal_entry_pin *pin; - - spin_lock(&j->lock); - if (!test_bit(JOURNAL_running, &j->flags)) { - spin_unlock(&j->lock); - return true; - } - - *seq = max(*seq, j->pin.front); - - if (*seq >= j->pin.back) { - spin_unlock(&j->lock); - return true; - } - - out->atomic++; - - pin_list = journal_seq_pin(j, *seq); - - prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count)); - printbuf_indent_add(out, 2); - - for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++) - list_for_each_entry(pin, &pin_list->list[i], list) - prt_printf(out, "\t%px %ps\n", pin, pin->flush); - - if (!list_empty(&pin_list->flushed)) - prt_printf(out, "flushed:\n"); - - list_for_each_entry(pin, &pin_list->flushed, list) - prt_printf(out, "\t%px %ps\n", pin, pin->flush); - - printbuf_indent_sub(out, 2); - - --out->atomic; - spin_unlock(&j->lock); - - return false; -} - -void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) -{ - u64 seq = 0; - - while (!bch2_journal_seq_pins_to_text(out, j, &seq)) - seq++; -} diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index cb0df066..e514d664 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -193,7 +193,7 @@ bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) static inline struct jset_entry * journal_res_entry(struct journal *j, struct journal_res *res) { - return vstruct_idx(j->buf[res->idx].data, res->offset); + return vstruct_idx(j->buf[res->seq & JOURNAL_BUF_MASK].data, res->offset); } static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type, @@ -267,8 +267,9 @@ bool bch2_journal_entry_close(struct journal *); void bch2_journal_do_writes(struct journal *); void bch2_journal_buf_put_final(struct journal *, u64); -static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) +static inline void __bch2_journal_buf_put(struct journal *j, u64 seq) { + unsigned idx = seq & JOURNAL_BUF_MASK; union journal_res_state s; s = journal_state_buf_put(j, idx); @@ -276,8 +277,9 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s bch2_journal_buf_put_final(j, seq); } -static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) +static inline void bch2_journal_buf_put(struct journal *j, u64 seq) { + unsigned idx = seq & JOURNAL_BUF_MASK; union journal_res_state s; s = journal_state_buf_put(j, idx); @@ -306,13 +308,13 @@ static inline void bch2_journal_res_put(struct journal *j, BCH_JSET_ENTRY_btree_keys, 0, 0, 0); - bch2_journal_buf_put(j, res->idx, res->seq); + bch2_journal_buf_put(j, res->seq); res->ref = 0; } int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, - unsigned); + unsigned, struct btree_trans *); /* First bits for BCH_WATERMARK: */ enum journal_res_flags { @@ -361,14 +363,17 @@ static inline int journal_res_get_fast(struct journal *j, &old.v, new.v)); res->ref = true; - res->idx = old.idx; res->offset = old.cur_entry_offset; - res->seq = le64_to_cpu(j->buf[old.idx].data->seq); + res->seq = journal_cur_seq(j); + res->seq -= (res->seq - old.idx) & JOURNAL_BUF_MASK; + + EBUG_ON(res->seq != le64_to_cpu(j->buf[old.idx].data->seq)); return 1; } static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, - unsigned u64s, unsigned flags) + unsigned u64s, unsigned flags, + struct btree_trans *trans) { int ret; @@ -380,7 +385,7 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re if (journal_res_get_fast(j, res, flags)) goto out; - ret = bch2_journal_res_get_slowpath(j, res, flags); + ret = bch2_journal_res_get_slowpath(j, res, flags, trans); if (ret) return ret; out: @@ -408,6 +413,7 @@ bool bch2_journal_noflush_seq(struct journal *, u64, u64); int bch2_journal_meta(struct journal *); void bch2_journal_halt(struct journal *); +void bch2_journal_halt_locked(struct journal *); static inline int bch2_journal_error(struct journal *j) { @@ -429,8 +435,6 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); -void bch2_journal_pins_to_text(struct printbuf *, struct journal *); -bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned nr); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 7f2efe85..f2ff28e6 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -17,6 +17,7 @@ #include "sb-clean.h" #include "trace.h" +#include <linux/ioprio.h> #include <linux/string_choices.h> void bch2_journal_pos_from_member_info_set(struct bch_fs *c) @@ -1610,7 +1611,6 @@ static CLOSURE_CALLBACK(journal_write_done) struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_replicas_padded replicas; - union journal_res_state old, new; u64 seq = le64_to_cpu(w->data->seq); int err = 0; @@ -1670,16 +1670,6 @@ static CLOSURE_CALLBACK(journal_write_done) if (j->watermark != BCH_WATERMARK_stripe) journal_reclaim_kick(&c->journal); - old.v = atomic64_read(&j->reservations.counter); - do { - new.v = old.v; - BUG_ON(journal_state_count(new, new.unwritten_idx)); - BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); - - new.unwritten_idx++; - } while (!atomic64_try_cmpxchg(&j->reservations.counter, - &old.v, new.v)); - closure_wake_up(&w->wait); completed = true; } @@ -1694,7 +1684,7 @@ static CLOSURE_CALLBACK(journal_write_done) } if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && - new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { + j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { struct journal_buf *buf = journal_cur_buf(j); long delta = buf->expires - jiffies; @@ -1763,6 +1753,7 @@ static CLOSURE_CALLBACK(journal_write_submit) bio->bi_iter.bi_sector = ptr->offset; bio->bi_end_io = journal_write_endio; bio->bi_private = ca; + bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0); BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); ca->prev_journal_sector = bio->bi_iter.bi_sector; diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 3c824260..6a9cefb6 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -327,8 +327,10 @@ void bch2_journal_reclaim_fast(struct journal *j) popped = true; } - if (popped) + if (popped) { bch2_journal_space_available(j); + __closure_wake_up(&j->reclaim_flush_wait); + } } bool __bch2_journal_pin_put(struct journal *j, u64 seq) @@ -362,6 +364,9 @@ static inline bool __journal_pin_drop(struct journal *j, pin->seq = 0; list_del_init(&pin->list); + if (j->reclaim_flush_wait.list.first) + __closure_wake_up(&j->reclaim_flush_wait); + /* * Unpinning a journal entry may make journal_next_bucket() succeed, if * writing a new last_seq will now make another bucket available: @@ -383,11 +388,11 @@ static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) { if (fn == bch2_btree_node_flush0 || fn == bch2_btree_node_flush1) - return JOURNAL_PIN_btree; + return JOURNAL_PIN_TYPE_btree; else if (fn == bch2_btree_key_cache_journal_flush) - return JOURNAL_PIN_key_cache; + return JOURNAL_PIN_TYPE_key_cache; else - return JOURNAL_PIN_other; + return JOURNAL_PIN_TYPE_other; } static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq, @@ -406,7 +411,12 @@ static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq, atomic_inc(&pin_list->count); pin->seq = seq; pin->flush = flush_fn; - list_add(&pin->list, &pin_list->list[type]); + + if (list_empty(&pin_list->unflushed[type]) && + j->reclaim_flush_wait.list.first) + __closure_wake_up(&j->reclaim_flush_wait); + + list_add(&pin->list, &pin_list->unflushed[type]); } void bch2_journal_pin_copy(struct journal *j, @@ -499,16 +509,15 @@ journal_get_next_pin(struct journal *j, { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *ret = NULL; - unsigned i; fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) { if (*seq > seq_to_flush && !allowed_above_seq) break; - for (i = 0; i < JOURNAL_PIN_NR; i++) - if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) || - ((1U << i) & allowed_above_seq)) { - ret = list_first_entry_or_null(&pin_list->list[i], + for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) + if (((BIT(i) & allowed_below_seq) && *seq <= seq_to_flush) || + (BIT(i) & allowed_above_seq)) { + ret = list_first_entry_or_null(&pin_list->unflushed[i], struct journal_entry_pin, list); if (ret) return ret; @@ -544,8 +553,8 @@ static size_t journal_flush_pins(struct journal *j, } if (min_key_cache) { - allowed_above |= 1U << JOURNAL_PIN_key_cache; - allowed_below |= 1U << JOURNAL_PIN_key_cache; + allowed_above |= BIT(JOURNAL_PIN_TYPE_key_cache); + allowed_below |= BIT(JOURNAL_PIN_TYPE_key_cache); } cond_resched(); @@ -553,7 +562,9 @@ static size_t journal_flush_pins(struct journal *j, j->last_flushed = jiffies; spin_lock(&j->lock); - pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq); + pin = journal_get_next_pin(j, seq_to_flush, + allowed_below, + allowed_above, &seq); if (pin) { BUG_ON(j->flush_in_progress); j->flush_in_progress = pin; @@ -576,7 +587,7 @@ static size_t journal_flush_pins(struct journal *j, spin_lock(&j->lock); /* Pin might have been dropped or rearmed: */ if (likely(!err && !j->flush_in_progress_dropped)) - list_move(&pin->list, &journal_seq_pin(j, seq)->flushed); + list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(flush_fn)]); j->flush_in_progress = NULL; j->flush_in_progress_dropped = false; spin_unlock(&j->lock); @@ -816,10 +827,41 @@ int bch2_journal_reclaim_start(struct journal *j) return 0; } +static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush, + unsigned types) +{ + struct journal_entry_pin_list *pin_list; + u64 seq; + + spin_lock(&j->lock); + fifo_for_each_entry_ptr(pin_list, &j->pin, seq) { + if (seq > seq_to_flush) + break; + + for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) + if ((BIT(i) & types) && + (!list_empty(&pin_list->unflushed[i]) || + !list_empty(&pin_list->flushed[i]))) { + spin_unlock(&j->lock); + return true; + } + } + spin_unlock(&j->lock); + + return false; +} + +static bool journal_flush_pins_or_still_flushing(struct journal *j, u64 seq_to_flush, + unsigned types) +{ + return journal_flush_pins(j, seq_to_flush, types, 0, 0, 0) || + journal_pins_still_flushing(j, seq_to_flush, types); +} + static int journal_flush_done(struct journal *j, u64 seq_to_flush, bool *did_work) { - int ret; + int ret = 0; ret = bch2_journal_error(j); if (ret) @@ -827,12 +869,18 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, mutex_lock(&j->reclaim_lock); - if (journal_flush_pins(j, seq_to_flush, - (1U << JOURNAL_PIN_key_cache)| - (1U << JOURNAL_PIN_other), 0, 0, 0) || - journal_flush_pins(j, seq_to_flush, - (1U << JOURNAL_PIN_btree), 0, 0, 0)) + if (journal_flush_pins_or_still_flushing(j, seq_to_flush, + BIT(JOURNAL_PIN_TYPE_key_cache)| + BIT(JOURNAL_PIN_TYPE_other))) { *did_work = true; + goto unlock; + } + + if (journal_flush_pins_or_still_flushing(j, seq_to_flush, + BIT(JOURNAL_PIN_TYPE_btree))) { + *did_work = true; + goto unlock; + } if (seq_to_flush > journal_cur_seq(j)) bch2_journal_entry_close(j); @@ -847,6 +895,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, !fifo_used(&j->pin); spin_unlock(&j->lock); +unlock: mutex_unlock(&j->reclaim_lock); return ret; @@ -860,7 +909,7 @@ bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) if (!test_bit(JOURNAL_running, &j->flags)) return false; - closure_wait_event(&j->async_wait, + closure_wait_event(&j->reclaim_flush_wait, journal_flush_done(j, seq_to_flush, &did_work)); return did_work; @@ -926,3 +975,54 @@ err: return ret; } + +bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) +{ + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *pin; + + spin_lock(&j->lock); + if (!test_bit(JOURNAL_running, &j->flags)) { + spin_unlock(&j->lock); + return true; + } + + *seq = max(*seq, j->pin.front); + + if (*seq >= j->pin.back) { + spin_unlock(&j->lock); + return true; + } + + out->atomic++; + + pin_list = journal_seq_pin(j, *seq); + + prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count)); + printbuf_indent_add(out, 2); + + prt_printf(out, "unflushed:\n"); + for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++) + list_for_each_entry(pin, &pin_list->unflushed[i], list) + prt_printf(out, "\t%px %ps\n", pin, pin->flush); + + prt_printf(out, "flushed:\n"); + for (unsigned i = 0; i < ARRAY_SIZE(pin_list->flushed); i++) + list_for_each_entry(pin, &pin_list->flushed[i], list) + prt_printf(out, "\t%px %ps\n", pin, pin->flush); + + printbuf_indent_sub(out, 2); + + --out->atomic; + spin_unlock(&j->lock); + + return false; +} + +void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) +{ + u64 seq = 0; + + while (!bch2_journal_seq_pins_to_text(out, j, &seq)) + seq++; +} diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h index ec84c334..0a73d713 100644 --- a/libbcachefs/journal_reclaim.h +++ b/libbcachefs/journal_reclaim.h @@ -78,4 +78,7 @@ static inline bool bch2_journal_flush_all_pins(struct journal *j) int bch2_journal_flush_device_pins(struct journal *, int); +void bch2_journal_pins_to_text(struct printbuf *, struct journal *); +bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); + #endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index e9bd716f..6a098c7e 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -53,15 +53,15 @@ struct journal_buf { */ enum journal_pin_type { - JOURNAL_PIN_btree, - JOURNAL_PIN_key_cache, - JOURNAL_PIN_other, - JOURNAL_PIN_NR, + JOURNAL_PIN_TYPE_btree, + JOURNAL_PIN_TYPE_key_cache, + JOURNAL_PIN_TYPE_other, + JOURNAL_PIN_TYPE_NR, }; struct journal_entry_pin_list { - struct list_head list[JOURNAL_PIN_NR]; - struct list_head flushed; + struct list_head unflushed[JOURNAL_PIN_TYPE_NR]; + struct list_head flushed[JOURNAL_PIN_TYPE_NR]; atomic_t count; struct bch_devs_list devs; }; @@ -79,7 +79,6 @@ struct journal_entry_pin { struct journal_res { bool ref; - u8 idx; u16 u64s; u32 offset; u64 seq; @@ -95,9 +94,8 @@ union journal_res_state { }; struct { - u64 cur_entry_offset:20, + u64 cur_entry_offset:22, idx:2, - unwritten_idx:2, buf0_count:10, buf1_count:10, buf2_count:10, @@ -107,13 +105,13 @@ union journal_res_state { /* bytes: */ #define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ -#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ +#define JOURNAL_ENTRY_SIZE_MAX (4U << 22) /* 16M */ /* * We stash some journal state as sentinal values in cur_entry_offset: * note - cur_entry_offset is in units of u64s */ -#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) +#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 22) - 1) #define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2) #define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) @@ -226,6 +224,7 @@ struct journal { /* Used when waiting because the journal was full */ wait_queue_head_t wait; struct closure_waitlist async_wait; + struct closure_waitlist reclaim_flush_wait; struct delayed_work write_work; struct workqueue_struct *wq; @@ -236,6 +235,7 @@ struct journal { /* seq, last_seq from the most recent journal entry successfully written */ u64 seq_ondisk; u64 flushed_seq_ondisk; + u64 flushing_seq; u64 last_seq_ondisk; u64 err_seq; u64 last_empty_seq; diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 6c6ece52..e2581686 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -74,11 +74,7 @@ struct moving_io { unsigned read_sectors; unsigned write_sectors; - struct bch_read_bio rbio; - struct data_update write; - /* Must be last since it is variable size */ - struct bio_vec bi_inline_vecs[]; }; static void move_free(struct moving_io *io) @@ -113,7 +109,20 @@ static void move_write_done(struct bch_write_op *op) static void move_write(struct moving_io *io) { - if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { + struct moving_context *ctxt = io->write.ctxt; + + if (ctxt->stats) { + if (io->write.rbio.bio.bi_status) + atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, + &ctxt->stats->sectors_error_uncorrected); + else if (io->write.rbio.saw_error) + atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, + &ctxt->stats->sectors_error_corrected); + } + + if (unlikely(io->write.rbio.bio.bi_status || + io->write.rbio.hole || + io->write.data_opts.scrub)) { move_free(io); return; } @@ -131,7 +140,7 @@ static void move_write(struct moving_io *io) atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); atomic_inc(&io->write.ctxt->write_ios); - bch2_data_update_read_done(&io->write, io->rbio.pick.crc); + bch2_data_update_read_done(&io->write); } struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt) @@ -144,7 +153,7 @@ struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctx static void move_read_endio(struct bio *bio) { - struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); + struct moving_io *io = container_of(bio, struct moving_io, write.rbio.bio); struct moving_context *ctxt = io->write.ctxt; atomic_sub(io->read_sectors, &ctxt->read_sectors); @@ -257,11 +266,6 @@ int bch2_move_extent(struct moving_context *ctxt, { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct moving_io *io; - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned sectors = k.k->size, pages; int ret = -ENOMEM; trace_move_extent2(c, k, &io_opts, &data_opts); @@ -272,7 +276,8 @@ int bch2_move_extent(struct moving_context *ctxt, bch2_data_update_opts_normalize(k, &data_opts); if (!data_opts.rewrite_ptrs && - !data_opts.extra_replicas) { + !data_opts.extra_replicas && + !data_opts.scrub) { if (data_opts.kill_ptrs) return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts); return 0; @@ -284,13 +289,7 @@ int bch2_move_extent(struct moving_context *ctxt, */ bch2_trans_unlock(trans); - /* write path might have to decompress data: */ - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); - - pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); - io = kzalloc(sizeof(struct moving_io) + - sizeof(struct bio_vec) * pages, GFP_KERNEL); + struct moving_io *io = kzalloc(sizeof(struct moving_io), GFP_KERNEL); if (!io) goto err; @@ -299,31 +298,21 @@ int bch2_move_extent(struct moving_context *ctxt, io->read_sectors = k.k->size; io->write_sectors = k.k->size; - bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0); - bio_set_prio(&io->write.op.wbio.bio, - IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + if (!data_opts.scrub) { + ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, + io_opts, data_opts, iter->btree_id, k); + if (ret) + goto err_free; - if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, - GFP_KERNEL)) - goto err_free; + io->write.op.end_io = move_write_done; + } else { + bch2_bkey_buf_init(&io->write.k); + io->write.op.c = c; + io->write.data_opts = data_opts; + } - io->rbio.c = c; - io->rbio.opts = io_opts; - bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0); - io->rbio.bio.bi_vcnt = pages; - bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); - io->rbio.bio.bi_iter.bi_size = sectors << 9; - - io->rbio.bio.bi_opf = REQ_OP_READ; - io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); - io->rbio.bio.bi_end_io = move_read_endio; - - ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, - io_opts, data_opts, iter->btree_id, k); - if (ret) - goto err_free_pages; - - io->write.op.end_io = move_write_done; + io->write.rbio.bio.bi_end_io = move_read_endio; + io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); if (ctxt->rate) bch2_ratelimit_increment(ctxt->rate, k.k->size); @@ -355,18 +344,19 @@ int bch2_move_extent(struct moving_context *ctxt, * ctxt when doing wakeup */ closure_get(&ctxt->cl); - bch2_read_extent(trans, &io->rbio, - bkey_start_pos(k.k), - iter->btree_id, k, 0, - BCH_READ_NODECODE| - BCH_READ_LAST_FRAGMENT); + __bch2_read_extent(trans, &io->write.rbio, + io->write.rbio.bio.bi_iter, + bkey_start_pos(k.k), + iter->btree_id, k, 0, + NULL, + BCH_READ_data_update| + BCH_READ_last_fragment, + data_opts.scrub ? data_opts.read_dev : -1); return 0; -err_free_pages: - bio_free_pages(&io->write.op.wbio.bio); err_free: kfree(io); err: - if (ret == -BCH_ERR_data_update_done) + if (bch2_err_matches(ret, BCH_ERR_data_update_done)) return 0; if (bch2_err_matches(ret, EROFS) || @@ -626,7 +616,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) continue; - if (ret2 == -ENOMEM) { + if (bch2_err_matches(ret2, ENOMEM)) { /* memory allocation failure, wait for some IO to finish */ bch2_move_ctxt_wait_for_io(ctxt); continue; @@ -688,21 +678,22 @@ int bch2_move_data(struct bch_fs *c, bool wait_on_copygc, move_pred_fn pred, void *arg) { - struct moving_context ctxt; - int ret; bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - ret = __bch2_move_data(&ctxt, start, end, pred, arg); + int ret = __bch2_move_data(&ctxt, start, end, pred, arg); bch2_moving_ctxt_exit(&ctxt); return ret; } -int bch2_evacuate_bucket(struct moving_context *ctxt, - struct move_bucket_in_flight *bucket_in_flight, - struct bpos bucket, int gen, - struct data_update_opts _data_opts) +static int __bch2_move_data_phys(struct moving_context *ctxt, + struct move_bucket_in_flight *bucket_in_flight, + unsigned dev, + u64 bucket_start, + u64 bucket_end, + unsigned data_types, + move_pred_fn pred, void *arg) { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; @@ -711,16 +702,20 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct btree_iter iter = {}, bp_iter = {}; struct bkey_buf sk; struct bkey_s_c k; - struct data_update_opts data_opts; unsigned sectors_moved = 0; struct bkey_buf last_flushed; int ret = 0; - struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); + struct bch_dev *ca = bch2_dev_tryget(c, dev); if (!ca) return 0; - trace_bucket_evacuate(c, &bucket); + bucket_end = min(bucket_end, ca->mi.nbuckets); + + struct bpos bp_start = bucket_pos_to_bp_start(ca, POS(dev, bucket_start)); + struct bpos bp_end = bucket_pos_to_bp_end(ca, POS(dev, bucket_end)); + bch2_dev_put(ca); + ca = NULL; bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); @@ -731,8 +726,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, */ bch2_trans_begin(trans); - bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp_start(ca, bucket), 0); + bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0); bch_err_msg(c, ret, "looking up alloc key"); if (ret) @@ -756,7 +750,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, if (ret) goto err; - if (!k.k || bkey_gt(k.k->p, bucket_pos_to_bp_end(ca, bucket))) + if (!k.k || bkey_gt(k.k->p, bp_end)) break; if (k.k->type != KEY_TYPE_backpointer) @@ -764,107 +758,145 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + if (ctxt->stats) + ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; + + if (!(data_types & BIT(bp.v->data_type))) + goto next; + + k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + if (!k.k) + goto next; + if (!bp.v->level) { - k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; - if (!k.k) - goto next; - - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k); if (ret) { bch2_trans_iter_exit(trans, &iter); continue; } - - data_opts = _data_opts; - data_opts.target = io_opts.background_target; - data_opts.rewrite_ptrs = 0; - - unsigned sectors = bp.v->bucket_len; /* move_extent will drop locks */ - unsigned i = 0; - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { - if (p.ptr.dev == bucket.inode) { - if (p.ptr.cached) { - bch2_trans_iter_exit(trans, &iter); - goto next; - } - data_opts.rewrite_ptrs |= 1U << i; - break; - } - i++; - } - - ret = bch2_move_extent(ctxt, bucket_in_flight, - &iter, k, io_opts, data_opts); - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret == -ENOMEM) { - /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(ctxt); - continue; - } - if (ret) - goto err; - - if (ctxt->stats) - atomic64_add(sectors, &ctxt->stats->sectors_seen); - sectors_moved += sectors; - } else { - struct btree *b; - - b = bch2_backpointer_get_node(trans, bp, &iter, &last_flushed); - ret = PTR_ERR_OR_ZERO(b); - if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) - goto next; - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; - if (!b) - goto next; - - unsigned sectors = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); - - ret = bch2_btree_node_rewrite(trans, &iter, b, 0); - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; - - if (ctxt->rate) - bch2_ratelimit_increment(ctxt->rate, sectors); - if (ctxt->stats) { - atomic64_add(sectors, &ctxt->stats->sectors_seen); - atomic64_add(sectors, &ctxt->stats->sectors_moved); - } - sectors_moved += btree_sectors(c); } + + struct data_update_opts data_opts = {}; + if (!pred(c, arg, k, &io_opts, &data_opts)) { + bch2_trans_iter_exit(trans, &iter); + goto next; + } + + if (data_opts.scrub && + !bch2_dev_idx_is_online(c, data_opts.read_dev)) { + bch2_trans_iter_exit(trans, &iter); + ret = -BCH_ERR_device_offline; + break; + } + + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + + /* move_extent will drop locks */ + unsigned sectors = bp.v->bucket_len; + + if (!bp.v->level) + ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts); + else if (!data_opts.scrub) + ret = bch2_btree_node_rewrite_key(trans, bp.v->btree_id, bp.v->level, k.k->p, 0); + else + ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev); + + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret == -ENOMEM) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(ctxt); + continue; + } + if (ret) + goto err; + + if (ctxt->stats) + atomic64_add(sectors, &ctxt->stats->sectors_seen); + sectors_moved += sectors; next: bch2_btree_iter_advance(&bp_iter); } - - trace_evacuate_bucket(c, &bucket, sectors_moved, ca->mi.bucket_size, ret); err: bch2_trans_iter_exit(trans, &bp_iter); - bch2_dev_put(ca); bch2_bkey_buf_exit(&sk, c); bch2_bkey_buf_exit(&last_flushed, c); return ret; } +static int bch2_move_data_phys(struct bch_fs *c, + unsigned dev, + u64 start, + u64 end, + unsigned data_types, + struct bch_ratelimit *rate, + struct bch_move_stats *stats, + struct write_point_specifier wp, + bool wait_on_copygc, + move_pred_fn pred, void *arg) +{ + struct moving_context ctxt; + + bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans)); + + bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); + ctxt.stats->phys = true; + + int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, data_types, pred, arg); + bch2_moving_ctxt_exit(&ctxt); + + return ret; +} + +struct evacuate_bucket_arg { + struct bpos bucket; + int gen; + struct data_update_opts data_opts; +}; + +static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + struct evacuate_bucket_arg *arg = _arg; + + *data_opts = arg->data_opts; + + unsigned i = 0; + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { + if (ptr->dev == arg->bucket.inode && + (arg->gen < 0 || arg->gen == ptr->gen) && + !ptr->cached) + data_opts->rewrite_ptrs |= BIT(i); + i++; + } + + return data_opts->rewrite_ptrs != 0; +} + +int bch2_evacuate_bucket(struct moving_context *ctxt, + struct move_bucket_in_flight *bucket_in_flight, + struct bpos bucket, int gen, + struct data_update_opts data_opts) +{ + struct evacuate_bucket_arg arg = { bucket, gen, data_opts, }; + + return __bch2_move_data_phys(ctxt, bucket_in_flight, + bucket.inode, + bucket.offset, + bucket.offset + 1, + ~0, + evacuate_bucket_pred, &arg); +} + typedef bool (*move_btree_pred)(struct bch_fs *, void *, struct btree *, struct bch_io_opts *, struct data_update_opts *); @@ -1103,6 +1135,30 @@ static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); } +static bool scrub_pred(struct bch_fs *c, void *_arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + struct bch_ioctl_data *arg = _arg; + + if (k.k->type != KEY_TYPE_btree_ptr_v2) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (p.ptr.dev == arg->migrate.dev) { + if (!p.crc.csum_type) + return false; + break; + } + } + + data_opts->scrub = true; + data_opts->read_dev = arg->migrate.dev; + return true; +} + int bch2_data_job(struct bch_fs *c, struct bch_move_stats *stats, struct bch_ioctl_data op) @@ -1117,6 +1173,16 @@ int bch2_data_job(struct bch_fs *c, bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]); switch (op.op) { + case BCH_DATA_OP_scrub: + ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX, + op.scrub.data_types, + NULL, + stats, + writepoint_hashed((unsigned long) current), + false, + scrub_pred, &op) ?: ret; + break; + case BCH_DATA_OP_rereplicate: stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, -1); @@ -1215,7 +1281,7 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str mutex_lock(&ctxt->lock); list_for_each_entry(io, &ctxt->ios, io_list) - bch2_data_update_to_text(out, &io->write); + bch2_data_update_inflight_to_text(out, &io->write); mutex_unlock(&ctxt->lock); printbuf_indent_sub(out, 4); diff --git a/libbcachefs/move_types.h b/libbcachefs/move_types.h index e22841ef..82e473ed 100644 --- a/libbcachefs/move_types.h +++ b/libbcachefs/move_types.h @@ -3,17 +3,31 @@ #define _BCACHEFS_MOVE_TYPES_H #include "bbpos_types.h" +#include "bcachefs_ioctl.h" struct bch_move_stats { - enum bch_data_type data_type; - struct bbpos pos; char name[32]; + bool phys; + enum bch_ioctl_data_event_ret ret; + + union { + struct { + enum bch_data_type data_type; + struct bbpos pos; + }; + struct { + unsigned dev; + u64 offset; + }; + }; atomic64_t keys_moved; atomic64_t keys_raced; atomic64_t sectors_seen; atomic64_t sectors_moved; atomic64_t sectors_raced; + atomic64_t sectors_error_corrected; + atomic64_t sectors_error_uncorrected; }; struct move_bucket_key { diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 85c361e7..21805509 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -215,7 +215,8 @@ static int bch2_copygc(struct moving_context *ctxt, }; move_buckets buckets = { 0 }; struct move_bucket_in_flight *f; - u64 moved = atomic64_read(&ctxt->stats->sectors_moved); + u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen); + u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved); int ret = 0; ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets); @@ -245,7 +246,6 @@ static int bch2_copygc(struct moving_context *ctxt, *did_work = true; } err: - darray_exit(&buckets); /* no entries in LRU btree found, or got to end: */ if (bch2_err_matches(ret, ENOENT)) @@ -254,8 +254,11 @@ err: if (ret < 0 && !bch2_err_matches(ret, EROFS)) bch_err_msg(c, ret, "from bch2_move_data()"); - moved = atomic64_read(&ctxt->stats->sectors_moved) - moved; - trace_and_count(c, copygc, c, moved, 0, 0, 0); + sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen; + sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved; + trace_and_count(c, copygc, c, buckets.nr, sectors_seen, sectors_moved); + + darray_exit(&buckets); return ret; } diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index e763d52e..9d397fc2 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -476,13 +476,13 @@ enum fsck_err_opts { NULL, "Enable nocow mode: enables runtime locking in\n"\ "data move path needed if nocow will ever be in use\n")\ x(copygc_enabled, u8, \ - OPT_FS|OPT_MOUNT, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Enable copygc: disable for debugging, or to\n"\ "quiet the system when doing performance testing\n")\ x(rebalance_enabled, u8, \ - OPT_FS|OPT_MOUNT, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Enable rebalance: disable for debugging, or to\n"\ @@ -659,18 +659,4 @@ static inline void bch2_io_opts_fixups(struct bch_io_opts *opts) struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); bool bch2_opt_is_inode_opt(enum bch_opt_id); -/* rebalance opts: */ - -static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_io_opts *opts) -{ - return (struct bch_extent_rebalance) { - .type = BIT(BCH_EXTENT_ENTRY_rebalance), -#define x(_name) \ - ._name = opts->_name, \ - ._name##_from_inode = opts->_name##_from_inode, - BCH_REBALANCE_OPTS() -#undef x - }; -}; - #endif /* _BCACHEFS_OPTS_H */ diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c index 4adc74cd..58f6d97e 100644 --- a/libbcachefs/rebalance.c +++ b/libbcachefs/rebalance.c @@ -121,12 +121,10 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) } } incompressible: - if (opts->background_target && - bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) { + if (opts->background_target) bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) sectors += p.crc.compressed_size; - } return sectors; } @@ -140,7 +138,7 @@ static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opt const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) { - struct bch_extent_rebalance new = io_opts_to_rebalance_opts(opts); + struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, opts); return old == NULL || memcmp(old, &new, sizeof(new)); } else { return old != NULL; @@ -163,7 +161,7 @@ int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, k.k->u64s += sizeof(*old) / sizeof(u64); } - *old = io_opts_to_rebalance_opts(opts); + *old = io_opts_to_rebalance_opts(c, opts); } else { if (old) extent_entry_drop(k, (union bch_extent_entry *) old); @@ -343,7 +341,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, memset(data_opts, 0, sizeof(*data_opts)); data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); data_opts->target = io_opts->background_target; - data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; + data_opts->write_flags |= BCH_WRITE_only_specified_devs; if (!data_opts->rewrite_ptrs) { /* @@ -451,7 +449,7 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, { data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); data_opts->target = io_opts->background_target; - data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; + data_opts->write_flags |= BCH_WRITE_only_specified_devs; return data_opts->rewrite_ptrs != 0; } diff --git a/libbcachefs/rebalance.h b/libbcachefs/rebalance.h index 0a0821ab..62a3859d 100644 --- a/libbcachefs/rebalance.h +++ b/libbcachefs/rebalance.h @@ -4,8 +4,28 @@ #include "compress.h" #include "disk_groups.h" +#include "opts.h" #include "rebalance_types.h" +static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c, + struct bch_io_opts *opts) +{ + struct bch_extent_rebalance r = { + .type = BIT(BCH_EXTENT_ENTRY_rebalance), +#define x(_name) \ + ._name = opts->_name, \ + ._name##_from_inode = opts->_name##_from_inode, + BCH_REBALANCE_OPTS() +#undef x + }; + + if (r.background_target && + !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target)) + r.background_target = 0; + + return r; +}; + u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *); int bch2_get_update_rebalance_opts(struct btree_trans *, diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 98825437..71c786cd 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -32,7 +32,6 @@ #include <linux/sort.h> #include <linux/stat.h> -#define QSTR(n) { { { .len = strlen(n) } }, .name = n } int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) { diff --git a/libbcachefs/sb-counters.c b/libbcachefs/sb-counters.c index 6992e746..2b4b8445 100644 --- a/libbcachefs/sb-counters.c +++ b/libbcachefs/sb-counters.c @@ -5,7 +5,13 @@ /* BCH_SB_FIELD_counters */ -static const char * const bch2_counter_names[] = { +static const u8 counters_to_stable_map[] = { +#define x(n, id, ...) [BCH_COUNTER_##n] = BCH_COUNTER_STABLE_##n, + BCH_PERSISTENT_COUNTERS() +#undef x +}; + +const char * const bch2_counter_names[] = { #define x(t, n, ...) (#t), BCH_PERSISTENT_COUNTERS() #undef x @@ -18,13 +24,13 @@ static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) return 0; return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0]; -}; +} static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f, enum bch_validate_flags flags, struct printbuf *err) { return 0; -}; +} static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) @@ -32,50 +38,56 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field_counters *ctrs = field_to_type(f, counters); unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - for (unsigned i = 0; i < nr; i++) - prt_printf(out, "%s \t%llu\n", - i < BCH_COUNTER_NR ? bch2_counter_names[i] : "(unknown)", - le64_to_cpu(ctrs->d[i])); -}; + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { + unsigned stable = counters_to_stable_map[i]; + if (stable < nr) + prt_printf(out, "%s \t%llu\n", + bch2_counter_names[i], + le64_to_cpu(ctrs->d[stable])); + } +} int bch2_sb_counters_to_cpu(struct bch_fs *c) { struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); - unsigned int i; unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - u64 val = 0; - for (i = 0; i < BCH_COUNTER_NR; i++) + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) c->counters_on_mount[i] = 0; - for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) { - val = le64_to_cpu(ctrs->d[i]); - percpu_u64_set(&c->counters[i], val); - c->counters_on_mount[i] = val; + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { + unsigned stable = counters_to_stable_map[i]; + if (stable < nr) { + u64 v = le64_to_cpu(ctrs->d[stable]); + percpu_u64_set(&c->counters[i], v); + c->counters_on_mount[i] = v; + } } + return 0; -}; +} int bch2_sb_counters_from_cpu(struct bch_fs *c) { struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); struct bch_sb_field_counters *ret; - unsigned int i; unsigned int nr = bch2_sb_counter_nr_entries(ctrs); if (nr < BCH_COUNTER_NR) { ret = bch2_sb_field_resize(&c->disk_sb, counters, - sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); - + sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); if (ret) { ctrs = ret; nr = bch2_sb_counter_nr_entries(ctrs); } } + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { + unsigned stable = counters_to_stable_map[i]; + if (stable < nr) + ctrs->d[stable] = cpu_to_le64(percpu_u64_get(&c->counters[i])); + } - for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) - ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i])); return 0; } @@ -97,3 +109,39 @@ const struct bch_sb_field_ops bch_sb_field_ops_counters = { .validate = bch2_sb_counters_validate, .to_text = bch2_sb_counters_to_text, }; + +#ifndef NO_BCACHEFS_CHARDEV +long bch2_ioctl_query_counters(struct bch_fs *c, + struct bch_ioctl_query_counters __user *user_arg) +{ + struct bch_ioctl_query_counters arg; + int ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)); + if (ret) + return ret; + + if ((arg.flags & ~BCH_IOCTL_QUERY_COUNTERS_MOUNT) || + arg.pad) + return -EINVAL; + + arg.nr = min(arg.nr, BCH_COUNTER_NR); + ret = put_user(arg.nr, &user_arg->nr); + if (ret) + return ret; + + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { + unsigned stable = counters_to_stable_map[i]; + + if (stable < arg.nr) { + u64 v = !(arg.flags & BCH_IOCTL_QUERY_COUNTERS_MOUNT) + ? percpu_u64_get(&c->counters[i]) + : c->counters_on_mount[i]; + + ret = put_user(v, &user_arg->d[stable]); + if (ret) + return ret; + } + } + + return 0; +} +#endif diff --git a/libbcachefs/sb-counters.h b/libbcachefs/sb-counters.h index 81f8aec9..a4329ad8 100644 --- a/libbcachefs/sb-counters.h +++ b/libbcachefs/sb-counters.h @@ -11,6 +11,10 @@ int bch2_sb_counters_from_cpu(struct bch_fs *); void bch2_fs_counters_exit(struct bch_fs *); int bch2_fs_counters_init(struct bch_fs *); +extern const char * const bch2_counter_names[]; extern const struct bch_sb_field_ops bch_sb_field_ops_counters; +long bch2_ioctl_query_counters(struct bch_fs *, + struct bch_ioctl_query_counters __user *); + #endif // _BCACHEFS_SB_COUNTERS_H diff --git a/libbcachefs/sb-counters_format.h b/libbcachefs/sb-counters_format.h index fdcf598f..d0391c5d 100644 --- a/libbcachefs/sb-counters_format.h +++ b/libbcachefs/sb-counters_format.h @@ -13,6 +13,7 @@ enum counters_flags { x(io_move, 2, TYPE_SECTORS) \ x(bucket_invalidate, 3, TYPE_COUNTER) \ x(bucket_discard, 4, TYPE_COUNTER) \ + x(bucket_discard_fast, 79, TYPE_COUNTER) \ x(bucket_alloc, 5, TYPE_COUNTER) \ x(bucket_alloc_fail, 6, TYPE_COUNTER) \ x(btree_cache_scan, 7, TYPE_COUNTER) \ @@ -95,6 +96,13 @@ enum bch_persistent_counters { BCH_COUNTER_NR }; +enum bch_persistent_counters_stable { +#define x(t, n, ...) BCH_COUNTER_STABLE_##t = n, + BCH_PERSISTENT_COUNTERS() +#undef x + BCH_COUNTER_STABLE_NR +}; + struct bch_sb_field_counters { struct bch_sb_field field; __le64 d[]; diff --git a/libbcachefs/sb-errors_format.h b/libbcachefs/sb-errors_format.h index 0b4fe899..ea0a1836 100644 --- a/libbcachefs/sb-errors_format.h +++ b/libbcachefs/sb-errors_format.h @@ -57,7 +57,7 @@ enum bch_fsck_flags { x(bset_wrong_sector_offset, 44, 0) \ x(bset_empty, 45, 0) \ x(bset_bad_seq, 46, 0) \ - x(bset_blacklisted_journal_seq, 47, 0) \ + x(bset_blacklisted_journal_seq, 47, FSCK_AUTOFIX) \ x(first_bset_blacklisted_journal_seq, 48, FSCK_AUTOFIX) \ x(btree_node_bad_btree, 49, 0) \ x(btree_node_bad_level, 50, 0) \ diff --git a/libbcachefs/sb-members.h b/libbcachefs/sb-members.h index 762083b5..b29b6c6c 100644 --- a/libbcachefs/sb-members.h +++ b/libbcachefs/sb-members.h @@ -23,6 +23,18 @@ static inline bool bch2_dev_is_online(struct bch_dev *ca) return !percpu_ref_is_zero(&ca->io_ref); } +static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned); + +static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev) +{ + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, dev); + bool ret = ca && bch2_dev_is_online(ca); + rcu_read_unlock(); + + return ret; +} + static inline bool bch2_dev_is_readable(struct bch_dev *ca) { return bch2_dev_is_online(ca) && diff --git a/libbcachefs/str_hash.c b/libbcachefs/str_hash.c index 8c2c5539..d78451c2 100644 --- a/libbcachefs/str_hash.c +++ b/libbcachefs/str_hash.c @@ -31,11 +31,11 @@ static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dir } } -static int fsck_rename_dirent(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c_dirent old) +static noinline int fsck_rename_dirent(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c_dirent old) { struct qstr old_name = bch2_dirent_get_name(old); struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); @@ -71,11 +71,11 @@ static int fsck_rename_dirent(struct btree_trans *trans, return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); } -static int hash_pick_winner(struct btree_trans *trans, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c k1, - struct bkey_s_c k2) +static noinline int hash_pick_winner(struct btree_trans *trans, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c k1, + struct bkey_s_c k2) { if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) @@ -142,8 +142,8 @@ fsck_err: * All versions of the same inode in different snapshots must have the same hash * seed/type: verify that the hash info we're using matches the root */ -static int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, - struct bch_hash_info *hash_info) +static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, + struct bch_hash_info *hash_info) { struct bch_fs *c = trans->c; struct btree_iter iter; diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c index e3d04752..b7b96283 100644 --- a/libbcachefs/subvolume.c +++ b/libbcachefs/subvolume.c @@ -428,7 +428,7 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots, POS(0, snapid), 0, snapshot); - ret = bkey_err(subvol); + ret = bkey_err(snapshot); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, "missing snapshot %u", snapid); if (ret) @@ -440,6 +440,11 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) bch2_bkey_get_iter_typed(trans, &snapshot_tree_iter, BTREE_ID_snapshot_trees, POS(0, treeid), 0, snapshot_tree); + ret = bkey_err(snapshot_tree); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, + "missing snapshot tree %u", treeid); + if (ret) + goto err; if (le32_to_cpu(snapshot_tree.v->master_subvol) == subvolid) { struct bkey_i_snapshot_tree *snapshot_tree_mut = diff --git a/libbcachefs/super.c b/libbcachefs/super.c index d97ea7bd..6d97d412 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -411,6 +411,17 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c) return ret; } +bool bch2_fs_emergency_read_only_locked(struct bch_fs *c) +{ + bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); + + bch2_journal_halt_locked(&c->journal); + bch2_fs_read_only_async(c); + + wake_up(&bch2_read_only_wait); + return ret; +} + static int bch2_fs_read_write_late(struct bch_fs *c) { int ret; diff --git a/libbcachefs/super.h b/libbcachefs/super.h index fa6d5221..04f8287e 100644 --- a/libbcachefs/super.h +++ b/libbcachefs/super.h @@ -29,6 +29,7 @@ int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); bool bch2_fs_emergency_read_only(struct bch_fs *); +bool bch2_fs_emergency_read_only_locked(struct bch_fs *); void bch2_fs_read_only(struct bch_fs *); int bch2_fs_read_write(struct bch_fs *); diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index a7eb1f51..b3f2c651 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -176,7 +176,6 @@ read_attribute(btree_reserve_cache); read_attribute(stripes_heap); read_attribute(open_buckets); read_attribute(open_buckets_partial); -read_attribute(write_points); read_attribute(nocow_lock_table); #ifdef BCH_WRITE_REF_DEBUG @@ -364,9 +363,6 @@ SHOW(bch2_fs) if (attr == &sysfs_open_buckets_partial) bch2_open_buckets_partial_to_text(out, c); - if (attr == &sysfs_write_points) - bch2_write_points_to_text(out, c); - if (attr == &sysfs_compression_stats) bch2_compression_stats_to_text(out, c); @@ -569,7 +565,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_stripes_heap, &sysfs_open_buckets, &sysfs_open_buckets_partial, - &sysfs_write_points, #ifdef BCH_WRITE_REF_DEBUG &sysfs_write_refs, #endif diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h index 9d40b7d4..2f25dcfc 100644 --- a/libbcachefs/trace.h +++ b/libbcachefs/trace.h @@ -727,7 +727,7 @@ DEFINE_EVENT(fs_str, bucket_alloc_fail, TP_ARGS(c, str) ); -TRACE_EVENT(discard_buckets, +DECLARE_EVENT_CLASS(discard_buckets_class, TP_PROTO(struct bch_fs *c, u64 seen, u64 open, u64 need_journal_commit, u64 discarded, const char *err), TP_ARGS(c, seen, open, need_journal_commit, discarded, err), @@ -759,6 +759,18 @@ TRACE_EVENT(discard_buckets, __entry->err) ); +DEFINE_EVENT(discard_buckets_class, discard_buckets, + TP_PROTO(struct bch_fs *c, u64 seen, u64 open, + u64 need_journal_commit, u64 discarded, const char *err), + TP_ARGS(c, seen, open, need_journal_commit, discarded, err) +); + +DEFINE_EVENT(discard_buckets_class, discard_buckets_fast, + TP_PROTO(struct bch_fs *c, u64 seen, u64 open, + u64 need_journal_commit, u64 discarded, const char *err), + TP_ARGS(c, seen, open, need_journal_commit, discarded, err) +); + TRACE_EVENT(bucket_invalidate, TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors), TP_ARGS(c, dev, bucket, sectors), @@ -785,27 +797,6 @@ TRACE_EVENT(bucket_invalidate, /* Moving IO */ -TRACE_EVENT(bucket_evacuate, - TP_PROTO(struct bch_fs *c, struct bpos *bucket), - TP_ARGS(c, bucket), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u32, dev_idx ) - __field(u64, bucket ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->dev_idx = bucket->inode; - __entry->bucket = bucket->offset; - ), - - TP_printk("%d:%d %u:%llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->dev_idx, __entry->bucket) -); - DEFINE_EVENT(fs_str, move_extent, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) @@ -869,65 +860,32 @@ TRACE_EVENT(move_data, __entry->sectors_raced) ); -TRACE_EVENT(evacuate_bucket, - TP_PROTO(struct bch_fs *c, struct bpos *bucket, - unsigned sectors, unsigned bucket_size, - int ret), - TP_ARGS(c, bucket, sectors, bucket_size, ret), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u64, member ) - __field(u64, bucket ) - __field(u32, sectors ) - __field(u32, bucket_size ) - __field(int, ret ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->member = bucket->inode; - __entry->bucket = bucket->offset; - __entry->sectors = sectors; - __entry->bucket_size = bucket_size; - __entry->ret = ret; - ), - - TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->member, __entry->bucket, - __entry->sectors, __entry->bucket_size, - __entry->ret) -); - TRACE_EVENT(copygc, TP_PROTO(struct bch_fs *c, - u64 sectors_moved, u64 sectors_not_moved, - u64 buckets_moved, u64 buckets_not_moved), - TP_ARGS(c, - sectors_moved, sectors_not_moved, - buckets_moved, buckets_not_moved), + u64 buckets, + u64 sectors_seen, + u64 sectors_moved), + TP_ARGS(c, buckets, sectors_seen, sectors_moved), TP_STRUCT__entry( __field(dev_t, dev ) + __field(u64, buckets ) + __field(u64, sectors_seen ) __field(u64, sectors_moved ) - __field(u64, sectors_not_moved ) - __field(u64, buckets_moved ) - __field(u64, buckets_not_moved ) ), TP_fast_assign( __entry->dev = c->dev; + __entry->buckets = buckets; + __entry->sectors_seen = sectors_seen; __entry->sectors_moved = sectors_moved; - __entry->sectors_not_moved = sectors_not_moved; - __entry->buckets_moved = buckets_moved; - __entry->buckets_not_moved = buckets_moved; ), - TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu", + TP_printk("%d,%d buckets %llu sectors seen %llu moved %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->sectors_moved, __entry->sectors_not_moved, - __entry->buckets_moved, __entry->buckets_not_moved) + __entry->buckets, + __entry->sectors_seen, + __entry->sectors_moved) ); TRACE_EVENT(copygc_wait, diff --git a/libbcachefs/util.h b/libbcachefs/util.h index 1a172011..e7c3541b 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -670,8 +670,6 @@ static inline int cmp_le32(__le32 l, __le32 r) #include <linux/uuid.h> -#define QSTR(n) { { { .len = strlen(n) } }, .name = n } - static inline bool qstr_eq(const struct qstr l, const struct qstr r) { return l.len == r.len && !memcmp(l.name, r.name, l.len);