From b93b20592e431504210cd72e38fa89989da1cbbe Mon Sep 17 00:00:00 2001 From: Alexander Miroshnichenko Date: Mon, 10 Feb 2025 07:26:39 +0300 Subject: [PATCH] sys-kernel/hardened-kernel: delete patchset 1192-bcachefs-upd-from-master-81b5431 Signed-off-by: Alexander Miroshnichenko --- ...192-bcachefs-upd-from-master-81b5431.patch | 6119 ----------------- 1 file changed, 6119 deletions(-) delete mode 100644 sys-kernel/hardened-kernel/files/linux-6.12/1192-bcachefs-upd-from-master-81b5431.patch diff --git a/sys-kernel/hardened-kernel/files/linux-6.12/1192-bcachefs-upd-from-master-81b5431.patch b/sys-kernel/hardened-kernel/files/linux-6.12/1192-bcachefs-upd-from-master-81b5431.patch deleted file mode 100644 index f794535..0000000 --- a/sys-kernel/hardened-kernel/files/linux-6.12/1192-bcachefs-upd-from-master-81b5431.patch +++ /dev/null @@ -1,6119 +0,0 @@ -From 21a9c2ace04f6c699870b9222c3da9b8a9aaedf6 Mon Sep 17 00:00:00 2001 -From: Alexander Miroshnichenko -Date: Sun, 9 Feb 2025 22:05:21 +0300 -Subject: [PATCH] bcachefs: cherry-pick updates from master 81b5431 -Content-Type: text/plain; charset="utf-8" -Content-Transfer-Encoding: 8bit - -Signed-off-by: Alexander Miroshnichenko ---- - fs/bcachefs/Kconfig | 2 + - fs/bcachefs/Makefile | 1 + - fs/bcachefs/alloc_background.c | 12 +- - fs/bcachefs/alloc_background.h | 2 +- - fs/bcachefs/alloc_foreground.c | 25 +- - fs/bcachefs/alloc_foreground.h | 17 + - fs/bcachefs/alloc_types.h | 2 + - fs/bcachefs/backpointers.c | 108 ++---- - fs/bcachefs/backpointers.h | 11 +- - fs/bcachefs/bcachefs.h | 5 +- - fs/bcachefs/bcachefs_ioctl.h | 29 +- - fs/bcachefs/btree_gc.c | 18 +- - fs/bcachefs/btree_io.c | 205 ++++++++++- - fs/bcachefs/btree_io.h | 4 + - fs/bcachefs/btree_update_interior.c | 20 ++ - fs/bcachefs/btree_update_interior.h | 4 + - fs/bcachefs/chardev.c | 38 +- - fs/bcachefs/clock.c | 25 +- - fs/bcachefs/data_update.c | 220 +++++++++--- - fs/bcachefs/data_update.h | 17 +- - fs/bcachefs/debug.c | 34 +- - fs/bcachefs/ec.c | 25 +- - fs/bcachefs/errcode.h | 6 + - fs/bcachefs/error.c | 50 ++- - fs/bcachefs/error.h | 4 +- - fs/bcachefs/extents.c | 9 +- - fs/bcachefs/extents.h | 2 +- - fs/bcachefs/eytzinger.c | 76 ++-- - fs/bcachefs/eytzinger.h | 95 ++--- - fs/bcachefs/fs-io-buffered.c | 26 +- - fs/bcachefs/fs-io-direct.c | 20 +- - fs/bcachefs/fsck.c | 2 +- - fs/bcachefs/io_misc.c | 3 +- - fs/bcachefs/io_read.c | 515 ++++++++++++++-------------- - fs/bcachefs/io_read.h | 75 ++-- - fs/bcachefs/io_write.c | 95 ++--- - fs/bcachefs/io_write.h | 29 +- - fs/bcachefs/io_write_types.h | 2 +- - fs/bcachefs/journal.c | 123 +++++-- - fs/bcachefs/journal.h | 38 +- - fs/bcachefs/journal_io.c | 30 +- - fs/bcachefs/journal_seq_blacklist.c | 7 +- - fs/bcachefs/journal_types.h | 19 +- - fs/bcachefs/migrate.c | 26 +- - fs/bcachefs/move.c | 418 ++++++++++++---------- - fs/bcachefs/move_types.h | 18 +- - fs/bcachefs/progress.c | 63 ++++ - fs/bcachefs/progress.h | 29 ++ - fs/bcachefs/rebalance.c | 4 +- - fs/bcachefs/recovery.c | 1 - - fs/bcachefs/reflink.c | 21 +- - fs/bcachefs/sb-counters.c | 90 +++-- - fs/bcachefs/sb-counters.h | 4 + - fs/bcachefs/sb-counters_format.h | 30 +- - fs/bcachefs/sb-members.h | 12 + - fs/bcachefs/snapshot.c | 7 +- - fs/bcachefs/snapshot.h | 1 + - fs/bcachefs/sysfs.c | 5 - - fs/bcachefs/trace.h | 76 +--- - fs/bcachefs/util.c | 210 +++++++++--- - fs/bcachefs/util.h | 2 - - 61 files changed, 1967 insertions(+), 1100 deletions(-) - create mode 100644 fs/bcachefs/progress.c - create mode 100644 fs/bcachefs/progress.h - -diff --git a/fs/bcachefs/Kconfig 
b/fs/bcachefs/Kconfig -index e8549d04dcb8..85eea7a4dea3 100644 ---- a/fs/bcachefs/Kconfig -+++ b/fs/bcachefs/Kconfig -@@ -15,6 +15,7 @@ config BCACHEFS_FS - select ZLIB_INFLATE - select ZSTD_COMPRESS - select ZSTD_DECOMPRESS -+ select CRYPTO - select CRYPTO_SHA256 - select CRYPTO_CHACHA20 - select CRYPTO_POLY1305 -@@ -24,6 +25,7 @@ config BCACHEFS_FS - select XXHASH - select SRCU - select SYMBOLIC_ERRNAME -+ select MIN_HEAP - help - The bcachefs filesystem - a modern, copy on write filesystem, with - support for multiple devices, compression, checksumming, etc. -diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile -index d2689388d5e8..1cf17a16af9f 100644 ---- a/fs/bcachefs/Makefile -+++ b/fs/bcachefs/Makefile -@@ -67,6 +67,7 @@ bcachefs-y := \ - nocow_locking.o \ - opts.o \ - printbuf.o \ -+ progress.o \ - quota.o \ - rebalance.o \ - rcu_pending.o \ -diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c -index 3ea809990ef1..a35455802280 100644 ---- a/fs/bcachefs/alloc_background.c -+++ b/fs/bcachefs/alloc_background.c -@@ -1897,7 +1897,10 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, - if (ret) - goto out; - -- count_event(c, bucket_discard); -+ if (!fastpath) -+ count_event(c, bucket_discard); -+ else -+ count_event(c, bucket_discard_fast); - out: - fsck_err: - if (discard_locked) -@@ -2090,6 +2093,13 @@ static int invalidate_one_bucket(struct btree_trans *trans, - if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v)) - goto out; - -+ /* -+ * Impossible since alloc_lru_idx_read() only returns nonzero if the -+ * bucket is supposed to be on the cached bucket LRU (i.e. -+ * BCH_DATA_cached) -+ * -+ * bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0 -+ */ - BUG_ON(a->v.data_type != BCH_DATA_cached); - BUG_ON(a->v.dirty_sectors); - -diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h -index de25ba4ee94b..c556ccaffe89 100644 ---- a/fs/bcachefs/alloc_background.h -+++ b/fs/bcachefs/alloc_background.h -@@ -131,7 +131,7 @@ static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, - if (a.stripe) - return data_type == BCH_DATA_parity ? 
data_type : BCH_DATA_stripe; - if (bch2_bucket_sectors_dirty(a)) -- return data_type; -+ return bucket_data_type(data_type); - if (a.cached_sectors) - return BCH_DATA_cached; - if (BCH_ALLOC_V4_NEED_DISCARD(&a)) -diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c -index 5a781fb4c794..1759c15a7745 100644 ---- a/fs/bcachefs/alloc_foreground.c -+++ b/fs/bcachefs/alloc_foreground.c -@@ -179,23 +179,6 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) - closure_wake_up(&c->freelist_wait); - } - --static inline unsigned open_buckets_reserved(enum bch_watermark watermark) --{ -- switch (watermark) { -- case BCH_WATERMARK_interior_updates: -- return 0; -- case BCH_WATERMARK_reclaim: -- return OPEN_BUCKETS_COUNT / 6; -- case BCH_WATERMARK_btree: -- case BCH_WATERMARK_btree_copygc: -- return OPEN_BUCKETS_COUNT / 4; -- case BCH_WATERMARK_copygc: -- return OPEN_BUCKETS_COUNT / 3; -- default: -- return OPEN_BUCKETS_COUNT / 2; -- } --} -- - static inline bool may_alloc_bucket(struct bch_fs *c, - struct bpos bucket, - struct bucket_alloc_state *s) -@@ -239,7 +222,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * - - spin_lock(&c->freelist_lock); - -- if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) { -+ if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark))) { - if (cl) - closure_wait(&c->open_buckets_wait, cl); - -@@ -728,7 +711,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, - - struct bch_dev_usage usage; - struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, -- cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); -+ cl, flags & BCH_WRITE_alloc_nowait, &usage); - if (!IS_ERR(ob)) - bch2_dev_stripe_increment_inlined(ca, stripe, &usage); - bch2_dev_put(ca); -@@ -1336,7 +1319,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, - if (wp->data_type != BCH_DATA_user) - have_cache = true; - -- if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { -+ if (target && !(flags & BCH_WRITE_only_specified_devs)) { - ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - target, erasure_code, - nr_replicas, &nr_effective, -@@ -1426,7 +1409,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, - if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) - ret = -BCH_ERR_bucket_alloc_blocked; - -- if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) && -+ if (cl && !(flags & BCH_WRITE_alloc_nowait) && - bch2_err_matches(ret, BCH_ERR_freelist_empty)) - ret = -BCH_ERR_bucket_alloc_blocked; - -diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h -index f25481a0d1a0..baf5dc163c8a 100644 ---- a/fs/bcachefs/alloc_foreground.h -+++ b/fs/bcachefs/alloc_foreground.h -@@ -33,6 +33,23 @@ static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) - return bch2_dev_have_ref(c, ob->dev); - } - -+static inline unsigned bch2_open_buckets_reserved(enum bch_watermark watermark) -+{ -+ switch (watermark) { -+ case BCH_WATERMARK_interior_updates: -+ return 0; -+ case BCH_WATERMARK_reclaim: -+ return OPEN_BUCKETS_COUNT / 6; -+ case BCH_WATERMARK_btree: -+ case BCH_WATERMARK_btree_copygc: -+ return OPEN_BUCKETS_COUNT / 4; -+ case BCH_WATERMARK_copygc: -+ return OPEN_BUCKETS_COUNT / 3; -+ default: -+ return OPEN_BUCKETS_COUNT / 2; -+ } -+} -+ - struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, - enum bch_watermark, enum bch_data_type, - struct closure *); -diff --git 
a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h -index 4aa8ee026cb8..8f79f46c2a78 100644 ---- a/fs/bcachefs/alloc_types.h -+++ b/fs/bcachefs/alloc_types.h -@@ -90,6 +90,7 @@ struct dev_stripe_state { - x(stopped) \ - x(waiting_io) \ - x(waiting_work) \ -+ x(runnable) \ - x(running) - - enum write_point_state { -@@ -125,6 +126,7 @@ struct write_point { - enum write_point_state state; - u64 last_state_change; - u64 time[WRITE_POINT_STATE_NR]; -+ u64 last_runtime; - } __aligned(SMP_CACHE_BYTES); - }; - -diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c -index ebeb6a5ff9d2..eb374d1970fe 100644 ---- a/fs/bcachefs/backpointers.c -+++ b/fs/bcachefs/backpointers.c -@@ -11,6 +11,7 @@ - #include "checksum.h" - #include "disk_accounting.h" - #include "error.h" -+#include "progress.h" - - #include - -@@ -244,27 +245,31 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, - if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c))) - return bkey_s_c_null; - -- if (likely(!bp.v->level)) { -- bch2_trans_node_iter_init(trans, iter, -- bp.v->btree_id, -- bp.v->pos, -- 0, 0, -- iter_flags); -- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); -- if (bkey_err(k)) { -- bch2_trans_iter_exit(trans, iter); -- return k; -- } -+ bch2_trans_node_iter_init(trans, iter, -+ bp.v->btree_id, -+ bp.v->pos, -+ 0, -+ bp.v->level, -+ iter_flags); -+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); -+ if (bkey_err(k)) { -+ bch2_trans_iter_exit(trans, iter); -+ return k; -+ } - -- if (k.k && -- extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) -- return k; -+ if (k.k && -+ extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) -+ return k; - -- bch2_trans_iter_exit(trans, iter); -+ bch2_trans_iter_exit(trans, iter); -+ -+ if (!bp.v->level) { - int ret = backpointer_target_not_found(trans, bp, k, last_flushed); - return ret ? 
bkey_s_c_err(ret) : bkey_s_c_null; - } else { - struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed); -+ if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node)) -+ return bkey_s_c_null; - if (IS_ERR_OR_NULL(b)) - return ((struct bkey_s_c) { .k = ERR_CAST(b) }); - -@@ -715,71 +720,6 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, - return ret; - } - --struct progress_indicator_state { -- unsigned long next_print; -- u64 nodes_seen; -- u64 nodes_total; -- struct btree *last_node; --}; -- --static inline void progress_init(struct progress_indicator_state *s, -- struct bch_fs *c, -- u64 btree_id_mask) --{ -- memset(s, 0, sizeof(*s)); -- -- s->next_print = jiffies + HZ * 10; -- -- for (unsigned i = 0; i < BTREE_ID_NR; i++) { -- if (!(btree_id_mask & BIT_ULL(i))) -- continue; -- -- struct disk_accounting_pos acc = { -- .type = BCH_DISK_ACCOUNTING_btree, -- .btree.id = i, -- }; -- -- u64 v; -- bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); -- s->nodes_total += div64_ul(v, btree_sectors(c)); -- } --} -- --static inline bool progress_update_p(struct progress_indicator_state *s) --{ -- bool ret = time_after_eq(jiffies, s->next_print); -- -- if (ret) -- s->next_print = jiffies + HZ * 10; -- return ret; --} -- --static void progress_update_iter(struct btree_trans *trans, -- struct progress_indicator_state *s, -- struct btree_iter *iter, -- const char *msg) --{ -- struct bch_fs *c = trans->c; -- struct btree *b = path_l(btree_iter_path(trans, iter))->b; -- -- s->nodes_seen += b != s->last_node; -- s->last_node = b; -- -- if (progress_update_p(s)) { -- struct printbuf buf = PRINTBUF; -- unsigned percent = s->nodes_total -- ? div64_u64(s->nodes_seen * 100, s->nodes_total) -- : 0; -- -- prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ", -- msg, percent, s->nodes_seen, s->nodes_total); -- bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos)); -- -- bch_info(c, "%s", buf.buf); -- printbuf_exit(&buf); -- } --} -- - static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, - struct extents_to_bp_state *s) - { -@@ -787,7 +727,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, - struct progress_indicator_state progress; - int ret = 0; - -- progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink)); -+ bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink)); - - for (enum btree_id btree_id = 0; - btree_id < btree_id_nr_alive(c); -@@ -806,7 +746,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, - BTREE_ITER_prefetch); - - ret = for_each_btree_key_continue(trans, iter, 0, k, ({ -- progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); -+ bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); - check_extent_to_backpointers(trans, s, btree_id, level, k) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - })); -@@ -1206,11 +1146,11 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, - - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); -- progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); -+ bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); - - int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers, - POS_MIN, BTREE_ITER_prefetch, k, ({ -- progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); -+ 
bch2_progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); - check_one_backpointer(trans, start, end, k, &last_flushed); - })); - -diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h -index 060dad1521ee..7786731d4ada 100644 ---- a/fs/bcachefs/backpointers.h -+++ b/fs/bcachefs/backpointers.h -@@ -1,6 +1,6 @@ - /* SPDX-License-Identifier: GPL-2.0 */ --#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H --#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H -+#ifndef _BCACHEFS_BACKPOINTERS_H -+#define _BCACHEFS_BACKPOINTERS_H - - #include "btree_cache.h" - #include "btree_iter.h" -@@ -123,7 +123,12 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, - return BCH_DATA_btree; - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: -- return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user; -+ if (p.has_ec) -+ return BCH_DATA_stripe; -+ if (p.ptr.cached) -+ return BCH_DATA_cached; -+ else -+ return BCH_DATA_user; - case KEY_TYPE_stripe: { - const struct bch_extent_ptr *ptr = &entry->ptr; - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); -diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h -index 161cf2f05d2a..e8f4999806b6 100644 ---- a/fs/bcachefs/bcachefs.h -+++ b/fs/bcachefs/bcachefs.h -@@ -444,6 +444,7 @@ BCH_DEBUG_PARAMS_DEBUG() - x(btree_node_sort) \ - x(btree_node_read) \ - x(btree_node_read_done) \ -+ x(btree_node_write) \ - x(btree_interior_update_foreground) \ - x(btree_interior_update_total) \ - x(btree_gc) \ -@@ -456,6 +457,7 @@ BCH_DEBUG_PARAMS_DEBUG() - x(blocked_journal_low_on_space) \ - x(blocked_journal_low_on_pin) \ - x(blocked_journal_max_in_flight) \ -+ x(blocked_journal_max_open) \ - x(blocked_key_cache_flush) \ - x(blocked_allocate) \ - x(blocked_allocate_open_bucket) \ -@@ -687,7 +689,8 @@ struct btree_trans_buf { - x(gc_gens) \ - x(snapshot_delete_pagecache) \ - x(sysfs) \ -- x(btree_write_buffer) -+ x(btree_write_buffer) \ -+ x(btree_node_scrub) - - enum bch_write_ref { - #define x(n) BCH_WRITE_REF_##n, -diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h -index 3c23bdf788ce..52594e925eb7 100644 ---- a/fs/bcachefs/bcachefs_ioctl.h -+++ b/fs/bcachefs/bcachefs_ioctl.h -@@ -87,6 +87,7 @@ struct bch_ioctl_incremental { - #define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline) - #define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online) - #define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting) -+#define BCH_IOCTL_QUERY_COUNTERS _IOW(0xbc, 21, struct bch_ioctl_query_counters) - - /* ioctl below act on a particular file, not the filesystem as a whole: */ - -@@ -213,6 +214,10 @@ struct bch_ioctl_data { - struct bpos end_pos; - - union { -+ struct { -+ __u32 dev; -+ __u32 data_types; -+ } scrub; - struct { - __u32 dev; - __u32 pad; -@@ -229,6 +234,11 @@ enum bch_data_event { - BCH_DATA_EVENT_NR = 1, - }; - -+enum data_progress_data_type_special { -+ DATA_PROGRESS_DATA_TYPE_phys = 254, -+ DATA_PROGRESS_DATA_TYPE_done = 255, -+}; -+ - struct bch_ioctl_data_progress { - __u8 data_type; - __u8 btree_id; -@@ -237,11 +247,19 @@ struct bch_ioctl_data_progress { - - __u64 sectors_done; - __u64 sectors_total; -+ __u64 sectors_error_corrected; -+ __u64 sectors_error_uncorrected; - } __packed __aligned(8); - -+enum bch_ioctl_data_event_ret { -+ BCH_IOCTL_DATA_EVENT_RET_done = 1, -+ BCH_IOCTL_DATA_EVENT_RET_device_offline = 2, -+}; -+ - struct bch_ioctl_data_event { - __u8 type; -- __u8 pad[7]; -+ __u8 ret; -+ __u8 pad[6]; - union { - struct 
bch_ioctl_data_progress p; - __u64 pad2[15]; -@@ -443,4 +461,13 @@ struct bch_ioctl_query_accounting { - struct bkey_i_accounting accounting[]; - }; - -+#define BCH_IOCTL_QUERY_COUNTERS_MOUNT (1 << 0) -+ -+struct bch_ioctl_query_counters { -+ __u16 nr; -+ __u16 flags; -+ __u32 pad; -+ __u64 d[]; -+}; -+ - #endif /* _BCACHEFS_IOCTL_H */ -diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c -index dd1d9b74076e..ff681e733598 100644 ---- a/fs/bcachefs/btree_gc.c -+++ b/fs/bcachefs/btree_gc.c -@@ -27,6 +27,7 @@ - #include "journal.h" - #include "keylist.h" - #include "move.h" -+#include "progress.h" - #include "recovery_passes.h" - #include "reflink.h" - #include "recovery.h" -@@ -656,7 +657,9 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, - return ret; - } - --static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial) -+static int bch2_gc_btree(struct btree_trans *trans, -+ struct progress_indicator_state *progress, -+ enum btree_id btree, bool initial) - { - struct bch_fs *c = trans->c; - unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1; -@@ -673,6 +676,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in - BTREE_ITER_prefetch); - - ret = for_each_btree_key_continue(trans, iter, 0, k, ({ -+ bch2_progress_update_iter(trans, progress, &iter, "check_allocations"); - gc_pos_set(c, gc_pos_btree(btree, level, k.k->p)); - bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial); - })); -@@ -717,22 +721,24 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) - static int bch2_gc_btrees(struct bch_fs *c) - { - struct btree_trans *trans = bch2_trans_get(c); -- enum btree_id ids[BTREE_ID_NR]; - struct printbuf buf = PRINTBUF; -- unsigned i; - int ret = 0; - -- for (i = 0; i < BTREE_ID_NR; i++) -+ struct progress_indicator_state progress; -+ bch2_progress_init(&progress, c, ~0ULL); -+ -+ enum btree_id ids[BTREE_ID_NR]; -+ for (unsigned i = 0; i < BTREE_ID_NR; i++) - ids[i] = i; - bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); - -- for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { -+ for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { - unsigned btree = i < BTREE_ID_NR ? 
ids[i] : i; - - if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b)) - continue; - -- ret = bch2_gc_btree(trans, btree, true); -+ ret = bch2_gc_btree(trans, &progress, btree, true); - } - - printbuf_exit(&buf); -diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c -index e371e60e3133..e71b278672b6 100644 ---- a/fs/bcachefs/btree_io.c -+++ b/fs/bcachefs/btree_io.c -@@ -1,6 +1,7 @@ - // SPDX-License-Identifier: GPL-2.0 - - #include "bcachefs.h" -+#include "bkey_buf.h" - #include "bkey_methods.h" - #include "bkey_sort.h" - #include "btree_cache.h" -@@ -1352,7 +1353,7 @@ static void btree_node_read_work(struct work_struct *work) - - can_retry = bch2_bkey_pick_read_device(c, - bkey_i_to_s_c(&b->key), -- &failed, &rb->pick) > 0; -+ &failed, &rb->pick, -1) > 0; - - if (!bio->bi_status && - !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) { -@@ -1697,7 +1698,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, - return; - - ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), -- NULL, &pick); -+ NULL, &pick, -1); - - if (ret <= 0) { - struct printbuf buf = PRINTBUF; -@@ -1811,6 +1812,190 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, - return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level)); - } - -+struct btree_node_scrub { -+ struct bch_fs *c; -+ struct bch_dev *ca; -+ void *buf; -+ bool used_mempool; -+ unsigned written; -+ -+ enum btree_id btree; -+ unsigned level; -+ struct bkey_buf key; -+ __le64 seq; -+ -+ struct work_struct work; -+ struct bio bio; -+}; -+ -+static bool btree_node_scrub_check(struct bch_fs *c, struct btree_node *data, unsigned ptr_written, -+ struct printbuf *err) -+{ -+ unsigned written = 0; -+ -+ if (le64_to_cpu(data->magic) != bset_magic(c)) { -+ prt_printf(err, "bad magic: want %llx, got %llx", -+ bset_magic(c), le64_to_cpu(data->magic)); -+ return false; -+ } -+ -+ while (written < (ptr_written ?: btree_sectors(c))) { -+ struct btree_node_entry *bne; -+ struct bset *i; -+ bool first = !written; -+ -+ if (first) { -+ bne = NULL; -+ i = &data->keys; -+ } else { -+ bne = (void *) data + (written << 9); -+ i = &bne->keys; -+ -+ if (!ptr_written && i->seq != data->keys.seq) -+ break; -+ } -+ -+ struct nonce nonce = btree_nonce(i, written << 9); -+ bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)); -+ -+ if (first) { -+ if (good_csum_type) { -+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, data); -+ if (bch2_crc_cmp(data->csum, csum)) { -+ bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), data->csum, csum); -+ return false; -+ } -+ } -+ -+ written += vstruct_sectors(data, c->block_bits); -+ } else { -+ if (good_csum_type) { -+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); -+ if (bch2_crc_cmp(bne->csum, csum)) { -+ bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), bne->csum, csum); -+ return false; -+ } -+ } -+ -+ written += vstruct_sectors(bne, c->block_bits); -+ } -+ } -+ -+ return true; -+} -+ -+static void btree_node_scrub_work(struct work_struct *work) -+{ -+ struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work); -+ struct bch_fs *c = scrub->c; -+ struct printbuf err = PRINTBUF; -+ -+ __bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level, -+ bkey_i_to_s_c(scrub->key.k)); -+ prt_newline(&err); -+ -+ if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) { -+ struct btree_trans *trans = bch2_trans_get(c); -+ -+ struct btree_iter iter; -+ bch2_trans_node_iter_init(trans, &iter, scrub->btree, -+ 
scrub->key.k->k.p, 0, scrub->level - 1, 0); -+ -+ struct btree *b; -+ int ret = lockrestart_do(trans, PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(&iter))); -+ if (ret) -+ goto err; -+ -+ if (bkey_i_to_btree_ptr_v2(&b->key)->v.seq == scrub->seq) { -+ bch_err(c, "error validating btree node during scrub on %s at btree %s", -+ scrub->ca->name, err.buf); -+ -+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0); -+ } -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ bch2_trans_begin(trans); -+ bch2_trans_put(trans); -+ } -+ -+ printbuf_exit(&err); -+ bch2_bkey_buf_exit(&scrub->key, c);; -+ btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf); -+ percpu_ref_put(&scrub->ca->io_ref); -+ kfree(scrub); -+ bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub); -+} -+ -+static void btree_node_scrub_endio(struct bio *bio) -+{ -+ struct btree_node_scrub *scrub = container_of(bio, struct btree_node_scrub, bio); -+ -+ queue_work(scrub->c->btree_read_complete_wq, &scrub->work); -+} -+ -+int bch2_btree_node_scrub(struct btree_trans *trans, -+ enum btree_id btree, unsigned level, -+ struct bkey_s_c k, unsigned dev) -+{ -+ if (k.k->type != KEY_TYPE_btree_ptr_v2) -+ return 0; -+ -+ struct bch_fs *c = trans->c; -+ -+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_node_scrub)) -+ return -BCH_ERR_erofs_no_writes; -+ -+ struct extent_ptr_decoded pick; -+ int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev); -+ if (ret <= 0) -+ goto err; -+ -+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); -+ if (!ca) { -+ ret = -BCH_ERR_device_offline; -+ goto err; -+ } -+ -+ bool used_mempool = false; -+ void *buf = btree_bounce_alloc(c, c->opts.btree_node_size, &used_mempool); -+ -+ unsigned vecs = buf_pages(buf, c->opts.btree_node_size); -+ -+ struct btree_node_scrub *scrub = -+ kzalloc(sizeof(*scrub) + sizeof(struct bio_vec) * vecs, GFP_KERNEL); -+ if (!scrub) { -+ ret = -ENOMEM; -+ goto err_free; -+ } -+ -+ scrub->c = c; -+ scrub->ca = ca; -+ scrub->buf = buf; -+ scrub->used_mempool = used_mempool; -+ scrub->written = btree_ptr_sectors_written(k); -+ -+ scrub->btree = btree; -+ scrub->level = level; -+ bch2_bkey_buf_init(&scrub->key); -+ bch2_bkey_buf_reassemble(&scrub->key, c, k); -+ scrub->seq = bkey_s_c_to_btree_ptr_v2(k).v->seq; -+ -+ INIT_WORK(&scrub->work, btree_node_scrub_work); -+ -+ bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ); -+ bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size); -+ scrub->bio.bi_iter.bi_sector = pick.ptr.offset; -+ scrub->bio.bi_end_io = btree_node_scrub_endio; -+ submit_bio(&scrub->bio); -+ return 0; -+err_free: -+ btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf); -+ percpu_ref_put(&ca->io_ref); -+err: -+ bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub); -+ return ret; -+} -+ - static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, - struct btree_write *w) - { -@@ -1831,7 +2016,7 @@ static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, - bch2_journal_pin_drop(&c->journal, &w->journal); - } - --static void __btree_node_write_done(struct bch_fs *c, struct btree *b) -+static void __btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) - { - struct btree_write *w = btree_prev_write(b); - unsigned long old, new; -@@ -1839,6 +2024,9 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) - - bch2_btree_complete_write(c, b, w); - -+ if (start_time) -+ 
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_write], start_time); -+ - old = READ_ONCE(b->flags); - do { - new = old; -@@ -1869,7 +2057,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) - wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); - } - --static void btree_node_write_done(struct bch_fs *c, struct btree *b) -+static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) - { - struct btree_trans *trans = bch2_trans_get(c); - -@@ -1877,7 +2065,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b) - - /* we don't need transaction context anymore after we got the lock. */ - bch2_trans_put(trans); -- __btree_node_write_done(c, b); -+ __btree_node_write_done(c, b, start_time); - six_unlock_read(&b->c.lock); - } - -@@ -1887,6 +2075,7 @@ static void btree_node_write_work(struct work_struct *work) - container_of(work, struct btree_write_bio, work); - struct bch_fs *c = wbio->wbio.c; - struct btree *b = wbio->wbio.bio.bi_private; -+ u64 start_time = wbio->start_time; - int ret = 0; - - btree_bounce_free(c, -@@ -1919,7 +2108,7 @@ static void btree_node_write_work(struct work_struct *work) - } - out: - bio_put(&wbio->wbio.bio); -- btree_node_write_done(c, b); -+ btree_node_write_done(c, b, start_time); - return; - err: - set_btree_node_noevict(b); -@@ -2023,6 +2212,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) - bool validate_before_checksum = false; - enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK; - void *data; -+ u64 start_time = local_clock(); - int ret; - - if (flags & BTREE_WRITE_ALREADY_STARTED) -@@ -2231,6 +2421,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) - wbio->data = data; - wbio->data_bytes = bytes; - wbio->sector_offset = b->written; -+ wbio->start_time = start_time; - wbio->wbio.c = c; - wbio->wbio.used_mempool = used_mempool; - wbio->wbio.first_btree_write = !b->written; -@@ -2258,7 +2449,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) - b->written += sectors_to_write; - nowrite: - btree_bounce_free(c, bytes, used_mempool, data); -- __btree_node_write_done(c, b); -+ __btree_node_write_done(c, b, 0); - } - - /* -diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h -index 6f9e4a6dacf7..dbf76d22c660 100644 ---- a/fs/bcachefs/btree_io.h -+++ b/fs/bcachefs/btree_io.h -@@ -52,6 +52,7 @@ struct btree_write_bio { - void *data; - unsigned data_bytes; - unsigned sector_offset; -+ u64 start_time; - struct bch_write_bio wbio; - }; - -@@ -132,6 +133,9 @@ void bch2_btree_node_read(struct btree_trans *, struct btree *, bool); - int bch2_btree_root_read(struct bch_fs *, enum btree_id, - const struct bkey_i *, unsigned); - -+int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned, -+ struct bkey_s_c, unsigned); -+ - bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); - - enum btree_write_flags { -diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c -index f4aeadbe53c1..ab111fec1701 100644 ---- a/fs/bcachefs/btree_update_interior.c -+++ b/fs/bcachefs/btree_update_interior.c -@@ -2189,6 +2189,26 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, - goto out; - } - -+int bch2_btree_node_rewrite_key(struct btree_trans *trans, -+ enum btree_id btree, unsigned level, -+ struct bpos pos, unsigned flags) -+{ -+ BUG_ON(!level); -+ -+ /* Traverse one depth lower to get a pointer to the node itself: */ -+ struct btree_iter 
iter; -+ bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0); -+ struct btree *b = bch2_btree_iter_peek_node(&iter); -+ int ret = PTR_ERR_OR_ZERO(b); -+ if (ret) -+ goto err; -+ -+ ret = bch2_btree_node_rewrite(trans, &iter, b, flags); -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ - struct async_btree_rewrite { - struct bch_fs *c; - struct work_struct work; -diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h -index 7930ffea3075..fa5a88f95d89 100644 ---- a/fs/bcachefs/btree_update_interior.h -+++ b/fs/bcachefs/btree_update_interior.h -@@ -169,7 +169,11 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, - - int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, - struct btree *, unsigned); -+int bch2_btree_node_rewrite_key(struct btree_trans *, -+ enum btree_id, unsigned, -+ struct bpos, unsigned); - void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); -+ - int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, - struct btree *, struct bkey_i *, - unsigned, bool); -diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c -index 46e9e32105a9..57d55b3ddc71 100644 ---- a/fs/bcachefs/chardev.c -+++ b/fs/bcachefs/chardev.c -@@ -11,6 +11,7 @@ - #include "move.h" - #include "recovery_passes.h" - #include "replicas.h" -+#include "sb-counters.h" - #include "super-io.h" - #include "thread_with_file.h" - -@@ -312,7 +313,12 @@ static int bch2_data_thread(void *arg) - struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr); - - ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); -- ctx->stats.data_type = U8_MAX; -+ if (ctx->thr.ret == -BCH_ERR_device_offline) -+ ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline; -+ else { -+ ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done; -+ ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done; -+ } - return 0; - } - -@@ -331,14 +337,30 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, - struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); - struct bch_fs *c = ctx->c; - struct bch_ioctl_data_event e = { -- .type = BCH_DATA_EVENT_PROGRESS, -- .p.data_type = ctx->stats.data_type, -- .p.btree_id = ctx->stats.pos.btree, -- .p.pos = ctx->stats.pos.pos, -- .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), -- .p.sectors_total = bch2_fs_usage_read_short(c).used, -+ .type = BCH_DATA_EVENT_PROGRESS, -+ .ret = ctx->stats.ret, -+ .p.data_type = ctx->stats.data_type, -+ .p.btree_id = ctx->stats.pos.btree, -+ .p.pos = ctx->stats.pos.pos, -+ .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), -+ .p.sectors_error_corrected = atomic64_read(&ctx->stats.sectors_error_corrected), -+ .p.sectors_error_uncorrected = atomic64_read(&ctx->stats.sectors_error_uncorrected), - }; - -+ if (ctx->arg.op == BCH_DATA_OP_scrub) { -+ struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev); -+ if (ca) { -+ struct bch_dev_usage u; -+ bch2_dev_usage_read_fast(ca, &u); -+ for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++) -+ if (ctx->arg.scrub.data_types & BIT(i)) -+ e.p.sectors_total += u.d[i].sectors; -+ bch2_dev_put(ca); -+ } -+ } else { -+ e.p.sectors_total = bch2_fs_usage_read_short(c).used; -+ } -+ - if (len < sizeof(e)) - return -EINVAL; - -@@ -710,6 +732,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) - BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online); - case BCH_IOCTL_QUERY_ACCOUNTING: - return 
bch2_ioctl_query_accounting(c, arg); -+ case BCH_IOCTL_QUERY_COUNTERS: -+ return bch2_ioctl_query_counters(c, arg); - default: - return -ENOTTY; - } -diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c -index 1d6b691e8da6..1f8e035d7119 100644 ---- a/fs/bcachefs/clock.c -+++ b/fs/bcachefs/clock.c -@@ -14,21 +14,13 @@ static inline bool io_timer_cmp(const void *l, const void *r, void __always_unus - return (*_l)->expire < (*_r)->expire; - } - --static inline void io_timer_swp(void *l, void *r, void __always_unused *args) --{ -- struct io_timer **_l = (struct io_timer **)l; -- struct io_timer **_r = (struct io_timer **)r; -- -- swap(*_l, *_r); --} -+static const struct min_heap_callbacks callbacks = { -+ .less = io_timer_cmp, -+ .swp = NULL, -+}; - - void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) - { -- const struct min_heap_callbacks callbacks = { -- .less = io_timer_cmp, -- .swp = io_timer_swp, -- }; -- - spin_lock(&clock->timer_lock); - - if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) { -@@ -48,11 +40,6 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) - - void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) - { -- const struct min_heap_callbacks callbacks = { -- .less = io_timer_cmp, -- .swp = io_timer_swp, -- }; -- - spin_lock(&clock->timer_lock); - - for (size_t i = 0; i < clock->timers.nr; i++) -@@ -142,10 +129,6 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, - static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now) - { - struct io_timer *ret = NULL; -- const struct min_heap_callbacks callbacks = { -- .less = io_timer_cmp, -- .swp = io_timer_swp, -- }; - - if (clock->timers.nr && - time_after_eq64(now, clock->timers.data[0]->expire)) { -diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c -index 337494facac6..c66ef8a1b5f2 100644 ---- a/fs/bcachefs/data_update.c -+++ b/fs/bcachefs/data_update.c -@@ -20,6 +20,8 @@ - #include "subvolume.h" - #include "trace.h" - -+#include -+ - static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k) - { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -@@ -33,7 +35,7 @@ static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k) - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(ptrs, ptr) { -- if (!bch2_dev_tryget(c, ptr->dev)) { -+ if (unlikely(!bch2_dev_tryget(c, ptr->dev))) { - bkey_for_each_ptr(ptrs, ptr2) { - if (ptr2 == ptr) - break; -@@ -91,7 +93,7 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc - return true; - } - --static noinline void trace_move_extent_finish2(struct data_update *u, -+static noinline void trace_io_move_finish2(struct data_update *u, - struct bkey_i *new, - struct bkey_i *insert) - { -@@ -111,11 +113,11 @@ static noinline void trace_move_extent_finish2(struct data_update *u, - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - prt_newline(&buf); - -- trace_move_extent_finish(c, buf.buf); -+ trace_io_move_finish(c, buf.buf); - printbuf_exit(&buf); - } - --static void trace_move_extent_fail2(struct data_update *m, -+static void trace_io_move_fail2(struct data_update *m, - struct bkey_s_c new, - struct bkey_s_c wrote, - struct bkey_i *insert, -@@ -126,7 +128,7 @@ static void trace_move_extent_fail2(struct data_update *m, - struct printbuf buf = PRINTBUF; - unsigned rewrites_found = 0; - -- if (!trace_move_extent_fail_enabled()) -+ if (!trace_io_move_fail_enabled()) - return; - - prt_str(&buf, msg); -@@ -166,7 +168,7 
@@ static void trace_move_extent_fail2(struct data_update *m, - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - } - -- trace_move_extent_fail(c, buf.buf); -+ trace_io_move_fail(c, buf.buf); - printbuf_exit(&buf); - } - -@@ -214,7 +216,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, - new = bkey_i_to_extent(bch2_keylist_front(keys)); - - if (!bch2_extents_match(k, old)) { -- trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), -+ trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), - NULL, "no match:"); - goto nowork; - } -@@ -254,7 +256,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, - if (m->data_opts.rewrite_ptrs && - !rewrites_found && - bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) { -- trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:"); -+ trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:"); - goto nowork; - } - -@@ -271,7 +273,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, - } - - if (!bkey_val_u64s(&new->k)) { -- trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:"); -+ trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:"); - goto nowork; - } - -@@ -384,9 +386,9 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, - if (!ret) { - bch2_btree_iter_set_pos(&iter, next_pos); - -- this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); -- if (trace_move_extent_finish_enabled()) -- trace_move_extent_finish2(m, &new->k_i, insert); -+ this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size); -+ if (trace_io_move_finish_enabled()) -+ trace_io_move_finish2(m, &new->k_i, insert); - } - err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -@@ -408,7 +410,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, - &m->stats->sectors_raced); - } - -- count_event(c, move_extent_fail); -+ count_event(c, io_move_fail); - - bch2_btree_iter_advance(&iter); - goto next; -@@ -426,14 +428,17 @@ int bch2_data_update_index_update(struct bch_write_op *op) - return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op)); - } - --void bch2_data_update_read_done(struct data_update *m, -- struct bch_extent_crc_unpacked crc) -+void bch2_data_update_read_done(struct data_update *m) - { -+ m->read_done = true; -+ - /* write bio must own pages: */ - BUG_ON(!m->op.wbio.bio.bi_vcnt); - -- m->op.crc = crc; -- m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9; -+ m->op.crc = m->rbio.pick.crc; -+ m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; -+ -+ this_cpu_add(m->op.c->counters[BCH_COUNTER_io_move_write], m->k.k->k.size); - - closure_call(&m->op.cl, bch2_write, NULL, NULL); - } -@@ -443,31 +448,34 @@ void bch2_data_update_exit(struct data_update *update) - struct bch_fs *c = update->op.c; - struct bkey_s_c k = bkey_i_to_s_c(update->k.k); - -+ bch2_bio_free_pages_pool(c, &update->op.wbio.bio); -+ kfree(update->bvecs); -+ update->bvecs = NULL; -+ - if (c->opts.nocow_enabled) - bkey_nocow_unlock(c, k); - bkey_put_dev_refs(c, k); -- bch2_bkey_buf_exit(&update->k, c); - bch2_disk_reservation_put(c, &update->op.res); -- bch2_bio_free_pages_pool(c, &update->op.wbio.bio); -+ bch2_bkey_buf_exit(&update->k, c); - } - --static void bch2_update_unwritten_extent(struct btree_trans *trans, -- struct data_update *update) -+static int bch2_update_unwritten_extent(struct btree_trans *trans, -+ 
struct data_update *update) - { - struct bch_fs *c = update->op.c; -- struct bio *bio = &update->op.wbio.bio; - struct bkey_i_extent *e; - struct write_point *wp; - struct closure cl; - struct btree_iter iter; - struct bkey_s_c k; -- int ret; -+ int ret = 0; - - closure_init_stack(&cl); - bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys); - -- while (bio_sectors(bio)) { -- unsigned sectors = bio_sectors(bio); -+ while (bpos_lt(update->op.pos, update->k.k->k.p)) { -+ unsigned sectors = update->k.k->k.p.offset - -+ update->op.pos.offset; - - bch2_trans_begin(trans); - -@@ -503,7 +511,7 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, - bch_err_fn_ratelimited(c, ret); - - if (ret) -- return; -+ break; - - sectors = min(sectors, wp->sectors_free); - -@@ -513,7 +521,6 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, - bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); - bch2_alloc_sectors_done(c, wp); - -- bio_advance(bio, sectors << 9); - update->op.pos.offset += sectors; - - extent_for_each_ptr(extent_i_to_s(e), ptr) -@@ -532,13 +539,16 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, - bch2_trans_unlock(trans); - closure_sync(&cl); - } -+ -+ return ret; - } - - void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) - { -- printbuf_tabstop_push(out, 20); -+ if (!out->nr_tabstops) -+ printbuf_tabstop_push(out, 20); - - prt_str_indented(out, "rewrite ptrs:\t"); - bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); -@@ -562,6 +572,7 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, - - prt_str_indented(out, "extra replicas:\t"); - prt_u64(out, data_opts->extra_replicas); -+ prt_newline(out); - } - - void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) -@@ -573,6 +584,17 @@ void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) - bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); - } - -+void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m) -+{ -+ bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); -+ prt_newline(out); -+ printbuf_indent_add(out, 2); -+ bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); -+ prt_printf(out, "read_done:\t\%u\n", m->read_done); -+ bch2_write_op_to_text(out, &m->op); -+ printbuf_indent_sub(out, 2); -+} -+ - int bch2_extent_drop_ptrs(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, -@@ -616,12 +638,80 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - } - -+static bool can_allocate_without_blocking(struct bch_fs *c, -+ struct data_update *m) -+{ -+ if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark))) -+ return false; -+ -+ unsigned target = m->op.flags & BCH_WRITE_only_specified_devs -+ ? 
m->op.target -+ : 0; -+ struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target); -+ -+ darray_for_each(m->op.devs_have, i) -+ __clear_bit(*i, devs.d); -+ -+ rcu_read_lock(); -+ unsigned nr_replicas = 0, i; -+ for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { -+ struct bch_dev *ca = bch2_dev_rcu(c, i); -+ -+ struct bch_dev_usage usage; -+ bch2_dev_usage_read_fast(ca, &usage); -+ -+ if (!dev_buckets_free(ca, usage, m->op.watermark)) -+ continue; -+ -+ nr_replicas += ca->mi.durability; -+ if (nr_replicas >= m->op.nr_replicas) -+ break; -+ } -+ rcu_read_unlock(); -+ -+ return nr_replicas >= m->op.nr_replicas; -+} -+ -+int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, -+ struct bch_io_opts *io_opts) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ -+ /* write path might have to decompress data: */ -+ unsigned buf_bytes = 0; -+ bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry) -+ buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); -+ -+ unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE); -+ -+ m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL); -+ if (!m->bvecs) -+ return -ENOMEM; -+ -+ bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ); -+ bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0); -+ -+ if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) { -+ kfree(m->bvecs); -+ m->bvecs = NULL; -+ return -ENOMEM; -+ } -+ -+ rbio_init(&m->rbio.bio, c, *io_opts, NULL); -+ m->rbio.bio.bi_iter.bi_size = buf_bytes; -+ m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k); -+ m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); -+ return 0; -+} -+ - int bch2_data_update_init(struct btree_trans *trans, - struct btree_iter *iter, - struct moving_context *ctxt, - struct data_update *m, - struct write_point_specifier wp, -- struct bch_io_opts io_opts, -+ struct bch_io_opts *io_opts, - struct data_update_opts data_opts, - enum btree_id btree_id, - struct bkey_s_c k) -@@ -639,16 +729,7 @@ int bch2_data_update_init(struct btree_trans *trans, - * snapshots table - just skip it, we can move it later. - */ - if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot))) -- return -BCH_ERR_data_update_done; -- -- if (!bkey_get_dev_refs(c, k)) -- return -BCH_ERR_data_update_done; -- -- if (c->opts.nocow_enabled && -- !bkey_nocow_lock(c, ctxt, k)) { -- bkey_put_dev_refs(c, k); -- return -BCH_ERR_nocow_lock_blocked; -- } -+ return -BCH_ERR_data_update_done_no_snapshot; - - bch2_bkey_buf_init(&m->k); - bch2_bkey_buf_reassemble(&m->k, c, k); -@@ -657,18 +738,18 @@ int bch2_data_update_init(struct btree_trans *trans, - m->ctxt = ctxt; - m->stats = ctxt ? 
ctxt->stats : NULL; - -- bch2_write_op_init(&m->op, c, io_opts); -+ bch2_write_op_init(&m->op, c, *io_opts); - m->op.pos = bkey_start_pos(k.k); - m->op.version = k.k->bversion; - m->op.target = data_opts.target; - m->op.write_point = wp; - m->op.nr_replicas = 0; -- m->op.flags |= BCH_WRITE_PAGES_STABLE| -- BCH_WRITE_PAGES_OWNED| -- BCH_WRITE_DATA_ENCODED| -- BCH_WRITE_MOVE| -+ m->op.flags |= BCH_WRITE_pages_stable| -+ BCH_WRITE_pages_owned| -+ BCH_WRITE_data_encoded| -+ BCH_WRITE_move| - m->data_opts.write_flags; -- m->op.compression_opt = io_opts.background_compression; -+ m->op.compression_opt = io_opts->background_compression; - m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; - - unsigned durability_have = 0, durability_removing = 0; -@@ -706,7 +787,7 @@ int bch2_data_update_init(struct btree_trans *trans, - ptr_bit <<= 1; - } - -- unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have)); -+ unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have)); - - /* - * If current extent durability is less than io_opts.data_replicas, -@@ -739,8 +820,16 @@ int bch2_data_update_init(struct btree_trans *trans, - m->data_opts.rewrite_ptrs = 0; - /* if iter == NULL, it's just a promote */ - if (iter) -- ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts); -- goto out; -+ ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts); -+ if (!ret) -+ ret = -BCH_ERR_data_update_done_no_writes_needed; -+ goto out_bkey_buf_exit; -+ } -+ -+ if ((m->op.flags & BCH_WRITE_alloc_nowait) && -+ !can_allocate_without_blocking(c, m)) { -+ ret = -BCH_ERR_data_update_done_would_block; -+ goto out_bkey_buf_exit; - } - - if (reserve_sectors) { -@@ -749,18 +838,41 @@ int bch2_data_update_init(struct btree_trans *trans, - ? 
0 - : BCH_DISK_RESERVATION_NOFAIL); - if (ret) -- goto out; -+ goto out_bkey_buf_exit; -+ } -+ -+ if (!bkey_get_dev_refs(c, k)) { -+ ret = -BCH_ERR_data_update_done_no_dev_refs; -+ goto out_put_disk_res; -+ } -+ -+ if (c->opts.nocow_enabled && -+ !bkey_nocow_lock(c, ctxt, k)) { -+ ret = -BCH_ERR_nocow_lock_blocked; -+ goto out_put_dev_refs; - } - - if (bkey_extent_is_unwritten(k)) { -- bch2_update_unwritten_extent(trans, m); -- goto out; -+ ret = bch2_update_unwritten_extent(trans, m) ?: -+ -BCH_ERR_data_update_done_unwritten; -+ goto out_nocow_unlock; - } - -+ ret = bch2_data_update_bios_init(m, c, io_opts); -+ if (ret) -+ goto out_nocow_unlock; -+ - return 0; --out: -- bch2_data_update_exit(m); -- return ret ?: -BCH_ERR_data_update_done; -+out_nocow_unlock: -+ if (c->opts.nocow_enabled) -+ bkey_nocow_unlock(c, k); -+out_put_dev_refs: -+ bkey_put_dev_refs(c, k); -+out_put_disk_res: -+ bch2_disk_reservation_put(c, &m->op.res); -+out_bkey_buf_exit: -+ bch2_bkey_buf_exit(&m->k, c); -+ return ret; - } - - void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) -diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h -index e4b50723428e..c194cbbf5b51 100644 ---- a/fs/bcachefs/data_update.h -+++ b/fs/bcachefs/data_update.h -@@ -4,6 +4,7 @@ - #define _BCACHEFS_DATA_UPDATE_H - - #include "bkey_buf.h" -+#include "io_read.h" - #include "io_write_types.h" - - struct moving_context; -@@ -15,6 +16,9 @@ struct data_update_opts { - u8 extra_replicas; - unsigned btree_insert_flags; - unsigned write_flags; -+ -+ int read_dev; -+ bool scrub; - }; - - void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, -@@ -22,20 +26,24 @@ void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, - - struct data_update { - /* extent being updated: */ -+ bool read_done; - enum btree_id btree_id; - struct bkey_buf k; - struct data_update_opts data_opts; - struct moving_context *ctxt; - struct bch_move_stats *stats; -+ -+ struct bch_read_bio rbio; - struct bch_write_op op; -+ struct bio_vec *bvecs; - }; - - void bch2_data_update_to_text(struct printbuf *, struct data_update *); -+void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *); - - int bch2_data_update_index_update(struct bch_write_op *); - --void bch2_data_update_read_done(struct data_update *, -- struct bch_extent_crc_unpacked); -+void bch2_data_update_read_done(struct data_update *); - - int bch2_extent_drop_ptrs(struct btree_trans *, - struct btree_iter *, -@@ -43,12 +51,15 @@ int bch2_extent_drop_ptrs(struct btree_trans *, - struct bch_io_opts *, - struct data_update_opts *); - -+int bch2_data_update_bios_init(struct data_update *, struct bch_fs *, -+ struct bch_io_opts *); -+ - void bch2_data_update_exit(struct data_update *); - int bch2_data_update_init(struct btree_trans *, struct btree_iter *, - struct moving_context *, - struct data_update *, - struct write_point_specifier, -- struct bch_io_opts, struct data_update_opts, -+ struct bch_io_opts *, struct data_update_opts, - enum btree_id, struct bkey_s_c); - void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *); - -diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c -index 55333e82d1fe..788af88f6979 100644 ---- a/fs/bcachefs/debug.c -+++ b/fs/bcachefs/debug.c -@@ -7,6 +7,7 @@ - */ - - #include "bcachefs.h" -+#include "alloc_foreground.h" - #include "bkey_methods.h" - #include "btree_cache.h" - #include "btree_io.h" -@@ -190,7 +191,7 @@ void bch2_btree_node_ondisk_to_text(struct 
printbuf *out, struct bch_fs *c, - unsigned offset = 0; - int ret; - -- if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) { -+ if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick, -1) <= 0) { - prt_printf(out, "error getting device to read from: invalid device\n"); - return; - } -@@ -844,8 +845,11 @@ static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c) - seqmutex_unlock(&c->btree_trans_lock); - } - --static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, -- size_t size, loff_t *ppos) -+typedef void (*fs_to_text_fn)(struct printbuf *, struct bch_fs *); -+ -+static ssize_t bch2_simple_print(struct file *file, char __user *buf, -+ size_t size, loff_t *ppos, -+ fs_to_text_fn fn) - { - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; -@@ -856,7 +860,7 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, - i->ret = 0; - - if (!i->iter) { -- btree_deadlock_to_text(&i->buf, c); -+ fn(&i->buf, c); - i->iter++; - } - -@@ -869,6 +873,12 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, - return ret ?: i->ret; - } - -+static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, -+ size_t size, loff_t *ppos) -+{ -+ return bch2_simple_print(file, buf, size, ppos, btree_deadlock_to_text); -+} -+ - static const struct file_operations btree_deadlock_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, -@@ -876,6 +886,19 @@ static const struct file_operations btree_deadlock_ops = { - .read = bch2_btree_deadlock_read, - }; - -+static ssize_t bch2_write_points_read(struct file *file, char __user *buf, -+ size_t size, loff_t *ppos) -+{ -+ return bch2_simple_print(file, buf, size, ppos, bch2_write_points_to_text); -+} -+ -+static const struct file_operations write_points_ops = { -+ .owner = THIS_MODULE, -+ .open = bch2_dump_open, -+ .release = bch2_dump_release, -+ .read = bch2_write_points_read, -+}; -+ - void bch2_fs_debug_exit(struct bch_fs *c) - { - if (!IS_ERR_OR_NULL(c->fs_debug_dir)) -@@ -927,6 +950,9 @@ void bch2_fs_debug_init(struct bch_fs *c) - debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir, - c->btree_debug, &btree_deadlock_ops); - -+ debugfs_create_file("write_points", 0400, c->fs_debug_dir, -+ c->btree_debug, &write_points_ops); -+ - c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); - if (IS_ERR_OR_NULL(c->btree_debug_dir)) - return; -diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c -index b211e90ac54e..1aa56d28de33 100644 ---- a/fs/bcachefs/ec.c -+++ b/fs/bcachefs/ec.c -@@ -1056,6 +1056,11 @@ static inline void ec_stripes_heap_swap(void *l, void *r, void *h) - ec_stripes_heap_set_backpointer(_h, j); - } - -+static const struct min_heap_callbacks callbacks = { -+ .less = ec_stripes_heap_cmp, -+ .swp = ec_stripes_heap_swap, -+}; -+ - static void heap_verify_backpointer(struct bch_fs *c, size_t idx) - { - ec_stripes_heap *h = &c->ec_stripes_heap; -@@ -1068,11 +1073,6 @@ static void heap_verify_backpointer(struct bch_fs *c, size_t idx) - void bch2_stripes_heap_del(struct bch_fs *c, - struct stripe *m, size_t idx) - { -- const struct min_heap_callbacks callbacks = { -- .less = ec_stripes_heap_cmp, -- .swp = ec_stripes_heap_swap, -- }; -- - mutex_lock(&c->ec_stripes_heap_lock); - heap_verify_backpointer(c, idx); - -@@ -1083,11 +1083,6 @@ void bch2_stripes_heap_del(struct bch_fs *c, - void bch2_stripes_heap_insert(struct bch_fs *c, - struct stripe *m, size_t idx) - { -- const struct 
min_heap_callbacks callbacks = { -- .less = ec_stripes_heap_cmp, -- .swp = ec_stripes_heap_swap, -- }; -- - mutex_lock(&c->ec_stripes_heap_lock); - BUG_ON(min_heap_full(&c->ec_stripes_heap)); - -@@ -1106,10 +1101,6 @@ void bch2_stripes_heap_insert(struct bch_fs *c, - void bch2_stripes_heap_update(struct bch_fs *c, - struct stripe *m, size_t idx) - { -- const struct min_heap_callbacks callbacks = { -- .less = ec_stripes_heap_cmp, -- .swp = ec_stripes_heap_swap, -- }; - ec_stripes_heap *h = &c->ec_stripes_heap; - bool do_deletes; - size_t i; -@@ -1389,8 +1380,12 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b - if (bp_k.k->type != KEY_TYPE_backpointer) - continue; - -+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); -+ if (bp.v->btree_id == BTREE_ID_stripes) -+ continue; -+ - ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, -- bkey_s_c_to_backpointer(bp_k), &last_flushed); -+ bp, &last_flushed); - })); - - bch2_bkey_buf_exit(&last_flushed, c); -diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h -index 4590cd0c7c90..89df97810076 100644 ---- a/fs/bcachefs/errcode.h -+++ b/fs/bcachefs/errcode.h -@@ -180,6 +180,11 @@ - x(EINVAL, not_in_recovery) \ - x(EINVAL, cannot_rewind_recovery) \ - x(0, data_update_done) \ -+ x(BCH_ERR_data_update_done, data_update_done_would_block) \ -+ x(BCH_ERR_data_update_done, data_update_done_unwritten) \ -+ x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \ -+ x(BCH_ERR_data_update_done, data_update_done_no_snapshot) \ -+ x(BCH_ERR_data_update_done, data_update_done_no_dev_refs) \ - x(EINVAL, device_state_not_allowed) \ - x(EINVAL, member_info_missing) \ - x(EINVAL, mismatched_block_size) \ -@@ -269,6 +274,7 @@ - x(EIO, invalidate_stripe_to_dev) \ - x(EIO, no_encryption_key) \ - x(EIO, insufficient_journal_devices) \ -+ x(EIO, device_offline) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ -diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c -index 038da6a61f6b..c8fc58fab958 100644 ---- a/fs/bcachefs/error.c -+++ b/fs/bcachefs/error.c -@@ -530,35 +530,53 @@ void bch2_flush_fsck_errs(struct bch_fs *c) - mutex_unlock(&c->fsck_error_msgs_lock); - } - --int bch2_inum_err_msg_trans(struct btree_trans *trans, struct printbuf *out, subvol_inum inum) -+int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, -+ subvol_inum inum, u64 offset) - { - u32 restart_count = trans->restart_count; - int ret = 0; - -- /* XXX: we don't yet attempt to print paths when we don't know the subvol */ -- if (inum.subvol) -- ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out)); -+ if (inum.subvol) { -+ ret = bch2_inum_to_path(trans, inum, out); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ return ret; -+ } - if (!inum.subvol || ret) - prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum); -+ prt_printf(out, " offset %llu: ", offset); - - return trans_was_restarted(trans, restart_count); - } - --int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, -- subvol_inum inum, u64 offset) -+void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, -+ subvol_inum inum, u64 offset) - { -- int ret = bch2_inum_err_msg_trans(trans, out, inum); -- prt_printf(out, " offset %llu: ", offset); -- return ret; -+ bch2_trans_do(c, bch2_inum_offset_err_msg_trans(trans, out, inum, 
offset)); - } - --void bch2_inum_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum) -+int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, -+ struct bpos pos) - { -- bch2_trans_run(c, bch2_inum_err_msg_trans(trans, out, inum)); --} -+ struct bch_fs *c = trans->c; -+ int ret = 0; - --void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, -- subvol_inum inum, u64 offset) --{ -- bch2_trans_run(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); -+ if (!bch2_snapshot_is_leaf(c, pos.snapshot)) -+ prt_str(out, "(multiple snapshots) "); -+ -+ subvol_inum inum = { -+ .subvol = bch2_snapshot_tree_oldest_subvol(c, pos.snapshot), -+ .inum = pos.inode, -+ }; -+ -+ if (inum.subvol) { -+ ret = bch2_inum_to_path(trans, inum, out); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ return ret; -+ } -+ -+ if (!inum.subvol || ret) -+ prt_printf(out, "inum %llu:%u", pos.inode, pos.snapshot); -+ -+ prt_printf(out, " offset %llu: ", pos.offset << 8); -+ return 0; - } -diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h -index 7acf2a27ca28..76da0e88cee8 100644 ---- a/fs/bcachefs/error.h -+++ b/fs/bcachefs/error.h -@@ -238,10 +238,10 @@ void bch2_io_error(struct bch_dev *, enum bch_member_error_type); - _ret; \ - }) - --int bch2_inum_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum); - int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64); - --void bch2_inum_err_msg(struct bch_fs *, struct printbuf *, subvol_inum); - void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64); - -+int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos); -+ - #endif /* _BCACHEFS_ERROR_H */ -diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c -index 05d5f71a7ca9..78a51d96bd2d 100644 ---- a/fs/bcachefs/extents.c -+++ b/fs/bcachefs/extents.c -@@ -114,8 +114,9 @@ static inline bool ptr_better(struct bch_fs *c, - * other devices, it will still pick a pointer from avoid. - */ - int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, -- struct bch_io_failures *failed, -- struct extent_ptr_decoded *pick) -+ struct bch_io_failures *failed, -+ struct extent_ptr_decoded *pick, -+ int dev) - { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; -@@ -137,6 +138,10 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, - break; - } - -+ /* Are we being asked to read from a specific device? 
*/ -+ if (dev >= 0 && p.ptr.dev != dev) -+ continue; -+ - /* - * If there are any dirty pointers it's an error if we can't - * read: -diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h -index 620b284aa34f..8fae6b23a341 100644 ---- a/fs/bcachefs/extents.h -+++ b/fs/bcachefs/extents.h -@@ -404,7 +404,7 @@ void bch2_mark_io_failure(struct bch_io_failures *, - struct extent_ptr_decoded *); - int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, - struct bch_io_failures *, -- struct extent_ptr_decoded *); -+ struct extent_ptr_decoded *, int); - - /* KEY_TYPE_btree_ptr: */ - -diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c -index 2eaffe37b5e7..0e742555cb0a 100644 ---- a/fs/bcachefs/eytzinger.c -+++ b/fs/bcachefs/eytzinger.c -@@ -148,89 +148,99 @@ static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *pr - return cmp(a, b, priv); - } - --static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size, -+static inline int eytzinger1_do_cmp(void *base1, size_t n, size_t size, - cmp_r_func_t cmp_func, const void *priv, - size_t l, size_t r) - { -- return do_cmp(base + inorder_to_eytzinger0(l, n) * size, -- base + inorder_to_eytzinger0(r, n) * size, -+ return do_cmp(base1 + inorder_to_eytzinger1(l, n) * size, -+ base1 + inorder_to_eytzinger1(r, n) * size, - cmp_func, priv); - } - --static inline void eytzinger0_do_swap(void *base, size_t n, size_t size, -+static inline void eytzinger1_do_swap(void *base1, size_t n, size_t size, - swap_r_func_t swap_func, const void *priv, - size_t l, size_t r) - { -- do_swap(base + inorder_to_eytzinger0(l, n) * size, -- base + inorder_to_eytzinger0(r, n) * size, -+ do_swap(base1 + inorder_to_eytzinger1(l, n) * size, -+ base1 + inorder_to_eytzinger1(r, n) * size, - size, swap_func, priv); - } - --void eytzinger0_sort_r(void *base, size_t n, size_t size, -- cmp_r_func_t cmp_func, -- swap_r_func_t swap_func, -- const void *priv) -+static void eytzinger1_sort_r(void *base1, size_t n, size_t size, -+ cmp_r_func_t cmp_func, -+ swap_r_func_t swap_func, -+ const void *priv) - { -- int i, j, k; -+ unsigned i, j, k; - - /* called from 'sort' without swap function, let's pick the default */ - if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func) - swap_func = NULL; - - if (!swap_func) { -- if (is_aligned(base, size, 8)) -+ if (is_aligned(base1, size, 8)) - swap_func = SWAP_WORDS_64; -- else if (is_aligned(base, size, 4)) -+ else if (is_aligned(base1, size, 4)) - swap_func = SWAP_WORDS_32; - else - swap_func = SWAP_BYTES; - } - - /* heapify */ -- for (i = n / 2 - 1; i >= 0; --i) { -+ for (i = n / 2; i >= 1; --i) { - /* Find the sift-down path all the way to the leaves. */ -- for (j = i; k = j * 2 + 1, k + 1 < n;) -- j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; -+ for (j = i; k = j * 2, k < n;) -+ j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; - - /* Special case for the last leaf with no sibling. */ -- if (j * 2 + 2 == n) -- j = j * 2 + 1; -+ if (j * 2 == n) -+ j *= 2; - - /* Backtrack to the correct location. */ -- while (j != i && eytzinger0_do_cmp(base, n, size, cmp_func, priv, i, j) >= 0) -- j = (j - 1) / 2; -+ while (j != i && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, i, j) >= 0) -+ j /= 2; - - /* Shift the element into its correct place. 
*/ - for (k = j; j != i;) { -- j = (j - 1) / 2; -- eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); -+ j /= 2; -+ eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k); - } - } - - /* sort */ -- for (i = n - 1; i > 0; --i) { -- eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i); -+ for (i = n; i > 1; --i) { -+ eytzinger1_do_swap(base1, n, size, swap_func, priv, 1, i); - - /* Find the sift-down path all the way to the leaves. */ -- for (j = 0; k = j * 2 + 1, k + 1 < i;) -- j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; -+ for (j = 1; k = j * 2, k + 1 < i;) -+ j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; - - /* Special case for the last leaf with no sibling. */ -- if (j * 2 + 2 == i) -- j = j * 2 + 1; -+ if (j * 2 + 1 == i) -+ j *= 2; - - /* Backtrack to the correct location. */ -- while (j && eytzinger0_do_cmp(base, n, size, cmp_func, priv, 0, j) >= 0) -- j = (j - 1) / 2; -+ while (j >= 1 && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, 1, j) >= 0) -+ j /= 2; - - /* Shift the element into its correct place. */ -- for (k = j; j;) { -- j = (j - 1) / 2; -- eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); -+ for (k = j; j > 1;) { -+ j /= 2; -+ eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k); - } - } - } - -+void eytzinger0_sort_r(void *base, size_t n, size_t size, -+ cmp_r_func_t cmp_func, -+ swap_r_func_t swap_func, -+ const void *priv) -+{ -+ void *base1 = base - size; -+ -+ return eytzinger1_sort_r(base1, n, size, cmp_func, swap_func, priv); -+} -+ - void eytzinger0_sort(void *base, size_t n, size_t size, - cmp_func_t cmp_func, - swap_func_t swap_func) -diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h -index 0541192d7bc0..643c1f716061 100644 ---- a/fs/bcachefs/eytzinger.h -+++ b/fs/bcachefs/eytzinger.h -@@ -6,6 +6,7 @@ - #include - - #ifdef EYTZINGER_DEBUG -+#include - #define EYTZINGER_BUG_ON(cond) BUG_ON(cond) - #else - #define EYTZINGER_BUG_ON(cond) -@@ -56,24 +57,14 @@ static inline unsigned eytzinger1_last(unsigned size) - return rounddown_pow_of_two(size + 1) - 1; - } - --/* -- * eytzinger1_next() and eytzinger1_prev() have the nice properties that -- * -- * eytzinger1_next(0) == eytzinger1_first()) -- * eytzinger1_prev(0) == eytzinger1_last()) -- * -- * eytzinger1_prev(eytzinger1_first()) == 0 -- * eytzinger1_next(eytzinger1_last()) == 0 -- */ -- - static inline unsigned eytzinger1_next(unsigned i, unsigned size) - { -- EYTZINGER_BUG_ON(i > size); -+ EYTZINGER_BUG_ON(i == 0 || i > size); - - if (eytzinger1_right_child(i) <= size) { - i = eytzinger1_right_child(i); - -- i <<= __fls(size + 1) - __fls(i); -+ i <<= __fls(size) - __fls(i); - i >>= i > size; - } else { - i >>= ffz(i) + 1; -@@ -84,12 +75,12 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size) - - static inline unsigned eytzinger1_prev(unsigned i, unsigned size) - { -- EYTZINGER_BUG_ON(i > size); -+ EYTZINGER_BUG_ON(i == 0 || i > size); - - if (eytzinger1_left_child(i) <= size) { - i = eytzinger1_left_child(i) + 1; - -- i <<= __fls(size + 1) - __fls(i); -+ i <<= __fls(size) - __fls(i); - i -= 1; - i >>= i > size; - } else { -@@ -243,73 +234,63 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) - (_i) != -1; \ - (_i) = eytzinger0_next((_i), (_size))) - -+#define eytzinger0_for_each_prev(_i, _size) \ -+ for (unsigned (_i) = eytzinger0_last((_size)); \ -+ (_i) != -1; \ -+ (_i) = eytzinger0_prev((_i), (_size))) -+ - /* return greatest node <= @search, or -1 if not 
found */ - static inline int eytzinger0_find_le(void *base, size_t nr, size_t size, - cmp_func_t cmp, const void *search) - { -- unsigned i, n = 0; -- -- if (!nr) -- return -1; -- -- do { -- i = n; -- n = eytzinger0_child(i, cmp(base + i * size, search) <= 0); -- } while (n < nr); -- -- if (n & 1) { -- /* -- * @i was greater than @search, return previous node: -- * -- * if @i was leftmost/smallest element, -- * eytzinger0_prev(eytzinger0_first())) returns -1, as expected -- */ -- return eytzinger0_prev(i, nr); -- } else { -- return i; -- } -+ void *base1 = base - size; -+ unsigned n = 1; -+ -+ while (n <= nr) -+ n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0); -+ n >>= __ffs(n) + 1; -+ return n - 1; - } - -+/* return smallest node > @search, or -1 if not found */ - static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size, - cmp_func_t cmp, const void *search) - { -- ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); -+ void *base1 = base - size; -+ unsigned n = 1; - -- /* -- * if eytitzinger0_find_le() returned -1 - no element was <= search - we -- * want to return the first element; next/prev identities mean this work -- * as expected -- * -- * similarly if find_le() returns last element, we should return -1; -- * identities mean this all works out: -- */ -- return eytzinger0_next(idx, nr); -+ while (n <= nr) -+ n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0); -+ n >>= __ffs(n + 1) + 1; -+ return n - 1; - } - -+/* return smallest node >= @search, or -1 if not found */ - static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size, - cmp_func_t cmp, const void *search) - { -- ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); -- -- if (idx < nr && !cmp(base + idx * size, search)) -- return idx; -+ void *base1 = base - size; -+ unsigned n = 1; - -- return eytzinger0_next(idx, nr); -+ while (n <= nr) -+ n = eytzinger1_child(n, cmp(base1 + n * size, search) < 0); -+ n >>= __ffs(n + 1) + 1; -+ return n - 1; - } - - #define eytzinger0_find(base, nr, size, _cmp, search) \ - ({ \ -- void *_base = (base); \ -+ size_t _size = (size); \ -+ void *_base1 = (void *)(base) - _size; \ - const void *_search = (search); \ - size_t _nr = (nr); \ -- size_t _size = (size); \ -- size_t _i = 0; \ -+ size_t _i = 1; \ - int _res; \ - \ -- while (_i < _nr && \ -- (_res = _cmp(_search, _base + _i * _size))) \ -- _i = eytzinger0_child(_i, _res > 0); \ -- _i; \ -+ while (_i <= _nr && \ -+ (_res = _cmp(_search, _base1 + _i * _size))) \ -+ _i = eytzinger1_child(_i, _res > 0); \ -+ _i - 1; \ - }) - - void eytzinger0_sort_r(void *, size_t, size_t, -diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c -index ab1d5db2fa56..a1ccb9139b04 100644 ---- a/fs/bcachefs/fs-io-buffered.c -+++ b/fs/bcachefs/fs-io-buffered.c -@@ -149,12 +149,10 @@ static void bchfs_read(struct btree_trans *trans, - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_buf sk; -- int flags = BCH_READ_RETRY_IF_STALE| -- BCH_READ_MAY_PROMOTE; -+ int flags = BCH_READ_retry_if_stale| -+ BCH_READ_may_promote; - int ret = 0; - -- rbio->c = c; -- rbio->start_time = local_clock(); - rbio->subvol = inum.subvol; - - bch2_bkey_buf_init(&sk); -@@ -211,14 +209,14 @@ static void bchfs_read(struct btree_trans *trans, - swap(rbio->bio.bi_iter.bi_size, bytes); - - if (rbio->bio.bi_iter.bi_size == bytes) -- flags |= BCH_READ_LAST_FRAGMENT; -+ flags |= BCH_READ_last_fragment; - - bch2_bio_page_state_set(&rbio->bio, k); - - bch2_read_extent(trans, rbio, iter.pos, - 
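
/*
 * Illustration (not part of the patch): the rewritten eytzinger
 * searches above descend the implicit 1-indexed tree, recording each
 * branch decision as a bit, then recover the answer with one shift --
 * `n >>= __ffs(n) + 1` strips the trailing left-turns plus the final
 * right-turn, landing on the greatest element <= search (or 0, i.e.
 * "not found"). A minimal userspace sketch; ey_fill/ey_find_le are
 * illustrative names, and __builtin_ctz stands in for the kernel's
 * __ffs:
 */
#include <stdio.h>

/* Place a sorted array into 1-indexed eytzinger (BFS) order. */
static void ey_fill(const int *sorted, int *pos, int *ey,
		    unsigned i, unsigned n)
{
	if (i > n)
		return;
	ey_fill(sorted, pos, ey, 2 * i, n);	/* left subtree  */
	ey[i] = sorted[(*pos)++];		/* this node     */
	ey_fill(sorted, pos, ey, 2 * i + 1, n);	/* right subtree */
}

/* Greatest element <= search: returns 1-based node index, or -1. */
static int ey_find_le(const int *ey, unsigned n, int search)
{
	unsigned i = 1;

	while (i <= n)
		i = 2 * i + (ey[i] <= search);	/* go right while <= */
	i >>= __builtin_ctz(i) + 1;	/* back up to the last right turn */
	return i ? (int)i : -1;
}

int main(void)
{
	int sorted[] = { 10, 20, 30, 40, 50 }, ey[6], pos = 0;

	ey_fill(sorted, &pos, ey, 1, 5);
	/* prints "30 50 -1": the last right-turn is the predecessor */
	printf("%d ", ey[ey_find_le(ey, 5, 35)]);
	printf("%d ", ey[ey_find_le(ey, 5, 50)]);
	printf("%d\n", ey_find_le(ey, 5, 5));
	return 0;
}
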
data_btree, k, offset_into_extent, flags); - -- if (flags & BCH_READ_LAST_FRAGMENT) -+ if (flags & BCH_READ_last_fragment) - break; - - swap(rbio->bio.bi_iter.bi_size, bytes); -@@ -232,7 +230,8 @@ static void bchfs_read(struct btree_trans *trans, - - if (ret) { - struct printbuf buf = PRINTBUF; -- bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9); -+ lockrestart_do(trans, -+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9)); - prt_printf(&buf, "read error %i from btree lookup", ret); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); -@@ -280,12 +279,13 @@ void bch2_readahead(struct readahead_control *ractl) - struct bch_read_bio *rbio = - rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, - GFP_KERNEL, &c->bio_read), -- opts); -+ c, -+ opts, -+ bch2_readpages_end_io); - - readpage_iter_advance(&readpages_iter); - - rbio->bio.bi_iter.bi_sector = folio_sector(folio); -- rbio->bio.bi_end_io = bch2_readpages_end_io; - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); - - bchfs_read(trans, rbio, inode_inum(inode), -@@ -323,10 +323,10 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), -- opts); -+ c, -+ opts, -+ bch2_read_single_folio_end_io); - rbio->bio.bi_private = &done; -- rbio->bio.bi_end_io = bch2_read_single_folio_end_io; -- - rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; - rbio->bio.bi_iter.bi_sector = folio_sector(folio); - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); -@@ -420,7 +420,7 @@ static void bch2_writepage_io_done(struct bch_write_op *op) - } - } - -- if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { -+ if (io->op.flags & BCH_WRITE_wrote_data_inline) { - bio_for_each_folio_all(fi, bio) { - struct bch_folio *s; - -diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c -index 2089c36b5866..535bc5fcbcc0 100644 ---- a/fs/bcachefs/fs-io-direct.c -+++ b/fs/bcachefs/fs-io-direct.c -@@ -73,6 +73,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) - struct blk_plug plug; - loff_t offset = req->ki_pos; - bool sync = is_sync_kiocb(req); -+ bool split = false; - size_t shorten; - ssize_t ret; - -@@ -99,8 +100,6 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) - GFP_KERNEL, - &c->dio_read_bioset); - -- bio->bi_end_io = bch2_direct_IO_read_endio; -- - dio = container_of(bio, struct dio_read, rbio.bio); - closure_init(&dio->cl, NULL); - -@@ -133,12 +132,13 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) - - goto start; - while (iter->count) { -+ split = true; -+ - bio = bio_alloc_bioset(NULL, - bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), - REQ_OP_READ, - GFP_KERNEL, - &c->bio_read); -- bio->bi_end_io = bch2_direct_IO_read_split_endio; - start: - bio->bi_opf = REQ_OP_READ|REQ_SYNC; - bio->bi_iter.bi_sector = offset >> 9; -@@ -160,7 +160,15 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) - if (iter->count) - closure_get(&dio->cl); - -- bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); -+ struct bch_read_bio *rbio = -+ rbio_init(bio, -+ c, -+ opts, -+ split -+ ? 
bch2_direct_IO_read_split_endio -+ : bch2_direct_IO_read_endio); -+ -+ bch2_read(c, rbio, inode_inum(inode)); - } - - blk_finish_plug(&plug); -@@ -511,8 +519,8 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) - dio->op.devs_need_flush = &inode->ei_devs_need_flush; - - if (sync) -- dio->op.flags |= BCH_WRITE_SYNC; -- dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; -+ dio->op.flags |= BCH_WRITE_sync; -+ dio->op.flags |= BCH_WRITE_check_enospc; - - ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, - bio_sectors(bio), true); -diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c -index 8fcf7c8e5ede..53a421ff136d 100644 ---- a/fs/bcachefs/fsck.c -+++ b/fs/bcachefs/fsck.c -@@ -450,7 +450,7 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * - return ret; - - struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound); -- struct qstr name = (struct qstr) QSTR(name_buf); -+ struct qstr name = QSTR(name_buf); - - inode->bi_dir = lostfound.bi_inum; - -diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c -index 5353979117b0..6b842c8d21be 100644 ---- a/fs/bcachefs/io_misc.c -+++ b/fs/bcachefs/io_misc.c -@@ -115,7 +115,8 @@ int bch2_extent_fallocate(struct btree_trans *trans, - bch2_increment_clock(c, sectors_allocated, WRITE); - if (should_print_err(ret)) { - struct printbuf buf = PRINTBUF; -- bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9); -+ lockrestart_do(trans, -+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9)); - prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); -diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c -index 8c7b2d3d779d..821ff222b361 100644 ---- a/fs/bcachefs/io_read.c -+++ b/fs/bcachefs/io_read.c -@@ -80,6 +80,7 @@ struct promote_op { - struct rhash_head hash; - struct bpos pos; - -+ struct work_struct work; - struct data_update write; - struct bio_vec bi_inline_vecs[]; /* must be last */ - }; -@@ -96,6 +97,26 @@ static inline bool have_io_error(struct bch_io_failures *failed) - return failed && failed->nr; - } - -+static bool ptr_being_rewritten(struct bch_read_bio *orig, -+ unsigned dev, -+ unsigned flags) -+{ -+ if (!(flags & BCH_READ_data_update)) -+ return false; -+ -+ struct data_update *u = container_of(orig, struct data_update, rbio); -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k)); -+ unsigned i = 0; -+ bkey_for_each_ptr(ptrs, ptr) { -+ if (ptr->dev == dev && -+ u->data_opts.rewrite_ptrs & BIT(i)) -+ return true; -+ i++; -+ } -+ -+ return false; -+} -+ - static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, - struct bpos pos, - struct bch_io_opts opts, -@@ -105,7 +126,7 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, - if (!have_io_error(failed)) { - BUG_ON(!opts.promote_target); - -- if (!(flags & BCH_READ_MAY_PROMOTE)) -+ if (!(flags & BCH_READ_may_promote)) - return -BCH_ERR_nopromote_may_not; - - if (bch2_bkey_has_target(c, k, opts.promote_target)) -@@ -125,98 +146,94 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, - return 0; - } - --static void promote_free(struct bch_fs *c, struct promote_op *op) -+static noinline void promote_free(struct bch_read_bio *rbio) - { -- int ret; -+ struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); -+ struct bch_fs *c = rbio->c; -+ -+ int ret = rhashtable_remove_fast(&c->promote_table, &op->hash, -+ bch_promote_params); -+ 
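
/*
 * Illustration (not part of the patch): promote_free() above can walk
 * from the embedded read bio back to its promote_op because the rbio
 * now lives inside the data_update, which lives inside the promote_op,
 * so a single container_of() recovers the enclosing object. A sketch
 * with hypothetical stand-in structs (rbio/update/op are illustrative
 * names, not the bcachefs types):
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct rbio   { int status; };
struct update { struct rbio rbio; };	/* cf. data_update::rbio  */
struct op     { struct update write; };	/* cf. promote_op::write  */

int main(void)
{
	struct op o = { .write.rbio.status = 7 };
	struct rbio *r = &o.write.rbio;

	/* offsetof() accepts the nested designator, so one step suffices */
	struct op *back = container_of(r, struct op, write.rbio);

	printf("status=%d %s\n", back->write.rbio.status,
	       back == &o ? "(recovered enclosing op)" : "(bug)");
	return 0;
}
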
BUG_ON(ret); - - bch2_data_update_exit(&op->write); - -- ret = rhashtable_remove_fast(&c->promote_table, &op->hash, -- bch_promote_params); -- BUG_ON(ret); - bch2_write_ref_put(c, BCH_WRITE_REF_promote); - kfree_rcu(op, rcu); - } - - static void promote_done(struct bch_write_op *wop) - { -- struct promote_op *op = -- container_of(wop, struct promote_op, write.op); -- struct bch_fs *c = op->write.op.c; -+ struct promote_op *op = container_of(wop, struct promote_op, write.op); -+ struct bch_fs *c = op->write.rbio.c; - -- bch2_time_stats_update(&c->times[BCH_TIME_data_promote], -- op->start_time); -- promote_free(c, op); -+ bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time); -+ promote_free(&op->write.rbio); - } - --static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) -+static void promote_start_work(struct work_struct *work) - { -- struct bio *bio = &op->write.op.wbio.bio; -+ struct promote_op *op = container_of(work, struct promote_op, work); - -- trace_and_count(op->write.op.c, read_promote, &rbio->bio); -+ bch2_data_update_read_done(&op->write); -+} - -- /* we now own pages: */ -- BUG_ON(!rbio->bounce); -- BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); -+static noinline void promote_start(struct bch_read_bio *rbio) -+{ -+ struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); - -- memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, -- sizeof(struct bio_vec) * rbio->bio.bi_vcnt); -- swap(bio->bi_vcnt, rbio->bio.bi_vcnt); -+ trace_and_count(op->write.op.c, io_read_promote, &rbio->bio); - -- bch2_data_update_read_done(&op->write, rbio->pick.crc); -+ INIT_WORK(&op->work, promote_start_work); -+ queue_work(rbio->c->write_ref_wq, &op->work); - } - --static struct promote_op *__promote_alloc(struct btree_trans *trans, -- enum btree_id btree_id, -- struct bkey_s_c k, -- struct bpos pos, -- struct extent_ptr_decoded *pick, -- struct bch_io_opts opts, -- unsigned sectors, -- struct bch_read_bio **rbio, -- struct bch_io_failures *failed) -+static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, -+ enum btree_id btree_id, -+ struct bkey_s_c k, -+ struct bpos pos, -+ struct extent_ptr_decoded *pick, -+ unsigned sectors, -+ unsigned flags, -+ struct bch_read_bio *orig, -+ struct bch_io_failures *failed) - { - struct bch_fs *c = trans->c; -- struct promote_op *op = NULL; -- struct bio *bio; -- unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); - int ret; - -- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) -- return ERR_PTR(-BCH_ERR_nopromote_no_writes); -+ struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait }; - -- op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL); -- if (!op) { -- ret = -BCH_ERR_nopromote_enomem; -- goto err; -- } -+ if (!have_io_error(failed)) { -+ update_opts.target = orig->opts.promote_target; -+ update_opts.extra_replicas = 1; -+ update_opts.write_flags |= BCH_WRITE_cached; -+ update_opts.write_flags |= BCH_WRITE_only_specified_devs; -+ } else { -+ update_opts.target = orig->opts.foreground_target; - -- op->start_time = local_clock(); -- op->pos = pos; -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ unsigned ptr_bit = 1; -+ bkey_for_each_ptr(ptrs, ptr) { -+ if (bch2_dev_io_failures(failed, ptr->dev) && -+ !ptr_being_rewritten(orig, ptr->dev, flags)) -+ update_opts.rewrite_ptrs |= ptr_bit; -+ ptr_bit <<= 1; -+ } - -- /* -- * We don't use the mempool here because extents that aren't -- * checksummed or compressed can be too big for the mempool: -- */ -- *rbio 
= kzalloc(sizeof(struct bch_read_bio) + -- sizeof(struct bio_vec) * pages, -- GFP_KERNEL); -- if (!*rbio) { -- ret = -BCH_ERR_nopromote_enomem; -- goto err; -+ if (!update_opts.rewrite_ptrs) -+ return NULL; - } - -- rbio_init(&(*rbio)->bio, opts); -- bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); -+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) -+ return ERR_PTR(-BCH_ERR_nopromote_no_writes); - -- if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) { -+ struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL); -+ if (!op) { - ret = -BCH_ERR_nopromote_enomem; -- goto err; -+ goto err_put; - } - -- (*rbio)->bounce = true; -- (*rbio)->split = true; -- (*rbio)->kmalloc = true; -+ op->start_time = local_clock(); -+ op->pos = pos; - - if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, - bch_promote_params)) { -@@ -224,64 +241,43 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, - goto err; - } - -- bio = &op->write.op.wbio.bio; -- bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); -- -- struct data_update_opts update_opts = {}; -- -- if (!have_io_error(failed)) { -- update_opts.target = opts.promote_target; -- update_opts.extra_replicas = 1; -- update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED; -- } else { -- update_opts.target = opts.foreground_target; -- -- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -- unsigned ptr_bit = 1; -- bkey_for_each_ptr(ptrs, ptr) { -- if (bch2_dev_io_failures(failed, ptr->dev)) -- update_opts.rewrite_ptrs |= ptr_bit; -- ptr_bit <<= 1; -- } -- } -- - ret = bch2_data_update_init(trans, NULL, NULL, &op->write, - writepoint_hashed((unsigned long) current), -- opts, -+ &orig->opts, - update_opts, - btree_id, k); - /* - * possible errors: -BCH_ERR_nocow_lock_blocked, - * -BCH_ERR_ENOSPC_disk_reservation: - */ -- if (ret) { -- BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, -- bch_promote_params)); -- goto err; -- } -+ if (ret) -+ goto err_remove_hash; - -+ rbio_init_fragment(&op->write.rbio.bio, orig); -+ op->write.rbio.bounce = true; -+ op->write.rbio.promote = true; - op->write.op.end_io = promote_done; - -- return op; -+ return &op->write.rbio; -+err_remove_hash: -+ BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, -+ bch_promote_params)); - err: -- if (*rbio) -- bio_free_pages(&(*rbio)->bio); -- kfree(*rbio); -- *rbio = NULL; -+ bio_free_pages(&op->write.op.wbio.bio); - /* We may have added to the rhashtable and thus need rcu freeing: */ - kfree_rcu(op, rcu); -+err_put: - bch2_write_ref_put(c, BCH_WRITE_REF_promote); - return ERR_PTR(ret); - } - - noinline --static struct promote_op *promote_alloc(struct btree_trans *trans, -+static struct bch_read_bio *promote_alloc(struct btree_trans *trans, - struct bvec_iter iter, - struct bkey_s_c k, - struct extent_ptr_decoded *pick, -- struct bch_io_opts opts, - unsigned flags, -- struct bch_read_bio **rbio, -+ struct bch_read_bio *orig, - bool *bounce, - bool *read_full, - struct bch_io_failures *failed) -@@ -301,18 +297,21 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, - struct bpos pos = promote_full - ? bkey_start_pos(k.k) - : POS(k.k->p.inode, iter.bi_sector); -- struct promote_op *promote; - int ret; - -- ret = should_promote(c, k, pos, opts, flags, failed); -+ ret = should_promote(c, k, pos, orig->opts, flags, failed); - if (ret) - goto nopromote; - -- promote = __promote_alloc(trans, -- k.k->type == KEY_TYPE_reflink_v -- ? 
BTREE_ID_reflink -- : BTREE_ID_extents, -- k, pos, pick, opts, sectors, rbio, failed); -+ struct bch_read_bio *promote = -+ __promote_alloc(trans, -+ k.k->type == KEY_TYPE_reflink_v -+ ? BTREE_ID_reflink -+ : BTREE_ID_extents, -+ k, pos, pick, sectors, flags, orig, failed); -+ if (!promote) -+ return NULL; -+ - ret = PTR_ERR_OR_ZERO(promote); - if (ret) - goto nopromote; -@@ -321,7 +320,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, - *read_full = promote_full; - return promote; - nopromote: -- trace_read_nopromote(c, ret); -+ trace_io_read_nopromote(c, ret); - return NULL; - } - -@@ -330,9 +329,10 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, - static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out, - struct bch_read_bio *rbio, struct bpos read_pos) - { -- return bch2_inum_offset_err_msg_trans(trans, out, -- (subvol_inum) { rbio->subvol, read_pos.inode }, -- read_pos.offset << 9); -+ return lockrestart_do(trans, -+ bch2_inum_offset_err_msg_trans(trans, out, -+ (subvol_inum) { rbio->subvol, read_pos.inode }, -+ read_pos.offset << 9)); - } - - static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, -@@ -375,20 +375,20 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) - { - BUG_ON(rbio->bounce && !rbio->split); - -- if (rbio->promote) -- promote_free(rbio->c, rbio->promote); -- rbio->promote = NULL; -- -- if (rbio->bounce) -- bch2_bio_free_pages_pool(rbio->c, &rbio->bio); -- - if (rbio->split) { - struct bch_read_bio *parent = rbio->parent; - -- if (rbio->kmalloc) -- kfree(rbio); -- else -+ if (unlikely(rbio->promote)) { -+ if (!rbio->bio.bi_status) -+ promote_start(rbio); -+ else -+ promote_free(rbio); -+ } else { -+ if (rbio->bounce) -+ bch2_bio_free_pages_pool(rbio->c, &rbio->bio); -+ - bio_put(&rbio->bio); -+ } - - rbio = parent; - } -@@ -408,61 +408,47 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) - bio_endio(&rbio->bio); - } - --static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, -+static noinline void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, - struct bch_io_failures *failed, - unsigned flags) - { -+ struct data_update *u = container_of(rbio, struct data_update, rbio); - struct btree_trans *trans = bch2_trans_get(c); -- struct btree_iter iter; -- struct bkey_buf sk; -- struct bkey_s_c k; -- int ret; -- -- flags &= ~BCH_READ_LAST_FRAGMENT; -- flags |= BCH_READ_MUST_CLONE; -- -- bch2_bkey_buf_init(&sk); -- -- bch2_trans_iter_init(trans, &iter, rbio->data_btree, -- rbio->read_pos, BTREE_ITER_slots); - retry: - bch2_trans_begin(trans); -- rbio->bio.bi_status = 0; - -- ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret = lockrestart_do(trans, -+ bkey_err(k = bch2_bkey_get_iter(trans, &iter, -+ u->btree_id, bkey_start_pos(&u->k.k->k), -+ 0))); - if (ret) - goto err; - -- bch2_bkey_buf_reassemble(&sk, c, k); -- k = bkey_i_to_s_c(sk.k); -- -- if (!bch2_bkey_matches_ptr(c, k, -- rbio->pick.ptr, -- rbio->data_pos.offset - -- rbio->pick.crc.offset)) { -+ if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) { - /* extent we wanted to read no longer exists: */ - rbio->hole = true; -- goto out; -+ goto err; - } - - ret = __bch2_read_extent(trans, rbio, bvec_iter, -- rbio->read_pos, -- rbio->data_btree, -- k, 0, failed, flags); -+ bkey_start_pos(&u->k.k->k), -+ u->btree_id, -+ bkey_i_to_s_c(u->k.k), -+ 
0, failed, flags, -1); -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ - if (ret == READ_RETRY) - goto retry; - if (ret) -- goto err; --out: -+ rbio->bio.bi_status = BLK_STS_IOERR; -+ -+ BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1); - bch2_rbio_done(rbio); -- bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); -- bch2_bkey_buf_exit(&sk, c); -- return; --err: -- rbio->bio.bi_status = BLK_STS_IOERR; -- goto out; - } - - static void bch2_rbio_retry(struct work_struct *work) -@@ -478,34 +464,36 @@ static void bch2_rbio_retry(struct work_struct *work) - }; - struct bch_io_failures failed = { .nr = 0 }; - -- trace_and_count(c, read_retry, &rbio->bio); -+ trace_io_read_retry(&rbio->bio); -+ this_cpu_add(c->counters[BCH_COUNTER_io_read_retry], -+ bvec_iter_sectors(rbio->bvec_iter)); - - if (rbio->retry == READ_RETRY_AVOID) - bch2_mark_io_failure(&failed, &rbio->pick); - -- rbio->bio.bi_status = 0; -+ if (!rbio->split) -+ rbio->bio.bi_status = 0; - - rbio = bch2_rbio_free(rbio); - -- flags |= BCH_READ_IN_RETRY; -- flags &= ~BCH_READ_MAY_PROMOTE; -+ flags |= BCH_READ_in_retry; -+ flags &= ~BCH_READ_may_promote; -+ flags &= ~BCH_READ_last_fragment; -+ flags |= BCH_READ_must_clone; - -- if (flags & BCH_READ_NODECODE) { -+ if (flags & BCH_READ_data_update) - bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); -- } else { -- flags &= ~BCH_READ_LAST_FRAGMENT; -- flags |= BCH_READ_MUST_CLONE; -- -+ else - __bch2_read(c, rbio, iter, inum, &failed, flags); -- } - } - - static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, - blk_status_t error) - { - rbio->retry = retry; -+ rbio->saw_error = true; - -- if (rbio->flags & BCH_READ_IN_RETRY) -+ if (rbio->flags & BCH_READ_in_retry) - return; - - if (retry == READ_ERR) { -@@ -712,32 +700,40 @@ static void __bch2_read_endio(struct work_struct *work) - if (unlikely(rbio->narrow_crcs)) - bch2_rbio_narrow_crcs(rbio); - -- if (rbio->flags & BCH_READ_NODECODE) -- goto nodecode; -+ if (likely(!(rbio->flags & BCH_READ_data_update))) { -+ /* Adjust crc to point to subset of data we want: */ -+ crc.offset += rbio->offset_into_extent; -+ crc.live_size = bvec_iter_sectors(rbio->bvec_iter); - -- /* Adjust crc to point to subset of data we want: */ -- crc.offset += rbio->offset_into_extent; -- crc.live_size = bvec_iter_sectors(rbio->bvec_iter); -+ if (crc_is_compressed(crc)) { -+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); -+ if (ret) -+ goto decrypt_err; - -- if (crc_is_compressed(crc)) { -- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); -- if (ret) -- goto decrypt_err; -+ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && -+ !c->opts.no_data_io) -+ goto decompression_err; -+ } else { -+ /* don't need to decrypt the entire bio: */ -+ nonce = nonce_add(nonce, crc.offset << 9); -+ bio_advance(src, crc.offset << 9); - -- if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && -- !c->opts.no_data_io) -- goto decompression_err; -- } else { -- /* don't need to decrypt the entire bio: */ -- nonce = nonce_add(nonce, crc.offset << 9); -- bio_advance(src, crc.offset << 9); -+ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); -+ src->bi_iter.bi_size = dst_iter.bi_size; - -- BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); -- src->bi_iter.bi_size = dst_iter.bi_size; -+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); -+ if (ret) -+ goto decrypt_err; - -- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); -- if (ret) -- goto decrypt_err; -+ if (rbio->bounce) { -+ struct bvec_iter src_iter = src->bi_iter; -+ -+ 
bio_copy_data_iter(dst, &dst_iter, src, &src_iter); -+ } -+ } -+ } else { -+ if (rbio->split) -+ rbio->parent->pick = rbio->pick; - - if (rbio->bounce) { - struct bvec_iter src_iter = src->bi_iter; -@@ -754,12 +750,9 @@ static void __bch2_read_endio(struct work_struct *work) - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; -- -- promote_start(rbio->promote, rbio); -- rbio->promote = NULL; - } --nodecode: -- if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { -+ -+ if (likely(!(rbio->flags & BCH_READ_in_retry))) { - rbio = bch2_rbio_free(rbio); - bch2_rbio_done(rbio); - } -@@ -772,8 +765,8 @@ static void __bch2_read_endio(struct work_struct *work) - * reading into buffers owned by userspace (that userspace can - * scribble over) - retry the read, bouncing it this time: - */ -- if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { -- rbio->flags |= BCH_READ_MUST_BOUNCE; -+ if (!rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) { -+ rbio->flags |= BCH_READ_must_bounce; - bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); - goto out; - } -@@ -810,11 +803,11 @@ static void bch2_read_endio(struct bio *bio) - return; - } - -- if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || -+ if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) || - (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) { -- trace_and_count(c, read_reuse_race, &rbio->bio); -+ trace_and_count(c, io_read_reuse_race, &rbio->bio); - -- if (rbio->flags & BCH_READ_RETRY_IF_STALE) -+ if (rbio->flags & BCH_READ_retry_if_stale) - bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); - else - bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); -@@ -883,12 +876,11 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - struct bvec_iter iter, struct bpos read_pos, - enum btree_id data_btree, struct bkey_s_c k, - unsigned offset_into_extent, -- struct bch_io_failures *failed, unsigned flags) -+ struct bch_io_failures *failed, unsigned flags, int dev) - { - struct bch_fs *c = trans->c; - struct extent_ptr_decoded pick; - struct bch_read_bio *rbio = NULL; -- struct promote_op *promote = NULL; - bool bounce = false, read_full = false, narrow_crcs = false; - struct bpos data_pos = bkey_start_pos(k.k); - int pick_ret; -@@ -902,10 +894,12 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - swap(iter.bi_size, bytes); - bio_advance_iter(&orig->bio, &iter, bytes); - zero_fill_bio_iter(&orig->bio, iter); -+ this_cpu_add(c->counters[BCH_COUNTER_io_read_inline], -+ bvec_iter_sectors(iter)); - goto out_read_done; - } - retry_pick: -- pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); -+ pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); - - /* hole or reservation - just zero fill: */ - if (!pick_ret) -@@ -941,7 +935,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - * retry path, don't check here, it'll be caught in bch2_read_endio() - * and we'll end up in the retry path: - */ -- if ((flags & BCH_READ_IN_RETRY) && -+ if ((flags & BCH_READ_in_retry) && - !pick.ptr.cached && - ca && - unlikely(dev_ptr_stale(ca, &pick.ptr))) { -@@ -955,48 +949,53 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - * Unlock the iterator while the btree node's lock is still in - * cache, before doing the IO: - */ -- bch2_trans_unlock(trans); -+ if (!(flags & BCH_READ_in_retry)) -+ bch2_trans_unlock(trans); -+ else -+ bch2_trans_unlock_long(trans); -+ -+ if (!(flags & 
BCH_READ_data_update)) { -+ if (!(flags & BCH_READ_last_fragment) || -+ bio_flagged(&orig->bio, BIO_CHAIN)) -+ flags |= BCH_READ_must_clone; -+ -+ narrow_crcs = !(flags & BCH_READ_in_retry) && -+ bch2_can_narrow_extent_crcs(k, pick.crc); -+ -+ if (narrow_crcs && (flags & BCH_READ_user_mapped)) -+ flags |= BCH_READ_must_bounce; - -- if (flags & BCH_READ_NODECODE) { -+ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); -+ -+ if (crc_is_compressed(pick.crc) || -+ (pick.crc.csum_type != BCH_CSUM_none && -+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || -+ (bch2_csum_type_is_encryption(pick.crc.csum_type) && -+ (flags & BCH_READ_user_mapped)) || -+ (flags & BCH_READ_must_bounce)))) { -+ read_full = true; -+ bounce = true; -+ } -+ } else { -+ read_full = true; - /* - * can happen if we retry, and the extent we were going to read - * has been merged in the meantime: - */ -- if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) { -+ struct data_update *u = container_of(orig, struct data_update, rbio); -+ if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { -+ BUG(); - if (ca) - percpu_ref_put(&ca->io_ref); - goto hole; - } - - iter.bi_size = pick.crc.compressed_size << 9; -- goto get_bio; -- } -- -- if (!(flags & BCH_READ_LAST_FRAGMENT) || -- bio_flagged(&orig->bio, BIO_CHAIN)) -- flags |= BCH_READ_MUST_CLONE; -- -- narrow_crcs = !(flags & BCH_READ_IN_RETRY) && -- bch2_can_narrow_extent_crcs(k, pick.crc); -- -- if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) -- flags |= BCH_READ_MUST_BOUNCE; -- -- EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); -- -- if (crc_is_compressed(pick.crc) || -- (pick.crc.csum_type != BCH_CSUM_none && -- (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || -- (bch2_csum_type_is_encryption(pick.crc.csum_type) && -- (flags & BCH_READ_USER_MAPPED)) || -- (flags & BCH_READ_MUST_BOUNCE)))) { -- read_full = true; -- bounce = true; - } - - if (orig->opts.promote_target || have_io_error(failed)) -- promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, -- &rbio, &bounce, &read_full, failed); -+ rbio = promote_alloc(trans, iter, k, &pick, flags, orig, -+ &bounce, &read_full, failed); - - if (!read_full) { - EBUG_ON(crc_is_compressed(pick.crc)); -@@ -1015,7 +1014,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - pick.crc.offset = 0; - pick.crc.live_size = bvec_iter_sectors(iter); - } --get_bio: -+ - if (rbio) { - /* - * promote already allocated bounce rbio: -@@ -1030,17 +1029,16 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - } else if (bounce) { - unsigned sectors = pick.crc.compressed_size; - -- rbio = rbio_init(bio_alloc_bioset(NULL, -+ rbio = rbio_init_fragment(bio_alloc_bioset(NULL, - DIV_ROUND_UP(sectors, PAGE_SECTORS), - 0, - GFP_NOFS, - &c->bio_read_split), -- orig->opts); -+ orig); - - bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); - rbio->bounce = true; -- rbio->split = true; -- } else if (flags & BCH_READ_MUST_CLONE) { -+ } else if (flags & BCH_READ_must_clone) { - /* - * Have to clone if there were any splits, due to error - * reporting issues (if a split errored, and retrying didn't -@@ -1049,11 +1047,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - * from the whole bio, in which case we don't want to retry and - * lose the error) - */ -- rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, -+ rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, 
- &c->bio_read_split), -- orig->opts); -+ orig); - rbio->bio.bi_iter = iter; -- rbio->split = true; - } else { - rbio = orig; - rbio->bio.bi_iter = iter; -@@ -1062,11 +1059,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - - EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); - -- rbio->c = c; - rbio->submit_time = local_clock(); -- if (rbio->split) -- rbio->parent = orig; -- else -+ if (!rbio->split) - rbio->end_io = orig->bio.bi_end_io; - rbio->bvec_iter = iter; - rbio->offset_into_extent= offset_into_extent; -@@ -1076,41 +1070,38 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - rbio->hole = 0; - rbio->retry = 0; - rbio->context = 0; -- /* XXX: only initialize this if needed */ -- rbio->devs_have = bch2_bkey_devs(k); - rbio->pick = pick; - rbio->subvol = orig->subvol; - rbio->read_pos = read_pos; - rbio->data_btree = data_btree; - rbio->data_pos = data_pos; - rbio->version = k.k->bversion; -- rbio->promote = promote; - INIT_WORK(&rbio->work, NULL); - -- if (flags & BCH_READ_NODECODE) -- orig->pick = pick; -- - rbio->bio.bi_opf = orig->bio.bi_opf; - rbio->bio.bi_iter.bi_sector = pick.ptr.offset; - rbio->bio.bi_end_io = bch2_read_endio; - - if (rbio->bounce) -- trace_and_count(c, read_bounce, &rbio->bio); -+ trace_and_count(c, io_read_bounce, &rbio->bio); - -- this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); -+ if (!(flags & BCH_READ_data_update)) -+ this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); -+ else -+ this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio)); - bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - - /* - * If it's being moved internally, we don't want to flag it as a cache - * hit: - */ -- if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE)) -+ if (ca && pick.ptr.cached && !(flags & BCH_READ_data_update)) - bch2_bucket_io_time_reset(trans, pick.ptr.dev, - PTR_BUCKET_NR(ca, &pick.ptr), READ); - -- if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { -+ if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) { - bio_inc_remaining(&orig->bio); -- trace_and_count(c, read_split, &orig->bio); -+ trace_and_count(c, io_read_split, &orig->bio); - } - - if (!rbio->pick.idx) { -@@ -1132,10 +1123,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - bio_set_dev(&rbio->bio, ca->disk_sb.bdev); - - if (unlikely(c->opts.no_data_io)) { -- if (likely(!(flags & BCH_READ_IN_RETRY))) -+ if (likely(!(flags & BCH_READ_in_retry))) - bio_endio(&rbio->bio); - } else { -- if (likely(!(flags & BCH_READ_IN_RETRY))) -+ if (likely(!(flags & BCH_READ_in_retry))) - submit_bio(&rbio->bio); - else - submit_bio_wait(&rbio->bio); -@@ -1153,11 +1144,11 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - goto out; - } - -- if (likely(!(flags & BCH_READ_IN_RETRY))) -+ if (likely(!(flags & BCH_READ_in_retry))) - bio_endio(&rbio->bio); - } - out: -- if (likely(!(flags & BCH_READ_IN_RETRY))) { -+ if (likely(!(flags & BCH_READ_in_retry))) { - return 0; - } else { - int ret; -@@ -1180,24 +1171,26 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - } - - err: -- if (flags & BCH_READ_IN_RETRY) -+ if (flags & BCH_READ_in_retry) - return READ_ERR; - - orig->bio.bi_status = BLK_STS_IOERR; - goto out_read_done; - - hole: -+ this_cpu_add(c->counters[BCH_COUNTER_io_read_hole], -+ bvec_iter_sectors(iter)); - /* -- * won't normally happen in the BCH_READ_NODECODE -+ * 
won't normally happen in the BCH_READ_data_update - * (bch2_move_extent()) path, but if we retry and the extent we wanted - * to read no longer exists we have to signal that: - */ -- if (flags & BCH_READ_NODECODE) -+ if (flags & BCH_READ_data_update) - orig->hole = true; - - zero_fill_bio_iter(&orig->bio, iter); - out_read_done: -- if (flags & BCH_READ_LAST_FRAGMENT) -+ if (flags & BCH_READ_last_fragment) - bch2_rbio_done(orig); - return 0; - } -@@ -1212,7 +1205,7 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - struct bkey_s_c k; - int ret; - -- BUG_ON(flags & BCH_READ_NODECODE); -+ BUG_ON(flags & BCH_READ_data_update); - - bch2_bkey_buf_init(&sk); - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, -@@ -1262,15 +1255,15 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - swap(bvec_iter.bi_size, bytes); - - if (bvec_iter.bi_size == bytes) -- flags |= BCH_READ_LAST_FRAGMENT; -+ flags |= BCH_READ_last_fragment; - - ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos, - data_btree, k, -- offset_into_extent, failed, flags); -+ offset_into_extent, failed, flags, -1); - if (ret) - goto err; - -- if (flags & BCH_READ_LAST_FRAGMENT) -+ if (flags & BCH_READ_last_fragment) - break; - - swap(bvec_iter.bi_size, bytes); -@@ -1287,7 +1280,9 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - - if (ret) { - struct printbuf buf = PRINTBUF; -- bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9); -+ lockrestart_do(trans, -+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, -+ bvec_iter.bi_sector << 9)); - prt_printf(&buf, "read error %i from btree lookup", ret); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); -diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h -index a82e8a94ccb6..73275da5d2c4 100644 ---- a/fs/bcachefs/io_read.h -+++ b/fs/bcachefs/io_read.h -@@ -35,20 +35,19 @@ struct bch_read_bio { - u16 flags; - union { - struct { -- u16 bounce:1, -+ u16 promote:1, -+ bounce:1, - split:1, -- kmalloc:1, - have_ioref:1, - narrow_crcs:1, - hole:1, -+ saw_error:1, - retry:2, - context:2; - }; - u16 _state; - }; - -- struct bch_devs_list devs_have; -- - struct extent_ptr_decoded pick; - - /* -@@ -65,8 +64,6 @@ struct bch_read_bio { - struct bpos data_pos; - struct bversion version; - -- struct promote_op *promote; -- - struct bch_io_opts opts; - - struct work_struct work; -@@ -108,23 +105,32 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans, - return 0; - } - -+#define BCH_READ_FLAGS() \ -+ x(retry_if_stale) \ -+ x(may_promote) \ -+ x(user_mapped) \ -+ x(data_update) \ -+ x(last_fragment) \ -+ x(must_bounce) \ -+ x(must_clone) \ -+ x(in_retry) -+ -+enum __bch_read_flags { -+#define x(n) __BCH_READ_##n, -+ BCH_READ_FLAGS() -+#undef x -+}; -+ - enum bch_read_flags { -- BCH_READ_RETRY_IF_STALE = 1 << 0, -- BCH_READ_MAY_PROMOTE = 1 << 1, -- BCH_READ_USER_MAPPED = 1 << 2, -- BCH_READ_NODECODE = 1 << 3, -- BCH_READ_LAST_FRAGMENT = 1 << 4, -- -- /* internal: */ -- BCH_READ_MUST_BOUNCE = 1 << 5, -- BCH_READ_MUST_CLONE = 1 << 6, -- BCH_READ_IN_RETRY = 1 << 7, -+#define x(n) BCH_READ_##n = BIT(__BCH_READ_##n), -+ BCH_READ_FLAGS() -+#undef x - }; - - int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, - struct bvec_iter, struct bpos, enum btree_id, - struct bkey_s_c, unsigned, -- struct bch_io_failures *, unsigned); -+ struct bch_io_failures *, unsigned, int); - - static inline void bch2_read_extent(struct btree_trans *trans, - struct bch_read_bio *rbio, struct bpos read_pos, 
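
/*
 * Illustration (not part of the patch): BCH_READ_FLAGS() above is the
 * usual x-macro idiom -- one list expanded twice, first into ordinal
 * enumerators, then into bit flags, so flag names and bit positions
 * cannot drift apart. The kernel code uses BIT(); plain shifts below.
 * DEMO_* names are illustrative:
 */
#include <stdio.h>

#define DEMO_FLAGS()		\
	x(retry_if_stale)	\
	x(may_promote)		\
	x(user_mapped)

enum __demo_flags {
#define x(n) __DEMO_##n,
	DEMO_FLAGS()
#undef x
	__DEMO_NR,
};

enum demo_flags {
#define x(n) DEMO_##n = 1U << __DEMO_##n,
	DEMO_FLAGS()
#undef x
};

int main(void)
{
	/* prints: 3 flags, user_mapped = 0x4 */
	printf("%u flags, user_mapped = 0x%x\n",
	       (unsigned)__DEMO_NR, (unsigned)DEMO_user_mapped);
	return 0;
}
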
-@@ -132,7 +138,7 @@ static inline void bch2_read_extent(struct btree_trans *trans, - unsigned offset_into_extent, unsigned flags) - { - __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, -- data_btree, k, offset_into_extent, NULL, flags); -+ data_btree, k, offset_into_extent, NULL, flags, -1); - } - - void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, -@@ -145,24 +151,39 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - - BUG_ON(rbio->_state); - -- rbio->c = c; -- rbio->start_time = local_clock(); - rbio->subvol = inum.subvol; - - __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, -- BCH_READ_RETRY_IF_STALE| -- BCH_READ_MAY_PROMOTE| -- BCH_READ_USER_MAPPED); -+ BCH_READ_retry_if_stale| -+ BCH_READ_may_promote| -+ BCH_READ_user_mapped); - } - --static inline struct bch_read_bio *rbio_init(struct bio *bio, -- struct bch_io_opts opts) -+static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, -+ struct bch_read_bio *orig) - { - struct bch_read_bio *rbio = to_rbio(bio); - -+ rbio->c = orig->c; - rbio->_state = 0; -- rbio->promote = NULL; -- rbio->opts = opts; -+ rbio->split = true; -+ rbio->parent = orig; -+ rbio->opts = orig->opts; -+ return rbio; -+} -+ -+static inline struct bch_read_bio *rbio_init(struct bio *bio, -+ struct bch_fs *c, -+ struct bch_io_opts opts, -+ bio_end_io_t end_io) -+{ -+ struct bch_read_bio *rbio = to_rbio(bio); -+ -+ rbio->start_time = local_clock(); -+ rbio->c = c; -+ rbio->_state = 0; -+ rbio->opts = opts; -+ rbio->bio.bi_end_io = end_io; - return rbio; - } - -diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c -index dd508d93e9fc..0177198e90eb 100644 ---- a/fs/bcachefs/io_write.c -+++ b/fs/bcachefs/io_write.c -@@ -374,7 +374,7 @@ static int bch2_write_index_default(struct bch_write_op *op) - bch2_extent_update(trans, inum, &iter, sk.k, - &op->res, - op->new_i_size, &op->i_sectors_delta, -- op->flags & BCH_WRITE_CHECK_ENOSPC); -+ op->flags & BCH_WRITE_check_enospc); - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -@@ -403,7 +403,7 @@ static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, - (subvol_inum) { op->subvol, op->pos.inode, }, - offset << 9); - prt_printf(out, "write error%s: ", -- op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); -+ op->flags & BCH_WRITE_move ? "(internal move)" : ""); - } - - void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) -@@ -483,7 +483,7 @@ static void bch2_write_done(struct closure *cl) - bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); - bch2_disk_reservation_put(c, &op->res); - -- if (!(op->flags & BCH_WRITE_MOVE)) -+ if (!(op->flags & BCH_WRITE_move)) - bch2_write_ref_put(c, BCH_WRITE_REF_write); - bch2_keylist_free(&op->insert_keys, op->inline_keys); - -@@ -529,7 +529,7 @@ static void __bch2_write_index(struct bch_write_op *op) - unsigned dev; - int ret = 0; - -- if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { -+ if (unlikely(op->flags & BCH_WRITE_io_error)) { - ret = bch2_write_drop_io_error_ptrs(op); - if (ret) - goto err; -@@ -538,7 +538,7 @@ static void __bch2_write_index(struct bch_write_op *op) - if (!bch2_keylist_empty(keys)) { - u64 sectors_start = keylist_sectors(keys); - -- ret = !(op->flags & BCH_WRITE_MOVE) -+ ret = !(op->flags & BCH_WRITE_move) - ? 
bch2_write_index_default(op) - : bch2_data_update_index_update(op); - -@@ -570,14 +570,22 @@ static void __bch2_write_index(struct bch_write_op *op) - err: - keys->top = keys->keys; - op->error = ret; -- op->flags |= BCH_WRITE_SUBMITTED; -+ op->flags |= BCH_WRITE_submitted; - goto out; - } - - static inline void __wp_update_state(struct write_point *wp, enum write_point_state state) - { - if (state != wp->state) { -+ struct task_struct *p = current; - u64 now = ktime_get_ns(); -+ u64 runtime = p->se.sum_exec_runtime + -+ (now - p->se.exec_start); -+ -+ if (state == WRITE_POINT_runnable) -+ wp->last_runtime = runtime; -+ else if (wp->state == WRITE_POINT_runnable) -+ wp->time[WRITE_POINT_running] += runtime - wp->last_runtime; - - if (wp->last_state_change && - time_after64(now, wp->last_state_change)) -@@ -591,7 +599,7 @@ static inline void wp_update_state(struct write_point *wp, bool running) - { - enum write_point_state state; - -- state = running ? WRITE_POINT_running : -+ state = running ? WRITE_POINT_runnable: - !list_empty(&wp->writes) ? WRITE_POINT_waiting_io - : WRITE_POINT_stopped; - -@@ -605,8 +613,8 @@ static CLOSURE_CALLBACK(bch2_write_index) - struct workqueue_struct *wq = index_update_wq(op); - unsigned long flags; - -- if ((op->flags & BCH_WRITE_SUBMITTED) && -- (op->flags & BCH_WRITE_MOVE)) -+ if ((op->flags & BCH_WRITE_submitted) && -+ (op->flags & BCH_WRITE_move)) - bch2_bio_free_pages_pool(op->c, &op->wbio.bio); - - spin_lock_irqsave(&wp->writes_lock, flags); -@@ -644,11 +652,11 @@ void bch2_write_point_do_index_updates(struct work_struct *work) - if (!op) - break; - -- op->flags |= BCH_WRITE_IN_WORKER; -+ op->flags |= BCH_WRITE_in_worker; - - __bch2_write_index(op); - -- if (!(op->flags & BCH_WRITE_SUBMITTED)) -+ if (!(op->flags & BCH_WRITE_submitted)) - __bch2_write(op); - else - bch2_write_done(&op->cl); -@@ -672,7 +680,7 @@ static void bch2_write_endio(struct bio *bio) - "data write error: %s", - bch2_blk_status_to_str(bio->bi_status))) { - set_bit(wbio->dev, op->failed.d); -- op->flags |= BCH_WRITE_IO_ERROR; -+ op->flags |= BCH_WRITE_io_error; - } - - if (wbio->nocow) { -@@ -719,7 +727,7 @@ static void init_append_extent(struct bch_write_op *op, - bch2_extent_crc_append(&e->k_i, crc); - - bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, -- op->flags & BCH_WRITE_CACHED); -+ op->flags & BCH_WRITE_cached); - - bch2_keylist_push(&op->insert_keys); - } -@@ -836,7 +844,7 @@ static enum prep_encoded_ret { - struct bch_fs *c = op->c; - struct bio *bio = &op->wbio.bio; - -- if (!(op->flags & BCH_WRITE_DATA_ENCODED)) -+ if (!(op->flags & BCH_WRITE_data_encoded)) - return PREP_ENCODED_OK; - - BUG_ON(bio_sectors(bio) != op->crc.compressed_size); -@@ -944,9 +952,9 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, - if (ec_buf || - op->compression_opt || - (op->csum_type && -- !(op->flags & BCH_WRITE_PAGES_STABLE)) || -+ !(op->flags & BCH_WRITE_pages_stable)) || - (bch2_csum_type_is_encryption(op->csum_type) && -- !(op->flags & BCH_WRITE_PAGES_OWNED))) { -+ !(op->flags & BCH_WRITE_pages_owned))) { - dst = bch2_write_bio_alloc(c, wp, src, - &page_alloc_failed, - ec_buf); -@@ -966,7 +974,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, - break; - - BUG_ON(op->compression_opt && -- (op->flags & BCH_WRITE_DATA_ENCODED) && -+ (op->flags & BCH_WRITE_data_encoded) && - bch2_csum_type_is_encryption(op->crc.csum_type)); - BUG_ON(op->compression_opt && !bounce); - -@@ -1004,7 +1012,7 @@ static 
int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, - } - } - -- if ((op->flags & BCH_WRITE_DATA_ENCODED) && -+ if ((op->flags & BCH_WRITE_data_encoded) && - !crc_is_compressed(crc) && - bch2_csum_type_is_encryption(op->crc.csum_type) == - bch2_csum_type_is_encryption(op->csum_type)) { -@@ -1036,7 +1044,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, - crc.compression_type = compression_type; - crc.nonce = nonce; - } else { -- if ((op->flags & BCH_WRITE_DATA_ENCODED) && -+ if ((op->flags & BCH_WRITE_data_encoded) && - bch2_rechecksum_bio(c, src, version, op->crc, - NULL, &op->crc, - src_len >> 9, -@@ -1210,9 +1218,9 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) - - static void __bch2_nocow_write_done(struct bch_write_op *op) - { -- if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { -+ if (unlikely(op->flags & BCH_WRITE_io_error)) { - op->error = -EIO; -- } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) -+ } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten)) - bch2_nocow_write_convert_unwritten(op); - } - -@@ -1241,7 +1249,7 @@ static void bch2_nocow_write(struct bch_write_op *op) - struct bucket_to_lock *stale_at; - int stale, ret; - -- if (op->flags & BCH_WRITE_MOVE) -+ if (op->flags & BCH_WRITE_move) - return; - - darray_init(&buckets); -@@ -1299,7 +1307,7 @@ static void bch2_nocow_write(struct bch_write_op *op) - }), GFP_KERNEL|__GFP_NOFAIL); - - if (ptr->unwritten) -- op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; -+ op->flags |= BCH_WRITE_convert_unwritten; - } - - /* Unlock before taking nocow locks, doing IO: */ -@@ -1307,7 +1315,7 @@ static void bch2_nocow_write(struct bch_write_op *op) - bch2_trans_unlock(trans); - - bch2_cut_front(op->pos, op->insert_keys.top); -- if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) -+ if (op->flags & BCH_WRITE_convert_unwritten) - bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); - - darray_for_each(buckets, i) { -@@ -1332,7 +1340,7 @@ static void bch2_nocow_write(struct bch_write_op *op) - wbio_init(bio)->put_bio = true; - bio->bi_opf = op->wbio.bio.bi_opf; - } else { -- op->flags |= BCH_WRITE_SUBMITTED; -+ op->flags |= BCH_WRITE_submitted; - } - - op->pos.offset += bio_sectors(bio); -@@ -1346,7 +1354,7 @@ static void bch2_nocow_write(struct bch_write_op *op) - op->insert_keys.top, true); - - bch2_keylist_push(&op->insert_keys); -- if (op->flags & BCH_WRITE_SUBMITTED) -+ if (op->flags & BCH_WRITE_submitted) - break; - bch2_btree_iter_advance(&iter); - } -@@ -1366,15 +1374,15 @@ static void bch2_nocow_write(struct bch_write_op *op) - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - op->error = ret; -- op->flags |= BCH_WRITE_SUBMITTED; -+ op->flags |= BCH_WRITE_submitted; - } - - /* fallback to cow write path? 
*/ -- if (!(op->flags & BCH_WRITE_SUBMITTED)) { -+ if (!(op->flags & BCH_WRITE_submitted)) { - closure_sync(&op->cl); - __bch2_nocow_write_done(op); - op->insert_keys.top = op->insert_keys.keys; -- } else if (op->flags & BCH_WRITE_SYNC) { -+ } else if (op->flags & BCH_WRITE_sync) { - closure_sync(&op->cl); - bch2_nocow_write_done(&op->cl.work); - } else { -@@ -1426,7 +1434,7 @@ static void __bch2_write(struct bch_write_op *op) - - if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { - bch2_nocow_write(op); -- if (op->flags & BCH_WRITE_SUBMITTED) -+ if (op->flags & BCH_WRITE_submitted) - goto out_nofs_restore; - } - again: -@@ -1456,7 +1464,7 @@ static void __bch2_write(struct bch_write_op *op) - ret = bch2_trans_run(c, lockrestart_do(trans, - bch2_alloc_sectors_start_trans(trans, - op->target, -- op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), -+ op->opts.erasure_code && !(op->flags & BCH_WRITE_cached), - op->write_point, - &op->devs_have, - op->nr_replicas, -@@ -1479,10 +1487,10 @@ static void __bch2_write(struct bch_write_op *op) - bch2_alloc_sectors_done_inlined(c, wp); - err: - if (ret <= 0) { -- op->flags |= BCH_WRITE_SUBMITTED; -+ op->flags |= BCH_WRITE_submitted; - - if (unlikely(ret < 0)) { -- if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) { -+ if (!(op->flags & BCH_WRITE_alloc_nowait)) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret)); -@@ -1514,14 +1522,14 @@ static void __bch2_write(struct bch_write_op *op) - * synchronously here if we weren't able to submit all of the IO at - * once, as that signals backpressure to the caller. - */ -- if ((op->flags & BCH_WRITE_SYNC) || -- (!(op->flags & BCH_WRITE_SUBMITTED) && -- !(op->flags & BCH_WRITE_IN_WORKER))) { -+ if ((op->flags & BCH_WRITE_sync) || -+ (!(op->flags & BCH_WRITE_submitted) && -+ !(op->flags & BCH_WRITE_in_worker))) { - bch2_wait_on_allocator(c, &op->cl); - - __bch2_write_index(op); - -- if (!(op->flags & BCH_WRITE_SUBMITTED)) -+ if (!(op->flags & BCH_WRITE_submitted)) - goto again; - bch2_write_done(&op->cl); - } else { -@@ -1542,8 +1550,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) - - memset(&op->failed, 0, sizeof(op->failed)); - -- op->flags |= BCH_WRITE_WROTE_DATA_INLINE; -- op->flags |= BCH_WRITE_SUBMITTED; -+ op->flags |= BCH_WRITE_wrote_data_inline; -+ op->flags |= BCH_WRITE_submitted; - - bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); - -@@ -1606,8 +1614,8 @@ CLOSURE_CALLBACK(bch2_write) - BUG_ON(!op->write_point.v); - BUG_ON(bkey_eq(op->pos, POS_MAX)); - -- if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) -- op->flags |= BCH_WRITE_ALLOC_NOWAIT; -+ if (op->flags & BCH_WRITE_only_specified_devs) -+ op->flags |= BCH_WRITE_alloc_nowait; - - op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas); - op->start_time = local_clock(); -@@ -1628,13 +1636,14 @@ CLOSURE_CALLBACK(bch2_write) - goto err; - } - -- if (!(op->flags & BCH_WRITE_MOVE) && -+ if (!(op->flags & BCH_WRITE_move) && - !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { - op->error = -BCH_ERR_erofs_no_writes; - goto err; - } - -- this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); -+ if (!(op->flags & BCH_WRITE_move)) -+ this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); - bch2_increment_clock(c, bio_sectors(bio), WRITE); - - data_len = min_t(u64, bio->bi_iter.bi_size, -diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h -index b4626013abc8..02cca52be0bd 
100644 ---- a/fs/bcachefs/io_write.h -+++ b/fs/bcachefs/io_write.h -@@ -23,21 +23,20 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, - void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op); - - #define BCH_WRITE_FLAGS() \ -- x(ALLOC_NOWAIT) \ -- x(CACHED) \ -- x(DATA_ENCODED) \ -- x(PAGES_STABLE) \ -- x(PAGES_OWNED) \ -- x(ONLY_SPECIFIED_DEVS) \ -- x(WROTE_DATA_INLINE) \ -- x(FROM_INTERNAL) \ -- x(CHECK_ENOSPC) \ -- x(SYNC) \ -- x(MOVE) \ -- x(IN_WORKER) \ -- x(SUBMITTED) \ -- x(IO_ERROR) \ -- x(CONVERT_UNWRITTEN) -+ x(alloc_nowait) \ -+ x(cached) \ -+ x(data_encoded) \ -+ x(pages_stable) \ -+ x(pages_owned) \ -+ x(only_specified_devs) \ -+ x(wrote_data_inline) \ -+ x(check_enospc) \ -+ x(sync) \ -+ x(move) \ -+ x(in_worker) \ -+ x(submitted) \ -+ x(io_error) \ -+ x(convert_unwritten) - - enum __bch_write_flags { - #define x(f) __BCH_WRITE_##f, -diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h -index 6e878a6f2f0b..3ef6df9145ef 100644 ---- a/fs/bcachefs/io_write_types.h -+++ b/fs/bcachefs/io_write_types.h -@@ -64,7 +64,7 @@ struct bch_write_op { - struct bpos pos; - struct bversion version; - -- /* For BCH_WRITE_DATA_ENCODED: */ -+ /* For BCH_WRITE_data_encoded: */ - struct bch_extent_crc_unpacked crc; - - struct write_point_specifier write_point; -diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c -index 24c294d4634e..ea96605cf162 100644 ---- a/fs/bcachefs/journal.c -+++ b/fs/bcachefs/journal.c -@@ -56,11 +56,18 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 - prt_printf(out, "seq:\t%llu\n", seq); - printbuf_indent_add(out, 2); - -- prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i)); -+ if (!buf->write_started) -+ prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i & JOURNAL_STATE_BUF_MASK)); - -- prt_printf(out, "size:\t"); -- prt_human_readable_u64(out, vstruct_bytes(buf->data)); -- prt_newline(out); -+ struct closure *cl = &buf->io; -+ int r = atomic_read(&cl->remaining); -+ prt_printf(out, "io:\t%pS r %i\n", cl->fn, r & CLOSURE_REMAINING_MASK); -+ -+ if (buf->data) { -+ prt_printf(out, "size:\t"); -+ prt_human_readable_u64(out, vstruct_bytes(buf->data)); -+ prt_newline(out); -+ } - - prt_printf(out, "expires:\t"); - prt_printf(out, "%li jiffies\n", buf->expires - jiffies); -@@ -87,6 +94,9 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 - - static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) - { -+ lockdep_assert_held(&j->lock); -+ out->atomic++; -+ - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 24); - -@@ -95,6 +105,8 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) - seq++) - bch2_journal_buf_to_text(out, j, seq); - prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? 
"open" : "closed"); -+ -+ --out->atomic; - } - - static inline struct journal_buf * -@@ -104,10 +116,8 @@ journal_seq_to_buf(struct journal *j, u64 seq) - - EBUG_ON(seq > journal_cur_seq(j)); - -- if (journal_seq_unwritten(j, seq)) { -+ if (journal_seq_unwritten(j, seq)) - buf = j->buf + (seq & JOURNAL_BUF_MASK); -- EBUG_ON(le64_to_cpu(buf->data->seq) != seq); -- } - return buf; - } - -@@ -195,7 +205,8 @@ void bch2_journal_do_writes(struct journal *j) - if (w->write_started) - continue; - -- if (!journal_state_count(j->reservations, idx)) { -+ if (!journal_state_seq_count(j, j->reservations, seq)) { -+ j->seq_write_started = seq; - w->write_started = true; - closure_call(&w->io, bch2_journal_write, j->wq, NULL); - } -@@ -306,7 +317,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t - - bch2_journal_space_available(j); - -- __bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq)); -+ __bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq)); - } - - void bch2_journal_halt(struct journal *j) -@@ -391,6 +402,9 @@ static int journal_entry_open(struct journal *j) - if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) - return JOURNAL_ERR_max_in_flight; - -+ if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR) -+ return JOURNAL_ERR_max_open; -+ - if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) { - bch_err(c, "cannot start: journal seq overflow"); - if (bch2_fs_emergency_read_only_locked(c)) -@@ -398,8 +412,16 @@ static int journal_entry_open(struct journal *j) - return JOURNAL_ERR_insufficient_devices; /* -EROFS */ - } - -+ if (!j->free_buf && !buf->data) -+ return JOURNAL_ERR_enomem; /* will retry after write completion frees up a buf */ -+ - BUG_ON(!j->cur_entry_sectors); - -+ if (!buf->data) { -+ swap(buf->data, j->free_buf); -+ swap(buf->buf_size, j->free_buf_size); -+ } -+ - buf->expires = - (journal_cur_seq(j) == j->flushed_seq_ondisk - ? 
jiffies -@@ -464,7 +486,7 @@ static int journal_entry_open(struct journal *j) - - new.idx++; - BUG_ON(journal_state_count(new, new.idx)); -- BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK)); -+ BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_STATE_BUF_MASK)); - - journal_state_inc(&new); - -@@ -514,6 +536,33 @@ static void journal_write_work(struct work_struct *work) - spin_unlock(&j->lock); - } - -+static void journal_buf_prealloc(struct journal *j) -+{ -+ if (j->free_buf && -+ j->free_buf_size >= j->buf_size_want) -+ return; -+ -+ unsigned buf_size = j->buf_size_want; -+ -+ spin_unlock(&j->lock); -+ void *buf = kvmalloc(buf_size, GFP_NOFS); -+ spin_lock(&j->lock); -+ -+ if (buf && -+ (!j->free_buf || -+ buf_size > j->free_buf_size)) { -+ swap(buf, j->free_buf); -+ swap(buf_size, j->free_buf_size); -+ } -+ -+ if (unlikely(buf)) { -+ spin_unlock(&j->lock); -+ /* kvfree can sleep */ -+ kvfree(buf); -+ spin_lock(&j->lock); -+ } -+} -+ - static int __journal_res_get(struct journal *j, struct journal_res *res, - unsigned flags) - { -@@ -544,6 +593,8 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, - - spin_lock(&j->lock); - -+ journal_buf_prealloc(j); -+ - /* - * Recheck after taking the lock, so we don't race with another thread - * that just did journal_entry_open() and call bch2_journal_entry_close() -@@ -571,20 +622,43 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, - can_discard = j->can_discard; - spin_unlock(&j->lock); - out: -+ if (likely(!ret)) -+ return 0; - if (ret == JOURNAL_ERR_retry) - goto retry; -- if (!ret) -- return 0; - - if (journal_error_check_stuck(j, ret, flags)) - ret = -BCH_ERR_journal_res_get_blocked; - - if (ret == JOURNAL_ERR_max_in_flight && -- track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) { -+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) && -+ trace_journal_entry_full_enabled()) { -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_printbuf_make_room(&buf, 4096); - -+ spin_lock(&j->lock); -+ prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); -+ bch2_journal_bufs_to_text(&buf, j); -+ spin_unlock(&j->lock); -+ -+ trace_journal_entry_full(c, buf.buf); -+ printbuf_exit(&buf); -+ count_event(c, journal_entry_full); -+ } -+ -+ if (ret == JOURNAL_ERR_max_open && -+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_open], true) && -+ trace_journal_entry_full_enabled()) { - struct printbuf buf = PRINTBUF; -+ -+ bch2_printbuf_make_room(&buf, 4096); -+ -+ spin_lock(&j->lock); - prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); - bch2_journal_bufs_to_text(&buf, j); -+ spin_unlock(&j->lock); -+ - trace_journal_entry_full(c, buf.buf); - printbuf_exit(&buf); - count_event(c, journal_entry_full); -@@ -951,7 +1025,8 @@ static void __bch2_journal_block(struct journal *j) - new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL; - } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); - -- journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); -+ if (old.cur_entry_offset < JOURNAL_ENTRY_BLOCKED_VAL) -+ journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); - } - } - -@@ -992,7 +1067,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou - *blocked = true; - } - -- ret = journal_state_count(s, idx) > open -+ ret = journal_state_count(s, idx & JOURNAL_STATE_BUF_MASK) > open - ? 
ERR_PTR(-EAGAIN) - : buf; - break; -@@ -1342,6 +1417,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) - j->replay_journal_seq_end = cur_seq; - j->last_seq_ondisk = last_seq; - j->flushed_seq_ondisk = cur_seq - 1; -+ j->seq_write_started = cur_seq - 1; - j->seq_ondisk = cur_seq - 1; - j->pin.front = last_seq; - j->pin.back = cur_seq; -@@ -1382,8 +1458,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) - set_bit(JOURNAL_running, &j->flags); - j->last_flush_write = jiffies; - -- j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); -- j->reservations.unwritten_idx++; -+ j->reservations.idx = journal_cur_seq(j); - - c->last_bucket_seq_cleanup = journal_cur_seq(j); - -@@ -1475,6 +1550,7 @@ void bch2_fs_journal_exit(struct journal *j) - - for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) - kvfree(j->buf[i].data); -+ kvfree(j->free_buf); - free_fifo(&j->pin); - } - -@@ -1501,13 +1577,13 @@ int bch2_fs_journal_init(struct journal *j) - if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) - return -BCH_ERR_ENOMEM_journal_pin_fifo; - -- for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) { -- j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; -- j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL); -- if (!j->buf[i].data) -- return -BCH_ERR_ENOMEM_journal_buf; -+ j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN; -+ j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL); -+ if (!j->free_buf) -+ return -BCH_ERR_ENOMEM_journal_buf; -+ -+ for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) - j->buf[i].idx = i; -- } - - j->pin.front = j->pin.back = 1; - -@@ -1557,6 +1633,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) - prt_printf(out, "average write size:\t"); - prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0); - prt_newline(out); -+ prt_printf(out, "free buf:\t%u\n", j->free_buf ? 
j->free_buf_size : 0); - prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); - prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); - prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked); -diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h -index 107f7f901cd9..1c460ded2a11 100644 ---- a/fs/bcachefs/journal.h -+++ b/fs/bcachefs/journal.h -@@ -121,11 +121,6 @@ static inline void journal_wake(struct journal *j) - closure_wake_up(&j->async_wait); - } - --static inline struct journal_buf *journal_cur_buf(struct journal *j) --{ -- return j->buf + j->reservations.idx; --} -- - /* Sequence number of oldest dirty journal entry */ - - static inline u64 journal_last_seq(struct journal *j) -@@ -143,6 +138,15 @@ static inline u64 journal_last_unwritten_seq(struct journal *j) - return j->seq_ondisk + 1; - } - -+static inline struct journal_buf *journal_cur_buf(struct journal *j) -+{ -+ unsigned idx = (journal_cur_seq(j) & -+ JOURNAL_BUF_MASK & -+ ~JOURNAL_STATE_BUF_MASK) + j->reservations.idx; -+ -+ return j->buf + idx; -+} -+ - static inline int journal_state_count(union journal_res_state s, int idx) - { - switch (idx) { -@@ -154,6 +158,15 @@ static inline int journal_state_count(union journal_res_state s, int idx) - BUG(); - } - -+static inline int journal_state_seq_count(struct journal *j, -+ union journal_res_state s, u64 seq) -+{ -+ if (journal_cur_seq(j) - seq <= JOURNAL_STATE_BUF_NR) -+ return journal_state_count(s, seq & JOURNAL_STATE_BUF_MASK); -+ else -+ return 0; -+} -+ - static inline void journal_state_inc(union journal_res_state *s) - { - s->buf0_count += s->idx == 0; -@@ -193,7 +206,7 @@ bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) - static inline struct jset_entry * - journal_res_entry(struct journal *j, struct journal_res *res) - { -- return vstruct_idx(j->buf[res->idx].data, res->offset); -+ return vstruct_idx(j->buf[res->seq & JOURNAL_BUF_MASK].data, res->offset); - } - - static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type, -@@ -267,8 +280,9 @@ bool bch2_journal_entry_close(struct journal *); - void bch2_journal_do_writes(struct journal *); - void bch2_journal_buf_put_final(struct journal *, u64); - --static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) -+static inline void __bch2_journal_buf_put(struct journal *j, u64 seq) - { -+ unsigned idx = seq & JOURNAL_STATE_BUF_MASK; - union journal_res_state s; - - s = journal_state_buf_put(j, idx); -@@ -276,8 +290,9 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s - bch2_journal_buf_put_final(j, seq); - } - --static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) -+static inline void bch2_journal_buf_put(struct journal *j, u64 seq) - { -+ unsigned idx = seq & JOURNAL_STATE_BUF_MASK; - union journal_res_state s; - - s = journal_state_buf_put(j, idx); -@@ -306,7 +321,7 @@ static inline void bch2_journal_res_put(struct journal *j, - BCH_JSET_ENTRY_btree_keys, - 0, 0, 0); - -- bch2_journal_buf_put(j, res->idx, res->seq); -+ bch2_journal_buf_put(j, res->seq); - - res->ref = 0; - } -@@ -361,9 +376,9 @@ static inline int journal_res_get_fast(struct journal *j, - &old.v, new.v)); - - res->ref = true; -- res->idx = old.idx; - res->offset = old.cur_entry_offset; -- res->seq = le64_to_cpu(j->buf[old.idx].data->seq); -+ res->seq = journal_cur_seq(j); -+ res->seq -= (res->seq - old.idx) & JOURNAL_STATE_BUF_MASK; - return 1; - } - -@@ -390,6 +405,7 
@@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re - (flags & JOURNAL_RES_GET_NONBLOCK) != 0, - NULL, _THIS_IP_); - EBUG_ON(!res->ref); -+ BUG_ON(!res->seq); - } - return 0; - } -diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c -index 11c39e0c34f4..61f71e7baff2 100644 ---- a/fs/bcachefs/journal_io.c -+++ b/fs/bcachefs/journal_io.c -@@ -1611,7 +1611,6 @@ static CLOSURE_CALLBACK(journal_write_done) - struct journal *j = container_of(w, struct journal, buf[w->idx]); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_replicas_padded replicas; -- union journal_res_state old, new; - u64 seq = le64_to_cpu(w->data->seq); - int err = 0; - -@@ -1641,6 +1640,21 @@ static CLOSURE_CALLBACK(journal_write_done) - j->err_seq = seq; - w->write_done = true; - -+ if (!j->free_buf || j->free_buf_size < w->buf_size) { -+ swap(j->free_buf, w->data); -+ swap(j->free_buf_size, w->buf_size); -+ } -+ -+ if (w->data) { -+ void *buf = w->data; -+ w->data = NULL; -+ w->buf_size = 0; -+ -+ spin_unlock(&j->lock); -+ kvfree(buf); -+ spin_lock(&j->lock); -+ } -+ - bool completed = false; - - for (seq = journal_last_unwritten_seq(j); -@@ -1650,7 +1664,7 @@ static CLOSURE_CALLBACK(journal_write_done) - if (!w->write_done) - break; - -- if (!j->err_seq && !JSET_NO_FLUSH(w->data)) { -+ if (!j->err_seq && !w->noflush) { - j->flushed_seq_ondisk = seq; - j->last_seq_ondisk = w->last_seq; - -@@ -1671,16 +1685,6 @@ static CLOSURE_CALLBACK(journal_write_done) - if (j->watermark != BCH_WATERMARK_stripe) - journal_reclaim_kick(&c->journal); - -- old.v = atomic64_read(&j->reservations.counter); -- do { -- new.v = old.v; -- BUG_ON(journal_state_count(new, new.unwritten_idx)); -- BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); -- -- new.unwritten_idx++; -- } while (!atomic64_try_cmpxchg(&j->reservations.counter, -- &old.v, new.v)); -- - closure_wake_up(&w->wait); - completed = true; - } -@@ -1695,7 +1699,7 @@ static CLOSURE_CALLBACK(journal_write_done) - } - - if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && -- new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { -+ j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { - struct journal_buf *buf = journal_cur_buf(j); - long delta = buf->expires - jiffies; - -diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c -index 1f25c111c54c..e463d2d95359 100644 ---- a/fs/bcachefs/journal_seq_blacklist.c -+++ b/fs/bcachefs/journal_seq_blacklist.c -@@ -231,15 +231,14 @@ bool bch2_blacklist_entries_gc(struct bch_fs *c) - struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; - BUG_ON(nr != t->nr); - -- unsigned i; -- for (src = bl->start, i = t->nr == 0 ? 
0 : eytzinger0_first(t->nr); -- src < bl->start + nr; -- src++, i = eytzinger0_next(i, nr)) { -+ src = bl->start; -+ eytzinger0_for_each(i, nr) { - BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); - BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); - - if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk) - *dst++ = *src; -+ src++; - } - - unsigned new_nr = dst - bl->start; -diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h -index a198a81d7478..060ec991dd2b 100644 ---- a/fs/bcachefs/journal_types.h -+++ b/fs/bcachefs/journal_types.h -@@ -12,7 +12,11 @@ - /* btree write buffer steals 8 bits for its own purposes: */ - #define JOURNAL_SEQ_MAX ((1ULL << 56) - 1) - --#define JOURNAL_BUF_BITS 2 -+#define JOURNAL_STATE_BUF_BITS 2 -+#define JOURNAL_STATE_BUF_NR (1U << JOURNAL_STATE_BUF_BITS) -+#define JOURNAL_STATE_BUF_MASK (JOURNAL_STATE_BUF_NR - 1) -+ -+#define JOURNAL_BUF_BITS 4 - #define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) - #define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) - -@@ -79,7 +83,6 @@ struct journal_entry_pin { - - struct journal_res { - bool ref; -- u8 idx; - u16 u64s; - u32 offset; - u64 seq; -@@ -95,9 +98,8 @@ union journal_res_state { - }; - - struct { -- u64 cur_entry_offset:20, -+ u64 cur_entry_offset:22, - idx:2, -- unwritten_idx:2, - buf0_count:10, - buf1_count:10, - buf2_count:10, -@@ -107,13 +109,13 @@ union journal_res_state { - - /* bytes: */ - #define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ --#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ -+#define JOURNAL_ENTRY_SIZE_MAX (4U << 22) /* 16M */ - - /* - * We stash some journal state as sentinal values in cur_entry_offset: - * note - cur_entry_offset is in units of u64s - */ --#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) -+#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 22) - 1) - - #define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2) - #define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) -@@ -152,9 +154,11 @@ enum journal_flags { - x(retry) \ - x(blocked) \ - x(max_in_flight) \ -+ x(max_open) \ - x(journal_full) \ - x(journal_pin_full) \ - x(journal_stuck) \ -+ x(enomem) \ - x(insufficient_devices) - - enum journal_errors { -@@ -217,6 +221,8 @@ struct journal { - * other is possibly being written out. 
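/*
 * Editor's note: a minimal, standalone sketch (not part of the patch) of how
 * power-of-two ring constants like the JOURNAL_STATE_BUF_* / JOURNAL_BUF_*
 * definitions above are typically used: a sequence number is mapped onto a
 * ring slot by masking, so four state slots can track reservations while
 * sixteen data buffers are in flight. All names below are local stand-ins.
 */
#include <stdio.h>

#define STATE_BUF_BITS 2
#define STATE_BUF_NR   (1U << STATE_BUF_BITS)   /* 4 reservation slots */
#define STATE_BUF_MASK (STATE_BUF_NR - 1)
#define BUF_BITS       4
#define BUF_NR         (1U << BUF_BITS)         /* 16 journal buffers */
#define BUF_MASK       (BUF_NR - 1)

int main(void)
{
	/* consecutive sequence numbers wrap onto the rings without division */
	for (unsigned long long seq = 14; seq < 20; seq++)
		printf("seq %llu -> buf %llu, state slot %llu\n",
		       seq, seq & BUF_MASK, seq & STATE_BUF_MASK);
	return 0;
}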
- */ - struct journal_buf buf[JOURNAL_BUF_NR]; -+ void *free_buf; -+ unsigned free_buf_size; - - spinlock_t lock; - -@@ -234,6 +240,7 @@ struct journal { - /* Sequence number of most recent journal entry (last entry in @pin) */ - atomic64_t seq; - -+ u64 seq_write_started; - /* seq, last_seq from the most recent journal entry successfully written */ - u64 seq_ondisk; - u64 flushed_seq_ondisk; -diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c -index ddc187fb693d..57ad662871ba 100644 ---- a/fs/bcachefs/migrate.c -+++ b/fs/bcachefs/migrate.c -@@ -15,6 +15,7 @@ - #include "keylist.h" - #include "migrate.h" - #include "move.h" -+#include "progress.h" - #include "replicas.h" - #include "super-io.h" - -@@ -76,7 +77,9 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, - return 0; - } - --static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) -+static int bch2_dev_usrdata_drop(struct bch_fs *c, -+ struct progress_indicator_state *progress, -+ unsigned dev_idx, int flags) - { - struct btree_trans *trans = bch2_trans_get(c); - enum btree_id id; -@@ -88,8 +91,10 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) - - ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, -- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, -- bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags)); -+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ -+ bch2_progress_update_iter(trans, progress, &iter, "dropping user data"); -+ bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags); -+ })); - if (ret) - break; - } -@@ -99,7 +104,9 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) - return ret; - } - --static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) -+static int bch2_dev_metadata_drop(struct bch_fs *c, -+ struct progress_indicator_state *progress, -+ unsigned dev_idx, int flags) - { - struct btree_trans *trans; - struct btree_iter iter; -@@ -125,6 +132,8 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) - while (bch2_trans_begin(trans), - (b = bch2_btree_iter_peek_node(&iter)) && - !(ret = PTR_ERR_OR_ZERO(b))) { -+ bch2_progress_update_iter(trans, progress, &iter, "dropping metadata"); -+ - if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx)) - goto next; - -@@ -169,6 +178,11 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) - - int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) - { -- return bch2_dev_usrdata_drop(c, dev_idx, flags) ?: -- bch2_dev_metadata_drop(c, dev_idx, flags); -+ struct progress_indicator_state progress; -+ bch2_progress_init(&progress, c, -+ BIT_ULL(BTREE_ID_extents)| -+ BIT_ULL(BTREE_ID_reflink)); -+ -+ return bch2_dev_usrdata_drop(c, &progress, dev_idx, flags) ?: -+ bch2_dev_metadata_drop(c, &progress, dev_idx, flags); - } -diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c -index c493ea625553..e0e10deaea73 100644 ---- a/fs/bcachefs/move.c -+++ b/fs/bcachefs/move.c -@@ -38,28 +38,28 @@ const char * const bch2_data_ops_strs[] = { - NULL - }; - --static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k, -+static void trace_io_move2(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) - { -- if (trace_move_extent_enabled()) { -+ if (trace_io_move_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); 
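/*
 * Editor's note: a standalone sketch, assuming POSIX threads, of the
 * "keep the larger buffer, free the loser outside the lock" pattern that
 * journal_buf_prealloc() and journal_write_done() above apply to the
 * free_buf/free_buf_size fields. malloc/free stand in for kvmalloc/kvfree
 * (which can sleep, hence must not run under the journal spinlock).
 */
#include <pthread.h>
#include <stdlib.h>

struct buf_cache {
	pthread_mutex_t lock;
	void *free_buf;
	size_t free_buf_size;
};

/* Donate @buf to the cache if it beats what is cached; whichever buffer
 * loses the comparison is freed after dropping the lock. */
static void donate_buf(struct buf_cache *c, void *buf, size_t size)
{
	pthread_mutex_lock(&c->lock);
	if (!c->free_buf || c->free_buf_size < size) {
		void *t = c->free_buf;
		size_t ts = c->free_buf_size;
		c->free_buf = buf;
		c->free_buf_size = size;
		buf = t;
		size = ts;
	}
	pthread_mutex_unlock(&c->lock);
	free(buf);	/* free(NULL) is a no-op */
	(void)size;
}

int main(void)
{
	struct buf_cache c = { PTHREAD_MUTEX_INITIALIZER, NULL, 0 };
	donate_buf(&c, malloc(4096), 4096);	/* cached: cache was empty */
	donate_buf(&c, malloc(1024), 1024);	/* freed: smaller than cached */
	free(c.free_buf);
	return 0;
}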
- bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); -- trace_move_extent(c, buf.buf); -+ trace_io_move(c, buf.buf); - printbuf_exit(&buf); - } - } - --static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k) -+static void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k) - { -- if (trace_move_extent_read_enabled()) { -+ if (trace_io_move_read_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); -- trace_move_extent_read(c, buf.buf); -+ trace_io_move_read(c, buf.buf); - printbuf_exit(&buf); - } - } -@@ -74,11 +74,7 @@ struct moving_io { - unsigned read_sectors; - unsigned write_sectors; - -- struct bch_read_bio rbio; -- - struct data_update write; -- /* Must be last since it is variable size */ -- struct bio_vec bi_inline_vecs[]; - }; - - static void move_free(struct moving_io *io) -@@ -88,13 +84,17 @@ static void move_free(struct moving_io *io) - if (io->b) - atomic_dec(&io->b->count); - -- bch2_data_update_exit(&io->write); -- - mutex_lock(&ctxt->lock); - list_del(&io->io_list); - wake_up(&ctxt->wait); - mutex_unlock(&ctxt->lock); - -+ if (!io->write.data_opts.scrub) { -+ bch2_data_update_exit(&io->write); -+ } else { -+ bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio); -+ kfree(io->write.bvecs); -+ } - kfree(io); - } - -@@ -114,17 +114,30 @@ static void move_write_done(struct bch_write_op *op) - - static void move_write(struct moving_io *io) - { -- if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { -+ struct moving_context *ctxt = io->write.ctxt; -+ -+ if (ctxt->stats) { -+ if (io->write.rbio.bio.bi_status) -+ atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, -+ &ctxt->stats->sectors_error_uncorrected); -+ else if (io->write.rbio.saw_error) -+ atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, -+ &ctxt->stats->sectors_error_corrected); -+ } -+ -+ if (unlikely(io->write.rbio.bio.bi_status || -+ io->write.rbio.hole || -+ io->write.data_opts.scrub)) { - move_free(io); - return; - } - -- if (trace_move_extent_write_enabled()) { -+ if (trace_io_move_write_enabled()) { - struct bch_fs *c = io->write.op.c; - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k)); -- trace_move_extent_write(c, buf.buf); -+ trace_io_move_write(c, buf.buf); - printbuf_exit(&buf); - } - -@@ -132,7 +145,7 @@ static void move_write(struct moving_io *io) - atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); - atomic_inc(&io->write.ctxt->write_ios); - -- bch2_data_update_read_done(&io->write, io->rbio.pick.crc); -+ bch2_data_update_read_done(&io->write); - } - - struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt) -@@ -145,7 +158,7 @@ struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctx - - static void move_read_endio(struct bio *bio) - { -- struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); -+ struct moving_io *io = container_of(bio, struct moving_io, write.rbio.bio); - struct moving_context *ctxt = io->write.ctxt; - - atomic_sub(io->read_sectors, &ctxt->read_sectors); -@@ -258,14 +271,10 @@ int bch2_move_extent(struct moving_context *ctxt, - { - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; -- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -- struct moving_io *io; -- const union bch_extent_entry *entry; -- struct extent_ptr_decoded p; -- unsigned sectors = k.k->size, pages; - int ret = -ENOMEM; - -- trace_move_extent2(c, k, &io_opts, &data_opts); -+ trace_io_move2(c, k, 
&io_opts, &data_opts); -+ this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); - - if (ctxt->stats) - ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos); -@@ -273,7 +282,8 @@ int bch2_move_extent(struct moving_context *ctxt, - bch2_data_update_opts_normalize(k, &data_opts); - - if (!data_opts.rewrite_ptrs && -- !data_opts.extra_replicas) { -+ !data_opts.extra_replicas && -+ !data_opts.scrub) { - if (data_opts.kill_ptrs) - return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts); - return 0; -@@ -285,13 +295,7 @@ int bch2_move_extent(struct moving_context *ctxt, - */ - bch2_trans_unlock(trans); - -- /* write path might have to decompress data: */ -- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -- sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); -- -- pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); -- io = kzalloc(sizeof(struct moving_io) + -- sizeof(struct bio_vec) * pages, GFP_KERNEL); -+ struct moving_io *io = kzalloc(sizeof(struct moving_io), GFP_KERNEL); - if (!io) - goto err; - -@@ -300,31 +304,27 @@ int bch2_move_extent(struct moving_context *ctxt, - io->read_sectors = k.k->size; - io->write_sectors = k.k->size; - -- bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0); -- bio_set_prio(&io->write.op.wbio.bio, -- IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); -- -- if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, -- GFP_KERNEL)) -- goto err_free; -+ if (!data_opts.scrub) { -+ ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, -+ &io_opts, data_opts, iter->btree_id, k); -+ if (ret) -+ goto err_free; - -- io->rbio.c = c; -- io->rbio.opts = io_opts; -- bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0); -- io->rbio.bio.bi_vcnt = pages; -- bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); -- io->rbio.bio.bi_iter.bi_size = sectors << 9; -+ io->write.op.end_io = move_write_done; -+ } else { -+ bch2_bkey_buf_init(&io->write.k); -+ bch2_bkey_buf_reassemble(&io->write.k, c, k); - -- io->rbio.bio.bi_opf = REQ_OP_READ; -- io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); -- io->rbio.bio.bi_end_io = move_read_endio; -+ io->write.op.c = c; -+ io->write.data_opts = data_opts; - -- ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, -- io_opts, data_opts, iter->btree_id, k); -- if (ret) -- goto err_free_pages; -+ ret = bch2_data_update_bios_init(&io->write, c, &io_opts); -+ if (ret) -+ goto err_free; -+ } - -- io->write.op.end_io = move_write_done; -+ io->write.rbio.bio.bi_end_io = move_read_endio; -+ io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); - - if (ctxt->rate) - bch2_ratelimit_increment(ctxt->rate, k.k->size); -@@ -339,9 +339,7 @@ int bch2_move_extent(struct moving_context *ctxt, - atomic_inc(&io->b->count); - } - -- this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); -- this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size); -- trace_move_extent_read2(c, k); -+ trace_io_move_read2(c, k); - - mutex_lock(&ctxt->lock); - atomic_add(io->read_sectors, &ctxt->read_sectors); -@@ -356,33 +354,34 @@ int bch2_move_extent(struct moving_context *ctxt, - * ctxt when doing wakeup - */ - closure_get(&ctxt->cl); -- bch2_read_extent(trans, &io->rbio, -- bkey_start_pos(k.k), -- iter->btree_id, k, 0, -- BCH_READ_NODECODE| -- BCH_READ_LAST_FRAGMENT); -+ __bch2_read_extent(trans, &io->write.rbio, -+ io->write.rbio.bio.bi_iter, -+ bkey_start_pos(k.k), -+ iter->btree_id, k, 0, -+ NULL, -+ BCH_READ_data_update| -+ BCH_READ_last_fragment, -+ 
data_opts.scrub ? data_opts.read_dev : -1); - return 0; --err_free_pages: -- bio_free_pages(&io->write.op.wbio.bio); - err_free: - kfree(io); - err: -- if (ret == -BCH_ERR_data_update_done) -+ if (bch2_err_matches(ret, BCH_ERR_data_update_done)) - return 0; - - if (bch2_err_matches(ret, EROFS) || - bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ret; - -- count_event(c, move_extent_start_fail); -+ count_event(c, io_move_start_fail); - -- if (trace_move_extent_start_fail_enabled()) { -+ if (trace_io_move_start_fail_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, ": "); - prt_str(&buf, bch2_err_str(ret)); -- trace_move_extent_start_fail(c, buf.buf); -+ trace_io_move_start_fail(c, buf.buf); - printbuf_exit(&buf); - } - return ret; -@@ -627,7 +626,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, - if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) - continue; - -- if (ret2 == -ENOMEM) { -+ if (bch2_err_matches(ret2, ENOMEM)) { - /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(ctxt); - continue; -@@ -689,21 +688,22 @@ int bch2_move_data(struct bch_fs *c, - bool wait_on_copygc, - move_pred_fn pred, void *arg) - { -- - struct moving_context ctxt; -- int ret; - - bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); -- ret = __bch2_move_data(&ctxt, start, end, pred, arg); -+ int ret = __bch2_move_data(&ctxt, start, end, pred, arg); - bch2_moving_ctxt_exit(&ctxt); - - return ret; - } - --int bch2_evacuate_bucket(struct moving_context *ctxt, -- struct move_bucket_in_flight *bucket_in_flight, -- struct bpos bucket, int gen, -- struct data_update_opts _data_opts) -+static int __bch2_move_data_phys(struct moving_context *ctxt, -+ struct move_bucket_in_flight *bucket_in_flight, -+ unsigned dev, -+ u64 bucket_start, -+ u64 bucket_end, -+ unsigned data_types, -+ move_pred_fn pred, void *arg) - { - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; -@@ -712,16 +712,20 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, - struct btree_iter iter = {}, bp_iter = {}; - struct bkey_buf sk; - struct bkey_s_c k; -- struct data_update_opts data_opts; - unsigned sectors_moved = 0; - struct bkey_buf last_flushed; - int ret = 0; - -- struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); -+ struct bch_dev *ca = bch2_dev_tryget(c, dev); - if (!ca) - return 0; - -- trace_bucket_evacuate(c, &bucket); -+ bucket_end = min(bucket_end, ca->mi.nbuckets); -+ -+ struct bpos bp_start = bucket_pos_to_bp_start(ca, POS(dev, bucket_start)); -+ struct bpos bp_end = bucket_pos_to_bp_end(ca, POS(dev, bucket_end)); -+ bch2_dev_put(ca); -+ ca = NULL; - - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); -@@ -732,8 +736,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, - */ - bch2_trans_begin(trans); - -- bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, -- bucket_pos_to_bp_start(ca, bucket), 0); -+ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0); - - bch_err_msg(c, ret, "looking up alloc key"); - if (ret) -@@ -757,7 +760,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, - if (ret) - goto err; - -- if (!k.k || bkey_gt(k.k->p, bucket_pos_to_bp_end(ca, bucket))) -+ if (!k.k || bkey_gt(k.k->p, bp_end)) - break; - - if (k.k->type != KEY_TYPE_backpointer) -@@ -765,107 +768,146 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, - - struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - -- if 
(!bp.v->level) { -- k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); -- ret = bkey_err(k); -- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -- continue; -- if (ret) -- goto err; -- if (!k.k) -- goto next; -+ if (ctxt->stats) -+ ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; - -- bch2_bkey_buf_reassemble(&sk, c, k); -- k = bkey_i_to_s_c(sk.k); -+ if (!(data_types & BIT(bp.v->data_type))) -+ goto next; - -+ k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); -+ ret = bkey_err(k); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ continue; -+ if (ret) -+ goto err; -+ if (!k.k) -+ goto next; -+ -+ if (!bp.v->level) { - ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k); - if (ret) { - bch2_trans_iter_exit(trans, &iter); - continue; - } -+ } - -- data_opts = _data_opts; -- data_opts.target = io_opts.background_target; -- data_opts.rewrite_ptrs = 0; -- -- unsigned sectors = bp.v->bucket_len; /* move_extent will drop locks */ -- unsigned i = 0; -- const union bch_extent_entry *entry; -- struct extent_ptr_decoded p; -- bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { -- if (p.ptr.dev == bucket.inode) { -- if (p.ptr.cached) { -- bch2_trans_iter_exit(trans, &iter); -- goto next; -- } -- data_opts.rewrite_ptrs |= 1U << i; -- break; -- } -- i++; -- } -- -- ret = bch2_move_extent(ctxt, bucket_in_flight, -- &iter, k, io_opts, data_opts); -+ struct data_update_opts data_opts = {}; -+ if (!pred(c, arg, k, &io_opts, &data_opts)) { - bch2_trans_iter_exit(trans, &iter); -+ goto next; -+ } - -- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -- continue; -- if (ret == -ENOMEM) { -- /* memory allocation failure, wait for some IO to finish */ -- bch2_move_ctxt_wait_for_io(ctxt); -- continue; -- } -- if (ret) -- goto err; -+ if (data_opts.scrub && -+ !bch2_dev_idx_is_online(c, data_opts.read_dev)) { -+ bch2_trans_iter_exit(trans, &iter); -+ ret = -BCH_ERR_device_offline; -+ break; -+ } - -- if (ctxt->stats) -- atomic64_add(sectors, &ctxt->stats->sectors_seen); -- sectors_moved += sectors; -- } else { -- struct btree *b; -+ bch2_bkey_buf_reassemble(&sk, c, k); -+ k = bkey_i_to_s_c(sk.k); - -- b = bch2_backpointer_get_node(trans, bp, &iter, &last_flushed); -- ret = PTR_ERR_OR_ZERO(b); -- if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) -- goto next; -- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -- continue; -- if (ret) -- goto err; -- if (!b) -- goto next; -+ /* move_extent will drop locks */ -+ unsigned sectors = bp.v->bucket_len; - -- unsigned sectors = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); -+ if (!bp.v->level) -+ ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts); -+ else if (!data_opts.scrub) -+ ret = bch2_btree_node_rewrite_key(trans, bp.v->btree_id, bp.v->level, k.k->p, 0); -+ else -+ ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev); - -- ret = bch2_btree_node_rewrite(trans, &iter, b, 0); -- bch2_trans_iter_exit(trans, &iter); -- -- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -- continue; -- if (ret) -- goto err; -+ bch2_trans_iter_exit(trans, &iter); - -- if (ctxt->rate) -- bch2_ratelimit_increment(ctxt->rate, sectors); -- if (ctxt->stats) { -- atomic64_add(sectors, &ctxt->stats->sectors_seen); -- atomic64_add(sectors, &ctxt->stats->sectors_moved); -- } -- sectors_moved += btree_sectors(c); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ continue; -+ if (ret == -ENOMEM) { -+ 
/* memory allocation failure, wait for some IO to finish */ -+ bch2_move_ctxt_wait_for_io(ctxt); -+ continue; - } -+ if (ret) -+ goto err; -+ -+ if (ctxt->stats) -+ atomic64_add(sectors, &ctxt->stats->sectors_seen); -+ sectors_moved += sectors; - next: - bch2_btree_iter_advance(&bp_iter); - } -- -- trace_evacuate_bucket(c, &bucket, sectors_moved, ca->mi.bucket_size, ret); - err: - bch2_trans_iter_exit(trans, &bp_iter); -- bch2_dev_put(ca); - bch2_bkey_buf_exit(&sk, c); - bch2_bkey_buf_exit(&last_flushed, c); - return ret; - } - -+static int bch2_move_data_phys(struct bch_fs *c, -+ unsigned dev, -+ u64 start, -+ u64 end, -+ unsigned data_types, -+ struct bch_ratelimit *rate, -+ struct bch_move_stats *stats, -+ struct write_point_specifier wp, -+ bool wait_on_copygc, -+ move_pred_fn pred, void *arg) -+{ -+ struct moving_context ctxt; -+ -+ bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans)); -+ -+ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); -+ ctxt.stats->phys = true; -+ ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys; -+ -+ int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, data_types, pred, arg); -+ bch2_moving_ctxt_exit(&ctxt); -+ -+ return ret; -+} -+ -+struct evacuate_bucket_arg { -+ struct bpos bucket; -+ int gen; -+ struct data_update_opts data_opts; -+}; -+ -+static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, struct bkey_s_c k, -+ struct bch_io_opts *io_opts, -+ struct data_update_opts *data_opts) -+{ -+ struct evacuate_bucket_arg *arg = _arg; -+ -+ *data_opts = arg->data_opts; -+ -+ unsigned i = 0; -+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { -+ if (ptr->dev == arg->bucket.inode && -+ (arg->gen < 0 || arg->gen == ptr->gen) && -+ !ptr->cached) -+ data_opts->rewrite_ptrs |= BIT(i); -+ i++; -+ } -+ -+ return data_opts->rewrite_ptrs != 0; -+} -+ -+int bch2_evacuate_bucket(struct moving_context *ctxt, -+ struct move_bucket_in_flight *bucket_in_flight, -+ struct bpos bucket, int gen, -+ struct data_update_opts data_opts) -+{ -+ struct evacuate_bucket_arg arg = { bucket, gen, data_opts, }; -+ -+ return __bch2_move_data_phys(ctxt, bucket_in_flight, -+ bucket.inode, -+ bucket.offset, -+ bucket.offset + 1, -+ ~0, -+ evacuate_bucket_pred, &arg); -+} -+ - typedef bool (*move_btree_pred)(struct bch_fs *, void *, - struct btree *, struct bch_io_opts *, - struct data_update_opts *); -@@ -1007,14 +1049,6 @@ static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, - return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); - } - --static bool migrate_btree_pred(struct bch_fs *c, void *arg, -- struct btree *b, -- struct bch_io_opts *io_opts, -- struct data_update_opts *data_opts) --{ -- return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); --} -- - /* - * Ancient versions of bcachefs produced packed formats which could represent - * keys that the in memory format cannot represent; this checks for those -@@ -1104,6 +1138,30 @@ static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, - return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); - } - -+static bool scrub_pred(struct bch_fs *c, void *_arg, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts, -+ struct data_update_opts *data_opts) -+{ -+ struct bch_ioctl_data *arg = _arg; -+ -+ if (k.k->type != KEY_TYPE_btree_ptr_v2) { -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) 
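/*
 * Editor's note: a standalone sketch of the predicate pattern used by
 * evacuate_bucket_pred() above — walk an extent's pointers and set bit i in
 * a rewrite mask for every non-cached pointer on the target device (and
 * matching generation, when one is given). "struct ptr" is a stand-in for
 * the decoded extent pointers of bch2_bkey_ptrs_c().
 */
#include <stdbool.h>
#include <stdio.h>

struct ptr { unsigned dev; int gen; bool cached; };

static unsigned rewrite_mask(const struct ptr *ptrs, unsigned nr,
			     unsigned target_dev, int target_gen)
{
	unsigned mask = 0;
	for (unsigned i = 0; i < nr; i++)
		if (ptrs[i].dev == target_dev &&
		    (target_gen < 0 || target_gen == ptrs[i].gen) &&
		    !ptrs[i].cached)
			mask |= 1U << i;	/* BIT(i) */
	return mask;
}

int main(void)
{
	struct ptr ptrs[] = { {0, 3, false}, {1, 7, false}, {0, 3, true} };
	/* prints 0x1: only the dirty replica on dev 0 is selected */
	printf("rewrite mask %#x\n", rewrite_mask(ptrs, 3, 0, -1));
	return 0;
}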
-+ if (p.ptr.dev == arg->migrate.dev) { -+ if (!p.crc.csum_type) -+ return false; -+ break; -+ } -+ } -+ -+ data_opts->scrub = true; -+ data_opts->read_dev = arg->migrate.dev; -+ return true; -+} -+ - int bch2_data_job(struct bch_fs *c, - struct bch_move_stats *stats, - struct bch_ioctl_data op) -@@ -1118,6 +1176,22 @@ int bch2_data_job(struct bch_fs *c, - bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]); - - switch (op.op) { -+ case BCH_DATA_OP_scrub: -+ /* -+ * prevent tests from spuriously failing, make sure we see all -+ * btree nodes that need to be repaired -+ */ -+ bch2_btree_interior_updates_flush(c); -+ -+ ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX, -+ op.scrub.data_types, -+ NULL, -+ stats, -+ writepoint_hashed((unsigned long) current), -+ false, -+ scrub_pred, &op) ?: ret; -+ break; -+ - case BCH_DATA_OP_rereplicate: - stats->data_type = BCH_DATA_journal; - ret = bch2_journal_flush_device_pins(&c->journal, -1); -@@ -1137,14 +1211,14 @@ int bch2_data_job(struct bch_fs *c, - - stats->data_type = BCH_DATA_journal; - ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); -- ret = bch2_move_btree(c, start, end, -- migrate_btree_pred, &op, stats) ?: ret; -- ret = bch2_move_data(c, start, end, -- NULL, -- stats, -- writepoint_hashed((unsigned long) current), -- true, -- migrate_pred, &op) ?: ret; -+ ret = bch2_move_data_phys(c, op.migrate.dev, 0, U64_MAX, -+ ~0, -+ NULL, -+ stats, -+ writepoint_hashed((unsigned long) current), -+ true, -+ migrate_pred, &op) ?: ret; -+ bch2_btree_interior_updates_flush(c); - ret = bch2_replicas_gc2(c) ?: ret; - break; - case BCH_DATA_OP_rewrite_old_nodes: -@@ -1216,7 +1290,7 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str - - mutex_lock(&ctxt->lock); - list_for_each_entry(io, &ctxt->ios, io_list) -- bch2_write_op_to_text(out, &io->write.op); -+ bch2_data_update_inflight_to_text(out, &io->write); - mutex_unlock(&ctxt->lock); - - printbuf_indent_sub(out, 4); -diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h -index e22841ef31e4..82e473ed48d2 100644 ---- a/fs/bcachefs/move_types.h -+++ b/fs/bcachefs/move_types.h -@@ -3,17 +3,31 @@ - #define _BCACHEFS_MOVE_TYPES_H - - #include "bbpos_types.h" -+#include "bcachefs_ioctl.h" - - struct bch_move_stats { -- enum bch_data_type data_type; -- struct bbpos pos; - char name[32]; -+ bool phys; -+ enum bch_ioctl_data_event_ret ret; -+ -+ union { -+ struct { -+ enum bch_data_type data_type; -+ struct bbpos pos; -+ }; -+ struct { -+ unsigned dev; -+ u64 offset; -+ }; -+ }; - - atomic64_t keys_moved; - atomic64_t keys_raced; - atomic64_t sectors_seen; - atomic64_t sectors_moved; - atomic64_t sectors_raced; -+ atomic64_t sectors_error_corrected; -+ atomic64_t sectors_error_uncorrected; - }; - - struct move_bucket_key { -diff --git a/fs/bcachefs/progress.c b/fs/bcachefs/progress.c -new file mode 100644 -index 000000000000..bafd1c91a802 ---- /dev/null -+++ b/fs/bcachefs/progress.c -@@ -0,0 +1,63 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "bbpos.h" -+#include "disk_accounting.h" -+#include "progress.h" -+ -+void bch2_progress_init(struct progress_indicator_state *s, -+ struct bch_fs *c, -+ u64 btree_id_mask) -+{ -+ memset(s, 0, sizeof(*s)); -+ -+ s->next_print = jiffies + HZ * 10; -+ -+ for (unsigned i = 0; i < BTREE_ID_NR; i++) { -+ if (!(btree_id_mask & BIT_ULL(i))) -+ continue; -+ -+ struct disk_accounting_pos acc = { -+ .type = BCH_DISK_ACCOUNTING_btree, -+ .btree.id = i, -+ }; -+ -+ u64 v; -+ 
bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); -+ s->nodes_total += div64_ul(v, btree_sectors(c)); -+ } -+} -+ -+static inline bool progress_update_p(struct progress_indicator_state *s) -+{ -+ bool ret = time_after_eq(jiffies, s->next_print); -+ -+ if (ret) -+ s->next_print = jiffies + HZ * 10; -+ return ret; -+} -+ -+void bch2_progress_update_iter(struct btree_trans *trans, -+ struct progress_indicator_state *s, -+ struct btree_iter *iter, -+ const char *msg) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree *b = path_l(btree_iter_path(trans, iter))->b; -+ -+ s->nodes_seen += b != s->last_node; -+ s->last_node = b; -+ -+ if (progress_update_p(s)) { -+ struct printbuf buf = PRINTBUF; -+ unsigned percent = s->nodes_total -+ ? div64_u64(s->nodes_seen * 100, s->nodes_total) -+ : 0; -+ -+ prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ", -+ msg, percent, s->nodes_seen, s->nodes_total); -+ bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos)); -+ -+ bch_info(c, "%s", buf.buf); -+ printbuf_exit(&buf); -+ } -+} -diff --git a/fs/bcachefs/progress.h b/fs/bcachefs/progress.h -new file mode 100644 -index 000000000000..23fb1811f943 ---- /dev/null -+++ b/fs/bcachefs/progress.h -@@ -0,0 +1,29 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_PROGRESS_H -+#define _BCACHEFS_PROGRESS_H -+ -+/* -+ * Lame progress indicators -+ * -+ * We don't like to use these because they print to the dmesg console, which is -+ * spammy - we much prefer to be wired up to a userspace programm (e.g. via -+ * thread_with_file) and have it print the progress indicator. -+ * -+ * But some code is old and doesn't support that, or runs in a context where -+ * that's not yet practical (mount). -+ */ -+ -+struct progress_indicator_state { -+ unsigned long next_print; -+ u64 nodes_seen; -+ u64 nodes_total; -+ struct btree *last_node; -+}; -+ -+void bch2_progress_init(struct progress_indicator_state *, struct bch_fs *, u64); -+void bch2_progress_update_iter(struct btree_trans *, -+ struct progress_indicator_state *, -+ struct btree_iter *, -+ const char *); -+ -+#endif /* _BCACHEFS_PROGRESS_H */ -diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c -index d0a1f5cd5c2b..58f6d97e506c 100644 ---- a/fs/bcachefs/rebalance.c -+++ b/fs/bcachefs/rebalance.c -@@ -341,7 +341,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, - memset(data_opts, 0, sizeof(*data_opts)); - data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); - data_opts->target = io_opts->background_target; -- data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; -+ data_opts->write_flags |= BCH_WRITE_only_specified_devs; - - if (!data_opts->rewrite_ptrs) { - /* -@@ -449,7 +449,7 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, - { - data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); - data_opts->target = io_opts->background_target; -- data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; -+ data_opts->write_flags |= BCH_WRITE_only_specified_devs; - return data_opts->rewrite_ptrs != 0; - } - -diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c -index 98825437381c..71c786cdb192 100644 ---- a/fs/bcachefs/recovery.c -+++ b/fs/bcachefs/recovery.c -@@ -32,7 +32,6 @@ - #include - #include - --#define QSTR(n) { { { .len = strlen(n) } }, .name = n } - - int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) - { -diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c -index 376fd0a6e868..33b656c01942 100644 ---- 
a/fs/bcachefs/reflink.c -+++ b/fs/bcachefs/reflink.c -@@ -185,12 +185,21 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans, - BUG_ON(missing_start < refd_start); - BUG_ON(missing_end > refd_end); - -- if (fsck_err(trans, reflink_p_to_missing_reflink_v, -- "pointer to missing indirect extent\n" -- " %s\n" -- " missing range %llu-%llu", -- (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), -- missing_start, missing_end)) { -+ struct bpos missing_pos = bkey_start_pos(p.k); -+ missing_pos.offset += missing_start - live_start; -+ -+ prt_printf(&buf, "pointer to missing indirect extent in "); -+ ret = bch2_inum_snap_offset_err_msg_trans(trans, &buf, missing_pos); -+ if (ret) -+ goto err; -+ -+ prt_printf(&buf, "-%llu\n ", (missing_pos.offset + (missing_end - missing_start)) << 9); -+ bch2_bkey_val_to_text(&buf, c, p.s_c); -+ -+ prt_printf(&buf, "\n missing reflink btree range %llu-%llu", -+ missing_start, missing_end); -+ -+ if (fsck_err(trans, reflink_p_to_missing_reflink_v, "%s", buf.buf)) { - struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); - ret = PTR_ERR_OR_ZERO(new); - if (ret) -diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c -index 6992e7469112..2b4b8445d418 100644 ---- a/fs/bcachefs/sb-counters.c -+++ b/fs/bcachefs/sb-counters.c -@@ -5,7 +5,13 @@ - - /* BCH_SB_FIELD_counters */ - --static const char * const bch2_counter_names[] = { -+static const u8 counters_to_stable_map[] = { -+#define x(n, id, ...) [BCH_COUNTER_##n] = BCH_COUNTER_STABLE_##n, -+ BCH_PERSISTENT_COUNTERS() -+#undef x -+}; -+ -+const char * const bch2_counter_names[] = { - #define x(t, n, ...) (#t), - BCH_PERSISTENT_COUNTERS() - #undef x -@@ -18,13 +24,13 @@ static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) - return 0; - - return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0]; --}; -+} - - static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) - { - return 0; --}; -+} - - static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -@@ -32,50 +38,56 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field_counters *ctrs = field_to_type(f, counters); - unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - -- for (unsigned i = 0; i < nr; i++) -- prt_printf(out, "%s \t%llu\n", -- i < BCH_COUNTER_NR ? 
bch2_counter_names[i] : "(unknown)", -- le64_to_cpu(ctrs->d[i])); --}; -+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { -+ unsigned stable = counters_to_stable_map[i]; -+ if (stable < nr) -+ prt_printf(out, "%s \t%llu\n", -+ bch2_counter_names[i], -+ le64_to_cpu(ctrs->d[stable])); -+ } -+} - - int bch2_sb_counters_to_cpu(struct bch_fs *c) - { - struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); -- unsigned int i; - unsigned int nr = bch2_sb_counter_nr_entries(ctrs); -- u64 val = 0; - -- for (i = 0; i < BCH_COUNTER_NR; i++) -+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) - c->counters_on_mount[i] = 0; - -- for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) { -- val = le64_to_cpu(ctrs->d[i]); -- percpu_u64_set(&c->counters[i], val); -- c->counters_on_mount[i] = val; -+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { -+ unsigned stable = counters_to_stable_map[i]; -+ if (stable < nr) { -+ u64 v = le64_to_cpu(ctrs->d[stable]); -+ percpu_u64_set(&c->counters[i], v); -+ c->counters_on_mount[i] = v; -+ } - } -+ - return 0; --}; -+} - - int bch2_sb_counters_from_cpu(struct bch_fs *c) - { - struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); - struct bch_sb_field_counters *ret; -- unsigned int i; - unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - - if (nr < BCH_COUNTER_NR) { - ret = bch2_sb_field_resize(&c->disk_sb, counters, -- sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); -- -+ sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); - if (ret) { - ctrs = ret; - nr = bch2_sb_counter_nr_entries(ctrs); - } - } - -+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { -+ unsigned stable = counters_to_stable_map[i]; -+ if (stable < nr) -+ ctrs->d[stable] = cpu_to_le64(percpu_u64_get(&c->counters[i])); -+ } - -- for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) -- ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i])); - return 0; - } - -@@ -97,3 +109,39 @@ const struct bch_sb_field_ops bch_sb_field_ops_counters = { - .validate = bch2_sb_counters_validate, - .to_text = bch2_sb_counters_to_text, - }; -+ -+#ifndef NO_BCACHEFS_CHARDEV -+long bch2_ioctl_query_counters(struct bch_fs *c, -+ struct bch_ioctl_query_counters __user *user_arg) -+{ -+ struct bch_ioctl_query_counters arg; -+ int ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)); -+ if (ret) -+ return ret; -+ -+ if ((arg.flags & ~BCH_IOCTL_QUERY_COUNTERS_MOUNT) || -+ arg.pad) -+ return -EINVAL; -+ -+ arg.nr = min(arg.nr, BCH_COUNTER_NR); -+ ret = put_user(arg.nr, &user_arg->nr); -+ if (ret) -+ return ret; -+ -+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { -+ unsigned stable = counters_to_stable_map[i]; -+ -+ if (stable < arg.nr) { -+ u64 v = !(arg.flags & BCH_IOCTL_QUERY_COUNTERS_MOUNT) -+ ? 
percpu_u64_get(&c->counters[i]) -+ : c->counters_on_mount[i]; -+ -+ ret = put_user(v, &user_arg->d[stable]); -+ if (ret) -+ return ret; -+ } -+ } -+ -+ return 0; -+} -+#endif -diff --git a/fs/bcachefs/sb-counters.h b/fs/bcachefs/sb-counters.h -index 81f8aec9fcb1..a4329ad8dd1b 100644 ---- a/fs/bcachefs/sb-counters.h -+++ b/fs/bcachefs/sb-counters.h -@@ -11,6 +11,10 @@ int bch2_sb_counters_from_cpu(struct bch_fs *); - void bch2_fs_counters_exit(struct bch_fs *); - int bch2_fs_counters_init(struct bch_fs *); - -+extern const char * const bch2_counter_names[]; - extern const struct bch_sb_field_ops bch_sb_field_ops_counters; - -+long bch2_ioctl_query_counters(struct bch_fs *, -+ struct bch_ioctl_query_counters __user *); -+ - #endif // _BCACHEFS_SB_COUNTERS_H -diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h -index fdcf598f08b1..c82a891026d3 100644 ---- a/fs/bcachefs/sb-counters_format.h -+++ b/fs/bcachefs/sb-counters_format.h -@@ -9,10 +9,23 @@ enum counters_flags { - - #define BCH_PERSISTENT_COUNTERS() \ - x(io_read, 0, TYPE_SECTORS) \ -+ x(io_read_inline, 80, TYPE_SECTORS) \ -+ x(io_read_hole, 81, TYPE_SECTORS) \ -+ x(io_read_promote, 30, TYPE_COUNTER) \ -+ x(io_read_bounce, 31, TYPE_COUNTER) \ -+ x(io_read_split, 33, TYPE_COUNTER) \ -+ x(io_read_reuse_race, 34, TYPE_COUNTER) \ -+ x(io_read_retry, 32, TYPE_COUNTER) \ - x(io_write, 1, TYPE_SECTORS) \ - x(io_move, 2, TYPE_SECTORS) \ -+ x(io_move_read, 35, TYPE_SECTORS) \ -+ x(io_move_write, 36, TYPE_SECTORS) \ -+ x(io_move_finish, 37, TYPE_SECTORS) \ -+ x(io_move_fail, 38, TYPE_COUNTER) \ -+ x(io_move_start_fail, 39, TYPE_COUNTER) \ - x(bucket_invalidate, 3, TYPE_COUNTER) \ - x(bucket_discard, 4, TYPE_COUNTER) \ -+ x(bucket_discard_fast, 79, TYPE_COUNTER) \ - x(bucket_alloc, 5, TYPE_COUNTER) \ - x(bucket_alloc_fail, 6, TYPE_COUNTER) \ - x(btree_cache_scan, 7, TYPE_COUNTER) \ -@@ -38,16 +51,6 @@ enum counters_flags { - x(journal_reclaim_finish, 27, TYPE_COUNTER) \ - x(journal_reclaim_start, 28, TYPE_COUNTER) \ - x(journal_write, 29, TYPE_COUNTER) \ -- x(read_promote, 30, TYPE_COUNTER) \ -- x(read_bounce, 31, TYPE_COUNTER) \ -- x(read_split, 33, TYPE_COUNTER) \ -- x(read_retry, 32, TYPE_COUNTER) \ -- x(read_reuse_race, 34, TYPE_COUNTER) \ -- x(move_extent_read, 35, TYPE_SECTORS) \ -- x(move_extent_write, 36, TYPE_SECTORS) \ -- x(move_extent_finish, 37, TYPE_SECTORS) \ -- x(move_extent_fail, 38, TYPE_COUNTER) \ -- x(move_extent_start_fail, 39, TYPE_COUNTER) \ - x(copygc, 40, TYPE_COUNTER) \ - x(copygc_wait, 41, TYPE_COUNTER) \ - x(gc_gens_end, 42, TYPE_COUNTER) \ -@@ -95,6 +98,13 @@ enum bch_persistent_counters { - BCH_COUNTER_NR - }; - -+enum bch_persistent_counters_stable { -+#define x(t, n, ...) 
BCH_COUNTER_STABLE_##t = n, -+ BCH_PERSISTENT_COUNTERS() -+#undef x -+ BCH_COUNTER_STABLE_NR -+}; -+ - struct bch_sb_field_counters { - struct bch_sb_field field; - __le64 d[]; -diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h -index 762083b564ee..b29b6c6c21dd 100644 ---- a/fs/bcachefs/sb-members.h -+++ b/fs/bcachefs/sb-members.h -@@ -23,6 +23,18 @@ static inline bool bch2_dev_is_online(struct bch_dev *ca) - return !percpu_ref_is_zero(&ca->io_ref); - } - -+static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned); -+ -+static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev) -+{ -+ rcu_read_lock(); -+ struct bch_dev *ca = bch2_dev_rcu(c, dev); -+ bool ret = ca && bch2_dev_is_online(ca); -+ rcu_read_unlock(); -+ -+ return ret; -+} -+ - static inline bool bch2_dev_is_readable(struct bch_dev *ca) - { - return bch2_dev_is_online(ca) && -diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c -index c54091a28909..e7f197896db1 100644 ---- a/fs/bcachefs/snapshot.c -+++ b/fs/bcachefs/snapshot.c -@@ -146,8 +146,9 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) - goto out; - } - -- while (id && id < ancestor - IS_ANCESTOR_BITMAP) -- id = get_ancestor_below(t, id, ancestor); -+ if (likely(ancestor >= IS_ANCESTOR_BITMAP)) -+ while (id && id < ancestor - IS_ANCESTOR_BITMAP) -+ id = get_ancestor_below(t, id, ancestor); - - ret = id && id < ancestor - ? test_ancestor_bitmap(t, id, ancestor) -@@ -389,7 +390,7 @@ static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) - return 0; - } - --static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) -+u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) - { - u32 id = snapshot_root; - u32 subvol = 0, s; -diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h -index 00373cf32e7b..81180181d7c9 100644 ---- a/fs/bcachefs/snapshot.h -+++ b/fs/bcachefs/snapshot.h -@@ -105,6 +105,7 @@ static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) - return id; - } - -+u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *, u32); - u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32); - - static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) -diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c -index a7eb1f511484..b3f2c651c1f8 100644 ---- a/fs/bcachefs/sysfs.c -+++ b/fs/bcachefs/sysfs.c -@@ -176,7 +176,6 @@ read_attribute(btree_reserve_cache); - read_attribute(stripes_heap); - read_attribute(open_buckets); - read_attribute(open_buckets_partial); --read_attribute(write_points); - read_attribute(nocow_lock_table); - - #ifdef BCH_WRITE_REF_DEBUG -@@ -364,9 +363,6 @@ SHOW(bch2_fs) - if (attr == &sysfs_open_buckets_partial) - bch2_open_buckets_partial_to_text(out, c); - -- if (attr == &sysfs_write_points) -- bch2_write_points_to_text(out, c); -- - if (attr == &sysfs_compression_stats) - bch2_compression_stats_to_text(out, c); - -@@ -569,7 +565,6 @@ struct attribute *bch2_fs_internal_files[] = { - &sysfs_stripes_heap, - &sysfs_open_buckets, - &sysfs_open_buckets_partial, -- &sysfs_write_points, - #ifdef BCH_WRITE_REF_DEBUG - &sysfs_write_refs, - #endif -diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h -index c1b51009edf6..5718988dd7d6 100644 ---- a/fs/bcachefs/trace.h -+++ b/fs/bcachefs/trace.h -@@ -295,12 +295,12 @@ TRACE_EVENT(write_super, - - /* io.c: */ - --DEFINE_EVENT(bio, read_promote, -+DEFINE_EVENT(bio, io_read_promote, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) - ); - --TRACE_EVENT(read_nopromote, 
-+TRACE_EVENT(io_read_nopromote, - TP_PROTO(struct bch_fs *c, int ret), - TP_ARGS(c, ret), - -@@ -319,22 +319,22 @@ TRACE_EVENT(read_nopromote, - __entry->ret) - ); - --DEFINE_EVENT(bio, read_bounce, -+DEFINE_EVENT(bio, io_read_bounce, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) - ); - --DEFINE_EVENT(bio, read_split, -+DEFINE_EVENT(bio, io_read_split, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) - ); - --DEFINE_EVENT(bio, read_retry, -+DEFINE_EVENT(bio, io_read_retry, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) - ); - --DEFINE_EVENT(bio, read_reuse_race, -+DEFINE_EVENT(bio, io_read_reuse_race, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) - ); -@@ -797,53 +797,32 @@ TRACE_EVENT(bucket_invalidate, - - /* Moving IO */ - --TRACE_EVENT(bucket_evacuate, -- TP_PROTO(struct bch_fs *c, struct bpos *bucket), -- TP_ARGS(c, bucket), -- -- TP_STRUCT__entry( -- __field(dev_t, dev ) -- __field(u32, dev_idx ) -- __field(u64, bucket ) -- ), -- -- TP_fast_assign( -- __entry->dev = c->dev; -- __entry->dev_idx = bucket->inode; -- __entry->bucket = bucket->offset; -- ), -- -- TP_printk("%d:%d %u:%llu", -- MAJOR(__entry->dev), MINOR(__entry->dev), -- __entry->dev_idx, __entry->bucket) --); -- --DEFINE_EVENT(fs_str, move_extent, -+DEFINE_EVENT(fs_str, io_move, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) - ); - --DEFINE_EVENT(fs_str, move_extent_read, -+DEFINE_EVENT(fs_str, io_move_read, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) - ); - --DEFINE_EVENT(fs_str, move_extent_write, -+DEFINE_EVENT(fs_str, io_move_write, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) - ); - --DEFINE_EVENT(fs_str, move_extent_finish, -+DEFINE_EVENT(fs_str, io_move_finish, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) - ); - --DEFINE_EVENT(fs_str, move_extent_fail, -+DEFINE_EVENT(fs_str, io_move_fail, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) - ); - --DEFINE_EVENT(fs_str, move_extent_start_fail, -+DEFINE_EVENT(fs_str, io_move_start_fail, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) - ); -@@ -881,37 +860,6 @@ TRACE_EVENT(move_data, - __entry->sectors_raced) - ); - --TRACE_EVENT(evacuate_bucket, -- TP_PROTO(struct bch_fs *c, struct bpos *bucket, -- unsigned sectors, unsigned bucket_size, -- int ret), -- TP_ARGS(c, bucket, sectors, bucket_size, ret), -- -- TP_STRUCT__entry( -- __field(dev_t, dev ) -- __field(u64, member ) -- __field(u64, bucket ) -- __field(u32, sectors ) -- __field(u32, bucket_size ) -- __field(int, ret ) -- ), -- -- TP_fast_assign( -- __entry->dev = c->dev; -- __entry->member = bucket->inode; -- __entry->bucket = bucket->offset; -- __entry->sectors = sectors; -- __entry->bucket_size = bucket_size; -- __entry->ret = ret; -- ), -- -- TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i", -- MAJOR(__entry->dev), MINOR(__entry->dev), -- __entry->member, __entry->bucket, -- __entry->sectors, __entry->bucket_size, -- __entry->ret) --); -- - TRACE_EVENT(copygc, - TP_PROTO(struct bch_fs *c, - u64 buckets, -diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c -index e0a876cbaa6b..50a90e48f6dd 100644 ---- a/fs/bcachefs/util.c -+++ b/fs/bcachefs/util.c -@@ -473,10 +473,10 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats - u64 last_q = 0; - - prt_printf(out, "quantiles (%s):\t", u->name); -- eytzinger0_for_each(i, NR_QUANTILES) { -- bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; -+ eytzinger0_for_each(j, NR_QUANTILES) { -+ bool is_last = eytzinger0_next(j, NR_QUANTILES) == -1; 
- -- u64 q = max(quantiles->entries[i].m, last_q); -+ u64 q = max(quantiles->entries[j].m, last_q); - prt_printf(out, "%llu ", div64_u64(q, u->nsecs)); - if (is_last) - prt_newline(out); -@@ -701,9 +701,9 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) - #if 0 - void eytzinger1_test(void) - { -- unsigned inorder, eytz, size; -+ unsigned inorder, size; - -- pr_info("1 based eytzinger test:"); -+ pr_info("1 based eytzinger test:\n"); - - for (size = 2; - size < 65536; -@@ -711,13 +711,7 @@ void eytzinger1_test(void) - unsigned extra = eytzinger1_extra(size); - - if (!(size % 4096)) -- pr_info("tree size %u", size); -- -- BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size)); -- BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size)); -- -- BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0); -- BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0); -+ pr_info("tree size %u\n", size); - - inorder = 1; - eytzinger1_for_each(eytz, size) { -@@ -728,15 +722,16 @@ void eytzinger1_test(void) - - inorder++; - } -+ BUG_ON(inorder - 1 != size); - } - } - - void eytzinger0_test(void) - { - -- unsigned inorder, eytz, size; -+ unsigned inorder, size; - -- pr_info("0 based eytzinger test:"); -+ pr_info("0 based eytzinger test:\n"); - - for (size = 1; - size < 65536; -@@ -744,13 +739,7 @@ void eytzinger0_test(void) - unsigned extra = eytzinger0_extra(size); - - if (!(size % 4096)) -- pr_info("tree size %u", size); -- -- BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size)); -- BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size)); -- -- BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1); -- BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1); -+ pr_info("tree size %u\n", size); - - inorder = 0; - eytzinger0_for_each(eytz, size) { -@@ -761,54 +750,191 @@ void eytzinger0_test(void) - - inorder++; - } -+ BUG_ON(inorder != size); -+ -+ inorder = size - 1; -+ eytzinger0_for_each_prev(eytz, size) { -+ BUG_ON(eytz != eytzinger0_first(size) && -+ eytzinger0_next(eytzinger0_prev(eytz, size), size) != eytz); -+ -+ inorder--; -+ } -+ BUG_ON(inorder != -1); - } - } - --static inline int cmp_u16(const void *_l, const void *_r, size_t size) -+static inline int cmp_u16(const void *_l, const void *_r) - { - const u16 *l = _l, *r = _r; - -- return (*l > *r) - (*r - *l); -+ return (*l > *r) - (*r > *l); - } - --static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) -+static void eytzinger0_find_test_le(u16 *test_array, unsigned nr, u16 search) - { -- int i, c1 = -1, c2 = -1; -- ssize_t r; -+ int r, s; -+ bool bad; - - r = eytzinger0_find_le(test_array, nr, - sizeof(test_array[0]), - cmp_u16, &search); -- if (r >= 0) -- c1 = test_array[r]; -- -- for (i = 0; i < nr; i++) -- if (test_array[i] <= search && test_array[i] > c2) -- c2 = test_array[i]; -- -- if (c1 != c2) { -- eytzinger0_for_each(i, nr) -- pr_info("[%3u] = %12u", i, test_array[i]); -- pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i", -- i, r, c1, c2); -+ if (r >= 0) { -+ if (test_array[r] > search) { -+ bad = true; -+ } else { -+ s = eytzinger0_next(r, nr); -+ bad = s >= 0 && test_array[s] <= search; -+ } -+ } else { -+ s = eytzinger0_last(nr); -+ bad = s >= 0 && test_array[s] <= search; -+ } -+ -+ if (bad) { -+ s = -1; -+ eytzinger0_for_each_prev(j, nr) { -+ if (test_array[j] <= search) { -+ s = j; -+ break; -+ } -+ } -+ -+ eytzinger0_for_each(j, nr) -+ pr_info("[%3u] = %12u\n", j, test_array[j]); -+ pr_info("find_le(%12u) = %3i should be %3i\n", -+ search, r, 
s); -+ BUG(); -+ } -+} -+ -+static void eytzinger0_find_test_gt(u16 *test_array, unsigned nr, u16 search) -+{ -+ int r, s; -+ bool bad; -+ -+ r = eytzinger0_find_gt(test_array, nr, -+ sizeof(test_array[0]), -+ cmp_u16, &search); -+ if (r >= 0) { -+ if (test_array[r] <= search) { -+ bad = true; -+ } else { -+ s = eytzinger0_prev(r, nr); -+ bad = s >= 0 && test_array[s] > search; -+ } -+ } else { -+ s = eytzinger0_first(nr); -+ bad = s >= 0 && test_array[s] > search; -+ } -+ -+ if (bad) { -+ s = -1; -+ eytzinger0_for_each(j, nr) { -+ if (test_array[j] > search) { -+ s = j; -+ break; -+ } -+ } -+ -+ eytzinger0_for_each(j, nr) -+ pr_info("[%3u] = %12u\n", j, test_array[j]); -+ pr_info("find_gt(%12u) = %3i should be %3i\n", -+ search, r, s); -+ BUG(); - } - } - -+static void eytzinger0_find_test_ge(u16 *test_array, unsigned nr, u16 search) -+{ -+ int r, s; -+ bool bad; -+ -+ r = eytzinger0_find_ge(test_array, nr, -+ sizeof(test_array[0]), -+ cmp_u16, &search); -+ if (r >= 0) { -+ if (test_array[r] < search) { -+ bad = true; -+ } else { -+ s = eytzinger0_prev(r, nr); -+ bad = s >= 0 && test_array[s] >= search; -+ } -+ } else { -+ s = eytzinger0_first(nr); -+ bad = s >= 0 && test_array[s] >= search; -+ } -+ -+ if (bad) { -+ s = -1; -+ eytzinger0_for_each(j, nr) { -+ if (test_array[j] >= search) { -+ s = j; -+ break; -+ } -+ } -+ -+ eytzinger0_for_each(j, nr) -+ pr_info("[%3u] = %12u\n", j, test_array[j]); -+ pr_info("find_ge(%12u) = %3i should be %3i\n", -+ search, r, s); -+ BUG(); -+ } -+} -+ -+static void eytzinger0_find_test_eq(u16 *test_array, unsigned nr, u16 search) -+{ -+ unsigned r; -+ int s; -+ bool bad; -+ -+ r = eytzinger0_find(test_array, nr, -+ sizeof(test_array[0]), -+ cmp_u16, &search); -+ -+ if (r < nr) { -+ bad = test_array[r] != search; -+ } else { -+ s = eytzinger0_find_le(test_array, nr, -+ sizeof(test_array[0]), -+ cmp_u16, &search); -+ bad = s >= 0 && test_array[s] == search; -+ } -+ -+ if (bad) { -+ eytzinger0_for_each(j, nr) -+ pr_info("[%3u] = %12u\n", j, test_array[j]); -+ pr_info("find(%12u) = %3i is incorrect\n", -+ search, r); -+ BUG(); -+ } -+} -+ -+static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) -+{ -+ eytzinger0_find_test_le(test_array, nr, search); -+ eytzinger0_find_test_gt(test_array, nr, search); -+ eytzinger0_find_test_ge(test_array, nr, search); -+ eytzinger0_find_test_eq(test_array, nr, search); -+} -+ - void eytzinger0_find_test(void) - { - unsigned i, nr, allocated = 1 << 12; - u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL); - - for (nr = 1; nr < allocated; nr++) { -- pr_info("testing %u elems", nr); -+ u16 prev = 0; -+ -+ pr_info("testing %u elems\n", nr); - - get_random_bytes(test_array, nr * sizeof(test_array[0])); - eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); - - /* verify array is sorted correctly: */ -- eytzinger0_for_each(i, nr) -- BUG_ON(i != eytzinger0_last(nr) && -- test_array[i] > test_array[eytzinger0_next(i, nr)]); -+ eytzinger0_for_each(j, nr) { -+ BUG_ON(test_array[j] < prev); -+ prev = test_array[j]; -+ } - - for (i = 0; i < U16_MAX; i += 1 << 12) - eytzinger0_find_test_val(test_array, nr, i); -diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h -index 1a1720116071..e7c3541b38f3 100644 ---- a/fs/bcachefs/util.h -+++ b/fs/bcachefs/util.h -@@ -670,8 +670,6 @@ static inline int cmp_le32(__le32 l, __le32 r) - - #include - --#define QSTR(n) { { { .len = strlen(n) } }, .name = n } -- - static inline bool qstr_eq(const struct qstr l, const struct qstr r) - { 
- return l.len == r.len && !memcmp(l.name, r.name, l.len); --- -2.45.3 -
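For reference, the deleted bch2_ioctl_query_counters() handler above implements a
small userspace-facing protocol: the caller passes a struct
bch_ioctl_query_counters whose nr field gives the capacity of d[], whose flags
may contain only BCH_IOCTL_QUERY_COUNTERS_MOUNT (to read the values captured at
mount time rather than the live ones), and whose pad must be zero; the kernel
clamps nr to BCH_COUNTER_NR, writes the clamped value back, and fills d[]
indexed by stable counter id. A minimal userspace sketch of that calling
convention follows. The struct layout, the flag value, and the ioctl request
number below are assumptions inferred from the shape of the handler, not
confirmed by this patch; the authoritative definitions live in
fs/bcachefs/bcachefs_ioctl.h.

/*
 * Hypothetical userspace sketch, NOT part of the patch: query persistent
 * counters through the BCH_IOCTL_QUERY_COUNTERS interface. The struct
 * layout, flag value, and request number are assumptions; take the real
 * macros from fs/bcachefs/bcachefs_ioctl.h.
 */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>

#define NR_COUNTERS 128				/* illustrative capacity, not BCH_COUNTER_NR */
#define BCH_IOCTL_QUERY_COUNTERS_MOUNT	(1 << 0)	/* assumed flag value */

struct bch_ioctl_query_counters {	/* layout inferred from the handler */
	__u16	nr;			/* in: capacity of d[]; out: entries filled */
	__u16	flags;			/* 0 for live values */
	__u32	pad;			/* must be zero (handler returns -EINVAL) */
	__u64	d[];			/* out: values indexed by stable counter id */
};

/* Request number is a placeholder assumption: */
#define BCH_IOCTL_QUERY_COUNTERS \
	_IOWR(0xbc, 21, struct bch_ioctl_query_counters)

int main(int argc, char **argv)
{
	if (argc < 2) {
		fprintf(stderr, "usage: %s <path inside a mounted bcachefs>\n", argv[0]);
		return 1;
	}

	struct bch_ioctl_query_counters *arg =
		calloc(1, sizeof(*arg) + NR_COUNTERS * sizeof(__u64));
	int fd = open(argv[1], O_RDONLY);

	if (!arg || fd < 0) {
		perror("setup");
		return 1;
	}

	arg->nr = NR_COUNTERS;
	arg->flags = 0;			/* or BCH_IOCTL_QUERY_COUNTERS_MOUNT */

	if (ioctl(fd, BCH_IOCTL_QUERY_COUNTERS, arg) < 0) {
		perror("BCH_IOCTL_QUERY_COUNTERS");
		return 1;
	}

	/* The kernel wrote back the number of entries it filled: */
	for (unsigned i = 0; i < arg->nr; i++)
		printf("counter[%u] = %llu\n", i, (unsigned long long) arg->d[i]);

	close(fd);
	free(arg);
	return 0;
}

Note that d[] is indexed by the stable counter id (the explicit numbers in
BCH_PERSISTENT_COUNTERS()), not by the in-memory enum order; that is the point
of the counters_to_stable_map indirection in the handler, which keeps the
on-disk and ioctl ABI stable while the enum is free to be reordered.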