diff --git a/sys-kernel/hardened-kernel/files/linux-6.12/1192-bcachefs-upd-from-master-81b5431.patch b/sys-kernel/hardened-kernel/files/linux-6.12/1192-bcachefs-upd-from-master-81b5431.patch new file mode 100644 index 0000000..f794535 --- /dev/null +++ b/sys-kernel/hardened-kernel/files/linux-6.12/1192-bcachefs-upd-from-master-81b5431.patch @@ -0,0 +1,6119 @@ +From 21a9c2ace04f6c699870b9222c3da9b8a9aaedf6 Mon Sep 17 00:00:00 2001 +From: Alexander Miroshnichenko +Date: Sun, 9 Feb 2025 22:05:21 +0300 +Subject: [PATCH] bcachefs: cherry-pick updates from master 81b5431 +Content-Type: text/plain; charset="utf-8" +Content-Transfer-Encoding: 8bit + +Signed-off-by: Alexander Miroshnichenko +--- + fs/bcachefs/Kconfig | 2 + + fs/bcachefs/Makefile | 1 + + fs/bcachefs/alloc_background.c | 12 +- + fs/bcachefs/alloc_background.h | 2 +- + fs/bcachefs/alloc_foreground.c | 25 +- + fs/bcachefs/alloc_foreground.h | 17 + + fs/bcachefs/alloc_types.h | 2 + + fs/bcachefs/backpointers.c | 108 ++---- + fs/bcachefs/backpointers.h | 11 +- + fs/bcachefs/bcachefs.h | 5 +- + fs/bcachefs/bcachefs_ioctl.h | 29 +- + fs/bcachefs/btree_gc.c | 18 +- + fs/bcachefs/btree_io.c | 205 ++++++++++- + fs/bcachefs/btree_io.h | 4 + + fs/bcachefs/btree_update_interior.c | 20 ++ + fs/bcachefs/btree_update_interior.h | 4 + + fs/bcachefs/chardev.c | 38 +- + fs/bcachefs/clock.c | 25 +- + fs/bcachefs/data_update.c | 220 +++++++++--- + fs/bcachefs/data_update.h | 17 +- + fs/bcachefs/debug.c | 34 +- + fs/bcachefs/ec.c | 25 +- + fs/bcachefs/errcode.h | 6 + + fs/bcachefs/error.c | 50 ++- + fs/bcachefs/error.h | 4 +- + fs/bcachefs/extents.c | 9 +- + fs/bcachefs/extents.h | 2 +- + fs/bcachefs/eytzinger.c | 76 ++-- + fs/bcachefs/eytzinger.h | 95 ++--- + fs/bcachefs/fs-io-buffered.c | 26 +- + fs/bcachefs/fs-io-direct.c | 20 +- + fs/bcachefs/fsck.c | 2 +- + fs/bcachefs/io_misc.c | 3 +- + fs/bcachefs/io_read.c | 515 ++++++++++++++-------------- + fs/bcachefs/io_read.h | 75 ++-- + fs/bcachefs/io_write.c | 95 ++--- + fs/bcachefs/io_write.h | 29 +- + fs/bcachefs/io_write_types.h | 2 +- + fs/bcachefs/journal.c | 123 +++++-- + fs/bcachefs/journal.h | 38 +- + fs/bcachefs/journal_io.c | 30 +- + fs/bcachefs/journal_seq_blacklist.c | 7 +- + fs/bcachefs/journal_types.h | 19 +- + fs/bcachefs/migrate.c | 26 +- + fs/bcachefs/move.c | 418 ++++++++++++---------- + fs/bcachefs/move_types.h | 18 +- + fs/bcachefs/progress.c | 63 ++++ + fs/bcachefs/progress.h | 29 ++ + fs/bcachefs/rebalance.c | 4 +- + fs/bcachefs/recovery.c | 1 - + fs/bcachefs/reflink.c | 21 +- + fs/bcachefs/sb-counters.c | 90 +++-- + fs/bcachefs/sb-counters.h | 4 + + fs/bcachefs/sb-counters_format.h | 30 +- + fs/bcachefs/sb-members.h | 12 + + fs/bcachefs/snapshot.c | 7 +- + fs/bcachefs/snapshot.h | 1 + + fs/bcachefs/sysfs.c | 5 - + fs/bcachefs/trace.h | 76 +--- + fs/bcachefs/util.c | 210 +++++++++--- + fs/bcachefs/util.h | 2 - + 61 files changed, 1967 insertions(+), 1100 deletions(-) + create mode 100644 fs/bcachefs/progress.c + create mode 100644 fs/bcachefs/progress.h + +diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig +index e8549d04dcb8..85eea7a4dea3 100644 +--- a/fs/bcachefs/Kconfig ++++ b/fs/bcachefs/Kconfig +@@ -15,6 +15,7 @@ config BCACHEFS_FS + select ZLIB_INFLATE + select ZSTD_COMPRESS + select ZSTD_DECOMPRESS ++ select CRYPTO + select CRYPTO_SHA256 + select CRYPTO_CHACHA20 + select CRYPTO_POLY1305 +@@ -24,6 +25,7 @@ config BCACHEFS_FS + select XXHASH + select SRCU + select SYMBOLIC_ERRNAME ++ select MIN_HEAP + help + The bcachefs filesystem - a modern, copy on write filesystem, with + support for multiple devices, compression, checksumming, etc. +diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile +index d2689388d5e8..1cf17a16af9f 100644 +--- a/fs/bcachefs/Makefile ++++ b/fs/bcachefs/Makefile +@@ -67,6 +67,7 @@ bcachefs-y := \ + nocow_locking.o \ + opts.o \ + printbuf.o \ ++ progress.o \ + quota.o \ + rebalance.o \ + rcu_pending.o \ +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 3ea809990ef1..a35455802280 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -1897,7 +1897,10 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, + if (ret) + goto out; + +- count_event(c, bucket_discard); ++ if (!fastpath) ++ count_event(c, bucket_discard); ++ else ++ count_event(c, bucket_discard_fast); + out: + fsck_err: + if (discard_locked) +@@ -2090,6 +2093,13 @@ static int invalidate_one_bucket(struct btree_trans *trans, + if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v)) + goto out; + ++ /* ++ * Impossible since alloc_lru_idx_read() only returns nonzero if the ++ * bucket is supposed to be on the cached bucket LRU (i.e. ++ * BCH_DATA_cached) ++ * ++ * bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0 ++ */ + BUG_ON(a->v.data_type != BCH_DATA_cached); + BUG_ON(a->v.dirty_sectors); + +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index de25ba4ee94b..c556ccaffe89 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -131,7 +131,7 @@ static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, + if (a.stripe) + return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe; + if (bch2_bucket_sectors_dirty(a)) +- return data_type; ++ return bucket_data_type(data_type); + if (a.cached_sectors) + return BCH_DATA_cached; + if (BCH_ALLOC_V4_NEED_DISCARD(&a)) +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 5a781fb4c794..1759c15a7745 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -179,23 +179,6 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) + closure_wake_up(&c->freelist_wait); + } + +-static inline unsigned open_buckets_reserved(enum bch_watermark watermark) +-{ +- switch (watermark) { +- case BCH_WATERMARK_interior_updates: +- return 0; +- case BCH_WATERMARK_reclaim: +- return OPEN_BUCKETS_COUNT / 6; +- case BCH_WATERMARK_btree: +- case BCH_WATERMARK_btree_copygc: +- return OPEN_BUCKETS_COUNT / 4; +- case BCH_WATERMARK_copygc: +- return OPEN_BUCKETS_COUNT / 3; +- default: +- return OPEN_BUCKETS_COUNT / 2; +- } +-} +- + static inline bool may_alloc_bucket(struct bch_fs *c, + struct bpos bucket, + struct bucket_alloc_state *s) +@@ -239,7 +222,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * + + spin_lock(&c->freelist_lock); + +- if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) { ++ if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark))) { + if (cl) + closure_wait(&c->open_buckets_wait, cl); + +@@ -728,7 +711,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, + + struct bch_dev_usage usage; + struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, +- cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); ++ cl, flags & BCH_WRITE_alloc_nowait, &usage); + if (!IS_ERR(ob)) + bch2_dev_stripe_increment_inlined(ca, stripe, &usage); + bch2_dev_put(ca); +@@ -1336,7 +1319,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, + if (wp->data_type != BCH_DATA_user) + have_cache = true; + +- if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { ++ if (target && !(flags & BCH_WRITE_only_specified_devs)) { + ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, + target, erasure_code, + nr_replicas, &nr_effective, +@@ -1426,7 +1409,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, + if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) + ret = -BCH_ERR_bucket_alloc_blocked; + +- if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) && ++ if (cl && !(flags & BCH_WRITE_alloc_nowait) && + bch2_err_matches(ret, BCH_ERR_freelist_empty)) + ret = -BCH_ERR_bucket_alloc_blocked; + +diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h +index f25481a0d1a0..baf5dc163c8a 100644 +--- a/fs/bcachefs/alloc_foreground.h ++++ b/fs/bcachefs/alloc_foreground.h +@@ -33,6 +33,23 @@ static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) + return bch2_dev_have_ref(c, ob->dev); + } + ++static inline unsigned bch2_open_buckets_reserved(enum bch_watermark watermark) ++{ ++ switch (watermark) { ++ case BCH_WATERMARK_interior_updates: ++ return 0; ++ case BCH_WATERMARK_reclaim: ++ return OPEN_BUCKETS_COUNT / 6; ++ case BCH_WATERMARK_btree: ++ case BCH_WATERMARK_btree_copygc: ++ return OPEN_BUCKETS_COUNT / 4; ++ case BCH_WATERMARK_copygc: ++ return OPEN_BUCKETS_COUNT / 3; ++ default: ++ return OPEN_BUCKETS_COUNT / 2; ++ } ++} ++ + struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, + enum bch_watermark, enum bch_data_type, + struct closure *); +diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h +index 4aa8ee026cb8..8f79f46c2a78 100644 +--- a/fs/bcachefs/alloc_types.h ++++ b/fs/bcachefs/alloc_types.h +@@ -90,6 +90,7 @@ struct dev_stripe_state { + x(stopped) \ + x(waiting_io) \ + x(waiting_work) \ ++ x(runnable) \ + x(running) + + enum write_point_state { +@@ -125,6 +126,7 @@ struct write_point { + enum write_point_state state; + u64 last_state_change; + u64 time[WRITE_POINT_STATE_NR]; ++ u64 last_runtime; + } __aligned(SMP_CACHE_BYTES); + }; + +diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c +index ebeb6a5ff9d2..eb374d1970fe 100644 +--- a/fs/bcachefs/backpointers.c ++++ b/fs/bcachefs/backpointers.c +@@ -11,6 +11,7 @@ + #include "checksum.h" + #include "disk_accounting.h" + #include "error.h" ++#include "progress.h" + + #include + +@@ -244,27 +245,31 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, + if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c))) + return bkey_s_c_null; + +- if (likely(!bp.v->level)) { +- bch2_trans_node_iter_init(trans, iter, +- bp.v->btree_id, +- bp.v->pos, +- 0, 0, +- iter_flags); +- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); +- if (bkey_err(k)) { +- bch2_trans_iter_exit(trans, iter); +- return k; +- } ++ bch2_trans_node_iter_init(trans, iter, ++ bp.v->btree_id, ++ bp.v->pos, ++ 0, ++ bp.v->level, ++ iter_flags); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); ++ if (bkey_err(k)) { ++ bch2_trans_iter_exit(trans, iter); ++ return k; ++ } + +- if (k.k && +- extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) +- return k; ++ if (k.k && ++ extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) ++ return k; + +- bch2_trans_iter_exit(trans, iter); ++ bch2_trans_iter_exit(trans, iter); ++ ++ if (!bp.v->level) { + int ret = backpointer_target_not_found(trans, bp, k, last_flushed); + return ret ? bkey_s_c_err(ret) : bkey_s_c_null; + } else { + struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed); ++ if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node)) ++ return bkey_s_c_null; + if (IS_ERR_OR_NULL(b)) + return ((struct bkey_s_c) { .k = ERR_CAST(b) }); + +@@ -715,71 +720,6 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, + return ret; + } + +-struct progress_indicator_state { +- unsigned long next_print; +- u64 nodes_seen; +- u64 nodes_total; +- struct btree *last_node; +-}; +- +-static inline void progress_init(struct progress_indicator_state *s, +- struct bch_fs *c, +- u64 btree_id_mask) +-{ +- memset(s, 0, sizeof(*s)); +- +- s->next_print = jiffies + HZ * 10; +- +- for (unsigned i = 0; i < BTREE_ID_NR; i++) { +- if (!(btree_id_mask & BIT_ULL(i))) +- continue; +- +- struct disk_accounting_pos acc = { +- .type = BCH_DISK_ACCOUNTING_btree, +- .btree.id = i, +- }; +- +- u64 v; +- bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); +- s->nodes_total += div64_ul(v, btree_sectors(c)); +- } +-} +- +-static inline bool progress_update_p(struct progress_indicator_state *s) +-{ +- bool ret = time_after_eq(jiffies, s->next_print); +- +- if (ret) +- s->next_print = jiffies + HZ * 10; +- return ret; +-} +- +-static void progress_update_iter(struct btree_trans *trans, +- struct progress_indicator_state *s, +- struct btree_iter *iter, +- const char *msg) +-{ +- struct bch_fs *c = trans->c; +- struct btree *b = path_l(btree_iter_path(trans, iter))->b; +- +- s->nodes_seen += b != s->last_node; +- s->last_node = b; +- +- if (progress_update_p(s)) { +- struct printbuf buf = PRINTBUF; +- unsigned percent = s->nodes_total +- ? div64_u64(s->nodes_seen * 100, s->nodes_total) +- : 0; +- +- prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ", +- msg, percent, s->nodes_seen, s->nodes_total); +- bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos)); +- +- bch_info(c, "%s", buf.buf); +- printbuf_exit(&buf); +- } +-} +- + static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, + struct extents_to_bp_state *s) + { +@@ -787,7 +727,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, + struct progress_indicator_state progress; + int ret = 0; + +- progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink)); ++ bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink)); + + for (enum btree_id btree_id = 0; + btree_id < btree_id_nr_alive(c); +@@ -806,7 +746,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, + BTREE_ITER_prefetch); + + ret = for_each_btree_key_continue(trans, iter, 0, k, ({ +- progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); ++ bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); + check_extent_to_backpointers(trans, s, btree_id, level, k) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + })); +@@ -1206,11 +1146,11 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, + + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); +- progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); ++ bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); + + int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers, + POS_MIN, BTREE_ITER_prefetch, k, ({ +- progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); ++ bch2_progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); + check_one_backpointer(trans, start, end, k, &last_flushed); + })); + +diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h +index 060dad1521ee..7786731d4ada 100644 +--- a/fs/bcachefs/backpointers.h ++++ b/fs/bcachefs/backpointers.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H +-#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H ++#ifndef _BCACHEFS_BACKPOINTERS_H ++#define _BCACHEFS_BACKPOINTERS_H + + #include "btree_cache.h" + #include "btree_iter.h" +@@ -123,7 +123,12 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, + return BCH_DATA_btree; + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: +- return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user; ++ if (p.has_ec) ++ return BCH_DATA_stripe; ++ if (p.ptr.cached) ++ return BCH_DATA_cached; ++ else ++ return BCH_DATA_user; + case KEY_TYPE_stripe: { + const struct bch_extent_ptr *ptr = &entry->ptr; + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 161cf2f05d2a..e8f4999806b6 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -444,6 +444,7 @@ BCH_DEBUG_PARAMS_DEBUG() + x(btree_node_sort) \ + x(btree_node_read) \ + x(btree_node_read_done) \ ++ x(btree_node_write) \ + x(btree_interior_update_foreground) \ + x(btree_interior_update_total) \ + x(btree_gc) \ +@@ -456,6 +457,7 @@ BCH_DEBUG_PARAMS_DEBUG() + x(blocked_journal_low_on_space) \ + x(blocked_journal_low_on_pin) \ + x(blocked_journal_max_in_flight) \ ++ x(blocked_journal_max_open) \ + x(blocked_key_cache_flush) \ + x(blocked_allocate) \ + x(blocked_allocate_open_bucket) \ +@@ -687,7 +689,8 @@ struct btree_trans_buf { + x(gc_gens) \ + x(snapshot_delete_pagecache) \ + x(sysfs) \ +- x(btree_write_buffer) ++ x(btree_write_buffer) \ ++ x(btree_node_scrub) + + enum bch_write_ref { + #define x(n) BCH_WRITE_REF_##n, +diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h +index 3c23bdf788ce..52594e925eb7 100644 +--- a/fs/bcachefs/bcachefs_ioctl.h ++++ b/fs/bcachefs/bcachefs_ioctl.h +@@ -87,6 +87,7 @@ struct bch_ioctl_incremental { + #define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline) + #define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online) + #define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting) ++#define BCH_IOCTL_QUERY_COUNTERS _IOW(0xbc, 21, struct bch_ioctl_query_counters) + + /* ioctl below act on a particular file, not the filesystem as a whole: */ + +@@ -213,6 +214,10 @@ struct bch_ioctl_data { + struct bpos end_pos; + + union { ++ struct { ++ __u32 dev; ++ __u32 data_types; ++ } scrub; + struct { + __u32 dev; + __u32 pad; +@@ -229,6 +234,11 @@ enum bch_data_event { + BCH_DATA_EVENT_NR = 1, + }; + ++enum data_progress_data_type_special { ++ DATA_PROGRESS_DATA_TYPE_phys = 254, ++ DATA_PROGRESS_DATA_TYPE_done = 255, ++}; ++ + struct bch_ioctl_data_progress { + __u8 data_type; + __u8 btree_id; +@@ -237,11 +247,19 @@ struct bch_ioctl_data_progress { + + __u64 sectors_done; + __u64 sectors_total; ++ __u64 sectors_error_corrected; ++ __u64 sectors_error_uncorrected; + } __packed __aligned(8); + ++enum bch_ioctl_data_event_ret { ++ BCH_IOCTL_DATA_EVENT_RET_done = 1, ++ BCH_IOCTL_DATA_EVENT_RET_device_offline = 2, ++}; ++ + struct bch_ioctl_data_event { + __u8 type; +- __u8 pad[7]; ++ __u8 ret; ++ __u8 pad[6]; + union { + struct bch_ioctl_data_progress p; + __u64 pad2[15]; +@@ -443,4 +461,13 @@ struct bch_ioctl_query_accounting { + struct bkey_i_accounting accounting[]; + }; + ++#define BCH_IOCTL_QUERY_COUNTERS_MOUNT (1 << 0) ++ ++struct bch_ioctl_query_counters { ++ __u16 nr; ++ __u16 flags; ++ __u32 pad; ++ __u64 d[]; ++}; ++ + #endif /* _BCACHEFS_IOCTL_H */ +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index dd1d9b74076e..ff681e733598 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -27,6 +27,7 @@ + #include "journal.h" + #include "keylist.h" + #include "move.h" ++#include "progress.h" + #include "recovery_passes.h" + #include "reflink.h" + #include "recovery.h" +@@ -656,7 +657,9 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, + return ret; + } + +-static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial) ++static int bch2_gc_btree(struct btree_trans *trans, ++ struct progress_indicator_state *progress, ++ enum btree_id btree, bool initial) + { + struct bch_fs *c = trans->c; + unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1; +@@ -673,6 +676,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in + BTREE_ITER_prefetch); + + ret = for_each_btree_key_continue(trans, iter, 0, k, ({ ++ bch2_progress_update_iter(trans, progress, &iter, "check_allocations"); + gc_pos_set(c, gc_pos_btree(btree, level, k.k->p)); + bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial); + })); +@@ -717,22 +721,24 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) + static int bch2_gc_btrees(struct bch_fs *c) + { + struct btree_trans *trans = bch2_trans_get(c); +- enum btree_id ids[BTREE_ID_NR]; + struct printbuf buf = PRINTBUF; +- unsigned i; + int ret = 0; + +- for (i = 0; i < BTREE_ID_NR; i++) ++ struct progress_indicator_state progress; ++ bch2_progress_init(&progress, c, ~0ULL); ++ ++ enum btree_id ids[BTREE_ID_NR]; ++ for (unsigned i = 0; i < BTREE_ID_NR; i++) + ids[i] = i; + bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); + +- for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { ++ for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { + unsigned btree = i < BTREE_ID_NR ? ids[i] : i; + + if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b)) + continue; + +- ret = bch2_gc_btree(trans, btree, true); ++ ret = bch2_gc_btree(trans, &progress, btree, true); + } + + printbuf_exit(&buf); +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index e371e60e3133..e71b278672b6 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -1,6 +1,7 @@ + // SPDX-License-Identifier: GPL-2.0 + + #include "bcachefs.h" ++#include "bkey_buf.h" + #include "bkey_methods.h" + #include "bkey_sort.h" + #include "btree_cache.h" +@@ -1352,7 +1353,7 @@ static void btree_node_read_work(struct work_struct *work) + + can_retry = bch2_bkey_pick_read_device(c, + bkey_i_to_s_c(&b->key), +- &failed, &rb->pick) > 0; ++ &failed, &rb->pick, -1) > 0; + + if (!bio->bi_status && + !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) { +@@ -1697,7 +1698,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, + return; + + ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), +- NULL, &pick); ++ NULL, &pick, -1); + + if (ret <= 0) { + struct printbuf buf = PRINTBUF; +@@ -1811,6 +1812,190 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, + return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level)); + } + ++struct btree_node_scrub { ++ struct bch_fs *c; ++ struct bch_dev *ca; ++ void *buf; ++ bool used_mempool; ++ unsigned written; ++ ++ enum btree_id btree; ++ unsigned level; ++ struct bkey_buf key; ++ __le64 seq; ++ ++ struct work_struct work; ++ struct bio bio; ++}; ++ ++static bool btree_node_scrub_check(struct bch_fs *c, struct btree_node *data, unsigned ptr_written, ++ struct printbuf *err) ++{ ++ unsigned written = 0; ++ ++ if (le64_to_cpu(data->magic) != bset_magic(c)) { ++ prt_printf(err, "bad magic: want %llx, got %llx", ++ bset_magic(c), le64_to_cpu(data->magic)); ++ return false; ++ } ++ ++ while (written < (ptr_written ?: btree_sectors(c))) { ++ struct btree_node_entry *bne; ++ struct bset *i; ++ bool first = !written; ++ ++ if (first) { ++ bne = NULL; ++ i = &data->keys; ++ } else { ++ bne = (void *) data + (written << 9); ++ i = &bne->keys; ++ ++ if (!ptr_written && i->seq != data->keys.seq) ++ break; ++ } ++ ++ struct nonce nonce = btree_nonce(i, written << 9); ++ bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)); ++ ++ if (first) { ++ if (good_csum_type) { ++ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, data); ++ if (bch2_crc_cmp(data->csum, csum)) { ++ bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), data->csum, csum); ++ return false; ++ } ++ } ++ ++ written += vstruct_sectors(data, c->block_bits); ++ } else { ++ if (good_csum_type) { ++ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); ++ if (bch2_crc_cmp(bne->csum, csum)) { ++ bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), bne->csum, csum); ++ return false; ++ } ++ } ++ ++ written += vstruct_sectors(bne, c->block_bits); ++ } ++ } ++ ++ return true; ++} ++ ++static void btree_node_scrub_work(struct work_struct *work) ++{ ++ struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work); ++ struct bch_fs *c = scrub->c; ++ struct printbuf err = PRINTBUF; ++ ++ __bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level, ++ bkey_i_to_s_c(scrub->key.k)); ++ prt_newline(&err); ++ ++ if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) { ++ struct btree_trans *trans = bch2_trans_get(c); ++ ++ struct btree_iter iter; ++ bch2_trans_node_iter_init(trans, &iter, scrub->btree, ++ scrub->key.k->k.p, 0, scrub->level - 1, 0); ++ ++ struct btree *b; ++ int ret = lockrestart_do(trans, PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(&iter))); ++ if (ret) ++ goto err; ++ ++ if (bkey_i_to_btree_ptr_v2(&b->key)->v.seq == scrub->seq) { ++ bch_err(c, "error validating btree node during scrub on %s at btree %s", ++ scrub->ca->name, err.buf); ++ ++ ret = bch2_btree_node_rewrite(trans, &iter, b, 0); ++ } ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_begin(trans); ++ bch2_trans_put(trans); ++ } ++ ++ printbuf_exit(&err); ++ bch2_bkey_buf_exit(&scrub->key, c);; ++ btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf); ++ percpu_ref_put(&scrub->ca->io_ref); ++ kfree(scrub); ++ bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub); ++} ++ ++static void btree_node_scrub_endio(struct bio *bio) ++{ ++ struct btree_node_scrub *scrub = container_of(bio, struct btree_node_scrub, bio); ++ ++ queue_work(scrub->c->btree_read_complete_wq, &scrub->work); ++} ++ ++int bch2_btree_node_scrub(struct btree_trans *trans, ++ enum btree_id btree, unsigned level, ++ struct bkey_s_c k, unsigned dev) ++{ ++ if (k.k->type != KEY_TYPE_btree_ptr_v2) ++ return 0; ++ ++ struct bch_fs *c = trans->c; ++ ++ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_node_scrub)) ++ return -BCH_ERR_erofs_no_writes; ++ ++ struct extent_ptr_decoded pick; ++ int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev); ++ if (ret <= 0) ++ goto err; ++ ++ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); ++ if (!ca) { ++ ret = -BCH_ERR_device_offline; ++ goto err; ++ } ++ ++ bool used_mempool = false; ++ void *buf = btree_bounce_alloc(c, c->opts.btree_node_size, &used_mempool); ++ ++ unsigned vecs = buf_pages(buf, c->opts.btree_node_size); ++ ++ struct btree_node_scrub *scrub = ++ kzalloc(sizeof(*scrub) + sizeof(struct bio_vec) * vecs, GFP_KERNEL); ++ if (!scrub) { ++ ret = -ENOMEM; ++ goto err_free; ++ } ++ ++ scrub->c = c; ++ scrub->ca = ca; ++ scrub->buf = buf; ++ scrub->used_mempool = used_mempool; ++ scrub->written = btree_ptr_sectors_written(k); ++ ++ scrub->btree = btree; ++ scrub->level = level; ++ bch2_bkey_buf_init(&scrub->key); ++ bch2_bkey_buf_reassemble(&scrub->key, c, k); ++ scrub->seq = bkey_s_c_to_btree_ptr_v2(k).v->seq; ++ ++ INIT_WORK(&scrub->work, btree_node_scrub_work); ++ ++ bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ); ++ bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size); ++ scrub->bio.bi_iter.bi_sector = pick.ptr.offset; ++ scrub->bio.bi_end_io = btree_node_scrub_endio; ++ submit_bio(&scrub->bio); ++ return 0; ++err_free: ++ btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf); ++ percpu_ref_put(&ca->io_ref); ++err: ++ bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub); ++ return ret; ++} ++ + static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, + struct btree_write *w) + { +@@ -1831,7 +2016,7 @@ static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, + bch2_journal_pin_drop(&c->journal, &w->journal); + } + +-static void __btree_node_write_done(struct bch_fs *c, struct btree *b) ++static void __btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) + { + struct btree_write *w = btree_prev_write(b); + unsigned long old, new; +@@ -1839,6 +2024,9 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) + + bch2_btree_complete_write(c, b, w); + ++ if (start_time) ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_write], start_time); ++ + old = READ_ONCE(b->flags); + do { + new = old; +@@ -1869,7 +2057,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) + wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); + } + +-static void btree_node_write_done(struct bch_fs *c, struct btree *b) ++static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) + { + struct btree_trans *trans = bch2_trans_get(c); + +@@ -1877,7 +2065,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b) + + /* we don't need transaction context anymore after we got the lock. */ + bch2_trans_put(trans); +- __btree_node_write_done(c, b); ++ __btree_node_write_done(c, b, start_time); + six_unlock_read(&b->c.lock); + } + +@@ -1887,6 +2075,7 @@ static void btree_node_write_work(struct work_struct *work) + container_of(work, struct btree_write_bio, work); + struct bch_fs *c = wbio->wbio.c; + struct btree *b = wbio->wbio.bio.bi_private; ++ u64 start_time = wbio->start_time; + int ret = 0; + + btree_bounce_free(c, +@@ -1919,7 +2108,7 @@ static void btree_node_write_work(struct work_struct *work) + } + out: + bio_put(&wbio->wbio.bio); +- btree_node_write_done(c, b); ++ btree_node_write_done(c, b, start_time); + return; + err: + set_btree_node_noevict(b); +@@ -2023,6 +2212,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) + bool validate_before_checksum = false; + enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK; + void *data; ++ u64 start_time = local_clock(); + int ret; + + if (flags & BTREE_WRITE_ALREADY_STARTED) +@@ -2231,6 +2421,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) + wbio->data = data; + wbio->data_bytes = bytes; + wbio->sector_offset = b->written; ++ wbio->start_time = start_time; + wbio->wbio.c = c; + wbio->wbio.used_mempool = used_mempool; + wbio->wbio.first_btree_write = !b->written; +@@ -2258,7 +2449,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) + b->written += sectors_to_write; + nowrite: + btree_bounce_free(c, bytes, used_mempool, data); +- __btree_node_write_done(c, b); ++ __btree_node_write_done(c, b, 0); + } + + /* +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index 6f9e4a6dacf7..dbf76d22c660 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -52,6 +52,7 @@ struct btree_write_bio { + void *data; + unsigned data_bytes; + unsigned sector_offset; ++ u64 start_time; + struct bch_write_bio wbio; + }; + +@@ -132,6 +133,9 @@ void bch2_btree_node_read(struct btree_trans *, struct btree *, bool); + int bch2_btree_root_read(struct bch_fs *, enum btree_id, + const struct bkey_i *, unsigned); + ++int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned, ++ struct bkey_s_c, unsigned); ++ + bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); + + enum btree_write_flags { +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index f4aeadbe53c1..ab111fec1701 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -2189,6 +2189,26 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, + goto out; + } + ++int bch2_btree_node_rewrite_key(struct btree_trans *trans, ++ enum btree_id btree, unsigned level, ++ struct bpos pos, unsigned flags) ++{ ++ BUG_ON(!level); ++ ++ /* Traverse one depth lower to get a pointer to the node itself: */ ++ struct btree_iter iter; ++ bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0); ++ struct btree *b = bch2_btree_iter_peek_node(&iter); ++ int ret = PTR_ERR_OR_ZERO(b); ++ if (ret) ++ goto err; ++ ++ ret = bch2_btree_node_rewrite(trans, &iter, b, flags); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ + struct async_btree_rewrite { + struct bch_fs *c; + struct work_struct work; +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index 7930ffea3075..fa5a88f95d89 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -169,7 +169,11 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, + + int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, + struct btree *, unsigned); ++int bch2_btree_node_rewrite_key(struct btree_trans *, ++ enum btree_id, unsigned, ++ struct bpos, unsigned); + void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); ++ + int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, + struct btree *, struct bkey_i *, + unsigned, bool); +diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c +index 46e9e32105a9..57d55b3ddc71 100644 +--- a/fs/bcachefs/chardev.c ++++ b/fs/bcachefs/chardev.c +@@ -11,6 +11,7 @@ + #include "move.h" + #include "recovery_passes.h" + #include "replicas.h" ++#include "sb-counters.h" + #include "super-io.h" + #include "thread_with_file.h" + +@@ -312,7 +313,12 @@ static int bch2_data_thread(void *arg) + struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr); + + ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); +- ctx->stats.data_type = U8_MAX; ++ if (ctx->thr.ret == -BCH_ERR_device_offline) ++ ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline; ++ else { ++ ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done; ++ ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done; ++ } + return 0; + } + +@@ -331,14 +337,30 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, + struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); + struct bch_fs *c = ctx->c; + struct bch_ioctl_data_event e = { +- .type = BCH_DATA_EVENT_PROGRESS, +- .p.data_type = ctx->stats.data_type, +- .p.btree_id = ctx->stats.pos.btree, +- .p.pos = ctx->stats.pos.pos, +- .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), +- .p.sectors_total = bch2_fs_usage_read_short(c).used, ++ .type = BCH_DATA_EVENT_PROGRESS, ++ .ret = ctx->stats.ret, ++ .p.data_type = ctx->stats.data_type, ++ .p.btree_id = ctx->stats.pos.btree, ++ .p.pos = ctx->stats.pos.pos, ++ .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), ++ .p.sectors_error_corrected = atomic64_read(&ctx->stats.sectors_error_corrected), ++ .p.sectors_error_uncorrected = atomic64_read(&ctx->stats.sectors_error_uncorrected), + }; + ++ if (ctx->arg.op == BCH_DATA_OP_scrub) { ++ struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev); ++ if (ca) { ++ struct bch_dev_usage u; ++ bch2_dev_usage_read_fast(ca, &u); ++ for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++) ++ if (ctx->arg.scrub.data_types & BIT(i)) ++ e.p.sectors_total += u.d[i].sectors; ++ bch2_dev_put(ca); ++ } ++ } else { ++ e.p.sectors_total = bch2_fs_usage_read_short(c).used; ++ } ++ + if (len < sizeof(e)) + return -EINVAL; + +@@ -710,6 +732,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) + BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online); + case BCH_IOCTL_QUERY_ACCOUNTING: + return bch2_ioctl_query_accounting(c, arg); ++ case BCH_IOCTL_QUERY_COUNTERS: ++ return bch2_ioctl_query_counters(c, arg); + default: + return -ENOTTY; + } +diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c +index 1d6b691e8da6..1f8e035d7119 100644 +--- a/fs/bcachefs/clock.c ++++ b/fs/bcachefs/clock.c +@@ -14,21 +14,13 @@ static inline bool io_timer_cmp(const void *l, const void *r, void __always_unus + return (*_l)->expire < (*_r)->expire; + } + +-static inline void io_timer_swp(void *l, void *r, void __always_unused *args) +-{ +- struct io_timer **_l = (struct io_timer **)l; +- struct io_timer **_r = (struct io_timer **)r; +- +- swap(*_l, *_r); +-} ++static const struct min_heap_callbacks callbacks = { ++ .less = io_timer_cmp, ++ .swp = NULL, ++}; + + void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) + { +- const struct min_heap_callbacks callbacks = { +- .less = io_timer_cmp, +- .swp = io_timer_swp, +- }; +- + spin_lock(&clock->timer_lock); + + if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) { +@@ -48,11 +40,6 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) + + void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) + { +- const struct min_heap_callbacks callbacks = { +- .less = io_timer_cmp, +- .swp = io_timer_swp, +- }; +- + spin_lock(&clock->timer_lock); + + for (size_t i = 0; i < clock->timers.nr; i++) +@@ -142,10 +129,6 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, + static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now) + { + struct io_timer *ret = NULL; +- const struct min_heap_callbacks callbacks = { +- .less = io_timer_cmp, +- .swp = io_timer_swp, +- }; + + if (clock->timers.nr && + time_after_eq64(now, clock->timers.data[0]->expire)) { +diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c +index 337494facac6..c66ef8a1b5f2 100644 +--- a/fs/bcachefs/data_update.c ++++ b/fs/bcachefs/data_update.c +@@ -20,6 +20,8 @@ + #include "subvolume.h" + #include "trace.h" + ++#include ++ + static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k) + { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +@@ -33,7 +35,7 @@ static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k) + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) { +- if (!bch2_dev_tryget(c, ptr->dev)) { ++ if (unlikely(!bch2_dev_tryget(c, ptr->dev))) { + bkey_for_each_ptr(ptrs, ptr2) { + if (ptr2 == ptr) + break; +@@ -91,7 +93,7 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc + return true; + } + +-static noinline void trace_move_extent_finish2(struct data_update *u, ++static noinline void trace_io_move_finish2(struct data_update *u, + struct bkey_i *new, + struct bkey_i *insert) + { +@@ -111,11 +113,11 @@ static noinline void trace_move_extent_finish2(struct data_update *u, + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + prt_newline(&buf); + +- trace_move_extent_finish(c, buf.buf); ++ trace_io_move_finish(c, buf.buf); + printbuf_exit(&buf); + } + +-static void trace_move_extent_fail2(struct data_update *m, ++static void trace_io_move_fail2(struct data_update *m, + struct bkey_s_c new, + struct bkey_s_c wrote, + struct bkey_i *insert, +@@ -126,7 +128,7 @@ static void trace_move_extent_fail2(struct data_update *m, + struct printbuf buf = PRINTBUF; + unsigned rewrites_found = 0; + +- if (!trace_move_extent_fail_enabled()) ++ if (!trace_io_move_fail_enabled()) + return; + + prt_str(&buf, msg); +@@ -166,7 +168,7 @@ static void trace_move_extent_fail2(struct data_update *m, + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + } + +- trace_move_extent_fail(c, buf.buf); ++ trace_io_move_fail(c, buf.buf); + printbuf_exit(&buf); + } + +@@ -214,7 +216,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, + new = bkey_i_to_extent(bch2_keylist_front(keys)); + + if (!bch2_extents_match(k, old)) { +- trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), ++ trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), + NULL, "no match:"); + goto nowork; + } +@@ -254,7 +256,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, + if (m->data_opts.rewrite_ptrs && + !rewrites_found && + bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) { +- trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:"); ++ trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:"); + goto nowork; + } + +@@ -271,7 +273,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, + } + + if (!bkey_val_u64s(&new->k)) { +- trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:"); ++ trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:"); + goto nowork; + } + +@@ -384,9 +386,9 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, + if (!ret) { + bch2_btree_iter_set_pos(&iter, next_pos); + +- this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); +- if (trace_move_extent_finish_enabled()) +- trace_move_extent_finish2(m, &new->k_i, insert); ++ this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size); ++ if (trace_io_move_finish_enabled()) ++ trace_io_move_finish2(m, &new->k_i, insert); + } + err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) +@@ -408,7 +410,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, + &m->stats->sectors_raced); + } + +- count_event(c, move_extent_fail); ++ count_event(c, io_move_fail); + + bch2_btree_iter_advance(&iter); + goto next; +@@ -426,14 +428,17 @@ int bch2_data_update_index_update(struct bch_write_op *op) + return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op)); + } + +-void bch2_data_update_read_done(struct data_update *m, +- struct bch_extent_crc_unpacked crc) ++void bch2_data_update_read_done(struct data_update *m) + { ++ m->read_done = true; ++ + /* write bio must own pages: */ + BUG_ON(!m->op.wbio.bio.bi_vcnt); + +- m->op.crc = crc; +- m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9; ++ m->op.crc = m->rbio.pick.crc; ++ m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; ++ ++ this_cpu_add(m->op.c->counters[BCH_COUNTER_io_move_write], m->k.k->k.size); + + closure_call(&m->op.cl, bch2_write, NULL, NULL); + } +@@ -443,31 +448,34 @@ void bch2_data_update_exit(struct data_update *update) + struct bch_fs *c = update->op.c; + struct bkey_s_c k = bkey_i_to_s_c(update->k.k); + ++ bch2_bio_free_pages_pool(c, &update->op.wbio.bio); ++ kfree(update->bvecs); ++ update->bvecs = NULL; ++ + if (c->opts.nocow_enabled) + bkey_nocow_unlock(c, k); + bkey_put_dev_refs(c, k); +- bch2_bkey_buf_exit(&update->k, c); + bch2_disk_reservation_put(c, &update->op.res); +- bch2_bio_free_pages_pool(c, &update->op.wbio.bio); ++ bch2_bkey_buf_exit(&update->k, c); + } + +-static void bch2_update_unwritten_extent(struct btree_trans *trans, +- struct data_update *update) ++static int bch2_update_unwritten_extent(struct btree_trans *trans, ++ struct data_update *update) + { + struct bch_fs *c = update->op.c; +- struct bio *bio = &update->op.wbio.bio; + struct bkey_i_extent *e; + struct write_point *wp; + struct closure cl; + struct btree_iter iter; + struct bkey_s_c k; +- int ret; ++ int ret = 0; + + closure_init_stack(&cl); + bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys); + +- while (bio_sectors(bio)) { +- unsigned sectors = bio_sectors(bio); ++ while (bpos_lt(update->op.pos, update->k.k->k.p)) { ++ unsigned sectors = update->k.k->k.p.offset - ++ update->op.pos.offset; + + bch2_trans_begin(trans); + +@@ -503,7 +511,7 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, + bch_err_fn_ratelimited(c, ret); + + if (ret) +- return; ++ break; + + sectors = min(sectors, wp->sectors_free); + +@@ -513,7 +521,6 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, + bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); + bch2_alloc_sectors_done(c, wp); + +- bio_advance(bio, sectors << 9); + update->op.pos.offset += sectors; + + extent_for_each_ptr(extent_i_to_s(e), ptr) +@@ -532,13 +539,16 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, + bch2_trans_unlock(trans); + closure_sync(&cl); + } ++ ++ return ret; + } + + void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) + { +- printbuf_tabstop_push(out, 20); ++ if (!out->nr_tabstops) ++ printbuf_tabstop_push(out, 20); + + prt_str_indented(out, "rewrite ptrs:\t"); + bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); +@@ -562,6 +572,7 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, + + prt_str_indented(out, "extra replicas:\t"); + prt_u64(out, data_opts->extra_replicas); ++ prt_newline(out); + } + + void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) +@@ -573,6 +584,17 @@ void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) + bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); + } + ++void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m) ++{ ++ bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); ++ prt_newline(out); ++ printbuf_indent_add(out, 2); ++ bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); ++ prt_printf(out, "read_done:\t\%u\n", m->read_done); ++ bch2_write_op_to_text(out, &m->op); ++ printbuf_indent_sub(out, 2); ++} ++ + int bch2_extent_drop_ptrs(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, +@@ -616,12 +638,80 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + } + ++static bool can_allocate_without_blocking(struct bch_fs *c, ++ struct data_update *m) ++{ ++ if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark))) ++ return false; ++ ++ unsigned target = m->op.flags & BCH_WRITE_only_specified_devs ++ ? m->op.target ++ : 0; ++ struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target); ++ ++ darray_for_each(m->op.devs_have, i) ++ __clear_bit(*i, devs.d); ++ ++ rcu_read_lock(); ++ unsigned nr_replicas = 0, i; ++ for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { ++ struct bch_dev *ca = bch2_dev_rcu(c, i); ++ ++ struct bch_dev_usage usage; ++ bch2_dev_usage_read_fast(ca, &usage); ++ ++ if (!dev_buckets_free(ca, usage, m->op.watermark)) ++ continue; ++ ++ nr_replicas += ca->mi.durability; ++ if (nr_replicas >= m->op.nr_replicas) ++ break; ++ } ++ rcu_read_unlock(); ++ ++ return nr_replicas >= m->op.nr_replicas; ++} ++ ++int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, ++ struct bch_io_opts *io_opts) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ /* write path might have to decompress data: */ ++ unsigned buf_bytes = 0; ++ bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry) ++ buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); ++ ++ unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE); ++ ++ m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL); ++ if (!m->bvecs) ++ return -ENOMEM; ++ ++ bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ); ++ bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0); ++ ++ if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) { ++ kfree(m->bvecs); ++ m->bvecs = NULL; ++ return -ENOMEM; ++ } ++ ++ rbio_init(&m->rbio.bio, c, *io_opts, NULL); ++ m->rbio.bio.bi_iter.bi_size = buf_bytes; ++ m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k); ++ m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); ++ return 0; ++} ++ + int bch2_data_update_init(struct btree_trans *trans, + struct btree_iter *iter, + struct moving_context *ctxt, + struct data_update *m, + struct write_point_specifier wp, +- struct bch_io_opts io_opts, ++ struct bch_io_opts *io_opts, + struct data_update_opts data_opts, + enum btree_id btree_id, + struct bkey_s_c k) +@@ -639,16 +729,7 @@ int bch2_data_update_init(struct btree_trans *trans, + * snapshots table - just skip it, we can move it later. + */ + if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot))) +- return -BCH_ERR_data_update_done; +- +- if (!bkey_get_dev_refs(c, k)) +- return -BCH_ERR_data_update_done; +- +- if (c->opts.nocow_enabled && +- !bkey_nocow_lock(c, ctxt, k)) { +- bkey_put_dev_refs(c, k); +- return -BCH_ERR_nocow_lock_blocked; +- } ++ return -BCH_ERR_data_update_done_no_snapshot; + + bch2_bkey_buf_init(&m->k); + bch2_bkey_buf_reassemble(&m->k, c, k); +@@ -657,18 +738,18 @@ int bch2_data_update_init(struct btree_trans *trans, + m->ctxt = ctxt; + m->stats = ctxt ? ctxt->stats : NULL; + +- bch2_write_op_init(&m->op, c, io_opts); ++ bch2_write_op_init(&m->op, c, *io_opts); + m->op.pos = bkey_start_pos(k.k); + m->op.version = k.k->bversion; + m->op.target = data_opts.target; + m->op.write_point = wp; + m->op.nr_replicas = 0; +- m->op.flags |= BCH_WRITE_PAGES_STABLE| +- BCH_WRITE_PAGES_OWNED| +- BCH_WRITE_DATA_ENCODED| +- BCH_WRITE_MOVE| ++ m->op.flags |= BCH_WRITE_pages_stable| ++ BCH_WRITE_pages_owned| ++ BCH_WRITE_data_encoded| ++ BCH_WRITE_move| + m->data_opts.write_flags; +- m->op.compression_opt = io_opts.background_compression; ++ m->op.compression_opt = io_opts->background_compression; + m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; + + unsigned durability_have = 0, durability_removing = 0; +@@ -706,7 +787,7 @@ int bch2_data_update_init(struct btree_trans *trans, + ptr_bit <<= 1; + } + +- unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have)); ++ unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have)); + + /* + * If current extent durability is less than io_opts.data_replicas, +@@ -739,8 +820,16 @@ int bch2_data_update_init(struct btree_trans *trans, + m->data_opts.rewrite_ptrs = 0; + /* if iter == NULL, it's just a promote */ + if (iter) +- ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts); +- goto out; ++ ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts); ++ if (!ret) ++ ret = -BCH_ERR_data_update_done_no_writes_needed; ++ goto out_bkey_buf_exit; ++ } ++ ++ if ((m->op.flags & BCH_WRITE_alloc_nowait) && ++ !can_allocate_without_blocking(c, m)) { ++ ret = -BCH_ERR_data_update_done_would_block; ++ goto out_bkey_buf_exit; + } + + if (reserve_sectors) { +@@ -749,18 +838,41 @@ int bch2_data_update_init(struct btree_trans *trans, + ? 0 + : BCH_DISK_RESERVATION_NOFAIL); + if (ret) +- goto out; ++ goto out_bkey_buf_exit; ++ } ++ ++ if (!bkey_get_dev_refs(c, k)) { ++ ret = -BCH_ERR_data_update_done_no_dev_refs; ++ goto out_put_disk_res; ++ } ++ ++ if (c->opts.nocow_enabled && ++ !bkey_nocow_lock(c, ctxt, k)) { ++ ret = -BCH_ERR_nocow_lock_blocked; ++ goto out_put_dev_refs; + } + + if (bkey_extent_is_unwritten(k)) { +- bch2_update_unwritten_extent(trans, m); +- goto out; ++ ret = bch2_update_unwritten_extent(trans, m) ?: ++ -BCH_ERR_data_update_done_unwritten; ++ goto out_nocow_unlock; + } + ++ ret = bch2_data_update_bios_init(m, c, io_opts); ++ if (ret) ++ goto out_nocow_unlock; ++ + return 0; +-out: +- bch2_data_update_exit(m); +- return ret ?: -BCH_ERR_data_update_done; ++out_nocow_unlock: ++ if (c->opts.nocow_enabled) ++ bkey_nocow_unlock(c, k); ++out_put_dev_refs: ++ bkey_put_dev_refs(c, k); ++out_put_disk_res: ++ bch2_disk_reservation_put(c, &m->op.res); ++out_bkey_buf_exit: ++ bch2_bkey_buf_exit(&m->k, c); ++ return ret; + } + + void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) +diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h +index e4b50723428e..c194cbbf5b51 100644 +--- a/fs/bcachefs/data_update.h ++++ b/fs/bcachefs/data_update.h +@@ -4,6 +4,7 @@ + #define _BCACHEFS_DATA_UPDATE_H + + #include "bkey_buf.h" ++#include "io_read.h" + #include "io_write_types.h" + + struct moving_context; +@@ -15,6 +16,9 @@ struct data_update_opts { + u8 extra_replicas; + unsigned btree_insert_flags; + unsigned write_flags; ++ ++ int read_dev; ++ bool scrub; + }; + + void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, +@@ -22,20 +26,24 @@ void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, + + struct data_update { + /* extent being updated: */ ++ bool read_done; + enum btree_id btree_id; + struct bkey_buf k; + struct data_update_opts data_opts; + struct moving_context *ctxt; + struct bch_move_stats *stats; ++ ++ struct bch_read_bio rbio; + struct bch_write_op op; ++ struct bio_vec *bvecs; + }; + + void bch2_data_update_to_text(struct printbuf *, struct data_update *); ++void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *); + + int bch2_data_update_index_update(struct bch_write_op *); + +-void bch2_data_update_read_done(struct data_update *, +- struct bch_extent_crc_unpacked); ++void bch2_data_update_read_done(struct data_update *); + + int bch2_extent_drop_ptrs(struct btree_trans *, + struct btree_iter *, +@@ -43,12 +51,15 @@ int bch2_extent_drop_ptrs(struct btree_trans *, + struct bch_io_opts *, + struct data_update_opts *); + ++int bch2_data_update_bios_init(struct data_update *, struct bch_fs *, ++ struct bch_io_opts *); ++ + void bch2_data_update_exit(struct data_update *); + int bch2_data_update_init(struct btree_trans *, struct btree_iter *, + struct moving_context *, + struct data_update *, + struct write_point_specifier, +- struct bch_io_opts, struct data_update_opts, ++ struct bch_io_opts *, struct data_update_opts, + enum btree_id, struct bkey_s_c); + void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *); + +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +index 55333e82d1fe..788af88f6979 100644 +--- a/fs/bcachefs/debug.c ++++ b/fs/bcachefs/debug.c +@@ -7,6 +7,7 @@ + */ + + #include "bcachefs.h" ++#include "alloc_foreground.h" + #include "bkey_methods.h" + #include "btree_cache.h" + #include "btree_io.h" +@@ -190,7 +191,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, + unsigned offset = 0; + int ret; + +- if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) { ++ if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick, -1) <= 0) { + prt_printf(out, "error getting device to read from: invalid device\n"); + return; + } +@@ -844,8 +845,11 @@ static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c) + seqmutex_unlock(&c->btree_trans_lock); + } + +-static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, +- size_t size, loff_t *ppos) ++typedef void (*fs_to_text_fn)(struct printbuf *, struct bch_fs *); ++ ++static ssize_t bch2_simple_print(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos, ++ fs_to_text_fn fn) + { + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; +@@ -856,7 +860,7 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, + i->ret = 0; + + if (!i->iter) { +- btree_deadlock_to_text(&i->buf, c); ++ fn(&i->buf, c); + i->iter++; + } + +@@ -869,6 +873,12 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, + return ret ?: i->ret; + } + ++static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ return bch2_simple_print(file, buf, size, ppos, btree_deadlock_to_text); ++} ++ + static const struct file_operations btree_deadlock_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, +@@ -876,6 +886,19 @@ static const struct file_operations btree_deadlock_ops = { + .read = bch2_btree_deadlock_read, + }; + ++static ssize_t bch2_write_points_read(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ return bch2_simple_print(file, buf, size, ppos, bch2_write_points_to_text); ++} ++ ++static const struct file_operations write_points_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_write_points_read, ++}; ++ + void bch2_fs_debug_exit(struct bch_fs *c) + { + if (!IS_ERR_OR_NULL(c->fs_debug_dir)) +@@ -927,6 +950,9 @@ void bch2_fs_debug_init(struct bch_fs *c) + debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir, + c->btree_debug, &btree_deadlock_ops); + ++ debugfs_create_file("write_points", 0400, c->fs_debug_dir, ++ c->btree_debug, &write_points_ops); ++ + c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); + if (IS_ERR_OR_NULL(c->btree_debug_dir)) + return; +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index b211e90ac54e..1aa56d28de33 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -1056,6 +1056,11 @@ static inline void ec_stripes_heap_swap(void *l, void *r, void *h) + ec_stripes_heap_set_backpointer(_h, j); + } + ++static const struct min_heap_callbacks callbacks = { ++ .less = ec_stripes_heap_cmp, ++ .swp = ec_stripes_heap_swap, ++}; ++ + static void heap_verify_backpointer(struct bch_fs *c, size_t idx) + { + ec_stripes_heap *h = &c->ec_stripes_heap; +@@ -1068,11 +1073,6 @@ static void heap_verify_backpointer(struct bch_fs *c, size_t idx) + void bch2_stripes_heap_del(struct bch_fs *c, + struct stripe *m, size_t idx) + { +- const struct min_heap_callbacks callbacks = { +- .less = ec_stripes_heap_cmp, +- .swp = ec_stripes_heap_swap, +- }; +- + mutex_lock(&c->ec_stripes_heap_lock); + heap_verify_backpointer(c, idx); + +@@ -1083,11 +1083,6 @@ void bch2_stripes_heap_del(struct bch_fs *c, + void bch2_stripes_heap_insert(struct bch_fs *c, + struct stripe *m, size_t idx) + { +- const struct min_heap_callbacks callbacks = { +- .less = ec_stripes_heap_cmp, +- .swp = ec_stripes_heap_swap, +- }; +- + mutex_lock(&c->ec_stripes_heap_lock); + BUG_ON(min_heap_full(&c->ec_stripes_heap)); + +@@ -1106,10 +1101,6 @@ void bch2_stripes_heap_insert(struct bch_fs *c, + void bch2_stripes_heap_update(struct bch_fs *c, + struct stripe *m, size_t idx) + { +- const struct min_heap_callbacks callbacks = { +- .less = ec_stripes_heap_cmp, +- .swp = ec_stripes_heap_swap, +- }; + ec_stripes_heap *h = &c->ec_stripes_heap; + bool do_deletes; + size_t i; +@@ -1389,8 +1380,12 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b + if (bp_k.k->type != KEY_TYPE_backpointer) + continue; + ++ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); ++ if (bp.v->btree_id == BTREE_ID_stripes) ++ continue; ++ + ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, +- bkey_s_c_to_backpointer(bp_k), &last_flushed); ++ bp, &last_flushed); + })); + + bch2_bkey_buf_exit(&last_flushed, c); +diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h +index 4590cd0c7c90..89df97810076 100644 +--- a/fs/bcachefs/errcode.h ++++ b/fs/bcachefs/errcode.h +@@ -180,6 +180,11 @@ + x(EINVAL, not_in_recovery) \ + x(EINVAL, cannot_rewind_recovery) \ + x(0, data_update_done) \ ++ x(BCH_ERR_data_update_done, data_update_done_would_block) \ ++ x(BCH_ERR_data_update_done, data_update_done_unwritten) \ ++ x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \ ++ x(BCH_ERR_data_update_done, data_update_done_no_snapshot) \ ++ x(BCH_ERR_data_update_done, data_update_done_no_dev_refs) \ + x(EINVAL, device_state_not_allowed) \ + x(EINVAL, member_info_missing) \ + x(EINVAL, mismatched_block_size) \ +@@ -269,6 +274,7 @@ + x(EIO, invalidate_stripe_to_dev) \ + x(EIO, no_encryption_key) \ + x(EIO, insufficient_journal_devices) \ ++ x(EIO, device_offline) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ +diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c +index 038da6a61f6b..c8fc58fab958 100644 +--- a/fs/bcachefs/error.c ++++ b/fs/bcachefs/error.c +@@ -530,35 +530,53 @@ void bch2_flush_fsck_errs(struct bch_fs *c) + mutex_unlock(&c->fsck_error_msgs_lock); + } + +-int bch2_inum_err_msg_trans(struct btree_trans *trans, struct printbuf *out, subvol_inum inum) ++int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, ++ subvol_inum inum, u64 offset) + { + u32 restart_count = trans->restart_count; + int ret = 0; + +- /* XXX: we don't yet attempt to print paths when we don't know the subvol */ +- if (inum.subvol) +- ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out)); ++ if (inum.subvol) { ++ ret = bch2_inum_to_path(trans, inum, out); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ return ret; ++ } + if (!inum.subvol || ret) + prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum); ++ prt_printf(out, " offset %llu: ", offset); + + return trans_was_restarted(trans, restart_count); + } + +-int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, +- subvol_inum inum, u64 offset) ++void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, ++ subvol_inum inum, u64 offset) + { +- int ret = bch2_inum_err_msg_trans(trans, out, inum); +- prt_printf(out, " offset %llu: ", offset); +- return ret; ++ bch2_trans_do(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); + } + +-void bch2_inum_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum) ++int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, ++ struct bpos pos) + { +- bch2_trans_run(c, bch2_inum_err_msg_trans(trans, out, inum)); +-} ++ struct bch_fs *c = trans->c; ++ int ret = 0; + +-void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, +- subvol_inum inum, u64 offset) +-{ +- bch2_trans_run(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); ++ if (!bch2_snapshot_is_leaf(c, pos.snapshot)) ++ prt_str(out, "(multiple snapshots) "); ++ ++ subvol_inum inum = { ++ .subvol = bch2_snapshot_tree_oldest_subvol(c, pos.snapshot), ++ .inum = pos.inode, ++ }; ++ ++ if (inum.subvol) { ++ ret = bch2_inum_to_path(trans, inum, out); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ return ret; ++ } ++ ++ if (!inum.subvol || ret) ++ prt_printf(out, "inum %llu:%u", pos.inode, pos.snapshot); ++ ++ prt_printf(out, " offset %llu: ", pos.offset << 8); ++ return 0; + } +diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h +index 7acf2a27ca28..76da0e88cee8 100644 +--- a/fs/bcachefs/error.h ++++ b/fs/bcachefs/error.h +@@ -238,10 +238,10 @@ void bch2_io_error(struct bch_dev *, enum bch_member_error_type); + _ret; \ + }) + +-int bch2_inum_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum); + int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64); + +-void bch2_inum_err_msg(struct bch_fs *, struct printbuf *, subvol_inum); + void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64); + ++int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos); ++ + #endif /* _BCACHEFS_ERROR_H */ +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 05d5f71a7ca9..78a51d96bd2d 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -114,8 +114,9 @@ static inline bool ptr_better(struct bch_fs *c, + * other devices, it will still pick a pointer from avoid. + */ + int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, +- struct bch_io_failures *failed, +- struct extent_ptr_decoded *pick) ++ struct bch_io_failures *failed, ++ struct extent_ptr_decoded *pick, ++ int dev) + { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; +@@ -137,6 +138,10 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, + break; + } + ++ /* Are we being asked to read from a specific device? */ ++ if (dev >= 0 && p.ptr.dev != dev) ++ continue; ++ + /* + * If there are any dirty pointers it's an error if we can't + * read: +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index 620b284aa34f..8fae6b23a341 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -404,7 +404,7 @@ void bch2_mark_io_failure(struct bch_io_failures *, + struct extent_ptr_decoded *); + int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, + struct bch_io_failures *, +- struct extent_ptr_decoded *); ++ struct extent_ptr_decoded *, int); + + /* KEY_TYPE_btree_ptr: */ + +diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c +index 2eaffe37b5e7..0e742555cb0a 100644 +--- a/fs/bcachefs/eytzinger.c ++++ b/fs/bcachefs/eytzinger.c +@@ -148,89 +148,99 @@ static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *pr + return cmp(a, b, priv); + } + +-static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size, ++static inline int eytzinger1_do_cmp(void *base1, size_t n, size_t size, + cmp_r_func_t cmp_func, const void *priv, + size_t l, size_t r) + { +- return do_cmp(base + inorder_to_eytzinger0(l, n) * size, +- base + inorder_to_eytzinger0(r, n) * size, ++ return do_cmp(base1 + inorder_to_eytzinger1(l, n) * size, ++ base1 + inorder_to_eytzinger1(r, n) * size, + cmp_func, priv); + } + +-static inline void eytzinger0_do_swap(void *base, size_t n, size_t size, ++static inline void eytzinger1_do_swap(void *base1, size_t n, size_t size, + swap_r_func_t swap_func, const void *priv, + size_t l, size_t r) + { +- do_swap(base + inorder_to_eytzinger0(l, n) * size, +- base + inorder_to_eytzinger0(r, n) * size, ++ do_swap(base1 + inorder_to_eytzinger1(l, n) * size, ++ base1 + inorder_to_eytzinger1(r, n) * size, + size, swap_func, priv); + } + +-void eytzinger0_sort_r(void *base, size_t n, size_t size, +- cmp_r_func_t cmp_func, +- swap_r_func_t swap_func, +- const void *priv) ++static void eytzinger1_sort_r(void *base1, size_t n, size_t size, ++ cmp_r_func_t cmp_func, ++ swap_r_func_t swap_func, ++ const void *priv) + { +- int i, j, k; ++ unsigned i, j, k; + + /* called from 'sort' without swap function, let's pick the default */ + if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func) + swap_func = NULL; + + if (!swap_func) { +- if (is_aligned(base, size, 8)) ++ if (is_aligned(base1, size, 8)) + swap_func = SWAP_WORDS_64; +- else if (is_aligned(base, size, 4)) ++ else if (is_aligned(base1, size, 4)) + swap_func = SWAP_WORDS_32; + else + swap_func = SWAP_BYTES; + } + + /* heapify */ +- for (i = n / 2 - 1; i >= 0; --i) { ++ for (i = n / 2; i >= 1; --i) { + /* Find the sift-down path all the way to the leaves. */ +- for (j = i; k = j * 2 + 1, k + 1 < n;) +- j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; ++ for (j = i; k = j * 2, k < n;) ++ j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; + + /* Special case for the last leaf with no sibling. */ +- if (j * 2 + 2 == n) +- j = j * 2 + 1; ++ if (j * 2 == n) ++ j *= 2; + + /* Backtrack to the correct location. */ +- while (j != i && eytzinger0_do_cmp(base, n, size, cmp_func, priv, i, j) >= 0) +- j = (j - 1) / 2; ++ while (j != i && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, i, j) >= 0) ++ j /= 2; + + /* Shift the element into its correct place. */ + for (k = j; j != i;) { +- j = (j - 1) / 2; +- eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); ++ j /= 2; ++ eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k); + } + } + + /* sort */ +- for (i = n - 1; i > 0; --i) { +- eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i); ++ for (i = n; i > 1; --i) { ++ eytzinger1_do_swap(base1, n, size, swap_func, priv, 1, i); + + /* Find the sift-down path all the way to the leaves. */ +- for (j = 0; k = j * 2 + 1, k + 1 < i;) +- j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; ++ for (j = 1; k = j * 2, k + 1 < i;) ++ j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; + + /* Special case for the last leaf with no sibling. */ +- if (j * 2 + 2 == i) +- j = j * 2 + 1; ++ if (j * 2 + 1 == i) ++ j *= 2; + + /* Backtrack to the correct location. */ +- while (j && eytzinger0_do_cmp(base, n, size, cmp_func, priv, 0, j) >= 0) +- j = (j - 1) / 2; ++ while (j >= 1 && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, 1, j) >= 0) ++ j /= 2; + + /* Shift the element into its correct place. */ +- for (k = j; j;) { +- j = (j - 1) / 2; +- eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); ++ for (k = j; j > 1;) { ++ j /= 2; ++ eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k); + } + } + } + ++void eytzinger0_sort_r(void *base, size_t n, size_t size, ++ cmp_r_func_t cmp_func, ++ swap_r_func_t swap_func, ++ const void *priv) ++{ ++ void *base1 = base - size; ++ ++ return eytzinger1_sort_r(base1, n, size, cmp_func, swap_func, priv); ++} ++ + void eytzinger0_sort(void *base, size_t n, size_t size, + cmp_func_t cmp_func, + swap_func_t swap_func) +diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h +index 0541192d7bc0..643c1f716061 100644 +--- a/fs/bcachefs/eytzinger.h ++++ b/fs/bcachefs/eytzinger.h +@@ -6,6 +6,7 @@ + #include + + #ifdef EYTZINGER_DEBUG ++#include + #define EYTZINGER_BUG_ON(cond) BUG_ON(cond) + #else + #define EYTZINGER_BUG_ON(cond) +@@ -56,24 +57,14 @@ static inline unsigned eytzinger1_last(unsigned size) + return rounddown_pow_of_two(size + 1) - 1; + } + +-/* +- * eytzinger1_next() and eytzinger1_prev() have the nice properties that +- * +- * eytzinger1_next(0) == eytzinger1_first()) +- * eytzinger1_prev(0) == eytzinger1_last()) +- * +- * eytzinger1_prev(eytzinger1_first()) == 0 +- * eytzinger1_next(eytzinger1_last()) == 0 +- */ +- + static inline unsigned eytzinger1_next(unsigned i, unsigned size) + { +- EYTZINGER_BUG_ON(i > size); ++ EYTZINGER_BUG_ON(i == 0 || i > size); + + if (eytzinger1_right_child(i) <= size) { + i = eytzinger1_right_child(i); + +- i <<= __fls(size + 1) - __fls(i); ++ i <<= __fls(size) - __fls(i); + i >>= i > size; + } else { + i >>= ffz(i) + 1; +@@ -84,12 +75,12 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size) + + static inline unsigned eytzinger1_prev(unsigned i, unsigned size) + { +- EYTZINGER_BUG_ON(i > size); ++ EYTZINGER_BUG_ON(i == 0 || i > size); + + if (eytzinger1_left_child(i) <= size) { + i = eytzinger1_left_child(i) + 1; + +- i <<= __fls(size + 1) - __fls(i); ++ i <<= __fls(size) - __fls(i); + i -= 1; + i >>= i > size; + } else { +@@ -243,73 +234,63 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) + (_i) != -1; \ + (_i) = eytzinger0_next((_i), (_size))) + ++#define eytzinger0_for_each_prev(_i, _size) \ ++ for (unsigned (_i) = eytzinger0_last((_size)); \ ++ (_i) != -1; \ ++ (_i) = eytzinger0_prev((_i), (_size))) ++ + /* return greatest node <= @search, or -1 if not found */ + static inline int eytzinger0_find_le(void *base, size_t nr, size_t size, + cmp_func_t cmp, const void *search) + { +- unsigned i, n = 0; +- +- if (!nr) +- return -1; +- +- do { +- i = n; +- n = eytzinger0_child(i, cmp(base + i * size, search) <= 0); +- } while (n < nr); +- +- if (n & 1) { +- /* +- * @i was greater than @search, return previous node: +- * +- * if @i was leftmost/smallest element, +- * eytzinger0_prev(eytzinger0_first())) returns -1, as expected +- */ +- return eytzinger0_prev(i, nr); +- } else { +- return i; +- } ++ void *base1 = base - size; ++ unsigned n = 1; ++ ++ while (n <= nr) ++ n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0); ++ n >>= __ffs(n) + 1; ++ return n - 1; + } + ++/* return smallest node > @search, or -1 if not found */ + static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size, + cmp_func_t cmp, const void *search) + { +- ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); ++ void *base1 = base - size; ++ unsigned n = 1; + +- /* +- * if eytitzinger0_find_le() returned -1 - no element was <= search - we +- * want to return the first element; next/prev identities mean this work +- * as expected +- * +- * similarly if find_le() returns last element, we should return -1; +- * identities mean this all works out: +- */ +- return eytzinger0_next(idx, nr); ++ while (n <= nr) ++ n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0); ++ n >>= __ffs(n + 1) + 1; ++ return n - 1; + } + ++/* return smallest node >= @search, or -1 if not found */ + static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size, + cmp_func_t cmp, const void *search) + { +- ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); +- +- if (idx < nr && !cmp(base + idx * size, search)) +- return idx; ++ void *base1 = base - size; ++ unsigned n = 1; + +- return eytzinger0_next(idx, nr); ++ while (n <= nr) ++ n = eytzinger1_child(n, cmp(base1 + n * size, search) < 0); ++ n >>= __ffs(n + 1) + 1; ++ return n - 1; + } + + #define eytzinger0_find(base, nr, size, _cmp, search) \ + ({ \ +- void *_base = (base); \ ++ size_t _size = (size); \ ++ void *_base1 = (void *)(base) - _size; \ + const void *_search = (search); \ + size_t _nr = (nr); \ +- size_t _size = (size); \ +- size_t _i = 0; \ ++ size_t _i = 1; \ + int _res; \ + \ +- while (_i < _nr && \ +- (_res = _cmp(_search, _base + _i * _size))) \ +- _i = eytzinger0_child(_i, _res > 0); \ +- _i; \ ++ while (_i <= _nr && \ ++ (_res = _cmp(_search, _base1 + _i * _size))) \ ++ _i = eytzinger1_child(_i, _res > 0); \ ++ _i - 1; \ + }) + + void eytzinger0_sort_r(void *, size_t, size_t, +diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c +index ab1d5db2fa56..a1ccb9139b04 100644 +--- a/fs/bcachefs/fs-io-buffered.c ++++ b/fs/bcachefs/fs-io-buffered.c +@@ -149,12 +149,10 @@ static void bchfs_read(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_buf sk; +- int flags = BCH_READ_RETRY_IF_STALE| +- BCH_READ_MAY_PROMOTE; ++ int flags = BCH_READ_retry_if_stale| ++ BCH_READ_may_promote; + int ret = 0; + +- rbio->c = c; +- rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; + + bch2_bkey_buf_init(&sk); +@@ -211,14 +209,14 @@ static void bchfs_read(struct btree_trans *trans, + swap(rbio->bio.bi_iter.bi_size, bytes); + + if (rbio->bio.bi_iter.bi_size == bytes) +- flags |= BCH_READ_LAST_FRAGMENT; ++ flags |= BCH_READ_last_fragment; + + bch2_bio_page_state_set(&rbio->bio, k); + + bch2_read_extent(trans, rbio, iter.pos, + data_btree, k, offset_into_extent, flags); + +- if (flags & BCH_READ_LAST_FRAGMENT) ++ if (flags & BCH_READ_last_fragment) + break; + + swap(rbio->bio.bi_iter.bi_size, bytes); +@@ -232,7 +230,8 @@ static void bchfs_read(struct btree_trans *trans, + + if (ret) { + struct printbuf buf = PRINTBUF; +- bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9); ++ lockrestart_do(trans, ++ bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9)); + prt_printf(&buf, "read error %i from btree lookup", ret); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); +@@ -280,12 +279,13 @@ void bch2_readahead(struct readahead_control *ractl) + struct bch_read_bio *rbio = + rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, + GFP_KERNEL, &c->bio_read), +- opts); ++ c, ++ opts, ++ bch2_readpages_end_io); + + readpage_iter_advance(&readpages_iter); + + rbio->bio.bi_iter.bi_sector = folio_sector(folio); +- rbio->bio.bi_end_io = bch2_readpages_end_io; + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + + bchfs_read(trans, rbio, inode_inum(inode), +@@ -323,10 +323,10 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), +- opts); ++ c, ++ opts, ++ bch2_read_single_folio_end_io); + rbio->bio.bi_private = &done; +- rbio->bio.bi_end_io = bch2_read_single_folio_end_io; +- + rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; + rbio->bio.bi_iter.bi_sector = folio_sector(folio); + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); +@@ -420,7 +420,7 @@ static void bch2_writepage_io_done(struct bch_write_op *op) + } + } + +- if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { ++ if (io->op.flags & BCH_WRITE_wrote_data_inline) { + bio_for_each_folio_all(fi, bio) { + struct bch_folio *s; + +diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c +index 2089c36b5866..535bc5fcbcc0 100644 +--- a/fs/bcachefs/fs-io-direct.c ++++ b/fs/bcachefs/fs-io-direct.c +@@ -73,6 +73,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) + struct blk_plug plug; + loff_t offset = req->ki_pos; + bool sync = is_sync_kiocb(req); ++ bool split = false; + size_t shorten; + ssize_t ret; + +@@ -99,8 +100,6 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) + GFP_KERNEL, + &c->dio_read_bioset); + +- bio->bi_end_io = bch2_direct_IO_read_endio; +- + dio = container_of(bio, struct dio_read, rbio.bio); + closure_init(&dio->cl, NULL); + +@@ -133,12 +132,13 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) + + goto start; + while (iter->count) { ++ split = true; ++ + bio = bio_alloc_bioset(NULL, + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + REQ_OP_READ, + GFP_KERNEL, + &c->bio_read); +- bio->bi_end_io = bch2_direct_IO_read_split_endio; + start: + bio->bi_opf = REQ_OP_READ|REQ_SYNC; + bio->bi_iter.bi_sector = offset >> 9; +@@ -160,7 +160,15 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) + if (iter->count) + closure_get(&dio->cl); + +- bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); ++ struct bch_read_bio *rbio = ++ rbio_init(bio, ++ c, ++ opts, ++ split ++ ? bch2_direct_IO_read_split_endio ++ : bch2_direct_IO_read_endio); ++ ++ bch2_read(c, rbio, inode_inum(inode)); + } + + blk_finish_plug(&plug); +@@ -511,8 +519,8 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) + dio->op.devs_need_flush = &inode->ei_devs_need_flush; + + if (sync) +- dio->op.flags |= BCH_WRITE_SYNC; +- dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; ++ dio->op.flags |= BCH_WRITE_sync; ++ dio->op.flags |= BCH_WRITE_check_enospc; + + ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, + bio_sectors(bio), true); +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 8fcf7c8e5ede..53a421ff136d 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -450,7 +450,7 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * + return ret; + + struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound); +- struct qstr name = (struct qstr) QSTR(name_buf); ++ struct qstr name = QSTR(name_buf); + + inode->bi_dir = lostfound.bi_inum; + +diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c +index 5353979117b0..6b842c8d21be 100644 +--- a/fs/bcachefs/io_misc.c ++++ b/fs/bcachefs/io_misc.c +@@ -115,7 +115,8 @@ int bch2_extent_fallocate(struct btree_trans *trans, + bch2_increment_clock(c, sectors_allocated, WRITE); + if (should_print_err(ret)) { + struct printbuf buf = PRINTBUF; +- bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9); ++ lockrestart_do(trans, ++ bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9)); + prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); +diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c +index 8c7b2d3d779d..821ff222b361 100644 +--- a/fs/bcachefs/io_read.c ++++ b/fs/bcachefs/io_read.c +@@ -80,6 +80,7 @@ struct promote_op { + struct rhash_head hash; + struct bpos pos; + ++ struct work_struct work; + struct data_update write; + struct bio_vec bi_inline_vecs[]; /* must be last */ + }; +@@ -96,6 +97,26 @@ static inline bool have_io_error(struct bch_io_failures *failed) + return failed && failed->nr; + } + ++static bool ptr_being_rewritten(struct bch_read_bio *orig, ++ unsigned dev, ++ unsigned flags) ++{ ++ if (!(flags & BCH_READ_data_update)) ++ return false; ++ ++ struct data_update *u = container_of(orig, struct data_update, rbio); ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k)); ++ unsigned i = 0; ++ bkey_for_each_ptr(ptrs, ptr) { ++ if (ptr->dev == dev && ++ u->data_opts.rewrite_ptrs & BIT(i)) ++ return true; ++ i++; ++ } ++ ++ return false; ++} ++ + static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, + struct bpos pos, + struct bch_io_opts opts, +@@ -105,7 +126,7 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, + if (!have_io_error(failed)) { + BUG_ON(!opts.promote_target); + +- if (!(flags & BCH_READ_MAY_PROMOTE)) ++ if (!(flags & BCH_READ_may_promote)) + return -BCH_ERR_nopromote_may_not; + + if (bch2_bkey_has_target(c, k, opts.promote_target)) +@@ -125,98 +146,94 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, + return 0; + } + +-static void promote_free(struct bch_fs *c, struct promote_op *op) ++static noinline void promote_free(struct bch_read_bio *rbio) + { +- int ret; ++ struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); ++ struct bch_fs *c = rbio->c; ++ ++ int ret = rhashtable_remove_fast(&c->promote_table, &op->hash, ++ bch_promote_params); ++ BUG_ON(ret); + + bch2_data_update_exit(&op->write); + +- ret = rhashtable_remove_fast(&c->promote_table, &op->hash, +- bch_promote_params); +- BUG_ON(ret); + bch2_write_ref_put(c, BCH_WRITE_REF_promote); + kfree_rcu(op, rcu); + } + + static void promote_done(struct bch_write_op *wop) + { +- struct promote_op *op = +- container_of(wop, struct promote_op, write.op); +- struct bch_fs *c = op->write.op.c; ++ struct promote_op *op = container_of(wop, struct promote_op, write.op); ++ struct bch_fs *c = op->write.rbio.c; + +- bch2_time_stats_update(&c->times[BCH_TIME_data_promote], +- op->start_time); +- promote_free(c, op); ++ bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time); ++ promote_free(&op->write.rbio); + } + +-static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) ++static void promote_start_work(struct work_struct *work) + { +- struct bio *bio = &op->write.op.wbio.bio; ++ struct promote_op *op = container_of(work, struct promote_op, work); + +- trace_and_count(op->write.op.c, read_promote, &rbio->bio); ++ bch2_data_update_read_done(&op->write); ++} + +- /* we now own pages: */ +- BUG_ON(!rbio->bounce); +- BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); ++static noinline void promote_start(struct bch_read_bio *rbio) ++{ ++ struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); + +- memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, +- sizeof(struct bio_vec) * rbio->bio.bi_vcnt); +- swap(bio->bi_vcnt, rbio->bio.bi_vcnt); ++ trace_and_count(op->write.op.c, io_read_promote, &rbio->bio); + +- bch2_data_update_read_done(&op->write, rbio->pick.crc); ++ INIT_WORK(&op->work, promote_start_work); ++ queue_work(rbio->c->write_ref_wq, &op->work); + } + +-static struct promote_op *__promote_alloc(struct btree_trans *trans, +- enum btree_id btree_id, +- struct bkey_s_c k, +- struct bpos pos, +- struct extent_ptr_decoded *pick, +- struct bch_io_opts opts, +- unsigned sectors, +- struct bch_read_bio **rbio, +- struct bch_io_failures *failed) ++static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, ++ enum btree_id btree_id, ++ struct bkey_s_c k, ++ struct bpos pos, ++ struct extent_ptr_decoded *pick, ++ unsigned sectors, ++ unsigned flags, ++ struct bch_read_bio *orig, ++ struct bch_io_failures *failed) + { + struct bch_fs *c = trans->c; +- struct promote_op *op = NULL; +- struct bio *bio; +- unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); + int ret; + +- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) +- return ERR_PTR(-BCH_ERR_nopromote_no_writes); ++ struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait }; + +- op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL); +- if (!op) { +- ret = -BCH_ERR_nopromote_enomem; +- goto err; +- } ++ if (!have_io_error(failed)) { ++ update_opts.target = orig->opts.promote_target; ++ update_opts.extra_replicas = 1; ++ update_opts.write_flags |= BCH_WRITE_cached; ++ update_opts.write_flags |= BCH_WRITE_only_specified_devs; ++ } else { ++ update_opts.target = orig->opts.foreground_target; + +- op->start_time = local_clock(); +- op->pos = pos; ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ unsigned ptr_bit = 1; ++ bkey_for_each_ptr(ptrs, ptr) { ++ if (bch2_dev_io_failures(failed, ptr->dev) && ++ !ptr_being_rewritten(orig, ptr->dev, flags)) ++ update_opts.rewrite_ptrs |= ptr_bit; ++ ptr_bit <<= 1; ++ } + +- /* +- * We don't use the mempool here because extents that aren't +- * checksummed or compressed can be too big for the mempool: +- */ +- *rbio = kzalloc(sizeof(struct bch_read_bio) + +- sizeof(struct bio_vec) * pages, +- GFP_KERNEL); +- if (!*rbio) { +- ret = -BCH_ERR_nopromote_enomem; +- goto err; ++ if (!update_opts.rewrite_ptrs) ++ return NULL; + } + +- rbio_init(&(*rbio)->bio, opts); +- bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); ++ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) ++ return ERR_PTR(-BCH_ERR_nopromote_no_writes); + +- if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) { ++ struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL); ++ if (!op) { + ret = -BCH_ERR_nopromote_enomem; +- goto err; ++ goto err_put; + } + +- (*rbio)->bounce = true; +- (*rbio)->split = true; +- (*rbio)->kmalloc = true; ++ op->start_time = local_clock(); ++ op->pos = pos; + + if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, + bch_promote_params)) { +@@ -224,64 +241,43 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, + goto err; + } + +- bio = &op->write.op.wbio.bio; +- bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); +- +- struct data_update_opts update_opts = {}; +- +- if (!have_io_error(failed)) { +- update_opts.target = opts.promote_target; +- update_opts.extra_replicas = 1; +- update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED; +- } else { +- update_opts.target = opts.foreground_target; +- +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- unsigned ptr_bit = 1; +- bkey_for_each_ptr(ptrs, ptr) { +- if (bch2_dev_io_failures(failed, ptr->dev)) +- update_opts.rewrite_ptrs |= ptr_bit; +- ptr_bit <<= 1; +- } +- } +- + ret = bch2_data_update_init(trans, NULL, NULL, &op->write, + writepoint_hashed((unsigned long) current), +- opts, ++ &orig->opts, + update_opts, + btree_id, k); + /* + * possible errors: -BCH_ERR_nocow_lock_blocked, + * -BCH_ERR_ENOSPC_disk_reservation: + */ +- if (ret) { +- BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, +- bch_promote_params)); +- goto err; +- } ++ if (ret) ++ goto err_remove_hash; + ++ rbio_init_fragment(&op->write.rbio.bio, orig); ++ op->write.rbio.bounce = true; ++ op->write.rbio.promote = true; + op->write.op.end_io = promote_done; + +- return op; ++ return &op->write.rbio; ++err_remove_hash: ++ BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, ++ bch_promote_params)); + err: +- if (*rbio) +- bio_free_pages(&(*rbio)->bio); +- kfree(*rbio); +- *rbio = NULL; ++ bio_free_pages(&op->write.op.wbio.bio); + /* We may have added to the rhashtable and thus need rcu freeing: */ + kfree_rcu(op, rcu); ++err_put: + bch2_write_ref_put(c, BCH_WRITE_REF_promote); + return ERR_PTR(ret); + } + + noinline +-static struct promote_op *promote_alloc(struct btree_trans *trans, ++static struct bch_read_bio *promote_alloc(struct btree_trans *trans, + struct bvec_iter iter, + struct bkey_s_c k, + struct extent_ptr_decoded *pick, +- struct bch_io_opts opts, + unsigned flags, +- struct bch_read_bio **rbio, ++ struct bch_read_bio *orig, + bool *bounce, + bool *read_full, + struct bch_io_failures *failed) +@@ -301,18 +297,21 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, + struct bpos pos = promote_full + ? bkey_start_pos(k.k) + : POS(k.k->p.inode, iter.bi_sector); +- struct promote_op *promote; + int ret; + +- ret = should_promote(c, k, pos, opts, flags, failed); ++ ret = should_promote(c, k, pos, orig->opts, flags, failed); + if (ret) + goto nopromote; + +- promote = __promote_alloc(trans, +- k.k->type == KEY_TYPE_reflink_v +- ? BTREE_ID_reflink +- : BTREE_ID_extents, +- k, pos, pick, opts, sectors, rbio, failed); ++ struct bch_read_bio *promote = ++ __promote_alloc(trans, ++ k.k->type == KEY_TYPE_reflink_v ++ ? BTREE_ID_reflink ++ : BTREE_ID_extents, ++ k, pos, pick, sectors, flags, orig, failed); ++ if (!promote) ++ return NULL; ++ + ret = PTR_ERR_OR_ZERO(promote); + if (ret) + goto nopromote; +@@ -321,7 +320,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, + *read_full = promote_full; + return promote; + nopromote: +- trace_read_nopromote(c, ret); ++ trace_io_read_nopromote(c, ret); + return NULL; + } + +@@ -330,9 +329,10 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, + static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out, + struct bch_read_bio *rbio, struct bpos read_pos) + { +- return bch2_inum_offset_err_msg_trans(trans, out, +- (subvol_inum) { rbio->subvol, read_pos.inode }, +- read_pos.offset << 9); ++ return lockrestart_do(trans, ++ bch2_inum_offset_err_msg_trans(trans, out, ++ (subvol_inum) { rbio->subvol, read_pos.inode }, ++ read_pos.offset << 9)); + } + + static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, +@@ -375,20 +375,20 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) + { + BUG_ON(rbio->bounce && !rbio->split); + +- if (rbio->promote) +- promote_free(rbio->c, rbio->promote); +- rbio->promote = NULL; +- +- if (rbio->bounce) +- bch2_bio_free_pages_pool(rbio->c, &rbio->bio); +- + if (rbio->split) { + struct bch_read_bio *parent = rbio->parent; + +- if (rbio->kmalloc) +- kfree(rbio); +- else ++ if (unlikely(rbio->promote)) { ++ if (!rbio->bio.bi_status) ++ promote_start(rbio); ++ else ++ promote_free(rbio); ++ } else { ++ if (rbio->bounce) ++ bch2_bio_free_pages_pool(rbio->c, &rbio->bio); ++ + bio_put(&rbio->bio); ++ } + + rbio = parent; + } +@@ -408,61 +408,47 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) + bio_endio(&rbio->bio); + } + +-static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, ++static noinline void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, + struct bch_io_failures *failed, + unsigned flags) + { ++ struct data_update *u = container_of(rbio, struct data_update, rbio); + struct btree_trans *trans = bch2_trans_get(c); +- struct btree_iter iter; +- struct bkey_buf sk; +- struct bkey_s_c k; +- int ret; +- +- flags &= ~BCH_READ_LAST_FRAGMENT; +- flags |= BCH_READ_MUST_CLONE; +- +- bch2_bkey_buf_init(&sk); +- +- bch2_trans_iter_init(trans, &iter, rbio->data_btree, +- rbio->read_pos, BTREE_ITER_slots); + retry: + bch2_trans_begin(trans); +- rbio->bio.bi_status = 0; + +- ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = lockrestart_do(trans, ++ bkey_err(k = bch2_bkey_get_iter(trans, &iter, ++ u->btree_id, bkey_start_pos(&u->k.k->k), ++ 0))); + if (ret) + goto err; + +- bch2_bkey_buf_reassemble(&sk, c, k); +- k = bkey_i_to_s_c(sk.k); +- +- if (!bch2_bkey_matches_ptr(c, k, +- rbio->pick.ptr, +- rbio->data_pos.offset - +- rbio->pick.crc.offset)) { ++ if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) { + /* extent we wanted to read no longer exists: */ + rbio->hole = true; +- goto out; ++ goto err; + } + + ret = __bch2_read_extent(trans, rbio, bvec_iter, +- rbio->read_pos, +- rbio->data_btree, +- k, 0, failed, flags); ++ bkey_start_pos(&u->k.k->k), ++ u->btree_id, ++ bkey_i_to_s_c(u->k.k), ++ 0, failed, flags, -1); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ + if (ret == READ_RETRY) + goto retry; + if (ret) +- goto err; +-out: ++ rbio->bio.bi_status = BLK_STS_IOERR; ++ ++ BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1); + bch2_rbio_done(rbio); +- bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); +- bch2_bkey_buf_exit(&sk, c); +- return; +-err: +- rbio->bio.bi_status = BLK_STS_IOERR; +- goto out; + } + + static void bch2_rbio_retry(struct work_struct *work) +@@ -478,34 +464,36 @@ static void bch2_rbio_retry(struct work_struct *work) + }; + struct bch_io_failures failed = { .nr = 0 }; + +- trace_and_count(c, read_retry, &rbio->bio); ++ trace_io_read_retry(&rbio->bio); ++ this_cpu_add(c->counters[BCH_COUNTER_io_read_retry], ++ bvec_iter_sectors(rbio->bvec_iter)); + + if (rbio->retry == READ_RETRY_AVOID) + bch2_mark_io_failure(&failed, &rbio->pick); + +- rbio->bio.bi_status = 0; ++ if (!rbio->split) ++ rbio->bio.bi_status = 0; + + rbio = bch2_rbio_free(rbio); + +- flags |= BCH_READ_IN_RETRY; +- flags &= ~BCH_READ_MAY_PROMOTE; ++ flags |= BCH_READ_in_retry; ++ flags &= ~BCH_READ_may_promote; ++ flags &= ~BCH_READ_last_fragment; ++ flags |= BCH_READ_must_clone; + +- if (flags & BCH_READ_NODECODE) { ++ if (flags & BCH_READ_data_update) + bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); +- } else { +- flags &= ~BCH_READ_LAST_FRAGMENT; +- flags |= BCH_READ_MUST_CLONE; +- ++ else + __bch2_read(c, rbio, iter, inum, &failed, flags); +- } + } + + static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, + blk_status_t error) + { + rbio->retry = retry; ++ rbio->saw_error = true; + +- if (rbio->flags & BCH_READ_IN_RETRY) ++ if (rbio->flags & BCH_READ_in_retry) + return; + + if (retry == READ_ERR) { +@@ -712,32 +700,40 @@ static void __bch2_read_endio(struct work_struct *work) + if (unlikely(rbio->narrow_crcs)) + bch2_rbio_narrow_crcs(rbio); + +- if (rbio->flags & BCH_READ_NODECODE) +- goto nodecode; ++ if (likely(!(rbio->flags & BCH_READ_data_update))) { ++ /* Adjust crc to point to subset of data we want: */ ++ crc.offset += rbio->offset_into_extent; ++ crc.live_size = bvec_iter_sectors(rbio->bvec_iter); + +- /* Adjust crc to point to subset of data we want: */ +- crc.offset += rbio->offset_into_extent; +- crc.live_size = bvec_iter_sectors(rbio->bvec_iter); ++ if (crc_is_compressed(crc)) { ++ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ if (ret) ++ goto decrypt_err; + +- if (crc_is_compressed(crc)) { +- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); +- if (ret) +- goto decrypt_err; ++ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && ++ !c->opts.no_data_io) ++ goto decompression_err; ++ } else { ++ /* don't need to decrypt the entire bio: */ ++ nonce = nonce_add(nonce, crc.offset << 9); ++ bio_advance(src, crc.offset << 9); + +- if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && +- !c->opts.no_data_io) +- goto decompression_err; +- } else { +- /* don't need to decrypt the entire bio: */ +- nonce = nonce_add(nonce, crc.offset << 9); +- bio_advance(src, crc.offset << 9); ++ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); ++ src->bi_iter.bi_size = dst_iter.bi_size; + +- BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); +- src->bi_iter.bi_size = dst_iter.bi_size; ++ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ if (ret) ++ goto decrypt_err; + +- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); +- if (ret) +- goto decrypt_err; ++ if (rbio->bounce) { ++ struct bvec_iter src_iter = src->bi_iter; ++ ++ bio_copy_data_iter(dst, &dst_iter, src, &src_iter); ++ } ++ } ++ } else { ++ if (rbio->split) ++ rbio->parent->pick = rbio->pick; + + if (rbio->bounce) { + struct bvec_iter src_iter = src->bi_iter; +@@ -754,12 +750,9 @@ static void __bch2_read_endio(struct work_struct *work) + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; +- +- promote_start(rbio->promote, rbio); +- rbio->promote = NULL; + } +-nodecode: +- if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { ++ ++ if (likely(!(rbio->flags & BCH_READ_in_retry))) { + rbio = bch2_rbio_free(rbio); + bch2_rbio_done(rbio); + } +@@ -772,8 +765,8 @@ static void __bch2_read_endio(struct work_struct *work) + * reading into buffers owned by userspace (that userspace can + * scribble over) - retry the read, bouncing it this time: + */ +- if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { +- rbio->flags |= BCH_READ_MUST_BOUNCE; ++ if (!rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) { ++ rbio->flags |= BCH_READ_must_bounce; + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); + goto out; + } +@@ -810,11 +803,11 @@ static void bch2_read_endio(struct bio *bio) + return; + } + +- if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || ++ if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) || + (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) { +- trace_and_count(c, read_reuse_race, &rbio->bio); ++ trace_and_count(c, io_read_reuse_race, &rbio->bio); + +- if (rbio->flags & BCH_READ_RETRY_IF_STALE) ++ if (rbio->flags & BCH_READ_retry_if_stale) + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); + else + bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); +@@ -883,12 +876,11 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + struct bvec_iter iter, struct bpos read_pos, + enum btree_id data_btree, struct bkey_s_c k, + unsigned offset_into_extent, +- struct bch_io_failures *failed, unsigned flags) ++ struct bch_io_failures *failed, unsigned flags, int dev) + { + struct bch_fs *c = trans->c; + struct extent_ptr_decoded pick; + struct bch_read_bio *rbio = NULL; +- struct promote_op *promote = NULL; + bool bounce = false, read_full = false, narrow_crcs = false; + struct bpos data_pos = bkey_start_pos(k.k); + int pick_ret; +@@ -902,10 +894,12 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + swap(iter.bi_size, bytes); + bio_advance_iter(&orig->bio, &iter, bytes); + zero_fill_bio_iter(&orig->bio, iter); ++ this_cpu_add(c->counters[BCH_COUNTER_io_read_inline], ++ bvec_iter_sectors(iter)); + goto out_read_done; + } + retry_pick: +- pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); ++ pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); + + /* hole or reservation - just zero fill: */ + if (!pick_ret) +@@ -941,7 +935,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + * retry path, don't check here, it'll be caught in bch2_read_endio() + * and we'll end up in the retry path: + */ +- if ((flags & BCH_READ_IN_RETRY) && ++ if ((flags & BCH_READ_in_retry) && + !pick.ptr.cached && + ca && + unlikely(dev_ptr_stale(ca, &pick.ptr))) { +@@ -955,48 +949,53 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + * Unlock the iterator while the btree node's lock is still in + * cache, before doing the IO: + */ +- bch2_trans_unlock(trans); ++ if (!(flags & BCH_READ_in_retry)) ++ bch2_trans_unlock(trans); ++ else ++ bch2_trans_unlock_long(trans); ++ ++ if (!(flags & BCH_READ_data_update)) { ++ if (!(flags & BCH_READ_last_fragment) || ++ bio_flagged(&orig->bio, BIO_CHAIN)) ++ flags |= BCH_READ_must_clone; ++ ++ narrow_crcs = !(flags & BCH_READ_in_retry) && ++ bch2_can_narrow_extent_crcs(k, pick.crc); ++ ++ if (narrow_crcs && (flags & BCH_READ_user_mapped)) ++ flags |= BCH_READ_must_bounce; + +- if (flags & BCH_READ_NODECODE) { ++ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); ++ ++ if (crc_is_compressed(pick.crc) || ++ (pick.crc.csum_type != BCH_CSUM_none && ++ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || ++ (bch2_csum_type_is_encryption(pick.crc.csum_type) && ++ (flags & BCH_READ_user_mapped)) || ++ (flags & BCH_READ_must_bounce)))) { ++ read_full = true; ++ bounce = true; ++ } ++ } else { ++ read_full = true; + /* + * can happen if we retry, and the extent we were going to read + * has been merged in the meantime: + */ +- if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) { ++ struct data_update *u = container_of(orig, struct data_update, rbio); ++ if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { ++ BUG(); + if (ca) + percpu_ref_put(&ca->io_ref); + goto hole; + } + + iter.bi_size = pick.crc.compressed_size << 9; +- goto get_bio; +- } +- +- if (!(flags & BCH_READ_LAST_FRAGMENT) || +- bio_flagged(&orig->bio, BIO_CHAIN)) +- flags |= BCH_READ_MUST_CLONE; +- +- narrow_crcs = !(flags & BCH_READ_IN_RETRY) && +- bch2_can_narrow_extent_crcs(k, pick.crc); +- +- if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) +- flags |= BCH_READ_MUST_BOUNCE; +- +- EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); +- +- if (crc_is_compressed(pick.crc) || +- (pick.crc.csum_type != BCH_CSUM_none && +- (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || +- (bch2_csum_type_is_encryption(pick.crc.csum_type) && +- (flags & BCH_READ_USER_MAPPED)) || +- (flags & BCH_READ_MUST_BOUNCE)))) { +- read_full = true; +- bounce = true; + } + + if (orig->opts.promote_target || have_io_error(failed)) +- promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, +- &rbio, &bounce, &read_full, failed); ++ rbio = promote_alloc(trans, iter, k, &pick, flags, orig, ++ &bounce, &read_full, failed); + + if (!read_full) { + EBUG_ON(crc_is_compressed(pick.crc)); +@@ -1015,7 +1014,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + pick.crc.offset = 0; + pick.crc.live_size = bvec_iter_sectors(iter); + } +-get_bio: ++ + if (rbio) { + /* + * promote already allocated bounce rbio: +@@ -1030,17 +1029,16 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + } else if (bounce) { + unsigned sectors = pick.crc.compressed_size; + +- rbio = rbio_init(bio_alloc_bioset(NULL, ++ rbio = rbio_init_fragment(bio_alloc_bioset(NULL, + DIV_ROUND_UP(sectors, PAGE_SECTORS), + 0, + GFP_NOFS, + &c->bio_read_split), +- orig->opts); ++ orig); + + bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); + rbio->bounce = true; +- rbio->split = true; +- } else if (flags & BCH_READ_MUST_CLONE) { ++ } else if (flags & BCH_READ_must_clone) { + /* + * Have to clone if there were any splits, due to error + * reporting issues (if a split errored, and retrying didn't +@@ -1049,11 +1047,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + * from the whole bio, in which case we don't want to retry and + * lose the error) + */ +- rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, ++ rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, + &c->bio_read_split), +- orig->opts); ++ orig); + rbio->bio.bi_iter = iter; +- rbio->split = true; + } else { + rbio = orig; + rbio->bio.bi_iter = iter; +@@ -1062,11 +1059,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + + EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); + +- rbio->c = c; + rbio->submit_time = local_clock(); +- if (rbio->split) +- rbio->parent = orig; +- else ++ if (!rbio->split) + rbio->end_io = orig->bio.bi_end_io; + rbio->bvec_iter = iter; + rbio->offset_into_extent= offset_into_extent; +@@ -1076,41 +1070,38 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + rbio->hole = 0; + rbio->retry = 0; + rbio->context = 0; +- /* XXX: only initialize this if needed */ +- rbio->devs_have = bch2_bkey_devs(k); + rbio->pick = pick; + rbio->subvol = orig->subvol; + rbio->read_pos = read_pos; + rbio->data_btree = data_btree; + rbio->data_pos = data_pos; + rbio->version = k.k->bversion; +- rbio->promote = promote; + INIT_WORK(&rbio->work, NULL); + +- if (flags & BCH_READ_NODECODE) +- orig->pick = pick; +- + rbio->bio.bi_opf = orig->bio.bi_opf; + rbio->bio.bi_iter.bi_sector = pick.ptr.offset; + rbio->bio.bi_end_io = bch2_read_endio; + + if (rbio->bounce) +- trace_and_count(c, read_bounce, &rbio->bio); ++ trace_and_count(c, io_read_bounce, &rbio->bio); + +- this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); ++ if (!(flags & BCH_READ_data_update)) ++ this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); ++ else ++ this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio)); + bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); + + /* + * If it's being moved internally, we don't want to flag it as a cache + * hit: + */ +- if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE)) ++ if (ca && pick.ptr.cached && !(flags & BCH_READ_data_update)) + bch2_bucket_io_time_reset(trans, pick.ptr.dev, + PTR_BUCKET_NR(ca, &pick.ptr), READ); + +- if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { ++ if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) { + bio_inc_remaining(&orig->bio); +- trace_and_count(c, read_split, &orig->bio); ++ trace_and_count(c, io_read_split, &orig->bio); + } + + if (!rbio->pick.idx) { +@@ -1132,10 +1123,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + bio_set_dev(&rbio->bio, ca->disk_sb.bdev); + + if (unlikely(c->opts.no_data_io)) { +- if (likely(!(flags & BCH_READ_IN_RETRY))) ++ if (likely(!(flags & BCH_READ_in_retry))) + bio_endio(&rbio->bio); + } else { +- if (likely(!(flags & BCH_READ_IN_RETRY))) ++ if (likely(!(flags & BCH_READ_in_retry))) + submit_bio(&rbio->bio); + else + submit_bio_wait(&rbio->bio); +@@ -1153,11 +1144,11 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + goto out; + } + +- if (likely(!(flags & BCH_READ_IN_RETRY))) ++ if (likely(!(flags & BCH_READ_in_retry))) + bio_endio(&rbio->bio); + } + out: +- if (likely(!(flags & BCH_READ_IN_RETRY))) { ++ if (likely(!(flags & BCH_READ_in_retry))) { + return 0; + } else { + int ret; +@@ -1180,24 +1171,26 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + } + + err: +- if (flags & BCH_READ_IN_RETRY) ++ if (flags & BCH_READ_in_retry) + return READ_ERR; + + orig->bio.bi_status = BLK_STS_IOERR; + goto out_read_done; + + hole: ++ this_cpu_add(c->counters[BCH_COUNTER_io_read_hole], ++ bvec_iter_sectors(iter)); + /* +- * won't normally happen in the BCH_READ_NODECODE ++ * won't normally happen in the BCH_READ_data_update + * (bch2_move_extent()) path, but if we retry and the extent we wanted + * to read no longer exists we have to signal that: + */ +- if (flags & BCH_READ_NODECODE) ++ if (flags & BCH_READ_data_update) + orig->hole = true; + + zero_fill_bio_iter(&orig->bio, iter); + out_read_done: +- if (flags & BCH_READ_LAST_FRAGMENT) ++ if (flags & BCH_READ_last_fragment) + bch2_rbio_done(orig); + return 0; + } +@@ -1212,7 +1205,7 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + struct bkey_s_c k; + int ret; + +- BUG_ON(flags & BCH_READ_NODECODE); ++ BUG_ON(flags & BCH_READ_data_update); + + bch2_bkey_buf_init(&sk); + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, +@@ -1262,15 +1255,15 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + swap(bvec_iter.bi_size, bytes); + + if (bvec_iter.bi_size == bytes) +- flags |= BCH_READ_LAST_FRAGMENT; ++ flags |= BCH_READ_last_fragment; + + ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos, + data_btree, k, +- offset_into_extent, failed, flags); ++ offset_into_extent, failed, flags, -1); + if (ret) + goto err; + +- if (flags & BCH_READ_LAST_FRAGMENT) ++ if (flags & BCH_READ_last_fragment) + break; + + swap(bvec_iter.bi_size, bytes); +@@ -1287,7 +1280,9 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + + if (ret) { + struct printbuf buf = PRINTBUF; +- bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9); ++ lockrestart_do(trans, ++ bch2_inum_offset_err_msg_trans(trans, &buf, inum, ++ bvec_iter.bi_sector << 9)); + prt_printf(&buf, "read error %i from btree lookup", ret); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); +diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h +index a82e8a94ccb6..73275da5d2c4 100644 +--- a/fs/bcachefs/io_read.h ++++ b/fs/bcachefs/io_read.h +@@ -35,20 +35,19 @@ struct bch_read_bio { + u16 flags; + union { + struct { +- u16 bounce:1, ++ u16 promote:1, ++ bounce:1, + split:1, +- kmalloc:1, + have_ioref:1, + narrow_crcs:1, + hole:1, ++ saw_error:1, + retry:2, + context:2; + }; + u16 _state; + }; + +- struct bch_devs_list devs_have; +- + struct extent_ptr_decoded pick; + + /* +@@ -65,8 +64,6 @@ struct bch_read_bio { + struct bpos data_pos; + struct bversion version; + +- struct promote_op *promote; +- + struct bch_io_opts opts; + + struct work_struct work; +@@ -108,23 +105,32 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans, + return 0; + } + ++#define BCH_READ_FLAGS() \ ++ x(retry_if_stale) \ ++ x(may_promote) \ ++ x(user_mapped) \ ++ x(data_update) \ ++ x(last_fragment) \ ++ x(must_bounce) \ ++ x(must_clone) \ ++ x(in_retry) ++ ++enum __bch_read_flags { ++#define x(n) __BCH_READ_##n, ++ BCH_READ_FLAGS() ++#undef x ++}; ++ + enum bch_read_flags { +- BCH_READ_RETRY_IF_STALE = 1 << 0, +- BCH_READ_MAY_PROMOTE = 1 << 1, +- BCH_READ_USER_MAPPED = 1 << 2, +- BCH_READ_NODECODE = 1 << 3, +- BCH_READ_LAST_FRAGMENT = 1 << 4, +- +- /* internal: */ +- BCH_READ_MUST_BOUNCE = 1 << 5, +- BCH_READ_MUST_CLONE = 1 << 6, +- BCH_READ_IN_RETRY = 1 << 7, ++#define x(n) BCH_READ_##n = BIT(__BCH_READ_##n), ++ BCH_READ_FLAGS() ++#undef x + }; + + int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, + struct bvec_iter, struct bpos, enum btree_id, + struct bkey_s_c, unsigned, +- struct bch_io_failures *, unsigned); ++ struct bch_io_failures *, unsigned, int); + + static inline void bch2_read_extent(struct btree_trans *trans, + struct bch_read_bio *rbio, struct bpos read_pos, +@@ -132,7 +138,7 @@ static inline void bch2_read_extent(struct btree_trans *trans, + unsigned offset_into_extent, unsigned flags) + { + __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, +- data_btree, k, offset_into_extent, NULL, flags); ++ data_btree, k, offset_into_extent, NULL, flags, -1); + } + + void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, +@@ -145,24 +151,39 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + + BUG_ON(rbio->_state); + +- rbio->c = c; +- rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; + + __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, +- BCH_READ_RETRY_IF_STALE| +- BCH_READ_MAY_PROMOTE| +- BCH_READ_USER_MAPPED); ++ BCH_READ_retry_if_stale| ++ BCH_READ_may_promote| ++ BCH_READ_user_mapped); + } + +-static inline struct bch_read_bio *rbio_init(struct bio *bio, +- struct bch_io_opts opts) ++static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, ++ struct bch_read_bio *orig) + { + struct bch_read_bio *rbio = to_rbio(bio); + ++ rbio->c = orig->c; + rbio->_state = 0; +- rbio->promote = NULL; +- rbio->opts = opts; ++ rbio->split = true; ++ rbio->parent = orig; ++ rbio->opts = orig->opts; ++ return rbio; ++} ++ ++static inline struct bch_read_bio *rbio_init(struct bio *bio, ++ struct bch_fs *c, ++ struct bch_io_opts opts, ++ bio_end_io_t end_io) ++{ ++ struct bch_read_bio *rbio = to_rbio(bio); ++ ++ rbio->start_time = local_clock(); ++ rbio->c = c; ++ rbio->_state = 0; ++ rbio->opts = opts; ++ rbio->bio.bi_end_io = end_io; + return rbio; + } + +diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c +index dd508d93e9fc..0177198e90eb 100644 +--- a/fs/bcachefs/io_write.c ++++ b/fs/bcachefs/io_write.c +@@ -374,7 +374,7 @@ static int bch2_write_index_default(struct bch_write_op *op) + bch2_extent_update(trans, inum, &iter, sk.k, + &op->res, + op->new_i_size, &op->i_sectors_delta, +- op->flags & BCH_WRITE_CHECK_ENOSPC); ++ op->flags & BCH_WRITE_check_enospc); + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) +@@ -403,7 +403,7 @@ static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, + (subvol_inum) { op->subvol, op->pos.inode, }, + offset << 9); + prt_printf(out, "write error%s: ", +- op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); ++ op->flags & BCH_WRITE_move ? "(internal move)" : ""); + } + + void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) +@@ -483,7 +483,7 @@ static void bch2_write_done(struct closure *cl) + bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); + bch2_disk_reservation_put(c, &op->res); + +- if (!(op->flags & BCH_WRITE_MOVE)) ++ if (!(op->flags & BCH_WRITE_move)) + bch2_write_ref_put(c, BCH_WRITE_REF_write); + bch2_keylist_free(&op->insert_keys, op->inline_keys); + +@@ -529,7 +529,7 @@ static void __bch2_write_index(struct bch_write_op *op) + unsigned dev; + int ret = 0; + +- if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { ++ if (unlikely(op->flags & BCH_WRITE_io_error)) { + ret = bch2_write_drop_io_error_ptrs(op); + if (ret) + goto err; +@@ -538,7 +538,7 @@ static void __bch2_write_index(struct bch_write_op *op) + if (!bch2_keylist_empty(keys)) { + u64 sectors_start = keylist_sectors(keys); + +- ret = !(op->flags & BCH_WRITE_MOVE) ++ ret = !(op->flags & BCH_WRITE_move) + ? bch2_write_index_default(op) + : bch2_data_update_index_update(op); + +@@ -570,14 +570,22 @@ static void __bch2_write_index(struct bch_write_op *op) + err: + keys->top = keys->keys; + op->error = ret; +- op->flags |= BCH_WRITE_SUBMITTED; ++ op->flags |= BCH_WRITE_submitted; + goto out; + } + + static inline void __wp_update_state(struct write_point *wp, enum write_point_state state) + { + if (state != wp->state) { ++ struct task_struct *p = current; + u64 now = ktime_get_ns(); ++ u64 runtime = p->se.sum_exec_runtime + ++ (now - p->se.exec_start); ++ ++ if (state == WRITE_POINT_runnable) ++ wp->last_runtime = runtime; ++ else if (wp->state == WRITE_POINT_runnable) ++ wp->time[WRITE_POINT_running] += runtime - wp->last_runtime; + + if (wp->last_state_change && + time_after64(now, wp->last_state_change)) +@@ -591,7 +599,7 @@ static inline void wp_update_state(struct write_point *wp, bool running) + { + enum write_point_state state; + +- state = running ? WRITE_POINT_running : ++ state = running ? WRITE_POINT_runnable: + !list_empty(&wp->writes) ? WRITE_POINT_waiting_io + : WRITE_POINT_stopped; + +@@ -605,8 +613,8 @@ static CLOSURE_CALLBACK(bch2_write_index) + struct workqueue_struct *wq = index_update_wq(op); + unsigned long flags; + +- if ((op->flags & BCH_WRITE_SUBMITTED) && +- (op->flags & BCH_WRITE_MOVE)) ++ if ((op->flags & BCH_WRITE_submitted) && ++ (op->flags & BCH_WRITE_move)) + bch2_bio_free_pages_pool(op->c, &op->wbio.bio); + + spin_lock_irqsave(&wp->writes_lock, flags); +@@ -644,11 +652,11 @@ void bch2_write_point_do_index_updates(struct work_struct *work) + if (!op) + break; + +- op->flags |= BCH_WRITE_IN_WORKER; ++ op->flags |= BCH_WRITE_in_worker; + + __bch2_write_index(op); + +- if (!(op->flags & BCH_WRITE_SUBMITTED)) ++ if (!(op->flags & BCH_WRITE_submitted)) + __bch2_write(op); + else + bch2_write_done(&op->cl); +@@ -672,7 +680,7 @@ static void bch2_write_endio(struct bio *bio) + "data write error: %s", + bch2_blk_status_to_str(bio->bi_status))) { + set_bit(wbio->dev, op->failed.d); +- op->flags |= BCH_WRITE_IO_ERROR; ++ op->flags |= BCH_WRITE_io_error; + } + + if (wbio->nocow) { +@@ -719,7 +727,7 @@ static void init_append_extent(struct bch_write_op *op, + bch2_extent_crc_append(&e->k_i, crc); + + bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, +- op->flags & BCH_WRITE_CACHED); ++ op->flags & BCH_WRITE_cached); + + bch2_keylist_push(&op->insert_keys); + } +@@ -836,7 +844,7 @@ static enum prep_encoded_ret { + struct bch_fs *c = op->c; + struct bio *bio = &op->wbio.bio; + +- if (!(op->flags & BCH_WRITE_DATA_ENCODED)) ++ if (!(op->flags & BCH_WRITE_data_encoded)) + return PREP_ENCODED_OK; + + BUG_ON(bio_sectors(bio) != op->crc.compressed_size); +@@ -944,9 +952,9 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + if (ec_buf || + op->compression_opt || + (op->csum_type && +- !(op->flags & BCH_WRITE_PAGES_STABLE)) || ++ !(op->flags & BCH_WRITE_pages_stable)) || + (bch2_csum_type_is_encryption(op->csum_type) && +- !(op->flags & BCH_WRITE_PAGES_OWNED))) { ++ !(op->flags & BCH_WRITE_pages_owned))) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); +@@ -966,7 +974,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + break; + + BUG_ON(op->compression_opt && +- (op->flags & BCH_WRITE_DATA_ENCODED) && ++ (op->flags & BCH_WRITE_data_encoded) && + bch2_csum_type_is_encryption(op->crc.csum_type)); + BUG_ON(op->compression_opt && !bounce); + +@@ -1004,7 +1012,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + } + } + +- if ((op->flags & BCH_WRITE_DATA_ENCODED) && ++ if ((op->flags & BCH_WRITE_data_encoded) && + !crc_is_compressed(crc) && + bch2_csum_type_is_encryption(op->crc.csum_type) == + bch2_csum_type_is_encryption(op->csum_type)) { +@@ -1036,7 +1044,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + crc.compression_type = compression_type; + crc.nonce = nonce; + } else { +- if ((op->flags & BCH_WRITE_DATA_ENCODED) && ++ if ((op->flags & BCH_WRITE_data_encoded) && + bch2_rechecksum_bio(c, src, version, op->crc, + NULL, &op->crc, + src_len >> 9, +@@ -1210,9 +1218,9 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) + + static void __bch2_nocow_write_done(struct bch_write_op *op) + { +- if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { ++ if (unlikely(op->flags & BCH_WRITE_io_error)) { + op->error = -EIO; +- } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) ++ } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten)) + bch2_nocow_write_convert_unwritten(op); + } + +@@ -1241,7 +1249,7 @@ static void bch2_nocow_write(struct bch_write_op *op) + struct bucket_to_lock *stale_at; + int stale, ret; + +- if (op->flags & BCH_WRITE_MOVE) ++ if (op->flags & BCH_WRITE_move) + return; + + darray_init(&buckets); +@@ -1299,7 +1307,7 @@ static void bch2_nocow_write(struct bch_write_op *op) + }), GFP_KERNEL|__GFP_NOFAIL); + + if (ptr->unwritten) +- op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; ++ op->flags |= BCH_WRITE_convert_unwritten; + } + + /* Unlock before taking nocow locks, doing IO: */ +@@ -1307,7 +1315,7 @@ static void bch2_nocow_write(struct bch_write_op *op) + bch2_trans_unlock(trans); + + bch2_cut_front(op->pos, op->insert_keys.top); +- if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) ++ if (op->flags & BCH_WRITE_convert_unwritten) + bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); + + darray_for_each(buckets, i) { +@@ -1332,7 +1340,7 @@ static void bch2_nocow_write(struct bch_write_op *op) + wbio_init(bio)->put_bio = true; + bio->bi_opf = op->wbio.bio.bi_opf; + } else { +- op->flags |= BCH_WRITE_SUBMITTED; ++ op->flags |= BCH_WRITE_submitted; + } + + op->pos.offset += bio_sectors(bio); +@@ -1346,7 +1354,7 @@ static void bch2_nocow_write(struct bch_write_op *op) + op->insert_keys.top, true); + + bch2_keylist_push(&op->insert_keys); +- if (op->flags & BCH_WRITE_SUBMITTED) ++ if (op->flags & BCH_WRITE_submitted) + break; + bch2_btree_iter_advance(&iter); + } +@@ -1366,15 +1374,15 @@ static void bch2_nocow_write(struct bch_write_op *op) + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + op->error = ret; +- op->flags |= BCH_WRITE_SUBMITTED; ++ op->flags |= BCH_WRITE_submitted; + } + + /* fallback to cow write path? */ +- if (!(op->flags & BCH_WRITE_SUBMITTED)) { ++ if (!(op->flags & BCH_WRITE_submitted)) { + closure_sync(&op->cl); + __bch2_nocow_write_done(op); + op->insert_keys.top = op->insert_keys.keys; +- } else if (op->flags & BCH_WRITE_SYNC) { ++ } else if (op->flags & BCH_WRITE_sync) { + closure_sync(&op->cl); + bch2_nocow_write_done(&op->cl.work); + } else { +@@ -1426,7 +1434,7 @@ static void __bch2_write(struct bch_write_op *op) + + if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { + bch2_nocow_write(op); +- if (op->flags & BCH_WRITE_SUBMITTED) ++ if (op->flags & BCH_WRITE_submitted) + goto out_nofs_restore; + } + again: +@@ -1456,7 +1464,7 @@ static void __bch2_write(struct bch_write_op *op) + ret = bch2_trans_run(c, lockrestart_do(trans, + bch2_alloc_sectors_start_trans(trans, + op->target, +- op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), ++ op->opts.erasure_code && !(op->flags & BCH_WRITE_cached), + op->write_point, + &op->devs_have, + op->nr_replicas, +@@ -1479,10 +1487,10 @@ static void __bch2_write(struct bch_write_op *op) + bch2_alloc_sectors_done_inlined(c, wp); + err: + if (ret <= 0) { +- op->flags |= BCH_WRITE_SUBMITTED; ++ op->flags |= BCH_WRITE_submitted; + + if (unlikely(ret < 0)) { +- if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) { ++ if (!(op->flags & BCH_WRITE_alloc_nowait)) { + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret)); +@@ -1514,14 +1522,14 @@ static void __bch2_write(struct bch_write_op *op) + * synchronously here if we weren't able to submit all of the IO at + * once, as that signals backpressure to the caller. + */ +- if ((op->flags & BCH_WRITE_SYNC) || +- (!(op->flags & BCH_WRITE_SUBMITTED) && +- !(op->flags & BCH_WRITE_IN_WORKER))) { ++ if ((op->flags & BCH_WRITE_sync) || ++ (!(op->flags & BCH_WRITE_submitted) && ++ !(op->flags & BCH_WRITE_in_worker))) { + bch2_wait_on_allocator(c, &op->cl); + + __bch2_write_index(op); + +- if (!(op->flags & BCH_WRITE_SUBMITTED)) ++ if (!(op->flags & BCH_WRITE_submitted)) + goto again; + bch2_write_done(&op->cl); + } else { +@@ -1542,8 +1550,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) + + memset(&op->failed, 0, sizeof(op->failed)); + +- op->flags |= BCH_WRITE_WROTE_DATA_INLINE; +- op->flags |= BCH_WRITE_SUBMITTED; ++ op->flags |= BCH_WRITE_wrote_data_inline; ++ op->flags |= BCH_WRITE_submitted; + + bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); + +@@ -1606,8 +1614,8 @@ CLOSURE_CALLBACK(bch2_write) + BUG_ON(!op->write_point.v); + BUG_ON(bkey_eq(op->pos, POS_MAX)); + +- if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) +- op->flags |= BCH_WRITE_ALLOC_NOWAIT; ++ if (op->flags & BCH_WRITE_only_specified_devs) ++ op->flags |= BCH_WRITE_alloc_nowait; + + op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas); + op->start_time = local_clock(); +@@ -1628,13 +1636,14 @@ CLOSURE_CALLBACK(bch2_write) + goto err; + } + +- if (!(op->flags & BCH_WRITE_MOVE) && ++ if (!(op->flags & BCH_WRITE_move) && + !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { + op->error = -BCH_ERR_erofs_no_writes; + goto err; + } + +- this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); ++ if (!(op->flags & BCH_WRITE_move)) ++ this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); + bch2_increment_clock(c, bio_sectors(bio), WRITE); + + data_len = min_t(u64, bio->bi_iter.bi_size, +diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h +index b4626013abc8..02cca52be0bd 100644 +--- a/fs/bcachefs/io_write.h ++++ b/fs/bcachefs/io_write.h +@@ -23,21 +23,20 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, + void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op); + + #define BCH_WRITE_FLAGS() \ +- x(ALLOC_NOWAIT) \ +- x(CACHED) \ +- x(DATA_ENCODED) \ +- x(PAGES_STABLE) \ +- x(PAGES_OWNED) \ +- x(ONLY_SPECIFIED_DEVS) \ +- x(WROTE_DATA_INLINE) \ +- x(FROM_INTERNAL) \ +- x(CHECK_ENOSPC) \ +- x(SYNC) \ +- x(MOVE) \ +- x(IN_WORKER) \ +- x(SUBMITTED) \ +- x(IO_ERROR) \ +- x(CONVERT_UNWRITTEN) ++ x(alloc_nowait) \ ++ x(cached) \ ++ x(data_encoded) \ ++ x(pages_stable) \ ++ x(pages_owned) \ ++ x(only_specified_devs) \ ++ x(wrote_data_inline) \ ++ x(check_enospc) \ ++ x(sync) \ ++ x(move) \ ++ x(in_worker) \ ++ x(submitted) \ ++ x(io_error) \ ++ x(convert_unwritten) + + enum __bch_write_flags { + #define x(f) __BCH_WRITE_##f, +diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h +index 6e878a6f2f0b..3ef6df9145ef 100644 +--- a/fs/bcachefs/io_write_types.h ++++ b/fs/bcachefs/io_write_types.h +@@ -64,7 +64,7 @@ struct bch_write_op { + struct bpos pos; + struct bversion version; + +- /* For BCH_WRITE_DATA_ENCODED: */ ++ /* For BCH_WRITE_data_encoded: */ + struct bch_extent_crc_unpacked crc; + + struct write_point_specifier write_point; +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 24c294d4634e..ea96605cf162 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -56,11 +56,18 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 + prt_printf(out, "seq:\t%llu\n", seq); + printbuf_indent_add(out, 2); + +- prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i)); ++ if (!buf->write_started) ++ prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i & JOURNAL_STATE_BUF_MASK)); + +- prt_printf(out, "size:\t"); +- prt_human_readable_u64(out, vstruct_bytes(buf->data)); +- prt_newline(out); ++ struct closure *cl = &buf->io; ++ int r = atomic_read(&cl->remaining); ++ prt_printf(out, "io:\t%pS r %i\n", cl->fn, r & CLOSURE_REMAINING_MASK); ++ ++ if (buf->data) { ++ prt_printf(out, "size:\t"); ++ prt_human_readable_u64(out, vstruct_bytes(buf->data)); ++ prt_newline(out); ++ } + + prt_printf(out, "expires:\t"); + prt_printf(out, "%li jiffies\n", buf->expires - jiffies); +@@ -87,6 +94,9 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 + + static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) + { ++ lockdep_assert_held(&j->lock); ++ out->atomic++; ++ + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 24); + +@@ -95,6 +105,8 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) + seq++) + bch2_journal_buf_to_text(out, j, seq); + prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed"); ++ ++ --out->atomic; + } + + static inline struct journal_buf * +@@ -104,10 +116,8 @@ journal_seq_to_buf(struct journal *j, u64 seq) + + EBUG_ON(seq > journal_cur_seq(j)); + +- if (journal_seq_unwritten(j, seq)) { ++ if (journal_seq_unwritten(j, seq)) + buf = j->buf + (seq & JOURNAL_BUF_MASK); +- EBUG_ON(le64_to_cpu(buf->data->seq) != seq); +- } + return buf; + } + +@@ -195,7 +205,8 @@ void bch2_journal_do_writes(struct journal *j) + if (w->write_started) + continue; + +- if (!journal_state_count(j->reservations, idx)) { ++ if (!journal_state_seq_count(j, j->reservations, seq)) { ++ j->seq_write_started = seq; + w->write_started = true; + closure_call(&w->io, bch2_journal_write, j->wq, NULL); + } +@@ -306,7 +317,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t + + bch2_journal_space_available(j); + +- __bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq)); ++ __bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq)); + } + + void bch2_journal_halt(struct journal *j) +@@ -391,6 +402,9 @@ static int journal_entry_open(struct journal *j) + if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) + return JOURNAL_ERR_max_in_flight; + ++ if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR) ++ return JOURNAL_ERR_max_open; ++ + if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) { + bch_err(c, "cannot start: journal seq overflow"); + if (bch2_fs_emergency_read_only_locked(c)) +@@ -398,8 +412,16 @@ static int journal_entry_open(struct journal *j) + return JOURNAL_ERR_insufficient_devices; /* -EROFS */ + } + ++ if (!j->free_buf && !buf->data) ++ return JOURNAL_ERR_enomem; /* will retry after write completion frees up a buf */ ++ + BUG_ON(!j->cur_entry_sectors); + ++ if (!buf->data) { ++ swap(buf->data, j->free_buf); ++ swap(buf->buf_size, j->free_buf_size); ++ } ++ + buf->expires = + (journal_cur_seq(j) == j->flushed_seq_ondisk + ? jiffies +@@ -464,7 +486,7 @@ static int journal_entry_open(struct journal *j) + + new.idx++; + BUG_ON(journal_state_count(new, new.idx)); +- BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK)); ++ BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_STATE_BUF_MASK)); + + journal_state_inc(&new); + +@@ -514,6 +536,33 @@ static void journal_write_work(struct work_struct *work) + spin_unlock(&j->lock); + } + ++static void journal_buf_prealloc(struct journal *j) ++{ ++ if (j->free_buf && ++ j->free_buf_size >= j->buf_size_want) ++ return; ++ ++ unsigned buf_size = j->buf_size_want; ++ ++ spin_unlock(&j->lock); ++ void *buf = kvmalloc(buf_size, GFP_NOFS); ++ spin_lock(&j->lock); ++ ++ if (buf && ++ (!j->free_buf || ++ buf_size > j->free_buf_size)) { ++ swap(buf, j->free_buf); ++ swap(buf_size, j->free_buf_size); ++ } ++ ++ if (unlikely(buf)) { ++ spin_unlock(&j->lock); ++ /* kvfree can sleep */ ++ kvfree(buf); ++ spin_lock(&j->lock); ++ } ++} ++ + static int __journal_res_get(struct journal *j, struct journal_res *res, + unsigned flags) + { +@@ -544,6 +593,8 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, + + spin_lock(&j->lock); + ++ journal_buf_prealloc(j); ++ + /* + * Recheck after taking the lock, so we don't race with another thread + * that just did journal_entry_open() and call bch2_journal_entry_close() +@@ -571,20 +622,43 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, + can_discard = j->can_discard; + spin_unlock(&j->lock); + out: ++ if (likely(!ret)) ++ return 0; + if (ret == JOURNAL_ERR_retry) + goto retry; +- if (!ret) +- return 0; + + if (journal_error_check_stuck(j, ret, flags)) + ret = -BCH_ERR_journal_res_get_blocked; + + if (ret == JOURNAL_ERR_max_in_flight && +- track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) { ++ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) && ++ trace_journal_entry_full_enabled()) { ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_printbuf_make_room(&buf, 4096); + ++ spin_lock(&j->lock); ++ prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); ++ bch2_journal_bufs_to_text(&buf, j); ++ spin_unlock(&j->lock); ++ ++ trace_journal_entry_full(c, buf.buf); ++ printbuf_exit(&buf); ++ count_event(c, journal_entry_full); ++ } ++ ++ if (ret == JOURNAL_ERR_max_open && ++ track_event_change(&c->times[BCH_TIME_blocked_journal_max_open], true) && ++ trace_journal_entry_full_enabled()) { + struct printbuf buf = PRINTBUF; ++ ++ bch2_printbuf_make_room(&buf, 4096); ++ ++ spin_lock(&j->lock); + prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); + bch2_journal_bufs_to_text(&buf, j); ++ spin_unlock(&j->lock); ++ + trace_journal_entry_full(c, buf.buf); + printbuf_exit(&buf); + count_event(c, journal_entry_full); +@@ -951,7 +1025,8 @@ static void __bch2_journal_block(struct journal *j) + new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL; + } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); + +- journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); ++ if (old.cur_entry_offset < JOURNAL_ENTRY_BLOCKED_VAL) ++ journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); + } + } + +@@ -992,7 +1067,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou + *blocked = true; + } + +- ret = journal_state_count(s, idx) > open ++ ret = journal_state_count(s, idx & JOURNAL_STATE_BUF_MASK) > open + ? ERR_PTR(-EAGAIN) + : buf; + break; +@@ -1342,6 +1417,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) + j->replay_journal_seq_end = cur_seq; + j->last_seq_ondisk = last_seq; + j->flushed_seq_ondisk = cur_seq - 1; ++ j->seq_write_started = cur_seq - 1; + j->seq_ondisk = cur_seq - 1; + j->pin.front = last_seq; + j->pin.back = cur_seq; +@@ -1382,8 +1458,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) + set_bit(JOURNAL_running, &j->flags); + j->last_flush_write = jiffies; + +- j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); +- j->reservations.unwritten_idx++; ++ j->reservations.idx = journal_cur_seq(j); + + c->last_bucket_seq_cleanup = journal_cur_seq(j); + +@@ -1475,6 +1550,7 @@ void bch2_fs_journal_exit(struct journal *j) + + for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) + kvfree(j->buf[i].data); ++ kvfree(j->free_buf); + free_fifo(&j->pin); + } + +@@ -1501,13 +1577,13 @@ int bch2_fs_journal_init(struct journal *j) + if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) + return -BCH_ERR_ENOMEM_journal_pin_fifo; + +- for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) { +- j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; +- j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL); +- if (!j->buf[i].data) +- return -BCH_ERR_ENOMEM_journal_buf; ++ j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN; ++ j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL); ++ if (!j->free_buf) ++ return -BCH_ERR_ENOMEM_journal_buf; ++ ++ for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) + j->buf[i].idx = i; +- } + + j->pin.front = j->pin.back = 1; + +@@ -1557,6 +1633,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + prt_printf(out, "average write size:\t"); + prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0); + prt_newline(out); ++ prt_printf(out, "free buf:\t%u\n", j->free_buf ? j->free_buf_size : 0); + prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); + prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); + prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked); +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index 107f7f901cd9..1c460ded2a11 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -121,11 +121,6 @@ static inline void journal_wake(struct journal *j) + closure_wake_up(&j->async_wait); + } + +-static inline struct journal_buf *journal_cur_buf(struct journal *j) +-{ +- return j->buf + j->reservations.idx; +-} +- + /* Sequence number of oldest dirty journal entry */ + + static inline u64 journal_last_seq(struct journal *j) +@@ -143,6 +138,15 @@ static inline u64 journal_last_unwritten_seq(struct journal *j) + return j->seq_ondisk + 1; + } + ++static inline struct journal_buf *journal_cur_buf(struct journal *j) ++{ ++ unsigned idx = (journal_cur_seq(j) & ++ JOURNAL_BUF_MASK & ++ ~JOURNAL_STATE_BUF_MASK) + j->reservations.idx; ++ ++ return j->buf + idx; ++} ++ + static inline int journal_state_count(union journal_res_state s, int idx) + { + switch (idx) { +@@ -154,6 +158,15 @@ static inline int journal_state_count(union journal_res_state s, int idx) + BUG(); + } + ++static inline int journal_state_seq_count(struct journal *j, ++ union journal_res_state s, u64 seq) ++{ ++ if (journal_cur_seq(j) - seq <= JOURNAL_STATE_BUF_NR) ++ return journal_state_count(s, seq & JOURNAL_STATE_BUF_MASK); ++ else ++ return 0; ++} ++ + static inline void journal_state_inc(union journal_res_state *s) + { + s->buf0_count += s->idx == 0; +@@ -193,7 +206,7 @@ bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) + static inline struct jset_entry * + journal_res_entry(struct journal *j, struct journal_res *res) + { +- return vstruct_idx(j->buf[res->idx].data, res->offset); ++ return vstruct_idx(j->buf[res->seq & JOURNAL_BUF_MASK].data, res->offset); + } + + static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type, +@@ -267,8 +280,9 @@ bool bch2_journal_entry_close(struct journal *); + void bch2_journal_do_writes(struct journal *); + void bch2_journal_buf_put_final(struct journal *, u64); + +-static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) ++static inline void __bch2_journal_buf_put(struct journal *j, u64 seq) + { ++ unsigned idx = seq & JOURNAL_STATE_BUF_MASK; + union journal_res_state s; + + s = journal_state_buf_put(j, idx); +@@ -276,8 +290,9 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s + bch2_journal_buf_put_final(j, seq); + } + +-static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) ++static inline void bch2_journal_buf_put(struct journal *j, u64 seq) + { ++ unsigned idx = seq & JOURNAL_STATE_BUF_MASK; + union journal_res_state s; + + s = journal_state_buf_put(j, idx); +@@ -306,7 +321,7 @@ static inline void bch2_journal_res_put(struct journal *j, + BCH_JSET_ENTRY_btree_keys, + 0, 0, 0); + +- bch2_journal_buf_put(j, res->idx, res->seq); ++ bch2_journal_buf_put(j, res->seq); + + res->ref = 0; + } +@@ -361,9 +376,9 @@ static inline int journal_res_get_fast(struct journal *j, + &old.v, new.v)); + + res->ref = true; +- res->idx = old.idx; + res->offset = old.cur_entry_offset; +- res->seq = le64_to_cpu(j->buf[old.idx].data->seq); ++ res->seq = journal_cur_seq(j); ++ res->seq -= (res->seq - old.idx) & JOURNAL_STATE_BUF_MASK; + return 1; + } + +@@ -390,6 +405,7 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re + (flags & JOURNAL_RES_GET_NONBLOCK) != 0, + NULL, _THIS_IP_); + EBUG_ON(!res->ref); ++ BUG_ON(!res->seq); + } + return 0; + } +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index 11c39e0c34f4..61f71e7baff2 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -1611,7 +1611,6 @@ static CLOSURE_CALLBACK(journal_write_done) + struct journal *j = container_of(w, struct journal, buf[w->idx]); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_replicas_padded replicas; +- union journal_res_state old, new; + u64 seq = le64_to_cpu(w->data->seq); + int err = 0; + +@@ -1641,6 +1640,21 @@ static CLOSURE_CALLBACK(journal_write_done) + j->err_seq = seq; + w->write_done = true; + ++ if (!j->free_buf || j->free_buf_size < w->buf_size) { ++ swap(j->free_buf, w->data); ++ swap(j->free_buf_size, w->buf_size); ++ } ++ ++ if (w->data) { ++ void *buf = w->data; ++ w->data = NULL; ++ w->buf_size = 0; ++ ++ spin_unlock(&j->lock); ++ kvfree(buf); ++ spin_lock(&j->lock); ++ } ++ + bool completed = false; + + for (seq = journal_last_unwritten_seq(j); +@@ -1650,7 +1664,7 @@ static CLOSURE_CALLBACK(journal_write_done) + if (!w->write_done) + break; + +- if (!j->err_seq && !JSET_NO_FLUSH(w->data)) { ++ if (!j->err_seq && !w->noflush) { + j->flushed_seq_ondisk = seq; + j->last_seq_ondisk = w->last_seq; + +@@ -1671,16 +1685,6 @@ static CLOSURE_CALLBACK(journal_write_done) + if (j->watermark != BCH_WATERMARK_stripe) + journal_reclaim_kick(&c->journal); + +- old.v = atomic64_read(&j->reservations.counter); +- do { +- new.v = old.v; +- BUG_ON(journal_state_count(new, new.unwritten_idx)); +- BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); +- +- new.unwritten_idx++; +- } while (!atomic64_try_cmpxchg(&j->reservations.counter, +- &old.v, new.v)); +- + closure_wake_up(&w->wait); + completed = true; + } +@@ -1695,7 +1699,7 @@ static CLOSURE_CALLBACK(journal_write_done) + } + + if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && +- new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { ++ j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { + struct journal_buf *buf = journal_cur_buf(j); + long delta = buf->expires - jiffies; + +diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c +index 1f25c111c54c..e463d2d95359 100644 +--- a/fs/bcachefs/journal_seq_blacklist.c ++++ b/fs/bcachefs/journal_seq_blacklist.c +@@ -231,15 +231,14 @@ bool bch2_blacklist_entries_gc(struct bch_fs *c) + struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; + BUG_ON(nr != t->nr); + +- unsigned i; +- for (src = bl->start, i = t->nr == 0 ? 0 : eytzinger0_first(t->nr); +- src < bl->start + nr; +- src++, i = eytzinger0_next(i, nr)) { ++ src = bl->start; ++ eytzinger0_for_each(i, nr) { + BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); + BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); + + if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk) + *dst++ = *src; ++ src++; + } + + unsigned new_nr = dst - bl->start; +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index a198a81d7478..060ec991dd2b 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -12,7 +12,11 @@ + /* btree write buffer steals 8 bits for its own purposes: */ + #define JOURNAL_SEQ_MAX ((1ULL << 56) - 1) + +-#define JOURNAL_BUF_BITS 2 ++#define JOURNAL_STATE_BUF_BITS 2 ++#define JOURNAL_STATE_BUF_NR (1U << JOURNAL_STATE_BUF_BITS) ++#define JOURNAL_STATE_BUF_MASK (JOURNAL_STATE_BUF_NR - 1) ++ ++#define JOURNAL_BUF_BITS 4 + #define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) + #define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) + +@@ -79,7 +83,6 @@ struct journal_entry_pin { + + struct journal_res { + bool ref; +- u8 idx; + u16 u64s; + u32 offset; + u64 seq; +@@ -95,9 +98,8 @@ union journal_res_state { + }; + + struct { +- u64 cur_entry_offset:20, ++ u64 cur_entry_offset:22, + idx:2, +- unwritten_idx:2, + buf0_count:10, + buf1_count:10, + buf2_count:10, +@@ -107,13 +109,13 @@ union journal_res_state { + + /* bytes: */ + #define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ +-#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ ++#define JOURNAL_ENTRY_SIZE_MAX (4U << 22) /* 16M */ + + /* + * We stash some journal state as sentinal values in cur_entry_offset: + * note - cur_entry_offset is in units of u64s + */ +-#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) ++#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 22) - 1) + + #define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2) + #define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) +@@ -152,9 +154,11 @@ enum journal_flags { + x(retry) \ + x(blocked) \ + x(max_in_flight) \ ++ x(max_open) \ + x(journal_full) \ + x(journal_pin_full) \ + x(journal_stuck) \ ++ x(enomem) \ + x(insufficient_devices) + + enum journal_errors { +@@ -217,6 +221,8 @@ struct journal { + * other is possibly being written out. + */ + struct journal_buf buf[JOURNAL_BUF_NR]; ++ void *free_buf; ++ unsigned free_buf_size; + + spinlock_t lock; + +@@ -234,6 +240,7 @@ struct journal { + /* Sequence number of most recent journal entry (last entry in @pin) */ + atomic64_t seq; + ++ u64 seq_write_started; + /* seq, last_seq from the most recent journal entry successfully written */ + u64 seq_ondisk; + u64 flushed_seq_ondisk; +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +index ddc187fb693d..57ad662871ba 100644 +--- a/fs/bcachefs/migrate.c ++++ b/fs/bcachefs/migrate.c +@@ -15,6 +15,7 @@ + #include "keylist.h" + #include "migrate.h" + #include "move.h" ++#include "progress.h" + #include "replicas.h" + #include "super-io.h" + +@@ -76,7 +77,9 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, + return 0; + } + +-static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) ++static int bch2_dev_usrdata_drop(struct bch_fs *c, ++ struct progress_indicator_state *progress, ++ unsigned dev_idx, int flags) + { + struct btree_trans *trans = bch2_trans_get(c); + enum btree_id id; +@@ -88,8 +91,10 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) + + ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, +- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, +- bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags)); ++ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ ++ bch2_progress_update_iter(trans, progress, &iter, "dropping user data"); ++ bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags); ++ })); + if (ret) + break; + } +@@ -99,7 +104,9 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) + return ret; + } + +-static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) ++static int bch2_dev_metadata_drop(struct bch_fs *c, ++ struct progress_indicator_state *progress, ++ unsigned dev_idx, int flags) + { + struct btree_trans *trans; + struct btree_iter iter; +@@ -125,6 +132,8 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) + while (bch2_trans_begin(trans), + (b = bch2_btree_iter_peek_node(&iter)) && + !(ret = PTR_ERR_OR_ZERO(b))) { ++ bch2_progress_update_iter(trans, progress, &iter, "dropping metadata"); ++ + if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx)) + goto next; + +@@ -169,6 +178,11 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) + + int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) + { +- return bch2_dev_usrdata_drop(c, dev_idx, flags) ?: +- bch2_dev_metadata_drop(c, dev_idx, flags); ++ struct progress_indicator_state progress; ++ bch2_progress_init(&progress, c, ++ BIT_ULL(BTREE_ID_extents)| ++ BIT_ULL(BTREE_ID_reflink)); ++ ++ return bch2_dev_usrdata_drop(c, &progress, dev_idx, flags) ?: ++ bch2_dev_metadata_drop(c, &progress, dev_idx, flags); + } +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index c493ea625553..e0e10deaea73 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -38,28 +38,28 @@ const char * const bch2_data_ops_strs[] = { + NULL + }; + +-static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k, ++static void trace_io_move2(struct bch_fs *c, struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) + { +- if (trace_move_extent_enabled()) { ++ if (trace_io_move_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); +- trace_move_extent(c, buf.buf); ++ trace_io_move(c, buf.buf); + printbuf_exit(&buf); + } + } + +-static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k) ++static void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k) + { +- if (trace_move_extent_read_enabled()) { ++ if (trace_io_move_read_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); +- trace_move_extent_read(c, buf.buf); ++ trace_io_move_read(c, buf.buf); + printbuf_exit(&buf); + } + } +@@ -74,11 +74,7 @@ struct moving_io { + unsigned read_sectors; + unsigned write_sectors; + +- struct bch_read_bio rbio; +- + struct data_update write; +- /* Must be last since it is variable size */ +- struct bio_vec bi_inline_vecs[]; + }; + + static void move_free(struct moving_io *io) +@@ -88,13 +84,17 @@ static void move_free(struct moving_io *io) + if (io->b) + atomic_dec(&io->b->count); + +- bch2_data_update_exit(&io->write); +- + mutex_lock(&ctxt->lock); + list_del(&io->io_list); + wake_up(&ctxt->wait); + mutex_unlock(&ctxt->lock); + ++ if (!io->write.data_opts.scrub) { ++ bch2_data_update_exit(&io->write); ++ } else { ++ bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio); ++ kfree(io->write.bvecs); ++ } + kfree(io); + } + +@@ -114,17 +114,30 @@ static void move_write_done(struct bch_write_op *op) + + static void move_write(struct moving_io *io) + { +- if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { ++ struct moving_context *ctxt = io->write.ctxt; ++ ++ if (ctxt->stats) { ++ if (io->write.rbio.bio.bi_status) ++ atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, ++ &ctxt->stats->sectors_error_uncorrected); ++ else if (io->write.rbio.saw_error) ++ atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, ++ &ctxt->stats->sectors_error_corrected); ++ } ++ ++ if (unlikely(io->write.rbio.bio.bi_status || ++ io->write.rbio.hole || ++ io->write.data_opts.scrub)) { + move_free(io); + return; + } + +- if (trace_move_extent_write_enabled()) { ++ if (trace_io_move_write_enabled()) { + struct bch_fs *c = io->write.op.c; + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k)); +- trace_move_extent_write(c, buf.buf); ++ trace_io_move_write(c, buf.buf); + printbuf_exit(&buf); + } + +@@ -132,7 +145,7 @@ static void move_write(struct moving_io *io) + atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); + atomic_inc(&io->write.ctxt->write_ios); + +- bch2_data_update_read_done(&io->write, io->rbio.pick.crc); ++ bch2_data_update_read_done(&io->write); + } + + struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt) +@@ -145,7 +158,7 @@ struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctx + + static void move_read_endio(struct bio *bio) + { +- struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); ++ struct moving_io *io = container_of(bio, struct moving_io, write.rbio.bio); + struct moving_context *ctxt = io->write.ctxt; + + atomic_sub(io->read_sectors, &ctxt->read_sectors); +@@ -258,14 +271,10 @@ int bch2_move_extent(struct moving_context *ctxt, + { + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- struct moving_io *io; +- const union bch_extent_entry *entry; +- struct extent_ptr_decoded p; +- unsigned sectors = k.k->size, pages; + int ret = -ENOMEM; + +- trace_move_extent2(c, k, &io_opts, &data_opts); ++ trace_io_move2(c, k, &io_opts, &data_opts); ++ this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); + + if (ctxt->stats) + ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos); +@@ -273,7 +282,8 @@ int bch2_move_extent(struct moving_context *ctxt, + bch2_data_update_opts_normalize(k, &data_opts); + + if (!data_opts.rewrite_ptrs && +- !data_opts.extra_replicas) { ++ !data_opts.extra_replicas && ++ !data_opts.scrub) { + if (data_opts.kill_ptrs) + return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts); + return 0; +@@ -285,13 +295,7 @@ int bch2_move_extent(struct moving_context *ctxt, + */ + bch2_trans_unlock(trans); + +- /* write path might have to decompress data: */ +- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) +- sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); +- +- pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); +- io = kzalloc(sizeof(struct moving_io) + +- sizeof(struct bio_vec) * pages, GFP_KERNEL); ++ struct moving_io *io = kzalloc(sizeof(struct moving_io), GFP_KERNEL); + if (!io) + goto err; + +@@ -300,31 +304,27 @@ int bch2_move_extent(struct moving_context *ctxt, + io->read_sectors = k.k->size; + io->write_sectors = k.k->size; + +- bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0); +- bio_set_prio(&io->write.op.wbio.bio, +- IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); +- +- if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, +- GFP_KERNEL)) +- goto err_free; ++ if (!data_opts.scrub) { ++ ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, ++ &io_opts, data_opts, iter->btree_id, k); ++ if (ret) ++ goto err_free; + +- io->rbio.c = c; +- io->rbio.opts = io_opts; +- bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0); +- io->rbio.bio.bi_vcnt = pages; +- bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); +- io->rbio.bio.bi_iter.bi_size = sectors << 9; ++ io->write.op.end_io = move_write_done; ++ } else { ++ bch2_bkey_buf_init(&io->write.k); ++ bch2_bkey_buf_reassemble(&io->write.k, c, k); + +- io->rbio.bio.bi_opf = REQ_OP_READ; +- io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); +- io->rbio.bio.bi_end_io = move_read_endio; ++ io->write.op.c = c; ++ io->write.data_opts = data_opts; + +- ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, +- io_opts, data_opts, iter->btree_id, k); +- if (ret) +- goto err_free_pages; ++ ret = bch2_data_update_bios_init(&io->write, c, &io_opts); ++ if (ret) ++ goto err_free; ++ } + +- io->write.op.end_io = move_write_done; ++ io->write.rbio.bio.bi_end_io = move_read_endio; ++ io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); + + if (ctxt->rate) + bch2_ratelimit_increment(ctxt->rate, k.k->size); +@@ -339,9 +339,7 @@ int bch2_move_extent(struct moving_context *ctxt, + atomic_inc(&io->b->count); + } + +- this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); +- this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size); +- trace_move_extent_read2(c, k); ++ trace_io_move_read2(c, k); + + mutex_lock(&ctxt->lock); + atomic_add(io->read_sectors, &ctxt->read_sectors); +@@ -356,33 +354,34 @@ int bch2_move_extent(struct moving_context *ctxt, + * ctxt when doing wakeup + */ + closure_get(&ctxt->cl); +- bch2_read_extent(trans, &io->rbio, +- bkey_start_pos(k.k), +- iter->btree_id, k, 0, +- BCH_READ_NODECODE| +- BCH_READ_LAST_FRAGMENT); ++ __bch2_read_extent(trans, &io->write.rbio, ++ io->write.rbio.bio.bi_iter, ++ bkey_start_pos(k.k), ++ iter->btree_id, k, 0, ++ NULL, ++ BCH_READ_data_update| ++ BCH_READ_last_fragment, ++ data_opts.scrub ? data_opts.read_dev : -1); + return 0; +-err_free_pages: +- bio_free_pages(&io->write.op.wbio.bio); + err_free: + kfree(io); + err: +- if (ret == -BCH_ERR_data_update_done) ++ if (bch2_err_matches(ret, BCH_ERR_data_update_done)) + return 0; + + if (bch2_err_matches(ret, EROFS) || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + +- count_event(c, move_extent_start_fail); ++ count_event(c, io_move_start_fail); + +- if (trace_move_extent_start_fail_enabled()) { ++ if (trace_io_move_start_fail_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, ": "); + prt_str(&buf, bch2_err_str(ret)); +- trace_move_extent_start_fail(c, buf.buf); ++ trace_io_move_start_fail(c, buf.buf); + printbuf_exit(&buf); + } + return ret; +@@ -627,7 +626,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, + if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) + continue; + +- if (ret2 == -ENOMEM) { ++ if (bch2_err_matches(ret2, ENOMEM)) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(ctxt); + continue; +@@ -689,21 +688,22 @@ int bch2_move_data(struct bch_fs *c, + bool wait_on_copygc, + move_pred_fn pred, void *arg) + { +- + struct moving_context ctxt; +- int ret; + + bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); +- ret = __bch2_move_data(&ctxt, start, end, pred, arg); ++ int ret = __bch2_move_data(&ctxt, start, end, pred, arg); + bch2_moving_ctxt_exit(&ctxt); + + return ret; + } + +-int bch2_evacuate_bucket(struct moving_context *ctxt, +- struct move_bucket_in_flight *bucket_in_flight, +- struct bpos bucket, int gen, +- struct data_update_opts _data_opts) ++static int __bch2_move_data_phys(struct moving_context *ctxt, ++ struct move_bucket_in_flight *bucket_in_flight, ++ unsigned dev, ++ u64 bucket_start, ++ u64 bucket_end, ++ unsigned data_types, ++ move_pred_fn pred, void *arg) + { + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; +@@ -712,16 +712,20 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, + struct btree_iter iter = {}, bp_iter = {}; + struct bkey_buf sk; + struct bkey_s_c k; +- struct data_update_opts data_opts; + unsigned sectors_moved = 0; + struct bkey_buf last_flushed; + int ret = 0; + +- struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); ++ struct bch_dev *ca = bch2_dev_tryget(c, dev); + if (!ca) + return 0; + +- trace_bucket_evacuate(c, &bucket); ++ bucket_end = min(bucket_end, ca->mi.nbuckets); ++ ++ struct bpos bp_start = bucket_pos_to_bp_start(ca, POS(dev, bucket_start)); ++ struct bpos bp_end = bucket_pos_to_bp_end(ca, POS(dev, bucket_end)); ++ bch2_dev_put(ca); ++ ca = NULL; + + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); +@@ -732,8 +736,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, + */ + bch2_trans_begin(trans); + +- bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, +- bucket_pos_to_bp_start(ca, bucket), 0); ++ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0); + + bch_err_msg(c, ret, "looking up alloc key"); + if (ret) +@@ -757,7 +760,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, + if (ret) + goto err; + +- if (!k.k || bkey_gt(k.k->p, bucket_pos_to_bp_end(ca, bucket))) ++ if (!k.k || bkey_gt(k.k->p, bp_end)) + break; + + if (k.k->type != KEY_TYPE_backpointer) +@@ -765,107 +768,146 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, + + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + +- if (!bp.v->level) { +- k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); +- ret = bkey_err(k); +- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) +- continue; +- if (ret) +- goto err; +- if (!k.k) +- goto next; ++ if (ctxt->stats) ++ ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; + +- bch2_bkey_buf_reassemble(&sk, c, k); +- k = bkey_i_to_s_c(sk.k); ++ if (!(data_types & BIT(bp.v->data_type))) ++ goto next; + ++ k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); ++ ret = bkey_err(k); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ continue; ++ if (ret) ++ goto err; ++ if (!k.k) ++ goto next; ++ ++ if (!bp.v->level) { + ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k); + if (ret) { + bch2_trans_iter_exit(trans, &iter); + continue; + } ++ } + +- data_opts = _data_opts; +- data_opts.target = io_opts.background_target; +- data_opts.rewrite_ptrs = 0; +- +- unsigned sectors = bp.v->bucket_len; /* move_extent will drop locks */ +- unsigned i = 0; +- const union bch_extent_entry *entry; +- struct extent_ptr_decoded p; +- bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { +- if (p.ptr.dev == bucket.inode) { +- if (p.ptr.cached) { +- bch2_trans_iter_exit(trans, &iter); +- goto next; +- } +- data_opts.rewrite_ptrs |= 1U << i; +- break; +- } +- i++; +- } +- +- ret = bch2_move_extent(ctxt, bucket_in_flight, +- &iter, k, io_opts, data_opts); ++ struct data_update_opts data_opts = {}; ++ if (!pred(c, arg, k, &io_opts, &data_opts)) { + bch2_trans_iter_exit(trans, &iter); ++ goto next; ++ } + +- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) +- continue; +- if (ret == -ENOMEM) { +- /* memory allocation failure, wait for some IO to finish */ +- bch2_move_ctxt_wait_for_io(ctxt); +- continue; +- } +- if (ret) +- goto err; ++ if (data_opts.scrub && ++ !bch2_dev_idx_is_online(c, data_opts.read_dev)) { ++ bch2_trans_iter_exit(trans, &iter); ++ ret = -BCH_ERR_device_offline; ++ break; ++ } + +- if (ctxt->stats) +- atomic64_add(sectors, &ctxt->stats->sectors_seen); +- sectors_moved += sectors; +- } else { +- struct btree *b; ++ bch2_bkey_buf_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); + +- b = bch2_backpointer_get_node(trans, bp, &iter, &last_flushed); +- ret = PTR_ERR_OR_ZERO(b); +- if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) +- goto next; +- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) +- continue; +- if (ret) +- goto err; +- if (!b) +- goto next; ++ /* move_extent will drop locks */ ++ unsigned sectors = bp.v->bucket_len; + +- unsigned sectors = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); ++ if (!bp.v->level) ++ ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts); ++ else if (!data_opts.scrub) ++ ret = bch2_btree_node_rewrite_key(trans, bp.v->btree_id, bp.v->level, k.k->p, 0); ++ else ++ ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev); + +- ret = bch2_btree_node_rewrite(trans, &iter, b, 0); +- bch2_trans_iter_exit(trans, &iter); +- +- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) +- continue; +- if (ret) +- goto err; ++ bch2_trans_iter_exit(trans, &iter); + +- if (ctxt->rate) +- bch2_ratelimit_increment(ctxt->rate, sectors); +- if (ctxt->stats) { +- atomic64_add(sectors, &ctxt->stats->sectors_seen); +- atomic64_add(sectors, &ctxt->stats->sectors_moved); +- } +- sectors_moved += btree_sectors(c); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ continue; ++ if (ret == -ENOMEM) { ++ /* memory allocation failure, wait for some IO to finish */ ++ bch2_move_ctxt_wait_for_io(ctxt); ++ continue; + } ++ if (ret) ++ goto err; ++ ++ if (ctxt->stats) ++ atomic64_add(sectors, &ctxt->stats->sectors_seen); ++ sectors_moved += sectors; + next: + bch2_btree_iter_advance(&bp_iter); + } +- +- trace_evacuate_bucket(c, &bucket, sectors_moved, ca->mi.bucket_size, ret); + err: + bch2_trans_iter_exit(trans, &bp_iter); +- bch2_dev_put(ca); + bch2_bkey_buf_exit(&sk, c); + bch2_bkey_buf_exit(&last_flushed, c); + return ret; + } + ++static int bch2_move_data_phys(struct bch_fs *c, ++ unsigned dev, ++ u64 start, ++ u64 end, ++ unsigned data_types, ++ struct bch_ratelimit *rate, ++ struct bch_move_stats *stats, ++ struct write_point_specifier wp, ++ bool wait_on_copygc, ++ move_pred_fn pred, void *arg) ++{ ++ struct moving_context ctxt; ++ ++ bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans)); ++ ++ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); ++ ctxt.stats->phys = true; ++ ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys; ++ ++ int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, data_types, pred, arg); ++ bch2_moving_ctxt_exit(&ctxt); ++ ++ return ret; ++} ++ ++struct evacuate_bucket_arg { ++ struct bpos bucket; ++ int gen; ++ struct data_update_opts data_opts; ++}; ++ ++static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_update_opts *data_opts) ++{ ++ struct evacuate_bucket_arg *arg = _arg; ++ ++ *data_opts = arg->data_opts; ++ ++ unsigned i = 0; ++ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { ++ if (ptr->dev == arg->bucket.inode && ++ (arg->gen < 0 || arg->gen == ptr->gen) && ++ !ptr->cached) ++ data_opts->rewrite_ptrs |= BIT(i); ++ i++; ++ } ++ ++ return data_opts->rewrite_ptrs != 0; ++} ++ ++int bch2_evacuate_bucket(struct moving_context *ctxt, ++ struct move_bucket_in_flight *bucket_in_flight, ++ struct bpos bucket, int gen, ++ struct data_update_opts data_opts) ++{ ++ struct evacuate_bucket_arg arg = { bucket, gen, data_opts, }; ++ ++ return __bch2_move_data_phys(ctxt, bucket_in_flight, ++ bucket.inode, ++ bucket.offset, ++ bucket.offset + 1, ++ ~0, ++ evacuate_bucket_pred, &arg); ++} ++ + typedef bool (*move_btree_pred)(struct bch_fs *, void *, + struct btree *, struct bch_io_opts *, + struct data_update_opts *); +@@ -1007,14 +1049,6 @@ static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, + return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); + } + +-static bool migrate_btree_pred(struct bch_fs *c, void *arg, +- struct btree *b, +- struct bch_io_opts *io_opts, +- struct data_update_opts *data_opts) +-{ +- return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); +-} +- + /* + * Ancient versions of bcachefs produced packed formats which could represent + * keys that the in memory format cannot represent; this checks for those +@@ -1104,6 +1138,30 @@ static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, + return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); + } + ++static bool scrub_pred(struct bch_fs *c, void *_arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_update_opts *data_opts) ++{ ++ struct bch_ioctl_data *arg = _arg; ++ ++ if (k.k->type != KEY_TYPE_btree_ptr_v2) { ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (p.ptr.dev == arg->migrate.dev) { ++ if (!p.crc.csum_type) ++ return false; ++ break; ++ } ++ } ++ ++ data_opts->scrub = true; ++ data_opts->read_dev = arg->migrate.dev; ++ return true; ++} ++ + int bch2_data_job(struct bch_fs *c, + struct bch_move_stats *stats, + struct bch_ioctl_data op) +@@ -1118,6 +1176,22 @@ int bch2_data_job(struct bch_fs *c, + bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]); + + switch (op.op) { ++ case BCH_DATA_OP_scrub: ++ /* ++ * prevent tests from spuriously failing, make sure we see all ++ * btree nodes that need to be repaired ++ */ ++ bch2_btree_interior_updates_flush(c); ++ ++ ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX, ++ op.scrub.data_types, ++ NULL, ++ stats, ++ writepoint_hashed((unsigned long) current), ++ false, ++ scrub_pred, &op) ?: ret; ++ break; ++ + case BCH_DATA_OP_rereplicate: + stats->data_type = BCH_DATA_journal; + ret = bch2_journal_flush_device_pins(&c->journal, -1); +@@ -1137,14 +1211,14 @@ int bch2_data_job(struct bch_fs *c, + + stats->data_type = BCH_DATA_journal; + ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); +- ret = bch2_move_btree(c, start, end, +- migrate_btree_pred, &op, stats) ?: ret; +- ret = bch2_move_data(c, start, end, +- NULL, +- stats, +- writepoint_hashed((unsigned long) current), +- true, +- migrate_pred, &op) ?: ret; ++ ret = bch2_move_data_phys(c, op.migrate.dev, 0, U64_MAX, ++ ~0, ++ NULL, ++ stats, ++ writepoint_hashed((unsigned long) current), ++ true, ++ migrate_pred, &op) ?: ret; ++ bch2_btree_interior_updates_flush(c); + ret = bch2_replicas_gc2(c) ?: ret; + break; + case BCH_DATA_OP_rewrite_old_nodes: +@@ -1216,7 +1290,7 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str + + mutex_lock(&ctxt->lock); + list_for_each_entry(io, &ctxt->ios, io_list) +- bch2_write_op_to_text(out, &io->write.op); ++ bch2_data_update_inflight_to_text(out, &io->write); + mutex_unlock(&ctxt->lock); + + printbuf_indent_sub(out, 4); +diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h +index e22841ef31e4..82e473ed48d2 100644 +--- a/fs/bcachefs/move_types.h ++++ b/fs/bcachefs/move_types.h +@@ -3,17 +3,31 @@ + #define _BCACHEFS_MOVE_TYPES_H + + #include "bbpos_types.h" ++#include "bcachefs_ioctl.h" + + struct bch_move_stats { +- enum bch_data_type data_type; +- struct bbpos pos; + char name[32]; ++ bool phys; ++ enum bch_ioctl_data_event_ret ret; ++ ++ union { ++ struct { ++ enum bch_data_type data_type; ++ struct bbpos pos; ++ }; ++ struct { ++ unsigned dev; ++ u64 offset; ++ }; ++ }; + + atomic64_t keys_moved; + atomic64_t keys_raced; + atomic64_t sectors_seen; + atomic64_t sectors_moved; + atomic64_t sectors_raced; ++ atomic64_t sectors_error_corrected; ++ atomic64_t sectors_error_uncorrected; + }; + + struct move_bucket_key { +diff --git a/fs/bcachefs/progress.c b/fs/bcachefs/progress.c +new file mode 100644 +index 000000000000..bafd1c91a802 +--- /dev/null ++++ b/fs/bcachefs/progress.c +@@ -0,0 +1,63 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "bbpos.h" ++#include "disk_accounting.h" ++#include "progress.h" ++ ++void bch2_progress_init(struct progress_indicator_state *s, ++ struct bch_fs *c, ++ u64 btree_id_mask) ++{ ++ memset(s, 0, sizeof(*s)); ++ ++ s->next_print = jiffies + HZ * 10; ++ ++ for (unsigned i = 0; i < BTREE_ID_NR; i++) { ++ if (!(btree_id_mask & BIT_ULL(i))) ++ continue; ++ ++ struct disk_accounting_pos acc = { ++ .type = BCH_DISK_ACCOUNTING_btree, ++ .btree.id = i, ++ }; ++ ++ u64 v; ++ bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); ++ s->nodes_total += div64_ul(v, btree_sectors(c)); ++ } ++} ++ ++static inline bool progress_update_p(struct progress_indicator_state *s) ++{ ++ bool ret = time_after_eq(jiffies, s->next_print); ++ ++ if (ret) ++ s->next_print = jiffies + HZ * 10; ++ return ret; ++} ++ ++void bch2_progress_update_iter(struct btree_trans *trans, ++ struct progress_indicator_state *s, ++ struct btree_iter *iter, ++ const char *msg) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *b = path_l(btree_iter_path(trans, iter))->b; ++ ++ s->nodes_seen += b != s->last_node; ++ s->last_node = b; ++ ++ if (progress_update_p(s)) { ++ struct printbuf buf = PRINTBUF; ++ unsigned percent = s->nodes_total ++ ? div64_u64(s->nodes_seen * 100, s->nodes_total) ++ : 0; ++ ++ prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ", ++ msg, percent, s->nodes_seen, s->nodes_total); ++ bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos)); ++ ++ bch_info(c, "%s", buf.buf); ++ printbuf_exit(&buf); ++ } ++} +diff --git a/fs/bcachefs/progress.h b/fs/bcachefs/progress.h +new file mode 100644 +index 000000000000..23fb1811f943 +--- /dev/null ++++ b/fs/bcachefs/progress.h +@@ -0,0 +1,29 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_PROGRESS_H ++#define _BCACHEFS_PROGRESS_H ++ ++/* ++ * Lame progress indicators ++ * ++ * We don't like to use these because they print to the dmesg console, which is ++ * spammy - we much prefer to be wired up to a userspace programm (e.g. via ++ * thread_with_file) and have it print the progress indicator. ++ * ++ * But some code is old and doesn't support that, or runs in a context where ++ * that's not yet practical (mount). ++ */ ++ ++struct progress_indicator_state { ++ unsigned long next_print; ++ u64 nodes_seen; ++ u64 nodes_total; ++ struct btree *last_node; ++}; ++ ++void bch2_progress_init(struct progress_indicator_state *, struct bch_fs *, u64); ++void bch2_progress_update_iter(struct btree_trans *, ++ struct progress_indicator_state *, ++ struct btree_iter *, ++ const char *); ++ ++#endif /* _BCACHEFS_PROGRESS_H */ +diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c +index d0a1f5cd5c2b..58f6d97e506c 100644 +--- a/fs/bcachefs/rebalance.c ++++ b/fs/bcachefs/rebalance.c +@@ -341,7 +341,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, + memset(data_opts, 0, sizeof(*data_opts)); + data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); + data_opts->target = io_opts->background_target; +- data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; ++ data_opts->write_flags |= BCH_WRITE_only_specified_devs; + + if (!data_opts->rewrite_ptrs) { + /* +@@ -449,7 +449,7 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, + { + data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); + data_opts->target = io_opts->background_target; +- data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; ++ data_opts->write_flags |= BCH_WRITE_only_specified_devs; + return data_opts->rewrite_ptrs != 0; + } + +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 98825437381c..71c786cdb192 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -32,7 +32,6 @@ + #include + #include + +-#define QSTR(n) { { { .len = strlen(n) } }, .name = n } + + int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) + { +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 376fd0a6e868..33b656c01942 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -185,12 +185,21 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans, + BUG_ON(missing_start < refd_start); + BUG_ON(missing_end > refd_end); + +- if (fsck_err(trans, reflink_p_to_missing_reflink_v, +- "pointer to missing indirect extent\n" +- " %s\n" +- " missing range %llu-%llu", +- (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), +- missing_start, missing_end)) { ++ struct bpos missing_pos = bkey_start_pos(p.k); ++ missing_pos.offset += missing_start - live_start; ++ ++ prt_printf(&buf, "pointer to missing indirect extent in "); ++ ret = bch2_inum_snap_offset_err_msg_trans(trans, &buf, missing_pos); ++ if (ret) ++ goto err; ++ ++ prt_printf(&buf, "-%llu\n ", (missing_pos.offset + (missing_end - missing_start)) << 9); ++ bch2_bkey_val_to_text(&buf, c, p.s_c); ++ ++ prt_printf(&buf, "\n missing reflink btree range %llu-%llu", ++ missing_start, missing_end); ++ ++ if (fsck_err(trans, reflink_p_to_missing_reflink_v, "%s", buf.buf)) { + struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); + ret = PTR_ERR_OR_ZERO(new); + if (ret) +diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c +index 6992e7469112..2b4b8445d418 100644 +--- a/fs/bcachefs/sb-counters.c ++++ b/fs/bcachefs/sb-counters.c +@@ -5,7 +5,13 @@ + + /* BCH_SB_FIELD_counters */ + +-static const char * const bch2_counter_names[] = { ++static const u8 counters_to_stable_map[] = { ++#define x(n, id, ...) [BCH_COUNTER_##n] = BCH_COUNTER_STABLE_##n, ++ BCH_PERSISTENT_COUNTERS() ++#undef x ++}; ++ ++const char * const bch2_counter_names[] = { + #define x(t, n, ...) (#t), + BCH_PERSISTENT_COUNTERS() + #undef x +@@ -18,13 +24,13 @@ static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) + return 0; + + return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0]; +-}; ++} + + static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) + { + return 0; +-}; ++} + + static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +@@ -32,50 +38,56 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field_counters *ctrs = field_to_type(f, counters); + unsigned int nr = bch2_sb_counter_nr_entries(ctrs); + +- for (unsigned i = 0; i < nr; i++) +- prt_printf(out, "%s \t%llu\n", +- i < BCH_COUNTER_NR ? bch2_counter_names[i] : "(unknown)", +- le64_to_cpu(ctrs->d[i])); +-}; ++ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { ++ unsigned stable = counters_to_stable_map[i]; ++ if (stable < nr) ++ prt_printf(out, "%s \t%llu\n", ++ bch2_counter_names[i], ++ le64_to_cpu(ctrs->d[stable])); ++ } ++} + + int bch2_sb_counters_to_cpu(struct bch_fs *c) + { + struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); +- unsigned int i; + unsigned int nr = bch2_sb_counter_nr_entries(ctrs); +- u64 val = 0; + +- for (i = 0; i < BCH_COUNTER_NR; i++) ++ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) + c->counters_on_mount[i] = 0; + +- for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) { +- val = le64_to_cpu(ctrs->d[i]); +- percpu_u64_set(&c->counters[i], val); +- c->counters_on_mount[i] = val; ++ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { ++ unsigned stable = counters_to_stable_map[i]; ++ if (stable < nr) { ++ u64 v = le64_to_cpu(ctrs->d[stable]); ++ percpu_u64_set(&c->counters[i], v); ++ c->counters_on_mount[i] = v; ++ } + } ++ + return 0; +-}; ++} + + int bch2_sb_counters_from_cpu(struct bch_fs *c) + { + struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); + struct bch_sb_field_counters *ret; +- unsigned int i; + unsigned int nr = bch2_sb_counter_nr_entries(ctrs); + + if (nr < BCH_COUNTER_NR) { + ret = bch2_sb_field_resize(&c->disk_sb, counters, +- sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); +- ++ sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); + if (ret) { + ctrs = ret; + nr = bch2_sb_counter_nr_entries(ctrs); + } + } + ++ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { ++ unsigned stable = counters_to_stable_map[i]; ++ if (stable < nr) ++ ctrs->d[stable] = cpu_to_le64(percpu_u64_get(&c->counters[i])); ++ } + +- for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) +- ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i])); + return 0; + } + +@@ -97,3 +109,39 @@ const struct bch_sb_field_ops bch_sb_field_ops_counters = { + .validate = bch2_sb_counters_validate, + .to_text = bch2_sb_counters_to_text, + }; ++ ++#ifndef NO_BCACHEFS_CHARDEV ++long bch2_ioctl_query_counters(struct bch_fs *c, ++ struct bch_ioctl_query_counters __user *user_arg) ++{ ++ struct bch_ioctl_query_counters arg; ++ int ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)); ++ if (ret) ++ return ret; ++ ++ if ((arg.flags & ~BCH_IOCTL_QUERY_COUNTERS_MOUNT) || ++ arg.pad) ++ return -EINVAL; ++ ++ arg.nr = min(arg.nr, BCH_COUNTER_NR); ++ ret = put_user(arg.nr, &user_arg->nr); ++ if (ret) ++ return ret; ++ ++ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { ++ unsigned stable = counters_to_stable_map[i]; ++ ++ if (stable < arg.nr) { ++ u64 v = !(arg.flags & BCH_IOCTL_QUERY_COUNTERS_MOUNT) ++ ? percpu_u64_get(&c->counters[i]) ++ : c->counters_on_mount[i]; ++ ++ ret = put_user(v, &user_arg->d[stable]); ++ if (ret) ++ return ret; ++ } ++ } ++ ++ return 0; ++} ++#endif +diff --git a/fs/bcachefs/sb-counters.h b/fs/bcachefs/sb-counters.h +index 81f8aec9fcb1..a4329ad8dd1b 100644 +--- a/fs/bcachefs/sb-counters.h ++++ b/fs/bcachefs/sb-counters.h +@@ -11,6 +11,10 @@ int bch2_sb_counters_from_cpu(struct bch_fs *); + void bch2_fs_counters_exit(struct bch_fs *); + int bch2_fs_counters_init(struct bch_fs *); + ++extern const char * const bch2_counter_names[]; + extern const struct bch_sb_field_ops bch_sb_field_ops_counters; + ++long bch2_ioctl_query_counters(struct bch_fs *, ++ struct bch_ioctl_query_counters __user *); ++ + #endif // _BCACHEFS_SB_COUNTERS_H +diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h +index fdcf598f08b1..c82a891026d3 100644 +--- a/fs/bcachefs/sb-counters_format.h ++++ b/fs/bcachefs/sb-counters_format.h +@@ -9,10 +9,23 @@ enum counters_flags { + + #define BCH_PERSISTENT_COUNTERS() \ + x(io_read, 0, TYPE_SECTORS) \ ++ x(io_read_inline, 80, TYPE_SECTORS) \ ++ x(io_read_hole, 81, TYPE_SECTORS) \ ++ x(io_read_promote, 30, TYPE_COUNTER) \ ++ x(io_read_bounce, 31, TYPE_COUNTER) \ ++ x(io_read_split, 33, TYPE_COUNTER) \ ++ x(io_read_reuse_race, 34, TYPE_COUNTER) \ ++ x(io_read_retry, 32, TYPE_COUNTER) \ + x(io_write, 1, TYPE_SECTORS) \ + x(io_move, 2, TYPE_SECTORS) \ ++ x(io_move_read, 35, TYPE_SECTORS) \ ++ x(io_move_write, 36, TYPE_SECTORS) \ ++ x(io_move_finish, 37, TYPE_SECTORS) \ ++ x(io_move_fail, 38, TYPE_COUNTER) \ ++ x(io_move_start_fail, 39, TYPE_COUNTER) \ + x(bucket_invalidate, 3, TYPE_COUNTER) \ + x(bucket_discard, 4, TYPE_COUNTER) \ ++ x(bucket_discard_fast, 79, TYPE_COUNTER) \ + x(bucket_alloc, 5, TYPE_COUNTER) \ + x(bucket_alloc_fail, 6, TYPE_COUNTER) \ + x(btree_cache_scan, 7, TYPE_COUNTER) \ +@@ -38,16 +51,6 @@ enum counters_flags { + x(journal_reclaim_finish, 27, TYPE_COUNTER) \ + x(journal_reclaim_start, 28, TYPE_COUNTER) \ + x(journal_write, 29, TYPE_COUNTER) \ +- x(read_promote, 30, TYPE_COUNTER) \ +- x(read_bounce, 31, TYPE_COUNTER) \ +- x(read_split, 33, TYPE_COUNTER) \ +- x(read_retry, 32, TYPE_COUNTER) \ +- x(read_reuse_race, 34, TYPE_COUNTER) \ +- x(move_extent_read, 35, TYPE_SECTORS) \ +- x(move_extent_write, 36, TYPE_SECTORS) \ +- x(move_extent_finish, 37, TYPE_SECTORS) \ +- x(move_extent_fail, 38, TYPE_COUNTER) \ +- x(move_extent_start_fail, 39, TYPE_COUNTER) \ + x(copygc, 40, TYPE_COUNTER) \ + x(copygc_wait, 41, TYPE_COUNTER) \ + x(gc_gens_end, 42, TYPE_COUNTER) \ +@@ -95,6 +98,13 @@ enum bch_persistent_counters { + BCH_COUNTER_NR + }; + ++enum bch_persistent_counters_stable { ++#define x(t, n, ...) BCH_COUNTER_STABLE_##t = n, ++ BCH_PERSISTENT_COUNTERS() ++#undef x ++ BCH_COUNTER_STABLE_NR ++}; ++ + struct bch_sb_field_counters { + struct bch_sb_field field; + __le64 d[]; +diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h +index 762083b564ee..b29b6c6c21dd 100644 +--- a/fs/bcachefs/sb-members.h ++++ b/fs/bcachefs/sb-members.h +@@ -23,6 +23,18 @@ static inline bool bch2_dev_is_online(struct bch_dev *ca) + return !percpu_ref_is_zero(&ca->io_ref); + } + ++static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned); ++ ++static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev) ++{ ++ rcu_read_lock(); ++ struct bch_dev *ca = bch2_dev_rcu(c, dev); ++ bool ret = ca && bch2_dev_is_online(ca); ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ + static inline bool bch2_dev_is_readable(struct bch_dev *ca) + { + return bch2_dev_is_online(ca) && +diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c +index c54091a28909..e7f197896db1 100644 +--- a/fs/bcachefs/snapshot.c ++++ b/fs/bcachefs/snapshot.c +@@ -146,8 +146,9 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) + goto out; + } + +- while (id && id < ancestor - IS_ANCESTOR_BITMAP) +- id = get_ancestor_below(t, id, ancestor); ++ if (likely(ancestor >= IS_ANCESTOR_BITMAP)) ++ while (id && id < ancestor - IS_ANCESTOR_BITMAP) ++ id = get_ancestor_below(t, id, ancestor); + + ret = id && id < ancestor + ? test_ancestor_bitmap(t, id, ancestor) +@@ -389,7 +390,7 @@ static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) + return 0; + } + +-static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) ++u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) + { + u32 id = snapshot_root; + u32 subvol = 0, s; +diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h +index 00373cf32e7b..81180181d7c9 100644 +--- a/fs/bcachefs/snapshot.h ++++ b/fs/bcachefs/snapshot.h +@@ -105,6 +105,7 @@ static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) + return id; + } + ++u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *, u32); + u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32); + + static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index a7eb1f511484..b3f2c651c1f8 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -176,7 +176,6 @@ read_attribute(btree_reserve_cache); + read_attribute(stripes_heap); + read_attribute(open_buckets); + read_attribute(open_buckets_partial); +-read_attribute(write_points); + read_attribute(nocow_lock_table); + + #ifdef BCH_WRITE_REF_DEBUG +@@ -364,9 +363,6 @@ SHOW(bch2_fs) + if (attr == &sysfs_open_buckets_partial) + bch2_open_buckets_partial_to_text(out, c); + +- if (attr == &sysfs_write_points) +- bch2_write_points_to_text(out, c); +- + if (attr == &sysfs_compression_stats) + bch2_compression_stats_to_text(out, c); + +@@ -569,7 +565,6 @@ struct attribute *bch2_fs_internal_files[] = { + &sysfs_stripes_heap, + &sysfs_open_buckets, + &sysfs_open_buckets_partial, +- &sysfs_write_points, + #ifdef BCH_WRITE_REF_DEBUG + &sysfs_write_refs, + #endif +diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h +index c1b51009edf6..5718988dd7d6 100644 +--- a/fs/bcachefs/trace.h ++++ b/fs/bcachefs/trace.h +@@ -295,12 +295,12 @@ TRACE_EVENT(write_super, + + /* io.c: */ + +-DEFINE_EVENT(bio, read_promote, ++DEFINE_EVENT(bio, io_read_promote, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) + ); + +-TRACE_EVENT(read_nopromote, ++TRACE_EVENT(io_read_nopromote, + TP_PROTO(struct bch_fs *c, int ret), + TP_ARGS(c, ret), + +@@ -319,22 +319,22 @@ TRACE_EVENT(read_nopromote, + __entry->ret) + ); + +-DEFINE_EVENT(bio, read_bounce, ++DEFINE_EVENT(bio, io_read_bounce, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) + ); + +-DEFINE_EVENT(bio, read_split, ++DEFINE_EVENT(bio, io_read_split, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) + ); + +-DEFINE_EVENT(bio, read_retry, ++DEFINE_EVENT(bio, io_read_retry, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) + ); + +-DEFINE_EVENT(bio, read_reuse_race, ++DEFINE_EVENT(bio, io_read_reuse_race, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) + ); +@@ -797,53 +797,32 @@ TRACE_EVENT(bucket_invalidate, + + /* Moving IO */ + +-TRACE_EVENT(bucket_evacuate, +- TP_PROTO(struct bch_fs *c, struct bpos *bucket), +- TP_ARGS(c, bucket), +- +- TP_STRUCT__entry( +- __field(dev_t, dev ) +- __field(u32, dev_idx ) +- __field(u64, bucket ) +- ), +- +- TP_fast_assign( +- __entry->dev = c->dev; +- __entry->dev_idx = bucket->inode; +- __entry->bucket = bucket->offset; +- ), +- +- TP_printk("%d:%d %u:%llu", +- MAJOR(__entry->dev), MINOR(__entry->dev), +- __entry->dev_idx, __entry->bucket) +-); +- +-DEFINE_EVENT(fs_str, move_extent, ++DEFINE_EVENT(fs_str, io_move, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) + ); + +-DEFINE_EVENT(fs_str, move_extent_read, ++DEFINE_EVENT(fs_str, io_move_read, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) + ); + +-DEFINE_EVENT(fs_str, move_extent_write, ++DEFINE_EVENT(fs_str, io_move_write, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) + ); + +-DEFINE_EVENT(fs_str, move_extent_finish, ++DEFINE_EVENT(fs_str, io_move_finish, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) + ); + +-DEFINE_EVENT(fs_str, move_extent_fail, ++DEFINE_EVENT(fs_str, io_move_fail, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) + ); + +-DEFINE_EVENT(fs_str, move_extent_start_fail, ++DEFINE_EVENT(fs_str, io_move_start_fail, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) + ); +@@ -881,37 +860,6 @@ TRACE_EVENT(move_data, + __entry->sectors_raced) + ); + +-TRACE_EVENT(evacuate_bucket, +- TP_PROTO(struct bch_fs *c, struct bpos *bucket, +- unsigned sectors, unsigned bucket_size, +- int ret), +- TP_ARGS(c, bucket, sectors, bucket_size, ret), +- +- TP_STRUCT__entry( +- __field(dev_t, dev ) +- __field(u64, member ) +- __field(u64, bucket ) +- __field(u32, sectors ) +- __field(u32, bucket_size ) +- __field(int, ret ) +- ), +- +- TP_fast_assign( +- __entry->dev = c->dev; +- __entry->member = bucket->inode; +- __entry->bucket = bucket->offset; +- __entry->sectors = sectors; +- __entry->bucket_size = bucket_size; +- __entry->ret = ret; +- ), +- +- TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i", +- MAJOR(__entry->dev), MINOR(__entry->dev), +- __entry->member, __entry->bucket, +- __entry->sectors, __entry->bucket_size, +- __entry->ret) +-); +- + TRACE_EVENT(copygc, + TP_PROTO(struct bch_fs *c, + u64 buckets, +diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c +index e0a876cbaa6b..50a90e48f6dd 100644 +--- a/fs/bcachefs/util.c ++++ b/fs/bcachefs/util.c +@@ -473,10 +473,10 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats + u64 last_q = 0; + + prt_printf(out, "quantiles (%s):\t", u->name); +- eytzinger0_for_each(i, NR_QUANTILES) { +- bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; ++ eytzinger0_for_each(j, NR_QUANTILES) { ++ bool is_last = eytzinger0_next(j, NR_QUANTILES) == -1; + +- u64 q = max(quantiles->entries[i].m, last_q); ++ u64 q = max(quantiles->entries[j].m, last_q); + prt_printf(out, "%llu ", div64_u64(q, u->nsecs)); + if (is_last) + prt_newline(out); +@@ -701,9 +701,9 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) + #if 0 + void eytzinger1_test(void) + { +- unsigned inorder, eytz, size; ++ unsigned inorder, size; + +- pr_info("1 based eytzinger test:"); ++ pr_info("1 based eytzinger test:\n"); + + for (size = 2; + size < 65536; +@@ -711,13 +711,7 @@ void eytzinger1_test(void) + unsigned extra = eytzinger1_extra(size); + + if (!(size % 4096)) +- pr_info("tree size %u", size); +- +- BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size)); +- BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size)); +- +- BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0); +- BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0); ++ pr_info("tree size %u\n", size); + + inorder = 1; + eytzinger1_for_each(eytz, size) { +@@ -728,15 +722,16 @@ void eytzinger1_test(void) + + inorder++; + } ++ BUG_ON(inorder - 1 != size); + } + } + + void eytzinger0_test(void) + { + +- unsigned inorder, eytz, size; ++ unsigned inorder, size; + +- pr_info("0 based eytzinger test:"); ++ pr_info("0 based eytzinger test:\n"); + + for (size = 1; + size < 65536; +@@ -744,13 +739,7 @@ void eytzinger0_test(void) + unsigned extra = eytzinger0_extra(size); + + if (!(size % 4096)) +- pr_info("tree size %u", size); +- +- BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size)); +- BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size)); +- +- BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1); +- BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1); ++ pr_info("tree size %u\n", size); + + inorder = 0; + eytzinger0_for_each(eytz, size) { +@@ -761,54 +750,191 @@ void eytzinger0_test(void) + + inorder++; + } ++ BUG_ON(inorder != size); ++ ++ inorder = size - 1; ++ eytzinger0_for_each_prev(eytz, size) { ++ BUG_ON(eytz != eytzinger0_first(size) && ++ eytzinger0_next(eytzinger0_prev(eytz, size), size) != eytz); ++ ++ inorder--; ++ } ++ BUG_ON(inorder != -1); + } + } + +-static inline int cmp_u16(const void *_l, const void *_r, size_t size) ++static inline int cmp_u16(const void *_l, const void *_r) + { + const u16 *l = _l, *r = _r; + +- return (*l > *r) - (*r - *l); ++ return (*l > *r) - (*r > *l); + } + +-static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) ++static void eytzinger0_find_test_le(u16 *test_array, unsigned nr, u16 search) + { +- int i, c1 = -1, c2 = -1; +- ssize_t r; ++ int r, s; ++ bool bad; + + r = eytzinger0_find_le(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); +- if (r >= 0) +- c1 = test_array[r]; +- +- for (i = 0; i < nr; i++) +- if (test_array[i] <= search && test_array[i] > c2) +- c2 = test_array[i]; +- +- if (c1 != c2) { +- eytzinger0_for_each(i, nr) +- pr_info("[%3u] = %12u", i, test_array[i]); +- pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i", +- i, r, c1, c2); ++ if (r >= 0) { ++ if (test_array[r] > search) { ++ bad = true; ++ } else { ++ s = eytzinger0_next(r, nr); ++ bad = s >= 0 && test_array[s] <= search; ++ } ++ } else { ++ s = eytzinger0_last(nr); ++ bad = s >= 0 && test_array[s] <= search; ++ } ++ ++ if (bad) { ++ s = -1; ++ eytzinger0_for_each_prev(j, nr) { ++ if (test_array[j] <= search) { ++ s = j; ++ break; ++ } ++ } ++ ++ eytzinger0_for_each(j, nr) ++ pr_info("[%3u] = %12u\n", j, test_array[j]); ++ pr_info("find_le(%12u) = %3i should be %3i\n", ++ search, r, s); ++ BUG(); ++ } ++} ++ ++static void eytzinger0_find_test_gt(u16 *test_array, unsigned nr, u16 search) ++{ ++ int r, s; ++ bool bad; ++ ++ r = eytzinger0_find_gt(test_array, nr, ++ sizeof(test_array[0]), ++ cmp_u16, &search); ++ if (r >= 0) { ++ if (test_array[r] <= search) { ++ bad = true; ++ } else { ++ s = eytzinger0_prev(r, nr); ++ bad = s >= 0 && test_array[s] > search; ++ } ++ } else { ++ s = eytzinger0_first(nr); ++ bad = s >= 0 && test_array[s] > search; ++ } ++ ++ if (bad) { ++ s = -1; ++ eytzinger0_for_each(j, nr) { ++ if (test_array[j] > search) { ++ s = j; ++ break; ++ } ++ } ++ ++ eytzinger0_for_each(j, nr) ++ pr_info("[%3u] = %12u\n", j, test_array[j]); ++ pr_info("find_gt(%12u) = %3i should be %3i\n", ++ search, r, s); ++ BUG(); + } + } + ++static void eytzinger0_find_test_ge(u16 *test_array, unsigned nr, u16 search) ++{ ++ int r, s; ++ bool bad; ++ ++ r = eytzinger0_find_ge(test_array, nr, ++ sizeof(test_array[0]), ++ cmp_u16, &search); ++ if (r >= 0) { ++ if (test_array[r] < search) { ++ bad = true; ++ } else { ++ s = eytzinger0_prev(r, nr); ++ bad = s >= 0 && test_array[s] >= search; ++ } ++ } else { ++ s = eytzinger0_first(nr); ++ bad = s >= 0 && test_array[s] >= search; ++ } ++ ++ if (bad) { ++ s = -1; ++ eytzinger0_for_each(j, nr) { ++ if (test_array[j] >= search) { ++ s = j; ++ break; ++ } ++ } ++ ++ eytzinger0_for_each(j, nr) ++ pr_info("[%3u] = %12u\n", j, test_array[j]); ++ pr_info("find_ge(%12u) = %3i should be %3i\n", ++ search, r, s); ++ BUG(); ++ } ++} ++ ++static void eytzinger0_find_test_eq(u16 *test_array, unsigned nr, u16 search) ++{ ++ unsigned r; ++ int s; ++ bool bad; ++ ++ r = eytzinger0_find(test_array, nr, ++ sizeof(test_array[0]), ++ cmp_u16, &search); ++ ++ if (r < nr) { ++ bad = test_array[r] != search; ++ } else { ++ s = eytzinger0_find_le(test_array, nr, ++ sizeof(test_array[0]), ++ cmp_u16, &search); ++ bad = s >= 0 && test_array[s] == search; ++ } ++ ++ if (bad) { ++ eytzinger0_for_each(j, nr) ++ pr_info("[%3u] = %12u\n", j, test_array[j]); ++ pr_info("find(%12u) = %3i is incorrect\n", ++ search, r); ++ BUG(); ++ } ++} ++ ++static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) ++{ ++ eytzinger0_find_test_le(test_array, nr, search); ++ eytzinger0_find_test_gt(test_array, nr, search); ++ eytzinger0_find_test_ge(test_array, nr, search); ++ eytzinger0_find_test_eq(test_array, nr, search); ++} ++ + void eytzinger0_find_test(void) + { + unsigned i, nr, allocated = 1 << 12; + u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL); + + for (nr = 1; nr < allocated; nr++) { +- pr_info("testing %u elems", nr); ++ u16 prev = 0; ++ ++ pr_info("testing %u elems\n", nr); + + get_random_bytes(test_array, nr * sizeof(test_array[0])); + eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); + + /* verify array is sorted correctly: */ +- eytzinger0_for_each(i, nr) +- BUG_ON(i != eytzinger0_last(nr) && +- test_array[i] > test_array[eytzinger0_next(i, nr)]); ++ eytzinger0_for_each(j, nr) { ++ BUG_ON(test_array[j] < prev); ++ prev = test_array[j]; ++ } + + for (i = 0; i < U16_MAX; i += 1 << 12) + eytzinger0_find_test_val(test_array, nr, i); +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +index 1a1720116071..e7c3541b38f3 100644 +--- a/fs/bcachefs/util.h ++++ b/fs/bcachefs/util.h +@@ -670,8 +670,6 @@ static inline int cmp_le32(__le32 l, __le32 r) + + #include + +-#define QSTR(n) { { { .len = strlen(n) } }, .name = n } +- + static inline bool qstr_eq(const struct qstr l, const struct qstr r) + { + return l.len == r.len && !memcmp(l.name, r.name, l.len); +-- +2.45.3 +