diff --git a/sys-kernel/hardened-kernel/Manifest b/sys-kernel/hardened-kernel/Manifest index cd20512..7a4c5c9 100644 --- a/sys-kernel/hardened-kernel/Manifest +++ b/sys-kernel/hardened-kernel/Manifest @@ -1,9 +1,9 @@ DIST genpatches-6.11-10.base.tar.xz 757872 BLAKE2B 72566af9a781288f516dcd30881851fe371a0f3d072aeabbd9d3e57ea96896cb9d8f0d594f8729215baa83d9546c675789b596dac5781b3640e963059d23223b SHA512 ae04d309e3b97cfd7f09993cf297fa5825c53e83acc54805f1f6f2d09cd07aa1715866be3d59874d0131d1746a398b9449fda1987ea6bdbd66402e411569d874 DIST genpatches-6.11-10.experimental.tar.xz 77928 BLAKE2B a0928f0ff7eb6b9a5659d0ab41dafcf3b474cd7aa357b65a7a147972132c08703a88467e51b7dbd8004781cb0cb8a9620190737963f1fcc1e9e5d98f68ba72d6 SHA512 2be91396f9ec97b2e051db72742e3db1edaa56255c7a2cde2ce2ecc1de4771e92ba6d55e863380fe4dc6ef8d8778bec1a9926a9ffe2dd5d1036b9c36a9afae13 DIST genpatches-6.11-10.extras.tar.xz 4060 BLAKE2B a94b8799f6c1d338a17e25b1dde6aa616754bfde092eb5ad1da11a6ec8b1107dce827d05ecc756a4918339329190e6572bb089de89d9a11c8c08f067eb7b269d SHA512 1a166a0054827ac9bef700d075cc2a1e3934dbe7b7aa64b34109b521f5bb21e231d59be4643f6faf702e5d0b3cb7d82e8cc1ba1f77e3bf88c38f9b6ffc61e35f -DIST genpatches-6.12-11.base.tar.xz 711536 BLAKE2B a9911cbb7cca5e6f3b06adba52289957eaccbdef25aa1d1f610ac7d7fde2a1cce46095e3d48b281a9ff0a9e88559a492833f0f39cf151ba560d6f3ff8fa5a4b3 SHA512 d33a54716e726cc8d19b366a7ad1cc51822eba4fa329eda596e6e7dbd851a9725db4e5f4bb827f36a26604b7c0b5a60154a2310f99b18265fb3c68251420fc1c -DIST genpatches-6.12-11.experimental.tar.xz 78152 BLAKE2B 20a933a7a74056026ef8b74fd0a4cf41c425855e042a0e6cf9a1b1f0eda77a2c186712fbf59188ffcb31a3c3ba954f7df4e35c5c2dce26c2479af6854af1381d SHA512 90cccbf1c8523197e56661ad34c71d2345dbbebf1f58e4678e45bb75f0b3a057614e996788e7eccec01b5bceffa93d77d6cf259475c1619d8907927a44cadb77 -DIST genpatches-6.12-11.extras.tar.xz 4056 BLAKE2B f8c2a145a06ea061100ba4d16e873a3186c4025d48610180aed135a8802494855decd8a58c24ffa4dd394cac843e41f45dd8aa93c28e03cdb1d46d8ce496da17 SHA512 e2808c8a70aaf3ef76348542a0aa0656f1ccdbe3523b22e5539d0c3952b4013aafddca5d881bc0458aab33f1b652f178d89ae4e596a9f541274f74eaf97ad16b +DIST genpatches-6.12-13.base.tar.xz 806528 BLAKE2B 0eec1195d5a497e64556d992893f251a54c558151aef1fc1d03de0b95bdbe03588ef486875e7676f9635da62e975dd8509279ed2d96b4dcb3796e1d8a4bb3768 SHA512 3cf8549679aae42fe56a25c0f9d5eb5c59fb867c77dc22c21446064e55d91276ec75f60f5737ab9a39c82dd0e74ab1ee3bd62b5a78f7448827b5577257976aed +DIST genpatches-6.12-13.experimental.tar.xz 78444 BLAKE2B 721334f36ae20a598295ce87c31acb12eadee9080bed53ba3a4fbd03180abfa4771c39d0b2d79ccdd28b04766ddd6eb80b1c8f3dc9e5fd2b8c17fe921125d1ba SHA512 834f04aed25aa6f9e660e64b02e488f2c99cf8cb4e20a11c813c8e21b7d91a650e0c795c1369147206d707c75f2a1d25144b08ca9b1b02162443d4ea1bd36d26 +DIST genpatches-6.12-13.extras.tar.xz 4056 BLAKE2B af48fba5d81bf8e13658555d94fa131e3f8fa06144148dd11b58823f802c5c4aea823fc4cd8f308ba6518638b65637d0b8a030d513d4b2a05384fc8f40550d94 SHA512 d7b1290d44696284fde4980ad3b90a190659056739a084ae704d1edf1a06ee166fa78960a42999b9e6e2d8aad4fd330621c8865f5e2517f1453c916d6375a365 DIST gentoo-kernel-config-g14.tar.gz 5686 BLAKE2B e5147ff4ffab92428b3e7f1b0897b997f8a298805f4f43b0a4d3047607bbb1a5ebfc268e0bb9688372a5eda561df9f256c78e00cdd7e3caf7868724214722f56 SHA512 f79638f9ff9dd0154512baf0234024216b6708d35a3c03a580ca5913286ad1ea13bdde5ea9b4722c6a7cd8d591c11ec52c1e225111a260343cd56aa1f1a88502 DIST kernel-aarch64-fedora.config.6.11.5-gentoo 285046 BLAKE2B 
e8ae27d70fa023976e950d4edcb38963e2fff39efa5cd1ff5922278e871efe6e6cda11c609e721eb2a3f7b030ea75447be384065d3b177000c301fc287a34d7f SHA512 121bbeebace3b760ff6ef36cf9970def3073966ea2fc1089c19c08d27a0524502dedc8c988c5239e78ce04caea6feb5ba7b5d53e0319b22ba63ce6cbc2a07e75 DIST kernel-aarch64-fedora.config.6.12.1-gentoo 287989 BLAKE2B fbf6183487ffc6d30543c6b9caedbca224cc9ce4ec917e35ab351030212b721af8cc33aafa1feb229a1d6b45c9f45329f8e4957bdb3d43bee7ac223eeb90a994 SHA512 fad6121dfe4a3c82039cfe77614e90b4a954fe12d156f29ef9a596745327a3d30c7a40fc4002405a692685c7deaf9a7d3d6f944d505bc51ed5c387f9c9fd6311 @@ -16,4 +16,4 @@ DIST kernel-x86_64-fedora.config.6.12.1-gentoo 256170 BLAKE2B 39e03735453c66f426 DIST linux-6.11.tar.xz 146900704 BLAKE2B e7750c0878d71a56a0ce52d4c4c912199dad5bf5e2e8f872585a6494afbb37cbd852e612a6858936d2dc9b7776a3933818f540db408d57e90d18ea5249bba7ab SHA512 329c1f94008742e3f0c2ce7e591a16316d1b2cb9ea4596d4f45604097e07b7aa2f64afa40630a07f321a858455c77aa32ba57b271932ddcf4dc27863f9081cea DIST linux-6.12.tar.xz 147906904 BLAKE2B b2ec2fc69218cacabbbe49f78384a5d259ca581b717617c12b000b16f4a4c59ee348ea886b37147f5f70fb9a7a01c1e2c8f19021078f6b23f5bc62d1c48d5e5e SHA512 a37b1823df7b4f72542f689b65882634740ba0401a42fdcf6601d9efd2e132e5a7650e70450ba76f6cd1f13ca31180f2ccee9d54fe4df89bc0000ade4380a548 DIST linux-hardened-v6.11.8-hardened1.patch 95386 BLAKE2B c8afa1a25191e73d0a1208ce3bc7dea7d856d2697adcd3f5a9d1ec9695f393aa42099353699c1f58dd056c6fb4215860661a6a17358c887877612ac58a4cf3f6 SHA512 d5baa895f069af8e8f3e6d605e86e10137de6a3d956d8dc092e6c3ed4c52ae6faa9dc10dce2bee6696a75e0d7e595f912e06f64a36965ef282918145567597b3 -DIST linux-hardened-v6.12.8-hardened1.patch 89620 BLAKE2B a18bb10a7d184ca0374659c6dfe9efd56501482329f05bae2081510a887f7aa77fd651f635da05304f75b9e1bcad02dc4249123e6687a89e5be0eefe0d508ca8 SHA512 6ea3f25dbe3724799705d7f6cf49dce8884dd6cbbc479987db90e6fb3b0493cc71febddcca70c7bee129ec1c867b541485d61e95b09b3524e0746576396aa936 +DIST linux-hardened-v6.12.10-hardened1.patch 89621 BLAKE2B f33abaf900d6401b58bdd712f0ab3069aa9156d2b68666248e53dc7c93a9817d6ee220cb70b47f3b225cfb39d779094c1021f20a93c060933bff94ba0f51a3d1 SHA512 66c70fd5d98a5a603b5661f6a0915fc34544180cededfd02c8f5b374da5af2b1d5a5b2e6dd52aa8aaa8b59f07ae0a4f2adafc8c970a6c08e7cc56289ef0e96bf diff --git a/sys-kernel/hardened-kernel/files/linux-6.12/1191-bcachefs-cherry-pick-updates-from-master-432522786827.patch b/sys-kernel/hardened-kernel/files/linux-6.12/1191-bcachefs-cherry-pick-updates-from-master-1410769.patch similarity index 86% rename from sys-kernel/hardened-kernel/files/linux-6.12/1191-bcachefs-cherry-pick-updates-from-master-432522786827.patch rename to sys-kernel/hardened-kernel/files/linux-6.12/1191-bcachefs-cherry-pick-updates-from-master-1410769.patch index b0858ec..52f4f5f 100644 --- a/sys-kernel/hardened-kernel/files/linux-6.12/1191-bcachefs-cherry-pick-updates-from-master-432522786827.patch +++ b/sys-kernel/hardened-kernel/files/linux-6.12/1191-bcachefs-cherry-pick-updates-from-master-1410769.patch @@ -1,15 +1,10 @@ -From 3f94dc89581133d018110bc81f108aa3bf485b38 Mon Sep 17 00:00:00 2001 +From 02db0d7cbe82bc442bb758ffc03bb32f1f7db952 Mon Sep 17 00:00:00 2001 From: Alexander Miroshnichenko -Date: Sun, 5 Jan 2025 12:38:05 +0300 -Subject: [PATCH] bcachefs: cherry-pick updates from master 432522786827 +Date: Sun, 26 Jan 2025 14:49:17 +0300 +Subject: [PATCH] bcachefs: cherry-pick updates from master 1410769 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: 8bit -From tag: v6.12 up to: -commit 
4325227868277451df623c89e4472a1d9db5df94 (bcachefs/master, bcachefs/for-next, bcachefs/bcachefs-testing) -Author: Kent Overstreet -Date: Sat Jan 4 12:10:25 2025 -0500 - Signed-off-by: Alexander Miroshnichenko --- .../filesystems/bcachefs/CodingStyle.rst | 2 +- @@ -19,34 +14,34 @@ Signed-off-by: Alexander Miroshnichenko fs/bcachefs/alloc_background.c | 558 ++++++------ fs/bcachefs/alloc_background.h | 18 +- fs/bcachefs/alloc_background_format.h | 4 +- - fs/bcachefs/alloc_foreground.c | 304 +++---- - fs/bcachefs/alloc_foreground.h | 4 +- + fs/bcachefs/alloc_foreground.c | 315 +++---- + fs/bcachefs/alloc_foreground.h | 21 +- fs/bcachefs/backpointers.c | 838 +++++++++++------- fs/bcachefs/backpointers.h | 97 +- fs/bcachefs/bbpos.h | 2 +- fs/bcachefs/bcachefs.h | 70 +- - fs/bcachefs/bcachefs_format.h | 105 ++- + fs/bcachefs/bcachefs_format.h | 106 ++- fs/bcachefs/bkey.h | 7 - fs/bcachefs/bkey_methods.c | 29 +- fs/bcachefs/bkey_methods.h | 15 +- fs/bcachefs/bkey_types.h | 28 + - fs/bcachefs/btree_cache.c | 59 +- + fs/bcachefs/btree_cache.c | 64 +- fs/bcachefs/btree_cache.h | 14 +- fs/bcachefs/btree_gc.c | 178 +--- fs/bcachefs/btree_gc.h | 4 +- fs/bcachefs/btree_io.c | 225 +++-- fs/bcachefs/btree_io.h | 6 +- - fs/bcachefs/btree_iter.c | 590 +++++++----- + fs/bcachefs/btree_iter.c | 593 ++++++++----- fs/bcachefs/btree_iter.h | 134 ++- fs/bcachefs/btree_journal_iter.c | 237 ++++- fs/bcachefs/btree_journal_iter.h | 22 +- fs/bcachefs/btree_journal_iter_types.h | 36 + - fs/bcachefs/btree_key_cache.c | 75 +- - fs/bcachefs/btree_locking.c | 16 +- + fs/bcachefs/btree_key_cache.c | 73 +- + fs/bcachefs/btree_locking.c | 78 +- fs/bcachefs/btree_locking.h | 50 +- fs/bcachefs/btree_node_scan.c | 153 ++-- fs/bcachefs/btree_node_scan_types.h | 1 - - fs/bcachefs/btree_trans_commit.c | 205 ++--- + fs/bcachefs/btree_trans_commit.c | 207 ++--- fs/bcachefs/btree_types.h | 42 +- fs/bcachefs/btree_update.c | 70 +- fs/bcachefs/btree_update.h | 29 +- @@ -59,61 +54,67 @@ Signed-off-by: Alexander Miroshnichenko fs/bcachefs/chardev.c | 219 +---- fs/bcachefs/checksum.c | 10 +- fs/bcachefs/checksum.h | 2 +- - fs/bcachefs/compress.c | 96 +- + fs/bcachefs/compress.c | 127 ++- + fs/bcachefs/compress.h | 4 +- fs/bcachefs/darray.h | 2 +- - fs/bcachefs/data_update.c | 76 +- - fs/bcachefs/debug.c | 4 +- + fs/bcachefs/data_update.c | 293 ++++-- + fs/bcachefs/data_update.h | 9 +- + fs/bcachefs/debug.c | 5 +- fs/bcachefs/dirent.c | 10 +- - fs/bcachefs/dirent.h | 4 +- + fs/bcachefs/dirent.h | 9 +- fs/bcachefs/disk_accounting.c | 150 ++-- fs/bcachefs/disk_accounting.h | 73 +- fs/bcachefs/ec.c | 267 +++--- fs/bcachefs/ec.h | 5 +- - fs/bcachefs/errcode.h | 21 +- + fs/bcachefs/ec_format.h | 17 + + fs/bcachefs/errcode.h | 26 +- fs/bcachefs/error.c | 187 ++-- fs/bcachefs/error.h | 58 +- fs/bcachefs/extent_update.c | 4 +- fs/bcachefs/extents.c | 290 ++---- fs/bcachefs/extents.h | 18 +- fs/bcachefs/extents_format.h | 15 +- - fs/bcachefs/fs-common.c | 108 ++- + fs/bcachefs/fs-common.c | 119 ++- fs/bcachefs/fs-common.h | 2 + - fs/bcachefs/fs-io-buffered.c | 45 +- - fs/bcachefs/fs-io-direct.c | 5 + + fs/bcachefs/fs-io-buffered.c | 68 +- + fs/bcachefs/fs-io-direct.c | 25 +- fs/bcachefs/fs-io-pagecache.c | 4 +- fs/bcachefs/fs-io.c | 54 +- fs/bcachefs/fs-ioctl.c | 7 +- - fs/bcachefs/fs.c | 88 +- + fs/bcachefs/fs.c | 101 ++- fs/bcachefs/fs.h | 1 + - fs/bcachefs/fsck.c | 731 +++++++++------ + fs/bcachefs/fsck.c | 772 ++++++++++------ fs/bcachefs/fsck.h | 11 + fs/bcachefs/inode.c | 169 ++-- fs/bcachefs/inode.h | 43 +- fs/bcachefs/inode_format.h | 15 +- 
fs/bcachefs/io_misc.c | 22 +- - fs/bcachefs/io_read.c | 246 +++-- - fs/bcachefs/io_read.h | 28 +- - fs/bcachefs/io_write.c | 102 ++- - fs/bcachefs/journal.c | 162 +++- - fs/bcachefs/journal.h | 9 +- - fs/bcachefs/journal_io.c | 225 +++-- + fs/bcachefs/io_read.c | 726 ++++++++------- + fs/bcachefs/io_read.h | 98 +- + fs/bcachefs/io_write.c | 184 ++-- + fs/bcachefs/io_write.h | 31 +- + fs/bcachefs/io_write_types.h | 2 +- + fs/bcachefs/journal.c | 252 +++--- + fs/bcachefs/journal.h | 18 +- + fs/bcachefs/journal_io.c | 222 +++-- fs/bcachefs/journal_io.h | 2 +- - fs/bcachefs/journal_reclaim.c | 19 +- - fs/bcachefs/journal_types.h | 5 + + fs/bcachefs/journal_reclaim.c | 161 +++- + fs/bcachefs/journal_reclaim.h | 3 + + fs/bcachefs/journal_types.h | 18 +- fs/bcachefs/logged_ops.c | 11 +- fs/bcachefs/logged_ops_format.h | 5 + fs/bcachefs/lru.c | 4 +- fs/bcachefs/lru.h | 2 +- - fs/bcachefs/move.c | 184 ++-- + fs/bcachefs/move.c | 248 +++--- fs/bcachefs/move.h | 5 +- - fs/bcachefs/movinggc.c | 6 +- + fs/bcachefs/movinggc.c | 17 +- fs/bcachefs/opts.c | 26 +- fs/bcachefs/opts.h | 61 +- fs/bcachefs/printbuf.h | 15 +- fs/bcachefs/quota.c | 2 +- fs/bcachefs/quota.h | 4 +- fs/bcachefs/rcu_pending.c | 38 +- - fs/bcachefs/rebalance.c | 266 +++++- + fs/bcachefs/rebalance.c | 270 +++++- fs/bcachefs/rebalance.h | 10 + fs/bcachefs/rebalance_format.h | 53 ++ fs/bcachefs/rebalance_types.h | 2 - @@ -127,13 +128,13 @@ Signed-off-by: Alexander Miroshnichenko fs/bcachefs/reflink_format.h | 7 +- fs/bcachefs/sb-clean.c | 6 +- fs/bcachefs/sb-counters_format.h | 165 ++-- - fs/bcachefs/sb-downgrade.c | 25 +- - fs/bcachefs/sb-errors_format.h | 53 +- - fs/bcachefs/six.c | 17 +- + fs/bcachefs/sb-downgrade.c | 28 +- + fs/bcachefs/sb-errors_format.h | 56 +- + fs/bcachefs/six.c | 27 +- fs/bcachefs/six.h | 1 + fs/bcachefs/snapshot.c | 515 +++++------ fs/bcachefs/snapshot.h | 17 +- - fs/bcachefs/str_hash.c | 286 ++++++ + fs/bcachefs/str_hash.c | 295 ++++++ fs/bcachefs/str_hash.h | 28 +- fs/bcachefs/subvolume.c | 68 +- fs/bcachefs/subvolume.h | 19 +- @@ -144,7 +145,7 @@ Signed-off-by: Alexander Miroshnichenko fs/bcachefs/super.h | 10 - fs/bcachefs/sysfs.c | 60 +- fs/bcachefs/tests.c | 26 +- - fs/bcachefs/trace.h | 77 +- + fs/bcachefs/trace.h | 103 ++- fs/bcachefs/util.h | 32 + fs/bcachefs/varint.c | 5 +- fs/bcachefs/xattr.c | 13 +- @@ -152,7 +153,7 @@ Signed-off-by: Alexander Miroshnichenko fs/fs_parser.c | 3 +- include/linux/fs_parser.h | 2 + include/linux/min_heap.h | 4 +- - 140 files changed, 7038 insertions(+), 4594 deletions(-) + 146 files changed, 7954 insertions(+), 5223 deletions(-) create mode 100644 fs/bcachefs/btree_journal_iter_types.h create mode 100644 fs/bcachefs/rebalance_format.h create mode 100644 fs/bcachefs/str_hash.c @@ -1184,7 +1185,7 @@ index befdaa95c515..740238369a5a 100644 __u32 pad; } __packed __aligned(8); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c -index 372178c8d416..6df41c331a52 100644 +index 372178c8d416..ecd14962ab01 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -107,14 +107,10 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) @@ -1217,87 +1218,98 @@ index 372178c8d416..6df41c331a52 100644 static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) { BUG_ON(c->open_buckets_partial_nr >= -@@ -175,20 +179,6 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) +@@ -175,70 +179,46 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) 
closure_wake_up(&c->freelist_wait); } -/* _only_ for allocating the journal on a new device: */ -long bch2_bucket_alloc_new_fs(struct bch_dev *ca) --{ ++static inline bool may_alloc_bucket(struct bch_fs *c, ++ struct bpos bucket, ++ struct bucket_alloc_state *s) + { - while (ca->new_fs_bucket_idx < ca->mi.nbuckets) { - u64 b = ca->new_fs_bucket_idx++; - - if (!is_superblock_bucket(ca, b) && - (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse))) - return b; -- } -- ++ if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) { ++ s->skipped_open++; ++ return false; + } + - return -1; -} -- - static inline unsigned open_buckets_reserved(enum bch_watermark watermark) - { - switch (watermark) { -@@ -206,33 +196,40 @@ static inline unsigned open_buckets_reserved(enum bch_watermark watermark) - } - } - --static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, -- u64 bucket, -- enum bch_watermark watermark, -- const struct bch_alloc_v4 *a, -- struct bucket_alloc_state *s, -- struct closure *cl) -+static inline bool may_alloc_bucket(struct bch_fs *c, -+ struct bpos bucket, -+ struct bucket_alloc_state *s) - { -- struct open_bucket *ob; -- -- if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { -- s->skipped_nouse++; -- return NULL; -- } -- -- if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { -+ if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) { - s->skipped_open++; -- return NULL; -+ return false; - } - - if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, -- c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) { ++ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, bucket.inode, bucket.offset)) { - s->skipped_need_journal_commit++; -- return NULL; -+ return false; - } - -- if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) { -+ if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) { - s->skipped_nocow++; ++ s->skipped_need_journal_commit++; + return false; + } + +-static inline unsigned open_buckets_reserved(enum bch_watermark watermark) +-{ +- switch (watermark) { +- case BCH_WATERMARK_interior_updates: +- return 0; +- case BCH_WATERMARK_reclaim: +- return OPEN_BUCKETS_COUNT / 6; +- case BCH_WATERMARK_btree: +- case BCH_WATERMARK_btree_copygc: +- return OPEN_BUCKETS_COUNT / 4; +- case BCH_WATERMARK_copygc: +- return OPEN_BUCKETS_COUNT / 3; +- default: +- return OPEN_BUCKETS_COUNT / 2; ++ if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) { ++ s->skipped_nocow++; ++ return false; + } + + return true; -+} -+ -+static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + } + + static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, +- u64 bucket, + u64 bucket, u8 gen, -+ enum bch_watermark watermark, -+ struct bucket_alloc_state *s, -+ struct closure *cl) -+{ + enum bch_watermark watermark, +- const struct bch_alloc_v4 *a, + struct bucket_alloc_state *s, + struct closure *cl) + { +- struct open_bucket *ob; + if (unlikely(is_superblock_bucket(c, ca, bucket))) + return NULL; -+ -+ if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { -+ s->skipped_nouse++; + + if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { + s->skipped_nouse++; return NULL; } -@@ -254,14 +251,13 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * +- if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { +- s->skipped_open++; +- return NULL; +- 
} +- +- if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, +- c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) { +- s->skipped_need_journal_commit++; +- return NULL; +- } +- +- if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) { +- s->skipped_nocow++; +- return NULL; +- } +- + spin_lock(&c->freelist_lock); + +- if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) { ++ if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark))) { + if (cl) + closure_wait(&c->open_buckets_wait, cl); + +@@ -254,14 +234,13 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * return NULL; } @@ -1314,7 +1326,7 @@ index 372178c8d416..6df41c331a52 100644 ob->bucket = bucket; spin_unlock(&ob->lock); -@@ -276,111 +272,29 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * +@@ -276,111 +255,29 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * } static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, @@ -1345,8 +1357,7 @@ index 372178c8d416..6df41c331a52 100644 - ob = ERR_PTR(-EIO); - goto err; - } -+ u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); - +- - k = bch2_bkey_get_iter(trans, &iter, - BTREE_ID_alloc, POS(ca->dev_idx, b), - BTREE_ITER_cached); @@ -1390,7 +1401,8 @@ index 372178c8d416..6df41c331a52 100644 - if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) { - struct bch_backpointer bp; - struct bpos bp_pos = POS_MIN; -- ++ u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); + - ret = bch2_get_next_backpointer(trans, ca, POS(ca->dev_idx, b), -1, - &bp_pos, &bp, - BTREE_ITER_nopreserve); @@ -1438,7 +1450,7 @@ index 372178c8d416..6df41c331a52 100644 */ static noinline struct open_bucket * bch2_bucket_alloc_early(struct btree_trans *trans, -@@ -389,10 +303,11 @@ bch2_bucket_alloc_early(struct btree_trans *trans, +@@ -389,10 +286,11 @@ bch2_bucket_alloc_early(struct btree_trans *trans, struct bucket_alloc_state *s, struct closure *cl) { @@ -1451,7 +1463,7 @@ index 372178c8d416..6df41c331a52 100644 u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; u64 alloc_start = max(first_bucket, *dev_alloc_cursor); u64 alloc_cursor = alloc_start; -@@ -415,10 +330,6 @@ bch2_bucket_alloc_early(struct btree_trans *trans, +@@ -415,10 +313,6 @@ bch2_bucket_alloc_early(struct btree_trans *trans, if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) break; @@ -1462,7 +1474,7 @@ index 372178c8d416..6df41c331a52 100644 if (s->btree_bitmap != BTREE_BITMAP_ANY && s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { -@@ -452,7 +363,10 @@ bch2_bucket_alloc_early(struct btree_trans *trans, +@@ -452,7 +346,10 @@ bch2_bucket_alloc_early(struct btree_trans *trans, s->buckets_seen++; @@ -1474,7 +1486,7 @@ index 372178c8d416..6df41c331a52 100644 next: bch2_set_btree_iter_dontneed(&citer); bch2_trans_iter_exit(trans, &citer); -@@ -489,20 +403,21 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, +@@ -489,20 +386,21 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor)); u64 alloc_cursor = alloc_start; int ret; @@ -1506,7 +1518,7 @@ index 372178c8d416..6df41c331a52 100644 if (s->btree_bitmap != BTREE_BITMAP_ANY && s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, 
bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { -@@ -511,32 +426,36 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, +@@ -511,32 +409,36 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, goto fail; bucket = sector_to_bucket(ca, @@ -1553,7 +1565,7 @@ index 372178c8d416..6df41c331a52 100644 ob = ERR_PTR(ret); if (!ob && alloc_start > ca->mi.first_bucket) { -@@ -544,8 +463,6 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, +@@ -544,8 +446,6 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, goto again; } @@ -1562,7 +1574,7 @@ index 372178c8d416..6df41c331a52 100644 return ob; } -@@ -595,6 +512,7 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, +@@ -595,6 +495,7 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, * @watermark: how important is this allocation? * @data_type: BCH_DATA_journal, btree, user... * @cl: if not NULL, closure to be used to wait if buckets not available @@ -1570,7 +1582,7 @@ index 372178c8d416..6df41c331a52 100644 * @usage: for secondarily also returning the current device usage * * Returns: an open_bucket on success, or an ERR_PTR() on failure. -@@ -629,6 +547,10 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, +@@ -629,6 +530,10 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, bch2_dev_do_invalidates(ca); if (!avail) { @@ -1581,7 +1593,7 @@ index 372178c8d416..6df41c331a52 100644 if (cl && !waiting) { closure_wait(&c->freelist_wait, cl); waiting = true; -@@ -711,9 +633,9 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, +@@ -711,9 +616,9 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, unsigned i; for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) @@ -1593,7 +1605,7 @@ index 372178c8d416..6df41c331a52 100644 return ret; } -@@ -785,18 +707,13 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, +@@ -785,18 +690,13 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, struct closure *cl) { struct bch_fs *c = trans->c; @@ -1615,7 +1627,7 @@ index 372178c8d416..6df41c331a52 100644 if (!ca) continue; -@@ -805,8 +722,9 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, +@@ -805,8 +705,9 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } @@ -1623,11 +1635,11 @@ index 372178c8d416..6df41c331a52 100644 - cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); + struct bch_dev_usage usage; + struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, -+ cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); ++ cl, flags & BCH_WRITE_alloc_nowait, &usage); if (!IS_ERR(ob)) bch2_dev_stripe_increment_inlined(ca, stripe, &usage); bch2_dev_put(ca); -@@ -850,10 +768,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, +@@ -850,10 +751,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, struct closure *cl) { struct bch_fs *c = trans->c; @@ -1638,7 +1650,7 @@ index 372178c8d416..6df41c331a52 100644 int ret = 0; if (nr_replicas < 2) -@@ -862,34 +776,32 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, +@@ -862,34 +759,32 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, if (ec_open_bucket(c, ptrs)) return 0; @@ -1690,8 +1702,26 @@ index 372178c8d416..6df41c331a52 100644 bch2_ec_stripe_head_put(c, h); return ret; } +@@ -1420,7 +1315,7 @@ int bch2_alloc_sectors_start_trans(struct 
btree_trans *trans, + if (wp->data_type != BCH_DATA_user) + have_cache = true; + +- if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { ++ if (target && !(flags & BCH_WRITE_only_specified_devs)) { + ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, + target, erasure_code, + nr_replicas, &nr_effective, +@@ -1510,7 +1405,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, + if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) + ret = -BCH_ERR_bucket_alloc_blocked; + +- if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) && ++ if (cl && !(flags & BCH_WRITE_alloc_nowait) && + bch2_err_matches(ret, BCH_ERR_freelist_empty)) + ret = -BCH_ERR_bucket_alloc_blocked; + diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h -index 1a16fd5bd4f8..f25481a0d1a0 100644 +index 1a16fd5bd4f8..baf5dc163c8a 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -20,7 +20,7 @@ void bch2_reset_alloc_cursors(struct bch_fs *); @@ -1703,7 +1733,7 @@ index 1a16fd5bd4f8..f25481a0d1a0 100644 }; struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, -@@ -28,8 +28,6 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, +@@ -28,13 +28,28 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, struct bch_devs_mask *); void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); @@ -1712,6 +1742,28 @@ index 1a16fd5bd4f8..f25481a0d1a0 100644 static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) { return bch2_dev_have_ref(c, ob->dev); + } + ++static inline unsigned bch2_open_buckets_reserved(enum bch_watermark watermark) ++{ ++ switch (watermark) { ++ case BCH_WATERMARK_interior_updates: ++ return 0; ++ case BCH_WATERMARK_reclaim: ++ return OPEN_BUCKETS_COUNT / 6; ++ case BCH_WATERMARK_btree: ++ case BCH_WATERMARK_btree_copygc: ++ return OPEN_BUCKETS_COUNT / 4; ++ case BCH_WATERMARK_copygc: ++ return OPEN_BUCKETS_COUNT / 3; ++ default: ++ return OPEN_BUCKETS_COUNT / 2; ++ } ++} ++ + struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, + enum bch_watermark, enum bch_data_type, + struct closure *); diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 654a58132a4d..ebeb6a5ff9d2 100644 --- a/fs/bcachefs/backpointers.c @@ -3291,7 +3343,7 @@ index e94a83b8113e..161cf2f05d2a 100644 struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h -index 5004f6ba997c..0680930508a3 100644 +index 5004f6ba997c..f70f0108401f 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -418,7 +418,8 @@ static inline void bkey_init(struct bkey *k) @@ -3323,7 +3375,7 @@ index 5004f6ba997c..0680930508a3 100644 #include "inode_format.h" #include "journal_seq_blacklist_format.h" #include "logged_ops_format.h" -@@ -679,7 +679,13 @@ struct bch_sb_field_ext { +@@ -679,7 +679,14 @@ struct bch_sb_field_ext { x(disk_accounting_v3, BCH_VERSION(1, 10)) \ x(disk_accounting_inum, BCH_VERSION(1, 11)) \ x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \ @@ -3334,11 +3386,12 @@ index 5004f6ba997c..0680930508a3 100644 + x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \ + x(inode_depth, BCH_VERSION(1, 17)) \ + x(persistent_inode_cursors, BCH_VERSION(1, 18)) \ -+ x(autofix_errors, BCH_VERSION(1, 19)) ++ x(autofix_errors, BCH_VERSION(1, 19)) \ ++ x(directory_size, BCH_VERSION(1, 20)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, -@@ -844,6 +850,10 @@ 
LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE, +@@ -844,6 +851,10 @@ LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE, struct bch_sb, flags[5], 0, 16); LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT, struct bch_sb, flags[5], 16, 32); @@ -3349,7 +3402,7 @@ index 5004f6ba997c..0680930508a3 100644 static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { -@@ -896,21 +906,22 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u +@@ -896,21 +907,22 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u x(new_varint, 15) \ x(journal_no_flush, 16) \ x(alloc_v2, 17) \ @@ -3382,7 +3435,7 @@ index 5004f6ba997c..0680930508a3 100644 enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, -@@ -1032,7 +1043,7 @@ static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) +@@ -1032,7 +1044,7 @@ static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) x(crc64, 2) \ x(xxhash, 3) @@ -3391,7 +3444,7 @@ index 5004f6ba997c..0680930508a3 100644 #define x(t, n) BCH_CSUM_OPT_##t = n, BCH_CSUM_OPTS() #undef x -@@ -1221,6 +1232,15 @@ struct jset_entry_log { +@@ -1221,6 +1233,15 @@ struct jset_entry_log { u8 d[]; } __packed __aligned(8); @@ -3407,7 +3460,7 @@ index 5004f6ba997c..0680930508a3 100644 struct jset_entry_datetime { struct jset_entry entry; __le64 seconds; -@@ -1268,14 +1288,18 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); +@@ -1268,14 +1289,18 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); /* Btree: */ enum btree_id_flags { @@ -3431,7 +3484,7 @@ index 5004f6ba997c..0680930508a3 100644 BIT_ULL(KEY_TYPE_whiteout)| \ BIT_ULL(KEY_TYPE_error)| \ BIT_ULL(KEY_TYPE_cookie)| \ -@@ -1283,17 +1307,20 @@ enum btree_id_flags { +@@ -1283,17 +1308,20 @@ enum btree_id_flags { BIT_ULL(KEY_TYPE_reservation)| \ BIT_ULL(KEY_TYPE_reflink_p)| \ BIT_ULL(KEY_TYPE_inline_data)) \ @@ -3455,7 +3508,7 @@ index 5004f6ba997c..0680930508a3 100644 BIT_ULL(KEY_TYPE_whiteout)| \ BIT_ULL(KEY_TYPE_cookie)| \ BIT_ULL(KEY_TYPE_hash_whiteout)| \ -@@ -1307,7 +1334,9 @@ enum btree_id_flags { +@@ -1307,7 +1335,9 @@ enum btree_id_flags { BIT_ULL(KEY_TYPE_quota)) \ x(stripes, 6, 0, \ BIT_ULL(KEY_TYPE_stripe)) \ @@ -3466,7 +3519,7 @@ index 5004f6ba997c..0680930508a3 100644 BIT_ULL(KEY_TYPE_reflink_v)| \ BIT_ULL(KEY_TYPE_indirect_inline_data)| \ BIT_ULL(KEY_TYPE_error)) \ -@@ -1315,28 +1344,38 @@ enum btree_id_flags { +@@ -1315,28 +1345,38 @@ enum btree_id_flags { BIT_ULL(KEY_TYPE_subvolume)) \ x(snapshots, 9, 0, \ BIT_ULL(KEY_TYPE_snapshot)) \ @@ -3512,7 +3565,7 @@ index 5004f6ba997c..0680930508a3 100644 BIT_ULL(KEY_TYPE_accounting)) \ enum btree_id { -@@ -1361,6 +1400,8 @@ static inline bool btree_id_is_alloc(enum btree_id id) +@@ -1361,6 +1401,8 @@ static inline bool btree_id_is_alloc(enum btree_id id) case BTREE_ID_need_discard: case BTREE_ID_freespace: case BTREE_ID_bucket_gens: @@ -3712,10 +3765,22 @@ index c9ae9e42b385..b4f328f9853c 100644 + #endif /* _BCACHEFS_BKEY_TYPES_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c -index 7123019ab3bc..672ca2c1d37d 100644 +index 7123019ab3bc..ca755e8d1a37 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c -@@ -222,7 +222,6 @@ void bch2_node_pin(struct bch_fs *c, struct btree *b) +@@ -24,7 +24,10 @@ do { \ + } while (0) + + const char * const bch2_btree_node_flags[] = { +-#define x(f) #f, ++ "typebit", ++ "typebit", ++ "typebit", ++#define x(f) [BTREE_NODE_##f] = #f, + BTREE_FLAGS() + #undef x + NULL +@@ -222,7 +225,6 @@ void bch2_node_pin(struct 
bch_fs *c, struct btree *b) struct btree_cache *bc = &c->btree_cache; mutex_lock(&bc->lock); @@ -3723,7 +3788,7 @@ index 7123019ab3bc..672ca2c1d37d 100644 if (b != btree_node_root(c, b) && !btree_node_pinned(b)) { set_btree_node_pinned(b); list_move(&b->list, &bc->live[1].list); -@@ -326,7 +325,7 @@ void bch2_btree_node_update_key_early(struct btree_trans *trans, +@@ -326,7 +328,7 @@ void bch2_btree_node_update_key_early(struct btree_trans *trans, if (!IS_ERR_OR_NULL(b)) { mutex_lock(&c->btree_cache.lock); @@ -3732,7 +3797,7 @@ index 7123019ab3bc..672ca2c1d37d 100644 bkey_copy(&b->key, new); ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); -@@ -1004,16 +1003,14 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) +@@ -1004,16 +1006,14 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) return; prt_printf(&buf, @@ -3755,7 +3820,7 @@ index 7123019ab3bc..672ca2c1d37d 100644 bch2_bpos_to_text(&buf, b->data->min_key); prt_printf(&buf, "\nmax "); -@@ -1133,7 +1130,7 @@ static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btr +@@ -1133,7 +1133,7 @@ static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btr if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); @@ -3764,7 +3829,7 @@ index 7123019ab3bc..672ca2c1d37d 100644 } EBUG_ON(b->c.btree_id != path->btree_id); -@@ -1223,7 +1220,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * +@@ -1223,7 +1223,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); @@ -3773,7 +3838,7 @@ index 7123019ab3bc..672ca2c1d37d 100644 } EBUG_ON(b->c.btree_id != path->btree_id); -@@ -1305,7 +1302,7 @@ struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, +@@ -1305,7 +1305,7 @@ struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, if (unlikely(btree_node_read_error(b))) { six_unlock_read(&b->c.lock); @@ -3782,7 +3847,7 @@ index 7123019ab3bc..672ca2c1d37d 100644 goto out; } -@@ -1398,13 +1395,31 @@ void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree) +@@ -1398,13 +1398,31 @@ void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree) prt_printf(out, "(unknown btree %u)", btree); } @@ -3819,7 +3884,7 @@ index 7123019ab3bc..672ca2c1d37d 100644 } void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) -@@ -1478,8 +1493,12 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc +@@ -1478,8 +1496,12 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock); prt_newline(out); @@ -4610,7 +4675,7 @@ index 9b01ca3de907..6f9e4a6dacf7 100644 bool bch2_btree_flush_all_reads(struct bch_fs *); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c -index eef9b89c561d..367231ab1980 100644 +index eef9b89c561d..5988219c6908 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -270,8 +270,10 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) @@ -4910,9 +4975,12 @@ index eef9b89c561d..367231ab1980 100644 if ((iter->flags & BTREE_ITER_key_cache_fill) && bpos_eq(iter->pos, pos)) -@@ -2184,10 +2242,15 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos - btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); +@@ 
-2181,13 +2239,17 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos + if (unlikely(ret)) + return bkey_s_c_err(ret); +- btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); +- k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u); - if (k.k && !bkey_err(k)) { - iter->k = u; @@ -4927,10 +4995,11 @@ index eef9b89c561d..367231ab1980 100644 + + iter->k = u; + k.k = &iter->k; ++ btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); return k; } -@@ -2201,8 +2264,6 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp +@@ -2201,8 +2263,6 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp bch2_btree_iter_verify(iter); while (1) { @@ -4939,7 +5008,7 @@ index eef9b89c561d..367231ab1980 100644 iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); -@@ -2212,17 +2273,17 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp +@@ -2212,17 +2272,17 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp /* ensure that iter->k is consistent with iter->pos: */ bch2_btree_iter_set_pos(iter, iter->pos); k = bkey_s_c_err(ret); @@ -4960,7 +5029,7 @@ index eef9b89c561d..367231ab1980 100644 } btree_path_set_should_be_locked(trans, path); -@@ -2233,15 +2294,14 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp +@@ -2233,15 +2293,14 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp k.k && (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { k = k2; @@ -4979,7 +5048,7 @@ index eef9b89c561d..367231ab1980 100644 if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) -@@ -2270,32 +2330,32 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp +@@ -2270,32 +2329,32 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp /* End of btree: */ bch2_btree_iter_set_pos(iter, SPOS_MAX); k = bkey_s_c_null; @@ -5019,7 +5088,7 @@ index eef9b89c561d..367231ab1980 100644 EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX)); if (iter->update_path) { -@@ -2304,8 +2364,6 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e +@@ -2304,8 +2363,6 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e iter->update_path = 0; } @@ -5028,7 +5097,7 @@ index eef9b89c561d..367231ab1980 100644 while (1) { k = __bch2_btree_iter_peek(iter, search_key); if (unlikely(!k.k)) -@@ -2313,75 +2371,75 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e +@@ -2313,75 +2370,75 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e if (unlikely(bkey_err(k))) goto out_no_locked; @@ -5139,7 +5208,7 @@ index eef9b89c561d..367231ab1980 100644 + continue; } - } -- + - /* - * We can never have a key in a leaf node at POS_MAX, so - * we don't have to check these successor() calls: @@ -5151,7 +5220,7 @@ index eef9b89c561d..367231ab1980 100644 - search_key = bpos_successor(k.k->p); - continue; - } - +- - if (bkey_whiteout(k.k) && - !(iter->flags & BTREE_ITER_all_snapshots)) { - search_key = bkey_successor(iter, k.k->p); @@ -5164,7 +5233,7 @@ index eef9b89c561d..367231ab1980 100644 } /* -@@ -2451,127 +2509,204 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) +@@ -2451,127 
+2508,204 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) return bch2_btree_iter_peek(iter); } @@ -5228,6 +5297,10 @@ index eef9b89c561d..367231ab1980 100644 + } + + btree_path_set_should_be_locked(trans, path); ++ ++ k = btree_path_level_peek_all(trans->c, l, &iter->k); ++ if (!k.k || bpos_gt(k.k->p, search_key)) { ++ k = btree_path_level_prev(trans, path, l, &iter->k); - k = btree_path_level_peek(trans, path, &path->l[0], &iter->k); - if (!k.k || @@ -5235,10 +5308,6 @@ index eef9b89c561d..367231ab1980 100644 - ? bpos_ge(bkey_start_pos(k.k), search_key) - : bpos_gt(k.k->p, search_key))) - k = btree_path_level_prev(trans, path, &path->l[0], &iter->k); -+ k = btree_path_level_peek_all(trans->c, l, &iter->k); -+ if (!k.k || bpos_gt(k.k->p, search_key)) { -+ k = btree_path_level_prev(trans, path, l, &iter->k); -+ + BUG_ON(k.k && bpos_gt(k.k->p, search_key)); + } + @@ -5327,22 +5396,15 @@ index eef9b89c561d..367231ab1980 100644 + goto end; + if (unlikely(bkey_err(k))) + goto out_no_locked; - ++ + if (iter->flags & BTREE_ITER_filter_snapshots) { + struct btree_path *s = saved_path ? trans->paths + saved_path : NULL; + if (s && bpos_lt(k.k->p, SPOS(s->pos.inode, s->pos.offset, iter->snapshot))) { - /* -- * If we have a saved candidate, and we're no -- * longer at the same _key_ (not pos), return -- * that candidate ++ /* + * If we have a saved candidate, and we're past + * the last possible snapshot overwrite, return + * it: - */ -- if (saved_path && !bkey_eq(k.k->p, saved_k.p)) { -- bch2_path_put_nokeep(trans, iter->path, -- iter->flags & BTREE_ITER_intent); -- iter->path = saved_path; ++ */ + bch2_path_put_nokeep(trans, iter->path, + iter->flags & BTREE_ITER_intent); + iter->path = saved_path; @@ -5364,14 +5426,21 @@ index eef9b89c561d..367231ab1980 100644 + search_key = bpos_predecessor(k.k->p); + continue; + } -+ + + if (k.k->p.snapshot != iter->snapshot) { -+ /* + /* +- * If we have a saved candidate, and we're no +- * longer at the same _key_ (not pos), return +- * that candidate + * Have a key visible in iter->snapshot, but + * might have overwrites: - save it and keep + * searching. 
Unless it's a whiteout - then drop + * our previous saved candidate: -+ */ + */ +- if (saved_path && !bkey_eq(k.k->p, saved_k.p)) { +- bch2_path_put_nokeep(trans, iter->path, +- iter->flags & BTREE_ITER_intent); +- iter->path = saved_path; + if (saved_path) { + bch2_path_put_nokeep(trans, saved_path, + iter->flags & BTREE_ITER_intent); @@ -5448,7 +5517,7 @@ index eef9b89c561d..367231ab1980 100644 if (iter->flags & BTREE_ITER_filter_snapshots) iter->pos.snapshot = iter->snapshot; -@@ -2581,8 +2716,11 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) +@@ -2581,8 +2715,11 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -5461,7 +5530,7 @@ index eef9b89c561d..367231ab1980 100644 } /** -@@ -2607,7 +2745,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) +@@ -2607,7 +2744,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_s_c k; int ret; @@ -5470,7 +5539,7 @@ index eef9b89c561d..367231ab1980 100644 bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache)); -@@ -2632,6 +2770,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) +@@ -2632,6 +2769,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) goto out_no_locked; } @@ -5481,7 +5550,7 @@ index eef9b89c561d..367231ab1980 100644 if ((iter->flags & BTREE_ITER_cached) || !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) { k = bkey_s_c_null; -@@ -2658,6 +2800,11 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) +@@ -2658,6 +2799,11 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k); if (unlikely(!k.k)) goto out_no_locked; @@ -5493,7 +5562,7 @@ index eef9b89c561d..367231ab1980 100644 } else { struct bpos next; struct bpos end = iter->pos; -@@ -2671,7 +2818,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) +@@ -2671,7 +2817,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct btree_iter iter2; bch2_trans_copy_iter(&iter2, iter); @@ -5502,7 +5571,7 @@ index eef9b89c561d..367231ab1980 100644 if (k.k && !bkey_err(k)) { swap(iter->key_cache_path, iter2.key_cache_path); -@@ -2682,7 +2829,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) +@@ -2682,7 +2828,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } else { struct bpos pos = iter->pos; @@ -5511,7 +5580,7 @@ index eef9b89c561d..367231ab1980 100644 if (unlikely(bkey_err(k))) bch2_btree_iter_set_pos(iter, pos); else -@@ -2902,7 +3049,7 @@ void bch2_trans_iter_init_outlined(struct btree_trans *trans, +@@ -2902,7 +3048,7 @@ void bch2_trans_iter_init_outlined(struct btree_trans *trans, unsigned flags) { bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, @@ -5520,7 +5589,7 @@ index eef9b89c561d..367231ab1980 100644 _RET_IP_); } -@@ -2918,8 +3065,11 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, +@@ -2918,8 +3064,11 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, flags |= BTREE_ITER_snapshot_field; flags |= BTREE_ITER_all_snapshots; @@ -5533,7 +5602,7 @@ index eef9b89c561d..367231ab1980 100644 _RET_IP_); iter->min_depth = depth; -@@ -3122,14 +3272,14 @@ u32 bch2_trans_begin(struct btree_trans *trans) +@@ -3122,14 +3271,14 @@ u32 bch2_trans_begin(struct 
btree_trans *trans) trans->last_begin_ip = _RET_IP_; @@ -5550,7 +5619,7 @@ index eef9b89c561d..367231ab1980 100644 return trans->restart_count; } -@@ -3228,7 +3378,7 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) +@@ -3228,7 +3377,7 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); trans->srcu_lock_time = jiffies; trans->srcu_held = true; @@ -5559,7 +5628,7 @@ index eef9b89c561d..367231ab1980 100644 closure_init_stack_release(&trans->ref); return trans; -@@ -3262,6 +3412,9 @@ void bch2_trans_put(struct btree_trans *trans) +@@ -3262,6 +3411,9 @@ void bch2_trans_put(struct btree_trans *trans) { struct bch_fs *c = trans->c; @@ -5569,7 +5638,7 @@ index eef9b89c561d..367231ab1980 100644 bch2_trans_unlock(trans); trans_for_each_update(trans, i) -@@ -3285,6 +3438,10 @@ void bch2_trans_put(struct btree_trans *trans) +@@ -3285,6 +3437,10 @@ void bch2_trans_put(struct btree_trans *trans) closure_return_sync(&trans->ref); trans->locking_wait.task = NULL; @@ -5580,7 +5649,7 @@ index eef9b89c561d..367231ab1980 100644 unsigned long *paths_allocated = trans->paths_allocated; trans->paths_allocated = NULL; trans->paths = NULL; -@@ -3338,8 +3495,9 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out, +@@ -3338,8 +3494,9 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out, pid = owner ? owner->pid : 0; rcu_read_unlock(); @@ -5592,7 +5661,7 @@ index eef9b89c561d..367231ab1980 100644 bch2_bpos_to_text(out, btree_node_pos(b)); prt_printf(out, "\t locks %u:%u:%u held by pid %u", -@@ -3378,11 +3536,11 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) +@@ -3378,11 +3535,11 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) if (!path->nodes_locked) continue; @@ -5609,7 +5678,7 @@ index eef9b89c561d..367231ab1980 100644 bch2_bpos_to_text(out, path->pos); prt_newline(out); -@@ -3488,7 +3646,7 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) +@@ -3488,7 +3645,7 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) #ifdef CONFIG_LOCKDEP fs_reclaim_acquire(GFP_KERNEL); struct btree_trans *trans = bch2_trans_get(c); @@ -6429,7 +6498,7 @@ index 000000000000..8b773823704f + +#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H */ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c -index 244610b1d0b5..3b62296c3100 100644 +index 244610b1d0b5..c378b97ebeca 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -197,7 +197,9 @@ bkey_cached_reuse(struct btree_key_cache *c) @@ -6511,19 +6580,16 @@ index 244610b1d0b5..3b62296c3100 100644 return ret; } -@@ -282,10 +291,8 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, - struct btree_path *ck_path, +@@ -283,7 +292,7 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, unsigned flags) { -- if (flags & BTREE_ITER_cached_nofill) { + if (flags & BTREE_ITER_cached_nofill) { - ck_path->uptodate = BTREE_ITER_UPTODATE; -+ if (flags & BTREE_ITER_cached_nofill) ++ ck_path->l[0].b = NULL; return 0; -- } + } - struct bch_fs *c = trans->c; - struct btree_iter iter; -@@ -293,6 +300,7 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, +@@ -293,6 +302,7 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos, @@ -6531,7 +6597,7 @@ index 244610b1d0b5..3b62296c3100 100644 BTREE_ITER_key_cache_fill| 
BTREE_ITER_cached_nofill); iter.flags &= ~BTREE_ITER_with_journal; -@@ -306,9 +314,19 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, +@@ -306,9 +316,19 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, if (unlikely(ret)) goto out; @@ -6552,7 +6618,7 @@ index 244610b1d0b5..3b62296c3100 100644 out: /* We're not likely to need this iterator again: */ bch2_set_btree_iter_dontneed(&iter); -@@ -424,8 +442,15 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, +@@ -424,8 +444,15 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, !test_bit(JOURNAL_space_low, &c->journal.flags)) commit_flags |= BCH_TRANS_COMMIT_no_journal_res; @@ -6570,7 +6636,7 @@ index 244610b1d0b5..3b62296c3100 100644 BTREE_UPDATE_key_cache_reclaim| BTREE_UPDATE_internal_snapshot_node| BTREE_TRIGGER_norun) ?: -@@ -433,7 +458,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, +@@ -433,7 +460,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc| commit_flags); @@ -6579,7 +6645,7 @@ index 244610b1d0b5..3b62296c3100 100644 bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && -@@ -586,8 +611,18 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, +@@ -586,8 +613,18 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, bkey_cached_free(bc, ck); mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); @@ -6601,10 +6667,160 @@ index 244610b1d0b5..3b62296c3100 100644 static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c -index efe2a007b482..8503931463d1 100644 +index efe2a007b482..10b805a60f52 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c -@@ -782,7 +782,7 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) +@@ -109,6 +109,12 @@ static noinline void lock_graph_pop_all(struct lock_graph *g) + lock_graph_up(g); + } + ++static noinline void lock_graph_pop_from(struct lock_graph *g, struct trans_waiting_for_lock *i) ++{ ++ while (g->g + g->nr > i) ++ lock_graph_up(g); ++} ++ + static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans) + { + g->g[g->nr++] = (struct trans_waiting_for_lock) { +@@ -124,15 +130,20 @@ static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans) + __lock_graph_down(g, trans); + } + +-static bool lock_graph_remove_non_waiters(struct lock_graph *g) ++static bool lock_graph_remove_non_waiters(struct lock_graph *g, ++ struct trans_waiting_for_lock *from) + { + struct trans_waiting_for_lock *i; + +- for (i = g->g + 1; i < g->g + g->nr; i++) ++ if (from->trans->locking != from->node_want) { ++ lock_graph_pop_from(g, from); ++ return true; ++ } ++ ++ for (i = from + 1; i < g->g + g->nr; i++) + if (i->trans->locking != i->node_want || + i->trans->locking_wait.start_time != i[-1].lock_start_time) { +- while (g->g + g->nr > i) +- lock_graph_up(g); ++ lock_graph_pop_from(g, i); + return true; + } + +@@ -179,13 +190,14 @@ static int btree_trans_abort_preference(struct btree_trans *trans) + return 3; + } + +-static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) ++static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle, ++ struct trans_waiting_for_lock *from) + { + struct trans_waiting_for_lock *i, *abort = NULL; + unsigned best 
= 0, pref; + int ret; + +- if (lock_graph_remove_non_waiters(g)) ++ if (lock_graph_remove_non_waiters(g, from)) + return 0; + + /* Only checking, for debugfs: */ +@@ -195,7 +207,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) + goto out; + } + +- for (i = g->g; i < g->g + g->nr; i++) { ++ for (i = from; i < g->g + g->nr; i++) { + pref = btree_trans_abort_preference(i->trans); + if (pref > best) { + abort = i; +@@ -229,8 +241,9 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) + ret = abort_lock(g, abort); + out: + if (ret) +- while (g->nr) +- lock_graph_up(g); ++ lock_graph_pop_all(g); ++ else ++ lock_graph_pop_from(g, abort); + return ret; + } + +@@ -243,7 +256,7 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, + for (i = g->g; i < g->g + g->nr; i++) + if (i->trans == trans) { + closure_put(&trans->ref); +- return break_cycle(g, cycle); ++ return break_cycle(g, cycle, i); + } + + if (g->nr == ARRAY_SIZE(g->g)) { +@@ -252,8 +265,7 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, + if (orig_trans->lock_may_not_fail) + return 0; + +- while (g->nr) +- lock_graph_up(g); ++ lock_graph_pop_all(g); + + if (cycle) + return 0; +@@ -281,7 +293,7 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) + + g.nr = 0; + +- if (trans->lock_must_abort) { ++ if (trans->lock_must_abort && !trans->lock_may_not_fail) { + if (cycle) + return -1; + +@@ -336,7 +348,7 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) + * structures - which means it can't be blocked + * waiting on a lock: + */ +- if (!lock_graph_remove_non_waiters(&g)) { ++ if (!lock_graph_remove_non_waiters(&g, g.g)) { + /* + * If lock_graph_remove_non_waiters() + * didn't do anything, it must be +@@ -512,7 +524,6 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, + struct btree_path *path, unsigned level) + { + struct btree *b = path->l[level].b; +- struct six_lock_count count = bch2_btree_node_lock_counts(trans, path, &b->c, level); + + if (!is_btree_node(path, level)) + return false; +@@ -536,24 +547,11 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, + if (race_fault()) + return false; + +- if (btree_node_locked(path, level)) { +- bool ret; +- +- six_lock_readers_add(&b->c.lock, -count.n[SIX_LOCK_read]); +- ret = six_lock_tryupgrade(&b->c.lock); +- six_lock_readers_add(&b->c.lock, count.n[SIX_LOCK_read]); +- +- if (ret) +- goto success; +- } else { +- if (six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) +- goto success; +- } ++ if (btree_node_locked(path, level) ++ ? six_lock_tryupgrade(&b->c.lock) ++ : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) ++ goto success; + +- /* +- * Do we already have an intent lock via another path? 
If so, just bump +- * lock count: +- */ + if (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) { + btree_node_unlock(trans, path, level); +@@ -782,7 +780,7 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) return bch2_trans_relock_fail(trans, path, &f, trace); } @@ -6613,7 +6829,7 @@ index efe2a007b482..8503931463d1 100644 out: bch2_trans_verify_locks(trans); return 0; -@@ -818,6 +818,17 @@ void bch2_trans_unlock_long(struct btree_trans *trans) +@@ -818,6 +816,17 @@ void bch2_trans_unlock_long(struct btree_trans *trans) bch2_trans_srcu_unlock(trans); } @@ -6631,7 +6847,7 @@ index efe2a007b482..8503931463d1 100644 int __bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock) { -@@ -856,6 +867,9 @@ void bch2_btree_path_verify_locks(struct btree_path *path) +@@ -856,6 +865,9 @@ void bch2_btree_path_verify_locks(struct btree_path *path) (want == BTREE_NODE_UNLOCKED || have != BTREE_NODE_WRITE_LOCKED) && want != have); @@ -7024,7 +7240,7 @@ index b6c36c45d0be..2811b6857c97 100644 u8 level; unsigned sectors_written; diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c -index 9bf471fa4361..6b79b672e0b1 100644 +index 9bf471fa4361..2760dd9569ed 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -133,7 +133,7 @@ static inline int bch2_trans_lock_write(struct btree_trans *trans) @@ -7045,6 +7261,15 @@ index 9bf471fa4361..6b79b672e0b1 100644 six_unlock_read(&b->c.lock); bch2_trans_put(trans); +@@ -348,7 +348,7 @@ static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, + unsigned flags) + { + return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, +- trans->journal_u64s, flags); ++ trans->journal_u64s, flags, trans); + } + + #define JSET_ENTRY_LOG_U64s 4 @@ -384,7 +384,7 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, struct bkey_i *new_k; int ret; @@ -9244,15 +9469,16 @@ index e40499fde9a4..43b9d71f2f2b 100644 { switch (type) { diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c -index 1410365a8891..f99ff1819597 100644 +index 1410365a8891..114bf2f3879f 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c -@@ -2,13 +2,33 @@ +@@ -2,13 +2,34 @@ #include "bcachefs.h" #include "checksum.h" #include "compress.h" +#include "error.h" #include "extents.h" ++#include "io_write.h" +#include "opts.h" #include "super-io.h" @@ -9281,7 +9507,7 @@ index 1410365a8891..f99ff1819597 100644 /* Bounce buffer: */ struct bbuf { void *b; -@@ -158,6 +178,19 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, +@@ -158,6 +179,19 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, void *workspace; int ret; @@ -9301,7 +9527,7 @@ index 1410365a8891..f99ff1819597 100644 src_data = bio_map_or_bounce(c, src, READ); switch (crc.compression_type) { -@@ -176,13 +209,13 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, +@@ -176,13 +210,13 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, .avail_out = dst_len, }; @@ -9317,7 +9543,7 @@ index 1410365a8891..f99ff1819597 100644 if (ret != Z_STREAM_END) goto err; -@@ -195,14 +228,14 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, +@@ -195,14 +229,14 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, if (real_src_len > src_len - 4) goto err; @@ -9334,7 +9560,7 @@ index 1410365a8891..f99ff1819597 100644 if (ret != dst_len) goto err; 
-@@ -212,6 +245,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, +@@ -212,6 +246,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, BUG(); } ret = 0; @@ -9342,7 +9568,68 @@ index 1410365a8891..f99ff1819597 100644 out: bio_unmap_or_unbounce(c, src_data); return ret; -@@ -394,8 +428,21 @@ static unsigned __bio_compress(struct bch_fs *c, +@@ -220,11 +255,14 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, + goto out; + } + +-int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, +- struct bch_extent_crc_unpacked *crc) ++int bch2_bio_uncompress_inplace(struct bch_write_op *op, ++ struct bio *bio) + { ++ struct bch_fs *c = op->c; ++ struct bch_extent_crc_unpacked *crc = &op->crc; + struct bbuf data = { NULL }; + size_t dst_len = crc->uncompressed_size << 9; ++ int ret = 0; + + /* bio must own its pages: */ + BUG_ON(!bio->bi_vcnt); +@@ -232,17 +270,26 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, + + if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || + crc->compressed_size << 9 > c->opts.encoded_extent_max) { +- bch_err(c, "error rewriting existing data: extent too big"); ++ struct printbuf buf = PRINTBUF; ++ bch2_write_op_error(&buf, op); ++ prt_printf(&buf, "error rewriting existing data: extent too big"); ++ bch_err_ratelimited(c, "%s", buf.buf); ++ printbuf_exit(&buf); + return -EIO; + } + + data = __bounce_alloc(c, dst_len, WRITE); + + if (__bio_uncompress(c, bio, data.b, *crc)) { +- if (!c->opts.no_data_io) +- bch_err(c, "error rewriting existing data: decompression error"); +- bio_unmap_or_unbounce(c, data); +- return -EIO; ++ if (!c->opts.no_data_io) { ++ struct printbuf buf = PRINTBUF; ++ bch2_write_op_error(&buf, op); ++ prt_printf(&buf, "error rewriting existing data: decompression error"); ++ bch_err_ratelimited(c, "%s", buf.buf); ++ printbuf_exit(&buf); ++ } ++ ret = -EIO; ++ goto err; + } + + /* +@@ -259,9 +306,9 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, + crc->uncompressed_size = crc->live_size; + crc->offset = 0; + crc->csum = (struct bch_csum) { 0, 0 }; +- ++err: + bio_unmap_or_unbounce(c, data); +- return 0; ++ return ret; + } + + int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, +@@ -394,8 +441,21 @@ static unsigned __bio_compress(struct bch_fs *c, unsigned pad; int ret = 0; @@ -9366,7 +9653,7 @@ index 1410365a8891..f99ff1819597 100644 /* If it's only one block, don't bother trying to compress: */ if (src->bi_iter.bi_size <= c->opts.block_size) -@@ -404,7 +451,7 @@ static unsigned __bio_compress(struct bch_fs *c, +@@ -404,7 +464,7 @@ static unsigned __bio_compress(struct bch_fs *c, dst_data = bio_map_or_bounce(c, dst, WRITE); src_data = bio_map_or_bounce(c, src, READ); @@ -9375,7 +9662,7 @@ index 1410365a8891..f99ff1819597 100644 *src_len = src->bi_iter.bi_size; *dst_len = dst->bi_iter.bi_size; -@@ -447,7 +494,7 @@ static unsigned __bio_compress(struct bch_fs *c, +@@ -447,7 +507,7 @@ static unsigned __bio_compress(struct bch_fs *c, *src_len = round_down(*src_len, block_bytes(c)); } @@ -9384,7 +9671,7 @@ index 1410365a8891..f99ff1819597 100644 if (ret) goto err; -@@ -477,6 +524,9 @@ static unsigned __bio_compress(struct bch_fs *c, +@@ -477,6 +537,9 @@ static unsigned __bio_compress(struct bch_fs *c, err: ret = BCH_COMPRESSION_TYPE_incompressible; goto out; @@ -9394,7 +9681,7 @@ index 1410365a8891..f99ff1819597 100644 } unsigned bch2_bio_compress(struct bch_fs *c, -@@ -559,7 +609,6 @@ void bch2_fs_compress_exit(struct bch_fs *c) +@@ -559,7 
+622,6 @@ void bch2_fs_compress_exit(struct bch_fs *c) { unsigned i; @@ -9402,7 +9689,7 @@ index 1410365a8891..f99ff1819597 100644 for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++) mempool_exit(&c->compress_workspace[i]); mempool_exit(&c->compression_bounce[WRITE]); -@@ -568,7 +617,6 @@ void bch2_fs_compress_exit(struct bch_fs *c) +@@ -568,7 +630,6 @@ void bch2_fs_compress_exit(struct bch_fs *c) static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) { @@ -9410,7 +9697,7 @@ index 1410365a8891..f99ff1819597 100644 ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), c->opts.encoded_extent_max); -@@ -576,19 +624,17 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) +@@ -576,19 +637,17 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) struct { unsigned feature; @@ -9439,7 +9726,7 @@ index 1410365a8891..f99ff1819597 100644 }, *i; bool have_compressed = false; -@@ -613,9 +659,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) +@@ -613,9 +672,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) for (i = compression_types; i < compression_types + ARRAY_SIZE(compression_types); i++) { @@ -9449,7 +9736,7 @@ index 1410365a8891..f99ff1819597 100644 if (!(features & (1 << i->feature))) continue; -@@ -628,11 +671,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) +@@ -628,11 +684,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) return -BCH_ERR_ENOMEM_compression_workspace_init; } @@ -9461,6 +9748,21 @@ index 1410365a8891..f99ff1819597 100644 return 0; } +diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h +index 607fd5e232c9..bec2f05bfd52 100644 +--- a/fs/bcachefs/compress.h ++++ b/fs/bcachefs/compress.h +@@ -47,8 +47,8 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) + return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; + } + +-int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, +- struct bch_extent_crc_unpacked *); ++struct bch_write_op; ++int bch2_bio_uncompress_inplace(struct bch_write_op *, struct bio *); + int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, + struct bvec_iter, struct bch_extent_crc_unpacked); + unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index 8f4c3f0665c4..c6151495985f 100644 --- a/fs/bcachefs/darray.h @@ -9475,10 +9777,55 @@ index 8f4c3f0665c4..c6151495985f 100644 #define darray_init(_d) \ do { \ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c -index 8e75a852b358..585214931e05 100644 +index 8e75a852b358..fbe22e56da91 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c -@@ -110,11 +110,8 @@ static void trace_move_extent_fail2(struct data_update *m, +@@ -33,7 +33,7 @@ static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k) + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) { +- if (!bch2_dev_tryget(c, ptr->dev)) { ++ if (unlikely(!bch2_dev_tryget(c, ptr->dev))) { + bkey_for_each_ptr(ptrs, ptr2) { + if (ptr2 == ptr) + break; +@@ -91,15 +91,28 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc + return true; + } + +-static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k) ++static noinline void trace_move_extent_finish2(struct data_update *u, ++ struct bkey_i *new, ++ struct bkey_i *insert) + { +- if (trace_move_extent_finish_enabled()) { 
+- struct printbuf buf = PRINTBUF; ++ struct bch_fs *c = u->op.c; ++ struct printbuf buf = PRINTBUF; + +- bch2_bkey_val_to_text(&buf, c, k); +- trace_move_extent_finish(c, buf.buf); +- printbuf_exit(&buf); +- } ++ prt_newline(&buf); ++ ++ bch2_data_update_to_text(&buf, u); ++ prt_newline(&buf); ++ ++ prt_str_indented(&buf, "new replicas:\t"); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); ++ prt_newline(&buf); ++ ++ prt_str_indented(&buf, "insert:\t"); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); ++ prt_newline(&buf); ++ ++ trace_move_extent_finish(c, buf.buf); ++ printbuf_exit(&buf); + } + + static void trace_move_extent_fail2(struct data_update *m, +@@ -110,11 +123,8 @@ static void trace_move_extent_fail2(struct data_update *m, { struct bch_fs *c = m->op.c; struct bkey_s_c old = bkey_i_to_s_c(m->k.k); @@ -9491,7 +9838,7 @@ index 8e75a852b358..585214931e05 100644 if (!trace_move_extent_fail_enabled()) return; -@@ -122,27 +119,25 @@ static void trace_move_extent_fail2(struct data_update *m, +@@ -122,27 +132,25 @@ static void trace_move_extent_fail2(struct data_update *m, prt_str(&buf, msg); if (insert) { @@ -9531,7 +9878,7 @@ index 8e75a852b358..585214931e05 100644 prt_str(&buf, "\nold: "); bch2_bkey_val_to_text(&buf, c, old); -@@ -194,7 +189,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, +@@ -194,7 +202,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, struct bpos next_pos; bool should_check_enospc; s64 i_sectors_delta = 0, disk_sectors_delta = 0; @@ -9540,7 +9887,7 @@ index 8e75a852b358..585214931e05 100644 bch2_trans_begin(trans); -@@ -231,16 +226,16 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, +@@ -231,16 +239,16 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, * * Fist, drop rewrite_ptrs from @new: */ @@ -9561,7 +9908,7 @@ index 8e75a852b358..585214931e05 100644 } if (m->data_opts.rewrite_ptrs && -@@ -323,8 +318,11 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, +@@ -323,8 +331,11 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, * it's been hard to reproduce, so this should give us some more * information when it does occur: */ @@ -9575,7 +9922,7 @@ index 8e75a852b358..585214931e05 100644 if (invalid) { struct printbuf buf = PRINTBUF; -@@ -362,7 +360,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, +@@ -362,7 +373,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, k.k->p, bkey_start_pos(&insert->k)) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, insert->k.p) ?: @@ -9584,16 +9931,207 @@ index 8e75a852b358..585214931e05 100644 bch2_trans_update(trans, &iter, insert, BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, &op->res, -@@ -540,7 +538,7 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, +@@ -374,7 +385,8 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, + bch2_btree_iter_set_pos(&iter, next_pos); + + this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); +- trace_move_extent_finish2(c, bkey_i_to_s_c(&new->k_i)); ++ if (trace_move_extent_finish_enabled()) ++ trace_move_extent_finish2(m, &new->k_i, insert); + } + err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) +@@ -414,14 +426,15 @@ int bch2_data_update_index_update(struct bch_write_op *op) + return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op)); + } + +-void 
bch2_data_update_read_done(struct data_update *m, +- struct bch_extent_crc_unpacked crc) ++void bch2_data_update_read_done(struct data_update *m) + { ++ m->read_done = true; ++ + /* write bio must own pages: */ + BUG_ON(!m->op.wbio.bio.bi_vcnt); + +- m->op.crc = crc; +- m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9; ++ m->op.crc = m->rbio.pick.crc; ++ m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; + + closure_call(&m->op.cl, bch2_write, NULL, NULL); + } +@@ -431,31 +444,34 @@ void bch2_data_update_exit(struct data_update *update) + struct bch_fs *c = update->op.c; + struct bkey_s_c k = bkey_i_to_s_c(update->k.k); + ++ bch2_bio_free_pages_pool(c, &update->op.wbio.bio); ++ kfree(update->bvecs); ++ update->bvecs = NULL; ++ + if (c->opts.nocow_enabled) + bkey_nocow_unlock(c, k); + bkey_put_dev_refs(c, k); +- bch2_bkey_buf_exit(&update->k, c); + bch2_disk_reservation_put(c, &update->op.res); +- bch2_bio_free_pages_pool(c, &update->op.wbio.bio); ++ bch2_bkey_buf_exit(&update->k, c); + } + +-static void bch2_update_unwritten_extent(struct btree_trans *trans, +- struct data_update *update) ++static int bch2_update_unwritten_extent(struct btree_trans *trans, ++ struct data_update *update) + { + struct bch_fs *c = update->op.c; +- struct bio *bio = &update->op.wbio.bio; + struct bkey_i_extent *e; + struct write_point *wp; + struct closure cl; + struct btree_iter iter; + struct bkey_s_c k; +- int ret; ++ int ret = 0; + + closure_init_stack(&cl); + bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys); + +- while (bio_sectors(bio)) { +- unsigned sectors = bio_sectors(bio); ++ while (bpos_lt(update->op.pos, update->k.k->k.p)) { ++ unsigned sectors = update->k.k->k.p.offset - ++ update->op.pos.offset; + + bch2_trans_begin(trans); + +@@ -491,7 +507,7 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, + bch_err_fn_ratelimited(c, ret); + + if (ret) +- return; ++ break; + + sectors = min(sectors, wp->sectors_free); + +@@ -501,7 +517,6 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, + bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); + bch2_alloc_sectors_done(c, wp); + +- bio_advance(bio, sectors << 9); + update->op.pos.offset += sectors; + + extent_for_each_ptr(extent_i_to_s(e), ptr) +@@ -520,41 +535,60 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, + bch2_trans_unlock(trans); + closure_sync(&cl); + } ++ ++ return ret; + } + + void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) + { +- printbuf_tabstop_push(out, 20); +- prt_str(out, "rewrite ptrs:\t"); ++ if (!out->nr_tabstops) ++ printbuf_tabstop_push(out, 20); ++ ++ prt_str_indented(out, "rewrite ptrs:\t"); + bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); prt_newline(out); - prt_str(out, "compression:\t"); +- prt_str(out, "kill ptrs:\t"); ++ prt_str_indented(out, "kill ptrs:\t"); + bch2_prt_u64_base2(out, data_opts->kill_ptrs); + prt_newline(out); + +- prt_str(out, "target:\t"); ++ prt_str_indented(out, "target:\t"); + bch2_target_to_text(out, c, data_opts->target); + prt_newline(out); + +- prt_str(out, "compression:\t"); - bch2_compression_opt_to_text(out, background_compression(*io_opts)); ++ prt_str_indented(out, "compression:\t"); + bch2_compression_opt_to_text(out, io_opts->background_compression); prt_newline(out); - prt_str(out, "opts.replicas:\t"); -@@ -614,7 +612,7 @@ int bch2_data_update_init(struct btree_trans *trans, +- 
prt_str(out, "opts.replicas:\t"); ++ prt_str_indented(out, "opts.replicas:\t"); + prt_u64(out, io_opts->data_replicas); ++ prt_newline(out); + +- prt_str(out, "extra replicas:\t"); ++ prt_str_indented(out, "extra replicas:\t"); + prt_u64(out, data_opts->extra_replicas); ++ prt_newline(out); + } + + void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) ++{ ++ bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); ++ prt_newline(out); ++ ++ prt_str_indented(out, "old key:\t"); ++ bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); ++} ++ ++void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m) + { + bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); + prt_newline(out); ++ printbuf_indent_add(out, 2); + bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); ++ prt_printf(out, "read_done:\t\%u\n", m->read_done); ++ bch2_write_op_to_text(out, &m->op); ++ printbuf_indent_sub(out, 2); + } + + int bch2_extent_drop_ptrs(struct btree_trans *trans, +@@ -600,6 +634,40 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + } + ++static bool can_allocate_without_blocking(struct bch_fs *c, ++ struct data_update *m) ++{ ++ if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark))) ++ return false; ++ ++ unsigned target = m->op.flags & BCH_WRITE_only_specified_devs ++ ? m->op.target ++ : 0; ++ struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target); ++ ++ darray_for_each(m->op.devs_have, i) ++ __clear_bit(*i, devs.d); ++ ++ rcu_read_lock(); ++ unsigned nr_replicas = 0, i; ++ for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { ++ struct bch_dev *ca = bch2_dev_rcu(c, i); ++ ++ struct bch_dev_usage usage; ++ bch2_dev_usage_read_fast(ca, &usage); ++ ++ if (!dev_buckets_free(ca, usage, m->op.watermark)) ++ continue; ++ ++ nr_replicas += ca->mi.durability; ++ if (nr_replicas >= m->op.nr_replicas) ++ break; ++ } ++ rcu_read_unlock(); ++ ++ return nr_replicas >= m->op.nr_replicas; ++} ++ + int bch2_data_update_init(struct btree_trans *trans, + struct btree_iter *iter, + struct moving_context *ctxt, +@@ -614,7 +682,7 @@ int bch2_data_update_init(struct btree_trans *trans, struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; @@ -9602,18 +10140,38 @@ index 8e75a852b358..585214931e05 100644 int ret = 0; /* -@@ -622,7 +620,7 @@ int bch2_data_update_init(struct btree_trans *trans, +@@ -622,17 +690,8 @@ int bch2_data_update_init(struct btree_trans *trans, * and we have to check for this because we go rw before repairing the * snapshots table - just skip it, we can move it later. 
*/ - if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot))) +- return -BCH_ERR_data_update_done; +- +- if (!bkey_get_dev_refs(c, k)) +- return -BCH_ERR_data_update_done; +- +- if (c->opts.nocow_enabled && +- !bkey_nocow_lock(c, ctxt, k)) { +- bkey_put_dev_refs(c, k); +- return -BCH_ERR_nocow_lock_blocked; +- } + if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot))) - return -BCH_ERR_data_update_done; ++ return -BCH_ERR_data_update_done_no_snapshot; - if (!bkey_get_dev_refs(c, k)) -@@ -652,22 +650,22 @@ int bch2_data_update_init(struct btree_trans *trans, - BCH_WRITE_DATA_ENCODED| - BCH_WRITE_MOVE| + bch2_bkey_buf_init(&m->k); + bch2_bkey_buf_reassemble(&m->k, c, k); +@@ -647,27 +706,27 @@ int bch2_data_update_init(struct btree_trans *trans, + m->op.target = data_opts.target; + m->op.write_point = wp; + m->op.nr_replicas = 0; +- m->op.flags |= BCH_WRITE_PAGES_STABLE| +- BCH_WRITE_PAGES_OWNED| +- BCH_WRITE_DATA_ENCODED| +- BCH_WRITE_MOVE| ++ m->op.flags |= BCH_WRITE_pages_stable| ++ BCH_WRITE_pages_owned| ++ BCH_WRITE_data_encoded| ++ BCH_WRITE_move| m->data_opts.write_flags; - m->op.compression_opt = background_compression(io_opts); + m->op.compression_opt = io_opts.background_compression; @@ -9638,7 +10196,7 @@ index 8e75a852b358..585214931e05 100644 bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); durability_have += bch2_extent_ptr_durability(c, &p); } -@@ -687,7 +685,7 @@ int bch2_data_update_init(struct btree_trans *trans, +@@ -687,7 +746,7 @@ int bch2_data_update_init(struct btree_trans *trans, if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) m->op.incompressible = true; @@ -9647,7 +10205,93 @@ index 8e75a852b358..585214931e05 100644 } unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have)); -@@ -750,14 +748,14 @@ int bch2_data_update_init(struct btree_trans *trans, +@@ -724,7 +783,15 @@ int bch2_data_update_init(struct btree_trans *trans, + /* if iter == NULL, it's just a promote */ + if (iter) + ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts); +- goto out; ++ if (!ret) ++ ret = -BCH_ERR_data_update_done_no_writes_needed; ++ goto out_bkey_buf_exit; ++ } ++ ++ if ((m->op.flags & BCH_WRITE_alloc_nowait) && ++ !can_allocate_without_blocking(c, m)) { ++ ret = -BCH_ERR_data_update_done_would_block; ++ goto out_bkey_buf_exit; + } + + if (reserve_sectors) { +@@ -733,31 +800,77 @@ int bch2_data_update_init(struct btree_trans *trans, + ? 
0 + : BCH_DISK_RESERVATION_NOFAIL); + if (ret) +- goto out; ++ goto out_bkey_buf_exit; ++ } ++ ++ if (!bkey_get_dev_refs(c, k)) { ++ ret = -BCH_ERR_data_update_done_no_dev_refs; ++ goto out_put_disk_res; ++ } ++ ++ if (c->opts.nocow_enabled && ++ !bkey_nocow_lock(c, ctxt, k)) { ++ ret = -BCH_ERR_nocow_lock_blocked; ++ goto out_put_dev_refs; + } + + if (bkey_extent_is_unwritten(k)) { +- bch2_update_unwritten_extent(trans, m); +- goto out; ++ ret = bch2_update_unwritten_extent(trans, m) ?: ++ -BCH_ERR_data_update_done_unwritten; ++ goto out_nocow_unlock; + } + ++ /* write path might have to decompress data: */ ++ unsigned buf_bytes = 0; ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); ++ ++ unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE); ++ ++ m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL); ++ if (!m->bvecs) ++ goto enomem; ++ ++ bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ); ++ bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0); ++ ++ if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) ++ goto enomem; ++ ++ rbio_init(&m->rbio.bio, c, io_opts, NULL); ++ m->rbio.bio.bi_iter.bi_size = buf_bytes; ++ m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); ++ ++ bio_set_prio(&m->op.wbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); ++ + return 0; +-out: +- bch2_data_update_exit(m); +- return ret ?: -BCH_ERR_data_update_done; ++enomem: ++ ret = -ENOMEM; ++ kfree(m->bvecs); ++ m->bvecs = NULL; ++out_nocow_unlock: ++ if (c->opts.nocow_enabled) ++ bkey_nocow_unlock(c, k); ++out_put_dev_refs: ++ bkey_put_dev_refs(c, k); ++out_put_disk_res: ++ bch2_disk_reservation_put(c, &m->op.res); ++out_bkey_buf_exit: ++ bch2_bkey_buf_exit(&m->k, c); ++ return ret; + } + void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -9667,11 +10311,58 @@ index 8e75a852b358..585214931e05 100644 + ptr_bit <<= 1; } } +diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h +index e4b50723428e..f4cf5d17cc37 100644 +--- a/fs/bcachefs/data_update.h ++++ b/fs/bcachefs/data_update.h +@@ -4,6 +4,7 @@ + #define _BCACHEFS_DATA_UPDATE_H + + #include "bkey_buf.h" ++#include "io_read.h" + #include "io_write_types.h" + + struct moving_context; +@@ -22,20 +23,24 @@ void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, + + struct data_update { + /* extent being updated: */ ++ bool read_done; + enum btree_id btree_id; + struct bkey_buf k; + struct data_update_opts data_opts; + struct moving_context *ctxt; + struct bch_move_stats *stats; ++ ++ struct bch_read_bio rbio; + struct bch_write_op op; ++ struct bio_vec *bvecs; + }; + + void bch2_data_update_to_text(struct printbuf *, struct data_update *); ++void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *); + + int bch2_data_update_index_update(struct bch_write_op *); + +-void bch2_data_update_read_done(struct data_update *, +- struct bch_extent_crc_unpacked); ++void bch2_data_update_read_done(struct data_update *); + + int bch2_extent_drop_ptrs(struct btree_trans *, + struct btree_iter *, diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c -index 45aec1afdb0e..b5de52a50d10 100644 +index 45aec1afdb0e..55333e82d1fe 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c -@@ -472,7 +472,9 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * +@@ -20,6 +20,7 @@ + #include "extents.h" + #include 
"fsck.h" + #include "inode.h" ++#include "journal_reclaim.h" + #include "super.h" + + #include +@@ -472,7 +473,9 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * if (!out->nr_tabstops) printbuf_tabstop_push(out, 32); @@ -9732,7 +10423,7 @@ index faffc98d5605..600eee936f13 100644 POS(inum.inum, U64_MAX), inum.subvol, 0, k, ({ diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h -index 53ad99666022..362b3b2f2f2e 100644 +index 53ad99666022..a633f83c1ac7 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -4,10 +4,10 @@ @@ -9748,6 +10439,18 @@ index 53ad99666022..362b3b2f2f2e 100644 void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_dirent ((struct bkey_ops) { \ +@@ -31,6 +31,11 @@ static inline unsigned dirent_val_u64s(unsigned len) + sizeof(u64)); + } + ++static inline unsigned int dirent_occupied_size(const struct qstr *name) ++{ ++ return (BKEY_U64s + dirent_val_u64s(name->len)) * sizeof(u64); ++} ++ + int bch2_dirent_read_target(struct btree_trans *, subvol_inum, + struct bkey_s_c_dirent, subvol_inum *); + diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 07eb8fa1b026..b32e91ba8be8 100644 --- a/fs/bcachefs/disk_accounting.c @@ -10780,8 +11483,36 @@ index 43326370b410..583ca6a226da 100644 void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned, +diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h +index 64ef52e00078..b9770f24f213 100644 +--- a/fs/bcachefs/ec_format.h ++++ b/fs/bcachefs/ec_format.h +@@ -20,6 +20,23 @@ struct bch_stripe { + */ + __u8 disk_label; + ++ /* ++ * Variable length sections: ++ * - Pointers ++ * - Checksums ++ * 2D array of [stripe block/device][csum block], with checksum block ++ * size given by csum_granularity_bits ++ * - Block sector counts: per-block array of u16s ++ * ++ * XXX: ++ * Either checksums should have come last, or we should have included a ++ * checksum_size field (the size in bytes of the checksum itself, not ++ * the blocksize the checksum covers). ++ * ++ * Currently we aren't able to access the block sector counts if the ++ * checksum type is unknown. 
++ */ ++ + struct bch_extent_ptr ptrs[]; + } __packed __aligned(8); + diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h -index 9c4fe5cdbfb7..4590cd0c7c90 100644 +index 9c4fe5cdbfb7..d65a75e7216e 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -54,7 +54,8 @@ @@ -10819,7 +11550,7 @@ index 9c4fe5cdbfb7..4590cd0c7c90 100644 x(0, journal_reclaim_would_deadlock) \ x(EINVAL, fsck) \ x(BCH_ERR_fsck, fsck_fix) \ -@@ -173,7 +176,9 @@ +@@ -173,8 +176,15 @@ x(BCH_ERR_fsck, fsck_errors_not_fixed) \ x(BCH_ERR_fsck, fsck_repair_unimplemented) \ x(BCH_ERR_fsck, fsck_repair_impossible) \ @@ -10828,9 +11559,15 @@ index 9c4fe5cdbfb7..4590cd0c7c90 100644 + x(EINVAL, not_in_recovery) \ + x(EINVAL, cannot_rewind_recovery) \ x(0, data_update_done) \ ++ x(BCH_ERR_data_update_done, data_update_done_would_block) \ ++ x(BCH_ERR_data_update_done, data_update_done_unwritten) \ ++ x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \ ++ x(BCH_ERR_data_update_done, data_update_done_no_snapshot) \ ++ x(BCH_ERR_data_update_done, data_update_done_no_dev_refs) \ x(EINVAL, device_state_not_allowed) \ x(EINVAL, member_info_missing) \ -@@ -192,7 +197,9 @@ + x(EINVAL, mismatched_block_size) \ +@@ -192,7 +202,9 @@ x(EINVAL, opt_parse_error) \ x(EINVAL, remove_with_metadata_missing_unimplemented)\ x(EINVAL, remove_would_lose_data) \ @@ -10841,7 +11578,7 @@ index 9c4fe5cdbfb7..4590cd0c7c90 100644 x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ -@@ -241,7 +248,10 @@ +@@ -241,7 +253,10 @@ x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \ x(BCH_ERR_invalid, invalid_bkey) \ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ @@ -10852,7 +11589,7 @@ index 9c4fe5cdbfb7..4590cd0c7c90 100644 x(EIO, sb_not_downgraded) \ x(EIO, btree_node_write_all_failed) \ x(EIO, btree_node_read_error) \ -@@ -257,6 +267,8 @@ +@@ -257,6 +272,8 @@ x(EIO, no_device_to_read_from) \ x(EIO, missing_indirect_extent) \ x(EIO, invalidate_stripe_to_dev) \ @@ -10861,7 +11598,7 @@ index 9c4fe5cdbfb7..4590cd0c7c90 100644 x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ -@@ -305,6 +317,7 @@ static inline long bch2_err_class(long err) +@@ -305,6 +322,7 @@ static inline long bch2_err_class(long err) #define BLK_STS_REMOVED ((__force blk_status_t)128) @@ -11844,7 +12581,7 @@ index 3bd2fdbb0817..c198dfc376d6 100644 union bch_extent_entry { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c -index 7e10a9ddcfd9..2c3d46ac70c6 100644 +index 7e10a9ddcfd9..d70d9f634cea 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -69,9 +69,7 @@ int bch2_create_trans(struct btree_trans *trans, @@ -11858,7 +12595,15 @@ index 7e10a9ddcfd9..2c3d46ac70c6 100644 if (ret) goto err; -@@ -172,6 +170,10 @@ int bch2_create_trans(struct btree_trans *trans, +@@ -154,6 +152,7 @@ int bch2_create_trans(struct btree_trans *trans, + if (is_subdir_for_nlink(new_inode)) + dir_u->bi_nlink++; + dir_u->bi_mtime = dir_u->bi_ctime = now; ++ dir_u->bi_size += dirent_occupied_size(name); + + ret = bch2_inode_write(trans, &dir_iter, dir_u); + if (ret) +@@ -172,6 +171,10 @@ int bch2_create_trans(struct btree_trans *trans, new_inode->bi_dir_offset = dir_offset; } @@ -11869,7 +12614,38 @@ index 7e10a9ddcfd9..2c3d46ac70c6 100644 inode_iter.flags &= ~BTREE_ITER_all_snapshots; 
bch2_btree_iter_set_snapshot(&inode_iter, snapshot); -@@ -512,6 +514,15 @@ int bch2_rename_trans(struct btree_trans *trans, +@@ -218,6 +221,7 @@ int bch2_link_trans(struct btree_trans *trans, + } + + dir_u->bi_mtime = dir_u->bi_ctime = now; ++ dir_u->bi_size += dirent_occupied_size(name); + + dir_hash = bch2_hash_info_init(c, dir_u); + +@@ -320,6 +324,7 @@ int bch2_unlink_trans(struct btree_trans *trans, + + dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; + dir_u->bi_nlink -= is_subdir_for_nlink(inode_u); ++ dir_u->bi_size -= dirent_occupied_size(name); + + ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash, &dirent_iter, +@@ -458,6 +463,14 @@ int bch2_rename_trans(struct btree_trans *trans, + goto err; + } + ++ if (mode == BCH_RENAME) { ++ src_dir_u->bi_size -= dirent_occupied_size(src_name); ++ dst_dir_u->bi_size += dirent_occupied_size(dst_name); ++ } ++ ++ if (mode == BCH_RENAME_OVERWRITE) ++ src_dir_u->bi_size -= dirent_occupied_size(src_name); ++ + if (src_inode_u->bi_parent_subvol) + src_inode_u->bi_parent_subvol = dst_dir.subvol; + +@@ -512,6 +525,15 @@ int bch2_rename_trans(struct btree_trans *trans, dst_dir_u->bi_nlink++; } @@ -11885,7 +12661,7 @@ index 7e10a9ddcfd9..2c3d46ac70c6 100644 if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { dst_dir_u->bi_nlink--; src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; -@@ -548,3 +559,94 @@ int bch2_rename_trans(struct btree_trans *trans, +@@ -548,3 +570,94 @@ int bch2_rename_trans(struct btree_trans *trans, bch2_trans_iter_exit(trans, &src_dir_iter); return ret; } @@ -11992,10 +12768,25 @@ index c934e807b380..2b59210bb5e8 100644 + #endif /* _BCACHEFS_FS_COMMON_H */ diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c -index 95972809e76d..ab1d5db2fa56 100644 +index 95972809e76d..83e15908250d 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c -@@ -164,7 +164,8 @@ static void bchfs_read(struct btree_trans *trans, +@@ -149,12 +149,10 @@ static void bchfs_read(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_buf sk; +- int flags = BCH_READ_RETRY_IF_STALE| +- BCH_READ_MAY_PROMOTE; ++ int flags = BCH_READ_retry_if_stale| ++ BCH_READ_may_promote; + int ret = 0; + +- rbio->c = c; +- rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; + + bch2_bkey_buf_init(&sk); +@@ -164,7 +162,8 @@ static void bchfs_read(struct btree_trans *trans, BTREE_ITER_slots); while (1) { struct bkey_s_c k; @@ -12005,7 +12796,7 @@ index 95972809e76d..ab1d5db2fa56 100644 enum btree_id data_btree = BTREE_ID_extents; bch2_trans_begin(trans); -@@ -197,7 +198,7 @@ static void bchfs_read(struct btree_trans *trans, +@@ -197,7 +196,7 @@ static void bchfs_read(struct btree_trans *trans, k = bkey_i_to_s_c(sk.k); @@ -12014,7 +12805,24 @@ index 95972809e76d..ab1d5db2fa56 100644 if (readpages_iter) { ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, -@@ -230,10 +231,12 @@ static void bchfs_read(struct btree_trans *trans, +@@ -210,14 +209,14 @@ static void bchfs_read(struct btree_trans *trans, + swap(rbio->bio.bi_iter.bi_size, bytes); + + if (rbio->bio.bi_iter.bi_size == bytes) +- flags |= BCH_READ_LAST_FRAGMENT; ++ flags |= BCH_READ_last_fragment; + + bch2_bio_page_state_set(&rbio->bio, k); + + bch2_read_extent(trans, rbio, iter.pos, + data_btree, k, offset_into_extent, flags); + +- if (flags & BCH_READ_LAST_FRAGMENT) ++ if (flags & BCH_READ_last_fragment) + break; + + swap(rbio->bio.bi_iter.bi_size, bytes); +@@ -230,10 
+229,12 @@ static void bchfs_read(struct btree_trans *trans, bch2_trans_iter_exit(trans, &iter); if (ret) { @@ -12031,7 +12839,7 @@ index 95972809e76d..ab1d5db2fa56 100644 rbio->bio.bi_status = BLK_STS_IOERR; bio_endio(&rbio->bio); } -@@ -248,6 +251,7 @@ void bch2_readahead(struct readahead_control *ractl) +@@ -248,6 +249,7 @@ void bch2_readahead(struct readahead_control *ractl) struct bch_io_opts opts; struct folio *folio; struct readpages_iter readpages_iter; @@ -12039,7 +12847,7 @@ index 95972809e76d..ab1d5db2fa56 100644 bch2_inode_opts_get(&opts, c, &inode->ei_inode); -@@ -255,6 +259,16 @@ void bch2_readahead(struct readahead_control *ractl) +@@ -255,6 +257,16 @@ void bch2_readahead(struct readahead_control *ractl) if (ret) return; @@ -12056,7 +12864,23 @@ index 95972809e76d..ab1d5db2fa56 100644 bch2_pagecache_add_get(inode); struct btree_trans *trans = bch2_trans_get(c); -@@ -281,7 +295,7 @@ void bch2_readahead(struct readahead_control *ractl) +@@ -266,12 +278,13 @@ void bch2_readahead(struct readahead_control *ractl) + struct bch_read_bio *rbio = + rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, + GFP_KERNEL, &c->bio_read), +- opts); ++ c, ++ opts, ++ bch2_readpages_end_io); + + readpage_iter_advance(&readpages_iter); + + rbio->bio.bi_iter.bi_sector = folio_sector(folio); +- rbio->bio.bi_end_io = bch2_readpages_end_io; + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + + bchfs_read(trans, rbio, inode_inum(inode), +@@ -281,7 +294,7 @@ void bch2_readahead(struct readahead_control *ractl) bch2_trans_put(trans); bch2_pagecache_add_put(inode); @@ -12065,7 +12889,7 @@ index 95972809e76d..ab1d5db2fa56 100644 darray_exit(&readpages_iter.folios); } -@@ -296,9 +310,13 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) +@@ -296,24 +309,30 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_read_bio *rbio; struct bch_io_opts opts; @@ -12079,7 +12903,17 @@ index 95972809e76d..ab1d5db2fa56 100644 if (!bch2_folio_create(folio, GFP_KERNEL)) return -ENOMEM; -@@ -313,7 +331,9 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), +- opts); ++ c, ++ opts, ++ bch2_read_single_folio_end_io); + rbio->bio.bi_private = &done; +- rbio->bio.bi_end_io = bch2_read_single_folio_end_io; +- + rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; rbio->bio.bi_iter.bi_sector = folio_sector(folio); BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); @@ -12089,7 +12923,16 @@ index 95972809e76d..ab1d5db2fa56 100644 wait_for_completion(&done); ret = blk_status_to_errno(rbio->bio.bi_status); -@@ -605,15 +625,6 @@ static int __bch2_writepage(struct folio *folio, +@@ -400,7 +419,7 @@ static void bch2_writepage_io_done(struct bch_write_op *op) + } + } + +- if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { ++ if (io->op.flags & BCH_WRITE_wrote_data_inline) { + bio_for_each_folio_all(fi, bio) { + struct bch_folio *s; + +@@ -605,15 +624,6 @@ static int __bch2_writepage(struct folio *folio, BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, sectors << 9, offset << 9)); @@ -12105,7 +12948,7 @@ index 95972809e76d..ab1d5db2fa56 100644 w->io->op.res.sectors += reserved_sectors; w->io->op.i_sectors_delta -= dirty_sectors; w->io->op.new_i_size = i_size; -@@ -669,7 +680,7 @@ int bch2_write_begin(struct file *file, struct address_space 
*mapping, +@@ -669,7 +679,7 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, FGP_WRITEBEGIN | fgf_set_order(len), mapping_gfp_mask(mapping)); @@ -12115,18 +12958,30 @@ index 95972809e76d..ab1d5db2fa56 100644 offset = pos - folio_pos(folio); diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c -index 6d3a05ae5da8..2089c36b5866 100644 +index 6d3a05ae5da8..535bc5fcbcc0 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c -@@ -70,6 +70,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) +@@ -70,8 +70,10 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) struct bch_io_opts opts; struct dio_read *dio; struct bio *bio; + struct blk_plug plug; loff_t offset = req->ki_pos; bool sync = is_sync_kiocb(req); ++ bool split = false; size_t shorten; -@@ -128,6 +129,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) + ssize_t ret; + +@@ -98,8 +100,6 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) + GFP_KERNEL, + &c->dio_read_bioset); + +- bio->bi_end_io = bch2_direct_IO_read_endio; +- + dio = container_of(bio, struct dio_read, rbio.bio); + closure_init(&dio->cl, NULL); + +@@ -128,14 +128,17 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) */ dio->should_dirty = iter_is_iovec(iter); @@ -12134,9 +12989,31 @@ index 6d3a05ae5da8..2089c36b5866 100644 + goto start; while (iter->count) { ++ split = true; ++ bio = bio_alloc_bioset(NULL, -@@ -160,6 +163,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) - bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + REQ_OP_READ, + GFP_KERNEL, + &c->bio_read); +- bio->bi_end_io = bch2_direct_IO_read_split_endio; + start: + bio->bi_opf = REQ_OP_READ|REQ_SYNC; + bio->bi_iter.bi_sector = offset >> 9; +@@ -157,9 +160,19 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) + if (iter->count) + closure_get(&dio->cl); + +- bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); ++ struct bch_read_bio *rbio = ++ rbio_init(bio, ++ c, ++ opts, ++ split ++ ? 
bch2_direct_IO_read_split_endio ++ : bch2_direct_IO_read_endio); ++ ++ bch2_read(c, rbio, inode_inum(inode)); } + blk_finish_plug(&plug); @@ -12144,6 +13021,17 @@ index 6d3a05ae5da8..2089c36b5866 100644 iter->count += shorten; if (sync) { +@@ -506,8 +519,8 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) + dio->op.devs_need_flush = &inode->ei_devs_need_flush; + + if (sync) +- dio->op.flags |= BCH_WRITE_SYNC; +- dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; ++ dio->op.flags |= BCH_WRITE_sync; ++ dio->op.flags |= BCH_WRITE_check_enospc; + + ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, + bio_sectors(bio), true); diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index 1d4910ea0f1d..e072900e6a5b 100644 --- a/fs/bcachefs/fs-io-pagecache.c @@ -12310,7 +13198,7 @@ index 405cf08bda34..15725b4ce393 100644 return error; } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c -index a41d0d8a2f7b..3f83f131d0e8 100644 +index a41d0d8a2f7b..90ade8f648d9 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -23,6 +23,7 @@ @@ -12329,7 +13217,17 @@ index a41d0d8a2f7b..3f83f131d0e8 100644 #include #include #include -@@ -89,10 +91,25 @@ int __must_check bch2_write_inode(struct bch_fs *c, +@@ -65,6 +67,9 @@ void bch2_inode_update_after_write(struct btree_trans *trans, + i_gid_write(&inode->v, bi->bi_gid); + inode->v.i_mode = bi->bi_mode; + ++ if (fields & ATTR_SIZE) ++ i_size_write(&inode->v, bi->bi_size); ++ + if (fields & ATTR_ATIME) + inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime)); + if (fields & ATTR_MTIME) +@@ -89,10 +94,25 @@ int __must_check bch2_write_inode(struct bch_fs *c, retry: bch2_trans_begin(trans); @@ -12359,7 +13257,7 @@ index a41d0d8a2f7b..3f83f131d0e8 100644 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); /* -@@ -101,7 +118,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, +@@ -101,7 +121,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, */ if (!ret) bch2_inode_update_after_write(trans, inode, &inode_u, fields); @@ -12368,7 +13266,7 @@ index a41d0d8a2f7b..3f83f131d0e8 100644 bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -@@ -160,8 +177,9 @@ static bool subvol_inum_eq(subvol_inum a, subvol_inum b) +@@ -160,8 +180,9 @@ static bool subvol_inum_eq(subvol_inum a, subvol_inum b) static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) { const subvol_inum *inum = data; @@ -12379,7 +13277,7 @@ index a41d0d8a2f7b..3f83f131d0e8 100644 } static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed) -@@ -190,11 +208,18 @@ static const struct rhashtable_params bch2_vfs_inodes_params = { +@@ -190,11 +211,18 @@ static const struct rhashtable_params bch2_vfs_inodes_params = { .automatic_shrinking = true, }; @@ -12400,7 +13298,7 @@ index a41d0d8a2f7b..3f83f131d0e8 100644 DARRAY(u32) subvols; int ret = 0; -@@ -219,15 +244,15 @@ int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) +@@ -219,15 +247,15 @@ int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) struct rhash_lock_head __rcu *const *bkt; struct rhash_head *he; unsigned int hash; @@ -12419,7 +13317,7 @@ index a41d0d8a2f7b..3f83f131d0e8 100644 ret = darray_push_gfp(&subvols, inode->ei_inum.subvol, GFP_NOWAIT|__GFP_NOWARN); if (ret) { -@@ -248,7 +273,7 @@ int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) +@@ -248,7 +276,7 @@ int bch2_inode_or_descendents_is_open(struct btree_trans *trans, 
struct bpos p) /* Ensure we see any new tables. */ smp_rmb(); @@ -12428,7 +13326,7 @@ index a41d0d8a2f7b..3f83f131d0e8 100644 if (unlikely(tbl)) goto restart; rcu_read_unlock(); -@@ -327,7 +352,11 @@ static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inod +@@ -327,7 +355,11 @@ static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inod spin_unlock(&inode->v.i_lock); if (remove) { @@ -12441,7 +13339,7 @@ index a41d0d8a2f7b..3f83f131d0e8 100644 &inode->hash, bch2_vfs_inodes_params); BUG_ON(ret); inode->v.i_hash.pprev = NULL; -@@ -372,6 +401,11 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, +@@ -372,6 +404,11 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, discard_new_inode(&inode->v); return old; } else { @@ -12453,7 +13351,7 @@ index a41d0d8a2f7b..3f83f131d0e8 100644 inode_fake_hash(&inode->v); inode_sb_list_add(&inode->v); -@@ -465,7 +499,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) +@@ -465,7 +502,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) struct bch_inode_unpacked inode_u; struct bch_subvolume subvol; int ret = lockrestart_do(trans, @@ -12462,7 +13360,7 @@ index a41d0d8a2f7b..3f83f131d0e8 100644 bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); bch2_trans_put(trans); -@@ -535,8 +569,7 @@ __bch2_create(struct mnt_idmap *idmap, +@@ -535,8 +572,7 @@ __bch2_create(struct mnt_idmap *idmap, inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol; inum.inum = inode_u.bi_inum; @@ -12472,7 +13370,16 @@ index a41d0d8a2f7b..3f83f131d0e8 100644 bch2_trans_commit(trans, NULL, &journal_seq, 0); if (unlikely(ret)) { bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, -@@ -617,7 +650,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, +@@ -549,7 +585,7 @@ __bch2_create(struct mnt_idmap *idmap, + + if (!(flags & BCH_CREATE_TMPFILE)) { + bch2_inode_update_after_write(trans, dir, &dir_u, +- ATTR_MTIME|ATTR_CTIME); ++ ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); + mutex_unlock(&dir->ei_update_lock); + } + +@@ -617,7 +653,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, struct bch_subvolume subvol; struct bch_inode_unpacked inode_u; @@ -12481,7 +13388,7 @@ index a41d0d8a2f7b..3f83f131d0e8 100644 bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); -@@ -628,7 +661,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, +@@ -628,7 +664,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, goto err; /* regular files may have hardlinks: */ @@ -12490,7 +13397,39 @@ index a41d0d8a2f7b..3f83f131d0e8 100644 !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)), c, "dirent points to inode that does not point back:\n %s", -@@ -1245,7 +1278,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, +@@ -706,7 +742,7 @@ static int __bch2_link(struct bch_fs *c, + + if (likely(!ret)) { + bch2_inode_update_after_write(trans, dir, &dir_u, +- ATTR_MTIME|ATTR_CTIME); ++ ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); + bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME); + } + +@@ -759,7 +795,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, + goto err; + + bch2_inode_update_after_write(trans, dir, &dir_u, +- ATTR_MTIME|ATTR_CTIME); ++ 
ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); + bch2_inode_update_after_write(trans, inode, &inode_u, + ATTR_MTIME); + +@@ -937,11 +973,11 @@ static int bch2_rename2(struct mnt_idmap *idmap, + dst_inode->v.i_ino != dst_inode_u.bi_inum); + + bch2_inode_update_after_write(trans, src_dir, &src_dir_u, +- ATTR_MTIME|ATTR_CTIME); ++ ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); + + if (src_dir != dst_dir) + bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u, +- ATTR_MTIME|ATTR_CTIME); ++ ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); + + bch2_inode_update_after_write(trans, src_inode, &src_inode_u, + ATTR_CTIME); +@@ -1245,7 +1281,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct btree_iter iter; struct bkey_s_c k; struct bkey_buf cur, prev; @@ -12498,7 +13437,7 @@ index a41d0d8a2f7b..3f83f131d0e8 100644 bool have_extent = false; int ret = 0; -@@ -1278,7 +1310,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, +@@ -1278,7 +1313,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_btree_iter_set_snapshot(&iter, snapshot); @@ -12507,7 +13446,7 @@ index a41d0d8a2f7b..3f83f131d0e8 100644 ret = bkey_err(k); if (ret) continue; -@@ -1292,9 +1324,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, +@@ -1292,9 +1327,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, continue; } @@ -12519,7 +13458,7 @@ index a41d0d8a2f7b..3f83f131d0e8 100644 bch2_bkey_buf_reassemble(&cur, c, k); -@@ -1306,7 +1337,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, +@@ -1306,7 +1340,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, k = bkey_i_to_s_c(cur.k); bch2_bkey_buf_realloc(&prev, c, k.k->u64s); @@ -12528,7 +13467,7 @@ index a41d0d8a2f7b..3f83f131d0e8 100644 bch2_cut_front(POS(k.k->p.inode, bkey_start_offset(k.k) + -@@ -1736,7 +1767,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, +@@ -1736,7 +1770,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, bch2_inode_update_after_write(trans, inode, bi, ~0); inode->v.i_blocks = bi->bi_sectors; @@ -12536,7 +13475,7 @@ index a41d0d8a2f7b..3f83f131d0e8 100644 inode->v.i_rdev = bi->bi_dev; inode->v.i_generation = bi->bi_generation; inode->v.i_size = bi->bi_size; -@@ -2200,7 +2230,8 @@ static int bch2_fs_get_tree(struct fs_context *fc) +@@ -2200,7 +2233,8 @@ static int bch2_fs_get_tree(struct fs_context *fc) sb->s_time_gran = c->sb.nsec_per_time_unit; sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); @@ -12546,7 +13485,7 @@ index a41d0d8a2f7b..3f83f131d0e8 100644 sb->s_shrink->seeks = 0; c->vfs_sb = sb; strscpy(sb->s_id, c->name, sizeof(sb->s_id)); -@@ -2345,13 +2376,16 @@ static int bch2_init_fs_context(struct fs_context *fc) +@@ -2345,13 +2379,16 @@ static int bch2_init_fs_context(struct fs_context *fc) void bch2_fs_vfs_exit(struct bch_fs *c) { @@ -12577,7 +13516,7 @@ index 59f9f7ae728d..dd2198541455 100644 struct list_head ei_vfs_inode_list; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c -index 75c8a97a6954..3917d75f3c98 100644 +index 75c8a97a6954..8fcf7c8e5ede 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1,6 +1,7 @@ @@ -13090,7 +14029,7 @@ index 75c8a97a6954..3917d75f3c98 100644 static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, -@@ -1260,7 +1110,7 @@ static int 
get_snapshot_root_inode(struct btree_trans *trans, +@@ -1260,12 +1110,43 @@ static int get_snapshot_root_inode(struct btree_trans *trans, goto err; BUG(); found_root: @@ -13099,7 +14038,43 @@ index 75c8a97a6954..3917d75f3c98 100644 err: bch2_trans_iter_exit(trans, &iter); return ret; -@@ -1291,7 +1141,9 @@ static int check_inode(struct btree_trans *trans, + } + ++static int check_directory_size(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode_u, ++ struct bkey_s_c inode_k, bool *write_inode) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u64 new_size = 0; ++ int ret; ++ ++ for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents, ++ SPOS(inode_k.k->p.offset, 0, inode_k.k->p.snapshot), ++ POS(inode_k.k->p.offset, U64_MAX), ++ 0, k, ret) { ++ if (k.k->type != KEY_TYPE_dirent) ++ continue; ++ ++ struct bkey_s_c_dirent dirent = bkey_s_c_to_dirent(k); ++ struct qstr name = bch2_dirent_get_name(dirent); ++ ++ new_size += dirent_occupied_size(&name); ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ if (!ret && inode_u->bi_size != new_size) { ++ inode_u->bi_size = new_size; ++ *write_inode = true; ++ } ++ ++ return ret; ++} ++ + static int check_inode(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, +@@ -1291,7 +1172,9 @@ static int check_inode(struct btree_trans *trans, if (!bkey_is_inode(k.k)) return 0; @@ -13110,7 +14085,7 @@ index 75c8a97a6954..3917d75f3c98 100644 if (snapshot_root->bi_inum != u.bi_inum) { ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum); -@@ -1302,7 +1154,7 @@ static int check_inode(struct btree_trans *trans, +@@ -1302,7 +1185,7 @@ static int check_inode(struct btree_trans *trans, if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed || INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root), trans, inode_snapshot_mismatch, @@ -13119,7 +14094,7 @@ index 75c8a97a6954..3917d75f3c98 100644 u.bi_hash_seed = snapshot_root->bi_hash_seed; SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root)); do_update = true; -@@ -1392,7 +1244,7 @@ static int check_inode(struct btree_trans *trans, +@@ -1392,7 +1275,7 @@ static int check_inode(struct btree_trans *trans, if (fsck_err_on(!ret, trans, inode_unlinked_and_not_open, @@ -13128,7 +14103,7 @@ index 75c8a97a6954..3917d75f3c98 100644 u.bi_inum, u.bi_snapshot)) { ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); bch_err_msg(c, ret, "in fsck deleting inode"); -@@ -1415,7 +1267,7 @@ static int check_inode(struct btree_trans *trans, +@@ -1415,7 +1298,7 @@ static int check_inode(struct btree_trans *trans, if (u.bi_subvol) { struct bch_subvolume s; @@ -13137,7 +14112,7 @@ index 75c8a97a6954..3917d75f3c98 100644 if (ret && !bch2_err_matches(ret, ENOENT)) goto err; -@@ -1441,6 +1293,17 @@ static int check_inode(struct btree_trans *trans, +@@ -1441,6 +1324,27 @@ static int check_inode(struct btree_trans *trans, do_update = true; } } @@ -13151,11 +14126,21 @@ index 75c8a97a6954..3917d75f3c98 100644 + buf.buf))) { + u.bi_journal_seq = journal_cur_seq(&c->journal); + do_update = true; ++ } ++ ++ if (S_ISDIR(u.bi_mode)) { ++ ret = check_directory_size(trans, &u, k, &do_update); ++ ++ fsck_err_on(ret, ++ trans, directory_size_mismatch, ++ "directory inode %llu:%u with the mismatch directory size", ++ u.bi_inum, k.k->p.snapshot); ++ ret = 0; + } do_update: if (do_update) { ret = __bch2_fsck_write_inode(trans, &u); -@@ -1502,7 +1365,9 @@ static int find_oldest_inode_needs_reattach(struct btree_trans *trans, +@@ -1502,7 +1406,9 @@ static int 
find_oldest_inode_needs_reattach(struct btree_trans *trans, break; struct bch_inode_unpacked parent_inode; @@ -13166,7 +14151,7 @@ index 75c8a97a6954..3917d75f3c98 100644 if (!inode_should_reattach(&parent_inode)) break; -@@ -1525,7 +1390,9 @@ static int check_unreachable_inode(struct btree_trans *trans, +@@ -1525,7 +1431,9 @@ static int check_unreachable_inode(struct btree_trans *trans, return 0; struct bch_inode_unpacked inode; @@ -13177,7 +14162,7 @@ index 75c8a97a6954..3917d75f3c98 100644 if (!inode_should_reattach(&inode)) return 0; -@@ -1649,7 +1516,7 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal +@@ -1649,7 +1557,7 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal if (i->count != count2) { bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", w->last_pos.inode, i->snapshot, i->count, count2); @@ -13186,7 +14171,7 @@ index 75c8a97a6954..3917d75f3c98 100644 } if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), -@@ -1753,7 +1620,7 @@ static int overlapping_extents_found(struct btree_trans *trans, +@@ -1753,7 +1661,7 @@ static int overlapping_extents_found(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter1, btree, pos1, BTREE_ITER_all_snapshots| BTREE_ITER_not_extents); @@ -13195,7 +14180,7 @@ index 75c8a97a6954..3917d75f3c98 100644 ret = bkey_err(k1); if (ret) goto err; -@@ -1778,7 +1645,7 @@ static int overlapping_extents_found(struct btree_trans *trans, +@@ -1778,7 +1686,7 @@ static int overlapping_extents_found(struct btree_trans *trans, while (1) { bch2_btree_iter_advance(&iter2); @@ -13204,7 +14189,7 @@ index 75c8a97a6954..3917d75f3c98 100644 ret = bkey_err(k2); if (ret) goto err; -@@ -2156,7 +2023,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, +@@ -2156,7 +2064,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, return __bch2_fsck_write_inode(trans, target); } @@ -13213,7 +14198,7 @@ index 75c8a97a6954..3917d75f3c98 100644 !fsck_err(trans, inode_wrong_backpointer, "dirent points to inode that does not point back:\n %s", (bch2_bkey_val_to_text(&buf, c, d.s_c), -@@ -2480,7 +2347,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, +@@ -2480,7 +2388,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); dir->first_this_inode = false; @@ -13222,7 +14207,7 @@ index 75c8a97a6954..3917d75f3c98 100644 if (ret < 0) goto err; if (ret) { -@@ -2519,6 +2386,30 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, +@@ -2519,6 +2427,30 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; } @@ -13253,7 +14238,7 @@ index 75c8a97a6954..3917d75f3c98 100644 } ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); -@@ -2594,7 +2485,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, +@@ -2594,7 +2526,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); inode->first_this_inode = false; @@ -13262,7 +14247,7 @@ index 75c8a97a6954..3917d75f3c98 100644 bch_err_fn(c, ret); return ret; } -@@ -2774,6 +2665,48 @@ struct pathbuf_entry { +@@ -2774,6 +2706,48 @@ struct pathbuf_entry { typedef DARRAY(struct pathbuf_entry) pathbuf; @@ -13311,7 +14296,7 @@ index 75c8a97a6954..3917d75f3c98 100644 static bool path_is_dup(pathbuf *p, u64 inum, u32 
snapshot) { darray_for_each(*p, i) -@@ -2783,21 +2716,21 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) +@@ -2783,21 +2757,21 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) return false; } @@ -13341,7 +14326,7 @@ index 75c8a97a6954..3917d75f3c98 100644 while (!inode.bi_subvol) { struct btree_iter dirent_iter; -@@ -2807,7 +2740,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino +@@ -2807,7 +2781,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot); ret = bkey_err(d.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) @@ -13350,7 +14335,7 @@ index 75c8a97a6954..3917d75f3c98 100644 if (!ret && (ret = dirent_points_to_inode(c, d, &inode))) bch2_trans_iter_exit(trans, &dirent_iter); -@@ -2822,7 +2755,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino +@@ -2822,7 +2796,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino bch2_trans_iter_exit(trans, &dirent_iter); @@ -13359,7 +14344,7 @@ index 75c8a97a6954..3917d75f3c98 100644 .inum = inode.bi_inum, .snapshot = snapshot, })); -@@ -2834,22 +2767,32 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino +@@ -2834,22 +2808,32 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino bch2_trans_iter_exit(trans, &inode_iter); inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, SPOS(0, inode.bi_dir, snapshot), 0); @@ -13396,7 +14381,7 @@ index 75c8a97a6954..3917d75f3c98 100644 pr_err("%llu:%u", i->inum, i->snapshot); pr_err("%llu:%u", inode.bi_inum, snapshot); -@@ -2862,12 +2805,20 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino +@@ -2862,12 +2846,20 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino ret = reattach_inode(trans, &inode); bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); } @@ -13418,7 +14403,7 @@ index 75c8a97a6954..3917d75f3c98 100644 printbuf_exit(&buf); bch_err_fn(c, ret); return ret; -@@ -2879,24 +2830,20 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino +@@ -2879,24 +2871,20 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino */ int bch2_check_directory_structure(struct bch_fs *c) { @@ -13446,7 +14431,7 @@ index 75c8a97a6954..3917d75f3c98 100644 bch_err_fn(c, ret); return ret; -@@ -2994,7 +2941,9 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, +@@ -2994,7 +2982,9 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, /* Should never fail, checked by bch2_inode_invalid: */ struct bch_inode_unpacked u; @@ -13457,7 +14442,7 @@ index 75c8a97a6954..3917d75f3c98 100644 /* * Backpointer and directory structure checks are sufficient for -@@ -3072,7 +3021,9 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite +@@ -3072,7 +3062,9 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite if (!bkey_is_inode(k.k)) return 0; @@ -13468,7 +14453,7 @@ index 75c8a97a6954..3917d75f3c98 100644 if (S_ISDIR(u.bi_mode)) return 0; -@@ -3194,3 +3145,223 @@ int bch2_fix_reflink_p(struct bch_fs *c) +@@ -3194,3 +3186,223 @@ int bch2_fix_reflink_p(struct bch_fs *c) bch_err_fn(c, ret); return ret; } @@ -14270,7 +15255,7 @@ index f283051758d6..5353979117b0 100644 bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: bch2_logged_op_update(trans, 
&op->k_i) ?: diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c -index b3b934a87c6d..34a3569d085a 100644 +index b3b934a87c6d..6276f375dbc9 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -21,6 +21,7 @@ @@ -14281,22 +15266,330 @@ index b3b934a87c6d..34a3569d085a 100644 #include "subvolume.h" #include "trace.h" -@@ -231,11 +232,11 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, - update_opts.target = opts.foreground_target; +@@ -79,6 +80,7 @@ struct promote_op { + struct rhash_head hash; + struct bpos pos; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -- unsigned i = 0; ++ struct work_struct work; + struct data_update write; + struct bio_vec bi_inline_vecs[]; /* must be last */ + }; +@@ -90,16 +92,41 @@ static const struct rhashtable_params bch_promote_params = { + .automatic_shrinking = true, + }; + ++static inline bool have_io_error(struct bch_io_failures *failed) ++{ ++ return failed && failed->nr; ++} ++ ++static bool ptr_being_rewritten(struct bch_read_bio *orig, ++ unsigned dev, ++ unsigned flags) ++{ ++ if (!(flags & BCH_READ_data_update)) ++ return false; ++ ++ struct data_update *u = container_of(orig, struct data_update, rbio); ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k)); ++ unsigned i = 0; ++ bkey_for_each_ptr(ptrs, ptr) { ++ if (ptr->dev == dev && ++ u->data_opts.rewrite_ptrs & BIT(i)) ++ return true; ++ i++; ++ } ++ ++ return false; ++} ++ + static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, + struct bpos pos, + struct bch_io_opts opts, + unsigned flags, + struct bch_io_failures *failed) + { +- if (!failed) { ++ if (!have_io_error(failed)) { + BUG_ON(!opts.promote_target); + +- if (!(flags & BCH_READ_MAY_PROMOTE)) ++ if (!(flags & BCH_READ_may_promote)) + return -BCH_ERR_nopromote_may_not; + + if (bch2_bkey_has_target(c, k, opts.promote_target)) +@@ -119,98 +146,94 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, + return 0; + } + +-static void promote_free(struct bch_fs *c, struct promote_op *op) ++static noinline void promote_free(struct bch_read_bio *rbio) + { +- int ret; ++ struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); ++ struct bch_fs *c = rbio->c; ++ ++ int ret = rhashtable_remove_fast(&c->promote_table, &op->hash, ++ bch_promote_params); ++ BUG_ON(ret); + + bch2_data_update_exit(&op->write); + +- ret = rhashtable_remove_fast(&c->promote_table, &op->hash, +- bch_promote_params); +- BUG_ON(ret); + bch2_write_ref_put(c, BCH_WRITE_REF_promote); + kfree_rcu(op, rcu); + } + + static void promote_done(struct bch_write_op *wop) + { +- struct promote_op *op = +- container_of(wop, struct promote_op, write.op); +- struct bch_fs *c = op->write.op.c; ++ struct promote_op *op = container_of(wop, struct promote_op, write.op); ++ struct bch_fs *c = op->write.rbio.c; + +- bch2_time_stats_update(&c->times[BCH_TIME_data_promote], +- op->start_time); +- promote_free(c, op); ++ bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time); ++ promote_free(&op->write.rbio); + } + +-static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) ++static void promote_start_work(struct work_struct *work) + { +- struct bio *bio = &op->write.op.wbio.bio; ++ struct promote_op *op = container_of(work, struct promote_op, work); + +- trace_and_count(op->write.op.c, read_promote, &rbio->bio); ++ bch2_data_update_read_done(&op->write); ++} + +- /* we now own pages: */ +- BUG_ON(!rbio->bounce); +- BUG_ON(rbio->bio.bi_vcnt > 
bio->bi_max_vecs); ++static noinline void promote_start(struct bch_read_bio *rbio) ++{ ++ struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); + +- memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, +- sizeof(struct bio_vec) * rbio->bio.bi_vcnt); +- swap(bio->bi_vcnt, rbio->bio.bi_vcnt); ++ trace_and_count(op->write.op.c, read_promote, &rbio->bio); + +- bch2_data_update_read_done(&op->write, rbio->pick.crc); ++ INIT_WORK(&op->work, promote_start_work); ++ queue_work(rbio->c->write_ref_wq, &op->work); + } + +-static struct promote_op *__promote_alloc(struct btree_trans *trans, +- enum btree_id btree_id, +- struct bkey_s_c k, +- struct bpos pos, +- struct extent_ptr_decoded *pick, +- struct bch_io_opts opts, +- unsigned sectors, +- struct bch_read_bio **rbio, +- struct bch_io_failures *failed) ++static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, ++ enum btree_id btree_id, ++ struct bkey_s_c k, ++ struct bpos pos, ++ struct extent_ptr_decoded *pick, ++ unsigned sectors, ++ unsigned flags, ++ struct bch_read_bio *orig, ++ struct bch_io_failures *failed) + { + struct bch_fs *c = trans->c; +- struct promote_op *op = NULL; +- struct bio *bio; +- unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); + int ret; + +- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) +- return ERR_PTR(-BCH_ERR_nopromote_no_writes); ++ struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait }; + +- op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL); +- if (!op) { +- ret = -BCH_ERR_nopromote_enomem; +- goto err; +- } ++ if (!have_io_error(failed)) { ++ update_opts.target = orig->opts.promote_target; ++ update_opts.extra_replicas = 1; ++ update_opts.write_flags |= BCH_WRITE_cached; ++ update_opts.write_flags |= BCH_WRITE_only_specified_devs; ++ } else { ++ update_opts.target = orig->opts.foreground_target; + +- op->start_time = local_clock(); +- op->pos = pos; ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + unsigned ptr_bit = 1; - bkey_for_each_ptr(ptrs, ptr) { - if (bch2_dev_io_failures(failed, ptr->dev)) -- update_opts.rewrite_ptrs |= BIT(i); -- i++; ++ bkey_for_each_ptr(ptrs, ptr) { ++ if (bch2_dev_io_failures(failed, ptr->dev) && ++ !ptr_being_rewritten(orig, ptr->dev, flags)) + update_opts.rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; - } ++ } + +- /* +- * We don't use the mempool here because extents that aren't +- * checksummed or compressed can be too big for the mempool: +- */ +- *rbio = kzalloc(sizeof(struct bch_read_bio) + +- sizeof(struct bio_vec) * pages, +- GFP_KERNEL); +- if (!*rbio) { +- ret = -BCH_ERR_nopromote_enomem; +- goto err; ++ if (!update_opts.rewrite_ptrs) ++ return NULL; } -@@ -321,6 +322,20 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, +- rbio_init(&(*rbio)->bio, opts); +- bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); ++ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) ++ return ERR_PTR(-BCH_ERR_nopromote_no_writes); + +- if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) { ++ struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL); ++ if (!op) { + ret = -BCH_ERR_nopromote_enomem; +- goto err; ++ goto err_put; + } + +- (*rbio)->bounce = true; +- (*rbio)->split = true; +- (*rbio)->kmalloc = true; ++ op->start_time = local_clock(); ++ op->pos = pos; + + if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, + bch_promote_params)) { +@@ -218,64 +241,43 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, + goto err; + } + +- 
bio = &op->write.op.wbio.bio; +- bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); +- +- struct data_update_opts update_opts = {}; +- +- if (!failed) { +- update_opts.target = opts.promote_target; +- update_opts.extra_replicas = 1; +- update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED; +- } else { +- update_opts.target = opts.foreground_target; +- +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- unsigned i = 0; +- bkey_for_each_ptr(ptrs, ptr) { +- if (bch2_dev_io_failures(failed, ptr->dev)) +- update_opts.rewrite_ptrs |= BIT(i); +- i++; +- } +- } +- + ret = bch2_data_update_init(trans, NULL, NULL, &op->write, + writepoint_hashed((unsigned long) current), +- opts, ++ orig->opts, + update_opts, + btree_id, k); + /* + * possible errors: -BCH_ERR_nocow_lock_blocked, + * -BCH_ERR_ENOSPC_disk_reservation: + */ +- if (ret) { +- BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, +- bch_promote_params)); +- goto err; +- } ++ if (ret) ++ goto err_remove_hash; + ++ rbio_init_fragment(&op->write.rbio.bio, orig); ++ op->write.rbio.bounce = true; ++ op->write.rbio.promote = true; + op->write.op.end_io = promote_done; + +- return op; ++ return &op->write.rbio; ++err_remove_hash: ++ BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, ++ bch_promote_params)); + err: +- if (*rbio) +- bio_free_pages(&(*rbio)->bio); +- kfree(*rbio); +- *rbio = NULL; ++ bio_free_pages(&op->write.op.wbio.bio); + /* We may have added to the rhashtable and thus need rcu freeing: */ + kfree_rcu(op, rcu); ++err_put: + bch2_write_ref_put(c, BCH_WRITE_REF_promote); + return ERR_PTR(ret); + } + + noinline +-static struct promote_op *promote_alloc(struct btree_trans *trans, ++static struct bch_read_bio *promote_alloc(struct btree_trans *trans, + struct bvec_iter iter, + struct bkey_s_c k, + struct extent_ptr_decoded *pick, +- struct bch_io_opts opts, + unsigned flags, +- struct bch_read_bio **rbio, ++ struct bch_read_bio *orig, + bool *bounce, + bool *read_full, + struct bch_io_failures *failed) +@@ -285,7 +287,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, + * if failed != NULL we're not actually doing a promote, we're + * recovering from an io/checksum error + */ +- bool promote_full = (failed || ++ bool promote_full = (have_io_error(failed) || + *read_full || + READ_ONCE(c->opts.promote_whole_extents)); + /* data might have to be decompressed in the write path: */ +@@ -295,18 +297,21 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, + struct bpos pos = promote_full + ? bkey_start_pos(k.k) + : POS(k.k->p.inode, iter.bi_sector); +- struct promote_op *promote; + int ret; + +- ret = should_promote(c, k, pos, opts, flags, failed); ++ ret = should_promote(c, k, pos, orig->opts, flags, failed); + if (ret) + goto nopromote; + +- promote = __promote_alloc(trans, +- k.k->type == KEY_TYPE_reflink_v +- ? BTREE_ID_reflink +- : BTREE_ID_extents, +- k, pos, pick, opts, sectors, rbio, failed); ++ struct bch_read_bio *promote = ++ __promote_alloc(trans, ++ k.k->type == KEY_TYPE_reflink_v ++ ? 
BTREE_ID_reflink ++ : BTREE_ID_extents, ++ k, pos, pick, sectors, flags, orig, failed); ++ if (!promote) ++ return NULL; ++ + ret = PTR_ERR_OR_ZERO(promote); + if (ret) + goto nopromote; +@@ -321,6 +326,20 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, /* Read */ @@ -14317,7 +15610,159 @@ index b3b934a87c6d..34a3569d085a 100644 #define READ_RETRY_AVOID 1 #define READ_RETRY 2 #define READ_ERR 3 -@@ -499,6 +514,29 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, +@@ -355,20 +374,20 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) + { + BUG_ON(rbio->bounce && !rbio->split); + +- if (rbio->promote) +- promote_free(rbio->c, rbio->promote); +- rbio->promote = NULL; +- +- if (rbio->bounce) +- bch2_bio_free_pages_pool(rbio->c, &rbio->bio); +- + if (rbio->split) { + struct bch_read_bio *parent = rbio->parent; + +- if (rbio->kmalloc) +- kfree(rbio); +- else ++ if (unlikely(rbio->promote)) { ++ if (!rbio->bio.bi_status) ++ promote_start(rbio); ++ else ++ promote_free(rbio); ++ } else { ++ if (rbio->bounce) ++ bch2_bio_free_pages_pool(rbio->c, &rbio->bio); ++ + bio_put(&rbio->bio); ++ } + + rbio = parent; + } +@@ -388,61 +407,47 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) + bio_endio(&rbio->bio); + } + +-static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, ++static noinline void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, + struct bch_io_failures *failed, + unsigned flags) + { ++ struct data_update *u = container_of(rbio, struct data_update, rbio); + struct btree_trans *trans = bch2_trans_get(c); +- struct btree_iter iter; +- struct bkey_buf sk; +- struct bkey_s_c k; +- int ret; +- +- flags &= ~BCH_READ_LAST_FRAGMENT; +- flags |= BCH_READ_MUST_CLONE; +- +- bch2_bkey_buf_init(&sk); +- +- bch2_trans_iter_init(trans, &iter, rbio->data_btree, +- rbio->read_pos, BTREE_ITER_slots); + retry: + bch2_trans_begin(trans); +- rbio->bio.bi_status = 0; + +- ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = lockrestart_do(trans, ++ bkey_err(k = bch2_bkey_get_iter(trans, &iter, ++ u->btree_id, bkey_start_pos(&u->k.k->k), ++ 0))); + if (ret) + goto err; + +- bch2_bkey_buf_reassemble(&sk, c, k); +- k = bkey_i_to_s_c(sk.k); +- +- if (!bch2_bkey_matches_ptr(c, k, +- rbio->pick.ptr, +- rbio->data_pos.offset - +- rbio->pick.crc.offset)) { ++ if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) { + /* extent we wanted to read no longer exists: */ + rbio->hole = true; +- goto out; ++ goto err; + } + + ret = __bch2_read_extent(trans, rbio, bvec_iter, +- rbio->read_pos, +- rbio->data_btree, +- k, 0, failed, flags); ++ bkey_start_pos(&u->k.k->k), ++ u->btree_id, ++ bkey_i_to_s_c(u->k.k), ++ 0, failed, flags); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ + if (ret == READ_RETRY) + goto retry; + if (ret) +- goto err; +-out: ++ rbio->bio.bi_status = BLK_STS_IOERR; ++ ++ BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1); + bch2_rbio_done(rbio); +- bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); +- bch2_bkey_buf_exit(&sk, c); +- return; +-err: +- rbio->bio.bi_status = BLK_STS_IOERR; +- goto out; + } + + static void bch2_rbio_retry(struct work_struct *work) +@@ -463,21 +468,20 @@ static void bch2_rbio_retry(struct work_struct *work) + if (rbio->retry == READ_RETRY_AVOID) + bch2_mark_io_failure(&failed, &rbio->pick); + +- rbio->bio.bi_status = 0; ++ if 
(!rbio->split) ++ rbio->bio.bi_status = 0; + + rbio = bch2_rbio_free(rbio); + +- flags |= BCH_READ_IN_RETRY; +- flags &= ~BCH_READ_MAY_PROMOTE; ++ flags |= BCH_READ_in_retry; ++ flags &= ~BCH_READ_may_promote; ++ flags &= ~BCH_READ_last_fragment; ++ flags |= BCH_READ_must_clone; + +- if (flags & BCH_READ_NODECODE) { ++ if (flags & BCH_READ_data_update) + bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); +- } else { +- flags &= ~BCH_READ_LAST_FRAGMENT; +- flags |= BCH_READ_MUST_CLONE; +- ++ else + __bch2_read(c, rbio, iter, inum, &failed, flags); +- } + } + + static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, +@@ -485,7 +489,7 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, + { + rbio->retry = retry; + +- if (rbio->flags & BCH_READ_IN_RETRY) ++ if (rbio->flags & BCH_READ_in_retry) + return; + + if (retry == READ_ERR) { +@@ -499,6 +503,29 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, } } @@ -14347,7 +15792,7 @@ index b3b934a87c6d..34a3569d085a 100644 static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, struct bch_read_bio *rbio) { -@@ -562,6 +600,73 @@ static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) +@@ -562,6 +589,73 @@ static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) __bch2_rbio_narrow_crcs(trans, rbio)); } @@ -14421,7 +15866,92 @@ index b3b934a87c6d..34a3569d085a 100644 /* Inner part that may run in process context */ static void __bch2_read_endio(struct work_struct *work) { -@@ -668,33 +773,13 @@ static void __bch2_read_endio(struct work_struct *work) +@@ -602,32 +696,40 @@ static void __bch2_read_endio(struct work_struct *work) + if (unlikely(rbio->narrow_crcs)) + bch2_rbio_narrow_crcs(rbio); + +- if (rbio->flags & BCH_READ_NODECODE) +- goto nodecode; ++ if (likely(!(rbio->flags & BCH_READ_data_update))) { ++ /* Adjust crc to point to subset of data we want: */ ++ crc.offset += rbio->offset_into_extent; ++ crc.live_size = bvec_iter_sectors(rbio->bvec_iter); + +- /* Adjust crc to point to subset of data we want: */ +- crc.offset += rbio->offset_into_extent; +- crc.live_size = bvec_iter_sectors(rbio->bvec_iter); ++ if (crc_is_compressed(crc)) { ++ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ if (ret) ++ goto decrypt_err; + +- if (crc_is_compressed(crc)) { +- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); +- if (ret) +- goto decrypt_err; ++ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && ++ !c->opts.no_data_io) ++ goto decompression_err; ++ } else { ++ /* don't need to decrypt the entire bio: */ ++ nonce = nonce_add(nonce, crc.offset << 9); ++ bio_advance(src, crc.offset << 9); + +- if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && +- !c->opts.no_data_io) +- goto decompression_err; +- } else { +- /* don't need to decrypt the entire bio: */ +- nonce = nonce_add(nonce, crc.offset << 9); +- bio_advance(src, crc.offset << 9); ++ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); ++ src->bi_iter.bi_size = dst_iter.bi_size; + +- BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); +- src->bi_iter.bi_size = dst_iter.bi_size; ++ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ if (ret) ++ goto decrypt_err; + +- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); +- if (ret) +- goto decrypt_err; ++ if (rbio->bounce) { ++ struct bvec_iter src_iter = src->bi_iter; ++ ++ bio_copy_data_iter(dst, &dst_iter, src, &src_iter); ++ } ++ } ++ } else { ++ if (rbio->split) ++ rbio->parent->pick = rbio->pick; + + if (rbio->bounce) { + struct bvec_iter src_iter = 
src->bi_iter; +@@ -644,12 +746,9 @@ static void __bch2_read_endio(struct work_struct *work) + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; +- +- promote_start(rbio->promote, rbio); +- rbio->promote = NULL; + } +-nodecode: +- if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { ++ ++ if (likely(!(rbio->flags & BCH_READ_in_retry))) { + rbio = bch2_rbio_free(rbio); + bch2_rbio_done(rbio); + } +@@ -662,39 +761,19 @@ static void __bch2_read_endio(struct work_struct *work) + * reading into buffers owned by userspace (that userspace can + * scribble over) - retry the read, bouncing it this time: + */ +- if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { +- rbio->flags |= BCH_READ_MUST_BOUNCE; ++ if (!rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) { ++ rbio->flags |= BCH_READ_must_bounce; + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); goto out; } @@ -14458,7 +15988,7 @@ index b3b934a87c6d..34a3569d085a 100644 goto out; } -@@ -715,16 +800,8 @@ static void bch2_read_endio(struct bio *bio) +@@ -715,24 +794,16 @@ static void bch2_read_endio(struct bio *bio) if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; @@ -14477,7 +16007,17 @@ index b3b934a87c6d..34a3569d085a 100644 return; } -@@ -750,45 +827,6 @@ static void bch2_read_endio(struct bio *bio) +- if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || ++ if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) || + (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) { + trace_and_count(c, read_reuse_race, &rbio->bio); + +- if (rbio->flags & BCH_READ_RETRY_IF_STALE) ++ if (rbio->flags & BCH_READ_retry_if_stale) + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); + else + bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); +@@ -750,45 +821,6 @@ static void bch2_read_endio(struct bio *bio) bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); } @@ -14523,15 +16063,34 @@ index b3b934a87c6d..34a3569d085a 100644 static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c k, -@@ -868,15 +906,24 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, +@@ -845,7 +877,6 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + struct bch_fs *c = trans->c; + struct extent_ptr_decoded pick; + struct bch_read_bio *rbio = NULL; +- struct promote_op *promote = NULL; + bool bounce = false, read_full = false, narrow_crcs = false; + struct bpos data_pos = bkey_start_pos(k.k); + int pick_ret; +@@ -868,15 +899,24 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, if (!pick_ret) goto hole; - if (pick_ret < 0) { + if (unlikely(pick_ret < 0)) { - struct printbuf buf = PRINTBUF; ++ struct printbuf buf = PRINTBUF; + bch2_read_err_msg_trans(trans, &buf, orig, read_pos); + prt_printf(&buf, "no device to read from: %s\n ", bch2_err_str(pick_ret)); ++ bch2_bkey_val_to_text(&buf, c, k); ++ ++ bch_err_ratelimited(c, "%s", buf.buf); ++ printbuf_exit(&buf); ++ goto err; ++ } ++ ++ if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) { + struct printbuf buf = PRINTBUF; ++ bch2_read_err_msg_trans(trans, &buf, orig, read_pos); ++ prt_printf(&buf, "attempting to read encrypted data without encryption key\n "); bch2_bkey_val_to_text(&buf, c, k); - bch_err_inum_offset_ratelimited(c, @@ -14539,22 +16098,192 @@ index b3b934a87c6d..34a3569d085a 100644 - "no device to read from: %s\n %s", - bch2_err_str(pick_ret), - buf.buf); -+ bch_err_ratelimited(c, "%s", buf.buf); -+ 
printbuf_exit(&buf); -+ goto err; -+ } -+ -+ if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) { -+ struct printbuf buf = PRINTBUF; -+ bch2_read_err_msg_trans(trans, &buf, orig, read_pos); -+ prt_printf(&buf, "attempting to read encrypted data without encryption key\n "); -+ bch2_bkey_val_to_text(&buf, c, k); -+ + bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); goto err; } -@@ -1062,11 +1109,15 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, +@@ -889,7 +929,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + * retry path, don't check here, it'll be caught in bch2_read_endio() + * and we'll end up in the retry path: + */ +- if ((flags & BCH_READ_IN_RETRY) && ++ if ((flags & BCH_READ_in_retry) && + !pick.ptr.cached && + ca && + unlikely(dev_ptr_stale(ca, &pick.ptr))) { +@@ -903,48 +943,52 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + * Unlock the iterator while the btree node's lock is still in + * cache, before doing the IO: + */ +- bch2_trans_unlock(trans); ++ if (!(flags & BCH_READ_in_retry)) ++ bch2_trans_unlock(trans); ++ else ++ bch2_trans_unlock_long(trans); ++ ++ if (!(flags & BCH_READ_data_update)) { ++ if (!(flags & BCH_READ_last_fragment) || ++ bio_flagged(&orig->bio, BIO_CHAIN)) ++ flags |= BCH_READ_must_clone; ++ ++ narrow_crcs = !(flags & BCH_READ_in_retry) && ++ bch2_can_narrow_extent_crcs(k, pick.crc); ++ ++ if (narrow_crcs && (flags & BCH_READ_user_mapped)) ++ flags |= BCH_READ_must_bounce; + +- if (flags & BCH_READ_NODECODE) { ++ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); ++ ++ if (crc_is_compressed(pick.crc) || ++ (pick.crc.csum_type != BCH_CSUM_none && ++ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || ++ (bch2_csum_type_is_encryption(pick.crc.csum_type) && ++ (flags & BCH_READ_user_mapped)) || ++ (flags & BCH_READ_must_bounce)))) { ++ read_full = true; ++ bounce = true; ++ } ++ } else { ++ read_full = true; + /* + * can happen if we retry, and the extent we were going to read + * has been merged in the meantime: + */ +- if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) { ++ struct data_update *u = container_of(orig, struct data_update, rbio); ++ if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { + if (ca) + percpu_ref_put(&ca->io_ref); + goto hole; + } + + iter.bi_size = pick.crc.compressed_size << 9; +- goto get_bio; +- } +- +- if (!(flags & BCH_READ_LAST_FRAGMENT) || +- bio_flagged(&orig->bio, BIO_CHAIN)) +- flags |= BCH_READ_MUST_CLONE; +- +- narrow_crcs = !(flags & BCH_READ_IN_RETRY) && +- bch2_can_narrow_extent_crcs(k, pick.crc); +- +- if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) +- flags |= BCH_READ_MUST_BOUNCE; +- +- EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); +- +- if (crc_is_compressed(pick.crc) || +- (pick.crc.csum_type != BCH_CSUM_none && +- (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || +- (bch2_csum_type_is_encryption(pick.crc.csum_type) && +- (flags & BCH_READ_USER_MAPPED)) || +- (flags & BCH_READ_MUST_BOUNCE)))) { +- read_full = true; +- bounce = true; + } + +- if (orig->opts.promote_target)// || failed) +- promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, +- &rbio, &bounce, &read_full, failed); ++ if (orig->opts.promote_target || have_io_error(failed)) ++ rbio = promote_alloc(trans, iter, k, &pick, flags, orig, ++ &bounce, &read_full, failed); + + if (!read_full) { + 
EBUG_ON(crc_is_compressed(pick.crc)); +@@ -963,7 +1007,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + pick.crc.offset = 0; + pick.crc.live_size = bvec_iter_sectors(iter); + } +-get_bio: ++ + if (rbio) { + /* + * promote already allocated bounce rbio: +@@ -978,17 +1022,16 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + } else if (bounce) { + unsigned sectors = pick.crc.compressed_size; + +- rbio = rbio_init(bio_alloc_bioset(NULL, ++ rbio = rbio_init_fragment(bio_alloc_bioset(NULL, + DIV_ROUND_UP(sectors, PAGE_SECTORS), + 0, + GFP_NOFS, + &c->bio_read_split), +- orig->opts); ++ orig); + + bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); + rbio->bounce = true; +- rbio->split = true; +- } else if (flags & BCH_READ_MUST_CLONE) { ++ } else if (flags & BCH_READ_must_clone) { + /* + * Have to clone if there were any splits, due to error + * reporting issues (if a split errored, and retrying didn't +@@ -997,11 +1040,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + * from the whole bio, in which case we don't want to retry and + * lose the error) + */ +- rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, ++ rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, + &c->bio_read_split), +- orig->opts); ++ orig); + rbio->bio.bi_iter = iter; +- rbio->split = true; + } else { + rbio = orig; + rbio->bio.bi_iter = iter; +@@ -1010,11 +1052,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + + EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); + +- rbio->c = c; + rbio->submit_time = local_clock(); +- if (rbio->split) +- rbio->parent = orig; +- else ++ if (!rbio->split) + rbio->end_io = orig->bio.bi_end_io; + rbio->bvec_iter = iter; + rbio->offset_into_extent= offset_into_extent; +@@ -1024,20 +1063,14 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + rbio->hole = 0; + rbio->retry = 0; + rbio->context = 0; +- /* XXX: only initialize this if needed */ +- rbio->devs_have = bch2_bkey_devs(k); + rbio->pick = pick; + rbio->subvol = orig->subvol; + rbio->read_pos = read_pos; + rbio->data_btree = data_btree; + rbio->data_pos = data_pos; + rbio->version = k.k->bversion; +- rbio->promote = promote; + INIT_WORK(&rbio->work, NULL); + +- if (flags & BCH_READ_NODECODE) +- orig->pick = pick; +- + rbio->bio.bi_opf = orig->bio.bi_opf; + rbio->bio.bi_iter.bi_sector = pick.ptr.offset; + rbio->bio.bi_end_io = bch2_read_endio; +@@ -1052,21 +1085,25 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + * If it's being moved internally, we don't want to flag it as a cache + * hit: + */ +- if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE)) ++ if (ca && pick.ptr.cached && !(flags & BCH_READ_data_update)) + bch2_bucket_io_time_reset(trans, pick.ptr.dev, + PTR_BUCKET_NR(ca, &pick.ptr), READ); + +- if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { ++ if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) { + bio_inc_remaining(&orig->bio); + trace_and_count(c, read_split, &orig->bio); } if (!rbio->pick.idx) { @@ -14575,7 +16304,72 @@ index b3b934a87c6d..34a3569d085a 100644 bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); goto out; } -@@ -1164,7 +1215,6 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, +@@ -1076,10 +1113,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + bio_set_dev(&rbio->bio, ca->disk_sb.bdev); + + if 
(unlikely(c->opts.no_data_io)) { +- if (likely(!(flags & BCH_READ_IN_RETRY))) ++ if (likely(!(flags & BCH_READ_in_retry))) + bio_endio(&rbio->bio); + } else { +- if (likely(!(flags & BCH_READ_IN_RETRY))) ++ if (likely(!(flags & BCH_READ_in_retry))) + submit_bio(&rbio->bio); + else + submit_bio_wait(&rbio->bio); +@@ -1097,11 +1134,11 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + goto out; + } + +- if (likely(!(flags & BCH_READ_IN_RETRY))) ++ if (likely(!(flags & BCH_READ_in_retry))) + bio_endio(&rbio->bio); + } + out: +- if (likely(!(flags & BCH_READ_IN_RETRY))) { ++ if (likely(!(flags & BCH_READ_in_retry))) { + return 0; + } else { + int ret; +@@ -1124,7 +1161,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + } + + err: +- if (flags & BCH_READ_IN_RETRY) ++ if (flags & BCH_READ_in_retry) + return READ_ERR; + + orig->bio.bi_status = BLK_STS_IOERR; +@@ -1132,16 +1169,16 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + + hole: + /* +- * won't normally happen in the BCH_READ_NODECODE ++ * won't normally happen in the BCH_READ_data_update + * (bch2_move_extent()) path, but if we retry and the extent we wanted + * to read no longer exists we have to signal that: + */ +- if (flags & BCH_READ_NODECODE) ++ if (flags & BCH_READ_data_update) + orig->hole = true; + + zero_fill_bio_iter(&orig->bio, iter); + out_read_done: +- if (flags & BCH_READ_LAST_FRAGMENT) ++ if (flags & BCH_READ_last_fragment) + bch2_rbio_done(orig); + return 0; + } +@@ -1156,7 +1193,7 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + struct bkey_s_c k; + int ret; + +- BUG_ON(flags & BCH_READ_NODECODE); ++ BUG_ON(flags & BCH_READ_data_update); + + bch2_bkey_buf_init(&sk); + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, +@@ -1164,7 +1201,6 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, BTREE_ITER_slots); while (1) { @@ -14583,7 +16377,7 @@ index b3b934a87c6d..34a3569d085a 100644 enum btree_id data_btree = BTREE_ID_extents; bch2_trans_begin(trans); -@@ -1184,9 +1234,9 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, +@@ -1184,9 +1220,9 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, if (ret) goto err; @@ -14595,7 +16389,7 @@ index b3b934a87c6d..34a3569d085a 100644 bch2_bkey_buf_reassemble(&sk, c, k); -@@ -1201,9 +1251,9 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, +@@ -1201,13 +1237,13 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, * With indirect extents, the amount of data to read is the min * of the original extent and the indirect extent: */ @@ -14607,7 +16401,21 @@ index b3b934a87c6d..34a3569d085a 100644 swap(bvec_iter.bi_size, bytes); if (bvec_iter.bi_size == bytes) -@@ -1229,16 +1279,20 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, +- flags |= BCH_READ_LAST_FRAGMENT; ++ flags |= BCH_READ_last_fragment; + + ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos, + data_btree, k, +@@ -1215,7 +1251,7 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + if (ret) + goto err; + +- if (flags & BCH_READ_LAST_FRAGMENT) ++ if (flags & BCH_READ_last_fragment) + break; + + swap(bvec_iter.bi_size, bytes); +@@ -1229,16 +1265,20 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, } bch2_trans_iter_exit(trans, &iter); @@ -14634,7 +16442,7 @@ index b3b934a87c6d..34a3569d085a 100644 void bch2_fs_io_read_exit(struct bch_fs *c) diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h 
-index d9c18bb7d403..a82e8a94ccb6 100644 +index d9c18bb7d403..f54c9943e34a 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -3,6 +3,7 @@ @@ -14645,7 +16453,37 @@ index d9c18bb7d403..a82e8a94ccb6 100644 struct bch_read_bio { struct bch_fs *c; -@@ -79,19 +80,32 @@ struct bch_devs_mask; +@@ -34,9 +35,9 @@ struct bch_read_bio { + u16 flags; + union { + struct { +- u16 bounce:1, ++ u16 promote:1, ++ bounce:1, + split:1, +- kmalloc:1, + have_ioref:1, + narrow_crcs:1, + hole:1, +@@ -46,8 +47,6 @@ struct bch_read_bio { + u16 _state; + }; + +- struct bch_devs_list devs_have; +- + struct extent_ptr_decoded pick; + + /* +@@ -64,8 +63,6 @@ struct bch_read_bio { + struct bpos data_pos; + struct bversion version; + +- struct promote_op *promote; +- + struct bch_io_opts opts; + + struct work_struct work; +@@ -79,32 +76,54 @@ struct bch_devs_mask; struct cache_promote_op; struct extent_ptr_decoded; @@ -14684,9 +16522,90 @@ index d9c18bb7d403..a82e8a94ccb6 100644 + return 0; } ++#define BCH_READ_FLAGS() \ ++ x(retry_if_stale) \ ++ x(may_promote) \ ++ x(user_mapped) \ ++ x(data_update) \ ++ x(last_fragment) \ ++ x(must_bounce) \ ++ x(must_clone) \ ++ x(in_retry) ++ ++enum __bch_read_flags { ++#define x(n) __BCH_READ_##n, ++ BCH_READ_FLAGS() ++#undef x ++}; ++ enum bch_read_flags { +- BCH_READ_RETRY_IF_STALE = 1 << 0, +- BCH_READ_MAY_PROMOTE = 1 << 1, +- BCH_READ_USER_MAPPED = 1 << 2, +- BCH_READ_NODECODE = 1 << 3, +- BCH_READ_LAST_FRAGMENT = 1 << 4, +- +- /* internal: */ +- BCH_READ_MUST_BOUNCE = 1 << 5, +- BCH_READ_MUST_CLONE = 1 << 6, +- BCH_READ_IN_RETRY = 1 << 7, ++#define x(n) BCH_READ_##n = BIT(__BCH_READ_##n), ++ BCH_READ_FLAGS() ++#undef x + }; + + int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, +@@ -131,24 +150,39 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + + BUG_ON(rbio->_state); + +- rbio->c = c; +- rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; + + __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, +- BCH_READ_RETRY_IF_STALE| +- BCH_READ_MAY_PROMOTE| +- BCH_READ_USER_MAPPED); ++ BCH_READ_retry_if_stale| ++ BCH_READ_may_promote| ++ BCH_READ_user_mapped); + } + +-static inline struct bch_read_bio *rbio_init(struct bio *bio, +- struct bch_io_opts opts) ++static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, ++ struct bch_read_bio *orig) + { + struct bch_read_bio *rbio = to_rbio(bio); + ++ rbio->c = orig->c; + rbio->_state = 0; +- rbio->promote = NULL; +- rbio->opts = opts; ++ rbio->split = true; ++ rbio->parent = orig; ++ rbio->opts = orig->opts; ++ return rbio; ++} ++ ++static inline struct bch_read_bio *rbio_init(struct bio *bio, ++ struct bch_fs *c, ++ struct bch_io_opts opts, ++ bio_end_io_t end_io) ++{ ++ struct bch_read_bio *rbio = to_rbio(bio); ++ ++ rbio->start_time = local_clock(); ++ rbio->c = c; ++ rbio->_state = 0; ++ rbio->opts = opts; ++ rbio->bio.bi_end_io = end_io; + return rbio; + } + diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c -index 96720adcfee0..3e71860f66b9 100644 +index 96720adcfee0..92abc239599d 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -164,7 +164,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, @@ -14706,7 +16625,7 @@ index 96720adcfee0..3e71860f66b9 100644 BTREE_ITER_cached); int ret = bkey_err(k); if (unlikely(ret)) -@@ -369,7 +370,7 @@ static int bch2_write_index_default(struct bch_write_op *op) +@@ -369,11 +370,11 @@ static int bch2_write_index_default(struct bch_write_op *op) bkey_start_pos(&sk.k->k), 
BTREE_ITER_slots|BTREE_ITER_intent); @@ -14715,6 +16634,11 @@ index 96720adcfee0..3e71860f66b9 100644 bch2_extent_update(trans, inum, &iter, sk.k, &op->res, op->new_i_size, &op->i_sectors_delta, +- op->flags & BCH_WRITE_CHECK_ENOSPC); ++ op->flags & BCH_WRITE_check_enospc); + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -395,6 +396,21 @@ static int bch2_write_index_default(struct bch_write_op *op) /* Writes */ @@ -14726,10 +16650,10 @@ index 96720adcfee0..3e71860f66b9 100644 + (subvol_inum) { op->subvol, op->pos.inode, }, + offset << 9); + prt_printf(out, "write error%s: ", -+ op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); ++ op->flags & BCH_WRITE_move ? "(internal move)" : ""); +} + -+static void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) ++void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) +{ + __bch2_write_op_error(out, op, op->pos.offset); +} @@ -14737,6 +16661,33 @@ index 96720adcfee0..3e71860f66b9 100644 void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, enum bch_data_type type, const struct bkey_i *k, +@@ -467,7 +483,7 @@ static void bch2_write_done(struct closure *cl) + bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); + bch2_disk_reservation_put(c, &op->res); + +- if (!(op->flags & BCH_WRITE_MOVE)) ++ if (!(op->flags & BCH_WRITE_move)) + bch2_write_ref_put(c, BCH_WRITE_REF_write); + bch2_keylist_free(&op->insert_keys, op->inline_keys); + +@@ -513,7 +529,7 @@ static void __bch2_write_index(struct bch_write_op *op) + unsigned dev; + int ret = 0; + +- if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { ++ if (unlikely(op->flags & BCH_WRITE_io_error)) { + ret = bch2_write_drop_io_error_ptrs(op); + if (ret) + goto err; +@@ -522,7 +538,7 @@ static void __bch2_write_index(struct bch_write_op *op) + if (!bch2_keylist_empty(keys)) { + u64 sectors_start = keylist_sectors(keys); + +- ret = !(op->flags & BCH_WRITE_MOVE) ++ ret = !(op->flags & BCH_WRITE_move) + ? 
bch2_write_index_default(op) + : bch2_data_update_index_update(op); + @@ -531,14 +547,14 @@ static void __bch2_write_index(struct bch_write_op *op) op->written += sectors_start - keylist_sectors(keys); @@ -14758,7 +16709,27 @@ index 96720adcfee0..3e71860f66b9 100644 } if (ret) -@@ -621,9 +637,7 @@ void bch2_write_point_do_index_updates(struct work_struct *work) +@@ -554,7 +570,7 @@ static void __bch2_write_index(struct bch_write_op *op) + err: + keys->top = keys->keys; + op->error = ret; +- op->flags |= BCH_WRITE_SUBMITTED; ++ op->flags |= BCH_WRITE_submitted; + goto out; + } + +@@ -589,8 +605,8 @@ static CLOSURE_CALLBACK(bch2_write_index) + struct workqueue_struct *wq = index_update_wq(op); + unsigned long flags; + +- if ((op->flags & BCH_WRITE_SUBMITTED) && +- (op->flags & BCH_WRITE_MOVE)) ++ if ((op->flags & BCH_WRITE_submitted) && ++ (op->flags & BCH_WRITE_move)) + bch2_bio_free_pages_pool(op->c, &op->wbio.bio); + + spin_lock_irqsave(&wp->writes_lock, flags); +@@ -621,20 +637,18 @@ void bch2_write_point_do_index_updates(struct work_struct *work) while (1) { spin_lock_irq(&wp->writes_lock); @@ -14769,6 +16740,94 @@ index 96720adcfee0..3e71860f66b9 100644 wp_update_state(wp, op != NULL); spin_unlock_irq(&wp->writes_lock); + if (!op) + break; + +- op->flags |= BCH_WRITE_IN_WORKER; ++ op->flags |= BCH_WRITE_in_worker; + + __bch2_write_index(op); + +- if (!(op->flags & BCH_WRITE_SUBMITTED)) ++ if (!(op->flags & BCH_WRITE_submitted)) + __bch2_write(op); + else + bch2_write_done(&op->cl); +@@ -658,7 +672,7 @@ static void bch2_write_endio(struct bio *bio) + "data write error: %s", + bch2_blk_status_to_str(bio->bi_status))) { + set_bit(wbio->dev, op->failed.d); +- op->flags |= BCH_WRITE_IO_ERROR; ++ op->flags |= BCH_WRITE_io_error; + } + + if (wbio->nocow) { +@@ -705,7 +719,7 @@ static void init_append_extent(struct bch_write_op *op, + bch2_extent_crc_append(&e->k_i, crc); + + bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, +- op->flags & BCH_WRITE_CACHED); ++ op->flags & BCH_WRITE_cached); + + bch2_keylist_push(&op->insert_keys); + } +@@ -822,7 +836,7 @@ static enum prep_encoded_ret { + struct bch_fs *c = op->c; + struct bio *bio = &op->wbio.bio; + +- if (!(op->flags & BCH_WRITE_DATA_ENCODED)) ++ if (!(op->flags & BCH_WRITE_data_encoded)) + return PREP_ENCODED_OK; + + BUG_ON(bio_sectors(bio) != op->crc.compressed_size); +@@ -859,7 +873,7 @@ static enum prep_encoded_ret { + if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) + return PREP_ENCODED_CHECKSUM_ERR; + +- if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) ++ if (bch2_bio_uncompress_inplace(op, bio)) + return PREP_ENCODED_ERR; + } + +@@ -930,9 +944,9 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + if (ec_buf || + op->compression_opt || + (op->csum_type && +- !(op->flags & BCH_WRITE_PAGES_STABLE)) || ++ !(op->flags & BCH_WRITE_pages_stable)) || + (bch2_csum_type_is_encryption(op->csum_type) && +- !(op->flags & BCH_WRITE_PAGES_OWNED))) { ++ !(op->flags & BCH_WRITE_pages_owned))) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); +@@ -952,7 +966,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + break; + + BUG_ON(op->compression_opt && +- (op->flags & BCH_WRITE_DATA_ENCODED) && ++ (op->flags & BCH_WRITE_data_encoded) && + bch2_csum_type_is_encryption(op->crc.csum_type)); + BUG_ON(op->compression_opt && !bounce); + +@@ -990,7 +1004,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct 
write_point *wp, + } + } + +- if ((op->flags & BCH_WRITE_DATA_ENCODED) && ++ if ((op->flags & BCH_WRITE_data_encoded) && + !crc_is_compressed(crc) && + bch2_csum_type_is_encryption(op->crc.csum_type) == + bch2_csum_type_is_encryption(op->csum_type)) { +@@ -1022,7 +1036,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + crc.compression_type = compression_type; + crc.nonce = nonce; + } else { +- if ((op->flags & BCH_WRITE_DATA_ENCODED) && ++ if ((op->flags & BCH_WRITE_data_encoded) && + bch2_rechecksum_bio(c, src, version, op->crc, + NULL, &op->crc, + src_len >> 9, @@ -1080,11 +1094,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, *_dst = dst; return more; @@ -14815,7 +16874,64 @@ index 96720adcfee0..3e71860f66b9 100644 } if (ret) { -@@ -1339,17 +1356,19 @@ static void bch2_nocow_write(struct bch_write_op *op) +@@ -1193,9 +1210,9 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) + + static void __bch2_nocow_write_done(struct bch_write_op *op) + { +- if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { ++ if (unlikely(op->flags & BCH_WRITE_io_error)) { + op->error = -EIO; +- } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) ++ } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten)) + bch2_nocow_write_convert_unwritten(op); + } + +@@ -1224,7 +1241,7 @@ static void bch2_nocow_write(struct bch_write_op *op) + struct bucket_to_lock *stale_at; + int stale, ret; + +- if (op->flags & BCH_WRITE_MOVE) ++ if (op->flags & BCH_WRITE_move) + return; + + darray_init(&buckets); +@@ -1282,7 +1299,7 @@ static void bch2_nocow_write(struct bch_write_op *op) + }), GFP_KERNEL|__GFP_NOFAIL); + + if (ptr->unwritten) +- op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; ++ op->flags |= BCH_WRITE_convert_unwritten; + } + + /* Unlock before taking nocow locks, doing IO: */ +@@ -1290,7 +1307,7 @@ static void bch2_nocow_write(struct bch_write_op *op) + bch2_trans_unlock(trans); + + bch2_cut_front(op->pos, op->insert_keys.top); +- if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) ++ if (op->flags & BCH_WRITE_convert_unwritten) + bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); + + darray_for_each(buckets, i) { +@@ -1315,7 +1332,7 @@ static void bch2_nocow_write(struct bch_write_op *op) + wbio_init(bio)->put_bio = true; + bio->bi_opf = op->wbio.bio.bi_opf; + } else { +- op->flags |= BCH_WRITE_SUBMITTED; ++ op->flags |= BCH_WRITE_submitted; + } + + op->pos.offset += bio_sectors(bio); +@@ -1329,7 +1346,7 @@ static void bch2_nocow_write(struct bch_write_op *op) + op->insert_keys.top, true); + + bch2_keylist_push(&op->insert_keys); +- if (op->flags & BCH_WRITE_SUBMITTED) ++ if (op->flags & BCH_WRITE_submitted) + break; + bch2_btree_iter_advance(&iter); + } +@@ -1339,23 +1356,25 @@ static void bch2_nocow_write(struct bch_write_op *op) if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; @@ -14832,19 +16948,48 @@ index 96720adcfee0..3e71860f66b9 100644 + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); op->error = ret; - op->flags |= BCH_WRITE_SUBMITTED; +- op->flags |= BCH_WRITE_SUBMITTED; ++ op->flags |= BCH_WRITE_submitted; } - bch2_trans_put(trans); - darray_exit(&buckets); - /* fallback to cow write path? 
*/ - if (!(op->flags & BCH_WRITE_SUBMITTED)) { +- if (!(op->flags & BCH_WRITE_SUBMITTED)) { ++ if (!(op->flags & BCH_WRITE_submitted)) { closure_sync(&op->cl); -@@ -1462,14 +1481,14 @@ static void __bch2_write(struct bch_write_op *op) - if (ret <= 0) { - op->flags |= BCH_WRITE_SUBMITTED; + __bch2_nocow_write_done(op); + op->insert_keys.top = op->insert_keys.keys; +- } else if (op->flags & BCH_WRITE_SYNC) { ++ } else if (op->flags & BCH_WRITE_sync) { + closure_sync(&op->cl); + bch2_nocow_write_done(&op->cl.work); + } else { +@@ -1407,7 +1426,7 @@ static void __bch2_write(struct bch_write_op *op) + if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { + bch2_nocow_write(op); +- if (op->flags & BCH_WRITE_SUBMITTED) ++ if (op->flags & BCH_WRITE_submitted) + goto out_nofs_restore; + } + again: +@@ -1437,7 +1456,7 @@ static void __bch2_write(struct bch_write_op *op) + ret = bch2_trans_run(c, lockrestart_do(trans, + bch2_alloc_sectors_start_trans(trans, + op->target, +- op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), ++ op->opts.erasure_code && !(op->flags & BCH_WRITE_cached), + op->write_point, + &op->devs_have, + op->nr_replicas, +@@ -1460,16 +1479,16 @@ static void __bch2_write(struct bch_write_op *op) + bch2_alloc_sectors_done_inlined(c, wp); + err: + if (ret <= 0) { +- op->flags |= BCH_WRITE_SUBMITTED; +- - if (ret < 0) { - if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) - bch_err_inum_offset_ratelimited(c, @@ -14853,8 +16998,10 @@ index 96720adcfee0..3e71860f66b9 100644 - "%s(): %s error: %s", __func__, - op->flags & BCH_WRITE_MOVE ? "move" : "user", - bch2_err_str(ret)); ++ op->flags |= BCH_WRITE_submitted; ++ + if (unlikely(ret < 0)) { -+ if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) { ++ if (!(op->flags & BCH_WRITE_alloc_nowait)) { + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret)); @@ -14864,7 +17011,47 @@ index 96720adcfee0..3e71860f66b9 100644 op->error = ret; break; } -@@ -1595,12 +1614,11 @@ CLOSURE_CALLBACK(bch2_write) +@@ -1495,14 +1514,14 @@ static void __bch2_write(struct bch_write_op *op) + * synchronously here if we weren't able to submit all of the IO at + * once, as that signals backpressure to the caller. 
+ */ +- if ((op->flags & BCH_WRITE_SYNC) || +- (!(op->flags & BCH_WRITE_SUBMITTED) && +- !(op->flags & BCH_WRITE_IN_WORKER))) { ++ if ((op->flags & BCH_WRITE_sync) || ++ (!(op->flags & BCH_WRITE_submitted) && ++ !(op->flags & BCH_WRITE_in_worker))) { + bch2_wait_on_allocator(c, &op->cl); + + __bch2_write_index(op); + +- if (!(op->flags & BCH_WRITE_SUBMITTED)) ++ if (!(op->flags & BCH_WRITE_submitted)) + goto again; + bch2_write_done(&op->cl); + } else { +@@ -1523,8 +1542,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) + + memset(&op->failed, 0, sizeof(op->failed)); + +- op->flags |= BCH_WRITE_WROTE_DATA_INLINE; +- op->flags |= BCH_WRITE_SUBMITTED; ++ op->flags |= BCH_WRITE_wrote_data_inline; ++ op->flags |= BCH_WRITE_submitted; + + bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); + +@@ -1587,20 +1606,19 @@ CLOSURE_CALLBACK(bch2_write) + BUG_ON(!op->write_point.v); + BUG_ON(bkey_eq(op->pos, POS_MAX)); + +- if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) +- op->flags |= BCH_WRITE_ALLOC_NOWAIT; ++ if (op->flags & BCH_WRITE_only_specified_devs) ++ op->flags |= BCH_WRITE_alloc_nowait; + + op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas); + op->start_time = local_clock(); bch2_keylist_init(&op->insert_keys, op->inline_keys); wbio_init(bio)->put_bio = false; @@ -14882,11 +17069,92 @@ index 96720adcfee0..3e71860f66b9 100644 op->error = -EIO; goto err; } +@@ -1610,7 +1628,7 @@ CLOSURE_CALLBACK(bch2_write) + goto err; + } + +- if (!(op->flags & BCH_WRITE_MOVE) && ++ if (!(op->flags & BCH_WRITE_move) && + !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { + op->error = -BCH_ERR_erofs_no_writes; + goto err; +diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h +index 5400ce94ee57..02cca52be0bd 100644 +--- a/fs/bcachefs/io_write.h ++++ b/fs/bcachefs/io_write.h +@@ -20,22 +20,23 @@ static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw + void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, + enum bch_data_type, const struct bkey_i *, bool); + ++void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op); ++ + #define BCH_WRITE_FLAGS() \ +- x(ALLOC_NOWAIT) \ +- x(CACHED) \ +- x(DATA_ENCODED) \ +- x(PAGES_STABLE) \ +- x(PAGES_OWNED) \ +- x(ONLY_SPECIFIED_DEVS) \ +- x(WROTE_DATA_INLINE) \ +- x(FROM_INTERNAL) \ +- x(CHECK_ENOSPC) \ +- x(SYNC) \ +- x(MOVE) \ +- x(IN_WORKER) \ +- x(SUBMITTED) \ +- x(IO_ERROR) \ +- x(CONVERT_UNWRITTEN) ++ x(alloc_nowait) \ ++ x(cached) \ ++ x(data_encoded) \ ++ x(pages_stable) \ ++ x(pages_owned) \ ++ x(only_specified_devs) \ ++ x(wrote_data_inline) \ ++ x(check_enospc) \ ++ x(sync) \ ++ x(move) \ ++ x(in_worker) \ ++ x(submitted) \ ++ x(io_error) \ ++ x(convert_unwritten) + + enum __bch_write_flags { + #define x(f) __BCH_WRITE_##f, +diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h +index 6e878a6f2f0b..3ef6df9145ef 100644 +--- a/fs/bcachefs/io_write_types.h ++++ b/fs/bcachefs/io_write_types.h +@@ -64,7 +64,7 @@ struct bch_write_op { + struct bpos pos; + struct bversion version; + +- /* For BCH_WRITE_DATA_ENCODED: */ ++ /* For BCH_WRITE_data_encoded: */ + struct bch_extent_crc_unpacked crc; + + struct write_point_specifier write_point; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c -index 2dc0d60c1745..2cd20114b74b 100644 +index 2dc0d60c1745..cb2c3722f674 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c -@@ -217,6 +217,12 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq) +@@ 
-113,11 +113,10 @@ journal_seq_to_buf(struct journal *j, u64 seq) + + static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) + { +- unsigned i; +- +- for (i = 0; i < ARRAY_SIZE(p->list); i++) +- INIT_LIST_HEAD(&p->list[i]); +- INIT_LIST_HEAD(&p->flushed); ++ for (unsigned i = 0; i < ARRAY_SIZE(p->unflushed); i++) ++ INIT_LIST_HEAD(&p->unflushed[i]); ++ for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++) ++ INIT_LIST_HEAD(&p->flushed[i]); + atomic_set(&p->count, count); + p->devs.nr = 0; + } +@@ -217,6 +216,12 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq) if (__bch2_journal_pin_put(j, seq)) bch2_journal_reclaim_fast(j); bch2_journal_do_writes(j); @@ -14899,7 +17167,7 @@ index 2dc0d60c1745..2cd20114b74b 100644 } /* -@@ -251,6 +257,9 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t +@@ -251,6 +256,9 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t if (!__journal_entry_is_open(old)) return; @@ -14909,7 +17177,7 @@ index 2dc0d60c1745..2cd20114b74b 100644 /* Close out old buffer: */ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); -@@ -373,6 +382,10 @@ static int journal_entry_open(struct journal *j) +@@ -373,6 +381,10 @@ static int journal_entry_open(struct journal *j) if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) return JOURNAL_ERR_max_in_flight; @@ -14920,7 +17188,58 @@ index 2dc0d60c1745..2cd20114b74b 100644 BUG_ON(!j->cur_entry_sectors); buf->expires = -@@ -664,7 +677,7 @@ void bch2_journal_entry_res_resize(struct journal *j, +@@ -588,6 +600,16 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, + : -BCH_ERR_journal_res_get_blocked; + } + ++static unsigned max_dev_latency(struct bch_fs *c) ++{ ++ u64 nsecs = 0; ++ ++ for_each_rw_member(c, ca) ++ nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration); ++ ++ return nsecs_to_jiffies(nsecs); ++} ++ + /* + * Essentially the entry function to the journaling code. When bcachefs is doing + * a btree insert, it calls this function to get the current journal write. +@@ -599,17 +621,31 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, + * btree node write locks. + */ + int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, +- unsigned flags) ++ unsigned flags, ++ struct btree_trans *trans) + { + int ret; + + if (closure_wait_event_timeout(&j->async_wait, + (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + (flags & JOURNAL_RES_GET_NONBLOCK), +- HZ * 10)) ++ HZ)) + return ret; + ++ if (trans) ++ bch2_trans_unlock_long(trans); ++ + struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ int remaining_wait = max(max_dev_latency(c) * 2, HZ * 10); ++ ++ remaining_wait = max(0, remaining_wait - HZ); ++ ++ if (closure_wait_event_timeout(&j->async_wait, ++ (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || ++ (flags & JOURNAL_RES_GET_NONBLOCK), ++ remaining_wait)) ++ return ret; ++ + struct printbuf buf = PRINTBUF; + bch2_journal_debug_to_text(&buf, j); + bch_err(c, "Journal stuck? 
Waited for 10 seconds...\n%s", +@@ -664,7 +700,7 @@ void bch2_journal_entry_res_resize(struct journal *j, * @seq: seq to flush * @parent: closure object to wait with * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed, @@ -14929,7 +17248,7 @@ index 2dc0d60c1745..2cd20114b74b 100644 * * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if * necessary -@@ -687,7 +700,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, +@@ -687,7 +723,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, /* Recheck under lock: */ if (j->err_seq && seq >= j->err_seq) { @@ -14938,7 +17257,16 @@ index 2dc0d60c1745..2cd20114b74b 100644 goto out; } -@@ -794,10 +807,11 @@ int bch2_journal_flush(struct journal *j) +@@ -714,7 +750,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, + * livelock: + */ + sched_annotate_sleep(); +- ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); ++ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); + if (ret) + return ret; + +@@ -794,10 +830,11 @@ int bch2_journal_flush(struct journal *j) } /* @@ -14952,7 +17280,7 @@ index 2dc0d60c1745..2cd20114b74b 100644 { struct bch_fs *c = container_of(j, struct bch_fs, journal); u64 unwritten_seq; -@@ -806,15 +820,15 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq) +@@ -806,15 +843,15 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq) if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) return false; @@ -14971,7 +17299,7 @@ index 2dc0d60c1745..2cd20114b74b 100644 unwritten_seq++) { struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); -@@ -831,19 +845,14 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq) +@@ -831,19 +868,14 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq) return ret; } @@ -14986,7 +17314,7 @@ index 2dc0d60c1745..2cd20114b74b 100644 - - ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); + struct journal_res res = {}; -+ int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); ++ int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); if (ret) return ret; @@ -14995,7 +17323,7 @@ index 2dc0d60c1745..2cd20114b74b 100644 buf->must_flush = true; if (!buf->flush_time) { -@@ -856,27 +865,70 @@ int bch2_journal_meta(struct journal *j) +@@ -856,27 +888,70 @@ int bch2_journal_meta(struct journal *j) return bch2_journal_flush_seq(j, res.seq, TASK_UNINTERRUPTIBLE); } @@ -15069,7 +17397,7 @@ index 2dc0d60c1745..2cd20114b74b 100644 { struct journal_buf *ret = NULL; -@@ -893,13 +945,17 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou +@@ -893,13 +968,17 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou struct journal_buf *buf = j->buf + idx; if (buf->need_flush_to_write_buffer) { @@ -15091,7 +17419,7 @@ index 2dc0d60c1745..2cd20114b74b 100644 ? 
ERR_PTR(-EAGAIN) : buf; break; -@@ -912,11 +968,17 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou +@@ -912,11 +991,17 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou return ret; } @@ -15111,7 +17439,7 @@ index 2dc0d60c1745..2cd20114b74b 100644 return ret; } -@@ -945,19 +1007,17 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, +@@ -945,19 +1030,17 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, } for (nr_got = 0; nr_got < nr_want; nr_got++) { @@ -15130,18 +17458,18 @@ index 2dc0d60c1745..2cd20114b74b 100644 + enum bch_watermark watermark = new_fs + ? BCH_WATERMARK_btree + : BCH_WATERMARK_normal; - ++ + ob[nr_got] = bch2_bucket_alloc(c, ca, watermark, + BCH_DATA_journal, cl); + ret = PTR_ERR_OR_ZERO(ob[nr_got]); + if (ret) + break; -+ + + if (!new_fs) { ret = bch2_trans_run(c, bch2_trans_mark_metadata_bucket(trans, ca, ob[nr_got]->bucket, BCH_DATA_journal, -@@ -967,9 +1027,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, +@@ -967,9 +1050,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, bch_err_msg(c, ret, "marking new journal buckets"); break; } @@ -15153,7 +17481,7 @@ index 2dc0d60c1745..2cd20114b74b 100644 } if (!nr_got) -@@ -1009,8 +1069,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, +@@ -1009,8 +1092,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (ret) goto err_unblock; @@ -15163,7 +17491,7 @@ index 2dc0d60c1745..2cd20114b74b 100644 /* Commit: */ if (c) -@@ -1044,9 +1103,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, +@@ -1044,9 +1126,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, bu[i], BCH_DATA_free, 0, BTREE_TRIGGER_transactional)); err_free: @@ -15175,7 +17503,7 @@ index 2dc0d60c1745..2cd20114b74b 100644 kfree(new_bucket_seq); kfree(new_buckets); -@@ -1193,7 +1251,7 @@ void bch2_fs_journal_stop(struct journal *j) +@@ -1193,7 +1274,7 @@ void bch2_fs_journal_stop(struct journal *j) * Always write a new journal entry, to make sure the clock hands are up * to date (and match the superblock) */ @@ -15184,7 +17512,7 @@ index 2dc0d60c1745..2cd20114b74b 100644 journal_quiesce(j); cancel_delayed_work_sync(&j->write_work); -@@ -1217,6 +1275,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) +@@ -1217,6 +1298,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) bool had_entries = false; u64 last_seq = cur_seq, nr, seq; @@ -15196,7 +17524,7 @@ index 2dc0d60c1745..2cd20114b74b 100644 genradix_for_each_reverse(&c->journal_entries, iter, _i) { i = *_i; -@@ -1474,6 +1537,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) +@@ -1474,6 +1560,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) case JOURNAL_ENTRY_CLOSED_VAL: prt_printf(out, "closed\n"); break; @@ -15206,7 +17534,7 @@ index 2dc0d60c1745..2cd20114b74b 100644 default: prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); break; -@@ -1499,6 +1565,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) +@@ -1499,6 +1588,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) printbuf_indent_sub(out, 2); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { @@ -15216,7 +17544,7 @@ index 2dc0d60c1745..2cd20114b74b 100644 struct journal_device *ja = &ca->journal; if 
(!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d)) -@@ -1508,6 +1577,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) +@@ -1508,6 +1600,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) continue; prt_printf(out, "dev %u:\n", ca->dev_idx); @@ -15224,7 +17552,7 @@ index 2dc0d60c1745..2cd20114b74b 100644 printbuf_indent_add(out, 2); prt_printf(out, "nr\t%u\n", ja->nr); prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size); -@@ -1519,6 +1589,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) +@@ -1519,6 +1612,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) printbuf_indent_sub(out, 2); } @@ -15233,8 +17561,63 @@ index 2dc0d60c1745..2cd20114b74b 100644 rcu_read_unlock(); --out->atomic; +@@ -1530,54 +1625,3 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + __bch2_journal_debug_to_text(out, j); + spin_unlock(&j->lock); + } +- +-bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) +-{ +- struct journal_entry_pin_list *pin_list; +- struct journal_entry_pin *pin; +- +- spin_lock(&j->lock); +- if (!test_bit(JOURNAL_running, &j->flags)) { +- spin_unlock(&j->lock); +- return true; +- } +- +- *seq = max(*seq, j->pin.front); +- +- if (*seq >= j->pin.back) { +- spin_unlock(&j->lock); +- return true; +- } +- +- out->atomic++; +- +- pin_list = journal_seq_pin(j, *seq); +- +- prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count)); +- printbuf_indent_add(out, 2); +- +- for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++) +- list_for_each_entry(pin, &pin_list->list[i], list) +- prt_printf(out, "\t%px %ps\n", pin, pin->flush); +- +- if (!list_empty(&pin_list->flushed)) +- prt_printf(out, "flushed:\n"); +- +- list_for_each_entry(pin, &pin_list->flushed, list) +- prt_printf(out, "\t%px %ps\n", pin, pin->flush); +- +- printbuf_indent_sub(out, 2); +- +- --out->atomic; +- spin_unlock(&j->lock); +- +- return false; +-} +- +-void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) +-{ +- u64 seq = 0; +- +- while (!bch2_journal_seq_pins_to_text(out, j, &seq)) +- seq++; +-} diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h -index 2762be6f9814..cb0df0663946 100644 +index 2762be6f9814..dccddd5420ad 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -285,7 +285,8 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq @@ -15247,7 +17630,35 @@ index 2762be6f9814..cb0df0663946 100644 } /* -@@ -403,7 +404,7 @@ void bch2_journal_flush_async(struct journal *, struct closure *); +@@ -311,7 +312,7 @@ static inline void bch2_journal_res_put(struct journal *j, + } + + int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, +- unsigned); ++ unsigned, struct btree_trans *); + + /* First bits for BCH_WATERMARK: */ + enum journal_res_flags { +@@ -367,7 +368,8 @@ static inline int journal_res_get_fast(struct journal *j, + } + + static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, +- unsigned u64s, unsigned flags) ++ unsigned u64s, unsigned flags, ++ struct btree_trans *trans) + { + int ret; + +@@ -379,7 +381,7 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re + if (journal_res_get_fast(j, res, flags)) + goto out; + +- ret = bch2_journal_res_get_slowpath(j, res, flags); ++ ret = bch2_journal_res_get_slowpath(j, res, flags, trans); + if (ret) + return ret; + out: +@@ -403,7 +405,7 
@@ void bch2_journal_flush_async(struct journal *, struct closure *); int bch2_journal_flush_seq(struct journal *, u64, unsigned); int bch2_journal_flush(struct journal *); @@ -15256,7 +17667,7 @@ index 2762be6f9814..cb0df0663946 100644 int bch2_journal_meta(struct journal *); void bch2_journal_halt(struct journal *); -@@ -411,7 +412,7 @@ void bch2_journal_halt(struct journal *); +@@ -411,7 +413,7 @@ void bch2_journal_halt(struct journal *); static inline int bch2_journal_error(struct journal *j) { return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL @@ -15265,7 +17676,7 @@ index 2762be6f9814..cb0df0663946 100644 } struct bch_dev; -@@ -424,7 +425,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j) +@@ -424,12 +426,10 @@ static inline void bch2_journal_set_replay_done(struct journal *j) void bch2_journal_unblock(struct journal *); void bch2_journal_block(struct journal *); @@ -15274,8 +17685,13 @@ index 2762be6f9814..cb0df0663946 100644 void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); +-void bch2_journal_pins_to_text(struct printbuf *, struct journal *); +-bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); + + int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, + unsigned nr); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c -index fb35dd336331..e1773ac27824 100644 +index fb35dd336331..b89d77717de4 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -17,6 +17,8 @@ @@ -15621,19 +18037,7 @@ index fb35dd336331..e1773ac27824 100644 if (bytes > (sectors_read << 9) && sectors_read < bucket_sectors_left) return JOURNAL_ENTRY_REREAD; -@@ -1096,8 +1114,10 @@ static int journal_read_bucket(struct bch_dev *ca, - (printbuf_reset(&err), - prt_str(&err, "journal "), - bch2_csum_err_msg(&err, csum_type, j->csum, csum), -- err.buf))) -+ err.buf))) { - saw_bad = true; -+ bch2_fatal_error(c); -+ } - - ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), - j->encrypted_start, -@@ -1231,8 +1251,6 @@ int bch2_journal_read(struct bch_fs *c, +@@ -1231,8 +1249,6 @@ int bch2_journal_read(struct bch_fs *c, * those entries will be blacklisted: */ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { @@ -15642,7 +18046,7 @@ index fb35dd336331..e1773ac27824 100644 i = *_i; if (journal_replay_ignore(i)) -@@ -1252,6 +1270,10 @@ int bch2_journal_read(struct bch_fs *c, +@@ -1252,6 +1268,10 @@ int bch2_journal_read(struct bch_fs *c, continue; } @@ -15653,7 +18057,7 @@ index fb35dd336331..e1773ac27824 100644 if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), c, le32_to_cpu(i->j.version), &i->j, NULL, jset_last_seq_newer_than_seq, -@@ -1411,27 +1433,50 @@ int bch2_journal_read(struct bch_fs *c, +@@ -1411,27 +1431,50 @@ int bch2_journal_read(struct bch_fs *c, /* journal write: */ @@ -15714,7 +18118,7 @@ index fb35dd336331..e1773ac27824 100644 /* * Check that we can use this device, and aren't already using -@@ -1477,65 +1522,53 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) +@@ -1477,65 +1520,53 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_devs_mask devs; @@ -15802,7 +18206,15 @@ index fb35dd336331..e1773ac27824 100644 } static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) -@@ -2023,19 +2056,21 @@ 
CLOSURE_CALLBACK(bch2_journal_write) +@@ -1732,6 +1763,7 @@ static CLOSURE_CALLBACK(journal_write_submit) + bio->bi_iter.bi_sector = ptr->offset; + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; ++ bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0); + + BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); + ca->prev_journal_sector = bio->bi_iter.bi_sector; +@@ -2023,19 +2055,21 @@ CLOSURE_CALLBACK(bch2_journal_write) bch2_journal_do_discards(j); } @@ -15841,7 +18253,7 @@ index 2ca9cde30ea8..12b39fcb4424 100644 struct jset_entry *); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c -index ace291f175dd..3c8242606da7 100644 +index ace291f175dd..6a9cefb635d6 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -38,6 +38,9 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, @@ -15885,7 +18297,110 @@ index ace291f175dd..3c8242606da7 100644 } void bch2_journal_space_available(struct journal *j) -@@ -758,10 +767,12 @@ static int bch2_journal_reclaim_thread(void *arg) +@@ -318,8 +327,10 @@ void bch2_journal_reclaim_fast(struct journal *j) + popped = true; + } + +- if (popped) ++ if (popped) { + bch2_journal_space_available(j); ++ __closure_wake_up(&j->reclaim_flush_wait); ++ } + } + + bool __bch2_journal_pin_put(struct journal *j, u64 seq) +@@ -353,6 +364,9 @@ static inline bool __journal_pin_drop(struct journal *j, + pin->seq = 0; + list_del_init(&pin->list); + ++ if (j->reclaim_flush_wait.list.first) ++ __closure_wake_up(&j->reclaim_flush_wait); ++ + /* + * Unpinning a journal entry may make journal_next_bucket() succeed, if + * writing a new last_seq will now make another bucket available: +@@ -374,11 +388,11 @@ static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) + { + if (fn == bch2_btree_node_flush0 || + fn == bch2_btree_node_flush1) +- return JOURNAL_PIN_btree; ++ return JOURNAL_PIN_TYPE_btree; + else if (fn == bch2_btree_key_cache_journal_flush) +- return JOURNAL_PIN_key_cache; ++ return JOURNAL_PIN_TYPE_key_cache; + else +- return JOURNAL_PIN_other; ++ return JOURNAL_PIN_TYPE_other; + } + + static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq, +@@ -397,7 +411,12 @@ static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq, + atomic_inc(&pin_list->count); + pin->seq = seq; + pin->flush = flush_fn; +- list_add(&pin->list, &pin_list->list[type]); ++ ++ if (list_empty(&pin_list->unflushed[type]) && ++ j->reclaim_flush_wait.list.first) ++ __closure_wake_up(&j->reclaim_flush_wait); ++ ++ list_add(&pin->list, &pin_list->unflushed[type]); + } + + void bch2_journal_pin_copy(struct journal *j, +@@ -490,16 +509,15 @@ journal_get_next_pin(struct journal *j, + { + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *ret = NULL; +- unsigned i; + + fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) { + if (*seq > seq_to_flush && !allowed_above_seq) + break; + +- for (i = 0; i < JOURNAL_PIN_NR; i++) +- if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) || +- ((1U << i) & allowed_above_seq)) { +- ret = list_first_entry_or_null(&pin_list->list[i], ++ for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) ++ if (((BIT(i) & allowed_below_seq) && *seq <= seq_to_flush) || ++ (BIT(i) & allowed_above_seq)) { ++ ret = list_first_entry_or_null(&pin_list->unflushed[i], + struct journal_entry_pin, list); + if (ret) + return ret; +@@ -535,8 +553,8 @@ static size_t journal_flush_pins(struct journal *j, + } + + if (min_key_cache) { +- allowed_above |= 1U 
<< JOURNAL_PIN_key_cache; +- allowed_below |= 1U << JOURNAL_PIN_key_cache; ++ allowed_above |= BIT(JOURNAL_PIN_TYPE_key_cache); ++ allowed_below |= BIT(JOURNAL_PIN_TYPE_key_cache); + } + + cond_resched(); +@@ -544,7 +562,9 @@ static size_t journal_flush_pins(struct journal *j, + j->last_flushed = jiffies; + + spin_lock(&j->lock); +- pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq); ++ pin = journal_get_next_pin(j, seq_to_flush, ++ allowed_below, ++ allowed_above, &seq); + if (pin) { + BUG_ON(j->flush_in_progress); + j->flush_in_progress = pin; +@@ -567,7 +587,7 @@ static size_t journal_flush_pins(struct journal *j, + spin_lock(&j->lock); + /* Pin might have been dropped or rearmed: */ + if (likely(!err && !j->flush_in_progress_dropped)) +- list_move(&pin->list, &journal_seq_pin(j, seq)->flushed); ++ list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(flush_fn)]); + j->flush_in_progress = NULL; + j->flush_in_progress_dropped = false; + spin_unlock(&j->lock); +@@ -758,10 +778,12 @@ static int bch2_journal_reclaim_thread(void *arg) journal_empty = fifo_empty(&j->pin); spin_unlock(&j->lock); @@ -15900,8 +18415,159 @@ index ace291f175dd..3c8242606da7 100644 else break; } +@@ -805,10 +827,41 @@ int bch2_journal_reclaim_start(struct journal *j) + return 0; + } + ++static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush, ++ unsigned types) ++{ ++ struct journal_entry_pin_list *pin_list; ++ u64 seq; ++ ++ spin_lock(&j->lock); ++ fifo_for_each_entry_ptr(pin_list, &j->pin, seq) { ++ if (seq > seq_to_flush) ++ break; ++ ++ for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) ++ if ((BIT(i) & types) && ++ (!list_empty(&pin_list->unflushed[i]) || ++ !list_empty(&pin_list->flushed[i]))) { ++ spin_unlock(&j->lock); ++ return true; ++ } ++ } ++ spin_unlock(&j->lock); ++ ++ return false; ++} ++ ++static bool journal_flush_pins_or_still_flushing(struct journal *j, u64 seq_to_flush, ++ unsigned types) ++{ ++ return journal_flush_pins(j, seq_to_flush, types, 0, 0, 0) || ++ journal_pins_still_flushing(j, seq_to_flush, types); ++} ++ + static int journal_flush_done(struct journal *j, u64 seq_to_flush, + bool *did_work) + { +- int ret; ++ int ret = 0; + + ret = bch2_journal_error(j); + if (ret) +@@ -816,12 +869,18 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, + + mutex_lock(&j->reclaim_lock); + +- if (journal_flush_pins(j, seq_to_flush, +- (1U << JOURNAL_PIN_key_cache)| +- (1U << JOURNAL_PIN_other), 0, 0, 0) || +- journal_flush_pins(j, seq_to_flush, +- (1U << JOURNAL_PIN_btree), 0, 0, 0)) ++ if (journal_flush_pins_or_still_flushing(j, seq_to_flush, ++ BIT(JOURNAL_PIN_TYPE_key_cache)| ++ BIT(JOURNAL_PIN_TYPE_other))) { + *did_work = true; ++ goto unlock; ++ } ++ ++ if (journal_flush_pins_or_still_flushing(j, seq_to_flush, ++ BIT(JOURNAL_PIN_TYPE_btree))) { ++ *did_work = true; ++ goto unlock; ++ } + + if (seq_to_flush > journal_cur_seq(j)) + bch2_journal_entry_close(j); +@@ -836,6 +895,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, + !fifo_used(&j->pin); + + spin_unlock(&j->lock); ++unlock: + mutex_unlock(&j->reclaim_lock); + + return ret; +@@ -849,7 +909,7 @@ bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) + if (!test_bit(JOURNAL_running, &j->flags)) + return false; + +- closure_wait_event(&j->async_wait, ++ closure_wait_event(&j->reclaim_flush_wait, + journal_flush_done(j, seq_to_flush, &did_work)); + + return did_work; +@@ -915,3 +975,54 @@ int 
bch2_journal_flush_device_pins(struct journal *j, int dev_idx) + + return ret; + } ++ ++bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) ++{ ++ struct journal_entry_pin_list *pin_list; ++ struct journal_entry_pin *pin; ++ ++ spin_lock(&j->lock); ++ if (!test_bit(JOURNAL_running, &j->flags)) { ++ spin_unlock(&j->lock); ++ return true; ++ } ++ ++ *seq = max(*seq, j->pin.front); ++ ++ if (*seq >= j->pin.back) { ++ spin_unlock(&j->lock); ++ return true; ++ } ++ ++ out->atomic++; ++ ++ pin_list = journal_seq_pin(j, *seq); ++ ++ prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count)); ++ printbuf_indent_add(out, 2); ++ ++ prt_printf(out, "unflushed:\n"); ++ for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++) ++ list_for_each_entry(pin, &pin_list->unflushed[i], list) ++ prt_printf(out, "\t%px %ps\n", pin, pin->flush); ++ ++ prt_printf(out, "flushed:\n"); ++ for (unsigned i = 0; i < ARRAY_SIZE(pin_list->flushed); i++) ++ list_for_each_entry(pin, &pin_list->flushed[i], list) ++ prt_printf(out, "\t%px %ps\n", pin, pin->flush); ++ ++ printbuf_indent_sub(out, 2); ++ ++ --out->atomic; ++ spin_unlock(&j->lock); ++ ++ return false; ++} ++ ++void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) ++{ ++ u64 seq = 0; ++ ++ while (!bch2_journal_seq_pins_to_text(out, j, &seq)) ++ seq++; ++} +diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h +index ec84c3345281..0a73d7134e1c 100644 +--- a/fs/bcachefs/journal_reclaim.h ++++ b/fs/bcachefs/journal_reclaim.h +@@ -78,4 +78,7 @@ static inline bool bch2_journal_flush_all_pins(struct journal *j) + + int bch2_journal_flush_device_pins(struct journal *, int); + ++void bch2_journal_pins_to_text(struct printbuf *, struct journal *); ++bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); ++ + #endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h -index 19183fcf7ad7..e9bd716fbb71 100644 +index 19183fcf7ad7..3ba433a48eb8 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -9,6 +9,9 @@ @@ -15914,6 +18580,28 @@ index 19183fcf7ad7..e9bd716fbb71 100644 #define JOURNAL_BUF_BITS 2 #define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) #define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) +@@ -50,15 +53,15 @@ struct journal_buf { + */ + + enum journal_pin_type { +- JOURNAL_PIN_btree, +- JOURNAL_PIN_key_cache, +- JOURNAL_PIN_other, +- JOURNAL_PIN_NR, ++ JOURNAL_PIN_TYPE_btree, ++ JOURNAL_PIN_TYPE_key_cache, ++ JOURNAL_PIN_TYPE_other, ++ JOURNAL_PIN_TYPE_NR, + }; + + struct journal_entry_pin_list { +- struct list_head list[JOURNAL_PIN_NR]; +- struct list_head flushed; ++ struct list_head unflushed[JOURNAL_PIN_TYPE_NR]; ++ struct list_head flushed[JOURNAL_PIN_TYPE_NR]; + atomic_t count; + struct bch_devs_list devs; + }; @@ -112,6 +115,7 @@ union journal_res_state { */ #define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) @@ -15930,6 +18618,14 @@ index 19183fcf7ad7..e9bd716fbb71 100644 unsigned buf_size_want; /* +@@ -221,6 +226,7 @@ struct journal { + /* Used when waiting because the journal was full */ + wait_queue_head_t wait; + struct closure_waitlist async_wait; ++ struct closure_waitlist reclaim_flush_wait; + + struct delayed_work write_work; + struct workqueue_struct *wq; diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c index 60e00702d1a4..75f27ec26f85 100644 --- a/fs/bcachefs/logged_ops.c @@ -16011,7 +18707,7 @@ index e6a7d8241bb8..f31a6cf1514c 100644 void 
bch2_lru_pos_to_text(struct printbuf *, struct bpos); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c -index 0ef4a86850bb..c493ea625553 100644 +index 0ef4a86850bb..ff787d3d50d2 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -21,6 +21,8 @@ @@ -16023,7 +18719,61 @@ index 0ef4a86850bb..c493ea625553 100644 #include "replicas.h" #include "snapshot.h" #include "super-io.h" -@@ -196,6 +198,13 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt) +@@ -72,11 +74,7 @@ struct moving_io { + unsigned read_sectors; + unsigned write_sectors; + +- struct bch_read_bio rbio; +- + struct data_update write; +- /* Must be last since it is variable size */ +- struct bio_vec bi_inline_vecs[]; + }; + + static void move_free(struct moving_io *io) +@@ -86,13 +84,12 @@ static void move_free(struct moving_io *io) + if (io->b) + atomic_dec(&io->b->count); + +- bch2_data_update_exit(&io->write); +- + mutex_lock(&ctxt->lock); + list_del(&io->io_list); + wake_up(&ctxt->wait); + mutex_unlock(&ctxt->lock); + ++ bch2_data_update_exit(&io->write); + kfree(io); + } + +@@ -112,7 +109,7 @@ static void move_write_done(struct bch_write_op *op) + + static void move_write(struct moving_io *io) + { +- if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { ++ if (unlikely(io->write.rbio.bio.bi_status || io->write.rbio.hole)) { + move_free(io); + return; + } +@@ -130,7 +127,7 @@ static void move_write(struct moving_io *io) + atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); + atomic_inc(&io->write.ctxt->write_ios); + +- bch2_data_update_read_done(&io->write, io->rbio.pick.crc); ++ bch2_data_update_read_done(&io->write); + } + + struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt) +@@ -143,7 +140,7 @@ struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctx + + static void move_read_endio(struct bio *bio) + { +- struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); ++ struct moving_io *io = container_of(bio, struct moving_io, write.rbio.bio); + struct moving_context *ctxt = io->write.ctxt; + + atomic_sub(io->read_sectors, &ctxt->read_sectors); +@@ -196,6 +193,13 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt) list_del(&ctxt->list); mutex_unlock(&c->moving_context_lock); @@ -16037,7 +18787,91 @@ index 0ef4a86850bb..c493ea625553 100644 bch2_trans_put(ctxt->trans); memset(ctxt, 0, sizeof(*ctxt)); } -@@ -379,34 +388,42 @@ int bch2_move_extent(struct moving_context *ctxt, +@@ -249,11 +253,6 @@ int bch2_move_extent(struct moving_context *ctxt, + { + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- struct moving_io *io; +- const union bch_extent_entry *entry; +- struct extent_ptr_decoded p; +- unsigned sectors = k.k->size, pages; + int ret = -ENOMEM; + + trace_move_extent2(c, k, &io_opts, &data_opts); +@@ -276,13 +275,7 @@ int bch2_move_extent(struct moving_context *ctxt, + */ + bch2_trans_unlock(trans); + +- /* write path might have to decompress data: */ +- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) +- sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); +- +- pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); +- io = kzalloc(sizeof(struct moving_io) + +- sizeof(struct bio_vec) * pages, GFP_KERNEL); ++ struct moving_io *io = kzalloc(sizeof(struct moving_io), GFP_KERNEL); + if (!io) + goto err; + +@@ -291,29 +284,13 @@ int bch2_move_extent(struct moving_context *ctxt, + io->read_sectors = k.k->size; + io->write_sectors = k.k->size; + +- 
bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0); +- bio_set_prio(&io->write.op.wbio.bio, +- IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); +- +- if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, +- GFP_KERNEL)) +- goto err_free; +- +- io->rbio.c = c; +- io->rbio.opts = io_opts; +- bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0); +- io->rbio.bio.bi_vcnt = pages; +- bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); +- io->rbio.bio.bi_iter.bi_size = sectors << 9; +- +- io->rbio.bio.bi_opf = REQ_OP_READ; +- io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); +- io->rbio.bio.bi_end_io = move_read_endio; +- + ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, + io_opts, data_opts, iter->btree_id, k); + if (ret) +- goto err_free_pages; ++ goto err_free; ++ ++ io->write.rbio.bio.bi_end_io = move_read_endio; ++ bio_set_prio(&io->write.rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + + io->write.op.end_io = move_write_done; + +@@ -347,18 +324,16 @@ int bch2_move_extent(struct moving_context *ctxt, + * ctxt when doing wakeup + */ + closure_get(&ctxt->cl); +- bch2_read_extent(trans, &io->rbio, ++ bch2_read_extent(trans, &io->write.rbio, + bkey_start_pos(k.k), + iter->btree_id, k, 0, +- BCH_READ_NODECODE| +- BCH_READ_LAST_FRAGMENT); ++ BCH_READ_data_update| ++ BCH_READ_last_fragment); + return 0; +-err_free_pages: +- bio_free_pages(&io->write.op.wbio.bio); + err_free: + kfree(io); + err: +- if (ret == -BCH_ERR_data_update_done) ++ if (bch2_err_matches(ret, BCH_ERR_data_update_done)) + return 0; + + if (bch2_err_matches(ret, EROFS) || +@@ -379,34 +354,42 @@ int bch2_move_extent(struct moving_context *ctxt, return ret; } @@ -16086,7 +18920,7 @@ index 0ef4a86850bb..c493ea625553 100644 } ret = ret ?: trans_was_restarted(trans, restart_count); -@@ -415,43 +432,46 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, +@@ -415,43 +398,46 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, if (extent_k.k->p.snapshot) darray_for_each(io_opts->d, i) @@ -16154,7 +18988,7 @@ index 0ef4a86850bb..c493ea625553 100644 } int bch2_move_ratelimit(struct moving_context *ctxt) -@@ -509,9 +529,15 @@ static int bch2_move_data_btree(struct moving_context *ctxt, +@@ -509,9 +495,15 @@ static int bch2_move_data_btree(struct moving_context *ctxt, struct per_snapshot_io_opts snapshot_io_opts; struct bch_io_opts *io_opts; struct bkey_buf sk; @@ -16171,7 +19005,7 @@ index 0ef4a86850bb..c493ea625553 100644 int ret = 0, ret2; per_snapshot_io_opts_init(&snapshot_io_opts, c); -@@ -531,6 +557,8 @@ static int bch2_move_data_btree(struct moving_context *ctxt, +@@ -531,6 +523,8 @@ static int bch2_move_data_btree(struct moving_context *ctxt, bch2_ratelimit_reset(ctxt->rate); while (!bch2_move_ratelimit(ctxt)) { @@ -16180,7 +19014,7 @@ index 0ef4a86850bb..c493ea625553 100644 bch2_trans_begin(trans); k = bch2_btree_iter_peek(&iter); -@@ -549,10 +577,36 @@ static int bch2_move_data_btree(struct moving_context *ctxt, +@@ -549,10 +543,36 @@ static int bch2_move_data_btree(struct moving_context *ctxt, if (ctxt->stats) ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); @@ -16218,7 +19052,7 @@ index 0ef4a86850bb..c493ea625553 100644 ret = PTR_ERR_OR_ZERO(io_opts); if (ret) continue; -@@ -568,7 +622,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, +@@ -568,12 +588,12 @@ static int bch2_move_data_btree(struct moving_context *ctxt, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); @@ -16227,7 
+19061,13 @@ index 0ef4a86850bb..c493ea625553 100644 if (ret2) { if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) continue; -@@ -589,6 +643,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, + +- if (ret2 == -ENOMEM) { ++ if (bch2_err_matches(ret2, ENOMEM)) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(ctxt); + continue; +@@ -589,6 +609,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, bch2_btree_iter_advance(&iter); } @@ -16235,7 +19075,7 @@ index 0ef4a86850bb..c493ea625553 100644 bch2_trans_iter_exit(trans, &iter); bch2_bkey_buf_exit(&sk, c); per_snapshot_io_opts_exit(&snapshot_io_opts); -@@ -654,16 +709,12 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, +@@ -654,16 +675,12 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct bch_fs *c = trans->c; bool is_kthread = current->flags & PF_KTHREAD; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); @@ -16255,7 +19095,7 @@ index 0ef4a86850bb..c493ea625553 100644 int ret = 0; struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); -@@ -672,6 +723,8 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, +@@ -672,6 +689,8 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, trace_bucket_evacuate(c, &bucket); @@ -16264,7 +19104,7 @@ index 0ef4a86850bb..c493ea625553 100644 bch2_bkey_buf_init(&sk); /* -@@ -679,21 +732,13 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, +@@ -679,21 +698,13 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, */ bch2_trans_begin(trans); @@ -16288,7 +19128,7 @@ index 0ef4a86850bb..c493ea625553 100644 ret = bch2_btree_write_buffer_tryflush(trans); bch_err_msg(c, ret, "flushing btree write buffer"); if (ret) -@@ -705,18 +750,23 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, +@@ -705,18 +716,23 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, bch2_trans_begin(trans); @@ -16318,7 +19158,7 @@ index 0ef4a86850bb..c493ea625553 100644 ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; -@@ -728,7 +778,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, +@@ -728,7 +744,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); @@ -16327,7 +19167,7 @@ index 0ef4a86850bb..c493ea625553 100644 if (ret) { bch2_trans_iter_exit(trans, &iter); continue; -@@ -738,14 +788,18 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, +@@ -738,14 +754,18 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, data_opts.target = io_opts.background_target; data_opts.rewrite_ptrs = 0; @@ -16350,7 +19190,7 @@ index 0ef4a86850bb..c493ea625553 100644 } i++; } -@@ -765,14 +819,15 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, +@@ -765,14 +785,15 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, goto err; if (ctxt->stats) @@ -16369,7 +19209,7 @@ index 0ef4a86850bb..c493ea625553 100644 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) -@@ -796,15 +851,18 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, +@@ -796,15 +817,18 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, atomic64_add(sectors, &ctxt->stats->sectors_seen); atomic64_add(sectors, &ctxt->stats->sectors_moved); } @@ -16390,6 +19230,15 @@ index 0ef4a86850bb..c493ea625553 100644 return ret; } +@@ -1158,7 +1182,7 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str + + mutex_lock(&ctxt->lock); + list_for_each_entry(io, &ctxt->ios, io_list) +- 
bch2_write_op_to_text(out, &io->write.op); ++ bch2_data_update_inflight_to_text(out, &io->write); + mutex_unlock(&ctxt->lock); + + printbuf_indent_sub(out, 4); diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 9baf3093a678..51e0505a8156 100644 --- a/fs/bcachefs/move.h @@ -16407,7 +19256,7 @@ index 9baf3093a678..51e0505a8156 100644 int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c -index d658be90f737..85c361e78ba5 100644 +index d658be90f737..21805509ab9e 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -167,7 +167,7 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, @@ -16419,7 +19268,39 @@ index d658be90f737..85c361e78ba5 100644 lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), 0, k, ({ -@@ -350,9 +350,9 @@ static int bch2_copygc_thread(void *arg) +@@ -215,7 +215,8 @@ static int bch2_copygc(struct moving_context *ctxt, + }; + move_buckets buckets = { 0 }; + struct move_bucket_in_flight *f; +- u64 moved = atomic64_read(&ctxt->stats->sectors_moved); ++ u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen); ++ u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved); + int ret = 0; + + ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets); +@@ -245,7 +246,6 @@ static int bch2_copygc(struct moving_context *ctxt, + *did_work = true; + } + err: +- darray_exit(&buckets); + + /* no entries in LRU btree found, or got to end: */ + if (bch2_err_matches(ret, ENOENT)) +@@ -254,8 +254,11 @@ static int bch2_copygc(struct moving_context *ctxt, + if (ret < 0 && !bch2_err_matches(ret, EROFS)) + bch_err_msg(c, ret, "from bch2_move_data()"); + +- moved = atomic64_read(&ctxt->stats->sectors_moved) - moved; +- trace_and_count(c, copygc, c, moved, 0, 0, 0); ++ sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen; ++ sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved; ++ trace_and_count(c, copygc, c, buckets.nr, sectors_seen, sectors_moved); ++ ++ darray_exit(&buckets); + return ret; + } + +@@ -350,9 +353,9 @@ static int bch2_copygc_thread(void *arg) bch2_trans_unlock_long(ctxt.trans); cond_resched(); @@ -16509,7 +19390,7 @@ index 0e2ee262fbd4..6772faf385a5 100644 bool bch2_opt_is_inode_opt(enum bch_opt_id id) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h -index 23dda014e331..e763d52e0f38 100644 +index 23dda014e331..a182b5d454ba 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -16,7 +16,8 @@ extern const char * const bch2_version_upgrade_opts[]; @@ -16570,13 +19451,13 @@ index 23dda014e331..e763d52e0f38 100644 NULL, "Enable nocow mode: enables runtime locking in\n"\ "data move path needed if nocow will ever be in use\n")\ + x(copygc_enabled, u8, \ -+ OPT_FS|OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Enable copygc: disable for debugging, or to\n"\ + "quiet the system when doing performance testing\n")\ + x(rebalance_enabled, u8, \ -+ OPT_FS|OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Enable rebalance: disable for debugging, or to\n"\ @@ -16808,7 +19689,7 @@ index 40a20192eee8..bef2aa1b8bcd 100644 if (may_sleep && unlikely(process_finished_items(pending, p, flags))) diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c -index cd6647374353..4adc74cd3f70 100644 +index cd6647374353..90dbf04c07a1 100644 --- a/fs/bcachefs/rebalance.c +++ 
b/fs/bcachefs/rebalance.c @@ -24,6 +24,192 @@ @@ -17035,7 +19916,7 @@ index cd6647374353..4adc74cd3f70 100644 struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); int ret = PTR_ERR_OR_ZERO(n); if (ret) -@@ -134,31 +322,27 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, +@@ -134,32 +322,28 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, struct bpos work_pos, struct btree_iter *extent_iter, @@ -17069,11 +19950,13 @@ index cd6647374353..4adc74cd3f70 100644 - data_opts->rewrite_ptrs = - bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression); - data_opts->target = r->target; +- data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; + data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); + data_opts->target = io_opts->background_target; - data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; ++ data_opts->write_flags |= BCH_WRITE_only_specified_devs; if (!data_opts->rewrite_ptrs) { + /* @@ -178,12 +362,28 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, if (trace_rebalance_extent_enabled()) { struct printbuf buf = PRINTBUF; @@ -17124,7 +20007,7 @@ index cd6647374353..4adc74cd3f70 100644 atomic64_add(k.k->size, &ctxt->stats->sectors_seen); /* -@@ -253,20 +449,8 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, +@@ -253,21 +449,9 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { @@ -17142,11 +20025,13 @@ index cd6647374353..4adc74cd3f70 100644 - - data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression); - data_opts->target = target; +- data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; + data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); + data_opts->target = io_opts->background_target; - data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; ++ data_opts->write_flags |= BCH_WRITE_only_specified_devs; return data_opts->rewrite_ptrs != 0; } + @@ -338,9 +522,9 @@ static int do_rebalance(struct moving_context *ctxt) BTREE_ITER_all_snapshots); @@ -18895,10 +21780,10 @@ index 62ea478215d0..fdcf598f08b1 100644 enum bch_persistent_counters { #define x(t, n, ...) 
BCH_COUNTER_##t, diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c -index 8767c33c2b51..051214fdc735 100644 +index 8767c33c2b51..14f6b6a5fb38 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c -@@ -81,7 +81,16 @@ +@@ -81,7 +81,19 @@ BCH_FSCK_ERR_accounting_mismatch) \ x(inode_has_child_snapshots, \ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ @@ -18912,11 +21797,14 @@ index 8767c33c2b51..051214fdc735 100644 + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_accounting_mismatch, \ + BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ -+ BCH_FSCK_ERR_accounting_key_junk_at_end) ++ BCH_FSCK_ERR_accounting_key_junk_at_end) \ ++ x(directory_size, \ ++ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ ++ BCH_FSCK_ERR_directory_size_mismatch) \ #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ -@@ -117,7 +126,19 @@ +@@ -117,7 +129,19 @@ BCH_FSCK_ERR_bkey_version_in_future) \ x(rebalance_work_acct_fix, \ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ @@ -18938,7 +21826,7 @@ index 8767c33c2b51..051214fdc735 100644 struct upgrade_downgrade_entry { u64 recovery_passes; diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h -index 9feb6739f77a..80b6d589808b 100644 +index 9feb6739f77a..ea0a18364751 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -5,9 +5,8 @@ @@ -18953,11 +21841,13 @@ index 9feb6739f77a..80b6d589808b 100644 }; #define BCH_SB_ERRS() \ -@@ -59,7 +58,7 @@ enum bch_fsck_flags { +@@ -58,8 +57,8 @@ enum bch_fsck_flags { + x(bset_wrong_sector_offset, 44, 0) \ x(bset_empty, 45, 0) \ x(bset_bad_seq, 46, 0) \ - x(bset_blacklisted_journal_seq, 47, 0) \ +- x(bset_blacklisted_journal_seq, 47, 0) \ - x(first_bset_blacklisted_journal_seq, 48, 0) \ ++ x(bset_blacklisted_journal_seq, 47, FSCK_AUTOFIX) \ + x(first_bset_blacklisted_journal_seq, 48, FSCK_AUTOFIX) \ x(btree_node_bad_btree, 49, 0) \ x(btree_node_bad_level, 50, 0) \ @@ -19070,21 +21960,47 @@ index 9feb6739f77a..80b6d589808b 100644 x(sb_clean_entry_overrun, 267, 0) \ x(btree_ptr_v2_written_0, 268, 0) \ x(subvol_snapshot_bad, 269, 0) \ -@@ -306,7 +311,9 @@ enum bch_fsck_flags { +@@ -306,7 +311,10 @@ enum bch_fsck_flags { x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ - x(MAX, 295, 0) + x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ + x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ -+ x(MAX, 303, 0) ++ x(directory_size_mismatch, 303, FSCK_AUTOFIX) \ ++ x(MAX, 304, 0) enum bch_sb_error_id { #define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c -index 617d07e53b20..537bf049618f 100644 +index 617d07e53b20..7e7c66a1e1a6 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c +@@ -491,8 +491,12 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, + list_del(&wait->list); + raw_spin_unlock(&lock->wait_lock); + +- if (unlikely(acquired)) ++ if (unlikely(acquired)) { + do_six_unlock_type(lock, type); ++ } else if (type == SIX_LOCK_write) { ++ six_clear_bitmask(lock, SIX_LOCK_HELD_write); ++ six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); ++ } + break; + } + +@@ -501,10 +505,6 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, + + __set_current_state(TASK_RUNNING); + out: +- if (ret && type == SIX_LOCK_write) { +- six_clear_bitmask(lock, SIX_LOCK_HELD_write); +- six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); +- } + trace_contention_end(lock, 0); + + return ret; @@ -616,8 +616,6 @@ void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long if (type != SIX_LOCK_write) @@ -20017,10 +22933,10 @@ index 29c94716293e..00373cf32e7b 100644 static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c new file mode 100644 -index 000000000000..f5977c5c6743 +index 000000000000..d78451c2a0c6 --- /dev/null +++ b/fs/bcachefs/str_hash.c -@@ -0,0 +1,286 @@ +@@ -0,0 +1,295 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -20054,11 +22970,11 @@ index 000000000000..f5977c5c6743 + } +} + -+static int fsck_rename_dirent(struct btree_trans *trans, -+ struct snapshots_seen *s, -+ const struct bch_hash_desc desc, -+ struct bch_hash_info *hash_info, -+ struct bkey_s_c_dirent old) ++static noinline int fsck_rename_dirent(struct btree_trans *trans, ++ struct snapshots_seen *s, ++ const struct bch_hash_desc desc, ++ struct bch_hash_info *hash_info, ++ struct bkey_s_c_dirent old) +{ + struct qstr old_name = bch2_dirent_get_name(old); + struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); @@ -20094,11 +23010,11 @@ index 000000000000..f5977c5c6743 + return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); +} + -+static int hash_pick_winner(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ struct bch_hash_info *hash_info, -+ struct bkey_s_c k1, -+ struct bkey_s_c k2) ++static noinline int hash_pick_winner(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ struct bch_hash_info *hash_info, ++ struct bkey_s_c k1, ++ struct bkey_s_c k2) +{ + if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && + !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) @@ -20165,8 +23081,8 @@ index 000000000000..f5977c5c6743 + * All versions of the same inode in different snapshots must have the same hash + * seed/type: verify that the hash info we're using matches the root + */ -+static int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, -+ struct bch_hash_info *hash_info) ++static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, ++ struct bch_hash_info *hash_info) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; @@ -20190,10 +23106,19 @@ index 000000000000..f5977c5c6743 + goto err; + + struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); -+ if (memcmp(hash_info, &hash2, sizeof(hash2))) { ++ if (hash_info->type != hash2.type || ++ memcmp(&hash_info->siphash_key, 
&hash2.siphash_key, sizeof(hash2.siphash_key))) { + ret = repair_inode_hash_info(trans, &inode); + if (!ret) { -+ bch_err(c, "inode hash info mismatch with root, but mismatch not found"); ++ bch_err(c, "inode hash info mismatch with root, but mismatch not found\n" ++ "%u %llx %llx\n" ++ "%u %llx %llx", ++ hash_info->type, ++ hash_info->siphash_key.k0, ++ hash_info->siphash_key.k1, ++ hash2.type, ++ hash2.siphash_key.k0, ++ hash2.siphash_key.k1); + ret = -BCH_ERR_fsck_repair_unimplemented; + } + } @@ -21364,7 +24289,7 @@ index fb5c1543e52f..6c6469814637 100644 0, k, 0)); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h -index 5597b9d6297f..9d40b7d4ea29 100644 +index 5597b9d6297f..56a5a7fbc0fd 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -199,6 +199,30 @@ DECLARE_EVENT_CLASS(bio, @@ -21417,7 +24342,7 @@ index 5597b9d6297f..9d40b7d4ea29 100644 __field(int, ret ) ), -@@ -867,15 +890,14 @@ TRACE_EVENT(evacuate_bucket, +@@ -867,45 +890,42 @@ TRACE_EVENT(evacuate_bucket, __entry->bucket = bucket->offset; __entry->sectors = sectors; __entry->bucket_size = bucket_size; @@ -21435,7 +24360,49 @@ index 5597b9d6297f..9d40b7d4ea29 100644 ); TRACE_EVENT(copygc, -@@ -1316,6 +1338,12 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced, + TP_PROTO(struct bch_fs *c, +- u64 sectors_moved, u64 sectors_not_moved, +- u64 buckets_moved, u64 buckets_not_moved), +- TP_ARGS(c, +- sectors_moved, sectors_not_moved, +- buckets_moved, buckets_not_moved), ++ u64 buckets, ++ u64 sectors_seen, ++ u64 sectors_moved), ++ TP_ARGS(c, buckets, sectors_seen, sectors_moved), + + TP_STRUCT__entry( + __field(dev_t, dev ) ++ __field(u64, buckets ) ++ __field(u64, sectors_seen ) + __field(u64, sectors_moved ) +- __field(u64, sectors_not_moved ) +- __field(u64, buckets_moved ) +- __field(u64, buckets_not_moved ) + ), + + TP_fast_assign( + __entry->dev = c->dev; ++ __entry->buckets = buckets; ++ __entry->sectors_seen = sectors_seen; + __entry->sectors_moved = sectors_moved; +- __entry->sectors_not_moved = sectors_not_moved; +- __entry->buckets_moved = buckets_moved; +- __entry->buckets_not_moved = buckets_moved; + ), + +- TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu", ++ TP_printk("%d,%d buckets %llu sectors seen %llu moved %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), +- __entry->sectors_moved, __entry->sectors_not_moved, +- __entry->buckets_moved, __entry->buckets_not_moved) ++ __entry->buckets, ++ __entry->sectors_seen, ++ __entry->sectors_moved) + ); + + TRACE_EVENT(copygc_wait, +@@ -1316,6 +1336,12 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced, __entry->new_u64s) ); @@ -21448,7 +24415,7 @@ index 5597b9d6297f..9d40b7d4ea29 100644 TRACE_EVENT(path_downgrade, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, -@@ -1352,10 +1380,21 @@ TRACE_EVENT(path_downgrade, +@@ -1352,10 +1378,21 @@ TRACE_EVENT(path_downgrade, __entry->pos_snapshot) ); @@ -21474,7 +24441,7 @@ index 5597b9d6297f..9d40b7d4ea29 100644 ); TRACE_EVENT(write_buffer_flush, -@@ -1414,6 +1453,24 @@ TRACE_EVENT(write_buffer_flush_slowpath, +@@ -1414,6 +1451,24 @@ TRACE_EVENT(write_buffer_flush_slowpath, TP_printk("%zu/%zu", __entry->slowpath, __entry->total) ); diff --git a/sys-kernel/hardened-kernel/hardened-kernel-6.12.8.ebuild b/sys-kernel/hardened-kernel/hardened-kernel-6.12.10.ebuild similarity index 100% rename from sys-kernel/hardened-kernel/hardened-kernel-6.12.8.ebuild rename to sys-kernel/hardened-kernel/hardened-kernel-6.12.10.ebuild
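
A change that recurs throughout the journal portion of this patch is the extra struct btree_trans * argument threaded through the reservation path: bch2_journal_res_get() and bch2_journal_res_get_slowpath() now take a transaction pointer (existing callers such as bch2_journal_meta() and bch2_journal_flush_seq_async() pass NULL where no transaction is in scope), presumably so the slowpath can flush journal pins using the caller's transaction context. A minimal sketch of the updated call pattern, under the assumption that a caller like the hypothetical do_journal_work() below holds a journal reference; the signatures and the jset_u64s(0) sizing idiom are taken from the diff itself:

	/*
	 * Sketch only: do_journal_work() is a hypothetical caller, not part
	 * of this patch. It mirrors the post-patch bch2_journal_res_get()
	 * signature; jset_u64s() sizes the reservation in u64s as in the
	 * existing callers shown above.
	 */
	static int do_journal_work(struct journal *j, struct btree_trans *trans)
	{
		struct journal_res res = {};

		/* Pass the transaction through; NULL is accepted when none exists. */
		int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, trans);
		if (ret)
			return ret;

		/* ... emit journal entries into the reserved space ... */

		bch2_journal_res_put(j, &res);
		return 0;
	}
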